Compare commits

...

2 Commits

Author SHA1 Message Date
Ryan Rix 76e307574b [WIP] attachment extraction 2024-04-01 19:44:23 -07:00
Ryan Rix 7b7d2859bd make the python wrapper have a command to parse a file agian 2024-04-01 19:44:06 -07:00
4 changed files with 213 additions and 42 deletions

View File

@ -455,6 +455,58 @@ impl Link {
}
#+end_src
** Attachment
Headings can have attachments, arbitrary files which may be linked to in a shorthand =attachment:= org link or referred to by relative path.
We'll probably have multiple Attachment types, with Images that can be post-processed into a cache file directory, pngcrushed, etc... or maybe that'll go in the arcology layer...
#+begin_src rust
/// The kind of file an `Attachment` refers to.
///
/// Exposed to Python through `#[pyclass]`. `File` is the variant produced by
/// `AttachmentType::default()`.
// NOTE(review): confirm that the PyO3 version in use accepts the `dict` option
// on a fieldless enum.
#[derive(Debug, Clone, Default)]
#[pyclass(dict)]
pub enum AttachmentType {
Document,
Image,
Video,
// Fallback classification: this is the `Default` variant.
#[default]
File
}
/// A single attachment record, exposed to Python through `#[pyclass]`.
///
/// Every field has a Python getter (`#[pyo3(get)]`); no setters are generated.
#[derive(Debug, Clone, Default)]
#[pyclass(dict)]
pub struct Attachment {
// presumably the ID of the heading that owns the attachment; verify against the parser
#[pyo3(get)]
pub node_id: String,
// Path of the attached file, stored as a plain string.
#[pyo3(get)]
pub file_path: String,
// Classification of the file; `AttachmentType::File` when defaulted.
#[pyo3(get)]
pub atype: AttachmentType,
}
impl fmt::Display for Attachment {
    /// Renders as `Attachment(<file_path> in <node_id> is <atype>)`, where
    /// `atype` is printed with its `Debug` representation.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            node_id,
            file_path,
            atype,
        } = self;
        write!(f, "Attachment({file_path} in {node_id} is {atype:?})")
    }
}
#[pymethods]
impl Attachment {
pub fn __repr__(slf: PyRef<'_, Self>) -> PyResult<String> {
Ok(slf.to_string())
}
pub fn __str__(slf: PyRef<'_, Self>) -> PyResult<String> {
Self::__repr__(slf)
}
}
#+end_src
* The Arroyo Org Parser
:PROPERTIES:
:header-args:rust: :tangle src/parse.rs :mkdirp yes
@ -474,7 +526,7 @@ use std::collections::HashMap;
use std::{error::Error, fs};
// use std::collections::HashMap;
use crate::types::{Document, Heading, InvalidDocError, Keyword, Link};
use crate::types::{Attachment, Document, Heading, InvalidDocError, Keyword, Link};
#+end_src
** The public interface
@ -641,11 +693,23 @@ And now we step in to the state machine. It iterates over each element, providin
.iter()
.map(|mbox| mbox.clone().to_string())
.collect();
inherited_tags.push(new_tags);
inherited_tags.push(new_tags.clone());
let most_tags = inherited_tags.concat();
let all_tags: Vec<String> = [filetags.clone(), most_tags].concat();
let attach_tag = String::from("ATTACH");
let maybe_has_attach = new_tags.contains(&attach_tag);
let attachments = if cur_id.is_some() {
// XXX factor this out down to the bottom
let attach_path = find_attach_dir(
&export_properties,
Path::new(&path),
cur_id.clone().unwrap(),
);
fetch_attachments(cur_id.clone().unwrap(), attach_path.into());
};
let h = Heading {
id: cur_id.clone(),
level: cur_level,
@ -657,6 +721,7 @@ And now we step in to the state machine. It iterates over each element, providin
properties: export_properties,
refs,
aliases,
// attachments
..Default::default()
};
headings.push(h);
@ -810,6 +875,41 @@ Having populated all these variables, the headings have the links spliced back i
}
#+end_src
** Org Attachment Extraction
:PROPERTIES:
:ID: 20240401T184220.230505
:END:
Check some paths, verify the file exists, and construct an =Attachment= for each.
but [[file:/nix/store/cqzcrzmqdfw5gchf30a5f927qfp01llp-emacs-packages-deps/share/emacs/site-lisp/elpa/org-9.6.21/org-attach.el::setq attach-dir (org-entry-get nil "DIR" org-attach-use-inheritance][org-attach-dir checks some properties to calculate a directory]] and [[file:/nix/store/cqzcrzmqdfw5gchf30a5f927qfp01llp-emacs-packages-deps/share/emacs/site-lisp/elpa/org-9.6.21/org-attach.el::defun org-attach-dir-from-id (id &optional existing][org-attach-dir-from-id in org-attach.el]] has a list of functions that it tries to check
I want to reimplement this properly...
#+begin_src rust
use std::path::Path;
/// Resolves the attachment directory configured in a heading's properties.
///
/// The `DIR` property is checked first, then `ATTACH_DIR`; the value of the
/// first one present in `props` is returned as an owned `String`.
///
/// Returns an empty string when neither property is set. `file_path` and
/// `node_id` are not consulted yet: they are reserved for the fallback lookup
/// described in the XXX note below, which is still unimplemented.
fn find_attach_dir(props: &HashMap<String, String>, file_path: &Path, node_id: String) -> String {
    // Probe with borrowed keys: `HashMap<String, _>::get` accepts `&str`, so
    // no `String`s need to be allocated just to look the properties up.
    let configured = ["DIR", "ATTACH_DIR"]
        .iter()
        .find_map(|prop| props.get(*prop));

    match configured {
        Some(dir) => dir.clone(),
        None => {
            // XXX run org-attach dir finder functions
            // needs file_path and node_id to construct that
            String::new()
        }
    }
}
/// Collects the attachments for `node_id`.
///
/// Stub: the parent directory of `src_file_path` is computed but not used yet,
/// and the returned list is always empty.
///
/// NOTE(review): the call site visible in this file passes the result of
/// `find_attach_dir` as `src_file_path` — confirm which path is intended.
fn fetch_attachments(node_id: String, src_file_path: String) -> Vec<Attachment> {
    let _parent_dir = Path::new(&src_file_path).parent();
    Vec::new()
}
#+end_src
** =split_quoted_string=
:PROPERTIES:
:ID: 20231023T130916.139809
@ -835,13 +935,13 @@ fn split_quoted_string(quoted_str: String) -> Result<Vec<String>, Box<dyn Error>
.map(|inner_val| match inner_val {
lexpr::Value::String(string) => string.to_string(),
lexpr::Value::Symbol(string) => string.to_string(),
others => todo!("{:?}", others),
others => todo!("lexpr roam_aliases or so {:?}", dbg!(others)),
})
.collect(),
),
Ok(lexpr::Value::Symbol(sym)) => Some(vec![sym.to_string()]),
Err(_) => todo!(),
val => todo!("{:?}", val),
Err(the_err) => todo!("XXX {:?}", the_err),
val => todo!("??? {:?}", val),
};
ret
})
@ -1469,7 +1569,7 @@ import glob
from typing import Optional
# from . import persist_one_file
from .arroyo_rs import htmlize_file, ExportOptions
from .arroyo_rs import parse_file, htmlize_file, ExportOptions
# from . import models
# from sqlmodel import Session
#+end_src
@ -1484,22 +1584,16 @@ def cli():
@cli.command()
@click.option("--source", "-s", help="Org source directory", default="~/org")
@click.option("--file-glob", "-g", help="File search glob", default="**/*.org")
@click.option("--dest", "-d", help="Sqlite Database Location", default="./arroyo.db")
def generate_db(source, dest, file_glob):
#engine = models.make_engine(dest)
def parse_files(source, file_glob):
expanded_src = os.path.expanduser(source)
files = glob.glob(file_glob, root_dir=expanded_src, recursive=True)
files = map(lambda it: os.path.join(expanded_src, it), files)
files = list(filter(os.path.isfile, files))
#expanded_src = os.path.expanduser(source)
#files = glob.glob(file_glob, root_dir=expanded_src, recursive=True)
#files = map(lambda it: os.path.join(expanded_src, it), files)
#files = list(filter(os.path.isfile, files))
#with Session(engine) as session:
# docs = [
# x for x in [
# persist_one_file(session, path)
# for path in files
# ] if x is not None
# ]
docs = [
parse_file(f)
for f in files
]
print(f"Parsed {len(files)} files.")
print(f"Persisted {len(docs)} docs.")

View File

@ -5,7 +5,7 @@ import glob
from typing import Optional
# from . import persist_one_file
from .arroyo_rs import htmlize_file, ExportOptions
from .arroyo_rs import parse_file, htmlize_file, ExportOptions
# from . import models
# from sqlmodel import Session
# Click command wrapper:1 ends here
@ -18,22 +18,16 @@ def cli():
@cli.command()
@click.option("--source", "-s", help="Org source directory", default="~/org")
@click.option("--file-glob", "-g", help="File search glob", default="**/*.org")
@click.option("--dest", "-d", help="Sqlite Database Location", default="./arroyo.db")
def generate_db(source, dest, file_glob):
#engine = models.make_engine(dest)
def parse_files(source, file_glob):
expanded_src = os.path.expanduser(source)
files = glob.glob(file_glob, root_dir=expanded_src, recursive=True)
files = map(lambda it: os.path.join(expanded_src, it), files)
files = list(filter(os.path.isfile, files))
#expanded_src = os.path.expanduser(source)
#files = glob.glob(file_glob, root_dir=expanded_src, recursive=True)
#files = map(lambda it: os.path.join(expanded_src, it), files)
#files = list(filter(os.path.isfile, files))
#with Session(engine) as session:
# docs = [
# x for x in [
# persist_one_file(session, path)
# for path in files
# ] if x is not None
# ]
docs = [
parse_file(f)
for f in files
]
print(f"Parsed {len(files)} files.")
print(f"Persisted {len(docs)} docs.")

View File

@ -9,7 +9,7 @@ use std::collections::HashMap;
use std::{error::Error, fs};
// use std::collections::HashMap;
use crate::types::{Document, Heading, InvalidDocError, Keyword, Link};
use crate::types::{Attachment, Document, Heading, InvalidDocError, Keyword, Link};
// The Arroyo Org Parser:1 ends here
// [[file:../arroyo-native-parser.org::*The public interface][The public interface:1]]
@ -135,11 +135,23 @@ pub fn extract_headings(path: String, tree: &Org) -> Result<Vec<Heading>> {
.iter()
.map(|mbox| mbox.clone().to_string())
.collect();
inherited_tags.push(new_tags);
inherited_tags.push(new_tags.clone());
let most_tags = inherited_tags.concat();
let all_tags: Vec<String> = [filetags.clone(), most_tags].concat();
let attach_tag = String::from("ATTACH");
let maybe_has_attach = new_tags.contains(&attach_tag);
let attachments = if cur_id.is_some() {
// XXX factor this out down to the bottom
let attach_path = find_attach_dir(
&export_properties,
Path::new(&path),
cur_id.clone().unwrap(),
);
fetch_attachments(cur_id.clone().unwrap(), attach_path.into());
};
let h = Heading {
id: cur_id.clone(),
level: cur_level,
@ -151,6 +163,7 @@ pub fn extract_headings(path: String, tree: &Org) -> Result<Vec<Heading>> {
properties: export_properties,
refs,
aliases,
// attachments
..Default::default()
};
headings.push(h);
@ -271,6 +284,30 @@ pub fn extract_headings(path: String, tree: &Org) -> Result<Vec<Heading>> {
}
// Cleaning up:1 ends here
// [[file:../arroyo-native-parser.org::*Org Attachment Extraction][Org Attachment Extraction:1]]
use std::path::Path;
/// Resolves the attachment directory configured in a heading's properties.
///
/// The `DIR` property is checked first, then `ATTACH_DIR`; the value of the
/// first one present in `props` is returned as an owned `String`.
///
/// Returns an empty string when neither property is set. `file_path` and
/// `node_id` are not consulted yet: they are reserved for the fallback lookup
/// described in the XXX note below, which is still unimplemented.
fn find_attach_dir(props: &HashMap<String, String>, file_path: &Path, node_id: String) -> String {
    // Probe with borrowed keys: `HashMap<String, _>::get` accepts `&str`, so
    // no `String`s need to be allocated just to look the properties up.
    let configured = ["DIR", "ATTACH_DIR"]
        .iter()
        .find_map(|prop| props.get(*prop));

    match configured {
        Some(dir) => dir.clone(),
        None => {
            // XXX run org-attach dir finder functions
            // needs file_path and node_id to construct that
            String::new()
        }
    }
}
/// Collects the attachments for `node_id`.
///
/// Stub: the parent directory of `src_file_path` is computed but not used yet,
/// and the returned list is always empty.
///
/// NOTE(review): the call site visible in this file passes the result of
/// `find_attach_dir` as `src_file_path` — confirm which path is intended.
fn fetch_attachments(node_id: String, src_file_path: String) -> Vec<Attachment> {
    let _parent_dir = Path::new(&src_file_path).parent();
    Vec::new()
}
// Org Attachment Extraction:1 ends here
// [[file:../arroyo-native-parser.org::*=split_quoted_string=][=split_quoted_string=:1]]
fn split_quoted_string(quoted_str: String) -> Result<Vec<String>, Box<dyn Error>> {
let str_as_list = format!("[{}]", &quoted_str);
@ -287,13 +324,13 @@ fn split_quoted_string(quoted_str: String) -> Result<Vec<String>, Box<dyn Error>
.map(|inner_val| match inner_val {
lexpr::Value::String(string) => string.to_string(),
lexpr::Value::Symbol(string) => string.to_string(),
others => todo!("{:?}", others),
others => todo!("lexpr roam_aliases or so {:?}", dbg!(others)),
})
.collect(),
),
Ok(lexpr::Value::Symbol(sym)) => Some(vec![sym.to_string()]),
Err(_) => todo!(),
val => todo!("{:?}", val),
Err(the_err) => todo!("XXX {:?}", the_err),
val => todo!("??? {:?}", val),
};
ret
})

View File

@ -187,3 +187,49 @@ impl Link {
}
}
// Link:1 ends here
// [[file:../arroyo-native-parser.org::*Attachment][Attachment:1]]
/// The kind of file an `Attachment` refers to.
///
/// Exposed to Python through `#[pyclass]`. `File` is the variant produced by
/// `AttachmentType::default()`.
// NOTE(review): confirm that the PyO3 version in use accepts the `dict` option
// on a fieldless enum.
#[derive(Debug, Clone, Default)]
#[pyclass(dict)]
pub enum AttachmentType {
Document,
Image,
Video,
// Fallback classification: this is the `Default` variant.
#[default]
File
}
/// A single attachment record, exposed to Python through `#[pyclass]`.
///
/// Every field has a Python getter (`#[pyo3(get)]`); no setters are generated.
#[derive(Debug, Clone, Default)]
#[pyclass(dict)]
pub struct Attachment {
// presumably the ID of the heading that owns the attachment; verify against the parser
#[pyo3(get)]
pub node_id: String,
// Path of the attached file, stored as a plain string.
#[pyo3(get)]
pub file_path: String,
// Classification of the file; `AttachmentType::File` when defaulted.
#[pyo3(get)]
pub atype: AttachmentType,
}
impl fmt::Display for Attachment {
    /// Renders as `Attachment(<file_path> in <node_id> is <atype>)`, where
    /// `atype` is printed with its `Debug` representation.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self {
            node_id,
            file_path,
            atype,
        } = self;
        write!(f, "Attachment({file_path} in {node_id} is {atype:?})")
    }
}
#[pymethods]
impl Attachment {
pub fn __repr__(slf: PyRef<'_, Self>) -> PyResult<String> {
Ok(slf.to_string())
}
pub fn __str__(slf: PyRef<'_, Self>) -> PyResult<String> {
Self::__repr__(slf)
}
}
// Attachment:1 ends here