Compare commits

...

3 Commits

Author SHA1 Message Date
Ryan Rix c7c05f2742 try to structure this functionality in to a cohesive narrative, fail. 2024-01-17 00:23:41 -08:00
Ryan Rix 6a6fbd8ecc document a todo 2024-01-17 00:22:47 -08:00
Ryan Rix 3459a29433 lift HTML link rewriting in to a public function which the Atom parser can use 2024-01-17 00:22:23 -08:00
3 changed files with 557 additions and 290 deletions

View File

@ -648,6 +648,9 @@ Event::Start(orgize::Element::Title(title)) => {
:END:
**** File-level Property Drawer parsing
:PROPERTIES:
:ID: 20240116T235328.441922
:END:
Handling the file-level properties drawer is a bit of a pain -- some day I'll roll this in to =orgize= itself so that these can be accessed via a =PropertiesMap= as in the heading parsing above, but I don't get this library well enough to do that right now.
@ -716,6 +719,8 @@ Event::End(orgize::Element::Drawer(_drawer)) => {
}
#+END_SRC
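For illustration, the drawer handling boils down to something like this standalone sketch; the helper name and shape here are made up for the example and aren't part of the tangled parser:
#+BEGIN_SRC rust
use std::collections::HashMap;

// Hypothetical helper, for illustration only: pull ":KEY: value" pairs out of
// the raw text of a file-level :PROPERTIES: drawer.
fn parse_property_drawer_text(drawer_text: &str) -> HashMap<String, String> {
    let mut props = HashMap::new();
    for line in drawer_text.lines() {
        let line = line.trim();
        // skip the :PROPERTIES: / :END: fences themselves
        if line.eq_ignore_ascii_case(":PROPERTIES:") || line.eq_ignore_ascii_case(":END:") {
            continue;
        }
        // a property line looks like ":ID: 20240116T235328.441922"
        if let Some(rest) = line.strip_prefix(':') {
            if let Some((key, value)) = rest.split_once(':') {
                props.insert(key.to_string(), value.trim().to_string());
            }
        }
    }
    props
}
#+END_SRC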
***** NEXT fix orgize to expose file-level propertiesmap
**** Link parsing
Look; I'm gonna be honest here. I don't remember why the links are stored outside the heading until the end of the document parsing. Some ownership bullshit, and the COW types, if I recall.
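The retargeting rule itself is small, though. Here's a rough sketch of what =rewrite_link_from= (below) boils down to, with a plain =HashMap= standing in for =options.link_retargets= and the =HtmlEscape= step left out; the ID and path in the example are made up:
#+BEGIN_SRC rust
use std::collections::HashMap;

// Illustration only: a known ID maps to its published path, an unknown ID
// falls through to a /404 URL carrying the ID as a query key. The real helper
// also runs the result through HtmlEscape.
fn rewrite(link_retargets: &HashMap<String, String>, id: &str) -> String {
    match link_retargets.get(id) {
        Some(path) => path.clone(),
        None => format!("/404?key={}", id),
    }
}

fn main() {
    let mut retargets = HashMap::new();
    // made-up ID and path, purely for the example
    retargets.insert("some-org-id".to_string(), "/notes/example".to_string());
    assert_eq!(rewrite(&retargets, "some-org-id"), "/notes/example");
    assert_eq!(rewrite(&retargets, "missing-id"), "/404?key=missing-id");
}
#+END_SRC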
@ -915,6 +920,13 @@ impl<E: From<Error>, H: HtmlHandler<E>> ArroyoHtmlHandler<E, H> {
..Default::default()
}
}
pub fn rewrite_link_from(&self, id: &String) -> String {
match self.options.link_retargets.get(id) {
Some(path) => HtmlEscape(&path).to_string(),
_ => HtmlEscape(format!("/404?key={}", id)).to_string(),
}
}
}
impl<E: From<Error>, H: HtmlHandler<E>> Default for ArroyoHtmlHandler<E, H> {
@ -980,39 +992,20 @@ Link exporting is going to be the most complicated part of this because it does
Some((proto, stripped_dest)) => (proto, stripped_dest.into()),
None => ("", string_path),
};
let desc = link.desc.clone().unwrap_or(link.path.clone());
match proto {
"id" => {
let maybe_new_target = self.options.link_retargets.get(&stripped_dest);
match maybe_new_target {
Some(path) => {
let desc = link.desc.clone().unwrap_or(path.clone().into());
write!(
w,
"<a class=\"internal\" href=\"{}\">{}</a>",
HtmlEscape(&path),
HtmlEscape(&desc),
)?
}
_ => {
let desc = link.desc.clone().unwrap_or(link.path.clone());
write!(
w,
"<a href=\"/404?key={}\">{}</a>",
HtmlEscape(&link.path),
HtmlEscape(&desc),
)?
}
};
}
"roam" => {
let desc = link.desc.clone().unwrap_or(link.path.clone());
write!(
w,
"<a href=\"/404?key={}\">{}</a>",
HtmlEscape(&link.path),
HtmlEscape(&desc),
)?
}
"id" => write!(
w,
"<a href=\"{}\">{}</a>",
self.rewrite_link_from(&stripped_dest),
HtmlEscape(&desc),
)?,
"roam" => write!(
w,
"<a href=\"/404?key={}\">{}</a>",
HtmlEscape(&link.path),
HtmlEscape(&desc),
)?,
_ => self.inner.start(w, &Element::Link(link.clone()))?,
}
}
@ -1112,12 +1105,13 @@ This thing is similar in many respects to the HTML Handler, and it uses it direc
#+begin_src rust :tangle src/export_atom.rs
use anyhow::Result;
use regex;
use std::fs;
use std::io::{Error, Write};
use std::marker::PhantomData;
use orgize::export::{DefaultHtmlHandler, HtmlEscape, HtmlHandler, SyntectHtmlHandler};
use orgize::{Element, Org};
use orgize::{elements, Element, Org};
use crate::export_html::ArroyoHtmlHandler;
use crate::export_html::ExportOptions;
@ -1127,23 +1121,27 @@ This is just some basic implementation definitions and junk.
#+begin_src rust :tangle src/export_atom.rs
pub struct ArroyoAtomHandler<E: From<Error>, H: HtmlHandler<E>> {
// pub options: ExportOptions,
pub inner: H,
pub options: ExportOptions,
pub inner: ArroyoHtmlHandler<E, H>,
pub error_type: PhantomData<E>,
pub in_heading: bool,
// internal parser state
in_heading: bool,
in_drawer: bool,
heading_lvl: usize,
// Document metadata placed in feed
pub filetags: Vec<String>,
pub authors: Vec<String>,
pub feed_title: String,
pub last_date: String,
}
impl<E: From<Error>, H: HtmlHandler<E>> ArroyoAtomHandler<E, H> {
// pub fn new(options: ExportOptions, inner: H) -> Self {
// ArroyoHtmlHandler {
// inner,
// options,
// ..Default::default()
// }
// }
pub fn new(inner: H) -> Self {
pub fn new(options: ExportOptions, inner: ArroyoHtmlHandler<E, H>) -> Self {
ArroyoAtomHandler {
inner,
options,
..Default::default()
}
}
@ -1152,10 +1150,18 @@ impl<E: From<Error>, H: HtmlHandler<E>> ArroyoAtomHandler<E, H> {
impl<E: From<Error>, H: HtmlHandler<E>> Default for ArroyoAtomHandler<E, H> {
fn default() -> Self {
ArroyoAtomHandler {
inner: H::default(),
inner: ArroyoHtmlHandler::default(),
error_type: PhantomData,
// options: ExportOptions::default(),
options: ExportOptions::default(),
in_heading: false,
in_drawer: false,
heading_lvl: 0,
last_date: "".into(),
feed_title: "".into(),
filetags: vec![],
authors: vec![],
}
}
}
@ -1166,8 +1172,8 @@ impl<E: From<Error>, H: HtmlHandler<E>> Default for ArroyoAtomHandler<E, H> {
#+begin_src rust :tangle src/export_atom.rs
pub fn atomize_file(path: String, options: ExportOptions) -> Result<String> {
let syntect_handler = SyntectHtmlHandler::new(DefaultHtmlHandler);
let html_handler = ArroyoHtmlHandler::new(options, syntect_handler);
let mut handler = ArroyoAtomHandler::new(html_handler);
let html_handler = ArroyoHtmlHandler::new(options.clone(), syntect_handler);
let mut handler = ArroyoAtomHandler::new(options.clone(), html_handler);
let org = String::from_utf8(fs::read(path.clone())?).unwrap();
let org_tree = &Org::parse_custom(
@ -1192,107 +1198,50 @@ pub fn atomize_file(path: String, options: ExportOptions) -> Result<String> {
}
#+end_src
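For reference, driving this end-to-end looks something like the following sketch; the path is made up, and default options mean nothing gets retargeted:
#+begin_src rust
use anyhow::Result;

use crate::export_html::ExportOptions;

// Illustrative only: with ExportOptions::default() every id: link falls
// through to the /404 handler, and the path below is invented.
fn example() -> Result<()> {
    let feed_xml = atomize_file("notes/some-page.org".to_string(), ExportOptions::default())?;
    println!("{}", feed_xml);
    Ok(())
}
#+end_src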
This is functionally the same as the Arroyo HTML Handler.
This uses the same custom org handling that the HTML exporter does, but in a worse fashion.
#+begin_src rust :tangle src/export_atom.rs
impl<E: From<Error>, H: HtmlHandler<E>> HtmlHandler<E> for ArroyoAtomHandler<E, H> {
fn start<W: Write>(&mut self, mut w: W, element: &Element) -> Result<(), E> {
match element {
(match dbg!(element) {
#+end_src
The root element contains metadata which need to be populated with data pulled from the doc, keywords, etc:
The root element contains metadata which needs to be populated with data pulled from the doc, keywords, etc. This is defined below:
#+begin_src rust :tangle src/export_atom.rs
Element::Document { .. } => {
write!(
w,
"<?xml version=\"1.0\" encoding=\"utf-8\"?>
<feed xmlns=\"http://www.w3.org/2005/Atom\">
<title>Example Feed</title>
<link href=\"http://example.org/\"/>
<updated>2003-12-13T18:30:02Z</updated>
<author>
<name>John Doe</name>
</author>
<id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>\n"
)?;
}
Element::Document { .. } => self.start_document(w, element),
#+end_src
Each title is the basis of a new heading and a new entry in the Atom document. There's still some work to do here, but it will only publish headings that have an ID and a PUBDATE property, and don't have certain tags. The basic metadata for the heading is injected in to the final document.
The title needs to know to close the previous one, this is pretty ugly, but the structure of the =Orgize= iterator don't work well for this.
=start_keyword= extracts document metadata required to populate the Atom feed.
#+begin_src rust :tangle src/export_atom.rs
Element::Title(title) => {
// !!!
let the_link = "";
// !!!
let ignore_tags = vec![
String::from("noexport"),
String::from("NOEXPORT"),
String::from("ignore"),
];
let export_tags = title
.tags
.clone()
.into_iter()
.map(String::from)
.find(|v| ignore_tags.contains(v));
let id = title
.properties
.clone()
.into_iter()
.find(|(k, _v)| k == "ID")
.map(|(_k, v)| String::from(v))
.unwrap_or("".to_string());
let pubdate = title
.properties
.clone()
.into_iter()
.find(|(k, _v)| k == "PUBDATE")
.map(|(_k, v)| String::from(v))
.unwrap_or("".to_string());
Element::Keyword(kw) => self.start_keyword(w, kw),
#+end_src
if id == "" || pubdate == "" || export_tags.is_some() {
self.in_heading = false;
} else if id != "" && pubdate != "" && export_tags.is_none() {
if self.in_heading == true {
write!(w, "</content>\n")?;
write!(w, "</entry>\n")?;
}
let s = format!(
"<entry>
<title>{}</title>
<link href=\"{}\"/>
<id>urn:roam:{}</id>
<updated>{}</updated>\n
Each title is immediately contained in a Heading but carries all the actual metadata. It is the basis of a new heading and thus a new entry in the Atom document.
<content type=\"html\">&lt;h{}&gt;",
title.raw,
the_link,
id,
HtmlEscape(pubdate),
title.level,
);
self.in_heading = true;
write!(w, "{}", s)?
}
}
There's still some work to do here, but it will only publish headings that have both an ID and a PUBDATE property and don't carry certain tags. The basic metadata for the heading is injected into the final document.
The title also needs to know to close the previous one; this is pretty ugly, but the structure of the =Orgize= iterator doesn't lend itself to anything cleaner. It's explained in more detail below:
#+begin_src rust :tangle src/export_atom.rs
Element::Title(title) => self.start_title(w, title),
#+end_src
Any other elements will be HTML-ized and then escaped, and emitted in to the doc as HTML-encoded HTML.
Because Orgize doesn't properly parse file-level =PROPERTIES= drawers in to a pseudo-heading, I have a forked version of Orgize that lets me reach in and do that myself. I handle =Text= elements to do this.
I would love to eliminate this logic; it's lifted from the parser above.
#+begin_src rust :tangle src/export_atom.rs
_t => {
if self.in_heading == true {
let mut buf = InternalWriter::new();
self.inner.start(&mut buf, element)?;
let s = buf.to_utf8().unwrap();
write!(w, "{}", s)?
}
}
}
Element::Text { value } => self.start_text(w, value),
#+end_src
Any other element is HTML-ized, escaped, and emitted into the doc as HTML-encoded HTML, assuming we're inside a heading.
#+begin_src rust :tangle src/export_atom.rs
_t => self.start_rest(w, element),
})
.unwrap(); // if we can't parse something, just fucken panic.
#+end_src
#+begin_src rust :tangle src/export_atom.rs
@ -1300,35 +1249,234 @@ Any other elements will be HTML-ized and then escaped, and emitted in to the doc
}
fn end<W: Write>(&mut self, mut w: W, element: &Element) -> Result<(), E> {
match element {
(match element {
#+end_src
The end function's job is to close out document entities. We only use it to emit the end of the Document so the XML is valid; everything else goes through the same "HTML and HTML Escape" path as above.
#+begin_src rust :tangle src/export_atom.rs
// Element::Title(_title) => {}
Element::Document { .. } => {
write!(w, "</content>\n")?;
write!(w, "</entry>\n")?;
write!(w, "</feed>")?;
}
Element::Document { .. } => self.end_document(w, element),
#+end_src
#+begin_src rust :tangle src/export_atom.rs
_ => {
if self.in_heading == true {
let mut buf = InternalWriter::new();
self.inner.end(&mut buf, element)?;
let s = buf.to_utf8().unwrap();
write!(w, "{}", s)?
}
_ => self.end_rest(w, element),
})
.ok();
Ok(())
}
}
#+end_src
All of these are implemented in the struct's =impl= block.
#+begin_src rust :tangle src/export_atom.rs
impl<E: From<Error>, H: HtmlHandler<E>> ArroyoAtomHandler<E, H> {
#+end_src
Processing the document to create metadata is going to be a bit of a pain because keywords need to be extracted out of the document and perhaps even turned in to URLs, etc...
#+begin_src rust :tangle src/export_atom.rs
fn start_document<W: Write>(&mut self, mut w: W, _document: &elements::Element) -> Result<()> {
Ok(write!(
w,
"<?xml version=\"1.0\" encoding=\"utf-8\"?>
<feed xmlns=\"http://www.w3.org/2005/Atom\">
<link href=\"http://example.org/\"/>
<id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>\n"
)?)
}
fn end_document<W: Write>(&mut self, mut w: W, _document: &elements::Element) -> Result<()> {
// the last heading/entry is still "open", close it.
write!(
w,
r#" </content>
</entry>
<updated>{}</updated>
</feed>"#,
self.last_date,
)?;
Ok(())
}
#+end_src
Processing a title in to an Atom =<entry>= is pretty ugly. Really what I want is a way to get all this out of Heading objects but those lack enough information to fill out the =<entry>= metadata.
#+begin_src rust :tangle src/export_atom.rs
fn start_title<W: Write>(&mut self, mut w: W, title: &elements::Title) -> Result<()> {
let ignore_tags = vec![
String::from("noexport"),
String::from("NOEXPORT"),
String::from("ignore"),
];
let export_tags = title
.tags
.clone()
.into_iter()
.map(String::from)
.find(|v| ignore_tags.contains(v));
let props = title.properties.clone().into_hash_map();
let id = props
.get("ID")
.map(|id| id.clone().into())
.unwrap_or("".to_string());
let pubdate = props
.get("PUBDATE")
.map(|pubdate| pubdate.clone().into())
.unwrap_or("".to_string());
let the_link = self.inner.rewrite_link_from(&id);
if id == "" || pubdate == "" || export_tags.is_some() && self.in_heading {
self.in_heading = false;
Ok(())
} else if id != "" && pubdate != "" && export_tags.is_none() {
// close previous heading; note that self.heading_lvl defaults to 0
if title.level <= self.heading_lvl {
write!(w, "</content>\n")?;
write!(w, "</entry>\n")?;
}
let date = match rfcize_datestamp(pubdate.clone()) {
Ok(date) => date,
Err(_) => {
dbg!(format!("bad date {}", pubdate.clone()));
HtmlEscape(pubdate.clone()).to_string()
}
};
if self.last_date < date {
self.last_date = date.clone();
}
let title_text = match strip_links_from_str(&title.raw.clone()) {
Ok(text) => HtmlEscape(text),
Err(the_err) => {
dbg!(format!("bad title {} {}", title.raw.clone(), the_err));
HtmlEscape(title.raw.to_string())
}
};
let cat_xmls = "";
let s = format!(
"<entry>
<title>{}</title>
<link href=\"{}\"/>
<id>{}</id>
<updated>{}</updated>\n
{}
<content type=\"html\">&lt;h{}&gt;", // the HTML encoded heading opening needs to be added here!
title_text,
the_link.to_string(), // link
the_link.to_string(), // ID
date,
cat_xmls,
title.level,
);
self.heading_lvl = title.level;
self.in_heading = true;
Ok(write!(w, "{}", s)?)
} else {
Ok(())
}
}
#+end_src
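Said another way, the "close the previous entry" check above is just a level comparison; org titles start at level 1 and =heading_lvl= defaults to 0, so the first published entry never emits a stray close. A throwaway restatement:
#+begin_src rust
// Restatement of the check in start_title, for clarity only.
fn closes_previous_entry(new_title_level: usize, open_heading_lvl: usize) -> bool {
    new_title_level <= open_heading_lvl
}

fn main() {
    assert!(!closes_previous_entry(1, 0)); // first entry: nothing to close yet
    assert!(closes_previous_entry(1, 1));  // sibling heading closes the open entry
    assert!(!closes_previous_entry(2, 1)); // deeper heading keeps the entry open
}
#+end_src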
#+begin_src rust :tangle src/export_atom.rs
fn start_keyword<W: Write>(&mut self, mut w: W, kw: &elements::Keyword) -> Result<()> {
// dbg!(kw);
match kw.key.as_ref() {
"FILETAGS" => {
kw.value
.split(":")
.map(String::from)
.filter(|s| !s.is_empty())
.for_each(|s| self.filetags.push(s));
dbg!(&self.filetags);
}
"TITLE" => {
self.feed_title = kw.value.to_string();
write!(w, r#"<title>{}</title>"#, self.feed_title)?;
}
"AUTHOR" => {
let re = regex::Regex::new(r"(?<name>[\w\s\d]+) <(?<email>.*)>").unwrap();
re.captures_iter(&kw.value)
.map(|caps| {
format!(
"<author><name>{}</name><email>{}</email></author>",
&caps["name"], &caps["email"]
)
})
.for_each(|s| {
self.authors.push(s.clone());
write!(w, "{}", s).ok();
});
dbg!(&self.authors);
}
_ => {}
}
Ok(())
}
#+end_src
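A quick sanity check of that =AUTHOR= pattern, with an invented name and address:
#+begin_src rust
use regex::Regex;

// The name and email here are made up for the example.
fn main() {
    let re = Regex::new(r"(?<name>[\w\s\d]+) <(?<email>.*)>").unwrap();
    let caps = re.captures("Jane Doe <jane@example.org>").unwrap();
    assert_eq!(&caps["name"], "Jane Doe");
    assert_eq!(&caps["email"], "jane@example.org");
}
#+end_src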
#+begin_src rust :tangle src/export_atom.rs
fn start_rest<W: Write>(&mut self, mut w: W, element: &elements::Element) -> Result<()> {
Ok(if self.in_heading == true {
let mut buf = InternalWriter::new();
            self.inner.start(&mut buf, element).ok(); // errors from the inner handler are ignored here.
let s = buf.to_utf8().unwrap();
write!(w, "{}", s)?
})
}
fn end_rest<W: Write>(&mut self, mut w: W, element: &elements::Element) -> Result<()> {
if self.in_heading == true {
let mut buf = InternalWriter::new();
self.inner.end(&mut buf, element).ok();
let s = buf.to_utf8().unwrap();
write!(w, "{}", s)?
}
Ok(())
}
}
#+end_src
**** Strip Links from Strings
#+begin_src rust :tangle src/export_atom.rs
fn strip_links_from_str(in_str: &str) -> Result<String> {
// title.raw.replace("[", "&#91;").replace("]", "&#92;"),
    let re = regex::Regex::new(r"\[\[(?<wrapped_the_link>[^\]]*)\]\[(?<text>[^\]]*)\]\]")?;
Ok(re.replace_all(in_str, "$text").to_string())
}
#+end_src
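The intent is that an org link collapses down to its description text. A self-contained check of that behavior, with an invented sample sentence:
#+begin_src rust
use regex::Regex;

// Intended behavior: [[target][description]] collapses to just "description".
// The input sentence is made up for the example.
fn main() {
    let re = Regex::new(r"\[\[(?<target>[^\]]*)\]\[(?<text>[^\]]*)\]\]").unwrap();
    let input = "See [[id:20240116T235328.441922][the parser notes]] for details";
    assert_eq!(re.replace_all(input, "$text"), "See the parser notes for details");
}
#+end_src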
**** Convert my org-style timestamps to RFC-3339 strings
#+begin_src rust :tangle src/export_atom.rs
fn rfcize_datestamp(in_str: String) -> Result<String> {
let re = regex::Regex::new(
r"<?(?<year>\d{4})-(?<month>\d{2})-(?<day>\d{2}) \w+ (?<hour>\d{2}):(?<minutes>\d{2})>?",
)?;
let date: Option<String> = re
.captures_iter(&in_str)
.map(|caps| {
let year = caps.name("year").unwrap().as_str();
let month = caps.name("month").unwrap().as_str();
let day = caps.name("day").unwrap().as_str();
let hour = caps.name("hour").unwrap().as_str();
let minutes = caps.name("minutes").unwrap().as_str();
let ret: String = format!("{}-{}-{}T{}:{}:00-08:00", year, month, day, hour, minutes);
ret
})
.next();
Ok(date.ok_or(Error::new(std::io::ErrorKind::Other, "invalid date"))?)
}
#+end_src
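A quick check of the conversion; the offset is hard-coded to =-08:00= above, so that's what comes back out:
#+begin_src rust
use anyhow::Result;

// The input stamp is the org-style format used in this file; the date itself
// is just an example.
fn main() -> Result<()> {
    let rfc = rfcize_datestamp("<2024-01-17 Wed 00:22>".to_string())?;
    assert_eq!(rfc, "2024-01-17T00:22:00-08:00");
    Ok(())
}
#+end_src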
**** Internal Buffer Writer for Escaping Entities
I implemented a really simple/dumb =Write= implementation that will =HtmlEscape= anything written to it. I should make this take a String under the hood instead of a =Vec<u8>=, but meh, it's good enough for now.
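The gist of it, as a tiny self-contained sketch with made-up names:
#+begin_src rust
use std::io::Write;

use orgize::export::HtmlEscape;

// Sketch only: an io::Write that HTML-escapes whatever is written through it
// into an internal byte buffer.
struct EscapingBuffer {
    inner: Vec<u8>,
}

impl Write for EscapingBuffer {
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        // treat the incoming bytes as UTF-8 text, escape it, and buffer the result
        let escaped = HtmlEscape(String::from_utf8_lossy(buf)).to_string();
        self.inner.extend_from_slice(escaped.as_bytes());
        Ok(buf.len())
    }

    fn flush(&mut self) -> std::io::Result<()> {
        Ok(())
    }
}
#+end_src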
#+begin_src rust :tangle src/export_atom.rs
struct InternalWriter {
inner: Vec<u8>,
@ -1342,7 +1490,6 @@ impl InternalWriter {
pub fn new() -> InternalWriter {
return InternalWriter { inner: Vec::new() };
}
}
impl Write for &mut InternalWriter {

View File

@ -1,11 +1,12 @@
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:1]]
use anyhow::Result;
use regex;
use std::fs;
use std::io::{Error, Write};
use std::marker::PhantomData;
use orgize::export::{DefaultHtmlHandler, HtmlEscape, HtmlHandler, SyntectHtmlHandler};
use orgize::{Element, Org};
use orgize::{elements, Element, Org};
use crate::export_html::ArroyoHtmlHandler;
use crate::export_html::ExportOptions;
@ -13,23 +14,27 @@ use crate::export_html::ExportOptions;
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:2]]
pub struct ArroyoAtomHandler<E: From<Error>, H: HtmlHandler<E>> {
// pub options: ExportOptions,
pub inner: H,
pub options: ExportOptions,
pub inner: ArroyoHtmlHandler<E, H>,
pub error_type: PhantomData<E>,
pub in_heading: bool,
// internal parser state
in_heading: bool,
in_drawer: bool,
heading_lvl: usize,
// Document metadata placed in feed
pub filetags: Vec<String>,
pub authors: Vec<String>,
pub feed_title: String,
pub last_date: String,
}
impl<E: From<Error>, H: HtmlHandler<E>> ArroyoAtomHandler<E, H> {
// pub fn new(options: ExportOptions, inner: H) -> Self {
// ArroyoHtmlHandler {
// inner,
// options,
// ..Default::default()
// }
// }
pub fn new(inner: H) -> Self {
pub fn new(options: ExportOptions, inner: ArroyoHtmlHandler<E, H>) -> Self {
ArroyoAtomHandler {
inner,
options,
..Default::default()
}
}
@ -38,10 +43,18 @@ impl<E: From<Error>, H: HtmlHandler<E>> ArroyoAtomHandler<E, H> {
impl<E: From<Error>, H: HtmlHandler<E>> Default for ArroyoAtomHandler<E, H> {
fn default() -> Self {
ArroyoAtomHandler {
inner: H::default(),
inner: ArroyoHtmlHandler::default(),
error_type: PhantomData,
// options: ExportOptions::default(),
options: ExportOptions::default(),
in_heading: false,
in_drawer: false,
heading_lvl: 0,
last_date: "".into(),
feed_title: "".into(),
filetags: vec![],
authors: vec![],
}
}
}
@ -50,8 +63,8 @@ impl<E: From<Error>, H: HtmlHandler<E>> Default for ArroyoAtomHandler<E, H> {
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:3]]
pub fn atomize_file(path: String, options: ExportOptions) -> Result<String> {
let syntect_handler = SyntectHtmlHandler::new(DefaultHtmlHandler);
let html_handler = ArroyoHtmlHandler::new(options, syntect_handler);
let mut handler = ArroyoAtomHandler::new(html_handler);
let html_handler = ArroyoHtmlHandler::new(options.clone(), syntect_handler);
let mut handler = ArroyoAtomHandler::new(options.clone(), html_handler);
let org = String::from_utf8(fs::read(path.clone())?).unwrap();
let org_tree = &Org::parse_custom(
@ -79,129 +92,249 @@ pub fn atomize_file(path: String, options: ExportOptions) -> Result<String> {
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:4]]
impl<E: From<Error>, H: HtmlHandler<E>> HtmlHandler<E> for ArroyoAtomHandler<E, H> {
fn start<W: Write>(&mut self, mut w: W, element: &Element) -> Result<(), E> {
match element {
(match dbg!(element) {
// First Pass:4 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:5]]
Element::Document { .. } => {
write!(
w,
"<?xml version=\"1.0\" encoding=\"utf-8\"?>
<feed xmlns=\"http://www.w3.org/2005/Atom\">
<title>Example Feed</title>
<link href=\"http://example.org/\"/>
<updated>2003-12-13T18:30:02Z</updated>
<author>
<name>John Doe</name>
</author>
<id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>\n"
)?;
}
Element::Document { .. } => self.start_document(w, element),
// First Pass:5 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:6]]
Element::Title(title) => {
// !!!
let the_link = "";
// !!!
let ignore_tags = vec![
String::from("noexport"),
String::from("NOEXPORT"),
String::from("ignore"),
];
let export_tags = title
.tags
.clone()
.into_iter()
.map(String::from)
.find(|v| ignore_tags.contains(v));
let id = title
.properties
.clone()
.into_iter()
.find(|(k, _v)| k == "ID")
.map(|(_k, v)| String::from(v))
.unwrap_or("".to_string());
let pubdate = title
.properties
.clone()
.into_iter()
.find(|(k, _v)| k == "PUBDATE")
.map(|(_k, v)| String::from(v))
.unwrap_or("".to_string());
if id == "" || pubdate == "" || export_tags.is_some() {
self.in_heading = false;
} else if id != "" && pubdate != "" && export_tags.is_none() {
if self.in_heading == true {
write!(w, "</content>\n")?;
write!(w, "</entry>\n")?;
}
let s = format!(
"<entry>
<title>{}</title>
<link href=\"{}\"/>
<id>urn:roam:{}</id>
<updated>{}</updated>\n
<content type=\"html\">&lt;h{}&gt;",
title.raw,
the_link,
id,
HtmlEscape(pubdate),
title.level,
);
self.in_heading = true;
write!(w, "{}", s)?
}
}
Element::Keyword(kw) => self.start_keyword(w, kw),
// First Pass:6 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:7]]
_t => {
if self.in_heading == true {
let mut buf = InternalWriter::new();
self.inner.start(&mut buf, element)?;
let s = buf.to_utf8().unwrap();
write!(w, "{}", s)?
}
}
}
Element::Title(title) => self.start_title(w, title),
// First Pass:7 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:8]]
Element::Text { value } => self.start_text(w, value),
// First Pass:8 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:9]]
_t => self.start_rest(w, element),
})
.unwrap(); // if we can't parse something, just fucken panic.
// First Pass:9 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:10]]
Ok(())
}
fn end<W: Write>(&mut self, mut w: W, element: &Element) -> Result<(), E> {
match element {
// First Pass:8 ends here
(match element {
// First Pass:10 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:9]]
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:11]]
// Element::Title(_title) => {}
Element::Document { .. } => {
Element::Document { .. } => self.end_document(w, element),
// First Pass:11 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:12]]
_ => self.end_rest(w, element),
})
.ok();
Ok(())
}
}
// First Pass:12 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:13]]
impl<E: From<Error>, H: HtmlHandler<E>> ArroyoAtomHandler<E, H> {
// First Pass:13 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:14]]
fn start_document<W: Write>(&mut self, mut w: W, _document: &elements::Element) -> Result<()> {
Ok(write!(
w,
"<?xml version=\"1.0\" encoding=\"utf-8\"?>
<feed xmlns=\"http://www.w3.org/2005/Atom\">
<link href=\"http://example.org/\"/>
<id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>\n"
)?)
}
fn end_document<W: Write>(&mut self, mut w: W, _document: &elements::Element) -> Result<()> {
// the last heading/entry is still "open", close it.
write!(
w,
r#" </content>
</entry>
<updated>{}</updated>
</feed>"#,
self.last_date,
)?;
Ok(())
}
// First Pass:14 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:15]]
fn start_title<W: Write>(&mut self, mut w: W, title: &elements::Title) -> Result<()> {
let ignore_tags = vec![
String::from("noexport"),
String::from("NOEXPORT"),
String::from("ignore"),
];
let export_tags = title
.tags
.clone()
.into_iter()
.map(String::from)
.find(|v| ignore_tags.contains(v));
let props = title.properties.clone().into_hash_map();
let id = props
.get("ID")
.map(|id| id.clone().into())
.unwrap_or("".to_string());
let pubdate = props
.get("PUBDATE")
.map(|pubdate| pubdate.clone().into())
.unwrap_or("".to_string());
let the_link = self.inner.rewrite_link_from(&id);
if id == "" || pubdate == "" || export_tags.is_some() && self.in_heading {
self.in_heading = false;
Ok(())
} else if id != "" && pubdate != "" && export_tags.is_none() {
// close previous heading; note that self.heading_lvl defaults to 0
if title.level <= self.heading_lvl {
write!(w, "</content>\n")?;
write!(w, "</entry>\n")?;
write!(w, "</feed>")?;
}
// First Pass:9 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:10]]
_ => {
if self.in_heading == true {
let mut buf = InternalWriter::new();
self.inner.end(&mut buf, element)?;
let s = buf.to_utf8().unwrap();
write!(w, "{}", s)?
let date = match rfcize_datestamp(pubdate.clone()) {
Ok(date) => date,
Err(_) => {
dbg!(format!("bad date {}", pubdate.clone()));
HtmlEscape(pubdate.clone()).to_string()
}
};
if self.last_date < date {
self.last_date = date.clone();
}
let title_text = match strip_links_from_str(&title.raw.clone()) {
Ok(text) => HtmlEscape(text),
Err(the_err) => {
dbg!(format!("bad title {} {}", title.raw.clone(), the_err));
HtmlEscape(title.raw.to_string())
}
};
let cat_xmls = "";
let s = format!(
"<entry>
<title>{}</title>
<link href=\"{}\"/>
<id>{}</id>
<updated>{}</updated>\n
{}
<content type=\"html\">&lt;h{}&gt;", // the HTML encoded heading opening needs to be added here!
title_text,
the_link.to_string(), // link
the_link.to_string(), // ID
date,
cat_xmls,
title.level,
);
self.heading_lvl = title.level;
self.in_heading = true;
Ok(write!(w, "{}", s)?)
} else {
Ok(())
}
}
// First Pass:15 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:16]]
fn start_keyword<W: Write>(&mut self, mut w: W, kw: &elements::Keyword) -> Result<()> {
// dbg!(kw);
match kw.key.as_ref() {
"FILETAGS" => {
kw.value
.split(":")
.map(String::from)
.filter(|s| !s.is_empty())
.for_each(|s| self.filetags.push(s));
dbg!(&self.filetags);
}
"TITLE" => {
self.feed_title = kw.value.to_string();
write!(w, r#"<title>{}</title>"#, self.feed_title)?;
}
"AUTHOR" => {
let re = regex::Regex::new(r"(?<name>[\w\s\d]+) <(?<email>.*)>").unwrap();
re.captures_iter(&kw.value)
.map(|caps| {
format!(
"<author><name>{}</name><email>{}</email></author>",
&caps["name"], &caps["email"]
)
})
.for_each(|s| {
self.authors.push(s.clone());
write!(w, "{}", s).ok();
});
dbg!(&self.authors);
}
_ => {}
}
Ok(())
}
// First Pass:16 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:17]]
fn start_rest<W: Write>(&mut self, mut w: W, element: &elements::Element) -> Result<()> {
Ok(if self.in_heading == true {
let mut buf = InternalWriter::new();
            self.inner.start(&mut buf, element).ok(); // errors from the inner handler are ignored here.
let s = buf.to_utf8().unwrap();
write!(w, "{}", s)?
})
}
fn end_rest<W: Write>(&mut self, mut w: W, element: &elements::Element) -> Result<()> {
if self.in_heading == true {
let mut buf = InternalWriter::new();
self.inner.end(&mut buf, element).ok();
let s = buf.to_utf8().unwrap();
write!(w, "{}", s)?
}
Ok(())
}
}
// First Pass:10 ends here
// First Pass:17 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:11]]
// [[file:../arroyo-native-parser.org::*Strip Links from Strings][Strip Links from Strings:1]]
fn strip_links_from_str(in_str: &str) -> Result<String> {
// title.raw.replace("[", "&#91;").replace("]", "&#92;"),
    let re = regex::Regex::new(r"\[\[(?<wrapped_the_link>[^\]]*)\]\[(?<text>[^\]]*)\]\]")?;
Ok(re.replace_all(in_str, "$text").to_string())
}
// Strip Links from Strings:1 ends here
// [[file:../arroyo-native-parser.org::*Convert my org-style timestamps to RFC-3339 strings][Convert my org-style timestamps to RFC-3339 strings:1]]
fn rfcize_datestamp(in_str: String) -> Result<String> {
let re = regex::Regex::new(
r"<?(?<year>\d{4})-(?<month>\d{2})-(?<day>\d{2}) \w+ (?<hour>\d{2}):(?<minutes>\d{2})>?",
)?;
let date: Option<String> = re
.captures_iter(&in_str)
.map(|caps| {
let year = caps.name("year").unwrap().as_str();
let month = caps.name("month").unwrap().as_str();
let day = caps.name("day").unwrap().as_str();
let hour = caps.name("hour").unwrap().as_str();
let minutes = caps.name("minutes").unwrap().as_str();
let ret: String = format!("{}-{}-{}T{}:{}:00-08:00", year, month, day, hour, minutes);
ret
})
.next();
Ok(date.ok_or(Error::new(std::io::ErrorKind::Other, "invalid date"))?)
}
// Convert my org-style timestamps to RFC-3339 strings:1 ends here
// [[file:../arroyo-native-parser.org::*Internal Buffer Writer for Escaping Entities][Internal Buffer Writer for Escaping Entities:1]]
struct InternalWriter {
inner: Vec<u8>,
}
@ -214,7 +347,6 @@ impl InternalWriter {
pub fn new() -> InternalWriter {
return InternalWriter { inner: Vec::new() };
}
}
impl Write for &mut InternalWriter {
@ -257,4 +389,4 @@ impl Write for &mut InternalWriter {
Ok(())
}
}
// First Pass:11 ends here
// Internal Buffer Writer for Escaping Entities:1 ends here

View File

@ -62,6 +62,13 @@ impl<E: From<Error>, H: HtmlHandler<E>> ArroyoHtmlHandler<E, H> {
..Default::default()
}
}
pub fn rewrite_link_from(&self, id: &String) -> String {
match self.options.link_retargets.get(id) {
Some(path) => HtmlEscape(&path).to_string(),
_ => HtmlEscape(format!("/404?key={}", id)).to_string(),
}
}
}
impl<E: From<Error>, H: HtmlHandler<E>> Default for ArroyoHtmlHandler<E, H> {
@ -117,39 +124,20 @@ impl<E: From<Error>, H: HtmlHandler<E>> HtmlHandler<E> for ArroyoHtmlHandler<E,
Some((proto, stripped_dest)) => (proto, stripped_dest.into()),
None => ("", string_path),
};
let desc = link.desc.clone().unwrap_or(link.path.clone());
match proto {
"id" => {
let maybe_new_target = self.options.link_retargets.get(&stripped_dest);
match maybe_new_target {
Some(path) => {
let desc = link.desc.clone().unwrap_or(path.clone().into());
write!(
w,
"<a class=\"internal\" href=\"{}\">{}</a>",
HtmlEscape(&path),
HtmlEscape(&desc),
)?
}
_ => {
let desc = link.desc.clone().unwrap_or(link.path.clone());
write!(
w,
"<a href=\"/404?key={}\">{}</a>",
HtmlEscape(&link.path),
HtmlEscape(&desc),
)?
}
};
}
"roam" => {
let desc = link.desc.clone().unwrap_or(link.path.clone());
write!(
w,
"<a href=\"/404?key={}\">{}</a>",
HtmlEscape(&link.path),
HtmlEscape(&desc),
)?
}
"id" => write!(
w,
"<a href=\"{}\">{}</a>",
self.rewrite_link_from(&stripped_dest),
HtmlEscape(&desc),
)?,
"roam" => write!(
w,
"<a href=\"/404?key={}\">{}</a>",
HtmlEscape(&link.path),
HtmlEscape(&desc),
)?,
_ => self.inner.start(w, &Element::Link(link.clone()))?,
}
}