90% validating atom generator

in-progress towards atom export
2024-01-13 20:37:55 -08:00 · 2024-01-13 19:22:50 -08:00
6 changed files with 540 additions and 18 deletions
--- a/arroyo-native-parser.org
+++ b/arroyo-native-parser.org
@ -829,7 +829,7 @@ I wrote some simple unit tests for this below.

 ** The HTML exporter
 :PROPERTIES:
-:header-args:rust: :tangle src/export.rs :mkdirp yes
+:header-args:rust: :tangle src/export_html.rs :mkdirp yes
 :ID:       20240112T120813.386800
 :END:

@ -838,12 +838,12 @@ I'm not exactly looking forward to writing this lul.
 - rewrite URLs from a HashMap passed in; higher-level things can map from IDs to URLs, files to URLs, and re-write the rest to 404s.
 - re-write [[id:2e31b385-a003-4369-a136-c6b78c0917e1][org-fc]] clozes to be =<span>='s
 - drop org-fc drawers
- code highlighting by building this on top of =Orgize::export::SyntectHtmlHandler=
+- code highlighting by building this on top of =orgize::export::SyntectHtmlHandler=

 stretch:
 - tufte side-notes

-=crate::export::htmlize_file= is the public entrypoint for this functionality, below.
+=crate::export_html::htmlize_file= is the public entrypoint for this functionality, below.

 Here's the top-matter:

@ -1093,15 +1093,269 @@ The Atom exporter turns a set of org mode headings in to an Atom feed for servin

 For now maybe it is easier to assume that the headings are all in one file; that's how the existing [[id:arcology/atom-gen][Arcology Feed Generator]] behaves, you can turn a page in to an rss feed with an unholy abomination of lua and pandoc and xml templates. Surely something better can be designed now.

-the primary tension of the arroyo library now is that it is existing solely in the realm of the arcology project's design goals, and I need to start deciding whether a design goal of this library is to support non-arcology document systems.
+the primary tension of the arroyo library now is that its design context is only in the realm of the arcology project's design goals. I need to start deciding whether a design goal of this library is to support non-arcology document systems. surely interoperable but different document systems could be built on top of arroyo

-so the first pass of this API could take a file path, extract the feed metadata from keywords; it could construct an entire atom feed, falling back to the custom HTML exporter to fill out the feed with text content. That's probably fine, and an API that other document servers could work with.
+*** First Pass
+
+so the first pass of this API could take a file path, extract the feed metadata from keywords and heading properties; it could construct an entire atom feed, falling back to the custom HTML exporter to fill out the feed with text content. That's probably fine, and an API that other document servers could work with.
+
+the trick to designing this is that a lot of different shit has to be bolted together
+
+- the orgize parser has to iterate over each heading
+- each heading needs to be htmlized
+- each htmlized heading needs to be escaped
+- the htmlized headings need to be injected in to the atom doc
+
+it would be nice to do this in one pass...
+
+#+begin_src rust :tangle src/export_atom.rs
+use anyhow::Result;
+use std::fs;
+use std::io::{Error, Write};
+use std::marker::PhantomData;
+
+use orgize::export::{DefaultHtmlHandler, HtmlEscape, HtmlHandler, SyntectHtmlHandler};
+use orgize::{Element, Org};
+
+use crate::export_html::ArroyoHtmlHandler;
+use crate::export_html::ExportOptions;
+
+// sure would be nice..... some day i'll understand lifetimes enough
+// to write a function that goes path -> orgize::Org
+// use crate::parse::orgize_document;
+
+/// Export handler that renders an org document as an Atom feed.
+///
+/// It implements `HtmlHandler` itself: the document becomes the `<feed>`,
+/// qualifying heading titles open an `<entry>`, and other elements are
+/// rendered by `inner` and appended while an entry is open.
+pub struct ArroyoAtomHandler<E: From<Error>, H: HtmlHandler<E>> {
+    // pub options: ExportOptions,
+    /// Wrapped HTML handler that renders the elements inside an entry.
+    pub inner: H,
+    /// Zero-sized marker tying the handler to its error type `E`.
+    pub error_type: PhantomData<E>,
+    /// `true` once an `<entry>` has been opened; before that, non-title
+    /// elements are not rendered at all.
+    pub in_heading: bool,
+}
+
+impl<E: From<Error>, H: HtmlHandler<E>> ArroyoAtomHandler<E, H> {
+    /// Wraps `inner`, starting outside of any feed entry.
+    ///
+    /// The fields are spelled out instead of using `..Default::default()`:
+    /// the struct-update form would build a complete `H::default()` (which
+    /// can be costly for highlighter-backed handlers) only to drop it again.
+    pub fn new(inner: H) -> Self {
+        ArroyoAtomHandler {
+            inner,
+            error_type: PhantomData,
+            in_heading: false,
+        }
+    }
+}
+
+impl<E: From<Error>, H: HtmlHandler<E>> Default for ArroyoAtomHandler<E, H> {
+    /// Builds a handler around `H::default()` that has not opened any entry yet.
+    fn default() -> Self {
+        let inner = H::default();
+        Self {
+            in_heading: false,
+            error_type: PhantomData,
+            inner,
+        }
+    }
+}
+
+/// Renders the org file at `path` as an Atom feed document.
+///
+/// The file is parsed with a fixed set of TODO keywords and exported through
+/// an [`ArroyoAtomHandler`] that wraps the Arroyo HTML handler (configured
+/// with `options`) on top of the syntect highlighter.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be read, if it is not valid UTF-8, or
+/// if the export itself fails.
+pub fn atomize_file(path: String, options: ExportOptions) -> Result<String> {
+    let syntect_handler = SyntectHtmlHandler::new(DefaultHtmlHandler);
+    let html_handler = ArroyoHtmlHandler::new(options, syntect_handler);
+    let mut handler = ArroyoAtomHandler::new(html_handler);
+
+    // `read_to_string` reports invalid UTF-8 as an `Err` instead of the panic
+    // an `unwrap()` on `String::from_utf8` would cause, and needs no clone.
+    let org = fs::read_to_string(&path)?;
+    let org_tree = &Org::parse_custom(
+        &org,
+        &orgize::ParseConfig {
+            // Need to pull these from environment or options...
+            todo_keywords: (
+                vec![
+                    "NEXT".to_string(),
+                    "INPROGRESS".to_string(),
+                    "WAITING".to_string(),
+                ],
+                vec!["DONE".to_string(), "CANCELLED".to_string()],
+            ),
+            ..Default::default()
+        },
+    );
+
+    let mut feed = Vec::new();
+    org_tree.write_html_custom(&mut feed, &mut handler)?;
+    Ok(String::from_utf8(feed)?)
+}
+
+/// Decodes the markup captured in `buf` as a `String`.
+///
+/// A buffer that is not valid UTF-8 is reported as an `InvalidData` I/O
+/// error rather than a panic, so callers can propagate it with `?`.
+fn buffered_markup(buf: InternalWriter) -> std::io::Result<String> {
+    buf.to_utf8()
+        .map_err(|err| Error::new(std::io::ErrorKind::InvalidData, err))
+}
+
+impl<E: From<Error>, H: HtmlHandler<E>> HtmlHandler<E> for ArroyoAtomHandler<E, H> {
+    /// Handles the opening of `element`.
+    ///
+    /// - `Document` writes the XML prolog and the feed header (the metadata
+    ///   is still a hard-coded placeholder).
+    /// - `Title` opens a new `<entry>` when the heading has a non-empty `ID`
+    ///   and `PUBDATE` property and none of the ignored tags; a previously
+    ///   opened entry is closed first.
+    /// - Any other element is rendered by the inner handler into a buffer
+    ///   and appended, but only once an entry is open.
+    ///
+    /// NOTE(review): a heading that does not qualify leaves `in_heading`
+    /// untouched, so its body is appended to the previously opened entry —
+    /// confirm that this is intended for `noexport` headings.
+    fn start<W: Write>(&mut self, mut w: W, element: &Element) -> Result<(), E> {
+        // headings carrying any of these tags never start an entry
+        const IGNORED_TAGS: [&str; 3] = ["noexport", "NOEXPORT", "ignore"];
+
+        match element {
+            Element::Document { .. } => {
+                write!(
+                    w,
+                    "<?xml version=\"1.0\" encoding=\"utf-8\"?>
+                     <feed xmlns=\"http://www.w3.org/2005/Atom\">
+                       <title>Example Feed</title>
+                       <link href=\"http://example.org/\"/>
+                       <updated>2003-12-13T18:30:02Z</updated>
+                       <author>
+                         <name>John Doe</name>
+                       </author>
+                       <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>\n"
+                )?;
+            }
+            Element::Title(title) => {
+                // placeholder until entries can link back to their page
+                let the_link = "";
+
+                let skip = title
+                    .tags
+                    .iter()
+                    .any(|tag| IGNORED_TAGS.iter().any(|ignored| tag == ignored));
+                // value of a heading property, or "" when it is not set
+                let property = |name: &str| {
+                    title
+                        .properties
+                        .clone()
+                        .into_iter()
+                        .find(|(k, _v)| k == name)
+                        .map(|(_k, v)| String::from(v))
+                        .unwrap_or_default()
+                };
+                let id = property("ID");
+                let pubdate = property("PUBDATE");
+
+                if !id.is_empty() && !pubdate.is_empty() && !skip {
+                    if self.in_heading {
+                        write!(w, "</content>\n")?;
+                        write!(w, "</entry>\n")?;
+                    }
+                    self.in_heading = true;
+                    // every interpolated value is escaped so that markup
+                    // characters in a title or ID cannot break the XML
+                    write!(
+                        w,
+                        "<entry>
+                           <title>{}</title>
+                           <link href=\"{}\"/>
+                           <id>urn:{}</id>
+                           <updated>{}</updated>\n
+
+                           <content type=\"html\">",
+                        HtmlEscape(&title.raw),
+                        HtmlEscape(the_link),
+                        HtmlEscape(&id),
+                        HtmlEscape(&pubdate)
+                    )?;
+                }
+            }
+            _ => {
+                if self.in_heading {
+                    let mut buf = InternalWriter::new();
+                    self.inner.start(&mut buf, element)?;
+                    write!(w, "{}", buffered_markup(buf)?)?;
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Handles the closing of `element`.
+    ///
+    /// - `Title` writes nothing.
+    /// - `Document` closes the entry that is still open (if any) and then
+    ///   the feed; without an open entry only `</feed>` is written, so a
+    ///   document with no qualifying heading still yields well-formed XML.
+    /// - Any other element is closed by the inner handler and appended,
+    ///   but only while an entry is open.
+    fn end<W: Write>(&mut self, mut w: W, element: &Element) -> Result<(), E> {
+        match element {
+            Element::Title(_title) => {}
+            Element::Document { .. } => {
+                if self.in_heading {
+                    write!(w, "</content>\n")?;
+                    write!(w, "</entry>\n")?;
+                    self.in_heading = false;
+                }
+                write!(w, "</feed>")?;
+            }
+            _ => {
+                if self.in_heading {
+                    let mut buf = InternalWriter::new();
+                    self.inner.end(&mut buf, element)?;
+                    write!(w, "{}", buffered_markup(buf)?)?;
+                }
+            }
+        }
+        Ok(())
+    }
+}
+
+/// In-memory byte buffer that the inner handler's output is captured into
+/// before it is copied into the feed.
+struct InternalWriter {
+    /// Bytes written so far.
+    inner: Vec<u8>,
+}
+
+impl InternalWriter {
+    /// Consumes the writer and decodes its buffer as UTF-8.
+    pub fn to_utf8(self) -> Result<String> {
+        let text = String::from_utf8(self.inner)?;
+        Ok(text)
+    }
+
+    /// Creates a writer with an empty buffer.
+    pub fn new() -> InternalWriter {
+        InternalWriter { inner: Vec::new() }
+    }
+}
+
+/// `Write` for a borrowed [`InternalWriter`], so that `&mut buf` can be
+/// handed to a handler as its output sink.
+///
+/// Formatted output (`write_fmt`, i.e. the `write!` macro) is HTML-escaped
+/// before it is buffered; bytes passed to `write`, `write_all` and
+/// `write_vectored` are buffered unchanged.
+///
+/// NOTE(review): confirm that escaping only the formatted path is intended.
+impl Write for &mut InternalWriter {
+    #[inline]
+    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+        // Stored verbatim: a UTF-8 round trip here would panic on a chunk
+        // that ends in the middle of a multi-byte character.
+        self.inner.write(buf)
+    }
+
+    #[inline]
+    fn write_vectored(&mut self, bufs: &[std::io::IoSlice<'_>]) -> std::io::Result<usize> {
+        self.inner.write_vectored(bufs)
+    }
+
+    #[inline]
+    fn flush(&mut self) -> std::io::Result<()> {
+        self.inner.flush()
+    }
+
+    #[inline]
+    fn write_all(&mut self, buf: &[u8]) -> std::io::Result<()> {
+        self.inner.write_all(buf)
+    }
+
+    #[inline]
+    fn write_fmt(&mut self, fmt: std::fmt::Arguments<'_>) -> std::io::Result<()> {
+        let rendered = fmt.to_string();
+        let escaped = HtmlEscape(rendered).to_string();
+        // `write_all` rather than `write`, so a short write cannot be
+        // silently ignored
+        self.inner.write_all(escaped.as_bytes())
+    }
+}
+#+end_src
+
+*** Second API

 there's a step further on, where an API takes a list of headings and feed metadata, and it parses each heading and its subheadings to HTML, *which is an API I already want to provide to document systems*. it could take arbitrary document headings provided through the public interface, and construct multi-page feeds.

 or we could just clobber together a version of [[https://github.com/tanrax/RSSingle][RSSingle]]; [[id:personal_software_can_be_shitty][Personal Software Can Be Shitty]].

-way out there: how do feed readers behave if the "feed" is just the linearized document with updated-at and whatnot applied to it? The feed would send the entire page with each update, but what if each heading could then be processed in to a diff or summary of changes?
+*** Future API
+
+way out there: how do feed readers behave if the "feed" is just the linearized document with updated-at and whatnot applied to it? The feed would send the entire page with each update, but what if each heading could then be processed in to a diff or summary of changes? how could i possibly do that well, anyhow?

 ** Library definition and exports for the Python library
 :PROPERTIES:
@ -1114,7 +1368,8 @@ way out there: how do feed readers behave if the "feed" is just the linearized d
 use pyo3::prelude::*;

 pub mod parse;
-pub mod export;
+pub mod export_html;
+pub mod export_atom;
 pub mod types;

 #[pymodule]
@ -1125,15 +1380,20 @@ fn arroyo_rs(py: Python, m: &PyModule) -> PyResult<()> {
    }

    #[pyfn(m)]
-    fn htmlize_file(path: String, options: export::ExportOptions) -> PyResult<String> {
-        Ok(export::htmlize_file(path, options)?)
+    fn htmlize_file(path: String, options: export_html::ExportOptions) -> PyResult<String> {
+        Ok(export_html::htmlize_file(path, options)?)
+    }
+
+    #[pyfn(m)]
+    fn atomize_file(path: String, options: export_html::ExportOptions) -> PyResult<String> {
+        Ok(export_atom::atomize_file(path, options)?)
    }

    m.add_class::<types::Document>()?;
    m.add_class::<types::Heading>()?;
    m.add_class::<types::Keyword>()?;
    m.add_class::<types::Link>()?;
-    m.add_class::<export::ExportOptions>()?;
+    m.add_class::<export_html::ExportOptions>()?;

    m.add("InvalidDocError", py.get_type::<types::InvalidDocError>())?;

@ -1285,7 +1545,7 @@ Stub package interface

 #+begin_src python :tangle arroyo/__init__.py :mkdirp yes
 from .arroyo_rs import parse_file, InvalidDocError
-from .arroyo_rs import htmlize_file, ExportOptions
+from .arroyo_rs import atomize_file, htmlize_file, ExportOptions
 #+end_src

 ** Click command wrapper
@ -1304,7 +1564,7 @@ import click
 import glob

 # from . import persist_one_file
-from .arroyo_rs import htmlize_file, ExportOptions
+from .arroyo_rs import atomize_file, htmlize_file, ExportOptions
 # from . import models
 # from sqlmodel import Session
 #+end_src
@ -1348,6 +1608,15 @@ def export_document(file):
    link_retargets = {"currently_reading": "https://rix.si/hello-world"}
  )
  print(htmlize_file(file, options))
+
+# CLI command: render the org file passed via --file/-f as an Atom feed
+# and print the result to stdout.
+@cli.command()
+@click.option("--file", "-f", help="The file to export")
+def atomize_document(file):
+  # in The Real World this is loaded from DB and generated.
+  # (the retarget map below is a hard-coded stand-in for that lookup)
+  options = ExportOptions(
+    link_retargets = {"currently_reading": "https://rix.si/hello-world"}
+  )
+  print(atomize_file(file, options))
 #+end_src

 #+begin_src python
--- a/arroyo/__init__.py
+++ b/arroyo/__init__.py
@ -1,4 +1,4 @@
 # [[file:../arroyo-native-parser.org::*Python Package][Python Package:1]]
 from .arroyo_rs import parse_file, InvalidDocError
-from .arroyo_rs import htmlize_file, ExportOptions
+from .arroyo_rs import atomize_file, htmlize_file, ExportOptions
 # Python Package:1 ends here
--- a/arroyo/main.py
+++ b/arroyo/main.py
@ -4,7 +4,7 @@ import click
 import glob

 # from . import persist_one_file
-from .arroyo_rs import htmlize_file, ExportOptions
+from .arroyo_rs import atomize_file, htmlize_file, ExportOptions
 # from . import models
 # from sqlmodel import Session
 # Click command wrapper:1 ends here
@ -46,6 +46,15 @@ def export_document(file):
    link_retargets = {"currently_reading": "https://rix.si/hello-world"}
  )
  print(htmlize_file(file, options))
+
+# CLI command: render the org file passed via --file/-f as an Atom feed
+# and print the result to stdout.
+@cli.command()
+@click.option("--file", "-f", help="The file to export")
+def atomize_document(file):
+  # in The Real World this is loaded from DB and generated.
+  # (the retarget map below is a hard-coded stand-in for that lookup)
+  options = ExportOptions(
+    link_retargets = {"currently_reading": "https://rix.si/hello-world"}
+  )
+  print(atomize_file(file, options))
 # Click command wrapper:2 ends here

 # [[file:../arroyo-native-parser.org::*Click command wrapper][Click command wrapper:3]]
--- a/src/export_atom.rs
+++ b/src/export_atom.rs
@ -0,0 +1,238 @@
+// [[file:../arroyo-native-parser.org::*First Pass][First Pass:1]]
+use anyhow::Result;
+use std::fs;
+use std::io::{Error, Write};
+use std::marker::PhantomData;
+
+use orgize::export::{DefaultHtmlHandler, HtmlEscape, HtmlHandler, SyntectHtmlHandler};
+use orgize::{Element, Org};
+
+use crate::export_html::ArroyoHtmlHandler;
+use crate::export_html::ExportOptions;
+
+// sure would be nice..... some day i'll understand lifetimes enough
+// to write a function that goes path -> orgize::Org
+// use crate::parse::orgize_document;
+
+/// Export handler that renders an org document as an Atom feed.
+///
+/// It implements `HtmlHandler` itself: the document becomes the `<feed>`,
+/// qualifying heading titles open an `<entry>`, and other elements are
+/// rendered by `inner` and appended while an entry is open.
+pub struct ArroyoAtomHandler<E: From<Error>, H: HtmlHandler<E>> {
+    // pub options: ExportOptions,
+    /// Wrapped HTML handler that renders the elements inside an entry.
+    pub inner: H,
+    /// Zero-sized marker tying the handler to its error type `E`.
+    pub error_type: PhantomData<E>,
+    /// `true` once an `<entry>` has been opened; before that, non-title
+    /// elements are not rendered at all.
+    pub in_heading: bool,
+}
+
+impl<E: From<Error>, H: HtmlHandler<E>> ArroyoAtomHandler<E, H> {
+    /// Wraps `inner`, starting outside of any feed entry.
+    ///
+    /// The fields are spelled out instead of using `..Default::default()`:
+    /// the struct-update form would build a complete `H::default()` (which
+    /// can be costly for highlighter-backed handlers) only to drop it again.
+    pub fn new(inner: H) -> Self {
+        ArroyoAtomHandler {
+            inner,
+            error_type: PhantomData,
+            in_heading: false,
+        }
+    }
+}
+
+impl<E: From<Error>, H: HtmlHandler<E>> Default for ArroyoAtomHandler<E, H> {
+    /// Builds a handler around `H::default()` that has not opened any entry yet.
+    fn default() -> Self {
+        let inner = H::default();
+        Self {
+            in_heading: false,
+            error_type: PhantomData,
+            inner,
+        }
+    }
+}
+
+/// Renders the org file at `path` as an Atom feed document.
+///
+/// The file is parsed with a fixed set of TODO keywords and exported through
+/// an [`ArroyoAtomHandler`] that wraps the Arroyo HTML handler (configured
+/// with `options`) on top of the syntect highlighter.
+///
+/// # Errors
+///
+/// Returns an error if the file cannot be read, if it is not valid UTF-8, or
+/// if the export itself fails.
+pub fn atomize_file(path: String, options: ExportOptions) -> Result<String> {
+    let syntect_handler = SyntectHtmlHandler::new(DefaultHtmlHandler);
+    let html_handler = ArroyoHtmlHandler::new(options, syntect_handler);
+    let mut handler = ArroyoAtomHandler::new(html_handler);
+
+    // `read_to_string` reports invalid UTF-8 as an `Err` instead of the panic
+    // an `unwrap()` on `String::from_utf8` would cause, and needs no clone.
+    let org = fs::read_to_string(&path)?;
+    let org_tree = &Org::parse_custom(
+        &org,
+        &orgize::ParseConfig {
+            // Need to pull these from environment or options...
+            todo_keywords: (
+                vec![
+                    "NEXT".to_string(),
+                    "INPROGRESS".to_string(),
+                    "WAITING".to_string(),
+                ],
+                vec!["DONE".to_string(), "CANCELLED".to_string()],
+            ),
+            ..Default::default()
+        },
+    );
+
+    let mut feed = Vec::new();
+    org_tree.write_html_custom(&mut feed, &mut handler)?;
+    Ok(String::from_utf8(feed)?)
+}
+
+/// Decodes the markup captured in `buf` as a `String`.
+///
+/// A buffer that is not valid UTF-8 is reported as an `InvalidData` I/O
+/// error rather than a panic, so callers can propagate it with `?`.
+fn buffered_markup(buf: InternalWriter) -> std::io::Result<String> {
+    buf.to_utf8()
+        .map_err(|err| Error::new(std::io::ErrorKind::InvalidData, err))
+}
+
+impl<E: From<Error>, H: HtmlHandler<E>> HtmlHandler<E> for ArroyoAtomHandler<E, H> {
+    /// Handles the opening of `element`.
+    ///
+    /// - `Document` writes the XML prolog and the feed header (the metadata
+    ///   is still a hard-coded placeholder).
+    /// - `Title` opens a new `<entry>` when the heading has a non-empty `ID`
+    ///   and `PUBDATE` property and none of the ignored tags; a previously
+    ///   opened entry is closed first.
+    /// - Any other element is rendered by the inner handler into a buffer
+    ///   and appended, but only once an entry is open.
+    ///
+    /// NOTE(review): a heading that does not qualify leaves `in_heading`
+    /// untouched, so its body is appended to the previously opened entry —
+    /// confirm that this is intended for `noexport` headings.
+    fn start<W: Write>(&mut self, mut w: W, element: &Element) -> Result<(), E> {
+        // headings carrying any of these tags never start an entry
+        const IGNORED_TAGS: [&str; 3] = ["noexport", "NOEXPORT", "ignore"];
+
+        match element {
+            Element::Document { .. } => {
+                write!(
+                    w,
+                    "<?xml version=\"1.0\" encoding=\"utf-8\"?>
+                     <feed xmlns=\"http://www.w3.org/2005/Atom\">
+                       <title>Example Feed</title>
+                       <link href=\"http://example.org/\"/>
+                       <updated>2003-12-13T18:30:02Z</updated>
+                       <author>
+                         <name>John Doe</name>
+                       </author>
+                       <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>\n"
+                )?;
+            }
+            Element::Title(title) => {
+                // placeholder until entries can link back to their page
+                let the_link = "";
+
+                let skip = title
+                    .tags
+                    .iter()
+                    .any(|tag| IGNORED_TAGS.iter().any(|ignored| tag == ignored));
+                // value of a heading property, or "" when it is not set
+                let property = |name: &str| {
+                    title
+                        .properties
+                        .clone()
+                        .into_iter()
+                        .find(|(k, _v)| k == name)
+                        .map(|(_k, v)| String::from(v))
+                        .unwrap_or_default()
+                };
+                let id = property("ID");
+                let pubdate = property("PUBDATE");
+
+                if !id.is_empty() && !pubdate.is_empty() && !skip {
+                    if self.in_heading {
+                        write!(w, "</content>\n")?;
+                        write!(w, "</entry>\n")?;
+                    }
+                    self.in_heading = true;
+                    // every interpolated value is escaped so that markup
+                    // characters in a title or ID cannot break the XML
+                    write!(
+                        w,
+                        "<entry>
+                           <title>{}</title>
+                           <link href=\"{}\"/>
+                           <id>urn:{}</id>
+                           <updated>{}</updated>\n
+
+                           <content type=\"html\">",
+                        HtmlEscape(&title.raw),
+                        HtmlEscape(the_link),
+                        HtmlEscape(&id),
+                        HtmlEscape(&pubdate)
+                    )?;
+                }
+            }
+            _ => {
+                if self.in_heading {
+                    let mut buf = InternalWriter::new();
+                    self.inner.start(&mut buf, element)?;
+                    write!(w, "{}", buffered_markup(buf)?)?;
+                }
+            }
+        }
+        Ok(())
+    }
+
+    /// Handles the closing of `element`.
+    ///
+    /// - `Title` writes nothing.
+    /// - `Document` closes the entry that is still open (if any) and then
+    ///   the feed; without an open entry only `</feed>` is written, so a
+    ///   document with no qualifying heading still yields well-formed XML.
+    /// - Any other element is closed by the inner handler and appended,
+    ///   but only while an entry is open.
+    fn end<W: Write>(&mut self, mut w: W, element: &Element) -> Result<(), E> {
+        match element {
+            Element::Title(_title) => {}
+            Element::Document { .. } => {
+                if self.in_heading {
+                    write!(w, "</content>\n")?;
+                    write!(w, "</entry>\n")?;
+                    self.in_heading = false;
+                }
+                write!(w, "</feed>")?;
+            }
+            _ => {
+                if self.in_heading {
+                    let mut buf = InternalWriter::new();
+                    self.inner.end(&mut buf, element)?;
+                    write!(w, "{}", buffered_markup(buf)?)?;
+                }
+            }
+        }
+        Ok(())
+    }
+}
+
+/// In-memory byte buffer that the inner handler's output is captured into
+/// before it is copied into the feed.
+struct InternalWriter {
+    /// Bytes written so far.
+    inner: Vec<u8>,
+}
+
+impl InternalWriter {
+    /// Consumes the writer and decodes its buffer as UTF-8.
+    pub fn to_utf8(self) -> Result<String> {
+        let text = String::from_utf8(self.inner)?;
+        Ok(text)
+    }
+
+    /// Creates a writer with an empty buffer.
+    pub fn new() -> InternalWriter {
+        InternalWriter { inner: Vec::new() }
+    }
+}
+
+/// `Write` for a borrowed [`InternalWriter`], so that `&mut buf` can be
+/// handed to a handler as its output sink.
+///
+/// Formatted output (`write_fmt`, i.e. the `write!` macro) is HTML-escaped
+/// before it is buffered; bytes passed to `write`, `write_all` and
+/// `write_vectored` are buffered unchanged.
+///
+/// NOTE(review): confirm that escaping only the formatted path is intended.
+impl Write for &mut InternalWriter {
+    #[inline]
+    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
+        // Stored verbatim: a UTF-8 round trip here would panic on a chunk
+        // that ends in the middle of a multi-byte character.
+        self.inner.write(buf)
+    }
+
+    #[inline]
+    fn write_vectored(&mut self, bufs: &[std::io::IoSlice<'_>]) -> std::io::Result<usize> {
+        self.inner.write_vectored(bufs)
+    }
+
+    #[inline]
+    fn flush(&mut self) -> std::io::Result<()> {
+        self.inner.flush()
+    }
+
+    #[inline]
+    fn write_all(&mut self, buf: &[u8]) -> std::io::Result<()> {
+        self.inner.write_all(buf)
+    }
+
+    #[inline]
+    fn write_fmt(&mut self, fmt: std::fmt::Arguments<'_>) -> std::io::Result<()> {
+        let rendered = fmt.to_string();
+        let escaped = HtmlEscape(rendered).to_string();
+        // `write_all` rather than `write`, so a short write cannot be
+        // silently ignored
+        self.inner.write_all(escaped.as_bytes())
+    }
+}
+// First Pass:1 ends here
--- a/src/export_html.rs
+++ b/src/export_html.rs
--- a/src/lib.rs
+++ b/src/lib.rs
@ -2,7 +2,8 @@
 use pyo3::prelude::*;

 pub mod parse;
-pub mod export;
+pub mod export_html;
+pub mod export_atom;
 pub mod types;

 #[pymodule]
@ -13,15 +14,20 @@ fn arroyo_rs(py: Python, m: &PyModule) -> PyResult<()> {
    }

    #[pyfn(m)]
-    fn htmlize_file(path: String, options: export::ExportOptions) -> PyResult<String> {
-        Ok(export::htmlize_file(path, options)?)
+    fn htmlize_file(path: String, options: export_html::ExportOptions) -> PyResult<String> {
+        Ok(export_html::htmlize_file(path, options)?)
+    }
+
+    #[pyfn(m)]
+    fn atomize_file(path: String, options: export_html::ExportOptions) -> PyResult<String> {
+        Ok(export_atom::atomize_file(path, options)?)
    }

    m.add_class::<types::Document>()?;
    m.add_class::<types::Heading>()?;
    m.add_class::<types::Keyword>()?;
    m.add_class::<types::Link>()?;
-    m.add_class::<export::ExportOptions>()?;
+    m.add_class::<export_html::ExportOptions>()?;

    m.add("InvalidDocError", py.get_type::<types::InvalidDocError>())?;
Author	SHA1	Message	Date
Ryan Rix	7f19049ae7	90% validating atom generator	2024-01-13 20:37:55 -08:00
Ryan Rix	2db2dd8e15	in-progress towards atom export	2024-01-13 19:22:50 -08:00