Compare commits

...

3 Commits

Author SHA1 Message Date
Ryan Rix c7c05f2742 try to structure this functionality in to a cohesive narrative, fail. 2024-01-17 00:23:41 -08:00
Ryan Rix 6a6fbd8ecc document a todo 2024-01-17 00:22:47 -08:00
Ryan Rix 3459a29433 lift HTML link rewriting in to a public function which the Atom parser can use 2024-01-17 00:22:23 -08:00
3 changed files with 557 additions and 290 deletions

View File

@ -648,6 +648,9 @@ Event::Start(orgize::Element::Title(title)) => {
:END:
**** File-level Property Drawer parsing
:PROPERTIES:
:ID: 20240116T235328.441922
:END:
Handling the file-level properties drawer is a bit of a pain -- some day I'll roll this in to =orgize= itself so that these can be accessed via a =PropertiesMap= as in the heading parsing above, but I don't get this library well enough to do that right now.
@ -716,6 +719,8 @@ Event::End(orgize::Element::Drawer(_drawer)) => {
}
#+END_SRC
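For illustration, the drawer handling boils down to something like this standalone sketch; the helper name and shape here are made up for the example and aren't part of the tangled parser:
#+BEGIN_SRC rust
use std::collections::HashMap;

// Hypothetical helper, for illustration only: pull ":KEY: value" pairs out of
// the raw text of a file-level :PROPERTIES: drawer.
fn parse_property_drawer_text(drawer_text: &str) -> HashMap<String, String> {
    let mut props = HashMap::new();
    for line in drawer_text.lines() {
        let line = line.trim();
        // skip the :PROPERTIES: / :END: fences themselves
        if line.eq_ignore_ascii_case(":PROPERTIES:") || line.eq_ignore_ascii_case(":END:") {
            continue;
        }
        // a property line looks like ":ID: 20240116T235328.441922"
        if let Some(rest) = line.strip_prefix(':') {
            if let Some((key, value)) = rest.split_once(':') {
                props.insert(key.to_string(), value.trim().to_string());
            }
        }
    }
    props
}
#+END_SRC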
***** NEXT fix orgize to expose file-level propertiesmap
**** Link parsing
Look; I'm gonna be honest here. I don't remember why the links are stored outside the heading until the end of the document parsing. Some ownership bullshit, and the COW types, if I recall.
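The retargeting rule itself is small, though. Here's a rough sketch of what =rewrite_link_from= (below) boils down to, with a plain =HashMap= standing in for =options.link_retargets= and the =HtmlEscape= step left out; the ID and path in the example are made up:
#+BEGIN_SRC rust
use std::collections::HashMap;

// Illustration only: a known ID maps to its published path, an unknown ID
// falls through to a /404 URL carrying the ID as a query key. The real helper
// also runs the result through HtmlEscape.
fn rewrite(link_retargets: &HashMap<String, String>, id: &str) -> String {
    match link_retargets.get(id) {
        Some(path) => path.clone(),
        None => format!("/404?key={}", id),
    }
}

fn main() {
    let mut retargets = HashMap::new();
    // made-up ID and path, purely for the example
    retargets.insert("some-org-id".to_string(), "/notes/example".to_string());
    assert_eq!(rewrite(&retargets, "some-org-id"), "/notes/example");
    assert_eq!(rewrite(&retargets, "missing-id"), "/404?key=missing-id");
}
#+END_SRC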
@ -915,6 +920,13 @@ impl<E: From<Error>, H: HtmlHandler<E>> ArroyoHtmlHandler<E, H> {
..Default::default()
}
}
pub fn rewrite_link_from(&self, id: &String) -> String {
match self.options.link_retargets.get(id) {
Some(path) => HtmlEscape(&path).to_string(),
_ => HtmlEscape(format!("/404?key={}", id)).to_string(),
}
}
}
impl<E: From<Error>, H: HtmlHandler<E>> Default for ArroyoHtmlHandler<E, H> {
@ -980,39 +992,20 @@ Link exporting is going to be the most complicated part of this because it does
Some((proto, stripped_dest)) => (proto, stripped_dest.into()),
None => ("", string_path),
};
let desc = link.desc.clone().unwrap_or(link.path.clone());
match proto {
"id" => {
let maybe_new_target = self.options.link_retargets.get(&stripped_dest);
match maybe_new_target {
Some(path) => {
let desc = link.desc.clone().unwrap_or(path.clone().into());
write!(
w,
"<a class=\"internal\" href=\"{}\">{}</a>",
HtmlEscape(&path),
HtmlEscape(&desc),
)?
}
_ => {
let desc = link.desc.clone().unwrap_or(link.path.clone());
write!(
w,
"<a href=\"/404?key={}\">{}</a>",
HtmlEscape(&link.path),
HtmlEscape(&desc),
)?
}
};
}
"roam" => {
let desc = link.desc.clone().unwrap_or(link.path.clone());
write!(
w,
"<a href=\"/404?key={}\">{}</a>",
HtmlEscape(&link.path),
HtmlEscape(&desc),
)?
}
"id" => write!(
w,
"<a href=\"{}\">{}</a>",
self.rewrite_link_from(&stripped_dest),
HtmlEscape(&desc),
)?,
"roam" => write!(
w,
"<a href=\"/404?key={}\">{}</a>",
HtmlEscape(&link.path),
HtmlEscape(&desc),
)?,
_ => self.inner.start(w, &Element::Link(link.clone()))?,
}
}
@ -1112,12 +1105,13 @@ This thing is similar in many respects to the HTML Handler, and it uses it direc
#+begin_src rust :tangle src/export_atom.rs
use anyhow::Result;
use regex;
use std::fs;
use std::io::{Error, Write};
use std::marker::PhantomData;
use orgize::export::{DefaultHtmlHandler, HtmlEscape, HtmlHandler, SyntectHtmlHandler};
use orgize::{Element, Org};
use orgize::{elements, Element, Org};
use crate::export_html::ArroyoHtmlHandler;
use crate::export_html::ExportOptions;
@ -1127,23 +1121,27 @@ This is just some basic implementation definitions and junk.
#+begin_src rust :tangle src/export_atom.rs
pub struct ArroyoAtomHandler<E: From<Error>, H: HtmlHandler<E>> {
// pub options: ExportOptions,
pub inner: H,
pub options: ExportOptions,
pub inner: ArroyoHtmlHandler<E, H>,
pub error_type: PhantomData<E>,
pub in_heading: bool,
// internal parser state
in_heading: bool,
in_drawer: bool,
heading_lvl: usize,
// Document metadata placed in feed
pub filetags: Vec<String>,
pub authors: Vec<String>,
pub feed_title: String,
pub last_date: String,
}
impl<E: From<Error>, H: HtmlHandler<E>> ArroyoAtomHandler<E, H> {
// pub fn new(options: ExportOptions, inner: H) -> Self {
// ArroyoHtmlHandler {
// inner,
// options,
// ..Default::default()
// }
// }
pub fn new(inner: H) -> Self {
pub fn new(options: ExportOptions, inner: ArroyoHtmlHandler<E, H>) -> Self {
ArroyoAtomHandler {
inner,
options,
..Default::default()
}
}
@ -1152,10 +1150,18 @@ impl<E: From<Error>, H: HtmlHandler<E>> ArroyoAtomHandler<E, H> {
impl<E: From<Error>, H: HtmlHandler<E>> Default for ArroyoAtomHandler<E, H> {
fn default() -> Self {
ArroyoAtomHandler {
inner: H::default(),
inner: ArroyoHtmlHandler::default(),
error_type: PhantomData,
// options: ExportOptions::default(),
options: ExportOptions::default(),
in_heading: false,
in_drawer: false,
heading_lvl: 0,
last_date: "".into(),
feed_title: "".into(),
filetags: vec![],
authors: vec![],
}
}
}
@ -1166,8 +1172,8 @@ impl<E: From<Error>, H: HtmlHandler<E>> Default for ArroyoAtomHandler<E, H> {
#+begin_src rust :tangle src/export_atom.rs
pub fn atomize_file(path: String, options: ExportOptions) -> Result<String> {
let syntect_handler = SyntectHtmlHandler::new(DefaultHtmlHandler);
let html_handler = ArroyoHtmlHandler::new(options, syntect_handler);
let mut handler = ArroyoAtomHandler::new(html_handler);
let html_handler = ArroyoHtmlHandler::new(options.clone(), syntect_handler);
let mut handler = ArroyoAtomHandler::new(options.clone(), html_handler);
let org = String::from_utf8(fs::read(path.clone())?).unwrap();
let org_tree = &Org::parse_custom(
@ -1192,107 +1198,50 @@ pub fn atomize_file(path: String, options: ExportOptions) -> Result<String> {
}
#+end_src
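For reference, driving this end-to-end looks something like the following sketch; the path is made up, and default options mean nothing gets retargeted:
#+begin_src rust
use anyhow::Result;

use crate::export_html::ExportOptions;

// Illustrative only: with ExportOptions::default() every id: link falls
// through to the /404 handler, and the path below is invented.
fn example() -> Result<()> {
    let feed_xml = atomize_file("notes/some-page.org".to_string(), ExportOptions::default())?;
    println!("{}", feed_xml);
    Ok(())
}
#+end_src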
This is functionally the same as the Arroyo HTML Handler.
This uses the same custom org handling that the HTML exporter does, but in a worse fashion.
#+begin_src rust :tangle src/export_atom.rs
impl<E: From<Error>, H: HtmlHandler<E>> HtmlHandler<E> for ArroyoAtomHandler<E, H> {
fn start<W: Write>(&mut self, mut w: W, element: &Element) -> Result<(), E> {
match element {
(match dbg!(element) {
#+end_src
The root element contains metadata which need to be populated with data pulled from the doc, keywords, etc:
The root element contains metadata which needs to be populated with data pulled from the doc, keywords, etc. This is defined below:
#+begin_src rust :tangle src/export_atom.rs
Element::Document { .. } => {
write!(
w,
"<?xml version=\"1.0\" encoding=\"utf-8\"?>
<feed xmlns=\"http://www.w3.org/2005/Atom\">
<title>Example Feed</title>
<link href=\"http://example.org/\"/>
<updated>2003-12-13T18:30:02Z</updated>
<author>
<name>John Doe</name>
</author>
<id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>\n"
)?;
}
Element::Document { .. } => self.start_document(w, element),
#+end_src
Each title is the basis of a new heading and a new entry in the Atom document. There's still some work to do here, but it will only publish headings that have an ID and a PUBDATE property, and don't have certain tags. The basic metadata for the heading is injected in to the final document.
The title needs to know to close the previous one, this is pretty ugly, but the structure of the =Orgize= iterator don't work well for this.
=start_keyword= extracts document metadata required to populate the Atom feed.
#+begin_src rust :tangle src/export_atom.rs
Element::Title(title) => {
// !!!
let the_link = "";
// !!!
let ignore_tags = vec![
String::from("noexport"),
String::from("NOEXPORT"),
String::from("ignore"),
];
let export_tags = title
.tags
.clone()
.into_iter()
.map(String::from)
.find(|v| ignore_tags.contains(v));
let id = title
.properties
.clone()
.into_iter()
.find(|(k, _v)| k == "ID")
.map(|(_k, v)| String::from(v))
.unwrap_or("".to_string());
let pubdate = title
.properties
.clone()
.into_iter()
.find(|(k, _v)| k == "PUBDATE")
.map(|(_k, v)| String::from(v))
.unwrap_or("".to_string());
Element::Keyword(kw) => self.start_keyword(w, kw),
#+end_src
if id == "" || pubdate == "" || export_tags.is_some() {
self.in_heading = false;
} else if id != "" && pubdate != "" && export_tags.is_none() {
if self.in_heading == true {
write!(w, "</content>\n")?;
write!(w, "</entry>\n")?;
}
let s = format!(
"<entry>
<title>{}</title>
<link href=\"{}\"/>
<id>urn:roam:{}</id>
<updated>{}</updated>\n
Each title is immediately contained in a Heading but carries all the actual metadata. It is the basis of a new heading and thus a new entry in the Atom document.
<content type=\"html\">&lt;h{}&gt;",
title.raw,
the_link,
id,
HtmlEscape(pubdate),
title.level,
);
self.in_heading = true;
write!(w, "{}", s)?
}
}
There's still some work to do here, but it will only publish headings that have both an ID and a PUBDATE property and don't carry certain tags. The basic metadata for the heading is injected into the final document.
The title also needs to know to close the previous one; this is pretty ugly, but the structure of the =Orgize= iterator doesn't lend itself to anything cleaner. It's explained in more detail below:
#+begin_src rust :tangle src/export_atom.rs
Element::Title(title) => self.start_title(w, title),
#+end_src
Any other elements will be HTML-ized and then escaped, and emitted in to the doc as HTML-encoded HTML.
Because Orgize doesn't properly parse file-level =PROPERTIES= drawers in to a pseudo-heading, I have a forked version of Orgize that lets me reach in and do that myself. I handle =Text= elements to do this.
I would love to eliminate this logic; it's lifted from the parser above.
#+begin_src rust :tangle src/export_atom.rs
_t => {
if self.in_heading == true {
let mut buf = InternalWriter::new();
self.inner.start(&mut buf, element)?;
let s = buf.to_utf8().unwrap();
write!(w, "{}", s)?
}
}
}
Element::Text { value } => self.start_text(w, value),
#+end_src
Any other element is HTML-ized, escaped, and emitted into the doc as HTML-encoded HTML, assuming we're inside a heading.
#+begin_src rust :tangle src/export_atom.rs
_t => self.start_rest(w, element),
})
.unwrap(); // if we can't parse something, just fucken panic.
#+end_src
#+begin_src rust :tangle src/export_atom.rs
@ -1300,35 +1249,234 @@ Any other elements will be HTML-ized and then escaped, and emitted in to the doc
}
fn end<W: Write>(&mut self, mut w: W, element: &Element) -> Result<(), E> {
match element {
(match element {
#+end_src
The end function's job is to close out document entities. We only use it to emit the end of the Document so the XML is valid; everything else goes through the same "HTML and HTML Escape" path as above.
#+begin_src rust :tangle src/export_atom.rs
// Element::Title(_title) => {}
Element::Document { .. } => {
write!(w, "</content>\n")?;
write!(w, "</entry>\n")?;
write!(w, "</feed>")?;
}
Element::Document { .. } => self.end_document(w, element),
#+end_src
#+begin_src rust :tangle src/export_atom.rs
_ => {
if self.in_heading == true {
let mut buf = InternalWriter::new();
self.inner.end(&mut buf, element)?;
let s = buf.to_utf8().unwrap();
write!(w, "{}", s)?
}
_ => self.end_rest(w, element),
})
.ok();
Ok(())
}
}
#+end_src
All of these are implemented in the struct's =impl= block.
#+begin_src rust :tangle src/export_atom.rs
impl<E: From<Error>, H: HtmlHandler<E>> ArroyoAtomHandler<E, H> {
#+end_src
Processing the document to create metadata is going to be a bit of a pain because keywords need to be extracted out of the document and perhaps even turned in to URLs, etc...
#+begin_src rust :tangle src/export_atom.rs
fn start_document<W: Write>(&mut self, mut w: W, _document: &elements::Element) -> Result<()> {
Ok(write!(
w,
"<?xml version=\"1.0\" encoding=\"utf-8\"?>
<feed xmlns=\"http://www.w3.org/2005/Atom\">
<link href=\"http://example.org/\"/>
<id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>\n"
)?)
}
fn end_document<W: Write>(&mut self, mut w: W, _document: &elements::Element) -> Result<()> {
// the last heading/entry is still "open", close it.
write!(
w,
r#" </content>
</entry>
<updated>{}</updated>
</feed>"#,
self.last_date,
)?;
Ok(())
}
#+end_src
Processing a title in to an Atom =<entry>= is pretty ugly. Really what I want is a way to get all this out of Heading objects but those lack enough information to fill out the =<entry>= metadata.
#+begin_src rust :tangle src/export_atom.rs
fn start_title<W: Write>(&mut self, mut w: W, title: &elements::Title) -> Result<()> {
let ignore_tags = vec![
String::from("noexport"),
String::from("NOEXPORT"),
String::from("ignore"),
];
let export_tags = title
.tags
.clone()
.into_iter()
.map(String::from)
.find(|v| ignore_tags.contains(v));
let props = title.properties.clone().into_hash_map();
let id = props
.get("ID")
.map(|id| id.clone().into())
.unwrap_or("".to_string());
let pubdate = props
.get("PUBDATE")
.map(|pubdate| pubdate.clone().into())
.unwrap_or("".to_string());
let the_link = self.inner.rewrite_link_from(&id);
if id == "" || pubdate == "" || export_tags.is_some() && self.in_heading {
self.in_heading = false;
Ok(())
} else if id != "" && pubdate != "" && export_tags.is_none() {
// close previous heading; note that self.heading_lvl defaults to 0
if title.level <= self.heading_lvl {
write!(w, "</content>\n")?;
write!(w, "</entry>\n")?;
}
let date = match rfcize_datestamp(pubdate.clone()) {
Ok(date) => date,
Err(_) => {
dbg!(format!("bad date {}", pubdate.clone()));
HtmlEscape(pubdate.clone()).to_string()
}
};
if self.last_date < date {
self.last_date = date.clone();
}
let title_text = match strip_links_from_str(&title.raw.clone()) {
Ok(text) => HtmlEscape(text),
Err(the_err) => {
dbg!(format!("bad title {} {}", title.raw.clone(), the_err));
HtmlEscape(title.raw.to_string())
}
};
let cat_xmls = "";
let s = format!(
"<entry>
<title>{}</title>
<link href=\"{}\"/>
<id>{}</id>
<updated>{}</updated>\n
{}
<content type=\"html\">&lt;h{}&gt;", // the HTML encoded heading opening needs to be added here!
title_text,
the_link.to_string(), // link
the_link.to_string(), // ID
date,
cat_xmls,
title.level,
);
self.heading_lvl = title.level;
self.in_heading = true;
Ok(write!(w, "{}", s)?)
} else {
Ok(())
}
}
#+end_src
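Said another way, the "close the previous entry" check above is just a level comparison; org titles start at level 1 and =heading_lvl= defaults to 0, so the first published entry never emits a stray close. A throwaway restatement:
#+begin_src rust
// Restatement of the check in start_title, for clarity only.
fn closes_previous_entry(new_title_level: usize, open_heading_lvl: usize) -> bool {
    new_title_level <= open_heading_lvl
}

fn main() {
    assert!(!closes_previous_entry(1, 0)); // first entry: nothing to close yet
    assert!(closes_previous_entry(1, 1));  // sibling heading closes the open entry
    assert!(!closes_previous_entry(2, 1)); // deeper heading keeps the entry open
}
#+end_src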
#+begin_src rust :tangle src/export_atom.rs
fn start_keyword<W: Write>(&mut self, mut w: W, kw: &elements::Keyword) -> Result<()> {
// dbg!(kw);
match kw.key.as_ref() {
"FILETAGS" => {
kw.value
.split(":")
.map(String::from)
.filter(|s| !s.is_empty())
.for_each(|s| self.filetags.push(s));
dbg!(&self.filetags);
}
"TITLE" => {
self.feed_title = kw.value.to_string();
write!(w, r#"<title>{}</title>"#, self.feed_title)?;
}
"AUTHOR" => {
let re = regex::Regex::new(r"(?<name>[\w\s\d]+) <(?<email>.*)>").unwrap();
re.captures_iter(&kw.value)
.map(|caps| {
format!(
"<author><name>{}</name><email>{}</email></author>",
&caps["name"], &caps["email"]
)
})
.for_each(|s| {
self.authors.push(s.clone());
write!(w, "{}", s).ok();
});
dbg!(&self.authors);
}
_ => {}
}
Ok(())
}
#+end_src
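A quick sanity check of that =AUTHOR= pattern, with an invented name and address:
#+begin_src rust
use regex::Regex;

// The name and email here are made up for the example.
fn main() {
    let re = Regex::new(r"(?<name>[\w\s\d]+) <(?<email>.*)>").unwrap();
    let caps = re.captures("Jane Doe <jane@example.org>").unwrap();
    assert_eq!(&caps["name"], "Jane Doe");
    assert_eq!(&caps["email"], "jane@example.org");
}
#+end_src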
#+begin_src rust :tangle src/export_atom.rs
fn start_rest<W: Write>(&mut self, mut w: W, element: &elements::Element) -> Result<()> {
Ok(if self.in_heading == true {
let mut buf = InternalWriter::new();
            self.inner.start(&mut buf, element).ok(); // errors from the inner handler are ignored here.
let s = buf.to_utf8().unwrap();
write!(w, "{}", s)?
})
}
fn end_rest<W: Write>(&mut self, mut w: W, element: &elements::Element) -> Result<()> {
if self.in_heading == true {
let mut buf = InternalWriter::new();
self.inner.end(&mut buf, element).ok();
let s = buf.to_utf8().unwrap();
write!(w, "{}", s)?
}
Ok(())
}
}
#+end_src
**** Strip Links from Strings
#+begin_src rust :tangle src/export_atom.rs
fn strip_links_from_str(in_str: &str) -> Result<String> {
// title.raw.replace("[", "&#91;").replace("]", "&#92;"),
    let re = regex::Regex::new(r"\[\[(?<wrapped_the_link>[^\]]*)\]\[(?<text>[^\]]*)\]\]")?;
Ok(re.replace_all(in_str, "$text").to_string())
}
#+end_src
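The intent is that an org link collapses down to its description text. A self-contained check of that behavior, with an invented sample sentence:
#+begin_src rust
use regex::Regex;

// Intended behavior: [[target][description]] collapses to just "description".
// The input sentence is made up for the example.
fn main() {
    let re = Regex::new(r"\[\[(?<target>[^\]]*)\]\[(?<text>[^\]]*)\]\]").unwrap();
    let input = "See [[id:20240116T235328.441922][the parser notes]] for details";
    assert_eq!(re.replace_all(input, "$text"), "See the parser notes for details");
}
#+end_src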
**** Convert my org-style timestamps to RFC-3339 strings
#+begin_src rust :tangle src/export_atom.rs
fn rfcize_datestamp(in_str: String) -> Result<String> {
let re = regex::Regex::new(
r"<?(?<year>\d{4})-(?<month>\d{2})-(?<day>\d{2}) \w+ (?<hour>\d{2}):(?<minutes>\d{2})>?",
)?;
let date: Option<String> = re
.captures_iter(&in_str)
.map(|caps| {
let year = caps.name("year").unwrap().as_str();
let month = caps.name("month").unwrap().as_str();
let day = caps.name("day").unwrap().as_str();
let hour = caps.name("hour").unwrap().as_str();
let minutes = caps.name("minutes").unwrap().as_str();
let ret: String = format!("{}-{}-{}T{}:{}:00-08:00", year, month, day, hour, minutes);
ret
})
.next();
Ok(date.ok_or(Error::new(std::io::ErrorKind::Other, "invalid date"))?)
}
#+end_src
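A quick check of the conversion; the offset is hard-coded to =-08:00= above, so that's what comes back out:
#+begin_src rust
use anyhow::Result;

// The input stamp is the org-style format used in this file; the date itself
// is just an example.
fn main() -> Result<()> {
    let rfc = rfcize_datestamp("<2024-01-17 Wed 00:22>".to_string())?;
    assert_eq!(rfc, "2024-01-17T00:22:00-08:00");
    Ok(())
}
#+end_src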
**** Internal Buffer Writer for Escaping Entities
I implemented a really simple/dumb =Write= implementation that will =HtmlEscape= anything written to it. I should make this take a String under the hood instead of a =Vec<u8>=, but meh, it's good enough for now.
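The gist of it, as a tiny self-contained sketch with made-up names:
#+begin_src rust
use std::io::Write;

use orgize::export::HtmlEscape;

// Sketch only: an io::Write that HTML-escapes whatever is written through it
// into an internal byte buffer.
struct EscapingBuffer {
    inner: Vec<u8>,
}

impl Write for EscapingBuffer {
    fn write(&mut self, buf: &[u8]) -> std::io::Result<usize> {
        // treat the incoming bytes as UTF-8 text, escape it, and buffer the result
        let escaped = HtmlEscape(String::from_utf8_lossy(buf)).to_string();
        self.inner.extend_from_slice(escaped.as_bytes());
        Ok(buf.len())
    }

    fn flush(&mut self) -> std::io::Result<()> {
        Ok(())
    }
}
#+end_src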
#+begin_src rust :tangle src/export_atom.rs
struct InternalWriter {
inner: Vec<u8>,
@ -1342,7 +1490,6 @@ impl InternalWriter {
pub fn new() -> InternalWriter {
return InternalWriter { inner: Vec::new() };
}
}
impl Write for &mut InternalWriter {

View File

@ -1,11 +1,12 @@
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:1]]
use anyhow::Result;
use regex;
use std::fs;
use std::io::{Error, Write};
use std::marker::PhantomData;
use orgize::export::{DefaultHtmlHandler, HtmlEscape, HtmlHandler, SyntectHtmlHandler};
use orgize::{Element, Org};
use orgize::{elements, Element, Org};
use crate::export_html::ArroyoHtmlHandler;
use crate::export_html::ExportOptions;
@ -13,23 +14,27 @@ use crate::export_html::ExportOptions;
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:2]]
pub struct ArroyoAtomHandler<E: From<Error>, H: HtmlHandler<E>> {
// pub options: ExportOptions,
pub inner: H,
pub options: ExportOptions,
pub inner: ArroyoHtmlHandler<E, H>,
pub error_type: PhantomData<E>,
pub in_heading: bool,
// internal parser state
in_heading: bool,
in_drawer: bool,
heading_lvl: usize,
// Document metadata placed in feed
pub filetags: Vec<String>,
pub authors: Vec<String>,
pub feed_title: String,
pub last_date: String,
}
impl<E: From<Error>, H: HtmlHandler<E>> ArroyoAtomHandler<E, H> {
// pub fn new(options: ExportOptions, inner: H) -> Self {
// ArroyoHtmlHandler {
// inner,
// options,
// ..Default::default()
// }
// }
pub fn new(inner: H) -> Self {
pub fn new(options: ExportOptions, inner: ArroyoHtmlHandler<E, H>) -> Self {
ArroyoAtomHandler {
inner,
options,
..Default::default()
}
}
@ -38,10 +43,18 @@ impl<E: From<Error>, H: HtmlHandler<E>> ArroyoAtomHandler<E, H> {
impl<E: From<Error>, H: HtmlHandler<E>> Default for ArroyoAtomHandler<E, H> {
fn default() -> Self {
ArroyoAtomHandler {
inner: H::default(),
inner: ArroyoHtmlHandler::default(),
error_type: PhantomData,
// options: ExportOptions::default(),
options: ExportOptions::default(),
in_heading: false,
in_drawer: false,
heading_lvl: 0,
last_date: "".into(),
feed_title: "".into(),
filetags: vec![],
authors: vec![],
}
}
}
@ -50,8 +63,8 @@ impl<E: From<Error>, H: HtmlHandler<E>> Default for ArroyoAtomHandler<E, H> {
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:3]]
pub fn atomize_file(path: String, options: ExportOptions) -> Result<String> {
let syntect_handler = SyntectHtmlHandler::new(DefaultHtmlHandler);
let html_handler = ArroyoHtmlHandler::new(options, syntect_handler);
let mut handler = ArroyoAtomHandler::new(html_handler);
let html_handler = ArroyoHtmlHandler::new(options.clone(), syntect_handler);
let mut handler = ArroyoAtomHandler::new(options.clone(), html_handler);
let org = String::from_utf8(fs::read(path.clone())?).unwrap();
let org_tree = &Org::parse_custom(
@ -79,129 +92,249 @@ pub fn atomize_file(path: String, options: ExportOptions) -> Result<String> {
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:4]]
impl<E: From<Error>, H: HtmlHandler<E>> HtmlHandler<E> for ArroyoAtomHandler<E, H> {
fn start<W: Write>(&mut self, mut w: W, element: &Element) -> Result<(), E> {
match element {
(match dbg!(element) {
// First Pass:4 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:5]]
Element::Document { .. } => {
write!(
w,
"<?xml version=\"1.0\" encoding=\"utf-8\"?>
<feed xmlns=\"http://www.w3.org/2005/Atom\">
<title>Example Feed</title>
<link href=\"http://example.org/\"/>
<updated>2003-12-13T18:30:02Z</updated>
<author>
<name>John Doe</name>
</author>
<id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>\n"
)?;
}
Element::Document { .. } => self.start_document(w, element),
// First Pass:5 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:6]]
Element::Title(title) => {
// !!!
let the_link = "";
// !!!
let ignore_tags = vec![
String::from("noexport"),
String::from("NOEXPORT"),
String::from("ignore"),
];
let export_tags = title
.tags
.clone()
.into_iter()
.map(String::from)
.find(|v| ignore_tags.contains(v));
let id = title
.properties
.clone()
.into_iter()
.find(|(k, _v)| k == "ID")
.map(|(_k, v)| String::from(v))
.unwrap_or("".to_string());
let pubdate = title
.properties
.clone()
.into_iter()
.find(|(k, _v)| k == "PUBDATE")
.map(|(_k, v)| String::from(v))
.unwrap_or("".to_string());
if id == "" || pubdate == "" || export_tags.is_some() {
self.in_heading = false;
} else if id != "" && pubdate != "" && export_tags.is_none() {
if self.in_heading == true {
write!(w, "</content>\n")?;
write!(w, "</entry>\n")?;
}
let s = format!(
"<entry>
<title>{}</title>
<link href=\"{}\"/>
<id>urn:roam:{}</id>
<updated>{}</updated>\n
<content type=\"html\">&lt;h{}&gt;",
title.raw,
the_link,
id,
HtmlEscape(pubdate),
title.level,
);
self.in_heading = true;
write!(w, "{}", s)?
}
}
Element::Keyword(kw) => self.start_keyword(w, kw),
// First Pass:6 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:7]]
_t => {
if self.in_heading == true {
let mut buf = InternalWriter::new();
self.inner.start(&mut buf, element)?;
let s = buf.to_utf8().unwrap();
write!(w, "{}", s)?
}
}
}
Element::Title(title) => self.start_title(w, title),
// First Pass:7 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:8]]
Element::Text { value } => self.start_text(w, value),
// First Pass:8 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:9]]
_t => self.start_rest(w, element),
})
.unwrap(); // if we can't parse something, just fucken panic.
// First Pass:9 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:10]]
Ok(())
}
fn end<W: Write>(&mut self, mut w: W, element: &Element) -> Result<(), E> {
match element {
// First Pass:8 ends here
(match element {
// First Pass:10 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:9]]
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:11]]
// Element::Title(_title) => {}
Element::Document { .. } => {
Element::Document { .. } => self.end_document(w, element),
// First Pass:11 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:12]]
_ => self.end_rest(w, element),
})
.ok();
Ok(())
}
}
// First Pass:12 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:13]]
impl<E: From<Error>, H: HtmlHandler<E>> ArroyoAtomHandler<E, H> {
// First Pass:13 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:14]]
fn start_document<W: Write>(&mut self, mut w: W, _document: &elements::Element) -> Result<()> {
Ok(write!(
w,
"<?xml version=\"1.0\" encoding=\"utf-8\"?>
<feed xmlns=\"http://www.w3.org/2005/Atom\">
<link href=\"http://example.org/\"/>
<id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>\n"
)?)
}
fn end_document<W: Write>(&mut self, mut w: W, _document: &elements::Element) -> Result<()> {
// the last heading/entry is still "open", close it.
write!(
w,
r#" </content>
</entry>
<updated>{}</updated>
</feed>"#,
self.last_date,
)?;
Ok(())
}
// First Pass:14 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:15]]
fn start_title<W: Write>(&mut self, mut w: W, title: &elements::Title) -> Result<()> {
let ignore_tags = vec![
String::from("noexport"),
String::from("NOEXPORT"),
String::from("ignore"),
];
let export_tags = title
.tags
.clone()
.into_iter()
.map(String::from)
.find(|v| ignore_tags.contains(v));
let props = title.properties.clone().into_hash_map();
let id = props
.get("ID")
.map(|id| id.clone().into())
.unwrap_or("".to_string());
let pubdate = props
.get("PUBDATE")
.map(|pubdate| pubdate.clone().into())
.unwrap_or("".to_string());
let the_link = self.inner.rewrite_link_from(&id);
if id == "" || pubdate == "" || export_tags.is_some() && self.in_heading {
self.in_heading = false;
Ok(())
} else if id != "" && pubdate != "" && export_tags.is_none() {
// close previous heading; note that self.heading_lvl defaults to 0
if title.level <= self.heading_lvl {
write!(w, "</content>\n")?;
write!(w, "</entry>\n")?;
write!(w, "</feed>")?;
}
// First Pass:9 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:10]]
_ => {
if self.in_heading == true {
let mut buf = InternalWriter::new();
self.inner.end(&mut buf, element)?;
let s = buf.to_utf8().unwrap();
write!(w, "{}", s)?
let date = match rfcize_datestamp(pubdate.clone()) {
Ok(date) => date,
Err(_) => {
dbg!(format!("bad date {}", pubdate.clone()));
HtmlEscape(pubdate.clone()).to_string()
}
};
if self.last_date < date {
self.last_date = date.clone();
}
let title_text = match strip_links_from_str(&title.raw.clone()) {
Ok(text) => HtmlEscape(text),
Err(the_err) => {
dbg!(format!("bad title {} {}", title.raw.clone(), the_err));
HtmlEscape(title.raw.to_string())
}
};
let cat_xmls = "";
let s = format!(
"<entry>
<title>{}</title>
<link href=\"{}\"/>
<id>{}</id>
<updated>{}</updated>\n
{}
<content type=\"html\">&lt;h{}&gt;", // the HTML encoded heading opening needs to be added here!
title_text,
the_link.to_string(), // link
the_link.to_string(), // ID
date,
cat_xmls,
title.level,
);
self.heading_lvl = title.level;
self.in_heading = true;
Ok(write!(w, "{}", s)?)
} else {
Ok(())
}
}
// First Pass:15 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:16]]
fn start_keyword<W: Write>(&mut self, mut w: W, kw: &elements::Keyword) -> Result<()> {
// dbg!(kw);
match kw.key.as_ref() {
"FILETAGS" => {
kw.value
.split(":")
.map(String::from)
.filter(|s| !s.is_empty())
.for_each(|s| self.filetags.push(s));
dbg!(&self.filetags);
}
"TITLE" => {
self.feed_title = kw.value.to_string();
write!(w, r#"<title>{}</title>"#, self.feed_title)?;
}
"AUTHOR" => {
let re = regex::Regex::new(r"(?<name>[\w\s\d]+) <(?<email>.*)>").unwrap();
re.captures_iter(&kw.value)
.map(|caps| {
format!(
"<author><name>{}</name><email>{}</email></author>",
&caps["name"], &caps["email"]
)
})
.for_each(|s| {
self.authors.push(s.clone());
write!(w, "{}", s).ok();
});
dbg!(&self.authors);
}
_ => {}
}
Ok(())
}
// First Pass:16 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:17]]
fn start_rest<W: Write>(&mut self, mut w: W, element: &elements::Element) -> Result<()> {
Ok(if self.in_heading == true {
let mut buf = InternalWriter::new();
            self.inner.start(&mut buf, element).ok(); // errors from the inner handler are ignored here.
let s = buf.to_utf8().unwrap();
write!(w, "{}", s)?
})
}
fn end_rest<W: Write>(&mut self, mut w: W, element: &elements::Element) -> Result<()> {
if self.in_heading == true {
let mut buf = InternalWriter::new();
self.inner.end(&mut buf, element).ok();
let s = buf.to_utf8().unwrap();
write!(w, "{}", s)?
}
Ok(())
}
}
// First Pass:10 ends here
// First Pass:17 ends here
// [[file:../arroyo-native-parser.org::*First Pass][First Pass:11]]
// [[file:../arroyo-native-parser.org::*Strip Links from Strings][Strip Links from Strings:1]]
fn strip_links_from_str(in_str: &str) -> Result<String> {
// title.raw.replace("[", "&#91;").replace("]", "&#92;"),
    let re = regex::Regex::new(r"\[\[(?<wrapped_the_link>[^\]]*)\]\[(?<text>[^\]]*)\]\]")?;
Ok(re.replace_all(in_str, "$text").to_string())
}
// Strip Links from Strings:1 ends here
// [[file:../arroyo-native-parser.org::*Convert my org-style timestamps to RFC-3339 strings][Convert my org-style timestamps to RFC-3339 strings:1]]
fn rfcize_datestamp(in_str: String) -> Result<String> {
let re = regex::Regex::new(
r"<?(?<year>\d{4})-(?<month>\d{2})-(?<day>\d{2}) \w+ (?<hour>\d{2}):(?<minutes>\d{2})>?",
)?;
let date: Option<String> = re
.captures_iter(&in_str)
.map(|caps| {
let year = caps.name("year").unwrap().as_str();
let month = caps.name("month").unwrap().as_str();
let day = caps.name("day").unwrap().as_str();
let hour = caps.name("hour").unwrap().as_str();
let minutes = caps.name("minutes").unwrap().as_str();
let ret: String = format!("{}-{}-{}T{}:{}:00-08:00", year, month, day, hour, minutes);
ret
})
.next();
Ok(date.ok_or(Error::new(std::io::ErrorKind::Other, "invalid date"))?)
}
// Convert my org-style timestamps to RFC-3339 strings:1 ends here
// [[file:../arroyo-native-parser.org::*Internal Buffer Writer for Escaping Entities][Internal Buffer Writer for Escaping Entities:1]]
struct InternalWriter {
inner: Vec<u8>,
}
@ -214,7 +347,6 @@ impl InternalWriter {
pub fn new() -> InternalWriter {
return InternalWriter { inner: Vec::new() };
}
}
impl Write for &mut InternalWriter {
@ -257,4 +389,4 @@ impl Write for &mut InternalWriter {
Ok(())
}
}
// First Pass:11 ends here
// Internal Buffer Writer for Escaping Entities:1 ends here

View File

@ -62,6 +62,13 @@ impl<E: From<Error>, H: HtmlHandler<E>> ArroyoHtmlHandler<E, H> {
..Default::default()
}
}
pub fn rewrite_link_from(&self, id: &String) -> String {
match self.options.link_retargets.get(id) {
Some(path) => HtmlEscape(&path).to_string(),
_ => HtmlEscape(format!("/404?key={}", id)).to_string(),
}
}
}
impl<E: From<Error>, H: HtmlHandler<E>> Default for ArroyoHtmlHandler<E, H> {
@ -117,39 +124,20 @@ impl<E: From<Error>, H: HtmlHandler<E>> HtmlHandler<E> for ArroyoHtmlHandler<E,
Some((proto, stripped_dest)) => (proto, stripped_dest.into()),
None => ("", string_path),
};
let desc = link.desc.clone().unwrap_or(link.path.clone());
match proto {
"id" => {
let maybe_new_target = self.options.link_retargets.get(&stripped_dest);
match maybe_new_target {
Some(path) => {
let desc = link.desc.clone().unwrap_or(path.clone().into());
write!(
w,
"<a class=\"internal\" href=\"{}\">{}</a>",
HtmlEscape(&path),
HtmlEscape(&desc),
)?
}
_ => {
let desc = link.desc.clone().unwrap_or(link.path.clone());
write!(
w,
"<a href=\"/404?key={}\">{}</a>",
HtmlEscape(&link.path),
HtmlEscape(&desc),
)?
}
};
}
"roam" => {
let desc = link.desc.clone().unwrap_or(link.path.clone());
write!(
w,
"<a href=\"/404?key={}\">{}</a>",
HtmlEscape(&link.path),
HtmlEscape(&desc),
)?
}
"id" => write!(
w,
"<a href=\"{}\">{}</a>",
self.rewrite_link_from(&stripped_dest),
HtmlEscape(&desc),
)?,
"roam" => write!(
w,
"<a href=\"/404?key={}\">{}</a>",
HtmlEscape(&link.path),
HtmlEscape(&desc),
)?,
_ => self.inner.start(w, &Element::Link(link.clone()))?,
}
}