move feed generation in to its own module
parent
7d6c98739a
commit
529958c361
@ -0,0 +1,182 @@
|
||||
:PROPERTIES:
|
||||
:ID: arcology/atom-gen
|
||||
:ROAM_ALIASES: "Arcology Feed Generator" "Arcology Atom Generator"
|
||||
:END:
|
||||
#+TITLE: Arcology's Atom Pandoc Filter + Template
|
||||
#+AUTO_TANGLE: t
|
||||
#+ARCOLOGY_KEY: arcology/feed-gen
|
||||
|
||||
This module renders an [[https://en.wikipedia.org/wiki/Atom_(web_standard)][ATOM feed]]. It's possible for any page in the Arcology now to define an =#+ARCOLOGY_FEED= keyword, and in doing so create a new route in the [[id:20220225T175638.482695][Arcology Public Router]] which will render an Atom feed. The semantics of the feed more-or-less follow the expectations defined in =ox-rss=: Any heading with an =ID= property and a =PUBDATE= property with an org-mode active timestamp in it will be published to the feed. Any entry with an ID will have a =PUBDATE= added to it by invoking =(org-rss-add-pubdate-property)=.
|
||||
|
||||
* Invoking Pandoc for the Feed Generator
|
||||
|
||||
To get an ATOM feed for an org document, it's easy enough to invoke =render_feed_from_file=.
|
||||
|
||||
I'm shelling out to =pandoc= directly. Probably shouldn't have reached for that thing in the first place! oh well.
|
||||
|
||||
#+begin_src python :tangle arcology/feeds.py :mkdirp yes
|
||||
import re
|
||||
from fastapi import Response, HTTPException
|
||||
|
||||
import asyncio
|
||||
from sqlmodel import Session
|
||||
from async_lru import alru_cache
|
||||
from typing import Optional
|
||||
|
||||
from arcology.html import rewrite_html
|
||||
from arcology.key import ArcologyKey
|
||||
from arcology.parse import parse_sexp, print_sexp
|
||||
from arcology.arroyo import Page
|
||||
#+end_src
|
||||
|
||||
This is pretty straightforward, except I stick an [[https://docs.python.org/3/library/functools.html#functools.lru_cache][LRU cache]] in the middle of it so that feed readers aren't constantly invoking Pandoc.
|
||||
|
||||
#+begin_src python :tangle arcology/feeds.py :mkdirp yes
|
||||
class ExportException(Exception):
    """Raised when the pandoc subprocess exits with a non-zero status.

    Derives from ``Exception`` (not ``BaseException``) so that ordinary
    ``except Exception`` handlers see it, while ``except ExportException``
    keeps working as before.
    """

    # Exit status reported by the pandoc process.
    code: int
    # Decoded standard error captured from the process; None when not supplied.
    stderr: Optional[str]

    def __init__(self, code, stderr=None):
        # Populate ``args`` so str()/repr() and tracebacks carry the details
        # regardless of whether the caller used positional or keyword arguments.
        super().__init__(code, stderr)
        self.code = code
        self.stderr = stderr
|
||||
|
||||
@alru_cache(maxsize=64)
async def export_pandoc(file: str, hash: str) -> str:
    """Run pandoc on *file* and return the rendered Atom XML as text.

    The call is wrapped in ``alru_cache``, so results are keyed on the
    ``(file, hash)`` argument pair; *hash* is not used in the body and only
    takes part in that cache key.

    Raises:
        ExportException: pandoc exited with a non-zero status; carries the
            exit code and the decoded stderr.
    """
    # Pass an argument vector instead of a shell string: the path is never
    # interpreted by a shell, so spaces or metacharacters in it are harmless.
    # "--" ends option parsing so a path starting with "-" is not read as a flag.
    proc = await asyncio.create_subprocess_exec(
        "pandoc",
        "--lua-filter=./arcology/pandoc/make-atom.lua",
        "--template=./arcology/pandoc/atom.xml",
        "-s",
        "--",
        file,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        # Decode stderr leniently so undecodable bytes cannot mask the real failure.
        raise ExportException(
            code=proc.returncode,
            stderr=stderr.decode(errors="replace"),
        )
    return stdout.decode()
|
||||
|
||||
async def render_feed_from_file(_request, file: str, engine, site) -> Optional[Response]:
    """Build the Atom feed for the org file at *file*.

    Looks the page up with ``Page.from_file``, renders it through
    ``export_pandoc`` (passing the page's ``hash``), and hands the XML to
    ``hydrate_feed``. ``_request`` and ``site`` are accepted but unused here.

    Raises:
        HTTPException: 404 when no page is found for *file*; 500 when pandoc
            exits with a non-zero status.
    """
    # NOTE(review): annotated as Optional[Response], but the value returned is
    # whatever hydrate_feed produces (annotated ``str``) — confirm with callers.
    with Session(engine) as session:
        p = Page.from_file(file, session)
        if p is None:
            raise HTTPException(status_code=404, detail="Feed not found.")
        try:
            xml = await export_pandoc(file, p.hash)
        except ExportException as e:
            # Chain the original failure so logs keep the pandoc error context.
            raise HTTPException(
                status_code=500,
                detail=f"pandoc exited {e.code} w/ {e.stderr}",
            ) from e

        return hydrate_feed(file, xml, session)
|
||||
#+end_src
|
||||
|
||||
The feed is more-or-less ready as-is when it comes out of Pandoc except for the final "canonical" URL -- a string substitution replaces a stand-in variable with the correct URL. I could probably inject this in to the Pandoc invocation as a metadata variable but this is Good Enough.
|
||||
|
||||
#+begin_src python :tangle arcology/feeds.py :mkdirp yes
|
||||
def hydrate_feed(filename: str, xml: str, session) -> str:
    """Fill in the feed's page URL and rewrite links in pandoc's XML output.

    Every ``{ARCOLOGY_FEED_PAGE}`` stand-in in *xml* is replaced with the URL
    derived from the page's Arcology key, then the result is passed through
    ``rewrite_html``.

    Args:
        filename: Path of the org file the feed was rendered from.
        xml: Atom XML as produced by pandoc.
        session: Database session used for the page lookup and by ``rewrite_html``.
    """
    page = Page.from_file(filename, session)
    feed_url = ArcologyKey(page.get_key()).to_url()

    # Literal substring replacement: unlike re.sub, backslashes or group
    # references in the URL are never interpreted as a replacement template.
    out_xml = xml.replace("{ARCOLOGY_FEED_PAGE}", feed_url)

    # NOTE(review): the original marked this call "lol dangerous" — the XML is
    # being run through the HTML rewriter; confirm it is safe for feed content.
    return rewrite_html(out_xml, session)
|
||||
#+end_src
|
||||
|
||||
* Rendering Atom from Org in Pandoc in two steps
|
||||
|
||||
I had some real trouble figuring out how to get Pandoc to spit out [[https://validator.w3.org/feed/docs/atom.html][ATOM feeds]] and this is not "technically compliant" but I can do it with a combination of a [[https://pandoc.org/lua-filters.html][Lua filter]] which extracts headings' metadata in to variables which a [[https://pandoc.org/MANUAL.html#templates][custom template]] then renders out:
|
||||
|
||||
** Lua Filter
|
||||
|
||||
#+begin_src lua :tangle arcology/pandoc/make-atom.lua :mkdirp yes
|
||||
local utils = require 'pandoc.utils'
|
||||
|
||||
local entries = {}
|
||||
local vars = {}
|
||||
|
||||
-- thanks random github users https://gist.github.com/zwh8800/9b0442efadc97408ffff248bc8573064
|
||||
-- Reference point: os.time for 1970-01-01 00:00 interpreted in local time.
local epoch = os.time{year=1970, month=1, day=1, hour=0}

--- Convert an org-mode timestamp string into seconds relative to `epoch`.
-- Accepts "<2022-05-23 Mon 15:41>" (angle brackets optional). A stamp with
-- no time of day, e.g. "<2022-05-23 Mon>", is treated as midnight.
-- Raises an error naming the offending value when no date can be found.
function parse_org_date(org_date)
  local year, month, day, hour, minute =
    org_date:match("<?(%d+)%-(%d+)%-(%d+)%s%a+%s(%d+)%:(%d+)>?")
  if not year then
    -- Fall back to a date-only stamp instead of handing nil fields to os.time.
    year, month, day = org_date:match("<?(%d+)%-(%d+)%-(%d+)")
    hour, minute = 0, 0
  end
  if not year then
    error("unparseable org timestamp: " .. tostring(org_date))
  end
  local timestamp = os.time{
    year = year, month = month, day = day,
    hour = hour, min = minute, sec = 0
  } - epoch
  return timestamp
end
|
||||
-- strftime-style format; note the trailing literal "Z" (UTC designator).
rfc3339ish = "%Y-%m-%dT%TZ"

-- Meta handler: gives the document a default `date` when it has none and
-- exposes the collected `entries` table to the template as `entries`.
set_entries_and_date = function(m)
  if m.date == nil then
    -- The format ends in "Z", so render the current time in UTC ("!" prefix)
    -- rather than in the machine's local time zone.
    m.date = os.date("!" .. rfc3339ish)
  end
  m.entries = entries
  return m
end
|
||||
|
||||
-- Blocks handler: groups the blocks into section Divs and records one feed
-- entry for every Div that carries a `pubdate` attribute. Nothing is
-- returned, so the handler does not replace the blocks it was given.
extract_entries = function (blocks)
  pandoc.utils.make_sections(true, nil, blocks):walk
  {
    Div = function (div)
      if div.attributes.pubdate then
        -- The first element of a section Div is taken to be its heading.
        local header = div.content[1]
        -- Strip bookkeeping attributes so they do not end up in the output.
        header.attributes.pubdate = nil
        header.attributes.number = nil
        div.attributes.number = nil

        -- `local` keeps the entry from leaking into the global environment.
        -- NOTE(review): os.date formats in local time while the format ends
        -- in "Z" — confirm the intended time zone handling.
        local entry = {
          content=div,
          title=header.content,
          rel=div.attributes.id,
          pubdate=os.date(rfc3339ish, parse_org_date(div.attributes.pubdate))
        }
        table.insert(entries, entry)
      end
    end
  }
end
|
||||
|
||||
-- Filter list handed back to pandoc: a single filter whose Blocks handler
-- collects feed entries and whose Meta handler publishes them to the template.
-- NOTE(review): relies on pandoc running Blocks before Meta within one
-- filter table — confirm against the Lua filter documentation.
return {
  {
    Blocks = extract_entries,
    Meta = set_entries_and_date
  }
}
|
||||
#+end_src
|
||||
|
||||
** Pandoc Template
|
||||
|
||||
The template is basically unremarkable, but has the same issues that the HTML files have: they need to have their [[id:arcology/arroyo/hydrate][ID links fixed]].
|
||||
|
||||
#+begin_src xml :tangle arcology/pandoc/atom.xml
|
||||
<?xml version="1.0" encoding="utf-8"?>
|
||||
<feed xmlns="http://www.w3.org/2005/Atom">
|
||||
|
||||
<title>$pagetitle$</title>
|
||||
<link href="http://example.org/"/>
|
||||
<updated>$date$</updated>
|
||||
<author>
|
||||
$for(author)$
|
||||
<name>${author}</name>
|
||||
$endfor$
|
||||
</author>
|
||||
<id>{ARCOLOGY_FEED_PAGE}</id>
|
||||
<link rel="self" type="application/atom+xml"
|
||||
href="{ARCOLOGY_FEED_PAGE}.xml"/>
|
||||
|
||||
$for(entries)$
|
||||
<entry>
|
||||
<title type="html">${it.title}</title>
|
||||
<link href="{ARCOLOGY_FEED_PAGE}#${it.rel}"/>
|
||||
<id>{ARCOLOGY_FEED_PAGE}#${it.rel}</id>
|
||||
<updated>${it.pubdate}</updated>
|
||||
<summary type="html">${it.content}</summary>
|
||||
</entry>
|
||||
$endfor$
|
||||
|
||||
</feed>
|
||||
#+end_src
|
||||
|
||||
And this can be confirmed to work with e.g. [[id:20220523T154158.992219][The Lion's Rear Site Feed]]:
|
||||
|
||||
#+begin_src shell :results verbatim :exports both
|
||||
pandoc ../thelionsrear_updates.org --lua-filter=arcology/pandoc/make-atom.lua --template=arcology/pandoc/atom.xml -s
|
||||
#+end_src
|
||||
|
||||
#+results:
|
@ -0,0 +1,56 @@
|
||||
import re
|
||||
from fastapi import Response, HTTPException
|
||||
|
||||
import asyncio
|
||||
from sqlmodel import Session
|
||||
from async_lru import alru_cache
|
||||
from typing import Optional
|
||||
|
||||
from arcology.html import rewrite_html
|
||||
from arcology.key import ArcologyKey
|
||||
from arcology.parse import parse_sexp, print_sexp
|
||||
from arcology.arroyo import Page
|
||||
|
||||
class ExportException(Exception):
    """Raised when the pandoc subprocess exits with a non-zero status.

    Derives from ``Exception`` (not ``BaseException``) so that ordinary
    ``except Exception`` handlers see it, while ``except ExportException``
    keeps working as before.
    """

    # Exit status reported by the pandoc process.
    code: int
    # Decoded standard error captured from the process; None when not supplied.
    stderr: Optional[str]

    def __init__(self, code, stderr=None):
        # Populate ``args`` so str()/repr() and tracebacks carry the details
        # regardless of whether the caller used positional or keyword arguments.
        super().__init__(code, stderr)
        self.code = code
        self.stderr = stderr
|
||||
|
||||
@alru_cache(maxsize=64)
async def export_pandoc(file: str, hash: str) -> str:
    """Run pandoc on *file* and return the rendered Atom XML as text.

    The call is wrapped in ``alru_cache``, so results are keyed on the
    ``(file, hash)`` argument pair; *hash* is not used in the body and only
    takes part in that cache key.

    Raises:
        ExportException: pandoc exited with a non-zero status; carries the
            exit code and the decoded stderr.
    """
    # Pass an argument vector instead of a shell string: the path is never
    # interpreted by a shell, so spaces or metacharacters in it are harmless.
    # "--" ends option parsing so a path starting with "-" is not read as a flag.
    proc = await asyncio.create_subprocess_exec(
        "pandoc",
        "--lua-filter=./arcology/pandoc/make-atom.lua",
        "--template=./arcology/pandoc/atom.xml",
        "-s",
        "--",
        file,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE,
    )
    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        # Decode stderr leniently so undecodable bytes cannot mask the real failure.
        raise ExportException(
            code=proc.returncode,
            stderr=stderr.decode(errors="replace"),
        )
    return stdout.decode()
|
||||
|
||||
async def render_feed_from_file(_request, file: str, engine, site) -> Optional[Response]:
    """Build the Atom feed for the org file at *file*.

    Looks the page up with ``Page.from_file``, renders it through
    ``export_pandoc`` (passing the page's ``hash``), and hands the XML to
    ``hydrate_feed``. ``_request`` and ``site`` are accepted but unused here.

    Raises:
        HTTPException: 404 when no page is found for *file*; 500 when pandoc
            exits with a non-zero status.
    """
    # NOTE(review): annotated as Optional[Response], but the value returned is
    # whatever hydrate_feed produces (annotated ``str``) — confirm with callers.
    with Session(engine) as session:
        p = Page.from_file(file, session)
        if p is None:
            raise HTTPException(status_code=404, detail="Feed not found.")
        try:
            xml = await export_pandoc(file, p.hash)
        except ExportException as e:
            # Chain the original failure so logs keep the pandoc error context.
            raise HTTPException(
                status_code=500,
                detail=f"pandoc exited {e.code} w/ {e.stderr}",
            ) from e

        return hydrate_feed(file, xml, session)
|
||||
|
||||
def hydrate_feed(filename: str, xml: str, session) -> str:
    """Fill in the feed's page URL and rewrite links in pandoc's XML output.

    Every ``{ARCOLOGY_FEED_PAGE}`` stand-in in *xml* is replaced with the URL
    derived from the page's Arcology key, then the result is passed through
    ``rewrite_html``.

    Args:
        filename: Path of the org file the feed was rendered from.
        xml: Atom XML as produced by pandoc.
        session: Database session used for the page lookup and by ``rewrite_html``.
    """
    page = Page.from_file(filename, session)
    feed_url = ArcologyKey(page.get_key()).to_url()

    # Literal substring replacement: unlike re.sub, backslashes or group
    # references in the URL are never interpreted as a replacement template.
    out_xml = xml.replace("{ARCOLOGY_FEED_PAGE}", feed_url)

    # NOTE(review): the original marked this call "lol dangerous" — the XML is
    # being run through the HTML rewriter; confirm it is safe for feed content.
    return rewrite_html(out_xml, session)
|
Loading…
Reference in New Issue