move feed generation in to its own module

main
Ryan Rix 11 months ago
parent 7d6c98739a
commit 529958c361

@ -0,0 +1,182 @@
:PROPERTIES:
:ID: arcology/atom-gen
:ROAM_ALIASES: "Arcology Feed Generator" "Arcology Atom Generator"
:END:
#+TITLE: Arcology's Atom Pandoc Filter + Template
#+AUTO_TANGLE: t
#+ARCOLOGY_KEY: arcology/feed-gen
This module renders an [[https://en.wikipedia.org/wiki/Atom_(web_standard)][ATOM feed]]. It's possible for any page in the Arcology now to define an =#+ARCOLOGY_FEED= keyword, and in doing so create a new route in the [[id:20220225T175638.482695][Arcology Public Router]] which will render an Atom feed. The semantics of the feed more-or-less follow the expectations defined in =ox-rss=: Any heading with an =ID= property and a =PUBDATE= property with an org-mode active timestamp in it will be published to the feed. Any entry with an ID will have a =PUBDATE= added to it by invoking =(org-rss-add-pubdate-property)=.
* Invoking Pandoc for the Feed Generator
To get an ATOM feed for an org document, it's easy enough to invoke =render_feed_from_file=.
I'm shelling out to =pandoc= directly. Probably shouldn't have reached for that thing in the first place! oh well.
#+begin_src python :tangle arcology/feeds.py :mkdirp yes
import re
from fastapi import Response, HTTPException
import asyncio
from sqlmodel import Session
from async_lru import alru_cache
from typing import Optional
from arcology.html import rewrite_html
from arcology.key import ArcologyKey
from arcology.parse import parse_sexp, print_sexp
from arcology.arroyo import Page
#+end_src
This is pretty straightforward, except I stick an [[https://docs.python.org/3/library/functools.html#functools.lru_cache][LRU cache]] in the middle of it so that feed readers aren't constantly invoking Pandoc.
#+begin_src python :tangle arcology/feeds.py :mkdirp yes
class ExportException(Exception):
    """Raised when the pandoc subprocess exits with a non-zero status.

    Derives from ``Exception`` (not ``BaseException``) so that generic
    ``except Exception`` handlers treat it like any other application error.

    Attributes:
        code: exit status reported for the pandoc process.
        stderr: text pandoc wrote to standard error, or ``None`` when the
            raiser did not supply it.
    """

    code: int
    stderr: Optional[str]

    def __init__(self, code, stderr=None):
        # Forward both values so ``args`` mirrors the constructor signature
        # and str(exc) is not empty in tracebacks and logs.
        super().__init__(code, stderr)
        self.code = code
        self.stderr = stderr
@alru_cache(maxsize=64)
async def export_pandoc(file: str, hash: str) -> str:
    """Render the org document at ``file`` to Atom XML by running pandoc.

    Args:
        file: path of the org document handed to pandoc.
        hash: never read by the body; it is only an argument of the
            ``alru_cache``-decorated call, so a different hash for the same
            file is a different cached call.

    Returns:
        pandoc's standard output, decoded to text.

    Raises:
        ExportException: pandoc exited non-zero, or the ``pandoc``
            executable could not be found (reported as code 127).
    """
    # Pass an argument vector instead of interpolating ``file`` into a shell
    # string: a path with spaces or shell metacharacters then reaches pandoc
    # verbatim and cannot be interpreted by a shell.
    try:
        proc = await asyncio.create_subprocess_exec(
            "pandoc",
            file,
            "--lua-filter=./arcology/pandoc/make-atom.lua",
            "--template=./arcology/pandoc/atom.xml",
            "-s",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except FileNotFoundError as e:
        # A shell would have reported a missing binary as exit status 127;
        # keep surfacing that case through the same exception type.
        raise ExportException(code=127, stderr=str(e)) from e

    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        raise ExportException(code=proc.returncode, stderr=stderr.decode())
    return stdout.decode()
async def render_feed_from_file(_request, file: str, engine, site) -> Optional[Response]:
    """Produce the Atom feed for the org document stored at ``file``.

    Args:
        _request: accepted but not used by this function.
        file: path of the org document; used to look up its ``Page`` and as
            the input handed to ``export_pandoc``.
        engine: database engine a ``Session`` is opened on.
        site: accepted but not used by this function.

    Returns:
        Whatever ``hydrate_feed`` returns for the rendered XML.
        NOTE(review): ``hydrate_feed`` is annotated ``-> str`` while this
        function is annotated ``Optional[Response]`` -- confirm which is meant.

    Raises:
        HTTPException: 404 when ``Page.from_file`` yields ``None``; 500 when
            pandoc fails.
    """
    with Session(engine) as session:
        p = Page.from_file(file, session)
        if p is None:
            raise HTTPException(status_code=404, detail="Feed not found.")

        try:
            # The page hash is passed along as the second argument of the
            # cached export call.
            xml = await export_pandoc(file, p.hash)
        except ExportException as e:
            # Chain explicitly so the pandoc failure is recorded as the cause.
            raise HTTPException(
                status_code=500,
                detail=f"pandoc exited {e.code} w/ {e.stderr}",
            ) from e

        return hydrate_feed(file, xml, session)
#+end_src
The feed is more-or-less ready as-is when it comes out of Pandoc except for the final "canonical" URL -- a string substitution replaces a stand-in variable with the correct URL. I could probably inject this in to the Pandoc invocation as a metadata variable but this is Good Enough.
#+begin_src python :tangle arcology/feeds.py :mkdirp yes
def hydrate_feed(filename: str, xml: str, session) -> str:
    """Fill in the feed's own URL and post-process the rendered XML.

    Args:
        filename: path used to look up the ``Page`` the feed belongs to.
        xml: feed text as produced by pandoc, still containing the
            ``{ARCOLOGY_FEED_PAGE}`` stand-in.
        session: database session, passed to ``Page.from_file`` and
            ``rewrite_html``.

    Returns:
        The XML with every stand-in replaced by the page's URL, after being
        passed through ``rewrite_html``.
    """
    page = Page.from_file(filename, session)
    feed_url = ArcologyKey(page.get_key()).to_url()

    # The stand-in is a literal, not a pattern: plain str.replace avoids
    # re.sub interpreting backslashes or group references inside the URL.
    out_xml = xml.replace("{ARCOLOGY_FEED_PAGE}", feed_url)

    # The HTML rewriter is run over the XML feed; the author flags this as
    # "lol dangerous".
    return rewrite_html(out_xml, session)
#+end_src
* Rendering Atom from Org in Pandoc in two steps
I had some real trouble figuring out how to get Pandoc to spit out [[https://validator.w3.org/feed/docs/atom.html][ATOM feeds]] and this is not "technically compliant" but I can do it with a combination of a [[https://pandoc.org/lua-filters.html][Lua filter]] which extracts headings' metadata in to variables which a [[https://pandoc.org/MANUAL.html#templates][custom template]] then renders out:
** Lua Filter
#+begin_src lua :tangle arcology/pandoc/make-atom.lua :mkdirp yes
local utils = require 'pandoc.utils'
local entries = {}
local vars = {}
-- thanks random github users https://gist.github.com/zwh8800/9b0442efadc97408ffff248bc8573064
local epoch = os.time{year=1970, month=1, day=1, hour=0}
-- Turn an org-mode timestamp such as "<2022-05-23 Mon 15:41>" into a number
-- of seconds: os.time of the parsed fields minus the file-level `epoch`.
-- The day name and the HH:MM part are optional; a missing time is taken as
-- 00:00. Raises an error naming the offending value when no date is found.
function parse_org_date(org_date)
  local year, month, day, rest = tostring(org_date):match("(%d+)%-(%d+)%-(%d+)(.*)")
  if not year then
    error("parse_org_date: unrecognized org timestamp: " .. tostring(org_date))
  end
  -- first HH:MM after the date, if any (date-only timestamps have none)
  local hour, minute = rest:match("(%d+)%:(%d+)")
  local timestamp = os.time{
    year = tonumber(year),
    month = tonumber(month),
    day = tonumber(day),
    hour = tonumber(hour) or 0,
    min = tonumber(minute) or 0,
    sec = 0
  } - epoch
  return timestamp
end
rfc3339ish = "%Y-%m-%dT%TZ"
-- Meta handler: makes sure the metadata has a `date` and attaches the
-- collected `entries` table so the template can loop over it.
function set_entries_and_date(meta)
  -- Only fill in a date when the document did not provide one.
  -- NOTE(review): os.date without a leading "!" formats local time while the
  -- format string ends in a literal "Z" -- confirm the host runs in UTC.
  if meta.date == nil then
    meta.date = os.date(rfc3339ish)
  end
  meta.entries = entries
  return meta
end
-- Blocks handler: sections the block list with make_sections and appends one
-- record to the file-level `entries` table for every section Div that has a
-- `pubdate` attribute. The walked result is discarded and nothing is
-- returned, so this handler only collects data.
extract_entries = function (blocks)
  pandoc.utils.make_sections(true, nil, blocks):walk
  {
    Div = function (div)
      if div.attributes.pubdate then
        -- first element of the section Div is taken to be its heading
        local header = div.content[1]
        -- drop bookkeeping attributes before the section is used as content
        header.attributes.pubdate = nil
        header.attributes.number = nil
        div.attributes.number = nil
        -- `local` keeps the record from leaking into the global environment
        local entry = {
          content=div,
          title=header.content,
          rel=div.attributes.id,
          pubdate=os.date(rfc3339ish, parse_org_date(div.attributes.pubdate))
        }
        table.insert(entries, entry)
      end
    end
  }
end
return {
{
Blocks = extract_entries,
Meta = set_entries_and_date
}
}
#+end_src
** Pandoc Template
The template is basically unremarkable, but has the same issues that the HTML files have: they need to have their [[id:arcology/arroyo/hydrate][ID links fixed]].
#+begin_src xml :tangle arcology/pandoc/atom.xml
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>$pagetitle$</title>
<link href="http://example.org/"/>
<updated>$date$</updated>
$-- One author element per author: an Atom person construct carries exactly one name.
$for(author)$
<author>
<name>${author}</name>
</author>
$endfor$
<id>{ARCOLOGY_FEED_PAGE}</id>
<link rel="self" type="application/atom+xml"
href="{ARCOLOGY_FEED_PAGE}.xml"/>
$for(entries)$
<entry>
<title type="html">${it.title}</title>
<link href="{ARCOLOGY_FEED_PAGE}#${it.rel}"/>
<id>{ARCOLOGY_FEED_PAGE}#${it.rel}</id>
<updated>${it.pubdate}</updated>
<summary type="html">${it.content}</summary>
</entry>
$endfor$
</feed>
#+end_src
And this can be confirmed to work with e.g. [[id:20220523T154158.992219][The Lion's Rear Site Feed]]:
#+begin_src shell :results verbatim :exports both
pandoc ../thelionsrear_updates.org --lua-filter=arcology/pandoc/make-atom.lua --template=arcology/pandoc/atom.xml -s
#+end_src
#+results:

@ -59,164 +59,6 @@ async def render_page_from_key(request: Request, key: str, engine, site) -> Opti
))
#+end_src
** Arcology's Atom Pandoc Filter + Template
:PROPERTIES:
:ID: arcology/atom-gen
:ROAM_ALIASES: "Arcology Atom Generator"
:END:
This similarly renders an ATOM feed using the [[id:arcology/atom-gen][Arcology Atom Generator]]. Note that =pypandoc= doesn't support user-defined templates, and so i'm shelling out to =pandoc= directly. Probably shouldn't have reached for that thing in the first place! oh well.
#+begin_src python :tangle arcology/routing/util.py :mkdirp yes
import re
from fastapi import Response
from arcology.html import rewrite_html
from arcology.key import ArcologyKey
from arcology.parse import parse_sexp, print_sexp
from arcology.arroyo import Page
#+end_src
This is pretty straightforward, except I stick an [[https://docs.python.org/3/library/functools.html#functools.lru_cache][LRU cache]] in the middle of it so that feed readers aren't constantly invoking Pandoc.
#+begin_src python :tangle arcology/routing/util.py :mkdirp yes
from async_lru import alru_cache
class ExportException(Exception):
    """Raised when the pandoc subprocess exits with a non-zero status.

    Attributes:
        code: exit status reported for the pandoc process.
        stderr: text pandoc wrote to standard error (``None`` if not given).
    """

    code: int
    stderr: str  # None when the raiser did not supply it

    def __init__(self, code, stderr=None):
        # An explicit constructor is required: the built-in exception
        # constructor rejects the keyword arguments used at the raise site.
        super().__init__(code, stderr)
        self.code = code
        self.stderr = stderr


@alru_cache(maxsize=64)
async def export_pandoc(file: str, hash: str) -> str:
    """Render the org document at ``file`` to Atom XML by running pandoc.

    Args:
        file: path of the org document handed to pandoc.
        hash: never read by the body; it is only an argument of the
            ``alru_cache``-decorated call.

    Returns:
        pandoc's standard output, decoded to text.

    Raises:
        ExportException: pandoc exited non-zero, or the ``pandoc``
            executable could not be found (reported as code 127).
    """
    # Argument vector instead of a shell string, so ``file`` is never
    # interpreted by a shell.
    try:
        proc = await asyncio.create_subprocess_exec(
            "pandoc",
            file,
            "--lua-filter=./arcology/pandoc/make-atom.lua",
            "--template=./arcology/pandoc/atom.xml",
            "-s",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except FileNotFoundError as e:
        # A shell would have reported a missing binary as exit status 127.
        raise ExportException(code=127, stderr=str(e)) from e

    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        # Name spelled correctly (was a NameError) and stderr decoded to text.
        raise ExportException(code=proc.returncode, stderr=stderr.decode())
    return stdout.decode()
async def render_feed_from_file(request: Request, file: str, engine, site) -> Optional[Response]:
    """Produce the Atom feed for the org document stored at ``file``.

    Args:
        request: accepted but not used by this function.
        file: path of the org document; used to look up its ``Page`` and as
            the input handed to ``export_pandoc``.
        engine: database engine a ``Session`` is opened on.
        site: accepted but not used by this function.

    Returns:
        Whatever ``hydrate_feed`` returns for the rendered XML.
        NOTE(review): ``hydrate_feed`` is annotated ``-> str`` -- confirm the
        annotation here.

    Raises:
        HTTPException: 404 when ``Page.from_file`` yields ``None``; 500 when
            pandoc fails.
    """
    with Session(engine) as session:
        p = Page.from_file(file, session)
        if p is None:
            raise HTTPException(status_code=404, detail="Feed not found.")

        try:
            xml = await export_pandoc(file, p.hash)
        except ExportException as e:
            # Chain explicitly so the pandoc failure is recorded as the cause.
            raise HTTPException(
                status_code=500,
                detail=f"pandoc exited {e.code} w/ {e.stderr}",
            ) from e

        return hydrate_feed(file, xml, session)
#+end_src
#+begin_src python :tangle arcology/routing/util.py :mkdirp yes
def hydrate_feed(filename: str, xml: str, session) -> str:
    """Fill in the feed's own URL and post-process the rendered XML.

    Args:
        filename: path used to look up the ``Page`` the feed belongs to.
        xml: feed text as produced by pandoc, still containing the
            ``{ARCOLOGY_FEED_PAGE}`` stand-in.
        session: database session, passed to ``Page.from_file`` and
            ``rewrite_html``.

    Returns:
        The XML with every stand-in replaced by the page's URL, after being
        passed through ``rewrite_html``.
    """
    page = Page.from_file(filename, session)
    feed_url = ArcologyKey(page.get_key()).to_url()

    # Literal replacement: avoids re.sub interpreting backslashes or group
    # references inside the URL.
    out_xml = xml.replace("{ARCOLOGY_FEED_PAGE}", feed_url)
    return rewrite_html(out_xml, session)
#+end_src
I had some real trouble figuring out how to get Pandoc to spit out [[https://validator.w3.org/feed/docs/atom.html][ATOM feeds]] and this is not "technically compliant" but I can do it with a combination of a [[https://pandoc.org/lua-filters.html][Lua filter]] which extracts headings' metadata in to variables which a [[https://pandoc.org/MANUAL.html#templates][custom template]] then renders out:
#+begin_src lua :tangle arcology/pandoc/make-atom.lua :mkdirp yes
local utils = require 'pandoc.utils'
local entries = {}
local vars = {}
-- thanks random github users https://gist.github.com/zwh8800/9b0442efadc97408ffff248bc8573064
local epoch = os.time{year=1970, month=1, day=1, hour=0}
-- Turn an org-mode timestamp such as "<2022-05-23 Mon 15:41>" into a number
-- of seconds: os.time of the parsed fields minus the file-level `epoch`.
-- The day name and the HH:MM part are optional; a missing time is taken as
-- 00:00. Raises an error naming the offending value when no date is found.
function parse_org_date(org_date)
  local year, month, day, rest = tostring(org_date):match("(%d+)%-(%d+)%-(%d+)(.*)")
  if not year then
    error("parse_org_date: unrecognized org timestamp: " .. tostring(org_date))
  end
  -- first HH:MM after the date, if any (date-only timestamps have none)
  local hour, minute = rest:match("(%d+)%:(%d+)")
  local timestamp = os.time{
    year = tonumber(year),
    month = tonumber(month),
    day = tonumber(day),
    hour = tonumber(hour) or 0,
    min = tonumber(minute) or 0,
    sec = 0
  } - epoch
  return timestamp
end
rfc3339ish = "%Y-%m-%dT%TZ"
-- Meta handler: makes sure the metadata has a `date` and attaches the
-- collected `entries` table so the template can loop over it.
function set_entries_and_date(meta)
  -- Only fill in a date when the document did not provide one.
  -- NOTE(review): os.date without a leading "!" formats local time while the
  -- format string ends in a literal "Z" -- confirm the host runs in UTC.
  if meta.date == nil then
    meta.date = os.date(rfc3339ish)
  end
  meta.entries = entries
  return meta
end
-- Blocks handler: sections the block list with make_sections and appends one
-- record to the file-level `entries` table for every section Div that has a
-- `pubdate` attribute. The walked result is discarded and nothing is
-- returned, so this handler only collects data.
extract_entries = function (blocks)
  pandoc.utils.make_sections(true, nil, blocks):walk
  {
    Div = function (div)
      if div.attributes.pubdate then
        -- first element of the section Div is taken to be its heading
        local header = div.content[1]
        -- drop bookkeeping attributes before the section is used as content
        header.attributes.pubdate = nil
        header.attributes.number = nil
        div.attributes.number = nil
        -- `local` keeps the record from leaking into the global environment
        local entry = {
          content=div,
          title=header.content,
          rel=div.attributes.id,
          pubdate=os.date(rfc3339ish, parse_org_date(div.attributes.pubdate))
        }
        table.insert(entries, entry)
      end
    end
  }
end
return {
{
Blocks = extract_entries,
Meta = set_entries_and_date
}
}
#+end_src
The template is basically unremarkable, but has the same issues that the HTML files have: they need to have their [[id:arcology/arroyo/hydrate][ID links fixed]].
#+begin_src xml :tangle arcology/pandoc/atom.xml
<?xml version="1.0" encoding="utf-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
<title>$pagetitle$</title>
<link href="http://example.org/"/>
<updated>$date$</updated>
<author>
$for(author)$
<name>${author}</name>
$endfor$
</author>
<id>{ARCOLOGY_FEED_PAGE}</id>
<link rel="self" type="application/atom+xml"
href="{ARCOLOGY_FEED_PAGE}.xml"/>
$for(entries)$
<entry>
<title type="html">${it.title}</title>
<link href="{ARCOLOGY_FEED_PAGE}#${it.rel}"/>
<id>{ARCOLOGY_FEED_PAGE}#${it.rel}</id>
<updated>${it.pubdate}</updated>
<summary type="html">${it.content}</summary>
</entry>
$endfor$
</feed>
#+end_src
And this can be confirmed to work with e.g. [[id:20220523T154158.992219][The Lion's Rear Site Feed]]:
#+begin_src shell :results verbatim :exports both
pandoc ../thelionsrear_updates.org --lua-filter=arcology/pandoc/make-atom.lua --template=arcology/pandoc/atom.xml -s
#+end_src
* NEXT This routing split between local and prod doesn't work because the routes aren't domain aware. and very greedy.
* [#A] Arcology Domain-Aware Routing
:PROPERTIES:
@ -236,7 +78,8 @@ from sqlmodel import Session
from fastapi.responses import HTMLResponse, FileResponse, Response
import arcology.sites as sites
from arcology.routing.util import render_page_from_key, render_feed_from_file
from arcology.routing.util import render_page_from_key
from arcology.feeds import render_feed_from_file
from arcology.sites import host_to_site
from arcology.arroyo import engine
from arcology.config import get_settings
@ -250,7 +93,7 @@ def decorate_app(app: FastAPI) -> FastAPI:
async def robots_txt(request: Request):
return "arcology/static/robots.txt"
@app.get("/{sub_key:path}.xml", response_class=Response, name="base-route")
@app.get("/{sub_key:path}.xml", response_class=Response, name="feed-route")
async def feed_route(request: Request, sub_key: str):
sub_key += ".xml" # dark laughter
return Response(content=(await public_router(request, sub_key)), media_type="application/atom+xml")

@ -0,0 +1,56 @@
import re
from fastapi import Response, HTTPException
import asyncio
from sqlmodel import Session
from async_lru import alru_cache
from typing import Optional
from arcology.html import rewrite_html
from arcology.key import ArcologyKey
from arcology.parse import parse_sexp, print_sexp
from arcology.arroyo import Page
class ExportException(Exception):
    """Raised when the pandoc subprocess exits with a non-zero status.

    Derives from ``Exception`` (not ``BaseException``) so that generic
    ``except Exception`` handlers treat it like any other application error.

    Attributes:
        code: exit status reported for the pandoc process.
        stderr: text pandoc wrote to standard error, or ``None`` when the
            raiser did not supply it.
    """

    code: int
    stderr: Optional[str]

    def __init__(self, code, stderr=None):
        # Forward both values so ``args`` mirrors the constructor signature
        # and str(exc) is not empty in tracebacks and logs.
        super().__init__(code, stderr)
        self.code = code
        self.stderr = stderr
@alru_cache(maxsize=64)
async def export_pandoc(file: str, hash: str) -> str:
    """Render the org document at ``file`` to Atom XML by running pandoc.

    Args:
        file: path of the org document handed to pandoc.
        hash: never read by the body; it is only an argument of the
            ``alru_cache``-decorated call, so a different hash for the same
            file is a different cached call.

    Returns:
        pandoc's standard output, decoded to text.

    Raises:
        ExportException: pandoc exited non-zero, or the ``pandoc``
            executable could not be found (reported as code 127).
    """
    # Pass an argument vector instead of interpolating ``file`` into a shell
    # string: a path with spaces or shell metacharacters then reaches pandoc
    # verbatim and cannot be interpreted by a shell.
    try:
        proc = await asyncio.create_subprocess_exec(
            "pandoc",
            file,
            "--lua-filter=./arcology/pandoc/make-atom.lua",
            "--template=./arcology/pandoc/atom.xml",
            "-s",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except FileNotFoundError as e:
        # A shell would have reported a missing binary as exit status 127;
        # keep surfacing that case through the same exception type.
        raise ExportException(code=127, stderr=str(e)) from e

    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        raise ExportException(code=proc.returncode, stderr=stderr.decode())
    return stdout.decode()
async def render_feed_from_file(_request, file: str, engine, site) -> Optional[Response]:
    """Produce the Atom feed for the org document stored at ``file``.

    Args:
        _request: accepted but not used by this function.
        file: path of the org document; used to look up its ``Page`` and as
            the input handed to ``export_pandoc``.
        engine: database engine a ``Session`` is opened on.
        site: accepted but not used by this function.

    Returns:
        Whatever ``hydrate_feed`` returns for the rendered XML.
        NOTE(review): ``hydrate_feed`` is annotated ``-> str`` while this
        function is annotated ``Optional[Response]`` -- confirm which is meant.

    Raises:
        HTTPException: 404 when ``Page.from_file`` yields ``None``; 500 when
            pandoc fails.
    """
    with Session(engine) as session:
        p = Page.from_file(file, session)
        if p is None:
            raise HTTPException(status_code=404, detail="Feed not found.")

        try:
            # The page hash is passed along as the second argument of the
            # cached export call.
            xml = await export_pandoc(file, p.hash)
        except ExportException as e:
            # Chain explicitly so the pandoc failure is recorded as the cause.
            raise HTTPException(
                status_code=500,
                detail=f"pandoc exited {e.code} w/ {e.stderr}",
            ) from e

        return hydrate_feed(file, xml, session)
def hydrate_feed(filename: str, xml: str, session) -> str:
    """Fill in the feed's own URL and post-process the rendered XML.

    Args:
        filename: path used to look up the ``Page`` the feed belongs to.
        xml: feed text as produced by pandoc, still containing the
            ``{ARCOLOGY_FEED_PAGE}`` stand-in.
        session: database session, passed to ``Page.from_file`` and
            ``rewrite_html``.

    Returns:
        The XML with every stand-in replaced by the page's URL, after being
        passed through ``rewrite_html``.
    """
    page = Page.from_file(filename, session)
    feed_url = ArcologyKey(page.get_key()).to_url()

    # The stand-in is a literal, not a pattern: plain str.replace avoids
    # re.sub interpreting backslashes or group references inside the URL.
    out_xml = xml.replace("{ARCOLOGY_FEED_PAGE}", feed_url)

    # The HTML rewriter is run over the XML feed; the author flags this as
    # "lol dangerous".
    return rewrite_html(out_xml, session)

@ -4,7 +4,8 @@ from sqlmodel import Session
from fastapi.responses import HTMLResponse, FileResponse, Response
import arcology.sites as sites
from arcology.routing.util import render_page_from_key, render_feed_from_file
from arcology.routing.util import render_page_from_key
from arcology.feeds import render_feed_from_file
from arcology.sites import host_to_site
from arcology.arroyo import engine
from arcology.config import get_settings
@ -18,7 +19,7 @@ def decorate_app(app: FastAPI) -> FastAPI:
async def robots_txt(request: Request):
return "arcology/static/robots.txt"
@app.get("/{sub_key:path}.xml", response_class=Response, name="base-route")
@app.get("/{sub_key:path}.xml", response_class=Response, name="feed-route")
async def feed_route(request: Request, sub_key: str):
sub_key += ".xml" # dark laughter
return Response(content=(await public_router(request, sub_key)), media_type="application/atom+xml")

@ -30,53 +30,3 @@ async def render_page_from_key(request: Request, key: str, engine, site) -> Opti
backlink=backlink,
request=request,
))
import re
from fastapi import Response
from arcology.html import rewrite_html
from arcology.key import ArcologyKey
from arcology.parse import parse_sexp, print_sexp
from arcology.arroyo import Page
from async_lru import alru_cache
class ExportException(Exception):
    """Raised when the pandoc subprocess exits with a non-zero status.

    Attributes:
        code: exit status reported for the pandoc process.
        stderr: text pandoc wrote to standard error (``None`` if not given).
    """

    code: int
    stderr: str  # None when the raiser did not supply it

    def __init__(self, code, stderr=None):
        # An explicit constructor is required: the built-in exception
        # constructor rejects the keyword arguments used at the raise site.
        super().__init__(code, stderr)
        self.code = code
        self.stderr = stderr


@alru_cache(maxsize=64)
async def export_pandoc(file: str, hash: str) -> str:
    """Render the org document at ``file`` to Atom XML by running pandoc.

    Args:
        file: path of the org document handed to pandoc.
        hash: never read by the body; it is only an argument of the
            ``alru_cache``-decorated call.

    Returns:
        pandoc's standard output, decoded to text.

    Raises:
        ExportException: pandoc exited non-zero, or the ``pandoc``
            executable could not be found (reported as code 127).
    """
    # Argument vector instead of a shell string, so ``file`` is never
    # interpreted by a shell.
    try:
        proc = await asyncio.create_subprocess_exec(
            "pandoc",
            file,
            "--lua-filter=./arcology/pandoc/make-atom.lua",
            "--template=./arcology/pandoc/atom.xml",
            "-s",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except FileNotFoundError as e:
        # A shell would have reported a missing binary as exit status 127.
        raise ExportException(code=127, stderr=str(e)) from e

    stdout, stderr = await proc.communicate()
    if proc.returncode != 0:
        # Name spelled correctly (was a NameError) and stderr decoded to text.
        raise ExportException(code=proc.returncode, stderr=stderr.decode())
    return stdout.decode()
async def render_feed_from_file(request: Request, file: str, engine, site) -> Optional[Response]:
    """Produce the Atom feed for the org document stored at ``file``.

    Args:
        request: accepted but not used by this function.
        file: path of the org document; used to look up its ``Page`` and as
            the input handed to ``export_pandoc``.
        engine: database engine a ``Session`` is opened on.
        site: accepted but not used by this function.

    Returns:
        Whatever ``hydrate_feed`` returns for the rendered XML.
        NOTE(review): ``hydrate_feed`` is annotated ``-> str`` -- confirm the
        annotation here.

    Raises:
        HTTPException: 404 when ``Page.from_file`` yields ``None``; 500 when
            pandoc fails.
    """
    with Session(engine) as session:
        p = Page.from_file(file, session)
        if p is None:
            raise HTTPException(status_code=404, detail="Feed not found.")

        try:
            xml = await export_pandoc(file, p.hash)
        except ExportException as e:
            # Chain explicitly so the pandoc failure is recorded as the cause.
            raise HTTPException(
                status_code=500,
                detail=f"pandoc exited {e.code} w/ {e.stderr}",
            ) from e

        return hydrate_feed(file, xml, session)
def hydrate_feed(filename: str, xml: str, session) -> str:
    """Fill in the feed's own URL and post-process the rendered XML.

    Args:
        filename: path used to look up the ``Page`` the feed belongs to.
        xml: feed text as produced by pandoc, still containing the
            ``{ARCOLOGY_FEED_PAGE}`` stand-in.
        session: database session, passed to ``Page.from_file`` and
            ``rewrite_html``.

    Returns:
        The XML with every stand-in replaced by the page's URL, after being
        passed through ``rewrite_html``.
    """
    page = Page.from_file(filename, session)
    feed_url = ArcologyKey(page.get_key()).to_url()

    # Literal replacement: avoids re.sub interpreting backslashes or group
    # references inside the URL.
    out_xml = xml.replace("{ARCOLOGY_FEED_PAGE}", feed_url)
    return rewrite_html(out_xml, session)

Loading…
Cancel
Save