329 lines
12 KiB
Org Mode
329 lines
12 KiB
Org Mode
:PROPERTIES:
|
|
:ID: arcology/atom-gen
|
|
:ROAM_ALIASES: "Arcology Feed Generator" "Arcology Atom Generator"
|
|
:END:
|
|
#+TITLE: Arcology's Atom Pandoc Filter + Template
|
|
#+filetags: :Project:
|
|
#+AUTO_TANGLE: t
|
|
#+ARCOLOGY_KEY: arcology/feed-gen
|
|
#+ARCOLOGY_ALLOW_CRAWL: t
|
|
|
|
This module renders an [[https://en.wikipedia.org/wiki/Atom_(web_standard)][ATOM feed]]. It's possible for any page in the Arcology now to define an =#+ARCOLOGY_FEED= keyword, and in doing so create a new route in the [[id:20220225T175638.482695][Arcology Public Router]] which will render an Atom feed. The semantics of the feed more-or-less follow the expectations defined in =ox-rss=: Any heading with an =ID= property and a =PUBDATE= property with an org-mode active timestamp in it will be published to the feed. Any entry with an ID will have a =PUBDATE= added to it by invoking =(org-rss-add-pubdate-property)=.
|
|
|
|
* Invoking Pandoc for the Feed Generator
|
|
|
|
To get an ATOM feed for an org document, it's easy enough to invoke =render_feed_from_file=
|
|
|
|
I'm shelling out to =pandoc= directly. Probably shouldn't have reached for that thing in the first place! oh well.
|
|
|
|
#+begin_src python :tangle arcology/feeds.py :mkdirp yes
|
|
import re
|
|
from fastapi import Response, HTTPException
|
|
|
|
import asyncio
|
|
from sqlmodel import Session
|
|
from async_lru import alru_cache
|
|
from typing import Optional
|
|
|
|
from arcology.html import rewrite_html
|
|
from arcology.key import ArcologyKey
|
|
from arcology.parse import parse_sexp, print_sexp
|
|
from arcology.arroyo import Page
|
|
#+end_src
|
|
|
|
This is pretty straightforward, except I stick an [[https://docs.python.org/3/library/functools.html#functools.lru_cache][LRU cache]] in the middle of it so that feed readers aren't constantly invoking Pandoc.
|
|
|
|
#+begin_src python :tangle arcology/feeds.py :mkdirp yes
|
|
class ExportException(Exception):
    """Raised when the ``pandoc`` subprocess exits with a non-zero status.

    Derives from ``Exception`` (not ``BaseException``) so that generic
    ``except Exception`` handlers and framework error middleware see it like
    any other application error; ``except ExportException`` keeps working.

    Attributes:
        code: exit status reported by the subprocess.
        stderr: decoded standard-error output, or ``None`` when not supplied.
    """

    code: int
    stderr: Optional[str]

    def __init__(self, code, stderr=None):
        # Forward both values so ``args``/``repr`` carry the failure details.
        super().__init__(code, stderr)
        self.code = code
        self.stderr = stderr
|
|
|
|
@alru_cache(maxsize=64)
async def export_pandoc(file: str, hash: str) -> str:
    """Run ``pandoc`` over ``file`` and return the rendered Atom XML.

    Args:
        file: path of the org document handed to pandoc.
        hash: not used in the body; it is part of the ``alru_cache`` key, so a
            different hash for the same file produces a fresh pandoc run.

    Returns:
        The decoded standard output of pandoc.

    Raises:
        ExportException: pandoc exited with a non-zero status; carries the
            exit code and the decoded standard error.
    """
    # Pass an argument vector instead of a shell command line: the path is
    # never parsed by a shell, so spaces or metacharacters in ``file`` can
    # neither break the invocation nor inject extra commands.
    proc = await asyncio.create_subprocess_exec(
        "pandoc",
        file,
        "--lua-filter=./arcology/pandoc/make-atom.lua",
        "--template=./arcology/pandoc/atom.xml",
        "-s",
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE)
    stdout, stderr = await proc.communicate()
    if proc.returncode == 0:
        return stdout.decode()
    raise ExportException(code=proc.returncode, stderr=stderr.decode())
|
|
|
|
async def render_feed_from_file(_request, file: str, engine, site) -> Optional[Response]:
    """Render the Atom feed for the org document at ``file``.

    Looks the page up in the database, exports it through pandoc (keyed on
    the page hash) and passes the XML through ``hydrate_feed``.

    ``_request`` and ``site`` are accepted but not used in this function.

    Returns:
        Whatever ``hydrate_feed`` returns for the exported XML.
        NOTE(review): ``hydrate_feed`` is annotated as returning ``str`` while
        this signature says ``Optional[Response]`` -- confirm which is intended.

    Raises:
        HTTPException: 404 when no page exists for ``file``; 500 when pandoc
            fails, with the exit code and stderr in the detail.
    """
    with Session(engine) as session:
        page = Page.from_file(file, session)
        if page is None:
            raise HTTPException(status_code=404, detail="Feed not found.")

        try:
            feed_xml = await export_pandoc(file, page.hash)
        except ExportException as err:
            raise HTTPException(status_code=500, detail=f"pandoc exited {err.code} w/ {err.stderr}")

        # The session is still open here; hydrate_feed needs it for lookups.
        return hydrate_feed(file, feed_xml, session)
|
|
#+end_src
|
|
|
|
The feed is more-or-less ready as-is when it comes out of Pandoc except for the final "canonical" URL -- an =re.sub= invocation will replace a stand-in variable with the correct URL. I could probably inject this in to the Pandoc invocation as a metadata variable but this is Good Enough.
|
|
|
|
#+begin_src python :tangle arcology/feeds.py :mkdirp yes
|
|
def hydrate_feed(filename: str, xml: str, session) -> str:
    """Fill in the feed's canonical URL and rewrite its embedded links.

    Args:
        filename: path of the org document the feed was generated from.
        xml: Atom XML as produced by the pandoc template, still containing
            the ``{ARCOLOGY_FEED_PAGE}`` stand-in.
        session: database session used for the page lookup and passed on to
            ``rewrite_html``.

    Returns:
        The XML with every stand-in replaced by the page's URL and then
        passed through ``rewrite_html``.
    """
    page = Page.from_file(filename, session)
    feed_url = ArcologyKey(page.get_key()).to_url()

    # Use a replacement *function* so the URL is inserted verbatim; a plain
    # replacement string would have backslashes and ``\g<...>`` interpreted
    # by ``re.sub``. ``re.escape`` makes the literal match explicit.
    out_xml = re.sub(re.escape('{ARCOLOGY_FEED_PAGE}'), lambda _match: feed_url, xml)
    out_xml = rewrite_html(out_xml, session) # lol dangerous
    return out_xml
|
|
#+end_src
|
|
|
|
* Rendering Atom from Org in Pandoc in two steps
|
|
|
|
I had some real trouble figuring out how to get Pandoc to spit out [[https://validator.w3.org/feed/docs/atom.html][ATOM feeds]] and this is not "technically compliant" but I can do it with a combination of a [[https://pandoc.org/lua-filters.html][Lua filter]] which extracts headings' metadata in to variables which a [[https://pandoc.org/MANUAL.html#templates][custom template]] then renders out:
|
|
|
|
** Lua Filter
|
|
|
|
#+begin_src lua :tangle arcology/pandoc/make-atom.lua :mkdirp yes
|
|
local utils = require 'pandoc.utils'
|
|
|
|
-- Filter-wide state shared by the handlers in this file:
--   entries   : feed entries appended by extract_entries
--   keywords  : tags collected by set_entries_and_date
--   variables : org keyword name -> value, filled by set_meta_from_raw
local entries, keywords, variables = {}, {}, {}
|
|
|
|
--- RawBlock handler: remember `#+NAME: value` org keywords for later.
-- Only raw blocks whose format is 'org' are inspected. A matching name/value
-- pair is stored in the file-level `variables` table. The name part is
-- matched with `%w+`, i.e. alphanumeric characters only.
-- @param raw the RawBlock element (its `format` and `text` fields are read)
-- @return nothing useful; the block is never replaced
function set_meta_from_raw(raw)
  if raw.format == 'org' then
    local key, val = string.match(raw.text, '#%+(%w+):%s*(.+)$')
    if key and val then
      variables[key] = val
    end
    return
  end
  -- Not org markup: leave the block alone.
  return nil
end
|
|
|
|
-- thanks random github users https://gist.github.com/zwh8800/9b0442efadc97408ffff248bc8573064
|
|
--- Convert an org timestamp into seconds since the Unix epoch.
-- Accepts `<YYYY-MM-DD Day HH:MM>` (angle brackets and day name optional) as
-- well as date-only stamps such as `<YYYY-MM-DD Day>`, which are taken to
-- mean 00:00. Org timestamps carry no zone, so the fields are interpreted in
-- the local timezone of the pandoc process (that is what `os.time` does with
-- a date table).
-- @param org_date string holding the org timestamp
-- @return number of seconds since the epoch
-- Raises an error when no date can be found in `org_date`.
function parse_org_date(org_date)
  local year, month, day = org_date:match("(%d+)%-(%d+)%-(%d+)")
  if not year then
    error("unparseable org timestamp: " .. tostring(org_date), 2)
  end
  -- The time of day is optional; a missing time falls back to midnight.
  local hour, minute = org_date:match("%s(%d+):(%d+)")
  return os.time{
    year = tonumber(year),
    month = tonumber(month),
    day = tonumber(day),
    hour = tonumber(hour or 0),
    min = tonumber(minute or 0),
    sec = 0,
  }
end

-- The leading "!" makes os.date format in UTC, which is what the trailing
-- "Z" in the output claims. Without it the local wall-clock time would be
-- emitted but labelled as UTC.
rfc3339ish = "!%Y-%m-%dT%TZ"
|
|
|
|
--- Meta handler: publish the collected state into the document metadata.
-- Copies every remembered org keyword from `variables` into the metadata,
-- splits `FILETAGS` (colon separated) and `keywords` (comma/space separated)
-- into the shared `keywords` list, defaults `date` to the current time, and
-- finally exposes `entries` and `keywords` to the template.
-- @param m the document Meta
-- @return the modified Meta
set_entries_and_date = function(m)
  for name, value in pairs(variables) do
    m[name] = value
  end

  if m["FILETAGS"] ~= nil then
    -- `+` yields only non-empty tags, so no empty-string filtering is needed.
    for tag in utils.stringify(m["FILETAGS"]):gmatch("[^:]+") do
      table.insert(keywords, tag)
    end
  end

  if m["keywords"] ~= nil then
    for tag in utils.stringify(m["keywords"]):gmatch("[^, ]+") do
      table.insert(keywords, tag)
    end
  end

  if m.date == nil then
    m.date = os.date(rfc3339ish) --current time in iso8601/rfc3339
  end

  m.entries = entries
  -- Overwrites any `keywords` metadata with the collected list (which already
  -- includes the tags parsed from it above).
  m.keywords = keywords
  return m
end
|
|
|
|
--- Blocks handler: collect one feed entry per section that has a `pubdate`.
-- The blocks are grouped into section Divs with `make_sections`, and every
-- Div carrying a `pubdate` attribute is appended to the file-level `entries`
-- list. The document itself is not modified (nothing is returned).
-- Each entry has: `content` (the section Div), `title` (stringified header),
-- `rel` (the Div's `id` attribute), `pubdate` (formatted with `rfc3339ish`)
-- and, when the header contains tag spans, `categories`.
-- @param blocks the Blocks list being filtered
extract_entries = function (blocks)
  pandoc.utils.make_sections(true, nil, blocks):walk {
    Div = function (div)
      if not div.attributes.pubdate then
        return nil
      end

      local header = div.content[1]
      -- A Div with a pubdate but no leading Header cannot produce a title;
      -- skip it instead of failing on a nil/foreign element.
      if not header or header.tag ~= "Header" then
        return nil
      end

      -- Strip bookkeeping attributes so they do not leak into the output.
      header.attributes.pubdate = nil
      header.attributes.number = nil
      div.attributes.number = nil

      local title = utils.stringify(header.content)

      -- Gather the values of `tag-name` attributes on Spans in the heading.
      local header_tags = {}
      for _, inline in ipairs(header.content) do
        if inline.tag == "Span" and inline.attributes["tag-name"] ~= nil then
          table.insert(header_tags, inline.attributes["tag-name"])
        end
      end

      local entry = {
        content = div,
        title = title,
        rel = div.attributes.id,
        pubdate = os.date(rfc3339ish, parse_org_date(div.attributes.pubdate)),
      }
      if #header_tags > 0 then
        entry["categories"] = header_tags
      end
      table.insert(entries, entry)
    end
  }
end
|
|
|
|
-- A single filter table. The order of keys in a Lua table constructor has no
-- effect; they are listed here in the order pandoc's default typewise
-- traversal is documented to apply them (block elements, then Blocks, then
-- Meta), which is what lets the Meta handler see the collected data.
return {
  {
    RawBlock = set_meta_from_raw,
    Blocks = extract_entries,
    Meta = set_entries_and_date,
  },
}
|
|
#+end_src
|
|
|
|
** Pandoc Template
|
|
|
|
The template is basically unremarkable, but has the same issues that the HTML files have: they need to have their [[id:arcology/arroyo/hydrate][ID links fixed]].
|
|
|
|
#+begin_src xml :tangle arcology/pandoc/atom.xml
|
|
<?xml version="1.0" encoding="utf-8"?>
|
|
<feed xmlns="http://www.w3.org/2005/Atom">
|
|
|
|
<title>$pagetitle$</title>
|
|
<link href="{ARCOLOGY_FEED_PAGE}"/>
|
|
<updated>$date$</updated>
|
|
<author>
|
|
$for(author)$
|
|
<name>${author}</name>
|
|
$endfor$
|
|
</author>
|
|
<id>{ARCOLOGY_FEED_PAGE}</id>
|
|
<link rel="self" type="application/atom+xml"
|
|
href="{ARCOLOGY_FEED_PAGE}.xml"/>
|
|
|
|
$for(entries)$
|
|
<entry>
|
|
<title type="html">${it.title}</title>
|
|
<link href="{ARCOLOGY_FEED_PAGE}#${it.rel}"/>
|
|
<id>{ARCOLOGY_FEED_PAGE}#${it.rel}</id>
|
|
<updated>${it.pubdate}</updated>
|
|
$for(it.categories)$
|
|
<category term="${it}" />
|
|
$endfor$
|
|
$for(keywords)$
|
|
<category term="${it}" />
|
|
$endfor$
|
|
<summary type="html">${it.content}</summary>
|
|
</entry>
|
|
$endfor$
|
|
|
|
</feed>
|
|
#+end_src
|
|
|
|
And this can be confirmed to work with e.g. [[id:20220523T154158.992219][The Lion's Rear Site Feed]]:
|
|
|
|
#+begin_src shell :results verbatim :exports both
|
|
pandoc ../thelionsrear_updates.org --lua-filter=arcology/pandoc/make-atom.lua --template=arcology/pandoc/atom.xml -s
|
|
#+end_src
|
|
|
|
Notice that it still has to go through the process of [[id:arcology/arroyo/hydrate][Rewriting and Hydrating]] that the HTML docs have to go through so that links and whatnot work. This is handled in =hydrate_feed= above.
|
|
|
|
* Listing the Arcology Feeds
|
|
|
|
Since the feeds exist in the [[id:arroyo/system-cache][Arroyo Cache]] K/V/F store, they can be extracted to shove in to the <head> for example.
|
|
|
|
This is poor data modeling, however, and it's likely that we will benefit from an [[id:arcology/arroyo-page][Arroyo Arcology Generator]] which extracts =ARCOLOGY_FEED= entities to associate them to the page/file they're embedded in.
|
|
|
|
#+begin_src python :tangle arcology/feeds.py
|
|
from typing import List
|
|
|
|
from sqlmodel import select, Session
|
|
|
|
from arcology.arroyo import Keyword
|
|
from arcology.parse import parse_sexp
|
|
from arcology.key import ArcologyKey
|
|
#+END_SRC
|
|
|
|
These helpers prepare the data for =make_feed_entries=. =get_feed_keys= will return the list of all =ARCOLOGY_FEED= routing keys, and =get_feed_files= returns the files associated with those keys.
|
|
|
|
#+begin_src python :tangle arcology/feeds.py :noweb yes
|
|
def arcology_feed_q():
    """Build the query selecting every ``Keyword`` row for ``ARCOLOGY_FEED``.

    The compared value includes the double-quote characters themselves.
    """
    is_feed_keyword = Keyword.keyword == '"ARCOLOGY_FEED"'
    return select(Keyword).where(is_feed_keyword)
|
|
|
|
def get_feed_keys(session) -> List[str]:
    """Return the parsed ``value`` of every ``ARCOLOGY_FEED`` keyword row."""
    feed_keys = []
    for keyword_row in session.exec(arcology_feed_q()):
        feed_keys.append(parse_sexp(keyword_row.value))
    return feed_keys
|
|
|
|
def get_feed_files(session) -> List[str]:
    """Return the parsed ``file`` of every ``ARCOLOGY_FEED`` keyword row."""
    feed_files = []
    for keyword_row in session.exec(arcology_feed_q()):
        feed_files.append(parse_sexp(keyword_row.file))
    return feed_files
|
|
#+END_SRC
|
|
=make_feed_entries= exposes just why the data model is a bit weak.
|
|
|
|
We have to build the mapping using the return of =get_feed_files= so that the feeds' pages' titles can be applied in the final return value.
|
|
|
|
We use the =site_key= to make sure it's filtered to only show feeds related to the current [[id:20211219T144255.001827][Arcology Site]]. It's certainly simpler to show *all* feeds for *all* sites, but in the future I may want to have sites which are at least somewhat hidden, and so showing them in the global feed discovery mechanism is quite a silly thing to build in. If the site keys don't match, the title isn't added to the dict...
|
|
|
|
#+begin_src python :noweb-ref populateDict
|
|
# Map each feed file to its page title, restricted to the requested site.
feed_page_titles = dict() # file -> title
for feed_file in get_feed_files(session):
    p = Page.from_file(feed_file, session)
    if p is None:
        # A feed keyword can point at a file with no Page; skip it rather
        # than fail the whole listing on a None dereference.
        continue
    if p.get_site().key == site_key:
        feed_page_titles[feed_file] = p.get_title()
|
|
#+end_src
|
|
|
|
If the file isn't set in the =feed_page_titles= dict, we know that it's been skipped. The feed URL is generated using [[id:arcology/arroyo/key][arcology.key.ArcologyKey]], and the title and URL are added to the return list in a tuple.
|
|
|
|
#+begin_src python :noweb-ref populateRetVal
|
|
# Collect (feed_url, title) tuples for every feed whose file has a title
# recorded in feed_page_titles; files without one are skipped.
ret = []
for feed_key in get_feed_keys(session):
    feed_url = ArcologyKey(feed_key).to_url()
    feed_file = Keyword.get("ARCOLOGY_FEED", feed_key, session=session).filename()
    title = feed_page_titles.get(feed_file, None)
    if title:
        ret.append((feed_url, title))
|
|
#+end_src
|
|
|
|
Splat!
|
|
|
|
#+begin_src python :tangle arcology/feeds.py :noweb yes
|
|
def make_feed_entries(site_key: str, session):
    """List the feeds whose page belongs to the site identified by ``site_key``.

    The body is assembled from two noweb fragments: ``populateDict`` builds a
    file -> title mapping for feed pages on the given site, and
    ``populateRetVal`` builds ``ret``, a list of ``(feed_url, title)`` tuples
    for the feeds whose file appears in that mapping.

    Returns:
        The ``ret`` list built by ``populateRetVal``.
    """

    <<populateDict>>

    <<populateRetVal>>

    return ret
|
|
#+end_src
|
|
|
|
** INPROGRESS [[id:arcology/arroyo-page][Arroyo Arcology Generator]] for =ARCOLOGY_FEED= keys
|
|
:PROPERTIES:
|
|
:ID: 20221228T183435.210299
|
|
:END:
|
|
:LOGBOOK:
|
|
- State "INPROGRESS" from "NEXT" [2023-01-24 Tue 22:25]
|
|
:END:
|
|
|
|
All of this becomes much simpler with a [[id:arcology/arroyo-page][Arroyo Arcology Generator]] schema like, maybe, this:
|
|
|
|
#+begin_src elisp
|
|
(arcology-feeds
|
|
[(file :not-null)
|
|
(key :not-null)
|
|
(title :not-null)
|
|
(site :not-null)
|
|
(post-visibility :not-null)
|
|
(hash :not-null)])
|
|
#+end_src
|
|
|
|
then things like =select(Feed.key, Feed.title).where(Feed.site == "lionsrear")= are trivial.
|
|
|
|
|
|
Port this code to use [[id:arcology/arroyo/feed][arcology.arroyo.Feed]], something like:
|
|
|
|
select(arroyo.Page).where(arroyo.Page.site_key==site_key) -> file
|
|
select(arroyo.Feed).where(arroyo.Feed.file==file)
|