Compare commits

...

2 Commits

Author SHA1 Message Date
Ryan Rix 9afba3d788 provide a 404 page and wire it up to the Native 404/unpublished pages 2024-02-17 13:14:10 -08:00
Ryan Rix 0ba1c6c683 add some more user agents to classify 2024-02-17 13:12:38 -08:00
6 changed files with 202 additions and 45 deletions

View File

@ -541,7 +541,8 @@ from arcology import views
urlpatterns = [
path("admin/", admin.site.urls),
path("", views.index),
path("robots.txt", views.robots),
path("robots.txt", views.robots, name="robots_txt"),
path("404", views.unpublished, name="page_not_found"),
path("sitemap", views.sitemap, name="sitemap"),
path("sites.css", views.site_css, name="site-css"),
path("feeds.json", views.feed_list, name="feed-list"),
@ -554,7 +555,7 @@ urlpatterns = [
#+begin_src python :tangle arcology/views.py
import logging
from django.http import HttpResponse, HttpResponseNotFound, Http404
from django.http import HttpResponse, HttpResponseNotFound
from django.shortcuts import render, get_object_or_404
from arcology.models import Page, Feed, Site
@ -610,6 +611,7 @@ page_counter = Counter("arcology_page", "Hit counter for each page", ["site", "p
render_latency = Histogram("arcology_page_render_seconds", "Latency for render_page func.", ["page", "site", "agent_type"])
from arcology.agent_utils import AgentClassification
from django.template import loader
def render_page(request, site, full_key):
agent = AgentClassification.from_request(request)
@ -619,7 +621,13 @@ def render_page(request, site, full_key):
the_page = Page.objects.get(route_key=full_key)
except Page.DoesNotExist:
page_counter.labels(page=full_key, status=404, site=site.key, agent_type=agent).inc()
raise Http404
template = loader.get_template("404.html")
context = dict(
missing_key=full_key
)
return HttpResponseNotFound(
template.render(context, request)
)
links = the_page.collect_links()
page_html = the_page.to_html(links)
@ -714,6 +722,38 @@ The main =content= block contains the =<main>= generated by the native parser, a
{% endblock %}
#+end_src
Here's a really simple 404 template, too.
#+begin_src jinja2 :tangle arcology/templates/404.html
{# 404/unpublished page: extends the site shell, explains why the page is
   unavailable, and echoes the missing route key passed as `missing_key`. #}
{% extends "arcology/app.html" %}
{% block title %}Page Not Found{% endblock %}
{% block h1 %}<h1>Page Not Found</h1>{% endblock %}
{% block content %}
<section>
<p>
The page you tried to open either has not been written by the
author or the author has chosen to not publish it at this
time. Please contact the author and include the URL of both the
page you clicked the link on, as well as the link you&apos;d like
to read. You may just want
to <a href="javascript:history.back()">Go Back</a>, too.
</p>
<p>
If you&apos;re interested in a particular reference, you might of
course have more luck using a public search engine
like <a href="https://duckduckgo.com">DuckDuckGo</a>
or <a href="https://kagi.com">Kagi</a>.
</p>
<pre>MISSING KEY = {{ missing_key }}</pre>
</section>
{% endblock %}
#+end_src
*** Org Page-specific CSS Stylings
Most of the page CSS is defined below, but the content CSS is here, nearer the actual implementation of the flexbox:
@ -781,7 +821,6 @@ Here are some [[https://medium.com/@massimo.cassandro/flexbox-separators-b284d6d
And some simple image wrangling:
** INPROGRESS Atom Feed Handler
:PROPERTIES:
:ID: 20240204T234814.612917
@ -890,6 +929,26 @@ def sitemap(request):
return HttpResponse(b"sitemap")
#+end_src
** NEXT unpublished/not found endpoint
There are plenty of links inside the Arcology which aren't meant to be clicked. =roam:= stub links, of course, will land here when their targets are unpublished.
#+begin_src python :tangle arcology/views.py
def unpublished(request):
    """Render the 404/unpublished page.

    The optional ``key`` query parameter carries the route key of the missing
    page so it can be echoed back in the template for debugging/reporting.
    Always returns an HTTP 404 response.
    """
    # QueryDict.get takes a default directly; no need for a manual None check.
    key = request.GET.get("key", "NOT_SUPPLIED")
    # query links etc to create a JSON doc for SigmaJS
    template = loader.get_template("404.html")
    context = dict(
        missing_key=key
    )
    return HttpResponseNotFound(
        template.render(context, request)
    )
#+end_src
** =robots.txt= Endpoint
- Disallow all GPT-alikes on all pages, I will add more to this list as necessary. Probably will pull these into [[id:arcology/django/config][Arcology Project Configuration]] sooner or later.

View File

@ -17,6 +17,7 @@ class AgentClassification(str, Enum):
FEED = "feed"
BOT = "bot"
AUTOMATION = "automation"
CRAWLER = "crawler"
def __str__(self):
@ -27,6 +28,8 @@ class AgentClassification(str, Enum):
user_agent = request.headers.get("User-Agent")
if user_agent == "":
return cls.NO_UA
if user_agent is None:
return cls.NO_UA
if 'prometheus' in user_agent:
return cls.INTERNAL
if 'feediverse' in user_agent:
@ -39,6 +42,10 @@ class AgentClassification(str, Enum):
return cls.BROWSER
if 'Safari/' in user_agent:
return cls.BROWSER
if 'Opera/' in user_agent:
return cls.BROWSER
if 'ddg_android/' in user_agent:
return cls.BROWSER
if 'Synapse' in user_agent:
return cls.MATRIX
if 'Element' in user_agent:
@ -79,6 +86,10 @@ class AgentClassification(str, Enum):
return cls.FEED
if 'Feedbin' in user_agent:
return cls.FEED
if 'NetNewsWire' in user_agent:
return cls.FEED
if 'FreshRSS' in user_agent:
return cls.FEED
if 'SimplePie' in user_agent:
return cls.FEED
if 'Elfeed' in user_agent:
@ -95,10 +106,28 @@ class AgentClassification(str, Enum):
return cls.BOT
if 'Poduptime' in user_agent:
return cls.BOT
if 'aiohttp' in user_agent:
return cls.AUTOMATION
if 'python-requests' in user_agent:
return cls.AUTOMATION
if 'Go-http-client' in user_agent:
return cls.AUTOMATION
if 'curl/' in user_agent:
return cls.AUTOMATION
if 'wget/' in user_agent:
return cls.AUTOMATION
if 'keybase-proofs/' in user_agent:
return cls.AUTOMATION
if 'InternetMeasurement' in user_agent:
return cls.CRAWLER
if 'CensysInspect' in user_agent:
return cls.CRAWLER
if 'scaninfo@paloaltonetworks.com' in user_agent:
return cls.CRAWLER
if 'SEOlyt/' in user_agent:
return cls.CRAWLER
if 'Sogou web spider/' in user_agent:
return cls.CRAWLER
logger.warn(f"Unknown User-Agent: {user_agent}")

View File

@ -0,0 +1,29 @@
{# [[file:../../arcology.org::*Rendering the converted Org HTML in to a whole web-page][Rendering the converted Org HTML in to a whole web-page:5]] #}
{# 404/unpublished page: extends the site shell, explains why the page is
   unavailable, and echoes the missing route key passed as `missing_key`. #}
{% extends "arcology/app.html" %}
{% block title %}Page Not Found{% endblock %}
{% block h1 %}<h1>Page Not Found</h1>{% endblock %}
{% block content %}
<section>
<p>
The page you tried to open either has not been written by the
author or the author has chosen to not publish it at this
time. Please contact the author and include the URL of both the
page you clicked the link on, as well as the link you&apos;d like
to read. You may just want
to <a href="javascript:history.back()">Go Back</a>, too.
</p>
<p>
If you&apos;re interested in a particular reference, you might of
course have more luck using a public search engine
like <a href="https://duckduckgo.com">DuckDuckGo</a>
or <a href="https://kagi.com">Kagi</a>.
</p>
<pre>MISSING KEY = {{ missing_key }}</pre>
</section>
{% endblock %}
{# Rendering the converted Org HTML in to a whole web-page:5 ends here #}

View File

@ -7,7 +7,8 @@ from arcology import views
urlpatterns = [
path("admin/", admin.site.urls),
path("", views.index),
path("robots.txt", views.robots),
path("robots.txt", views.robots, name="robots_txt"),
path("404", views.unpublished, name="page_not_found"),
path("sitemap", views.sitemap, name="sitemap"),
path("sites.css", views.site_css, name="site-css"),
path("feeds.json", views.feed_list, name="feed-list"),

View File

@ -1,6 +1,6 @@
# [[file:../arcology.org::*The Web Server][The Web Server:2]]
import logging
from django.http import HttpResponse, HttpResponseNotFound, Http404
from django.http import HttpResponse, HttpResponseNotFound
from django.shortcuts import render, get_object_or_404
from arcology.models import Page, Feed, Site
@ -38,6 +38,7 @@ page_counter = Counter("arcology_page", "Hit counter for each page", ["site", "p
render_latency = Histogram("arcology_page_render_seconds", "Latency for render_page func.", ["page", "site", "agent_type"])
from arcology.agent_utils import AgentClassification
from django.template import loader
def render_page(request, site, full_key):
agent = AgentClassification.from_request(request)
@ -47,7 +48,13 @@ def render_page(request, site, full_key):
the_page = Page.objects.get(route_key=full_key)
except Page.DoesNotExist:
page_counter.labels(page=full_key, status=404, site=site.key, agent_type=agent).inc()
raise Http404
template = loader.get_template("404.html")
context = dict(
missing_key=full_key
)
return HttpResponseNotFound(
template.render(context, request)
)
links = the_page.collect_links()
page_html = the_page.to_html(links)
@ -129,6 +136,22 @@ def sitemap(request):
return HttpResponse(b"sitemap")
# sitemap:1 ends here
# [[file:../arcology.org::*unpublished/not found endpoint][unpublished/not found endpoint:1]]
def unpublished(request):
    """Render the 404/unpublished page.

    The optional ``key`` query parameter carries the route key of the missing
    page so it can be echoed back in the template for debugging/reporting.
    Always returns an HTTP 404 response.
    """
    # QueryDict.get takes a default directly; no need for a manual None check.
    key = request.GET.get("key", "NOT_SUPPLIED")
    # query links etc to create a JSON doc for SigmaJS
    template = loader.get_template("404.html")
    context = dict(
        missing_key=key
    )
    return HttpResponseNotFound(
        template.render(context, request)
    )
# unpublished/not found endpoint:1 ends here
# [[file:../arcology.org::*=robots.txt= Endpoint][=robots.txt= Endpoint:1]]
def robots(request):
site = Site.from_request(request)

View File

@ -285,50 +285,64 @@ application = get_wsgi_application()
** User-Agent break-down
:PROPERTIES:
:ID: 20240213T120603.921365
:ROAM_ALIASES: arcology.agent_utils.AgentClassification
:END:
This =AgentClassification= enumeration class can take a User Agent header and map it to one of a handful of groups, which a user has the ability to extend. =AgentClassification.from_request(request)= will return a string from an enumeration, this is probably useful in labeling metrics or site statistics.
#+NAME: agent_classifications
| User Agent Substring | Enumeration |
|----------------------+-------------|
| prometheus | INTERNAL |
| feediverse | INTERNAL |
| Chrome/ | BROWSER |
| Firefox/ | BROWSER |
| DuckDuckGo/ | BROWSER |
| Safari/ | BROWSER |
| Synapse | MATRIX |
| Element | MATRIX |
| SubwayTooter | APP |
| Dalvik | APP |
| Nextcloud-android | APP |
| Pleroma | FEDIVERSE |
| Mastodon/ | FEDIVERSE |
| Akkoma | FEDIVERSE |
| Friendica | FEDIVERSE |
| FoundKey | FEDIVERSE |
| MissKey | FEDIVERSE |
| CalcKey | FEDIVERSE |
| gotosocial | FEDIVERSE |
| Epicyon | FEDIVERSE |
| feedparser | FEED |
| granary | FEED |
| Tiny Tiny RSS | FEED |
| Go_NEB | FEED |
| Gwene | FEED |
| Feedbin | FEED |
| SimplePie | FEED |
| Elfeed | FEED |
| inoreader | FEED |
| Reeder | FEED |
| Miniflux | FEED |
| Bot | BOT |
| bot | BOT |
| Poduptime | BOT |
| curl/ | AUTOMATION |
| wget/ | AUTOMATION |
| User Agent Substring | Enumeration |
|-------------------------------+-------------|
| prometheus | INTERNAL |
| feediverse | INTERNAL |
| Chrome/ | BROWSER |
| Firefox/ | BROWSER |
| DuckDuckGo/ | BROWSER |
| Safari/ | BROWSER |
| Opera/ | BROWSER |
| ddg_android/ | BROWSER |
| Synapse | MATRIX |
| Element | MATRIX |
| SubwayTooter | APP |
| Dalvik | APP |
| Nextcloud-android | APP |
| Pleroma | FEDIVERSE |
| Mastodon/ | FEDIVERSE |
| Akkoma | FEDIVERSE |
| Friendica | FEDIVERSE |
| FoundKey | FEDIVERSE |
| MissKey | FEDIVERSE |
| CalcKey | FEDIVERSE |
| gotosocial | FEDIVERSE |
| Epicyon | FEDIVERSE |
| feedparser | FEED |
| granary | FEED |
| Tiny Tiny RSS | FEED |
| Go_NEB | FEED |
| Gwene | FEED |
| Feedbin | FEED |
| NetNewsWire | FEED |
| FreshRSS | FEED |
| SimplePie | FEED |
| Elfeed | FEED |
| inoreader | FEED |
| Reeder | FEED |
| Miniflux | FEED |
| Bot | BOT |
| bot | BOT |
| Poduptime | BOT |
| aiohttp | AUTOMATION |
| python-requests | AUTOMATION |
| Go-http-client | AUTOMATION |
| curl/ | AUTOMATION |
| wget/ | AUTOMATION |
| keybase-proofs/ | AUTOMATION |
| InternetMeasurement | CRAWLER |
| CensysInspect | CRAWLER |
| scaninfo@paloaltonetworks.com | CRAWLER |
| SEOlyt/ | CRAWLER |
| Sogou web spider/ | CRAWLER |
#+begin_src python :tangle arcology/agent_utils.py :noweb yes
from __future__ import annotations
@ -351,6 +365,8 @@ class AgentClassification(str, Enum):
user_agent = request.headers.get("User-Agent")
if user_agent == "":
return cls.NO_UA
if user_agent is None:
return cls.NO_UA
<<agent_classifier()>>
logger.warn(f"Unknown User-Agent: {user_agent}")