Compare commits
2 Commits
3bcc81c36f
...
9afba3d788
Author | SHA1 | Date |
---|---|---|
Ryan Rix | 9afba3d788 | |
Ryan Rix | 0ba1c6c683 |
67
arcology.org
67
arcology.org
|
@ -541,7 +541,8 @@ from arcology import views
|
|||
urlpatterns = [
|
||||
path("admin/", admin.site.urls),
|
||||
path("", views.index),
|
||||
path("robots.txt", views.robots),
|
||||
path("robots.txt", views.robots, name="robots_txt"),
|
||||
path("404", views.unpublished, name="page_not_found"),
|
||||
path("sitemap", views.sitemap, name="sitemap"),
|
||||
path("sites.css", views.site_css, name="site-css"),
|
||||
path("feeds.json", views.feed_list, name="feed-list"),
|
||||
|
@ -554,7 +555,7 @@ urlpatterns = [
|
|||
|
||||
#+begin_src python :tangle arcology/views.py
|
||||
import logging
|
||||
from django.http import HttpResponse, HttpResponseNotFound, Http404
|
||||
from django.http import HttpResponse, HttpResponseNotFound
|
||||
from django.shortcuts import render, get_object_or_404
|
||||
|
||||
from arcology.models import Page, Feed, Site
|
||||
|
@ -610,6 +611,7 @@ page_counter = Counter("arcology_page", "Hit counter for each page", ["site", "p
|
|||
render_latency = Histogram("arcology_page_render_seconds", "Latency for render_page func.", ["page", "site", "agent_type"])
|
||||
|
||||
from arcology.agent_utils import AgentClassification
|
||||
from django.template import loader
|
||||
|
||||
def render_page(request, site, full_key):
|
||||
agent = AgentClassification.from_request(request)
|
||||
|
@ -619,7 +621,13 @@ def render_page(request, site, full_key):
|
|||
the_page = Page.objects.get(route_key=full_key)
|
||||
except Page.DoesNotExist:
|
||||
page_counter.labels(page=full_key, status=404, site=site.key, agent_type=agent).inc()
|
||||
raise Http404
|
||||
template = loader.get_template("404.html")
|
||||
context = dict(
|
||||
missing_key=full_key
|
||||
)
|
||||
return HttpResponseNotFound(
|
||||
template.render(context, request)
|
||||
)
|
||||
links = the_page.collect_links()
|
||||
page_html = the_page.to_html(links)
|
||||
|
||||
|
@ -714,6 +722,38 @@ The main =content= block contains the =<main>= generated by the native parser, a
|
|||
{% endblock %}
|
||||
#+end_src
|
||||
|
||||
Here's a really simple 404 template, too.
|
||||
|
||||
#+begin_src jinja2 :tangle arcology/templates/404.html
|
||||
{% extends "arcology/app.html" %}
|
||||
|
||||
{% block title %}Page Not Found{% endblock %}
|
||||
{% block h1 %}<h1>Page Not Found</h1>{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<section>
|
||||
<p>
|
||||
The page you tried to open either has not been written by the
|
||||
author or the author has chosen to not publish it at this
|
||||
time. Please contact the author and include the URL of both the
|
||||
page you clicked the link on, as well as the link you'd like
|
||||
to read. You may just want
|
||||
to <a href="javascript:history.back()">Go Back</a>, too.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
If you're interested in a particular reference, you might of
|
||||
course have more luck using a public search engine
|
||||
like <a href="https://duckduckgo.com">DuckDuckGo</a>
|
||||
or <a href="https://kagi.com">Kagi</a>.
|
||||
</p>
|
||||
|
||||
<pre>MISSING KEY = {{ missing_key }}</pre>
|
||||
|
||||
</section>
|
||||
{% endblock %}
|
||||
#+end_src
|
||||
|
||||
*** Org Page-specific CSS Stylings
|
||||
|
||||
Most of the page CSS is defined below, but the content CSS is here, nearer the actual implementation of the flexbox:
|
||||
|
@ -781,7 +821,6 @@ Here are some [[https://medium.com/@massimo.cassandro/flexbox-separators-b284d6d
|
|||
|
||||
And some simple image wrangling:
|
||||
|
||||
|
||||
** INPROGRESS Atom Feed Handler
|
||||
:PROPERTIES:
|
||||
:ID: 20240204T234814.612917
|
||||
|
@ -890,6 +929,26 @@ def sitemap(request):
|
|||
return HttpResponse(b"sitemap")
|
||||
#+end_src
|
||||
|
||||
** NEXT unpublished/not found endpoint
|
||||
|
||||
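There are plenty of links inside the Arcology which aren't meant to be clicked. =roam:= stub links will of course lead to pages that have not been written or published yet, so this endpoint gives them somewhere to land.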
|
||||
|
||||
#+begin_src python :tangle arcology/views.py
|
||||
def unpublished(request):
    """Render the "Page Not Found" template and return it with a 404 status.

    The key of the missing page is taken from the ``key`` query parameter;
    when that parameter is absent, the placeholder ``NOT_SUPPLIED`` is
    passed to the template instead.
    """
    requested = request.GET.get("key")
    missing = "NOT_SUPPLIED" if requested is None else requested

    # TODO: query links etc to create a JSON doc for SigmaJS
    body = loader.get_template("404.html").render({"missing_key": missing}, request)
    return HttpResponseNotFound(body)
|
||||
#+end_src
|
||||
|
||||
** =robots.txt= Endpoint
|
||||
|
||||
- Disallow all GPT-alikes on all pages; I will add more to this list as necessary. I will probably pull these in to [[id:arcology/django/config][Arcology Project Configuration]] sooner or later.
|
||||
|
|
|
@ -17,6 +17,7 @@ class AgentClassification(str, Enum):
|
|||
FEED = "feed"
|
||||
BOT = "bot"
|
||||
AUTOMATION = "automation"
|
||||
CRAWLER = "crawler"
|
||||
|
||||
|
||||
def __str__(self):
|
||||
|
@ -27,6 +28,8 @@ class AgentClassification(str, Enum):
|
|||
user_agent = request.headers.get("User-Agent")
|
||||
if user_agent == "":
|
||||
return cls.NO_UA
|
||||
if user_agent is None:
|
||||
return cls.NO_UA
|
||||
if 'prometheus' in user_agent:
|
||||
return cls.INTERNAL
|
||||
if 'feediverse' in user_agent:
|
||||
|
@ -39,6 +42,10 @@ class AgentClassification(str, Enum):
|
|||
return cls.BROWSER
|
||||
if 'Safari/' in user_agent:
|
||||
return cls.BROWSER
|
||||
if 'Opera/' in user_agent:
|
||||
return cls.BROWSER
|
||||
if 'ddg_android/' in user_agent:
|
||||
return cls.BROWSER
|
||||
if 'Synapse' in user_agent:
|
||||
return cls.MATRIX
|
||||
if 'Element' in user_agent:
|
||||
|
@ -79,6 +86,10 @@ class AgentClassification(str, Enum):
|
|||
return cls.FEED
|
||||
if 'Feedbin' in user_agent:
|
||||
return cls.FEED
|
||||
if 'NetNewsWire' in user_agent:
|
||||
return cls.FEED
|
||||
if 'FreshRSS' in user_agent:
|
||||
return cls.FEED
|
||||
if 'SimplePie' in user_agent:
|
||||
return cls.FEED
|
||||
if 'Elfeed' in user_agent:
|
||||
|
@ -95,10 +106,28 @@ class AgentClassification(str, Enum):
|
|||
return cls.BOT
|
||||
if 'Poduptime' in user_agent:
|
||||
return cls.BOT
|
||||
if 'aiohttp' in user_agent:
|
||||
return cls.AUTOMATION
|
||||
if 'python-requests' in user_agent:
|
||||
return cls.AUTOMATION
|
||||
if 'Go-http-client' in user_agent:
|
||||
return cls.AUTOMATION
|
||||
if 'curl/' in user_agent:
|
||||
return cls.AUTOMATION
|
||||
if 'wget/' in user_agent:
|
||||
return cls.AUTOMATION
|
||||
if 'keybase-proofs/' in user_agent:
|
||||
return cls.AUTOMATION
|
||||
if 'InternetMeasurement' in user_agent:
|
||||
return cls.CRAWLER
|
||||
if 'CensysInspect' in user_agent:
|
||||
return cls.CRAWLER
|
||||
if 'scaninfo@paloaltonetworks.com' in user_agent:
|
||||
return cls.CRAWLER
|
||||
if 'SEOlyt/' in user_agent:
|
||||
return cls.CRAWLER
|
||||
if 'Sogou web spider/' in user_agent:
|
||||
return cls.CRAWLER
|
||||
|
||||
|
||||
logger.warn(f"Unknown User-Agent: {user_agent}")
|
||||
|
|
|
@ -0,0 +1,29 @@
|
|||
{# [[file:../../arcology.org::*Rendering the converted Org HTML in to a whole web-page][Rendering the converted Org HTML in to a whole web-page:5]] #}
|
||||
{% extends "arcology/app.html" %}
|
||||
|
||||
{% block title %}Page Not Found{% endblock %}
|
||||
{% block h1 %}<h1>Page Not Found</h1>{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
<section>
|
||||
<p>
|
||||
The page you tried to open either has not been written by the
|
||||
author or the author has chosen to not publish it at this
|
||||
time. Please contact the author and include the URL of both the
|
||||
page you clicked the link on, as well as the link you'd like
|
||||
to read. You may just want
|
||||
to <a href="javascript:history.back()">Go Back</a>, too.
|
||||
</p>
|
||||
|
||||
<p>
|
||||
If you're interested in a particular reference, you might of
|
||||
course have more luck using a public search engine
|
||||
like <a href="https://duckduckgo.com">DuckDuckGo</a>
|
||||
or <a href="https://kagi.com">Kagi</a>.
|
||||
</p>
|
||||
|
||||
<pre>MISSING KEY = {{ missing_key }}</pre>
|
||||
|
||||
</section>
|
||||
{% endblock %}
|
||||
{# Rendering the converted Org HTML in to a whole web-page:5 ends here #}
|
|
@ -7,7 +7,8 @@ from arcology import views
|
|||
urlpatterns = [
|
||||
path("admin/", admin.site.urls),
|
||||
path("", views.index),
|
||||
path("robots.txt", views.robots),
|
||||
path("robots.txt", views.robots, name="robots_txt"),
|
||||
path("404", views.unpublished, name="page_not_found"),
|
||||
path("sitemap", views.sitemap, name="sitemap"),
|
||||
path("sites.css", views.site_css, name="site-css"),
|
||||
path("feeds.json", views.feed_list, name="feed-list"),
|
||||
|
|
|
@ -1,6 +1,6 @@
|
|||
# [[file:../arcology.org::*The Web Server][The Web Server:2]]
|
||||
import logging
|
||||
from django.http import HttpResponse, HttpResponseNotFound, Http404
|
||||
from django.http import HttpResponse, HttpResponseNotFound
|
||||
from django.shortcuts import render, get_object_or_404
|
||||
|
||||
from arcology.models import Page, Feed, Site
|
||||
|
@ -38,6 +38,7 @@ page_counter = Counter("arcology_page", "Hit counter for each page", ["site", "p
|
|||
render_latency = Histogram("arcology_page_render_seconds", "Latency for render_page func.", ["page", "site", "agent_type"])
|
||||
|
||||
from arcology.agent_utils import AgentClassification
|
||||
from django.template import loader
|
||||
|
||||
def render_page(request, site, full_key):
|
||||
agent = AgentClassification.from_request(request)
|
||||
|
@ -47,7 +48,13 @@ def render_page(request, site, full_key):
|
|||
the_page = Page.objects.get(route_key=full_key)
|
||||
except Page.DoesNotExist:
|
||||
page_counter.labels(page=full_key, status=404, site=site.key, agent_type=agent).inc()
|
||||
raise Http404
|
||||
template = loader.get_template("404.html")
|
||||
context = dict(
|
||||
missing_key=full_key
|
||||
)
|
||||
return HttpResponseNotFound(
|
||||
template.render(context, request)
|
||||
)
|
||||
links = the_page.collect_links()
|
||||
page_html = the_page.to_html(links)
|
||||
|
||||
|
@ -129,6 +136,22 @@ def sitemap(request):
|
|||
return HttpResponse(b"sitemap")
|
||||
# sitemap:1 ends here
|
||||
|
||||
# [[file:../arcology.org::*unpublished/not found endpoint][unpublished/not found endpoint:1]]
|
||||
def unpublished(request):
    """Render the "Page Not Found" template and return it with a 404 status.

    The key of the missing page is taken from the ``key`` query parameter;
    when that parameter is absent, the placeholder ``NOT_SUPPLIED`` is
    passed to the template instead.
    """
    requested = request.GET.get("key")
    missing = "NOT_SUPPLIED" if requested is None else requested

    # TODO: query links etc to create a JSON doc for SigmaJS
    body = loader.get_template("404.html").render({"missing_key": missing}, request)
    return HttpResponseNotFound(body)
|
||||
# unpublished/not found endpoint:1 ends here
|
||||
|
||||
# [[file:../arcology.org::*=robots.txt= Endpoint][=robots.txt= Endpoint:1]]
|
||||
def robots(request):
|
||||
site = Site.from_request(request)
|
||||
|
|
|
@ -285,50 +285,64 @@ application = get_wsgi_application()
|
|||
** User-Agent break-down
|
||||
:PROPERTIES:
|
||||
:ID: 20240213T120603.921365
|
||||
:ROAM_ALIASES: arcology.agent_utils.AgentClassification
|
||||
:END:
|
||||
|
||||
This =AgentClassification= enumeration class can take a User Agent header and map it to one of a handful of groups, which a user has the ability to extend. =AgentClassification.from_request(request)= will return a string from an enumeration; this is probably useful for labeling metrics or site statistics.
|
||||
|
||||
|
||||
#+NAME: agent_classifications
|
||||
| User Agent Substring | Enumeration |
|
||||
|----------------------+-------------|
|
||||
| prometheus | INTERNAL |
|
||||
| feediverse | INTERNAL |
|
||||
| Chrome/ | BROWSER |
|
||||
| Firefox/ | BROWSER |
|
||||
| DuckDuckGo/ | BROWSER |
|
||||
| Safari/ | BROWSER |
|
||||
| Synapse | MATRIX |
|
||||
| Element | MATRIX |
|
||||
| SubwayTooter | APP |
|
||||
| Dalvik | APP |
|
||||
| Nextcloud-android | APP |
|
||||
| Pleroma | FEDIVERSE |
|
||||
| Mastodon/ | FEDIVERSE |
|
||||
| Akkoma | FEDIVERSE |
|
||||
| Friendica | FEDIVERSE |
|
||||
| FoundKey | FEDIVERSE |
|
||||
| MissKey | FEDIVERSE |
|
||||
| CalcKey | FEDIVERSE |
|
||||
| gotosocial | FEDIVERSE |
|
||||
| Epicyon | FEDIVERSE |
|
||||
| feedparser | FEED |
|
||||
| granary | FEED |
|
||||
| Tiny Tiny RSS | FEED |
|
||||
| Go_NEB | FEED |
|
||||
| Gwene | FEED |
|
||||
| Feedbin | FEED |
|
||||
| SimplePie | FEED |
|
||||
| Elfeed | FEED |
|
||||
| inoreader | FEED |
|
||||
| Reeder | FEED |
|
||||
| Miniflux | FEED |
|
||||
| Bot | BOT |
|
||||
| bot | BOT |
|
||||
| Poduptime | BOT |
|
||||
| curl/ | AUTOMATION |
|
||||
| wget/ | AUTOMATION |
|
||||
| User Agent Substring | Enumeration |
|
||||
|-------------------------------+-------------|
|
||||
| prometheus | INTERNAL |
|
||||
| feediverse | INTERNAL |
|
||||
| Chrome/ | BROWSER |
|
||||
| Firefox/ | BROWSER |
|
||||
| DuckDuckGo/ | BROWSER |
|
||||
| Safari/ | BROWSER |
|
||||
| Opera/ | BROWSER |
|
||||
| ddg_android/ | BROWSER |
|
||||
| Synapse | MATRIX |
|
||||
| Element | MATRIX |
|
||||
| SubwayTooter | APP |
|
||||
| Dalvik | APP |
|
||||
| Nextcloud-android | APP |
|
||||
| Pleroma | FEDIVERSE |
|
||||
| Mastodon/ | FEDIVERSE |
|
||||
| Akkoma | FEDIVERSE |
|
||||
| Friendica | FEDIVERSE |
|
||||
| FoundKey | FEDIVERSE |
|
||||
| MissKey | FEDIVERSE |
|
||||
| CalcKey | FEDIVERSE |
|
||||
| gotosocial | FEDIVERSE |
|
||||
| Epicyon | FEDIVERSE |
|
||||
| feedparser | FEED |
|
||||
| granary | FEED |
|
||||
| Tiny Tiny RSS | FEED |
|
||||
| Go_NEB | FEED |
|
||||
| Gwene | FEED |
|
||||
| Feedbin | FEED |
|
||||
| NetNewsWire | FEED |
|
||||
| FreshRSS | FEED |
|
||||
| SimplePie | FEED |
|
||||
| Elfeed | FEED |
|
||||
| inoreader | FEED |
|
||||
| Reeder | FEED |
|
||||
| Miniflux | FEED |
|
||||
| Bot | BOT |
|
||||
| bot | BOT |
|
||||
| Poduptime | BOT |
|
||||
| aiohttp | AUTOMATION |
|
||||
| python-requests | AUTOMATION |
|
||||
| Go-http-client | AUTOMATION |
|
||||
| curl/ | AUTOMATION |
|
||||
| wget/ | AUTOMATION |
|
||||
| keybase-proofs/ | AUTOMATION |
|
||||
| InternetMeasurement | CRAWLER |
|
||||
| CensysInspect | CRAWLER |
|
||||
| scaninfo@paloaltonetworks.com | CRAWLER |
|
||||
| SEOlyt/ | CRAWLER |
|
||||
| Sogou web spider/ | CRAWLER |
|
||||
|
||||
#+begin_src python :tangle arcology/agent_utils.py :noweb yes
|
||||
from __future__ import annotations
|
||||
|
@ -351,6 +365,8 @@ class AgentClassification(str, Enum):
|
|||
user_agent = request.headers.get("User-Agent")
|
||||
if user_agent == "":
|
||||
return cls.NO_UA
|
||||
if user_agent is None:
|
||||
return cls.NO_UA
|
||||
<<agent_classifier()>>
|
||||
|
||||
logger.warn(f"Unknown User-Agent: {user_agent}")
|
||||
|
|
Loading…
Reference in New Issue