Compare commits

...

2 Commits

Author SHA1 Message Date
Ryan Rix 9afba3d788 provide a 404 page and wire it up to the Native 404/unpublished pages 2024-02-17 13:14:10 -08:00
Ryan Rix 0ba1c6c683 add some more user agents to classify 2024-02-17 13:12:38 -08:00
6 changed files with 202 additions and 45 deletions

View File

@ -541,7 +541,8 @@ from arcology import views
urlpatterns = [
path("admin/", admin.site.urls),
path("", views.index),
path("robots.txt", views.robots),
path("robots.txt", views.robots, name="robots_txt"),
path("404", views.unpublished, name="page_not_found"),
path("sitemap", views.sitemap, name="sitemap"),
path("sites.css", views.site_css, name="site-css"),
path("feeds.json", views.feed_list, name="feed-list"),
@ -554,7 +555,7 @@ urlpatterns = [
#+begin_src python :tangle arcology/views.py
import logging
from django.http import HttpResponse, HttpResponseNotFound, Http404
from django.http import HttpResponse, HttpResponseNotFound
from django.shortcuts import render, get_object_or_404
from arcology.models import Page, Feed, Site
@ -610,6 +611,7 @@ page_counter = Counter("arcology_page", "Hit counter for each page", ["site", "p
render_latency = Histogram("arcology_page_render_seconds", "Latency for render_page func.", ["page", "site", "agent_type"])
from arcology.agent_utils import AgentClassification
from django.template import loader
def render_page(request, site, full_key):
agent = AgentClassification.from_request(request)
@ -619,7 +621,13 @@ def render_page(request, site, full_key):
the_page = Page.objects.get(route_key=full_key)
except Page.DoesNotExist:
page_counter.labels(page=full_key, status=404, site=site.key, agent_type=agent).inc()
raise Http404
template = loader.get_template("404.html")
context = dict(
missing_key=full_key
)
return HttpResponseNotFound(
template.render(context, request)
)
links = the_page.collect_links()
page_html = the_page.to_html(links)
@ -714,6 +722,38 @@ The main =content= block contains the =<main>= generated by the native parser, a
{% endblock %}
#+end_src
Here's a really simple 404 template, too.
#+begin_src jinja2 :tangle arcology/templates/404.html
{# 404/unpublished page: extends the site shell, explains why the page is
   unavailable, and echoes the missing route key passed as `missing_key`. #}
{% extends "arcology/app.html" %}
{% block title %}Page Not Found{% endblock %}
{% block h1 %}<h1>Page Not Found</h1>{% endblock %}
{% block content %}
<section>
<p>
The page you tried to open either has not been written by the
author or the author has chosen to not publish it at this
time. Please contact the author and include the URL of both the
page you clicked the link on, as well as the link you&apos;d like
to read. You may just want
to <a href="javascript:history.back()">Go Back</a>, too.
</p>
<p>
If you&apos;re interested in a particular reference, you might of
course have more luck using a public search engine
like <a href="https://duckduckgo.com">DuckDuckGo</a>
or <a href="https://kagi.com">Kagi</a>.
</p>
<pre>MISSING KEY = {{ missing_key }}</pre>
</section>
{% endblock %}
#+end_src
*** Org Page-specific CSS Stylings
Most of the page CSS is defined below, but the content CSS is here, nearer the actual implementation of the flexbox:
@ -781,7 +821,6 @@ Here are some [[https://medium.com/@massimo.cassandro/flexbox-separators-b284d6d
And some simple image wrangling:
** INPROGRESS Atom Feed Handler
:PROPERTIES:
:ID: 20240204T234814.612917
@ -890,6 +929,26 @@ def sitemap(request):
return HttpResponse(b"sitemap")
#+end_src
** NEXT unpublished/not found endpoint
There are plenty of links inside the Arcology which aren't meant to be clicked. =roam:= stub links, of course, will land here when their targets are unpublished.
#+begin_src python :tangle arcology/views.py
def unpublished(request):
    """Render the 404/unpublished page.

    The optional ``key`` query parameter carries the route key of the missing
    page so it can be echoed back in the template for debugging/reporting.
    Always returns an HTTP 404 response.
    """
    # QueryDict.get takes a default directly; no need for a manual None check.
    key = request.GET.get("key", "NOT_SUPPLIED")
    # query links etc to create a JSON doc for SigmaJS
    template = loader.get_template("404.html")
    context = dict(
        missing_key=key
    )
    return HttpResponseNotFound(
        template.render(context, request)
    )
#+end_src
** =robots.txt= Endpoint
- Disallow all GPT-alikes on all pages, I will add more to this list as necessary. Probably will pull these into [[id:arcology/django/config][Arcology Project Configuration]] sooner or later.

View File

@ -17,6 +17,7 @@ class AgentClassification(str, Enum):
FEED = "feed"
BOT = "bot"
AUTOMATION = "automation"
CRAWLER = "crawler"
def __str__(self):
@ -27,6 +28,8 @@ class AgentClassification(str, Enum):
user_agent = request.headers.get("User-Agent")
if user_agent == "":
return cls.NO_UA
if user_agent is None:
return cls.NO_UA
if 'prometheus' in user_agent:
return cls.INTERNAL
if 'feediverse' in user_agent:
@ -39,6 +42,10 @@ class AgentClassification(str, Enum):
return cls.BROWSER
if 'Safari/' in user_agent:
return cls.BROWSER
if 'Opera/' in user_agent:
return cls.BROWSER
if 'ddg_android/' in user_agent:
return cls.BROWSER
if 'Synapse' in user_agent:
return cls.MATRIX
if 'Element' in user_agent:
@ -79,6 +86,10 @@ class AgentClassification(str, Enum):
return cls.FEED
if 'Feedbin' in user_agent:
return cls.FEED
if 'NetNewsWire' in user_agent:
return cls.FEED
if 'FreshRSS' in user_agent:
return cls.FEED
if 'SimplePie' in user_agent:
return cls.FEED
if 'Elfeed' in user_agent:
@ -95,10 +106,28 @@ class AgentClassification(str, Enum):
return cls.BOT
if 'Poduptime' in user_agent:
return cls.BOT
if 'aiohttp' in user_agent:
return cls.AUTOMATION
if 'python-requests' in user_agent:
return cls.AUTOMATION
if 'Go-http-client' in user_agent:
return cls.AUTOMATION
if 'curl/' in user_agent:
return cls.AUTOMATION
if 'wget/' in user_agent:
return cls.AUTOMATION
if 'keybase-proofs/' in user_agent:
return cls.AUTOMATION
if 'InternetMeasurement' in user_agent:
return cls.CRAWLER
if 'CensysInspect' in user_agent:
return cls.CRAWLER
if 'scaninfo@paloaltonetworks.com' in user_agent:
return cls.CRAWLER
if 'SEOlyt/' in user_agent:
return cls.CRAWLER
if 'Sogou web spider/' in user_agent:
return cls.CRAWLER
logger.warn(f"Unknown User-Agent: {user_agent}")

View File

@ -0,0 +1,29 @@
{# [[file:../../arcology.org::*Rendering the converted Org HTML in to a whole web-page][Rendering the converted Org HTML in to a whole web-page:5]] #}
{# 404/unpublished page: extends the site shell, explains why the page is
   unavailable, and echoes the missing route key passed as `missing_key`. #}
{% extends "arcology/app.html" %}
{% block title %}Page Not Found{% endblock %}
{% block h1 %}<h1>Page Not Found</h1>{% endblock %}
{% block content %}
<section>
<p>
The page you tried to open either has not been written by the
author or the author has chosen to not publish it at this
time. Please contact the author and include the URL of both the
page you clicked the link on, as well as the link you&apos;d like
to read. You may just want
to <a href="javascript:history.back()">Go Back</a>, too.
</p>
<p>
If you&apos;re interested in a particular reference, you might of
course have more luck using a public search engine
like <a href="https://duckduckgo.com">DuckDuckGo</a>
or <a href="https://kagi.com">Kagi</a>.
</p>
<pre>MISSING KEY = {{ missing_key }}</pre>
</section>
{% endblock %}
{# Rendering the converted Org HTML in to a whole web-page:5 ends here #}

View File

@ -7,7 +7,8 @@ from arcology import views
urlpatterns = [
path("admin/", admin.site.urls),
path("", views.index),
path("robots.txt", views.robots),
path("robots.txt", views.robots, name="robots_txt"),
path("404", views.unpublished, name="page_not_found"),
path("sitemap", views.sitemap, name="sitemap"),
path("sites.css", views.site_css, name="site-css"),
path("feeds.json", views.feed_list, name="feed-list"),

View File

@ -1,6 +1,6 @@
# [[file:../arcology.org::*The Web Server][The Web Server:2]]
import logging
from django.http import HttpResponse, HttpResponseNotFound, Http404
from django.http import HttpResponse, HttpResponseNotFound
from django.shortcuts import render, get_object_or_404
from arcology.models import Page, Feed, Site
@ -38,6 +38,7 @@ page_counter = Counter("arcology_page", "Hit counter for each page", ["site", "p
render_latency = Histogram("arcology_page_render_seconds", "Latency for render_page func.", ["page", "site", "agent_type"])
from arcology.agent_utils import AgentClassification
from django.template import loader
def render_page(request, site, full_key):
agent = AgentClassification.from_request(request)
@ -47,7 +48,13 @@ def render_page(request, site, full_key):
the_page = Page.objects.get(route_key=full_key)
except Page.DoesNotExist:
page_counter.labels(page=full_key, status=404, site=site.key, agent_type=agent).inc()
raise Http404
template = loader.get_template("404.html")
context = dict(
missing_key=full_key
)
return HttpResponseNotFound(
template.render(context, request)
)
links = the_page.collect_links()
page_html = the_page.to_html(links)
@ -129,6 +136,22 @@ def sitemap(request):
return HttpResponse(b"sitemap")
# sitemap:1 ends here
# [[file:../arcology.org::*unpublished/not found endpoint][unpublished/not found endpoint:1]]
def unpublished(request):
    """Render the 404/unpublished page.

    The optional ``key`` query parameter carries the route key of the missing
    page so it can be echoed back in the template for debugging/reporting.
    Always returns an HTTP 404 response.
    """
    # QueryDict.get takes a default directly; no need for a manual None check.
    key = request.GET.get("key", "NOT_SUPPLIED")
    # query links etc to create a JSON doc for SigmaJS
    template = loader.get_template("404.html")
    context = dict(
        missing_key=key
    )
    return HttpResponseNotFound(
        template.render(context, request)
    )
# unpublished/not found endpoint:1 ends here
# [[file:../arcology.org::*=robots.txt= Endpoint][=robots.txt= Endpoint:1]]
def robots(request):
site = Site.from_request(request)

View File

@ -285,50 +285,64 @@ application = get_wsgi_application()
** User-Agent break-down
:PROPERTIES:
:ID: 20240213T120603.921365
:ROAM_ALIASES: arcology.agent_utils.AgentClassification
:END:
This =AgentClassification= enumeration class can take a User Agent header and map it to one of a handful of groups, which a user has the ability to extend. =AgentClassification.from_request(request)= will return a string from an enumeration, this is probably useful in labeling metrics or site statistics.
#+NAME: agent_classifications
| User Agent Substring | Enumeration |
|----------------------+-------------|
| prometheus | INTERNAL |
| feediverse | INTERNAL |
| Chrome/ | BROWSER |
| Firefox/ | BROWSER |
| DuckDuckGo/ | BROWSER |
| Safari/ | BROWSER |
| Synapse | MATRIX |
| Element | MATRIX |
| SubwayTooter | APP |
| Dalvik | APP |
| Nextcloud-android | APP |
| Pleroma | FEDIVERSE |
| Mastodon/ | FEDIVERSE |
| Akkoma | FEDIVERSE |
| Friendica | FEDIVERSE |
| FoundKey | FEDIVERSE |
| MissKey | FEDIVERSE |
| CalcKey | FEDIVERSE |
| gotosocial | FEDIVERSE |
| Epicyon | FEDIVERSE |
| feedparser | FEED |
| granary | FEED |
| Tiny Tiny RSS | FEED |
| Go_NEB | FEED |
| Gwene | FEED |
| Feedbin | FEED |
| SimplePie | FEED |
| Elfeed | FEED |
| inoreader | FEED |
| Reeder | FEED |
| Miniflux | FEED |
| Bot | BOT |
| bot | BOT |
| Poduptime | BOT |
| curl/ | AUTOMATION |
| wget/ | AUTOMATION |
| User Agent Substring | Enumeration |
|-------------------------------+-------------|
| prometheus | INTERNAL |
| feediverse | INTERNAL |
| Chrome/ | BROWSER |
| Firefox/ | BROWSER |
| DuckDuckGo/ | BROWSER |
| Safari/ | BROWSER |
| Opera/ | BROWSER |
| ddg_android/ | BROWSER |
| Synapse | MATRIX |
| Element | MATRIX |
| SubwayTooter | APP |
| Dalvik | APP |
| Nextcloud-android | APP |
| Pleroma | FEDIVERSE |
| Mastodon/ | FEDIVERSE |
| Akkoma | FEDIVERSE |
| Friendica | FEDIVERSE |
| FoundKey | FEDIVERSE |
| MissKey | FEDIVERSE |
| CalcKey | FEDIVERSE |
| gotosocial | FEDIVERSE |
| Epicyon | FEDIVERSE |
| feedparser | FEED |
| granary | FEED |
| Tiny Tiny RSS | FEED |
| Go_NEB | FEED |
| Gwene | FEED |
| Feedbin | FEED |
| NetNewsWire | FEED |
| FreshRSS | FEED |
| SimplePie | FEED |
| Elfeed | FEED |
| inoreader | FEED |
| Reeder | FEED |
| Miniflux | FEED |
| Bot | BOT |
| bot | BOT |
| Poduptime | BOT |
| aiohttp | AUTOMATION |
| python-requests | AUTOMATION |
| Go-http-client | AUTOMATION |
| curl/ | AUTOMATION |
| wget/ | AUTOMATION |
| keybase-proofs/ | AUTOMATION |
| InternetMeasurement | CRAWLER |
| CensysInspect | CRAWLER |
| scaninfo@paloaltonetworks.com | CRAWLER |
| SEOlyt/ | CRAWLER |
| Sogou web spider/ | CRAWLER |
#+begin_src python :tangle arcology/agent_utils.py :noweb yes
from __future__ import annotations
@ -351,6 +365,8 @@ class AgentClassification(str, Enum):
user_agent = request.headers.get("User-Agent")
if user_agent == "":
return cls.NO_UA
if user_agent is None:
return cls.NO_UA
<<agent_classifier()>>
logger.warn(f"Unknown User-Agent: {user_agent}")