Compare commits


10 Commits

Author SHA1 Message Date
Ryan Rix 361dd98c64 post the HTML from summaries directly 2023-01-24 21:17:44 -10:00
Ryan Rix 6662467e9a fetch feeds from Arcology feeds.json file 2023-01-24 21:17:29 -10:00
Ed Summers e9d58c95be Remove image downloading
The special casing of Wordpress and image downloading was not reliable
for me so I have removed it, and tried to simplify the code in the
process. If you still need this functionality you will want to pin v0.2.1.
2022-02-19 13:38:12 +00:00
Ed Summers f3daed0bfb link to feed-to-activitypub 2021-10-08 13:40:48 -04:00
Ed Summers 7a90313f1e a bit more verbose 2021-09-25 16:35:45 +00:00
Ed Summers be69e525b9 guard against content-type http header not being present 2021-09-18 20:44:07 +00:00
Ed Summers fd292f6222 fixed config file error 2021-01-07 21:20:36 +00:00
Ed Summers 29f416d7a4 catch http errors when fetching images 2020-11-24 21:45:10 +00:00
Ed Summers fb914c7510 doc fix 2020-10-09 16:35:11 +00:00
Ed Summers e73f405b54 new version 2020-10-09 16:28:41 +00:00
4 changed files with 97 additions and 206 deletions

LICENSE

@@ -1,6 +1,6 @@
The MIT License (MIT)
Copyright (c) 2018 Ed Summers
Copyright (c) Ed Summers
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

README.md

@@ -1,6 +1,6 @@
*feediverse* will read RSS/Atom feeds and send the messages as Mastodon posts.
Please use responsibly! *feediverse* is kind of the same thing as [feed2toot]
but it's just one module that works with Python 3 ... and I was bored.
It's meant to add a little bit of spice to your timeline from other places.
Please use it responsibly.
## Install
@@ -18,8 +18,7 @@ Once *feediverse* is configured you can add it to your crontab:
*/15 * * * * /usr/local/bin/feediverse
Run `feediverse --help` to show the comand line options.
Run `feediverse --help` to show the command line options.
## Post Format
@@ -41,7 +40,6 @@ separated list of hashtags. For some feeds (e.g. youtube-rss) you should use `{l
stripped). Please be aware that this might easily exceed Mastodon's
limit of 512 characters.
## Multiple Feeds
Since *feeds* is a list you can add additional feeds to watch if you want.
@@ -52,38 +50,4 @@ Since *feeds* is a list you can add additional feeds to watch if you want.
template: "dot com: {title} {url}"
- url: https://example.org/feed/
template: "dot org: {title} {url}"
generator: wordpress
## Special Handling for Different Feed Generators
*feediverse* has support for some special cases of some feed
generators, for example detecting an entry's permalink. Currently
only Wordpress is handled, but others may follow.
If a feed does not provide a proper *generator* entry, you can set it
by adding a `generator:` value to the feed's configuration. See the
second one in the example above.
You can check whether a feed provides a *generator* entry like this:
feediverse --verbose --dry-run feedverse-test.rc | grep generator
## Why?
I created *feediverse* because I wanted to send my Pinboard bookmarks to
Mastodon. I've got an IFTTT recipe that does this for Twitter, but IFTTT
doesn't appear to work with Mastodon yet. That being said *feediverse* should
work with any RSS or Atom feed (thanks to [feedparser]).
## Warning!
Please be responsible. Don't fill up Mastodon with tons of junk just because you
can. That kind of toxic behavior is why a lot of people are trying to establish
other forms of social media like Mastodon.
[feed2toot]: https://gitlab.com/chaica/feed2toot/
[feedparser]: http://feedparser.org/

feediverse.py

@@ -4,30 +4,16 @@ import os
import re
import sys
import yaml
import codecs
import argparse
import urllib3
import dateutil
import feedparser
import requests
from bs4 import BeautifulSoup
from mastodon import Mastodon
from datetime import datetime, timezone, MINYEAR
DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse")
MAX_IMAGES = 4 # Mastodon allows attaching 4 images max.
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',)
# encoding error-handler for buggy wordpress urls
def __urlencodereplace_errors(exc):
    bs = exc.object[exc.start:exc.end].encode("utf-8")
    bs = b"".join(b'%%%X' % b for b in bs)
    return (bs, exc.end)

codecs.register_error("urlencodereplace", __urlencodereplace_errors)
DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse")
def main():
@@ -37,12 +23,18 @@ def main():
                                  "don't toot, don't save config"))
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="be verbose")
    parser.add_argument("config_file", nargs="?", metavar="CONFIG-FILE",
                        help=("config file to use, default: %s" %
                              DEFAULT_CONFIG_FILE),
    parser.add_argument("-c", "--config",
                        help="config file to use",
                        default=os.path.expanduser(DEFAULT_CONFIG_FILE))
    parser.add_argument("-f", "--feeds",
                        help="URL to fetch feed list from",
                        default="https://thelionsrear.com/feeds.json")
    args = parser.parse_args()
    config_file = args.config_file
    config_file = args.config
    if args.verbose:
        print("using config file", config_file)
    if not os.path.isfile(config_file):
        setup(config_file)
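Taken together, the reworked argument parser supports an invocation along these lines (a sketch; the flags are the ones registered above, the config path is illustrative):

    feediverse --verbose --dry-run -c ~/.feediverse -f https://thelionsrear.com/feeds.json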
@@ -56,41 +48,95 @@ def main():
        access_token=config['access_token']
    )
    config['feeds'] = fetch_dynamic_feeds(config['name'], args.feeds)
    newest_post = config['updated']
    for feed in config['feeds']:
        for entry in get_feed(feed['url'], config['updated'],
                              config['include_images'],
                              generator=feed.get('generator')):
        if args.verbose:
            print(f"fetching {feed['url']} entries since {config['updated']}")
        for entry in get_feed(feed['url'], config['updated']):
            newest_post = max(newest_post, entry['updated'])
            if args.verbose:
                try:
                    print(entry)
                except UnicodeEncodeError:
                    # work-around for non-unicode terminals
                    print(dict(
                        (k, v.encode("utf-8") if hasattr(v, "encode") else v)
                        for k, v in entry.items()))
                print(entry)
            if args.dry_run:
                print("trial run, not tooting ", entry["title"][:50])
                continue
            media_ids = []
            for img in entry.get("images", []):
                media = masto.media_post(img, img.headers['content-type'])
                img.release_conn()  # deferred from collect_images()
                if not 'error' in media:
                    media_ids.append(media)
            entry.pop("images", None)
            masto.status_post(feed['template'].format(**entry)[:499],
                              media_ids=media_ids)
                              content_type='text/html',
                              visibility=feed['visibility'])
    config['updated'] = newest_post.isoformat()
    if args.dry_run:
        print("trial run, not saving the config")
    else:
        if args.verbose:
            print("saving the config")
        if not args.dry_run:
            config['updated'] = newest_post.isoformat()
        save_config(config, config_file)
def fetch_dynamic_feeds(site_name, feeds_url):
    # pull the list of feeds for this site from the Arcology feeds.json endpoint
    feeds = requests.get(feeds_url).json()
    return [
        dict(url=x['url'],
             visibility=x['visibility'],
             template='NEW by @rrix@notes.whatthefuck.computer: {url} {title}\n{summary}')
        for x in feeds if x['site'] == site_name
    ]
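For context, `fetch_dynamic_feeds` only relies on each feed record exposing `site`, `url`, and `visibility` keys, so the feeds.json payload is presumably a list shaped roughly like this (a hypothetical example; only the three key names come from the code above, all values are invented):

    # assumed feeds.json shape, written as a Python literal for illustration
    feeds = [
        {"site": "thelionsrear", "url": "https://thelionsrear.com/feed.xml",
         "visibility": "public"},
    ]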
def get_feed(feed_url, last_update):
    feed = feedparser.parse(feed_url)
    # only keep entries newer than the last run, and yield them oldest-first
    if last_update:
        entries = [e for e in feed.entries
                   if dateutil.parser.parse(e['updated']) > last_update]
    else:
        entries = feed.entries
    entries.sort(key=lambda e: e.updated_parsed)
    for entry in entries:
        yield get_entry(entry)
def get_entry(entry):
    hashtags = []
    for tag in entry.get('tags', []):
        t = tag['term'].replace(' ', '_').replace('.', '').replace('-', '')
        hashtags.append('#{}'.format(t))
    summary = entry.get('summary', '')
    content = entry.get('content', '') or ''
    # if content:
    #     content = cleanup(content[0].get('value', ''))
    url = entry.id
    return {
        'url': url,
        'link': entry.link,
        'title': cleanup(entry.title),
        'summary': summary,
        'content': content,
        'hashtags': ' '.join(hashtags),
        'updated': dateutil.parser.parse(entry['updated'])
    }
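Downstream in main(), each of these entry dicts is expanded into the per-feed template with str.format and truncated, so the posting step reduces to something like this sketch (entry values invented for illustration):

    # illustration only: mirrors feed['template'].format(**entry)[:499] in main()
    entry = {'url': 'https://example.org/post/1', 'title': 'Hello',
             'summary': 'First post', 'link': 'https://example.org/post/1',
             'content': '', 'hashtags': '', 'updated': None}
    template = 'NEW by @rrix@notes.whatthefuck.computer: {url} {title}\n{summary}'
    status = template.format(**entry)[:499]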
def cleanup(text):
    html = BeautifulSoup(text, 'html.parser')
    text = html.get_text()
    text = re.sub('\xa0+', ' ', text)
    text = re.sub(' +', ' ', text)
    text = re.sub(' +\n', '\n', text)
    text = re.sub('(\w)\n(\w)', '\\1 \\2', text)
    text = re.sub('\n\n\n+', '\n\n', text, flags=re.M)
    return text.strip()
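A quick illustration of the new cleanup (not part of the diff): tags are stripped and runs of non-breaking spaces collapse to single spaces before the text is reused:

    >>> cleanup("<p>Hello&nbsp;&nbsp;<b>world</b></p>")
    'Hello world'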
def find_urls(html):
    if not html:
        return
    urls = []
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all(["a", "img"]):
        if tag.name == "a":
            url = tag.get("href")
        elif tag.name == "img":
            url = tag.get("src")
        if url and url not in urls:
            urls.append(url)
    return urls
def yes_no(question):
    res = input(question + ' [y/n] ')
    return res.lower() in "y1"

def save_config(config, config_file):
    copy = dict(config)
    with open(config_file, 'w') as fh:
@@ -98,8 +144,7 @@ def save_config(config, config_file):
def read_config(config_file):
    config = {
        'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc),
        'include_images': False,
        'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc)
    }
    with open(config_file) as fh:
        cfg = yaml.load(fh, yaml.SafeLoader)
@@ -108,122 +153,7 @@ def read_config(config_file):
    config.update(cfg)
    return config
def detect_generator(feed):
    # For RSS the generator tag holds the URL, while for ATOM it holds the name
    generator = feed.feed.get("generator", "")
    if "/wordpress.org/" in generator:
        return "wordpress"
    elif "wordpress" == generator.lower():
        return "wordpress"
    return None
def get_feed(feed_url, last_update, include_images, generator=None):
    new_entries = 0
    feed = feedparser.parse(feed_url)
    if last_update:
        entries = [e for e in feed.entries
                   if dateutil.parser.parse(e['updated']) > last_update]
    else:
        entries = feed.entries
    entries.sort(key=lambda e: e.updated_parsed)
    generator = generator or detect_generator(feed)
    for entry in entries:
        new_entries += 1
        yield get_entry(entry, include_images, generator)
    return new_entries
def collect_images(entry, generator=None):
    def find_urls(part):
        if not part:
            return
        soup = BeautifulSoup(part, 'html.parser')
        for tag in soup.find_all(["a", "img"]):
            if tag.name == "a":
                url = tag["href"]
            elif tag.name == "img":
                url = tag["src"]
            if url not in urls:
                urls.append(url)
    urls = []
    find_urls(entry.get("summary", ""))
    for c in entry.get("content", []):
        find_urls(c.value)
    for e in (entry.enclosures
              + [l for l in entry.links if l.get("rel") == "enclosure"]):
        if (e["type"].startswith(("image/", "video/")) and
                e["href"] not in urls):
            urls.append(e["href"])
    if generator == "wordpress":
        urls = (u for u in urls if not "/wp-content/plugins/" in u)
        # Work around a wordpress bug: If the filename contains an
        # umlaut, this will not be encoded using %-escape, as the
        # standard demands. This will break encoding in http.request()
        urls = (u.encode("ascii", "urlencodereplace").decode()
                for u in urls)
    images = []
    for url in urls:
        resp = http.request('GET', url, preload_content=False)
        if resp.headers['content-type'].startswith(("image/", "video/")):
            images.append(resp)
            # IMPORTANT: Need to release_conn() later!
            if len(images) >= MAX_IMAGES:
                break
        else:
            resp.release_conn()
    return images
def get_entry(entry, include_images, generator=None):
    def cleanup(text):
        html = BeautifulSoup(text, 'html.parser')
        # Remove all elements of class read-more or read-more-*
        for more in html.find_all(None, re.compile("^read-more($|-.*)")):
            more.extract()
        text = html.get_text()
        text = re.sub('\xa0+', ' ', text)
        text = re.sub(' +', ' ', text)
        text = re.sub(' +\n', '\n', text)
        text = re.sub('\n\n\n+', '\n\n', text, flags=re.M)
        return text.strip()
    hashtags = []
    for tag in entry.get('tags', []):
        t = tag['term'].replace(' ', '_').replace('.', '').replace('-', '')
        hashtags.append('#{}'.format(t))
    summary = entry.get('summary', '')
    content = entry.get('content', '') or ''
    if content:
        content = cleanup(content[0].get('value', ''))
    url = entry.id
    if generator == "wordpress":
        links = [l for l in entry.links if l.get("rel") == "alternate"]
        if len(links) > 1:
            links = [l for l in entry.links if l.get("type") == "text/html"]
        if links:
            url = links[0]["href"]
        t = tag['term'].replace(' ', '_').replace('.', '').replace('-', '')
        hashtags.append('#{}'.format(t))
    return {
        'url': url,
        'link': entry.link,
        'title': cleanup(entry.title),
        'summary': cleanup(summary),
        'content': content,
        'hashtags': ' '.join(hashtags),
        'updated': dateutil.parser.parse(entry['updated']),
        'images': collect_images(entry, generator) if include_images else [],
        '__generator__': generator,
    }
def setup(config_file):
    def yes_no(question):
        res = input(question + ' [y/n] ')
        return res.lower() in "y1"
    url = input('What is your Mastodon Instance URL? ')
    have_app = yes_no('Do you have your app credentials already?')
    if have_app:
@@ -247,14 +177,12 @@ def setup(config_file):
    feed_url = input('RSS/Atom feed URL to watch: ')
    old_posts = yes_no('Shall already existing entries be tooted, too?')
    include_images = yes_no('Shall images be included in the toot?')
    config = {
        'name': name,
        'url': url,
        'client_id': client_id,
        'client_secret': client_secret,
        'access_token': access_token,
        'include_images': include_images,
        'feeds': [
            {'url': feed_url, 'template': '{title} {url}'}
        ]

setup.py

@@ -5,7 +5,7 @@ with open("README.md") as f:
setup(
    name='feediverse',
    version='0.1.2',
    version='0.3.0',
    python_requires='>=3.3',
    url='https://github.com/edsu/feediverse',
    author='Ed Summers',
@@ -18,7 +18,6 @@ setup(
        'feedparser',
        'mastodon.py',
        'python-dateutil',
        'pyyaml',
        'urllib3[secure]'],
        'pyyaml'],
    entry_points={'console_scripts': ['feediverse = feediverse:main']}
)