Compare commits


10 Commits

Author SHA1 Message Date
Ryan Rix 361dd98c64 post the HTML from summaries directly 2023-01-24 21:17:44 -10:00
Ryan Rix 6662467e9a fetch feeds from Arcology feeds.json file 2023-01-24 21:17:29 -10:00
Ed Summers e9d58c95be Remove image downloading
The special casing of Wordpress and image downloading was not reliable
for me so I have removed it, and tried to simplify the code in the
process. If you still need this functionality you will want to pin v0.2.1.
2022-02-19 13:38:12 +00:00
Ed Summers f3daed0bfb link to feed-to-activitypub 2021-10-08 13:40:48 -04:00
Ed Summers 7a90313f1e a bit more verbose 2021-09-25 16:35:45 +00:00
Ed Summers be69e525b9 guard against content-type http header not being present 2021-09-18 20:44:07 +00:00
Ed Summers fd292f6222 fixed config file error 2021-01-07 21:20:36 +00:00
Ed Summers 29f416d7a4 catch http errors when fetching images 2020-11-24 21:45:10 +00:00
Ed Summers fb914c7510 doc fix 2020-10-09 16:35:11 +00:00
Ed Summers e73f405b54 new version 2020-10-09 16:28:41 +00:00
4 changed files with 97 additions and 206 deletions

LICENSE

@@ -1,6 +1,6 @@
The MIT License (MIT)
Copyright (c) 2018 Ed Summers
Copyright (c) Ed Summers
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

README.md

@@ -1,6 +1,6 @@
*feediverse* will read RSS/Atom feeds and send the messages as Mastodon posts.
Please use responsibly! *feediverse* is kind of the same thing as [feed2toot]
but it's just one module that works with Python 3 ... and I was bored.
It's meant to add a little bit of spice to your timeline from other places.
Please use it responsibly.
## Install
@@ -18,8 +18,7 @@ Once *feediverse* is configured you can add it to your crontab:
*/15 * * * * /usr/local/bin/feediverse
Run `feediverse --help` to show the comand line options.
Run `feediverse --help` to show the command line options.
## Post Format
@@ -41,7 +40,6 @@ separated list of hashtags. For some feeds (e.g. youtube-rss) you should use `{l
stripped). Please be aware that this might easily exceed Mastodon's
limit of 512 characters.
## Multiple Feeds
Since *feeds* is a list you can add additional feeds to watch if you want.
@@ -52,38 +50,4 @@ Since *feeds* is a list you can add additional feeds to watch if you want.
template: "dot com: {title} {url}"
- url: https://example.org/feed/
template: "dot org: {title} {url}"
generator: wordpress
## Special Handling for Different Feed Generators
*feediverse* has support for some special cases of some feed
generators, for example detecting an entry's permalink. Currently
only Wordpress is handled, but others may follow.
If a feed does not provide a proper *generator* entry, you can set it
by adding a `generator:` value to the feed's configuration. See the
second one in the example above.
You can check whether a feed provides a *generator* entry like this:
feediverse --verbose --dry-run feedverse-test.rc | grep generator
## Why?
I created *feediverse* because I wanted to send my Pinboard bookmarks to
Mastodon. I've got an IFTTT recipe that does this for Twitter, but IFTTT
doesn't appear to work with Mastodon yet. That being said *feediverse* should
work with any RSS or Atom feed (thanks to [feedparser]).
## Warning!
Please be responsible. Don't fill up Mastodon with tons of junk just because you
can. That kind of toxic behavior is why a lot of people are trying to establish
other forms of social media like Mastodon.
[feed2toot]: https://gitlab.com/chaica/feed2toot/
[feedparser]: http://feedparser.org/

feediverse.py

@@ -4,30 +4,16 @@ import os
import re
import sys
import yaml
import codecs
import argparse
import urllib3
import dateutil
import feedparser
import requests
from bs4 import BeautifulSoup
from mastodon import Mastodon
from datetime import datetime, timezone, MINYEAR
DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse")
MAX_IMAGES = 4 # Mastodon allows attaching 4 images max.
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',)
# encoding error-handler for buggy wordpress urls
def __urlencodereplace_errors(exc):
    bs = exc.object[exc.start:exc.end].encode("utf-8")
    bs = b"".join(b'%%%X' % b for b in bs)
    return (bs, exc.end)

codecs.register_error("urlencodereplace", __urlencodereplace_errors)
DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse")
def main():
@@ -37,12 +23,18 @@ def main():
                                  "don't toot, don't save config"))
    parser.add_argument("-v", "--verbose", action="store_true",
                        help="be verbose")
    parser.add_argument("config_file", nargs="?", metavar="CONFIG-FILE",
                        help=("config file to use, default: %s" %
                              DEFAULT_CONFIG_FILE),
    parser.add_argument("-c", "--config",
                        help="config file to use",
                        default=os.path.expanduser(DEFAULT_CONFIG_FILE))
    parser.add_argument("-f", "--feeds",
                        help="URL to fetch feed list from",
                        default="https://thelionsrear.com/feeds.json")
    args = parser.parse_args()
    config_file = args.config_file
    config_file = args.config
    if args.verbose:
        print("using config file", config_file)
    if not os.path.isfile(config_file):
        setup(config_file)
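Taken together, the reworked argument parser supports an invocation along these lines (a sketch; the flags are the ones registered above, the config path is illustrative):

    feediverse --verbose --dry-run -c ~/.feediverse -f https://thelionsrear.com/feeds.json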
@@ -56,41 +48,95 @@ def main():
        access_token=config['access_token']
    )
    config['feeds'] = fetch_dynamic_feeds(config['name'], args.feeds)
    newest_post = config['updated']
    for feed in config['feeds']:
        for entry in get_feed(feed['url'], config['updated'],
                              config['include_images'],
                              generator=feed.get('generator')):
        if args.verbose:
            print(f"fetching {feed['url']} entries since {config['updated']}")
        for entry in get_feed(feed['url'], config['updated']):
            newest_post = max(newest_post, entry['updated'])
            if args.verbose:
                try:
                    print(entry)
                except UnicodeEncodeError:
                    # work-around for non-unicode terminals
                    print(dict(
                        (k, v.encode("utf-8") if hasattr(v, "encode") else v)
                        for k, v in entry.items()))
                print(entry)
            if args.dry_run:
                print("trial run, not tooting ", entry["title"][:50])
                continue
            media_ids = []
            for img in entry.get("images", []):
                media = masto.media_post(img, img.headers['content-type'])
                img.release_conn()  # deferred from collect_images()
                if not 'error' in media:
                    media_ids.append(media)
            entry.pop("images", None)
            masto.status_post(feed['template'].format(**entry)[:499],
                              media_ids=media_ids)
                              content_type='text/html',
                              visibility=feed['visibility'])
    config['updated'] = newest_post.isoformat()
    if args.dry_run:
        print("trial run, not saving the config")
    else:
        if args.verbose:
            print("saving the config")
        if not args.dry_run:
            config['updated'] = newest_post.isoformat()
        save_config(config, config_file)
def fetch_dynamic_feeds(site_name, feeds_url):
    # pull the list of feeds for this site from the Arcology feeds.json endpoint
    feeds = requests.get(feeds_url).json()
    return [
        dict(url=x['url'],
             visibility=x['visibility'],
             template='NEW by @rrix@notes.whatthefuck.computer: {url} {title}\n{summary}')
        for x in feeds if x['site'] == site_name
    ]
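For context, `fetch_dynamic_feeds` only relies on each feed record exposing `site`, `url`, and `visibility` keys, so the feeds.json payload is presumably a list shaped roughly like this (a hypothetical example; only the three key names come from the code above, all values are invented):

    # assumed feeds.json shape, written as a Python literal for illustration
    feeds = [
        {"site": "thelionsrear", "url": "https://thelionsrear.com/feed.xml",
         "visibility": "public"},
    ]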
def get_feed(feed_url, last_update):
    feed = feedparser.parse(feed_url)
    # only keep entries newer than the last run, and yield them oldest-first
    if last_update:
        entries = [e for e in feed.entries
                   if dateutil.parser.parse(e['updated']) > last_update]
    else:
        entries = feed.entries
    entries.sort(key=lambda e: e.updated_parsed)
    for entry in entries:
        yield get_entry(entry)
def get_entry(entry):
    hashtags = []
    for tag in entry.get('tags', []):
        t = tag['term'].replace(' ', '_').replace('.', '').replace('-', '')
        hashtags.append('#{}'.format(t))
    summary = entry.get('summary', '')
    content = entry.get('content', '') or ''
    # if content:
    #     content = cleanup(content[0].get('value', ''))
    url = entry.id
    return {
        'url': url,
        'link': entry.link,
        'title': cleanup(entry.title),
        'summary': summary,
        'content': content,
        'hashtags': ' '.join(hashtags),
        'updated': dateutil.parser.parse(entry['updated'])
    }
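Downstream in main(), each of these entry dicts is expanded into the per-feed template with str.format and truncated, so the posting step reduces to something like this sketch (entry values invented for illustration):

    # illustration only: mirrors feed['template'].format(**entry)[:499] in main()
    entry = {'url': 'https://example.org/post/1', 'title': 'Hello',
             'summary': 'First post', 'link': 'https://example.org/post/1',
             'content': '', 'hashtags': '', 'updated': None}
    template = 'NEW by @rrix@notes.whatthefuck.computer: {url} {title}\n{summary}'
    status = template.format(**entry)[:499]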
def cleanup(text):
    html = BeautifulSoup(text, 'html.parser')
    text = html.get_text()
    text = re.sub('\xa0+', ' ', text)
    text = re.sub(' +', ' ', text)
    text = re.sub(' +\n', '\n', text)
    text = re.sub('(\w)\n(\w)', '\\1 \\2', text)
    text = re.sub('\n\n\n+', '\n\n', text, flags=re.M)
    return text.strip()
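A quick illustration of the new cleanup (not part of the diff): tags are stripped and runs of non-breaking spaces collapse to single spaces before the text is reused:

    >>> cleanup("<p>Hello&nbsp;&nbsp;<b>world</b></p>")
    'Hello world'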
def find_urls(html):
    if not html:
        return
    urls = []
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup.find_all(["a", "img"]):
        if tag.name == "a":
            url = tag.get("href")
        elif tag.name == "img":
            url = tag.get("src")
        if url and url not in urls:
            urls.append(url)
    return urls
def yes_no(question):
    res = input(question + ' [y/n] ')
    return res.lower() in "y1"

def save_config(config, config_file):
    copy = dict(config)
    with open(config_file, 'w') as fh:
@@ -98,8 +144,7 @@ def save_config(config, config_file):
def read_config(config_file):
    config = {
        'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc),
        'include_images': False,
        'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc)
    }
    with open(config_file) as fh:
        cfg = yaml.load(fh, yaml.SafeLoader)
@@ -108,122 +153,7 @@ def read_config(config_file):
    config.update(cfg)
    return config
def detect_generator(feed):
    # For RSS the generator tag holds the URL, while for ATOM it holds the name
    generator = feed.feed.get("generator", "")
    if "/wordpress.org/" in generator:
        return "wordpress"
    elif "wordpress" == generator.lower():
        return "wordpress"
    return None
def get_feed(feed_url, last_update, include_images, generator=None):
    new_entries = 0
    feed = feedparser.parse(feed_url)
    if last_update:
        entries = [e for e in feed.entries
                   if dateutil.parser.parse(e['updated']) > last_update]
    else:
        entries = feed.entries
    entries.sort(key=lambda e: e.updated_parsed)
    generator = generator or detect_generator(feed)
    for entry in entries:
        new_entries += 1
        yield get_entry(entry, include_images, generator)
    return new_entries
def collect_images(entry, generator=None):
    def find_urls(part):
        if not part:
            return
        soup = BeautifulSoup(part, 'html.parser')
        for tag in soup.find_all(["a", "img"]):
            if tag.name == "a":
                url = tag["href"]
            elif tag.name == "img":
                url = tag["src"]
            if url not in urls:
                urls.append(url)
    urls = []
    find_urls(entry.get("summary", ""))
    for c in entry.get("content", []):
        find_urls(c.value)
    for e in (entry.enclosures
              + [l for l in entry.links if l.get("rel") == "enclosure"]):
        if (e["type"].startswith(("image/", "video/")) and
                e["href"] not in urls):
            urls.append(e["href"])
    if generator == "wordpress":
        urls = (u for u in urls if not "/wp-content/plugins/" in u)
        # Work around a wordpress bug: If the filename contains an
        # umlaut, this will not be encoded using %-escape, as the
        # standard demands. This will break encoding in http.request()
        urls = (u.encode("ascii", "urlencodereplace").decode()
                for u in urls)
    images = []
    for url in urls:
        resp = http.request('GET', url, preload_content=False)
        if resp.headers['content-type'].startswith(("image/", "video/")):
            images.append(resp)
            # IMPORTANT: Need to release_conn() later!
            if len(images) >= MAX_IMAGES:
                break
        else:
            resp.release_conn()
    return images
def get_entry(entry, include_images, generator=None):
    def cleanup(text):
        html = BeautifulSoup(text, 'html.parser')
        # Remove all elements of class read-more or read-more-*
        for more in html.find_all(None, re.compile("^read-more($|-.*)")):
            more.extract()
        text = html.get_text()
        text = re.sub('\xa0+', ' ', text)
        text = re.sub(' +', ' ', text)
        text = re.sub(' +\n', '\n', text)
        text = re.sub('\n\n\n+', '\n\n', text, flags=re.M)
        return text.strip()
    hashtags = []
    for tag in entry.get('tags', []):
        t = tag['term'].replace(' ', '_').replace('.', '').replace('-', '')
        hashtags.append('#{}'.format(t))
    summary = entry.get('summary', '')
    content = entry.get('content', '') or ''
    if content:
        content = cleanup(content[0].get('value', ''))
    url = entry.id
    if generator == "wordpress":
        links = [l for l in entry.links if l.get("rel") == "alternate"]
        if len(links) > 1:
            links = [l for l in entry.links if l.get("type") == "text/html"]
        if links:
            url = links[0]["href"]
        t = tag['term'].replace(' ', '_').replace('.', '').replace('-', '')
        hashtags.append('#{}'.format(t))
    return {
        'url': url,
        'link': entry.link,
        'title': cleanup(entry.title),
        'summary': cleanup(summary),
        'content': content,
        'hashtags': ' '.join(hashtags),
        'updated': dateutil.parser.parse(entry['updated']),
        'images': collect_images(entry, generator) if include_images else [],
        '__generator__': generator,
    }
def setup(config_file):
    def yes_no(question):
        res = input(question + ' [y/n] ')
        return res.lower() in "y1"
    url = input('What is your Mastodon Instance URL? ')
    have_app = yes_no('Do you have your app credentials already?')
    if have_app:
@@ -247,14 +177,12 @@ def setup(config_file):
    feed_url = input('RSS/Atom feed URL to watch: ')
    old_posts = yes_no('Shall already existing entries be tooted, too?')
    include_images = yes_no('Shall images be included in the toot?')
    config = {
        'name': name,
        'url': url,
        'client_id': client_id,
        'client_secret': client_secret,
        'access_token': access_token,
        'include_images': include_images,
        'feeds': [
            {'url': feed_url, 'template': '{title} {url}'}
        ]

setup.py

@@ -5,7 +5,7 @@ with open("README.md") as f:
setup(
    name='feediverse',
    version='0.1.2',
    version='0.3.0',
    python_requires='>=3.3',
    url='https://github.com/edsu/feediverse',
    author='Ed Summers',
@@ -18,7 +18,6 @@ setup(
        'feedparser',
        'mastodon.py',
        'python-dateutil',
        'pyyaml',
        'urllib3[secure]'],
        'pyyaml'],
    entry_points={'console_scripts': ['feediverse = feediverse:main']}
)