Compare commits
10 Commits
fa175cf9c6
...
361dd98c64
Author | SHA1 | Date |
---|---|---|
Ryan Rix | 361dd98c64 | |
Ryan Rix | 6662467e9a | |
Ed Summers | e9d58c95be | |
Ed Summers | f3daed0bfb | |
Ed Summers | 7a90313f1e | |
Ed Summers | be69e525b9 | |
Ed Summers | fd292f6222 | |
Ed Summers | 29f416d7a4 | |
Ed Summers | fb914c7510 | |
Ed Summers | e73f405b54 |
2
LICENSE
2
LICENSE
|
@@ -1,6 +1,6 @@
|
||||||
The MIT License (MIT)
|
The MIT License (MIT)
|
||||||
|
|
||||||
Copyright (c) 2018 Ed Summers
|
Copyright (c) Ed Summers
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
|
42
README.md
42
README.md
|
@@ -1,6 +1,6 @@
|
||||||
*feediverse* will read RSS/Atom feeds and send the messages as Mastodon posts.
|
*feediverse* will read RSS/Atom feeds and send the messages as Mastodon posts.
|
||||||
Please use responsibly! *feediverse* is kind of the same thing as [feed2toot]
|
It's meant to add a little bit of spice to your timeline from other places.
|
||||||
but it's just one module that works with Python 3 ... and I was bored.
|
Please use it responsibly.
|
||||||
|
|
||||||
## Install
|
## Install
|
||||||
|
|
||||||
|
@@ -18,8 +18,7 @@ Once *feediverse* is configured you can add it to your crontab:
|
||||||
|
|
||||||
*/15 * * * * /usr/local/bin/feediverse
|
*/15 * * * * /usr/local/bin/feediverse
|
||||||
|
|
||||||
Run `feediverse --help` to show the comand line options.
|
Run `feediverse --help` to show the command line options.
|
||||||
|
|
||||||
|
|
||||||
## Post Format
|
## Post Format
|
||||||
|
|
||||||
|
@@ -41,7 +40,6 @@ separated list of hashtags. For some feeds (e.g. youtube-rss) you should use `{l
|
||||||
stripped). Please be aware that this might easily exceed Mastodon's
|
stripped). Please be aware that this might easily exceed Mastodon's
|
||||||
limit of 512 characters.
|
limit of 512 characters.
|
||||||
|
|
||||||
|
|
||||||
## Multiple Feeds
|
## Multiple Feeds
|
||||||
|
|
||||||
Since *feeds* is a list you can add additional feeds to watch if you want.
|
Since *feeds* is a list you can add additional feeds to watch if you want.
|
||||||
|
@@ -52,38 +50,4 @@ Since *feeds* is a list you can add additional feeds to watch if you want.
|
||||||
template: "dot com: {title} {url}"
|
template: "dot com: {title} {url}"
|
||||||
- url: https://example.org/feed/
|
- url: https://example.org/feed/
|
||||||
template: "dot org: {title} {url}"
|
template: "dot org: {title} {url}"
|
||||||
generator: wordpress
|
|
||||||
|
|
||||||
|
|
||||||
## Special Handling for Different Feed Generators
|
|
||||||
|
|
||||||
*feediverse* has support for some special cases of some feed
|
|
||||||
generators. For example detecting the entries perma-link. Currently
|
|
||||||
only Wordpress is handled, but others may follow.
|
|
||||||
|
|
||||||
If a feed does not provide a proper *generator* entry, you can set it
|
|
||||||
by adding a `generator:` value to the feed's configuration. See the
|
|
||||||
seconds one in the example above.
|
|
||||||
|
|
||||||
You can check whether feed provides a *generator* entry like this:
|
|
||||||
|
|
||||||
feediverse --verbose --dry-run feedverse-test.rc | grep generator
|
|
||||||
|
|
||||||
|
|
||||||
## Why?
|
|
||||||
|
|
||||||
I created *feediverse* because I wanted to send my Pinboard bookmarks to
|
|
||||||
Mastodon. I've got an IFTTT recipe that does this for Twitter, but IFTTT
|
|
||||||
doesn't appear to work with Mastodon yet. That being said *feediverse* should
|
|
||||||
work with any RSS or Atom feed (thanks to [feedparser]).
|
|
||||||
|
|
||||||
## Warning!
|
|
||||||
|
|
||||||
Please be responsible. Don't fill up Mastodon with tons of junk just because you
|
|
||||||
can. That kind of toxic behavior is why a lot of people are trying to establish
|
|
||||||
other forms of social media like Mastodon.
|
|
||||||
|
|
||||||
[feed2toot]: https://gitlab.com/chaica/feed2toot/
|
|
||||||
[feedparser]: http://feedparser.org/
|
|
||||||
|
|
||||||
|
|
||||||
|
|
254
feediverse.py
254
feediverse.py
|
@@ -4,30 +4,16 @@ import os
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import yaml
|
import yaml
|
||||||
import codecs
|
|
||||||
import argparse
|
import argparse
|
||||||
import urllib3
|
|
||||||
import dateutil
|
import dateutil
|
||||||
import feedparser
|
import feedparser
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
from mastodon import Mastodon
|
from mastodon import Mastodon
|
||||||
from datetime import datetime, timezone, MINYEAR
|
from datetime import datetime, timezone, MINYEAR
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse")
|
|
||||||
MAX_IMAGES = 4 # Mastodon allows attaching 4 images max.
|
|
||||||
|
|
||||||
http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',)
|
|
||||||
|
|
||||||
# encoding error-handler for buggy wordpress urls
|
|
||||||
def __urlencodereplace_errors(exc):
|
|
||||||
bs = exc.object[exc.start:exc.end].encode("utf-8")
|
|
||||||
bs = b"".join(b'%%%X' % b for b in bs)
|
|
||||||
return (bs, exc.end)
|
|
||||||
codecs.register_error("urlencodereplace", __urlencodereplace_errors)
|
|
||||||
|
|
||||||
|
|
||||||
DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse")
|
DEFAULT_CONFIG_FILE = os.path.join("~", ".feediverse")
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
@@ -37,12 +23,18 @@ def main():
|
||||||
"don't toot, don't save config"))
|
"don't toot, don't save config"))
|
||||||
parser.add_argument("-v", "--verbose", action="store_true",
|
parser.add_argument("-v", "--verbose", action="store_true",
|
||||||
help="be verbose")
|
help="be verbose")
|
||||||
parser.add_argument("config_file", nargs="?", metavar="CONFIG-FILE",
|
parser.add_argument("-c", "--config",
|
||||||
help=("config file to use, default: %s" %
|
help="config file to use",
|
||||||
DEFAULT_CONFIG_FILE),
|
|
||||||
default=os.path.expanduser(DEFAULT_CONFIG_FILE))
|
default=os.path.expanduser(DEFAULT_CONFIG_FILE))
|
||||||
|
parser.add_argument("-f", "--feeds",
|
||||||
|
help="URL to fetch feed list from",
|
||||||
|
default="https://thelionsrear.com/feeds.json")
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
config_file = args.config_file
|
config_file = args.config
|
||||||
|
|
||||||
|
if args.verbose:
|
||||||
|
print("using config file", config_file)
|
||||||
|
|
||||||
if not os.path.isfile(config_file):
|
if not os.path.isfile(config_file):
|
||||||
setup(config_file)
|
setup(config_file)
|
||||||
|
@@ -56,41 +48,95 @@ def main():
|
||||||
access_token=config['access_token']
|
access_token=config['access_token']
|
||||||
)
|
)
|
||||||
|
|
||||||
|
config['feeds'] = fetch_dynamic_feeds(config['name'], args.feeds)
|
||||||
|
|
||||||
newest_post = config['updated']
|
newest_post = config['updated']
|
||||||
for feed in config['feeds']:
|
for feed in config['feeds']:
|
||||||
for entry in get_feed(feed['url'], config['updated'],
|
if args.verbose:
|
||||||
config['include_images'],
|
print(f"fetching {feed['url']} entries since {config['updated']}")
|
||||||
generator=feed.get('generator')):
|
for entry in get_feed(feed['url'], config['updated']):
|
||||||
newest_post = max(newest_post, entry['updated'])
|
newest_post = max(newest_post, entry['updated'])
|
||||||
if args.verbose:
|
if args.verbose:
|
||||||
try:
|
print(entry)
|
||||||
print(entry)
|
|
||||||
except UnicodeEncodeError:
|
|
||||||
# work-around for non-unicode terminals
|
|
||||||
print(dict(
|
|
||||||
(k, v.encode("utf-8") if hasattr(v, "encode") else v)
|
|
||||||
for k, v in entry.items()))
|
|
||||||
if args.dry_run:
|
if args.dry_run:
|
||||||
print("trial run, not tooting ", entry["title"][:50])
|
print("trial run, not tooting ", entry["title"][:50])
|
||||||
continue
|
continue
|
||||||
media_ids = []
|
|
||||||
for img in entry.get("images", []):
|
|
||||||
media = masto.media_post(img, img.headers['content-type'])
|
|
||||||
img.release_conn() # deferred from collect_images()
|
|
||||||
if not 'error' in media:
|
|
||||||
media_ids.append(media)
|
|
||||||
entry.pop("images", None)
|
|
||||||
masto.status_post(feed['template'].format(**entry)[:499],
|
masto.status_post(feed['template'].format(**entry)[:499],
|
||||||
media_ids=media_ids)
|
content_type='text/html',
|
||||||
|
visbility=feed['visibility'])
|
||||||
|
|
||||||
config['updated'] = newest_post.isoformat()
|
if not args.dry_run:
|
||||||
if args.dry_run:
|
config['updated'] = newest_post.isoformat()
|
||||||
print("trial run, not saving the config")
|
|
||||||
else:
|
|
||||||
if args.verbose:
|
|
||||||
print("saving the config")
|
|
||||||
save_config(config, config_file)
|
save_config(config, config_file)
|
||||||
|
|
||||||
|
def fetch_dynamic_feeds(site_name, feeds_url):
|
||||||
|
feeds = requests.get(feeds_url).json()
|
||||||
|
return [
|
||||||
|
dict(url=x['url'],
|
||||||
|
visibility=x['visibility'],
|
||||||
|
template='NEW by @rrix@notes.whatthefuck.computer: {url} {title}\n{summary}')
|
||||||
|
for x in feeds if x['site'] == site_name
|
||||||
|
]
|
||||||
|
|
||||||
|
def get_feed(feed_url, last_update):
|
||||||
|
feed = feedparser.parse(feed_url)
|
||||||
|
if last_update:
|
||||||
|
entries = [e for e in feed.entries
|
||||||
|
if dateutil.parser.parse(e['updated']) > last_update]
|
||||||
|
else:
|
||||||
|
entries = feed.entries
|
||||||
|
entries.sort(key=lambda e: e.updated_parsed)
|
||||||
|
for entry in entries:
|
||||||
|
yield get_entry(entry)
|
||||||
|
|
||||||
|
def get_entry(entry):
|
||||||
|
hashtags = []
|
||||||
|
for tag in entry.get('tags', []):
|
||||||
|
t = tag['term'].replace(' ', '_').replace('.', '').replace('-', '')
|
||||||
|
hashtags.append('#{}'.format(t))
|
||||||
|
summary = entry.get('summary', '')
|
||||||
|
content = entry.get('content', '') or ''
|
||||||
|
# if content:
|
||||||
|
# content = cleanup(content[0].get('value', ''))
|
||||||
|
url = entry.id
|
||||||
|
return {
|
||||||
|
'url': url,
|
||||||
|
'link': entry.link,
|
||||||
|
'title': cleanup(entry.title),
|
||||||
|
'summary': summary,
|
||||||
|
'content': content,
|
||||||
|
'hashtags': ' '.join(hashtags),
|
||||||
|
'updated': dateutil.parser.parse(entry['updated'])
|
||||||
|
}
|
||||||
|
|
||||||
|
def cleanup(text):
|
||||||
|
html = BeautifulSoup(text, 'html.parser')
|
||||||
|
text = html.get_text()
|
||||||
|
text = re.sub('\xa0+', ' ', text)
|
||||||
|
text = re.sub(' +', ' ', text)
|
||||||
|
text = re.sub(' +\n', '\n', text)
|
||||||
|
text = re.sub('(\w)\n(\w)', '\\1 \\2', text)
|
||||||
|
text = re.sub('\n\n\n+', '\n\n', text, flags=re.M)
|
||||||
|
return text.strip()
|
||||||
|
|
||||||
|
def find_urls(html):
|
||||||
|
if not html:
|
||||||
|
return
|
||||||
|
urls = []
|
||||||
|
soup = BeautifulSoup(html, 'html.parser')
|
||||||
|
for tag in soup.find_all(["a", "img"]):
|
||||||
|
if tag.name == "a":
|
||||||
|
url = tag.get("href")
|
||||||
|
elif tag.name == "img":
|
||||||
|
url = tag.get("src")
|
||||||
|
if url and url not in urls:
|
||||||
|
urls.append(url)
|
||||||
|
return urls
|
||||||
|
|
||||||
|
def yes_no(question):
|
||||||
|
res = input(question + ' [y/n] ')
|
||||||
|
return res.lower() in "y1"
|
||||||
|
|
||||||
def save_config(config, config_file):
|
def save_config(config, config_file):
|
||||||
copy = dict(config)
|
copy = dict(config)
|
||||||
with open(config_file, 'w') as fh:
|
with open(config_file, 'w') as fh:
|
||||||
|
@@ -98,8 +144,7 @@ def save_config(config, config_file):
|
||||||
|
|
||||||
def read_config(config_file):
|
def read_config(config_file):
|
||||||
config = {
|
config = {
|
||||||
'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc),
|
'updated': datetime(MINYEAR, 1, 1, 0, 0, 0, 0, timezone.utc)
|
||||||
'include_images': False,
|
|
||||||
}
|
}
|
||||||
with open(config_file) as fh:
|
with open(config_file) as fh:
|
||||||
cfg = yaml.load(fh, yaml.SafeLoader)
|
cfg = yaml.load(fh, yaml.SafeLoader)
|
||||||
|
@@ -108,122 +153,7 @@ def read_config(config_file):
|
||||||
config.update(cfg)
|
config.update(cfg)
|
||||||
return config
|
return config
|
||||||
|
|
||||||
def detect_generator(feed):
|
|
||||||
# For RSS the generator tag holds the URL, while for ATOM it holds the name
|
|
||||||
generator = feed.feed.get("generator", "")
|
|
||||||
if "/wordpress.org/" in generator:
|
|
||||||
return "wordpress"
|
|
||||||
elif "wordpress" == generator.lower():
|
|
||||||
return "wordpress"
|
|
||||||
return None
|
|
||||||
|
|
||||||
def get_feed(feed_url, last_update, include_images, generator=None):
|
|
||||||
new_entries = 0
|
|
||||||
feed = feedparser.parse(feed_url)
|
|
||||||
if last_update:
|
|
||||||
entries = [e for e in feed.entries
|
|
||||||
if dateutil.parser.parse(e['updated']) > last_update]
|
|
||||||
else:
|
|
||||||
entries = feed.entries
|
|
||||||
entries.sort(key=lambda e: e.updated_parsed)
|
|
||||||
generator = generator or detect_generator(feed)
|
|
||||||
for entry in entries:
|
|
||||||
new_entries += 1
|
|
||||||
yield get_entry(entry, include_images, generator)
|
|
||||||
return new_entries
|
|
||||||
|
|
||||||
def collect_images(entry, generator=None):
|
|
||||||
|
|
||||||
def find_urls(part):
|
|
||||||
if not part:
|
|
||||||
return
|
|
||||||
soup = BeautifulSoup(part, 'html.parser')
|
|
||||||
for tag in soup.find_all(["a", "img"]):
|
|
||||||
if tag.name == "a":
|
|
||||||
url = tag["href"]
|
|
||||||
elif tag.name == "img":
|
|
||||||
url = tag["src"]
|
|
||||||
if url not in urls:
|
|
||||||
urls.append(url)
|
|
||||||
|
|
||||||
urls = []
|
|
||||||
find_urls(entry.get("summary", ""))
|
|
||||||
for c in entry.get("content", []):
|
|
||||||
find_urls(c.value)
|
|
||||||
for e in (entry.enclosures
|
|
||||||
+ [l for l in entry.links if l.get("rel") == "enclosure"]):
|
|
||||||
if (e["type"].startswith(("image/", "video/")) and
|
|
||||||
e["href"] not in urls):
|
|
||||||
urls.append(e["href"])
|
|
||||||
if generator == "wordpress":
|
|
||||||
urls = (u for u in urls if not "/wp-content/plugins/" in u)
|
|
||||||
# Work around a wordpress bug: If the filename contains an
|
|
||||||
# umlaut, this will not be encoded using %-escape, as the
|
|
||||||
# standard demands. This will break encoding in http.request()
|
|
||||||
urls = (u.encode("ascii", "urlencodereplace").decode()
|
|
||||||
for u in urls)
|
|
||||||
images = []
|
|
||||||
for url in urls:
|
|
||||||
resp = http.request('GET', url, preload_content=False)
|
|
||||||
if resp.headers['content-type'].startswith(("image/", "video/")):
|
|
||||||
images.append(resp)
|
|
||||||
# IMPORTANT: Need to release_conn() later!
|
|
||||||
if len(images) >= MAX_IMAGES:
|
|
||||||
break
|
|
||||||
else:
|
|
||||||
resp.release_conn()
|
|
||||||
return images
|
|
||||||
|
|
||||||
|
|
||||||
def get_entry(entry, include_images, generator=None):
|
|
||||||
|
|
||||||
def cleanup(text):
|
|
||||||
html = BeautifulSoup(text, 'html.parser')
|
|
||||||
# Remove all elements of class read-more or read-more-*
|
|
||||||
for more in html.find_all(None, re.compile("^read-more($|-.*)")):
|
|
||||||
more.extract()
|
|
||||||
text = html.get_text()
|
|
||||||
text = re.sub('\xa0+', ' ', text)
|
|
||||||
text = re.sub(' +', ' ', text)
|
|
||||||
text = re.sub(' +\n', '\n', text)
|
|
||||||
text = re.sub('\n\n\n+', '\n\n', text, flags=re.M)
|
|
||||||
return text.strip()
|
|
||||||
|
|
||||||
hashtags = []
|
|
||||||
for tag in entry.get('tags', []):
|
|
||||||
t = tag['term'].replace(' ', '_').replace('.', '').replace('-', '')
|
|
||||||
hashtags.append('#{}'.format(t))
|
|
||||||
summary = entry.get('summary', '')
|
|
||||||
content = entry.get('content', '') or ''
|
|
||||||
if content:
|
|
||||||
content = cleanup(content[0].get('value', ''))
|
|
||||||
url = entry.id
|
|
||||||
if generator == "wordpress":
|
|
||||||
links = [l for l in entry.links if l.get("rel") == "alternate"]
|
|
||||||
if len(links) > 1:
|
|
||||||
links = [l for l in entry.links if l.get("type") == "text/html"]
|
|
||||||
if links:
|
|
||||||
url = links[0]["href"]
|
|
||||||
t = tag['term'].replace(' ', '_').replace('.', '').replace('-', '')
|
|
||||||
hashtags.append('#{}'.format(t))
|
|
||||||
return {
|
|
||||||
'url': url,
|
|
||||||
'link': entry.link,
|
|
||||||
'title': cleanup(entry.title),
|
|
||||||
'summary': cleanup(summary),
|
|
||||||
'content': content,
|
|
||||||
'hashtags': ' '.join(hashtags),
|
|
||||||
'updated': dateutil.parser.parse(entry['updated']),
|
|
||||||
'images': collect_images(entry, generator) if include_images else [],
|
|
||||||
'__generator__': generator,
|
|
||||||
}
|
|
||||||
|
|
||||||
def setup(config_file):
|
def setup(config_file):
|
||||||
|
|
||||||
def yes_no(question):
|
|
||||||
res = input(question + ' [y/n] ')
|
|
||||||
return res.lower() in "y1"
|
|
||||||
|
|
||||||
url = input('What is your Mastodon Instance URL? ')
|
url = input('What is your Mastodon Instance URL? ')
|
||||||
have_app = yes_no('Do you have your app credentials already?')
|
have_app = yes_no('Do you have your app credentials already?')
|
||||||
if have_app:
|
if have_app:
|
||||||
|
@@ -247,14 +177,12 @@ def setup(config_file):
|
||||||
|
|
||||||
feed_url = input('RSS/Atom feed URL to watch: ')
|
feed_url = input('RSS/Atom feed URL to watch: ')
|
||||||
old_posts = yes_no('Shall already existing entries be tooted, too?')
|
old_posts = yes_no('Shall already existing entries be tooted, too?')
|
||||||
include_images = yes_no('Shall images be included in the toot?')
|
|
||||||
config = {
|
config = {
|
||||||
'name': name,
|
'name': name,
|
||||||
'url': url,
|
'url': url,
|
||||||
'client_id': client_id,
|
'client_id': client_id,
|
||||||
'client_secret': client_secret,
|
'client_secret': client_secret,
|
||||||
'access_token': access_token,
|
'access_token': access_token,
|
||||||
'include_images': include_images,
|
|
||||||
'feeds': [
|
'feeds': [
|
||||||
{'url': feed_url, 'template': '{title} {url}'}
|
{'url': feed_url, 'template': '{title} {url}'}
|
||||||
]
|
]
|
||||||
|
|
5
setup.py
5
setup.py
|
@@ -5,7 +5,7 @@ with open("README.md") as f:
|
||||||
|
|
||||||
setup(
|
setup(
|
||||||
name='feediverse',
|
name='feediverse',
|
||||||
version='0.1.2',
|
version='0.3.0',
|
||||||
python_requires='>=3.3',
|
python_requires='>=3.3',
|
||||||
url='https://github.com/edsu/feediverse',
|
url='https://github.com/edsu/feediverse',
|
||||||
author='Ed Summers',
|
author='Ed Summers',
|
||||||
|
@@ -18,7 +18,6 @@ setup(
|
||||||
'feedparser',
|
'feedparser',
|
||||||
'mastodon.py',
|
'mastodon.py',
|
||||||
'python-dateutil',
|
'python-dateutil',
|
||||||
'pyyaml',
|
'pyyaml'],
|
||||||
'urllib3[secure]'],
|
|
||||||
entry_points={'console_scripts': ['feediverse = feediverse:main']}
|
entry_points={'console_scripts': ['feediverse = feediverse:main']}
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in New Issue