Scrapers overhaul

* Switch all Python scrapers to Scrapy
 * Allow scrapers to be directly called, instead of
   using `scrapy runspider` (sketched below)
 * Prefix scrapers with `ua-scraper-` for clarity
 * Update documentation
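The direct-call support relies on a small runner appended to every spider: it reuses the option handling of `scrapy runspider`, so spider arguments such as `-a name=value` behave the same whether the scraper is executed as a plain script or through scrapy. An annotated sketch of that block (it sits at the end of each spider module, where `import scrapy` and the spider class, called `MySpider` here, are already in scope):

    if __name__ == "__main__":
        from scrapy.commands.runspider import Command
        from scrapy.crawler import CrawlerProcess
        from scrapy.settings import Settings
        import optparse

        # Build the same option parser as `scrapy runspider`, so that
        # -a name=value spider arguments end up in opts.spargs as a dict.
        parser = optparse.OptionParser()
        cmd = Command()  # written as scrapy.commands.runspider.Command() in the files below
        cmd.settings = Settings(cmd.default_settings)
        cmd.add_options(parser)
        opts, args = parser.parse_args()
        cmd.process_options(args, opts)

        # Run the spider in-process with those settings and arguments
        # instead of going through the scrapy command line.
        crawler = CrawlerProcess(cmd.settings)
        crawler.crawl(MySpider, **opts.spargs)
        crawler.start()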
rrix
Simon Lipp 2017-07-26 11:52:10 +02:00
parent 874c844f78
commit 28054b6bfd
21 changed files with 400 additions and 212 deletions

2
.gitignore vendored

@ -3,7 +3,7 @@ maildir-put/maildir-put
rss2json/rss2json
ua-inline/ua-inline
ua-proxify/ua-proxify
__pycache__
scrapers/ua-scraper-torrent9
tmp-go
*.pyc
node_modules

Makefile

@ -1,22 +1,24 @@
PREFIX=/usr/local
DESTDIR=
PYTHONVER=$(shell pkg-config --modversion python3 2>/dev/null)
BINDIR=$(DESTDIR)$(PREFIX)/bin
PYLIBDIR=$(DESTDIR)$(PREFIX)/lib/python$(PYTHONVER)/site-packages
DOCDIR=$(DESTDIR)$(PREFIX)/share/doc/ua
MANDIR=$(DESTDIR)$(PREFIX)/share/man
GODIRS=ggs rss2json maildir-put ua-inline ua-proxify
SCRAPERS=edxcourses lyon-bm-bd mal mangareader yggtorrent torrent9
export GOPATH ?= $(PWD)/tmp-go
.PHONY: all clean doc
all: ggs/ggs rss2json/rss2json maildir-put/maildir-put ua-inline/ua-inline ua-proxify/ua-proxify
all: ggs/ggs rss2json/rss2json maildir-put/maildir-put ua-inline/ua-inline ua-proxify/ua-proxify \
	scrapers/ua-scraper-torrent9

doc:
	test -d doc || mkdir doc
	test -f doc/ua.md || ln -s ../README.md doc/ua.md
	test -f doc/ua-scrapers.md || ln -s ../scrapers/README.md doc/ua-scrapers.md
	for d in $(GODIRS) ; do test -f doc/$$d.md || ln -s ../$$d/README.md doc/$$d.md ; done
	cd doc ; for f in *.md ; do ronn $$f ; done
@ -35,6 +37,10 @@ ua-inline/ua-inline: ua-inline/ua-inline.go $(GOPATH)
ua-proxify/ua-proxify: ua-proxify/ua-proxify.go $(GOPATH)
	cd ua-proxify; go get -d && go build

scrapers/ua-scraper-torrent9: scrapers/torrent9.js
	cd scrapers ; npm install && npm run webpack -- -p --output-filename ua-scraper-torrent9 --entry ./torrent9
	chmod +x $@

$(GOPATH):
	mkdir $(GOPATH)
	mkdir $(GOPATH)/bin
@ -44,12 +50,8 @@ $(GOPATH):
install: all
	install -d $(BINDIR)
	for f in $(GODIRS) ; do install $$f/$$f $(BINDIR)/ ; done
	install scrappers/mangareader2json $(BINDIR)/
	install scrappers/ipboard2json $(BINDIR)/
	install scrappers/medscape2json $(BINDIR)/
	test -n "$(PYTHONVER)" && install -d $(PYLIBDIR)
	test -n "$(PYTHONVER)" && install scrappers/scraplib.py $(PYLIBDIR)/
	for s in $(SCRAPERS) ; do install scrapers/ua-scraper-$$s $(BINDIR)/ ; done
	install weboobmsg2json/weboobmsg2json $(BINDIR)/
	install -d $(DOCDIR)
	install -d $(MANDIR)/man1/
@ -59,4 +61,4 @@ install: all
clean:
	for f in $(GODIRS) ; do rm -f $$f/$$f ; done
	rm -rf tmp-go
	rm -rf tmp-go scrapers/node_modules scrapers/ua-scraper-torrent9

README.md

@ -9,9 +9,9 @@ description in their respective folder.
them in a maildir
* `rss2json` transforms any RSS/Atom feed into a set of messages that
`maildir-put` can process
* You can write your own producers for maildir-put ; an example for the
[mangareader](http://mangareader.net) service is provided.
* You can also put filters, like `ua-inline`
* You can write your own producers (scrapers) for maildir-put ; some are
already provided in the `scrapers/` directory.
* You can also put filters, like `ua-inline` or `ua-proxify`.
## Usage
@ -22,7 +22,7 @@ them in a maildir
* Go
* libxml
* [jq](https://stedolan.github.io/jq/)
* For additional scrappers: python 3, aiohttp and pyquery
* For additional scrapers: Python 3, Scrapy and Node.js
## Installation
@ -42,7 +42,7 @@ and my Github personal feed into inbox:
}
mangareader() {
command 2000 "mangareader2json http://mangareader.net/$1 | "\
command 2000 "ua-scraper-mangareader -a name=$1 | "\
"maildir-put -root $HOME/Maildir-feeds -folder Entertainment"
}
@ -58,3 +58,11 @@ and my Github personal feed into inbox:
mangareader gantz
rss https://github.com/sloonz.private.atom?token=HIDDEN ""
## Weboob compatibility
You can use [weboob](http://weboob.org/) modules used by
[boobmsg](http://weboob.org/applications/boobmsg) to generate
messages. Configure the modules using `boobmsg`, and use `weboobmsg2json
[module-name]` to generate messages. `[module-name]` can be found in
`~/.config/weboob/backends`.
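Producers like the ones in `scrapers/` just print one JSON object per message on stdout for `maildir-put` to consume. A minimal hypothetical producer, using the same field names as the scrapers added in this commit (whether `maildir-put` accepts additional fields is not shown by this diff):

    #!/usr/bin/python3
    # Hypothetical stand-alone producer; its output can be piped into
    # `maildir-put -root $HOME/Maildir-feeds -folder Example`.
    import json

    entry_url = "https://example.com/articles/1"  # made-up source, for illustration only

    print(json.dumps({
        "title": "Example article",
        "body": '<a href="%s">Example article</a>' % entry_url,
        "host": "example.com",  # the scrapers in this commit set this to the source site
        "url": entry_url,
        "id": entry_url,        # the scrapers reuse the URL as a stable id
    }))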

14
doc/ua

@ -1,7 +1,7 @@
.\" generated with Ronn/v0.7.3
.\" http://github.com/rtomayko/ronn/tree/0.7.3
.
.TH "UA" "" "April 2016" "" ""
.TH "UA" "" "July 2017" "" ""
This is a set of tools to aggregate all your information into your maildir\. Each tool can be used separately ; you can find a more complete description in their respective folder\.
.
.IP "\(bu" 4
@ -14,10 +14,10 @@ This is a set of tools to aggregate all your information into your maildir\. Eac
\fBrss2json\fR transforms any RSS/Atom feed into a set of messages that \fBmaildir\-put\fR can process
.
.IP "\(bu" 4
You can write your own producers for maildir\-put ; an example for the mangareader \fIhttp://mangareader\.net\fR service is provided\.
You can write your own producers (scrapers) for maildir\-put ; some are already provided in the \fBscrapers/\fR directory\.
.
.IP "\(bu" 4
You can also put filters, like \fBua\-inline\fR
You can also put filters, like \fBua\-inline\fR or \fBua\-proxify\fR\.
.
.IP "" 0
.
@ -41,7 +41,7 @@ libxml
jq \fIhttps://stedolan\.github\.io/jq/\fR
.
.IP "\(bu" 4
For additional scrappers: python 3, aiohttp and pyquery
For additional scrapers: Python 3, Scrapy and Node\.js
.
.IP "" 0
.
@ -67,7 +67,7 @@ rss() {
}
mangareader() {
command 2000 "mangareader2json http://mangareader\.net/$1 | "\e
command 2000 "ua\-scraper\-mangareader \-a name=$1 | "\e
"maildir\-put \-root $HOME/Maildir\-feeds \-folder Entertainment"
}
@ -87,4 +87,6 @@ rss https://github\.com/sloonz\.private\.atom?token=HIDDEN ""
.fi
.
.IP "" 0
.
.SH "Weboob compatibility"
You can use weboob \fIhttp://weboob\.org/\fR modules used by boobmsg \fIhttp://weboob\.org/applications/boobmsg\fR to generate messages\. Configure the modules using \fBboobmsg\fR, and use \fBweboobmsg2json [module\-name]\fR to generate messages\. \fB[module\-name]\fR can be found in \fB~/\.config/weboob/backends\fR\.

34
doc/ua-scrapers Normal file

@ -0,0 +1,34 @@
.\" generated with Ronn/v0.7.3
.\" http://github.com/rtomayko/ronn/tree/0.7.3
.
.TH "UA\-SCRAPERS" "" "July 2017" "" ""
.SH "ua\-scraper\-edxcourses"
List all courses on EdX \fIhttps://www\.edx\.org/\fR\.
.
.SH "ua\-scraper\-lyon\-bm\-bd"
List new comics on Lyon public library \fIhttps://www\.bm\-lyon\.fr/\fR\.
.
.SH "ua\-scraper\-mal"
List seasonal anime from myanimelist \fIhttps://myanimelist\.net/anime/season\fR\.
.
.SH "ua\-scraper\-mangareader"
List latest chapters for a given manga on mangareader \fIhttp://www\.mangareader\.net/\fR\.
.
.P
Usage: \fBua\-scraper\-mangareader \-a name=[manga\-title]\fR\. \fB[manga\-title]\fR is the path of the manga on mangareader, for example \fBnatsume\-yuujinchou\fR for http://www\.mangareader\.net/natsume\-yuujinchou\.
.
.SH "ua\-scraper\-torrent9"
List latest torrents on torrent9 \fIhttp://www\.torrent9\.cc/\fR\.
.
.P
Usage:
.
.IP "\(bu" 4
All categories: \fBua\-scraper\-torrent9\fR
.
.IP "\(bu" 4
Specific categories: \fBua\-scraper\-torrent9 "category1 category2\.\.\."\fR
.
.IP "" 0
.
.P
Categories reference the anchor in the URL (for example \fBebook\fR for http://www\.torrent9\.cc/#ebook)\.
.
.SH "ua\-scraper\-yggtorrent"
List latest torrents on yggtorrent \fIhttps://yggtorrent\.com/\fR\.
.
.P
Usage:
.
.IP "\(bu" 4
All categories: \fBua\-scraper\-yggtorrent\fR
.
.IP "\(bu" 4
Specific category: \fBua\-scraper\-yggtorrent [url]\fR\.
.
.IP "" 0

125
doc/ua-scrapers.html Normal file

@ -0,0 +1,125 @@
<!DOCTYPE html>
<html>
<head>
<meta http-equiv='content-type' value='text/html;charset=utf8'>
<meta name='generator' value='Ronn/v0.7.3 (http://github.com/rtomayko/ronn/tree/0.7.3)'>
<title>&lt;p&gt;This contains additional scrapers. You can take those as examples to
write your own.&lt;/p&gt;
ua-scraper-edxcourses</title>
<style type='text/css' media='all'>
/* style: man */
body#manpage {margin:0}
.mp {max-width:100ex;padding:0 9ex 1ex 4ex}
.mp p,.mp pre,.mp ul,.mp ol,.mp dl {margin:0 0 20px 0}
.mp h2 {margin:10px 0 0 0}
.mp > p,.mp > pre,.mp > ul,.mp > ol,.mp > dl {margin-left:8ex}
.mp h3 {margin:0 0 0 4ex}
.mp dt {margin:0;clear:left}
.mp dt.flush {float:left;width:8ex}
.mp dd {margin:0 0 0 9ex}
.mp h1,.mp h2,.mp h3,.mp h4 {clear:left}
.mp pre {margin-bottom:20px}
.mp pre+h2,.mp pre+h3 {margin-top:22px}
.mp h2+pre,.mp h3+pre {margin-top:5px}
.mp img {display:block;margin:auto}
.mp h1.man-title {display:none}
.mp,.mp code,.mp pre,.mp tt,.mp kbd,.mp samp,.mp h3,.mp h4 {font-family:monospace;font-size:14px;line-height:1.42857142857143}
.mp h2 {font-size:16px;line-height:1.25}
.mp h1 {font-size:20px;line-height:2}
.mp {text-align:justify;background:#fff}
.mp,.mp code,.mp pre,.mp pre code,.mp tt,.mp kbd,.mp samp {color:#131211}
.mp h1,.mp h2,.mp h3,.mp h4 {color:#030201}
.mp u {text-decoration:underline}
.mp code,.mp strong,.mp b {font-weight:bold;color:#131211}
.mp em,.mp var {font-style:italic;color:#232221;text-decoration:none}
.mp a,.mp a:link,.mp a:hover,.mp a code,.mp a pre,.mp a tt,.mp a kbd,.mp a samp {color:#0000ff}
.mp b.man-ref {font-weight:normal;color:#434241}
.mp pre {padding:0 4ex}
.mp pre code {font-weight:normal;color:#434241}
.mp h2+pre,h3+pre {padding-left:0}
ol.man-decor,ol.man-decor li {margin:3px 0 10px 0;padding:0;float:left;width:33%;list-style-type:none;text-transform:uppercase;color:#999;letter-spacing:1px}
ol.man-decor {width:100%}
ol.man-decor li.tl {text-align:left}
ol.man-decor li.tc {text-align:center;letter-spacing:4px}
ol.man-decor li.tr {text-align:right;float:right}
</style>
</head>
<!--
The following styles are deprecated and will be removed at some point:
div#man, div#man ol.man, div#man ol.head, div#man ol.man.
The .man-page, .man-decor, .man-head, .man-foot, .man-title, and
.man-navigation should be used instead.
-->
<body id='manpage'>
<div class='mp' id='man'>
<div class='man-navigation' style='display:none'>
</div>
<ol class='man-decor man-head man head'>
<li class='tl'>ua-scrapers</li>
<li class='tc'></li>
<li class='tr'>ua-scrapers</li>
</ol>
<h1><p>This contains additional scrapers. You can take those as examples to
write your own.</p>
ua-scraper-edxcourses</h1>
<p>List all courses on <a href="https://www.edx.org/">EdX</a>.</p>
<h1>ua-scraper-lyon-bm-bd</h1>
<p>List new comics on <a href="https://www.bm-lyon.fr/">Lyon public library</a>.</p>
<h1>ua-scraper-mal</h1>
<p>List seasonal anime from <a href="https://myanimelist.net/anime/season">myanimelist</a>.</p>
<h1>ua-scraper-mangareader</h1>
<p>List latest chapters for a given manga on <a href="http://www.mangareader.net/">mangareader</a>.</p>
<p>Usage: <code>ua-scraper-mangareader -a name=[manga-title]</code>. <code>[manga-title]</code>
is the path of the manga on mangareader, for example <code>natsume-yuujinchou</code>
for http://www.mangareader.net/natsume-yuujinchou.</p>
<h1>ua-scraper-torrent9</h1>
<p>List latest torrents on <a href="http://www.torrent9.cc/">torrent9</a>.</p>
<p>Usage:</p>
<ul>
<li>All categories: <code>ua-scraper-torrent9</code></li>
<li>Specific categories: <code>ua-scraper-torrent9 "category1 category2..."</code></li>
</ul>
<p>Categories reference the anchor in the URL (for example <code>ebook</code> for
http://www.torrent9.cc/#ebook).</p>
<h1>ua-scraper-yggtorrent</h1>
<p>List latest torrents on <a href="https://yggtorrent.com/">yggtorrent</a>.</p>
<p>Usage:</p>
<ul>
<li>All categories: <code>ua-scraper-yggtorrent</code></li>
<li>Specific category: <code>ua-scraper-yggtorrent [url]</code>.</li>
</ul>
<ol class='man-decor man-foot man foot'>
<li class='tl'></li>
<li class='tc'>July 2017</li>
<li class='tr'>ua-scrapers</li>
</ol>
</div>
</body>
</html>

1
doc/ua-scrapers.md Symbolic link

@ -0,0 +1 @@
../scrapers/README.md

doc/ua.html

@ -57,6 +57,7 @@
<a href="#Dependencies">Dependencies</a>
<a href="#Installation">Installation</a>
<a href="#Configuration">Configuration</a>
<a href="#Weboob-compatibility">Weboob compatibility</a>
</div>
<ol class='man-decor man-head man head'>
@ -76,9 +77,9 @@ description in their respective folder.</p>
them in a maildir</li>
<li><code>rss2json</code> transforms any RSS/Atom feed into a set of messages that
<code>maildir-put</code> can process</li>
<li>You can write your own producers for maildir-put ; an example for the
<a href="http://mangareader.net">mangareader</a> service is provided.</li>
<li>You can also put filters, like <code>ua-inline</code></li>
<li>You can write your own producers (scrapers) for maildir-put ; some are
already provided in the <code>scrapers/</code> directory.</li>
<li>You can also put filters, like <code>ua-inline</code> or <code>ua-proxify</code>.</li>
</ul>
@ -93,7 +94,7 @@ them in a maildir</li>
<li>Go</li>
<li>libxml</li>
<li><a href="https://stedolan.github.io/jq/">jq</a></li>
<li>For additional scrappers: python 3, aiohttp and pyquery</li>
<li>For additional scrapers: Python 3, Scrapy and Node.js</li>
</ul>
@ -116,7 +117,7 @@ rss() {
}
mangareader() {
command 2000 "mangareader2json http://mangareader.net/$1 | "\
command 2000 "ua-scraper-mangareader -a name=$1 | "\
"maildir-put -root $HOME/Maildir-feeds -folder Entertainment"
}
@ -134,10 +135,18 @@ mangareader gantz
rss https://github.com/sloonz.private.atom?token=HIDDEN ""
</code></pre>
<h2 id="Weboob-compatibility">Weboob compatibility</h2>
<p>You can use <a href="http://weboob.org/">weboob</a> modules used by
<a href="http://weboob.org/applications/boobmsg">boobmsg</a> to generate
messages. Configure the modules using <code>boobmsg</code>, and use <code>weboobmsg2json
[module-name]</code> to generate messages. <code>[module-name]</code> can be found in
<code>~/.config/weboob/backends</code>.</p>
<ol class='man-decor man-foot man foot'>
<li class='tl'></li>
<li class='tc'>April 2016</li>
<li class='tc'>July 2017</li>
<li class='tr'>ua</li>
</ol>

43
scrapers/README.md Normal file

@ -0,0 +1,43 @@
This contains additional scrapers. You can take those as examples to
write your own.
# ua-scraper-edxcourses
List all courses on [EdX](https://www.edx.org/).
# ua-scraper-lyon-bm-bd
List new comics on [Lyon public library](https://www.bm-lyon.fr/).
# ua-scraper-mal
List seasonal anime from [myanimelist](https://myanimelist.net/anime/season).
# ua-scraper-mangareader
List latest chapters for a given manga on [mangareader](http://www.mangareader.net/).
Usage: `ua-scraper-mangareader -a name=[manga-title]`. `[manga-title]`
is the path of the manga on mangareader, for example `natsume-yuujinchou`
for http://www.mangareader.net/natsume-yuujinchou.
# ua-scraper-torrent9
List latest torrents on [torrent9](http://www.torrent9.cc/).
Usage:
* All categories: `ua-scraper-torrent9`
* Specific categories: `ua-scraper-torrent9 "category1 category2..."`
Categories reference the anchor in the URL (for example `ebook` for
http://www.torrent9.cc/#ebook).
# ua-scraper-yggtorrent
List latest torrents on [yggtorrent](https://yggtorrent.com/).
Usage:
* All categories: `ua-scraper-yggtorrent`
* Specific category: `ua-scraper-yggtorrent [url]`.
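For a new scraper in this directory, a minimal sketch following the structure of the files above (the target site and selectors are made up; only the overall shape matches the existing ua-scraper-* spiders):

    #!/usr/bin/python3
    import json
    import scrapy

    class Example(scrapy.Spider):
        # Hypothetical spider: replace the URL and selectors with a real site.
        name = "example"
        start_urls = ["https://example.com/news"]

        def parse(self, response):
            for item in response.css("article h2 a"):
                url = response.urljoin(item.css("::attr(href)").extract_first())
                title = " ".join(item.css("::text").extract()).strip()
                # One JSON message per entry, same fields as the other scrapers.
                print(json.dumps({
                    "title": title,
                    "body": '<a href="%s">%s</a>' % (url, title),
                    "host": "example.com",
                    "url": url,
                    "id": url,
                }))

    # To make the file directly executable like the other ua-scraper-* scripts,
    # append the same `if __name__ == "__main__":` runner block they use and
    # mark it executable.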

13
scrapers/package.json Normal file

@ -0,0 +1,13 @@
{
  "scripts": {
    "webpack": "webpack"
  },
  "devDependencies": {
    "cheerio": "^0.22.0",
    "cloudscraper": "^1.4.1",
    "babel-core": "^6.25.0",
    "babel-loader": "^7.1.1",
    "babel-preset-es2015": "^6.24.1",
    "webpack": "^3.4.0"
  }
}

scrapers/torrent9.js

@ -1,5 +1,3 @@
#!/usr/bin/node
const cloudscraper = require('cloudscraper');
const cheerio = require('cheerio');
const process = require('process');

scrapers/ua-scraper-edxcourses

@ -1,4 +1,4 @@
# Usage: scrapy edxcourses.py | ... | maildir-putt
#!/usr/bin/python3
# -*- encoding: utf-8 -*-
import json
@ -48,3 +48,21 @@ class EdxCourses(scrapy.Spider):
            })
            yield scrapy.Request(course["url"], callback=course_item.parse_description)

if __name__ == "__main__":
    from scrapy.commands.runspider import Command
    from scrapy.crawler import CrawlerProcess
    from scrapy.settings import Settings
    import optparse

    parser = optparse.OptionParser()
    cmd = scrapy.commands.runspider.Command()
    cmd.settings = Settings(cmd.default_settings)
    cmd.add_options(parser)
    opts, args = parser.parse_args()
    cmd.process_options(args, opts)

    crawler = CrawlerProcess(cmd.settings)
    crawler.crawl(EdxCourses, **opts.spargs)
    crawler.start()

scrapers/ua-scraper-lyon-bm-bd

@ -1,3 +1,5 @@
#!/usr/bin/python3
import scrapy
import json
import urllib.parse
@ -43,3 +45,20 @@ class BMLyonBD(scrapy.Spider):
            yield scrapy.Request(urllib.parse.urljoin(response.url, item), callback=self.parse_page, dont_filter=True)
            break

if __name__ == "__main__":
    from scrapy.commands.runspider import Command
    from scrapy.crawler import CrawlerProcess
    from scrapy.settings import Settings
    import optparse

    parser = optparse.OptionParser()
    cmd = scrapy.commands.runspider.Command()
    cmd.settings = Settings(cmd.default_settings)
    cmd.add_options(parser)
    opts, args = parser.parse_args()
    cmd.process_options(args, opts)

    crawler = CrawlerProcess(cmd.settings)
    crawler.crawl(BMLyonBD, **opts.spargs)
    crawler.start()

20
scrappers/mal.py → scrapers/ua-scraper-mal Normal file → Executable file

@ -1,3 +1,5 @@
#!/usr/bin/python3
import scrapy
import json
import re
@ -25,3 +27,21 @@ class Animes(scrapy.Spider):
                'body': '%s %s %s %s' % (title.extract(), img_tag, desc.extract(), genres.extract()),
                'host': 'myanimelist.net'
            })))

if __name__ == "__main__":
    from scrapy.commands.runspider import Command
    from scrapy.crawler import CrawlerProcess
    from scrapy.settings import Settings
    import optparse

    parser = optparse.OptionParser()
    cmd = scrapy.commands.runspider.Command()
    cmd.settings = Settings(cmd.default_settings)
    cmd.add_options(parser)
    opts, args = parser.parse_args()
    cmd.process_options(args, opts)

    crawler = CrawlerProcess(cmd.settings)
    crawler.crawl(Animes, **opts.spargs)
    crawler.start()

43
scrapers/ua-scraper-mangareader Executable file

@ -0,0 +1,43 @@
#!/usr/bin/python3
import scrapy
import json
import re
import urllib.parse
class Mangareader(scrapy.Spider):
    name = "mangareader"

    def __init__(self, *args, **kwargs):
        super(Mangareader, self).__init__(*args, **kwargs)
        self.start_urls = [urllib.parse.urljoin("http://www.mangareader.net", kwargs['name'])]

    def parse(self, response):
        for chapter in response.css('#latestchapters > ul > li'):
            url = urllib.parse.urljoin(self.start_urls[0], chapter.css('::attr(href)').extract()[0])
            title = u' '.join(chapter.css('::text').extract()).strip()
            print(json.dumps({
                "title": title,
                "body": '<a href="%s">%s</a>' % (url, title),
                "host": "mangareader.net",
                "url": url,
                "id": url
            }))

if __name__ == "__main__":
    from scrapy.commands.runspider import Command
    from scrapy.crawler import CrawlerProcess
    from scrapy.settings import Settings
    import optparse

    parser = optparse.OptionParser()
    cmd = scrapy.commands.runspider.Command()
    cmd.settings = Settings(cmd.default_settings)
    cmd.add_options(parser)
    opts, args = parser.parse_args()
    cmd.process_options(args, opts)

    crawler = CrawlerProcess(cmd.settings)
    crawler.crawl(Mangareader, **opts.spargs)
    crawler.start()

scrapers/ua-scraper-yggtorrent

@ -1,3 +1,5 @@
#!/usr/bin/python3
import scrapy
import json
import urllib.parse
@ -28,3 +30,21 @@ class YggTorrent(scrapy.Spider):
        for item in response.css('a.torrent-name'):
            url = urllib.parse.urljoin(response.url, item.css('::attr("href")').extract()[0])
            yield scrapy.Request(url, self.parse_item)

if __name__ == "__main__":
    from scrapy.commands.runspider import Command
    from scrapy.crawler import CrawlerProcess
    from scrapy.settings import Settings
    import optparse

    parser = optparse.OptionParser()
    cmd = scrapy.commands.runspider.Command()
    cmd.settings = Settings(cmd.default_settings)
    cmd.add_options(parser)
    opts, args = parser.parse_args()
    cmd.process_options(args, opts)

    crawler = CrawlerProcess(cmd.settings)
    crawler.crawl(YggTorrent, **opts.spargs)
    crawler.start()

scrapers/webpack.config.js

@ -0,0 +1,14 @@
const webpack = require('webpack');

module.exports = {
  target: 'node',
  module: {
    loaders: [
      { test: /\.js$/, exclude: /node_modules/, loader: 'babel-loader', query: { presets: ['es2015'] } }
    ]
  },
  plugins: [
    new webpack.optimize.UglifyJsPlugin({ test: /^/ }),
    new webpack.BannerPlugin({ banner: '#!/usr/bin/env node', raw: true })
  ]
}

scrappers/mangareader2json

@ -1,21 +0,0 @@
#!/usr/bin/python3

import scraplib
import sys
import json

def scrap_mangareader(url):
    url = scraplib.urljoin("http://www.mangareader.net", url)
    doc = yield from scraplib.fetch(url)
    for chapter in doc("#latestchapters > ul > li").pq():
        print(json.dumps({
            "title": chapter.text(),
            "body": '<a href="%s">%s</a>' % (chapter("a").url(), chapter.text()),
            "id": "mangareader:"+chapter("a").url()
        }))

if len(sys.argv) < 2 or "-h" in sys.argv or "--help" in sys.argv:
    print("Usage: %s manga-index-url" % sys.argv[0])
    sys.exit(1)

scraplib.main(scrap_mangareader(sys.argv[1]))

scrappers/package.json

@ -1,6 +0,0 @@
{
  "devDependencies": {
    "cheerio": "^0.22.0",
    "cloudscraper": "^1.4.1"
  }
}

scrappers/scraplib.py

@ -1,154 +0,0 @@
import asyncio
import base64
import aiohttp
import pyquery
import urllib.parse
import http.cookiejar
import os
import sys
import re
import hashlib

urljoin = urllib.parse.urljoin
cookie_jar = None

def wait(coroutines):
    coroutines = list(coroutines)
    if not coroutines:
        f = asyncio.futures.Future()
        f.set_result([])
        return (yield from f)
    else:
        return (yield from asyncio.wait(coroutines))[0]

def debug(msg):
    print(msg, file = sys.stderr)

def open_cookies(cookie_file = None):
    global cookie_jar
    if cookie_file is None:
        cookie_jar = http.cookiejar.CookieJar()
    else:
        cookie_jar = http.cookiejar.MozillaCookieJar(os.path.expanduser(cookie_file))
    load_cookies()

def load_cookies():
    if not isinstance(cookie_jar, http.cookiejar.FileCookieJar):
        return
    if os.path.exists(cookie_jar.filename) and os.stat(cookie_jar.filename).st_size > 0:
        cookie_jar.load()

def save_cookies():
    if not isinstance(cookie_jar, http.cookiejar.FileCookieJar):
        return
    if not os.path.exists(cookie_jar.filename):
        parent_dir = os.path.dirname(cookie_jar.filename)
        if not os.path.exists(parent_dir):
            os.makedirs(parent_dir)
        open(cookie_jar.filename, "w+").close()
    cookie_jar.save()

def url_quote_unicode(url):
    res = b""
    for c in url.encode('utf-8', 'surrogateescape'):
        if c > 127:
            res += ("%%%02x" % c).encode('utf-8').upper()
        else:
            res += bytes([c])
    return res.decode('utf-8')

class PyQueryWrapper(pyquery.PyQuery):
    def __init__(self, *args, **kwargs):
        self._response = kwargs.pop('response', None)
        base_url = kwargs.pop("base_url", None)
        pyquery.PyQuery.__init__(self, *args, **kwargs)
        if base_url and not self._base_url:
            self._base_url = base_url

    def url(self, attr = None, base_url = None):
        """
        Get the absolute url of this element (for a href, img src...)
        """
        if base_url is None:
            base_url = self.base_url
        if attr is None:
            for attr in ("href", "src", "action"):
                if self.attr(attr):
                    return url_quote_unicode(urllib.parse.urljoin(base_url, self.attr(attr)))
        return url_quote_unicode(urllib.parse.urljoin(base_url, self.attr(attr)))

    def pq(self):
        """
        Equivalent to map(PyQueryWrapper, iter(self))

        Returns PyQuery-like objects for all elements contained in this
        object
        """
        for el in self:
            yield self.__class__(el, parent = self)

    def fetch(self, *args, **kwargs):
        """
        Simple wrapper around fetch() that sets the referer according
        to the request used to retrieve current document
        """
        if "headers" not in kwargs:
            kwargs["headers"] = {}
        kwargs["headers"]["Referer"] = self.base_url
        return fetch(*args, **kwargs)

    @property
    def response(self):
        """
        HTTP response object for the request used to retrieve this page
        """
        return self._response or (self.parent and self.parent.response or None)

def _simple_cookie_to_cookie(name, sc, host):
    expires = sc["expires"] and http.cookiejar.http2time(sc["expires"]) or 0
    return http.cookiejar.Cookie(
        None, name, sc.value, None, False,
        sc["domain"].lstrip(".") or host, bool(sc["domain"]), sc["domain"].startswith("."),
        sc["path"] or "/", bool(sc["path"]),
        bool(sc["secure"]), expires, False, None, None, {})

def fetch(url, method = "GET", **kwargs):
    # TODO: don't assume that body is UTF-8
    raw = kwargs.pop('raw_response', False)

    if cookie_jar:
        if not "cookies" in kwargs:
            kwargs["cookies"] = {}
        for c in cookie_jar:
            kwargs["cookies"][c.name] = c.value

    # Handle redirect since aiohttp can be buggy there
    # TODO: report the bug to aiohttp
    allow_redirects = kwargs.pop('allow_redirects', True)
    max_redirects = kwargs.pop('max_redirects', 10)
    kwargs["allow_redirects"] = False
    resp = yield from aiohttp.request(method, url, **kwargs)
    while resp.status // 100 == 3 and allow_redirects and max_redirects > 0:
        r_url = url_quote_unicode(resp.headers.get('LOCATION') or resp.headers.get('URI'))
        url = urljoin(url, r_url)
        resp.close()
        resp = yield from aiohttp.request(method, re.sub(r"#.*", "", url_quote_unicode(url)), **kwargs)
        max_redirects -= 1
    if max_redirects == 0:
        raise Exception("max redirections happened")

    if cookie_jar is not None:
        # TODO: lock this file
        load_cookies()
        for name, cookie in resp.cookies.items():
            cookie_jar.set_cookie(_simple_cookie_to_cookie(name, cookie, resp.host))
        save_cookies()

    if raw:
        return resp
    else:
        body = yield from resp.read()
        return PyQueryWrapper(body.decode("utf-8"), response = resp, base_url = url)

def main(entrypoint):
    asyncio.get_event_loop().run_until_complete(entrypoint)