#!/usr/bin/env python
import re, feedwriter, os, sys, urlparse, time, traceback
import getopt
# Prefix written in front of every message emitted by warn(); changed with
# setwarn().
warntag = "none"
# Minimum level a message needs for warn() to print it; changed with
# setlevel().
warnlevel = 2
# When true, URLFeed.fetch() skips the download and just reads the source
# file already on disk; changed with setnofetch().
nofetch = False
def setwarn(n):
    """Set the tag that warn() writes in front of each message."""
    global warntag
    warntag = n
def setnofetch(n):
    """Set the module-wide "do not download" flag to n."""
    global nofetch
    nofetch = n
def setlevel(n):
    """Set the minimum level a message needs for warn() to print it."""
    global warnlevel
    warnlevel = n
def warn(level, *args):
    """Write a message to stderr if level reaches the current threshold.

    The line written is the current warning tag, ": ", and the remaining
    arguments converted with str() and joined by single spaces.  Messages
    whose level is below warnlevel are dropped silently.
    """
    if not level >= warnlevel:
        return
    message = " ".join([str(arg) for arg in args])
    sys.stderr.write(warntag + ": " + message + "\n")
# This is the old link-fixing code from rawdog; it doesn't behave correctly
# under all conditions, but it's a lot simpler and less fragile than the
# feedparser implementation.
# One pattern per quoting style of an href/src attribute value: double
# quoted, single quoted, and unquoted.  Group 1 is everything from the "<"
# up to and including the "=", group 2 is the bare URI.
link_dq_re = re.compile(r'(<[^>]*(?:href|src)=)"([^"]*)"', re.I)
link_sq_re = re.compile(r'(<[^>]*(?:href|src)=)\'([^\']*)\'', re.I)
link_nq_re = re.compile(r'(<[^>]*(?:href|src)=)([^"\'][^\s>]*)', re.I)


def make_links_absolute(base, html):
    """Return html with href/src URIs resolved against the base URI.

    Every rewritten attribute value comes out double-quoted, whatever
    quoting it had before.
    """
    def absolutise(match):
        prefix = match.group(1)
        target = match.group(2)
        return prefix + '"' + urlparse.urljoin(base, target) + '"'

    # Order matters: the quoted forms are handled first, and the values they
    # produce start with a quote, which the unquoted pattern refuses to match.
    for pattern in (link_dq_re, link_sq_re, link_nq_re):
        html = pattern.sub(absolutise, html)
    return html
tag_re = re.compile(r'<([^>]*)>')
# DOTALL so that the attribute text may contain newlines; without it a tag
# whose attributes are spread over several lines fails to match at all.
tag_name_re = re.compile(r'([^\s]+)(\s+(.*))?$', re.S)
tag_attr_re = re.compile(r'([^\s=]+)=("[^"]*"|\'[^\']*\'|[^\s]*)\s*')
# One whitespace-delimited token that is not a name=value pair (a valueless
# attribute such as "ismap", or a stray "/"), plus the whitespace after it.
_tag_junk_re = re.compile(r'\S+\s*')


def extract_tags(input):
    """Extract all the tags from some probably-invalid HTML.

    Yields one (name, attributes) pair per "<...>" found in input, where name
    is the lower-cased tag name (closing tags keep their leading "/") and
    attributes is a dict mapping lower-cased attribute names to their values
    with surrounding quotes removed.  Empty tags and comments are skipped.
    Tokens that are not of the form name=value are ignored rather than
    stopping the scan, so the attributes after them are still returned.
    """
    for tag in tag_re.findall(input):
        tag = tag.strip()
        if tag == "" or tag.startswith("!--"):
            continue
        m = tag_name_re.match(tag)
        if m is None:
            continue
        (name, attrs) = m.group(1, 3)
        adict = {}
        pos = 0
        while attrs is not None and pos < len(attrs):
            m = tag_attr_re.match(attrs, pos)
            if m is None:
                # Not a name=value pair: step over this token and carry on.
                junk = _tag_junk_re.match(attrs, pos)
                if junk is None:
                    break
                pos = junk.end()
                continue
            pos = m.end()
            (aname, aval) = m.group(1, 2)
            if aval.startswith("'") or aval.startswith('"'):
                quote = aval[0]
                aval = aval[1:]
                # Only drop the last character if it really is the closing
                # quote; an unterminated value keeps all of its text.
                if aval.endswith(quote):
                    aval = aval[:-1]
            adict[aname.lower()] = aval
        yield (name.lower(), adict)
class NotHTMLParser:
    """A minimal stand-in for HTMLParser that copes with broken HTML.

    Subclasses supply handle_starttag(tag, attrs); this class does not
    define it.
    """

    def __init__(self):
        pass

    def feed(self, data):
        """Call handle_starttag for every non-closing tag found in data."""
        for name, attributes in extract_tags(data):
            # Closing tags come back with their leading "/"; ignore them.
            if name[0] == "/":
                continue
            self.handle_starttag(name, attributes)
class TagFinder(NotHTMLParser):
    """Parser that collects every start tag whose name is in lookfor.

    After feed(), self.found holds (tag, attributes) pairs in the order the
    tags were encountered.
    """

    def __init__(self, lookfor):
        NotHTMLParser.__init__(self)
        self.found = []
        self.lookfor = lookfor

    def handle_starttag(self, tag, attrs):
        """Record tag if it is wanted, logging it either way."""
        warn(0, "starttag", tag, attrs)
        if tag in self.lookfor:
            warn(1, "matched", tag, attrs)
            # Store a copy so later changes to attrs don't affect the record.
            self.found.append((tag, dict(attrs)))
def cmd(argv):
    """Run the program argv[0] with argument list argv and wait for it.

    The program is looked up on PATH.  Returns whatever os.spawnvp reports
    in P_WAIT mode.
    """
    program = argv[0]
    return os.spawnvp(os.P_WAIT, program, argv)
def mtime(fn):
    """Return the modification time of fn, or 0 if it cannot be stat'ed."""
    try:
        stamp = os.stat(fn).st_mtime
    except OSError:
        stamp = 0
    return stamp
class FeedConfig:
    """A named set of multi-valued settings with an optional fallback.

    Each key maps to the list of values given for it, in the order they were
    set.  A key with no values here is looked up in ``super`` (another
    FeedConfig, or None for no fallback) instead.
    """

    def __init__(self, name, super):
        self.values = {}
        self.name = name
        self.super = super

    def __getitem__(self, key):
        """Return the first value for key, or None if it has no values."""
        ls = self.getall(key)
        if not ls:
            return None
        return ls[0]

    def getall(self, key):
        """Return the list of values for key, falling back to ``super``.

        Values set locally hide the fallback's values entirely rather than
        being merged with them.  Returns an empty list if the key is unknown
        everywhere.
        """
        # "in" rather than dict.has_key(), which is deprecated and no longer
        # exists in Python 3.
        if key in self.values:
            return self.values[key]
        if self.super is not None:
            return self.super.getall(key)
        return []

    def set(self, key, value):
        """Append value to the list of values stored locally for key."""
        self.values.setdefault(key, []).append(value)
class ConfigCollection:
    """The default settings plus one FeedConfig per feed section.

    self.default holds settings given before any section header; self.feeds
    maps each section name to its FeedConfig, which falls back to
    self.default for keys it does not set.
    """

    def __init__(self):
        self.default = FeedConfig("default", None)
        self.feeds = {}

    def load(self, filename):
        """Read settings from filename into this collection.

        Blank lines and lines starting with "#" are ignored.  A line of the
        form "[name]" starts the section for feed ``name``; any other line is
        "key value", split at the first space, and is added to the current
        section (or to the defaults before the first section).

        Raises ValueError, naming the file and line number, for a setting
        line that contains no space.
        """
        f = open(filename, "r")
        # try/finally so the file is closed even if a line is malformed.
        try:
            feed = self.default
            lineno = 0
            for line in f:
                lineno += 1
                line = line.strip()
                if line == "" or line[0] == "#":
                    continue
                if line[0] == "[" and line[-1] == "]":
                    feed = FeedConfig(line[1:-1], self.default)
                    self.feeds[feed.name] = feed
                    continue
                i = line.find(" ")
                if i < 0:
                    raise ValueError("%s:%d: expected 'key value', got %r"
                                     % (filename, lineno, line))
                feed.set(line[:i], line[i + 1:])
        finally:
            f.close()
class Feed:
    """Base class for a feed built from one configuration section.

    Subclasses implement update() to call add_item() for each entry; write()
    then saves the result as "<outdir>/<name>.rss".
    """

    def __init__(self, config):
        """Read the common settings from config and create the directories.

        Reads "source", "storedir", "outdir", "maxitems" and "title" from
        config; "maxitems" must be present and numeric.
        """
        self.url = config["source"]
        self.storedir = config["storedir"]
        self.outdir = config["outdir"]
        self.maxitems = int(config["maxitems"])
        self.name = config.name
        self.items = []
        cmd(["mkdir", "-p", self.dir(), self.odir()])
        # NOTE(review): the arguments look like (title, link, description);
        # confirm against feedwriter.Channel.
        self.chan = feedwriter.Channel(config["title"], self.url, config["title"] + " (rsscomics)")

    def dir(self):
        """Return this feed's directory under storedir."""
        return self.storedir + "/" + self.name

    def odir(self):
        """Return this feed's directory under outdir."""
        return self.outdir + "/" + self.name

    def update(self):
        """Fetch the source and add items; subclasses must override this."""
        # A real exception class: raising a plain string is itself a
        # TypeError on Python 2.6 and later.
        raise NotImplementedError("must implement")

    def add_item(self, description, title = "Item", link = None):
        """Add an item to the channel unless maxitems have been added already."""
        # maxitems counts down the remaining budget; once it goes negative
        # every further item is dropped.
        self.maxitems -= 1
        if self.maxitems < 0:
            return
        self.chan.add_item(title = title, link = link, description = description)

    def write(self):
        """Write the channel as RSS to "<outdir>/<name>.rss"."""
        # Render first, so a failure in rss2() cannot leave the existing
        # output file truncated.
        rss = self.chan.rss2()
        f = open(self.outdir + "/" + self.name + ".rss", "w")
        try:
            f.write(rss)
        finally:
            f.close()
class URLFeed(Feed):
    """A feed whose source is a URL downloaded with curl."""

    def __init__(self, config):
        """Read the "useragent" and "imageext" settings on top of Feed's."""
        Feed.__init__(self, config)
        self.ua = config["useragent"]
        self.imageext = config["imageext"]
        self.now = 0

    def fetch(self):
        """Download the source page and load it into self.data.

        The page is stored as "sourcefile" in the feed's storage directory.
        When the module-level nofetch flag is set the download is skipped and
        the stored copy is read instead.  Returns 1 on success and 0 if the
        download failed.
        """
        self.now = time.time()
        fn = self.dir() + "/sourcefile"
        if (not nofetch) and (not self.get_url(fn)):
            return 0
        f = open(fn)
        # try/finally so the handle is released even if the read fails.
        try:
            self.data = f.read()
        finally:
            f.close()
        return 1

    def get_url(self, fn, relurl = None):
        """Download a URL into the file fn using curl.

        With relurl left as None the feed's own URL is fetched, conditionally
        on it being newer than the existing fn ("-z").  Otherwise relurl is
        resolved against the feed URL and fetched with the feed URL as the
        referrer ("-e").  Returns 1 on success, 0 (after a warning) if curl
        exits with a non-zero status.
        """
        opts = ["-s", "-R", "-o", fn]
        # "useragent" is optional in the configuration; passing None through
        # to curl would fail in the process spawn.
        if self.ua is not None:
            opts += ["-A", self.ua]
        if relurl is None:
            url = self.url
            opts += ["-z", fn]
        else:
            url = urlparse.urljoin(self.url, relurl)
            opts += ["-e", self.url]
        rc = cmd(["curl"] + opts + [url])
        if rc != 0:
            warn(2, "Can't fetch", url)
            return 0
        else:
            return 1

    def oname(self, url):
        """Return a local file name for url.

        The name is the URL without its query string, with every run of
        characters other than letters, digits and "-" replaced by "_",
        followed by "-", the integer time of the last fetch, and an
        extension.  The extension is the configured "imageext" if there is
        one, else the URL's own extension, else ".gif".
        """
        url = re.sub(r'\?.*', '', url)
        m = re.match(r'.*(\.[^\./]*)$', url)
        if self.imageext is not None:
            ext = self.imageext
        elif m is not None:
            ext = m.group(1)
        else:
            ext = ".gif"
        return re.sub(r'[^A-Za-z0-9-]+', '_', url) + "-" + str(int(self.now)) + ext
def fill(template, matches):
    """Substitute "$1", "$2", ... in template with the matched groups.

    matches is one result from a regular expression's findall(): a tuple of
    group strings, or a single string when the pattern has at most one
    group.  "$N" is replaced by the Nth group, counting from 1.  Returns None
    if template is None.  Raises IndexError if the template refers to a
    group that does not exist.
    """
    if template is None:
        return None
    # findall() hands back bare strings for patterns with fewer than two
    # groups; without wrapping, "$1" would index the first *character*.
    if not isinstance(matches, (tuple, list)):
        matches = (matches,)
    return re.sub(r'\$([0-9]+)', lambda m: matches[int(m.group(1)) - 1], template)
class RegexpFeed(URLFeed):
    """A feed with one item per match of a regular expression in the page.

    The "itemtitle", "itemlink" and "itemdescription" settings are templates
    passed to fill() together with each match.
    """

    name = "regexp"

    def __init__(self, config):
        URLFeed.__init__(self, config)
        encoding = config["encoding"]
        if encoding is None:
            # Default used when the configuration names no encoding.
            encoding = "ISO-8859-1"
        self.encoding = encoding
        self.match = re.compile(config["match"])
        self.title = config["itemtitle"]
        self.link = config["itemlink"]
        self.description = config["itemdescription"]

    def update(self):
        """Fetch the page and add an item for every match of the pattern.

        Returns 0 if the page could not be fetched, otherwise 1 (even when
        nothing matched, in which case a warning is logged).
        """
        if not self.fetch():
            return 0
        text = make_links_absolute(self.url, self.data.decode(self.encoding))
        found = self.match.findall(text)
        if found == []:
            warn(2, "No matches")
        if type(found) is not list:
            found = [found]
        for groups in found:
            self.add_item(title = fill(self.title, groups),
                          link = fill(self.link, groups),
                          description = fill(self.description, groups))
        return 1
class ImagesFeed(URLFeed):
name = "images"
def __init__(self, config):
URLFeed.__init__(self, config)
self.wants = {}
for m in config.getall("match"):
(attr, exp) = m.split(" ", 1)
if not self.wants.has_key(attr):
self.wants[attr] = []
self.wants[attr].append(re.compile(exp))
self.cookies = []
for m in config.getall("cookie"):
self.cookies.append(re.compile(m))
def update(self):
if not self.fetch():
return 0
tf = TagFinder(["img", "image"])
# Fix some of the ways that real HTML breaks HTMLParser.
# Lose ', '', self.data)
# And embed elements, because we don't need them and
# megatokyo's are often broken.
self.data = re.sub(r"(?sim)