#!/usr/bin/env python
"""Generate RSS feeds from web pages that do not provide one.

The configuration is read from ~/.rsscomics/config.  For every configured
feed the source page is downloaded with curl, items are extracted either
with a regular expression (class "regexp") or by looking for matching image
tags (class "images"), and the result is written as <outdir>/<name>.rss
through the feedwriter module.

This script targets Python 2 (it relies on the urlparse module and on
byte strings returned by file reads).
"""

import getopt
import os
import re
import sys
import time
import traceback
import urlparse

import feedwriter

# Prefix printed in front of every warning (normally the current feed name).
warntag = "none"
# Minimum level a message needs to be printed by warn().
warnlevel = 2
# When true, reuse the previously downloaded source file instead of fetching.
nofetch = False


def setwarn(n):
    """Set the tag that prefixes every message printed by warn()."""
    global warntag
    warntag = n


def setnofetch(n):
    """Enable or disable skipping the download of feed source pages."""
    global nofetch
    nofetch = n


def setlevel(n):
    """Set the minimum level a message needs in order to be printed."""
    global warnlevel
    warnlevel = n


def warn(level, *args):
    """Write args, space separated, to stderr if level >= warnlevel."""
    if level >= warnlevel:
        sys.stderr.write(warntag + ": " + " ".join(map(str, args)) + "\n")


# This is the old link-fixing code from rawdog; it doesn't behave correctly
# under all conditions, but it's a lot simpler and less fragile than the
# feedparser implementation.
link_dq_re = re.compile(r'(<[^>]*(?:href|src)=)"([^"]*)"', re.I)
link_sq_re = re.compile(r'(<[^>]*(?:href|src)=)\'([^\']*)\'', re.I)
link_nq_re = re.compile(r'(<[^>]*(?:href|src)=)([^"\'][^\s>]*)', re.I)


def make_links_absolute(base, html):
    """Convert relative URIs in HTML href and src attributes to absolute
    form from the given base URI."""
    def fix(match):
        (a, url) = match.group(1, 2)
        # The rewritten attribute is always emitted double-quoted.
        return a + '"' + urlparse.urljoin(base, url) + '"'
    html = link_dq_re.sub(fix, html)
    html = link_sq_re.sub(fix, html)
    html = link_nq_re.sub(fix, html)
    return html


tag_re = re.compile(r'<([^>]*)>')
tag_name_re = re.compile(r'([^\s]+)(\s+(.*))?$')
tag_attr_re = re.compile(r'([^\s=]+)=("[^"]*"|\'[^\']*\'|[^\s]*)\s*')


def extract_tags(input):
    """Extract all the start tags from some probably-invalid HTML.

    Yields (name, attributes) pairs; the name and the attribute names are
    lower-cased and surrounding quotes are removed from attribute values.
    Comments and empty tags are skipped.  End tags are yielded as well,
    with their leading "/" still part of the name.
    """
    for tag in tag_re.findall(input):
        tag = tag.strip()
        if tag == "" or tag.startswith("!--"):
            continue
        m = tag_name_re.match(tag)
        if m is None:
            continue
        (name, attrs) = m.group(1, 3)
        adict = {}
        pos = 0
        while attrs is not None and pos < len(attrs):
            m = tag_attr_re.match(attrs, pos)
            if m is None:
                # Stop at the first thing that does not look like key=value.
                break
            pos = m.end()
            (aname, aval) = m.group(1, 2)
            if aval.startswith("'") or aval.startswith('"'):
                aval = aval[1:-1]
            adict[aname.lower()] = aval
        yield (name.lower(), adict)


class NotHTMLParser:
    """OK, the real HTMLParser sucks at dealing with broken HTML.
    This is a minimal replacement.

    Subclasses must provide handle_starttag(tag, attrs)."""

    def __init__(self):
        pass

    def feed(self, data):
        """Call handle_starttag for every non-end tag found in data."""
        for tag, attrs in extract_tags(data):
            if tag[0] != "/":
                self.handle_starttag(tag, attrs)


class TagFinder(NotHTMLParser):
    """Collect every start tag whose name is listed in lookfor."""

    def __init__(self, lookfor):
        NotHTMLParser.__init__(self)
        self.lookfor = lookfor
        # List of (tag name, attribute dict) in document order.
        self.found = []

    def handle_starttag(self, tag, attrs):
        warn(0, "starttag", tag, attrs)
        if tag not in self.lookfor:
            return
        warn(1, "matched", tag, attrs)
        self.found.append((tag, dict(attrs)))


def cmd(argv):
    """Run argv as a command, wait for it and return its exit status."""
    return os.spawnvp(os.P_WAIT, argv[0], argv)


def mtime(fn):
    """Return the modification time of fn, or 0 if it cannot be stat'ed."""
    try:
        return os.stat(fn).st_mtime
    except OSError:
        return 0


class FeedConfig:
    """Multi-valued key/value settings for one feed.

    Lookups that find nothing locally fall back to the parent
    configuration given as super (which may be None).
    """

    def __init__(self, name, super):
        self.values = {}
        self.name = name
        self.super = super

    def __getitem__(self, key):
        """Return the first value stored for key, or None if there is none."""
        ls = self.getall(key)
        if ls == []:
            return None
        return ls[0]

    def getall(self, key):
        """Return the list of values for key, consulting the parent
        configuration when this one has none."""
        if key in self.values:
            return self.values[key]
        if self.super is not None:
            return self.super.getall(key)
        return []

    def set(self, key, value):
        """Append value to the list of values stored for key."""
        self.values.setdefault(key, []).append(value)


class ConfigCollection:
    """The default settings plus one FeedConfig per "[name]" section."""

    def __init__(self):
        self.default = FeedConfig("default", None)
        self.feeds = {}

    def load(self, filename):
        """Read settings from filename.

        Blank lines and lines starting with "#" are ignored, "[name]"
        starts the section for feed "name", and every other line is
        "key value".  Lines before the first section go to the defaults.

        Raises ValueError for a setting line that contains no space.
        """
        feed = self.default
        with open(filename, "r") as f:
            for line in f:
                line = line.strip()
                if line == "" or line[0] == "#":
                    continue
                if line[0] == "[" and line[-1] == "]":
                    feed = FeedConfig(line[1:-1], self.default)
                    self.feeds[feed.name] = feed
                    continue
                (key, sep, value) = line.partition(" ")
                if not sep:
                    raise ValueError(
                        "%s: malformed line (expected 'key value'): %r"
                        % (filename, line))
                feed.set(key, value)


class Feed:
    """Base class for feeds: owns the output channel and the directories."""

    def __init__(self, config):
        self.url = config["source"]
        self.storedir = config["storedir"]
        self.outdir = config["outdir"]
        # Number of items that add_item() will still accept.
        self.maxitems = int(config["maxitems"])
        self.name = config.name
        self.items = []
        cmd(["mkdir", "-p", self.dir(), self.odir()])
        self.chan = feedwriter.Channel(config["title"], self.url,
                                       config["title"] + " (rsscomics)")

    def dir(self):
        """Return the directory holding this feed's working files."""
        return self.storedir + "/" + self.name

    def odir(self):
        """Return the directory holding this feed's published files."""
        return self.outdir + "/" + self.name

    def update(self):
        """Refresh the feed; must be provided by subclasses."""
        # String exceptions are not valid; use a real exception class.
        raise NotImplementedError("must implement")

    def add_item(self, description, title = "Item", link = None):
        """Add an item to the channel unless maxitems is already used up."""
        self.maxitems -= 1
        if self.maxitems < 0:
            return
        self.chan.add_item(title = title, link = link,
                           description = description)

    def write(self):
        """Write the channel as RSS 2 to <outdir>/<name>.rss."""
        with open(self.outdir + "/" + self.name + ".rss", "w") as f:
            f.write(self.chan.rss2())


class URLFeed(Feed):
    """A feed whose source is a page downloaded with curl."""

    def __init__(self, config):
        Feed.__init__(self, config)
        self.ua = config["useragent"]
        self.imageext = config["imageext"]
        # Time of the last fetch(); used to build unique file names.
        self.now = 0

    def fetch(self):
        """Download the source page (unless nofetch is set) and load it
        into self.data.  Returns 1 on success and 0 if the download
        failed."""
        self.now = time.time()
        fn = self.dir() + "/sourcefile"
        if (not nofetch) and (not self.get_url(fn)):
            return 0
        with open(fn) as f:
            self.data = f.read()
        return 1

    def get_url(self, fn, relurl = None):
        """Download a URL into the file fn with curl.

        Without relurl the feed's own URL is fetched, and only if it is
        newer than fn (curl -z).  With relurl, that URL is resolved
        against the feed URL and fetched with the feed URL as referer
        (curl -e).  Returns 1 on success, 0 on failure.
        """
        opts = ["-s", "-R", "-A", self.ua, "-o", fn]
        if relurl is None:
            url = self.url
            opts += ["-z", fn]
        else:
            url = urlparse.urljoin(self.url, relurl)
            opts += ["-e", self.url]
        rc = cmd(["curl"] + opts + [url])
        if rc != 0:
            warn(2, "Can't fetch", url)
            return 0
        return 1

    def oname(self, url):
        """Return a local file name for url: the URL without its query
        string and with runs of other characters replaced by "_", then
        the fetch time, then an extension (the configured imageext, else
        the URL's own extension, else ".gif")."""
        url = re.sub(r'\?.*', '', url)
        m = re.match(r'.*(\.[^\./]*)$', url)
        if self.imageext is not None:
            ext = self.imageext
        elif m is not None:
            ext = m.group(1)
        else:
            ext = ".gif"
        return (re.sub(r'[^A-Za-z0-9-]+', '_', url)
                + "-" + str(int(self.now)) + ext)


def fill(template, matches):
    """Expand $1, $2, ... in template with the corresponding entries of
    matches.  Returns None if template is None.

    matches is one result of re.findall: a tuple of groups, or a plain
    string when the pattern has fewer than two groups.
    """
    if template is None:
        return None
    if not isinstance(matches, (tuple, list)):
        # A bare string would otherwise be indexed character by character.
        matches = (matches,)
    return re.sub(r'\$([0-9]+)',
                  lambda m: matches[int(m.group(1)) - 1], template)


class RegexpFeed(URLFeed):
    """Feed with one item per match of a regular expression on the page."""

    name = "regexp"

    def __init__(self, config):
        URLFeed.__init__(self, config)
        self.encoding = config["encoding"]
        if self.encoding is None:
            self.encoding = "ISO-8859-1"
        self.match = re.compile(config["match"])
        # Templates in which $N is replaced by the N-th group of a match.
        self.title = config["itemtitle"]
        self.link = config["itemlink"]
        self.description = config["itemdescription"]

    def update(self):
        """Fetch the page and add an item for every match.  Returns 0 if
        the page could not be fetched, 1 otherwise."""
        if not self.fetch():
            return 0
        d = self.data.decode(self.encoding)
        d = make_links_absolute(self.url, d)
        matches = self.match.findall(d)
        if matches == []:
            warn(2, "No matches")
        for ms in matches:
            self.add_item(title = fill(self.title, ms),
                          link = fill(self.link, ms),
                          description = fill(self.description, ms))
        return 1


def _fix_tag(m):
    """Return a cleaned-up version of the tag matched by m, repairing
    several kinds of breakage seen in real pages."""
    s = m.group(1)
    endtag = (s[0] == "/")
    # Fix newlines inside tags.
    s = s.replace("\n", " ")
    # Fix stray "<" characters inside tags.
    s = s.replace("<", "")
    # Fix KeenSpot ***header_bgimage no_image***.
    s = re.sub(r'\*\*\*.*?\*\*\*', "", s)
    # Fix smart quotes inside tags.
    s = s.replace('\xe2\x80\x9d', '"')
    # Remove non-US-ASCII characters.
    s = re.sub("[^\001-\177]", "", s)
    # Fix MSN/Slate extraneous quote.
    s = s.replace('GetImage?"N','GetImage?N')
    # Fix a typo in ok-cancel's HTML.
    s = s.replace(' title-"', ' title="')
    # Fix "target="..." in qwantz.
    s = s.replace('"target="', '" target="')
    # Fix various bits in megatokyo.
    s = s.replace('border ="', 'border="')
    s = s.replace('src = ', 'src=')
    s = re.sub(r'id="[^"]*$', '', s)
    if endtag:
        # End tags keep nothing but the letters of their name.
        s = "/" + re.sub(r"[^a-zA-Z]", "", s[1:])
    return "<" + s + ">"


class ImagesFeed(URLFeed):
    """Feed with one item holding every image on the page whose attributes
    match the configured expressions."""

    name = "images"

    def __init__(self, config):
        URLFeed.__init__(self, config)
        # Attribute name -> compiled expressions; an image is wanted when
        # any expression matches the value of that attribute.
        self.wants = {}
        for m in config.getall("match"):
            (attr, exp) = m.split(" ", 1)
            self.wants.setdefault(attr, []).append(re.compile(exp))
        # Expressions whose matched text is part of the change-detection
        # state alongside the image URLs.
        self.cookies = []
        for m in config.getall("cookie"):
            self.cookies.append(re.compile(m))

    def update(self):
        """Fetch the page, download the wanted images and add one item
        containing them.  Returns 0 if the page could not be fetched or
        nothing changed since the last run, 1 otherwise."""
        if not self.fetch():
            return 0
        tf = TagFinder(["img", "image"])

        # Fix some of the ways that real HTML breaks HTMLParser.
        # NOTE(review): the two patterns below were reconstructed.  The
        # file had lost the text between "<" and ">", leaving a pattern
        # that removed every ">" from the page; confirm against upstream.
        # Lose script sections.
        self.data = re.sub(r'(?sim)<script.*?</script>', '', self.data)
        # And embed elements, because we don't need them and
        # megatokyo's are often broken.
        self.data = re.sub(r"(?sim)<embed[^>]*>", "", self.data)
        self.data = re.sub(r'(?sim)<([^>]+)>', _fix_tag, self.data)

        # Keep the cleaned page next to the source for debugging.
        with open(self.dir() + "/fixedsource", "w") as f:
            f.write(self.data)

        tf.feed(self.data)
        imgs = []
        for (tag, attrs) in tf.found:
            warn(0, "found", tag, attrs)
            if "src" not in attrs:
                continue
            for attr in self.wants.keys():
                if attr not in attrs:
                    continue
                for exp in self.wants[attr]:
                    if exp.search(attrs[attr]) is not None:
                        imgs.append(attrs["src"])

        cookies = []
        for exp in self.cookies:
            m = exp.search(self.data)
            if m is not None:
                cookies.append(m.group(0))
            else:
                cookies.append("")

        # Compare the serialized state: comparing split lists made an
        # empty state file ([""]) differ from an empty state ([]) forever.
        nstate = "\n".join(imgs + cookies)
        try:
            with open(self.dir() + "/imglist") as f:
                ostate = f.read()
        except IOError:
            ostate = ""
        if ostate == nstate:
            return 0
        with open(self.dir() + "/imglist", "w") as f:
            f.write(nstate)

        bits = []
        for img in imgs:
            fn = self.oname(img)
            ourl = self.name + "/" + fn
            ofn = self.odir() + "/" + fn
            self.get_url(ofn, img)
            # NOTE(review): the image markup was reconstructed (only the
            # trailing newline survived in the file); confirm the exact
            # original markup against upstream.
            bits.append('<img src="' + ourl + '">\n')
        if bits == []:
            warn(2, "No matches")
            bits.append('No matches')
        self.add_item("".join(bits))
        return 1


def main():
    """Parse the command line, load the configuration and update feeds.

    Options: -v prints all messages, -n reuses already downloaded pages.
    Arguments name the feeds to update; "@class" selects every feed
    whose "class" setting is class.  Without arguments all feeds are
    updated.
    """
    types = {}
    for c in [ImagesFeed, RegexpFeed]:
        types[c.name] = c

    try:
        opts, args = getopt.getopt(sys.argv[1:], "vn")
    except getopt.GetoptError as s:
        sys.stdout.write(str(s) + "\n")
        sys.exit(1)
    for o, a in opts:
        if o == "-v":
            setlevel(0)
        elif o == "-n":
            setnofetch(True)

    cc = ConfigCollection()
    cc.load(os.path.join(os.path.expanduser("~"), ".rsscomics", "config"))

    # Group the feed names by their "class" setting for "@class" arguments.
    classes = {}
    for feed in cc.feeds.keys():
        cls = cc.feeds[feed]["class"]
        classes.setdefault(cls, []).append(feed)

    if len(args) > 0:
        toupdate = []
        for f in args:
            if f[0] == "@":
                toupdate += classes[f[1:]]
            else:
                toupdate.append(f)
    else:
        toupdate = list(cc.feeds.keys())

    # "rawdogconf" is "<path> <prefix>": write one line per feed to path,
    # consisting of prefix followed by the feed's .rss file name.
    rawdogconf = cc.default["rawdogconf"]
    if rawdogconf is not None:
        (path, prefix) = rawdogconf.split(" ", 1)
        with open(path, "w") as f:
            for feed in cc.feeds.keys():
                f.write(prefix + feed + ".rss\n")

    for name in toupdate:
        conf = cc.feeds[name]
        setwarn(name)
        try:
            # Constructing the feed can fail too (bad "use" value, bad
            # regexp); keep that from aborting the remaining feeds.
            fc = types[conf["use"]](conf)
            if fc.update():
                fc.write()
        except KeyboardInterrupt:
            warn(2, "Interrupted")
            sys.exit(1)
        except Exception:
            warn(2, "Exception in feed reader")
            traceback.print_exc(None, sys.stderr)


if __name__ == "__main__":
    main()