#!/usr/bin/env python
# Take Chuck Taggart's "Looka" blog page, and produce RSS from it.
# Adam Sampson
#
# Runs on Python 2.6+ and Python 3.

import os
import re
import sys
import time

try:
    from html.entities import name2codepoint  # Python 3
except ImportError:
    from htmlentitydefs import name2codepoint  # Python 2

try:
    _unichr = unichr  # Python 2: chr() only covers 0-255 there
except NameError:
    _unichr = chr  # Python 3


# {{{ feedwriter
# This section's a slightly cut-down version of feedwriter.py:
#   http://offog.org/darcs/misccode/feedwriter.py

class FeedError(Exception):
    """Raised when a feed item is constructed without any content."""


# The five characters that must be escaped in XML text and attributes.
_XML_ENTITIES = {
    "<": "&lt;",
    ">": "&gt;",
    "&": "&amp;",
    '"': "&quot;",
    "'": "&apos;",
}


def escape_xml(s):
    """Escape special characters in XML.

    Returns a copy of the string s in which <, >, &, double quote and
    single quote are replaced by the corresponding named entities.
    """
    return "".join([_XML_ENTITIES.get(c, c) for c in s])


def wrap(name, value):
    """Return one UTF-8 encoded XML element line, or nothing.

    name is the element name and value its text content, which is
    XML-escaped here.  When value is None the element is omitted and an
    empty byte string is returned.  The result is a byte string ending
    in a newline.
    """
    if value is None:
        return b""
    element = "<" + name + ">" + escape_xml(value) + "</" + name + ">\n"
    return element.encode("UTF-8")


class Channel:
    """A summary of a syndicated site."""

    def __init__(self, title, link, description):
        """Create an empty channel with the given title, link and description."""
        self.title = title
        self.link = link
        self.description = description
        self.items = []

    def add_item(self, *args, **keywords):
        """Add an item to the feed.

        The arguments to this method are the same as for the Item
        constructor."""
        self.items.append(Item(*args, **keywords))

    def rss2(self):
        """Return the RSS 2.0 representation of this feed.

        The result is a UTF-8 encoded byte string.  An item's pubDate
        may be a time tuple / struct_time (taken to be in UTC) or a
        number of seconds since the epoch.
        """
        bits = [
            b'<?xml version="1.0" encoding="UTF-8"?>\n',
            b'<rss version="2.0">\n',
            b'<channel>\n',
            wrap("title", self.title),
            wrap("link", self.link),
            wrap("description", self.description),
        ]
        for item in self.items:
            bits.append(b'<item>\n')
            bits.append(wrap("title", item.title))
            bits.append(wrap("link", item.link))
            bits.append(wrap("description", item.description))
            if item.pubDate is not None:
                stamp = item.pubDate
                # Anything that isn't already a time tuple is treated as
                # seconds since the epoch.
                if not isinstance(stamp, (tuple, time.struct_time)):
                    stamp = time.gmtime(stamp)
                formatted = time.strftime("%a, %d %b %Y %H:%M:%S +0000", stamp)
                bits.append(wrap("pubDate", formatted))
            bits.append(b'</item>\n')
        bits.append(b'</channel>\n</rss>\n')
        return b"".join(bits)


class Item:
    """An item in a syndication feed."""

    def __init__(self, title = None, link = None, description = None, pubDate = None):
        """Create an item; raises FeedError if both title and description are None."""
        if title is None and description is None:
            raise FeedError("Item must have either title or description")
        self.title = title
        self.link = link
        self.description = description
        self.pubDate = pubDate

# }}}


_TAG_RE = re.compile(r'<[^>]*>')
# Named entities (&eacute;, &Eacute;, &frac12;) and numeric character
# references (&#233;, &#xE9;).
_ENTITY_RE = re.compile(r'&(#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);')


def _decode_entity(match):
    """Return the character for one entity match, or "" if it is unknown."""
    ref = match.group(1)
    try:
        if ref[0] == "#":
            if ref[1] in "xX":
                codepoint = int(ref[2:], 16)
            else:
                codepoint = int(ref[1:])
        else:
            codepoint = name2codepoint[ref]
        return _unichr(codepoint)
    except (KeyError, ValueError, OverflowError):
        # Unknown name or out-of-range number: drop the entity.
        return ""


def strip_html(s):
    """Remove HTML tags and entities from a string.

    Tags are deleted; entities are replaced by the character they stand
    for (unrecognised ones are deleted).  Leading and trailing
    whitespace is stripped from the result.
    """
    without_tags = _TAG_RE.sub('', s)
    return _ENTITY_RE.sub(_decode_entity, without_tags).strip()


_MAIN_URL = "http://www.gumbopages.com/looka/"

# NOTE(review): the copy of this script that was reviewed had everything
# that looked like an HTML tag stripped out of it, so the markers in the
# two patterns below are a reconstruction and must be confirmed against
# the real page before relying on the output.  What is known for sure:
# the first pattern has one group for an article and two groups (anchor
# name, date text) for a date marker; the second pattern contains the
# literal text "Comments" and is described as a Javascript link.
# Assumed here: articles are bracketed by <p>...</p>, date markers look
# like <a name="N">date</a>, and comment links look like
# <a href="javascript:...">Comments</a>.
_ARTICLE_OR_DATE_RE = re.compile(
    r'(?ms)(?:<p>(.*?)</p>|<a name="([^"]*)">(.*?)</a>)')
_COMMENTS_LINK_RE = re.compile(
    r'(?ms)<a href="javascript:[^>]*>Comments</a>')

_TITLE_AND_BODY_RE = re.compile(r'(?ms)([^\n]*)\n(.*)$')


def _build_channel(html):
    """Parse Looka's HTML text and return a Channel holding its articles."""
    channel = Channel(title = "Looka!",
                      link = _MAIN_URL,
                      description = "All that and a Creole's Stuffed Bread.")

    # (parsed date, link) from the most recent date marker, if any.
    current_date = None

    for found in _ARTICLE_OR_DATE_RE.finditer(html):
        article, daynumber, date = found.groups()

        if article is None:
            # It's a date marker.
            # The date's in the format "Tuesday, April 3, 2007". Try to parse it.
            s = re.sub(r'[,\s]+', ' ', strip_html(date)).strip()
            try:
                parsed = time.strptime(s, "%A %B %d %Y")
            except ValueError:
                print("Can't parse date " + repr(s))
                current_date = None
                continue
            current_date = (parsed, _MAIN_URL + "#" + daynumber)
            continue

        # It's an article.
        # The title is the first line; the rest is the article proper.
        m = _TITLE_AND_BODY_RE.match(article)
        if m is None:
            print("Can't find title in article that starts "
                  + repr(article[:80]) + "; ignoring it")
            continue
        title = strip_html(m.group(1))
        body = m.group(2)

        # If we managed to parse a date header before this article, use it.
        if current_date is not None:
            (pub_date, link) = current_date
        else:
            pub_date, link = None, _MAIN_URL

        # If there's a Javascript link for comments, replace it
        # with a link back to the original article.  A function is used
        # as the replacement so that backslashes in the link are not
        # interpreted as escapes.
        comments_link = '<a href="' + link + '">Comments</a>'
        body = _COMMENTS_LINK_RE.sub(lambda _m: comments_link, body)

        # Add the item to the channel.
        channel.add_item(title = title, link = link,
                         description = body, pubDate = pub_date)

    return channel


def convert(html_fn, rss_fn):
    """Convert Looka's HTML to RSS.

    html_fn is the path of the saved HTML page, read as ISO-8859-1.
    rss_fn is the path the RSS 2.0 feed is written to; the feed is
    written to rss_fn + ".new" first and then renamed over rss_fn, so a
    reader never sees a partially-written feed.
    """
    # Read in the HTML.
    with open(html_fn, "rb") as f:
        html = f.read().decode("ISO-8859-1")

    rss = _build_channel(html).rss2()

    # Write the RSS to a temporary file...
    tmp_fn = rss_fn + ".new"
    with open(tmp_fn, "wb") as f:
        f.write(rss)

    # ... and rename that temporary file over the output file.
    # os.replace (Python 3.3+) also overwrites an existing file on Windows.
    getattr(os, "replace", os.rename)(tmp_fn, rss_fn)


if __name__ == "__main__":
    args = sys.argv[1:]
    if len(args) != 2:
        print("Usage: lookafeed.py input.html output.rss")
        sys.exit(1)
    convert(args[0], args[1])