#!/usr/bin/env python
# A programmatic interface to the Oxford English Dictionary's online edition.
# Adam Sampson
"""Look up words in the OED's online edition by scraping its frame-based pages.

Pages are fetched with the external ``wget`` program (sharing one cookie jar
between requests) and cached on disk under ``/tmp`` keyed by the SHA-1 of the
URL, so repeated look-ups of the same page do not hit the network again.
"""

import hashlib
import os
import re
import subprocess

try:  # Python 2 module names
    import HTMLParser
    from urllib import quote_plus
    from urlparse import urljoin
except ImportError:  # Python 3 module names
    import html.parser as HTMLParser
    from urllib.parse import quote_plus, urljoin

base_url = "http://80-dictionary.oed.com.chain.kent.ac.uk"
cookie_jar = "/tmp/oed.cookies"

# The only named character entities that strip_tags() decodes.
_NAMED_ENTITIES = {"amp": "&", "lt": "<", "gt": ">"}

# A well-formed character reference: hexadecimal, decimal, or named.  Being
# strict here stops a bare "&" followed much later by ";" from being treated
# as one giant entity.
_ENTITY_RE = re.compile(r'&(#[xX][0-9A-Fa-f]+|#[0-9]+|[A-Za-z][A-Za-z0-9]*);')


class TagFinder(HTMLParser.HTMLParser):
    """HTML parser that records every start tag whose name is in ``lookfor``.

    After ``feed()``, ``found`` holds one ``(tag, attrs)`` pair per matching
    start tag in document order, where ``attrs`` is a dict built from the
    tag's attribute list.
    """

    def __init__(self, lookfor):
        HTMLParser.HTMLParser.__init__(self)
        self.lookfor = lookfor
        self.found = []

    def handle_starttag(self, tag, attrs):
        if tag in self.lookfor:
            self.found.append((tag, dict(attrs)))


def _cache_path(url):
    """Return the cache file name for ``url`` (SHA-1 of the URL under /tmp)."""
    key = url if isinstance(url, bytes) else url.encode("utf-8")
    return "/tmp/oed.cache." + hashlib.sha1(key).hexdigest()


def _to_text(data):
    """Return ``data`` as the native ``str`` type.

    Byte strings are decoded as Latin-1, which maps every byte to the code
    point of the same value and therefore never fails.
    NOTE(review): the pages' real encoding is not visible here -- confirm.
    """
    if isinstance(data, str):
        return data
    return data.decode("latin-1")


def get_page(url):
    """Return the body of ``url``, using the on-disk cache when possible.

    On a cache miss the page is downloaded with ``wget`` using the shared
    cookie jar.  The result is cached only when wget exits successfully, so a
    failed download is retried next time instead of being remembered as an
    empty page.  Whatever wget printed is returned either way.

    Raises:
        OSError: if ``wget`` cannot be executed or the cache file cannot be
            written.
    """
    cache_name = _cache_path(url)
    try:
        with open(cache_name, "rb") as cached:
            return _to_text(cached.read())
    except IOError:
        pass  # Not cached (or unreadable): fall through and fetch it.

    # An argument list (no shell) keeps quotes or other metacharacters in the
    # URL from being interpreted as shell syntax.
    command = [
        "wget", "-q", "-O", "-",
        "--load-cookies=" + cookie_jar,
        "--save-cookies=" + cookie_jar,
        url,
    ]
    proc = subprocess.Popen(command, stdout=subprocess.PIPE)
    data = proc.communicate()[0]
    if proc.returncode == 0:
        with open(cache_name, "wb") as cached:
            cached.write(data)
    return _to_text(data)


def get_frames(url):
    """Return a dict mapping frame name to absolute frame URL for ``url``.

    ``<frame>`` tags lacking either a ``name`` or a ``src`` value are skipped.
    """
    finder = TagFinder(["frame"])
    finder.feed(get_page(url))
    frames = {}
    for _tag, attrs in finder.found:
        name = attrs.get("name")
        src = attrs.get("src")
        if name is None or src is None:
            continue
        frames[name] = urljoin(url, src)
    return frames


def _decode_entity(match):
    """Return the replacement text for one character reference match.

    ``&amp;``, ``&lt;``, ``&gt;`` and numeric references are decoded; any
    other name, and any number ``chr()`` cannot represent, is left verbatim.
    """
    body = match.group(1)
    if body in _NAMED_ENTITIES:
        return _NAMED_ENTITIES[body]
    if body.startswith("#"):
        try:
            if body[1:2] in ("x", "X"):
                return chr(int(body[2:], 16))
            return chr(int(body[1:]))
        except (ValueError, OverflowError):
            pass
    return match.group(0)


def strip_tags(s):
    """Return ``s`` as plain text.

    Markup tags are removed, each run of whitespace becomes a single space,
    character references are decoded (see ``_decode_entity``), and leading and
    trailing whitespace is stripped.  Tags are removed before references are
    decoded, so an escaped ``&lt;b&gt;`` survives as the literal text ``<b>``.
    """
    s = re.sub(r'<[^>]*>', "", s)
    s = re.sub(r'\s+', " ", s)
    s = _ENTITY_RE.sub(_decode_entity, s)
    return s.strip()


def parse_entry(maindata, hdrdata):
    """Return ``(word, definition)`` extracted from an entry's two frames.

    ``hdrdata`` is the header frame's HTML (source of the headword) and
    ``maindata`` is the main frame's HTML (source of the definition text).

    NOTE(review): both patterns below consist solely of a lazy group with no
    surrounding delimiters, so as written every match is the empty string.
    The delimiting markup was presumably lost at some point -- TODO confirm
    against the upstream source before relying on the output.
    """
    m = re.search(r'(?ms)(.*?)', hdrdata)
    word = strip_tags(m.group(1)) if m else ""
    ms = re.findall(r'(?ms)(.*?)', maindata)
    definition = strip_tags(" ".join(ms))
    return (word, definition)


def search(word):
    """Return a list of ``(word, definition)`` pairs for ``word``.

    The list is empty when the site redirects to its "nearest_to" page (no
    results), holds one pair when the search lands directly on an entry, and
    otherwise holds one pair per entry linked from the results page.

    Raises:
        KeyError: if a fetched frameset lacks the expected "Main frame" or
            "Header frame".
    """
    # URL-encode the query so spaces, "&", "#" etc. cannot corrupt the URL.
    fseturl = (base_url + "/cgi/findword?query_type=word&queryword="
               + quote_plus(word) + "&find=find")
    frames = get_frames(fseturl)
    main_url = frames["Main frame"]
    if "nearest_to" in main_url:
        # No results.
        return []

    maindata = get_page(main_url)
    if "OED Online main entry text frame" in maindata:
        # Only one result -- this page.
        hdrdata = get_page(frames["Header frame"])
        return [parse_entry(maindata, hdrdata)]

    # Multiple results: follow each entry link and parse its frameset.
    results = []
    for href in re.findall(r'<A HREF="(/cgi/entry[^"]*)">', maindata):
        rframes = get_frames(urljoin(main_url, href))
        rmaindata = get_page(rframes["Main frame"])
        rhdrdata = get_page(rframes["Header frame"])
        results.append(parse_entry(rmaindata, rhdrdata))
    return results


def wordwrap(text, width, prefix=""):
    """Print ``text`` wrapped at spaces, each line preceded by ``prefix``.

    Each line is broken at the last space found within ``width`` characters
    of the line's start.  If there is none, the search window grows by
    ``width`` at a time until a space is found, so over-long words are never
    split.

    Raises:
        ValueError: if ``width`` is less than 1 (the window could never grow).
    """
    if width < 1:
        raise ValueError("width must be at least 1")
    text += " "  # Sentinel: guarantees the search below always finds a space.
    start = 0
    while start < len(text):
        window = width
        cut = text.rfind(" ", start, start + window)
        while cut == -1:
            window += width
            cut = text.rfind(" ", start, start + window)
        print(prefix + text[start:cut])
        start = cut + 1


def main():
    """Look up the word given on the command line (default "hello")."""
    import sys

    want = sys.argv[1] if len(sys.argv) == 2 else "hello"
    for word, definition in search(want):
        print(word)
        wordwrap(definition, 70, " ")


if __name__ == "__main__":
    main()