#!/usr/bin/python
"""
Print links, one per line, from the given URLs.

Adam Sampson
"""

import optparse
import sys

try:
    # Python 2 module names, as originally used by this script.
    import urllib2
    import urlparse
except ImportError:
    # Python 3 moved these into the urllib package; bind them to the old
    # names so the rest of the script is unchanged.
    import urllib.request as urllib2
    import urllib.parse as urlparse

import html5lib
import lxml.etree as etree


def get_links(url):
    """Return the targets of all ``<a href=...>`` elements found at *url*.

    The document is fetched with ``urlopen`` and parsed with html5lib using
    the lxml tree builder. Each ``href`` value is joined against the URL
    reported by the response's ``geturl()``, so relative links come back
    resolved against that URL.

    Args:
        url: The URL of the HTML document to fetch.

    Returns:
        A list of link strings, in the order ``findall`` yields the
        matching ``<a>`` elements.

    Raises:
        Whatever ``urlopen`` or ``html5lib.parse`` raise; nothing is caught
        here. The response is closed even when parsing fails.
    """
    f = urllib2.urlopen(url)
    try:
        # This is oversimplifying a bit (it ignores BASE, etc.).
        base = f.geturl()
        doc = html5lib.parse(f, treebuilder="lxml",
                             namespaceHTMLElements=False)
    finally:
        # Close the response on every path, including parse errors.
        f.close()

    # The XPath predicate only matches anchors that carry an href, so
    # el.get("href") is never None here.
    return [urlparse.urljoin(base, el.get("href"))
            for el in doc.findall(".//a[@href]")]


def main():
    """Print every link found at each URL given on the command line."""
    # No options are defined; the parser is used for --help handling and
    # for collecting the positional URL arguments.
    parser = optparse.OptionParser()
    (_options, args) = parser.parse_args()

    for url in args:
        for link in get_links(url):
            sys.stdout.write(link + "\n")


if __name__ == "__main__":
    main()