#!/usr/bin/python
# Given text containing Geocities URLs on stdin, print interesting starting
# points for mirroring to stdout. You can then pipe this into:
#   xargs wget -c -nc -r -l0
# Adam Sampson

import fileinput
import getopt
import re
import sys

# Set to True by main() when -v is given; read by log().
verbose = False

# Anything that looks like an http:// URL, stopping at whitespace, quotes,
# angle brackets, backslashes and closing parentheses/brackets.
_URL_RE = re.compile(r'http://[^\s)\]\\"\'<>]+')

# Patterns for recognising a site's front page, tried in order; the first
# one that matches wins. Group 1 is the front page without its final "/".
_FRONT_PAGE_RES = (
    # www.geocities.com/Category/Category/1234
    re.compile(r'^(http://[^/]*geocities[^/]*/[A-Z][^/]*/[A-Z][^/]*/[0-9]+).*'),
    # www.geocities.com/Category/1234
    re.compile(r'^(http://[^/]*geocities[^/]*/[A-Z][^/]*/[0-9]+).*'),
    # www.geocities.com/sitename
    re.compile(r'^(http://[^/]*geocities[^/]*/[^/]+).*'),
)

# A "front page" that is really a file or a Geocities internal directory.
_NOT_A_SITE_RE = re.compile(
    r'(?i).*(\.(jpg|jpeg|gif|png|css|html|htm|js)|/(cgi-bin|pictures|images|gbim)/)')

# Splits a URL into (directory including trailing "/", final component).
_FINAL_FILENAME_RE = re.compile(r'^(.*/)([^/]+)$')

# URLs that are probably HTML: directories, and .htm/.html files. The dots
# are escaped so that e.g. ".../xhtm" is not mistaken for an HTML page.
_PROBABLE_HTML_RE = re.compile(r'(?i).*(/|\.htm|\.html)$')


def usage():
    """Print the usage message to stdout and exit with status 1."""
    sys.stdout.write("find-geocities [OPTIONS] [FILE ...]\n")
    sys.stdout.write("-f Only look for site front pages\n")
    sys.stdout.write("-h Only look for (probable) HTML pages\n")
    sys.stdout.write("-v Be verbose\n")
    sys.exit(1)


def log(*s):
    """Write the concatenation of the arguments to stderr if verbose is set."""
    if verbose:
        sys.stderr.write("".join(map(str, s)) + "\n")


def find_urls(lines):
    """Return the set of http:// URLs found in an iterable of text lines."""
    starts = set()
    for line in lines:
        # This is for wget's "File `www...' is already there".
        line = line.replace("File `", "http://")
        for url in _URL_RE.findall(line):
            log("Found URL: ", url)
            starts.add(url)
    return starts


def find_front_page(url):
    """Return the front page (ending in "/") of the Geocities site that url
    belongs to, or None if url does not look like a Geocities URL.
    """
    for pattern in _FRONT_PAGE_RES:
        m = pattern.match(url)
        if m is not None:
            return m.group(1) + "/"
    return None


def find_sites(starts, front_only=False):
    """Return the set of starting points for mirroring derived from starts.

    For each URL with a usable front page, the front page is included.
    Unless front_only is true, the URL itself and the directory containing
    it are included as well.
    """
    sites = set()
    for url in starts:
        front = find_front_page(url)
        if front is None:
            log("No front page; ignoring: ", url)
            continue

        # If the "front page" we've found is a file or a Geocities
        # internal directory, then we aren't interested.
        if _NOT_A_SITE_RE.match(front) is not None:
            continue
        log("Adding site front page: ", front)
        sites.add(front)

        if front_only:
            continue

        log("Adding found URL: ", url)
        sites.add(url)

        # Try stripping off a final filename.
        m = _FINAL_FILENAME_RE.match(url)
        if m is None:
            continue
        directory = m.group(1)
        if directory != front and front.startswith(directory):
            # The final component was the site name itself rather than a
            # file, so the "directory" lies above the front page (e.g. the
            # root of www.geocities.com). That is not part of this site.
            log("Not adding directory above front page: ", directory)
            continue
        log("Adding directory: ", directory)
        sites.add(directory)
    return sites


def is_probable_html(url):
    """Return True if url ends in "/", ".htm" or ".html" (case-insensitive)."""
    return _PROBABLE_HTML_RE.match(url) is not None


def order_sites(sites, html_only=False):
    """Return sites as a list in output order.

    If html_only is true, URLs that are not probably HTML are dropped.
    """
    # Reverse sort, so we get subpages before front pages.
    return [url for url in sorted(sites, reverse=True)
            if not html_only or is_probable_html(url)]


def main(argv=None):
    """Parse options, read the input files (or stdin) and print the result.

    argv is the argument list without the program name; it defaults to
    sys.argv[1:].
    """
    global verbose

    if argv is None:
        argv = sys.argv[1:]
    try:
        opts, args = getopt.getopt(argv, "fhv")
    except getopt.GetoptError:
        usage()

    front_only = False
    html_only = False
    for o, _ in opts:
        if o == "-f":
            front_only = True
        elif o == "-h":
            html_only = True
        elif o == "-v":
            verbose = True

    starts = find_urls(fileinput.input(args))
    sites = find_sites(starts, front_only)
    for url in order_sites(sites, html_only):
        sys.stdout.write(url + "\n")


if __name__ == "__main__":
    main()