#!/usr/bin/python # Grab all the images from a Flickr set. # Adam Sampson import os, sys, urlparse, re def die(*s): print "".join(map(str, s)) sys.exit(1) def wget(url, outputfile = None, referer = None, args = []): args = ["wget", "-U", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.7.5) Gecko/20041219"] + args if outputfile: args += ["-O", outputfile] if os.access(outputfile, os.F_OK): print "Already exists, not fetching: " + outputfile return if referer: args += ["--referer", referer] args += [url] print "Fetching: " + url if os.spawnvp(os.P_WAIT, args[0], args) != 0: die("Command failed: ", args) def get_file(fn): f = open(fn) d = f.read() f.close() return d link_re = re.compile(r'(?msi)]+href=["\']?([^"\'> ]*)["\']?') image_re = re.compile(r'(?msi)]+src=["\']?([^"\'> ]*)["\' >]') def get_links(url, d, reg = link_re): return [urlparse.urljoin(url, l.replace("&", "&")) for l in re.findall(reg, d)] def safe_fn(s): return re.sub(r'[^A-Za-z0-9_., -]', '_', s) def get_set(url, cont): if cont is None: tempname = "temp-index-%d" % (os.getpid(),) wget(url, tempname) d = get_file(tempname) else: d = get_file(cont + "/html/index-0") indexes = [] for l in get_links(url, d): if re.search(r'\?page=', l): indexes.append(l) def get_title(d): m = re.search(r'

]*>([^<]*)<', d) if m is None: die("Can't find author") author = m.group(1).strip() print "Author:", author dirname = safe_fn(title + " - " + author) print "Dir:", dirname try: os.mkdir(dirname) except: pass try: os.mkdir(dirname + "/html") except: pass ifs = [("html/index-0", url)] f = open(dirname + "/" + ifs[0][0], "w") f.write(d) f.close() os.chdir(dirname) n = 1 for iurl in indexes: fn = "html/index-%d" % (n,) ifs.append((fn, iurl)) n += 1 wget(iurl, fn, url) def get_image(url, ref, fn): hfn = "html/image-" + fn wget(url, hfn, ref) d = get_file(hfn) title = get_title(d) fn += "-" + safe_fn(title) for l in get_links(url, d, image_re): if re.search(r'static.*\?v=0', l): wget(l, "%s-medium.jpg" % (fn,), url) pg = 0 for fn, url in ifs: pg += 1 d = get_file(fn) n = 0 for l in get_links(url, d): if re.search(r'in/set-', l): n += 1 img = "%02d-%02d" % (pg, n) get_image(l, url, img) os.chdir("..") if cont is None: os.unlink(tempname) if __name__ == "__main__": if len(sys.argv) == 3: cont = sys.argv[2] else: cont = None get_set(sys.argv[1], cont)