#!/usr/bin/python
"""
podget: download podcast attachments from feeds.
Adam Sampson

Each feed URL given on the command line is parsed with feedparser, and
every enclosure found in it is fetched with wget into the output
directory, unless a file with a matching name (and, by default, matching
length) already exists there or in one of the extra check directories.
"""

import optparse
import os
import re
import subprocess
import sys
import time

USER_AGENT = "podget/1.0"

# Give up on a download once the retry delay has grown past this (seconds).
MAX_RETRY_DELAY = 40

# Dates written as DD.MM.YYYY.
_DATE_RE = re.compile(r'(\d\d)\.(\d\d)\.(\d\d\d\d)')


def _log(message):
    """Write a one-line progress message to stderr.

    Uses sys.stderr.write rather than the print statement so that the
    script runs unchanged on both Python 2 and Python 3.
    """
    sys.stderr.write(message + "\n")


def get_filename_prefix(feed, entry, raw_filename, options):
    """Return a prefix to put in front of an enclosure's filename.

    feed and entry are the feed-level and entry-level mappings produced by
    feedparser; raw_filename is the filename taken from the enclosure URL.
    options is accepted for interface uniformity and is not used.

    Returns "" unless a feed-specific rule below applies.
    """
    # Media Network often has proper dates in the entry titles or filenames.
    if feed.get("title") == "Media Network Vintage Vault":
        date = None
        # Checked in this order so that a date in the filename overrides
        # one in the title.  The title may be missing, hence the `or ""`.
        for text in (entry.get("title"), raw_filename):
            m = _DATE_RE.search(text or "")
            if m is not None:
                date = m.group(1, 2, 3)
        if date is not None:
            day, month, year = date
            return "Media Network - %s-%s-%s - " % (year, month, day)
    return ""


def try_unlink(fn):
    """Remove the file fn, ignoring the error if it cannot be removed."""
    try:
        os.unlink(fn)
    except OSError:
        pass


def download_file(url, raw_filename, filename, length, referer, options):
    """Fetch url into options.outputdir/filename unless we already have it.

    A file counts as already downloaded if its name ends with raw_filename
    (so previously prefixed or renamed copies still match), it lives in
    the output directory or one of options.checkdirs, and its size equals
    length.  The size check is skipped when length is None or
    options.nolength is set.

    With options.dryrun set, nothing is downloaded; the decision that
    would have been taken is reported on stderr instead.

    A failed download is retried with exponentially increasing delays;
    once the delay exceeds MAX_RETRY_DELAY the partial file is removed
    and the function returns without raising.
    """
    def is_downloaded(fn):
        # True if fn exists and (when we are checking) has the right size.
        try:
            st = os.stat(fn)
        except OSError:
            return False
        return bool(options.nolength or length is None
                    or st.st_size == length)

    for dirname in [options.outputdir] + options.checkdirs:
        for name in os.listdir(dirname):
            fn = os.path.join(dirname, name)
            if fn.endswith(raw_filename) and is_downloaded(fn):
                if options.dryrun:
                    _log("Already got: " + fn)
                return

    output_fn = os.path.join(options.outputdir, filename)
    cmd = [
        "wget",
        "-U", USER_AGENT,
        "-O", output_fn,
        "--referer", referer,
        url,
    ]

    if options.dryrun:
        _log("Would run: " + " ".join(cmd))
        return

    delay = 1
    while True:
        _log("Downloading: " + url)
        # Start from a clean slate so a stale partial file is never
        # mistaken for a finished download.
        try_unlink(output_fn)
        rc = subprocess.call(cmd)
        if rc == 0 and is_downloaded(output_fn):
            # Success!
            return

        if delay > MAX_RETRY_DELAY:
            _log("Too many retries; giving up")
            try_unlink(output_fn)
            break

        _log("Failed, trying again in %d seconds" % delay)
        # Sleep for the interval we just announced, then back off further
        # for the next failure.
        time.sleep(delay)
        delay *= 2


def _parse_length(raw_length):
    """Return the advertised enclosure size as a positive int, or None.

    A missing, empty, non-numeric or non-positive length is reported as
    None, meaning "unknown": feeds commonly use 0 or an empty string as a
    placeholder, and checking downloads against that would make every
    download look like a failure.
    """
    try:
        length = int(raw_length)
    except (TypeError, ValueError):
        return None
    return length if length > 0 else None


def handle_enclosure(feed, entry, link, referer, options):
    """Download the file that the enclosure link points at.

    link is one element of an entry's "links" list; links without an
    "href", or whose URL yields no usable filename, are skipped.
    """
    url = link.get("href")
    if url is None:
        return

    length = _parse_length(link.get("length"))

    filename = os.path.basename(url)
    # Filenames starting with dots are annoying.
    filename = re.sub(r'^\.+', '', filename)
    if not filename:
        # E.g. a URL ending in "/".  An empty name would match every
        # existing file in the "already downloaded" check.
        _log("Skipping enclosure with no filename: " + url)
        return

    fn_prefix = get_filename_prefix(feed, entry, filename, options)

    download_file(url, filename, fn_prefix + filename, length, referer,
                  options)


def scan_feed(feed_url, options):
    """Parse the feed at feed_url and download all of its enclosures.

    Entries are processed in feed order, or in reverse when
    options.reverse is set.  The feed URL is passed on as the HTTP referer
    for each download.
    """
    # Third-party dependency, needed only here; importing it locally keeps
    # the rest of the module importable without it.
    import feedparser

    data = feedparser.parse(feed_url, agent=USER_AGENT)

    entries = data.entries
    if options.reverse:
        entries = reversed(entries)

    for entry in entries:
        for link in entry.get("links") or ():
            if link.get("rel") == "enclosure":
                handle_enclosure(data.feed, entry, link, feed_url, options)


def main(argv=None):
    """Parse command-line options and scan every feed URL given.

    argv is the argument list without the program name; None means
    sys.argv[1:].
    """
    parser = optparse.OptionParser()
    parser.add_option("-n", dest="dryrun",
                      action="store_true", default=False,
                      help="print what would be done and exit")
    parser.add_option("-d", dest="outputdir", metavar="DIR",
                      default=".",
                      help="directory to store downloaded files in")
    parser.add_option("-D", dest="checkdirs", metavar="DIR",
                      action="append", default=[],
                      help="extra directory to check for downloaded files")
    parser.add_option("-L", dest="nolength",
                      action="store_true", default=False,
                      help="don't check file lengths against podcast")
    parser.add_option("-r", dest="reverse",
                      action="store_true", default=False,
                      help="download oldest items first")
    (options, args) = parser.parse_args(argv)

    for url in args:
        scan_feed(url, options)


if __name__ == "__main__":
    main()