r"""rawdog plugin to limit articles using regular expressions This rawdog plugin filters articles for a feed using Python's regular expressions. Only articles containing a match to the regular expression are kept. Both the title and the description are searched. It adds a "grep" feed option with the following syntax: grep [opts] regular expression The following options are supported: -i Perform case-insensitive matching. -s Strip HTML tags and newlines. Tags and newlines are converted to spaces, and multiple spaces are then condensed into a single space. -v Invert the sense of matching so that only articles not containg a match are kept. The regular expression should not be quoted. Any characters after the options are considered part of the regular expression, although trailing spaces are trimmed. Regular expressions that start with the "-" character should start with "\-" instead. Example Configuration: feed 1h http://www.mysite.com/myfeed.rdf grep -i dell monitor feed 1h http://www.mysite.com/myfeed2.rdf grep \b[Ii]nteresting\b|\bexciting\b Limitations: Only one regular expression can be specified per feed. """ import rawdoglib.rawdog import rawdoglib.plugins import re __version__ = "1.0" __author__ = "Steve Atwell " __date__ = "$Date: 2005-01-22 21:07:56 -0600 (Sat, 22 Jan 2005) $" class _RECache: def __init__(self): self._cache = {} def compile(self, pattern, flags=0): try: return self._cache[(pattern, flags)] except KeyError: compiled = re.compile(pattern, flags) self._cache[(pattern, flags)] = compiled return compiled cache = _RECache() stripre = re.compile(r'<.*?>|\n') spacere = re.compile(r' +') def grep(rawdog, config, article, ignore): """Handle new articles using the article_seen hook.""" global cache, stripre, spacere ignore.value = False feedargs = rawdog.feeds[article.feed].args if feedargs.has_key("grep"): reflags = re.U + re.S invert = False strip = False # Parse options grepline = feedargs["grep"].strip() while grepline[0] == "-": try: (opt, grepline) = grepline.split(None, 1) except ValueError: raise rawdoglib.rawdog.ConfigError("feedgrep: missing regex for feed %s" % (article.feed,)) for o in opt[1:]: if o == "i": reflags += re.I elif o == "v": invert = True elif o == "s": strip = True else: raise rawdoglig.rawdog.ConfigError("feedgrep: bad option -%s for feed %s" % (o, article.feed)) ignore.value = True grepre = cache.compile(grepline, reflags) # Copy the text we will search so that we can modify # it if the strip option is set text = [] for piece in ["title", "summary"]: if (article.entry_info.has_key(piece)): text.append(article.entry_info[piece]) # Strip text. First replace HTML tags and newlines with # spaces, and then condense multiples spaces into a # single space. if strip: for i in range(len(text)): text[i] = stripre.sub(' ', text[i]) text[i] = spacere.sub(' ', text[i]) for piece in text: if (grepre.search(piece)): ignore.value = False break if invert: ignore.value = not ignore.value # if we decided to ignore this, don't bother processing # it further return not ignore.value return True rawdoglib.plugins.attach_hook("article_seen", grep)