"""Encode and decode named and numeric character entities in strings."""

import re

# Named entities recognised by HTML 4 (no "apos").
html4_entities = {
    "lt": "<",
    "gt": ">",
    "amp": "&",
    "quot": '"',
}

# The five predefined XML entities.
xml_entities = {
    "lt": "<",
    "gt": ">",
    "amp": "&",
    "quot": '"',
    "apos": "'",
}

# Anything between "&" and the next ";" is treated as a candidate reference.
_entity_re = re.compile(r"&([^;]*);")


def decode_entities(s, entities=xml_entities):
    """Decode entity references in a string.

    Named references are looked up in *entities* (a mapping of entity name
    to replacement text).  Numeric references are accepted in decimal
    ("&#65;") and hexadecimal ("&#x41;") form.  Any reference that cannot
    be resolved -- unknown name, malformed number, or a code point that
    chr() rejects -- is left in the output unchanged.

    Returns the decoded string.
    """
    def decode(match):
        e = match.group(1)
        if e in entities:
            return entities[e]
        # Numeric reference: "#" followed by at least one more character.
        # startswith() keeps the empty reference "&;" from raising.
        if e.startswith("#") and len(e) > 1:
            try:
                if e[1] == "x":
                    return chr(int(e[2:], 16))
                return chr(int(e[1:]))
            except (ValueError, OverflowError):
                # Not a number, or outside the range chr() supports:
                # fall through and keep the original text.
                pass
        return match.group(0)

    return _entity_re.sub(decode, s)


def encode_entities(s, entities=xml_entities):
    """Encode a string using entity references.

    Every occurrence of a replacement text from *entities* (a mapping of
    entity name to replacement text) is replaced by "&name;".  The string
    is rewritten in a single pass, so the "&" of an inserted reference is
    never encoded a second time.

    Returns the encoded string; *s* is returned unchanged when *entities*
    contains nothing to replace.
    """
    rev = {}
    for name in entities:
        rev[entities[name]] = name

    # Longest first so that a multi-character replacement text wins over a
    # shorter one that is its prefix; empty texts would match everywhere.
    targets = sorted((t for t in rev if t), key=len, reverse=True)
    if not targets:
        return s

    def encode(match):
        return "&" + rev[match.group(0)] + ";"

    # Escape each target so that characters special in regexps are safe.
    change_re = re.compile("|".join(re.escape(t) for t in targets))
    return change_re.sub(encode, s)


if __name__ == "__main__":
    print(decode_entities(
        """Test: stock names: < > & " ' decimal: A B C hex: A B O """
        """unknown names: &unknown; &fish; unknown decimal: fish; """
        """unknown hex: ish; incomplete: & &fish with random; semicolon"""
    ))
    s = "\nFish > \"big\" wombats' elephants\n"
    print(encode_entities(s))
    assert decode_entities(encode_entities(s)) == s