tdi._htmldecode

""" Make decoder """
# NOTE(review): this excerpt is the interior of a factory function. The
# enclosing ``def`` line precedes the visible text and the excerpt stops right
# after the nested ``subber`` helper, so the statements that actually use
# ``sub`` and ``subber`` are not shown here -- confirm against the full module.
from . import c
# Prefer the implementation returned by ``c.load('impl')`` when there is one;
# the code after this early return is only reached when it yields ``None``.
c = c.load('impl')
if c is not None:
    return c.htmldecode

# Bound ``sub`` method of the entity pattern: ``&``, then a (possibly empty)
# run of characters other than ``&``, space, tab, LF, CR, FF and ``;``
# (captured as group 1), then the terminating ``;``.
sub = _re.compile(ur'&([^& \t\n\r\f;]*);').sub
# Builtins bound to local names so the nested functions reach them as closure
# variables (presumably a lookup-speed measure -- TODO confirm).
unicode_, unichr_, str_, int_ = unicode, unichr, str, int
isinstance_ = isinstance
# Fresh dict built from ``_htmlentities.htmlentities``; used whenever the
# caller does not supply an ``entities`` mapping.
default_entities = dict(_htmlentities.htmlentities)

def decode(value, encoding='latin-1', errors='strict', entities=None):
    """
    Decode HTML encoded text

    :Parameters:
      `value` : ``basestring``
        HTML content to decode. Anything that is not already a unicode
        instance is passed through ``str()`` and decoded using `encoding`
        and `errors`.

      `encoding` : ``str``
        Encoding applied to `value` before it is processed further. If
        value is already a unicode instance, the encoding is ignored.
        If omitted, 'latin-1' is applied (because it can't fail and maps
        bytes 1:1 to unicode codepoints).

      `errors` : ``str``
        Error handling, passed to .decode() and evaluated for entities.
        If the entity name or character codepoint could not be found or
        not be parsed then the error handler has the following semantics:

        ``strict`` (or anything different from the other tokens below)
            A ``ValueError`` is raised.

        ``ignore``
            The original entity is passed through

        ``replace``
            The character is replaced by the replacement character
            (U+FFFD)

      `entities` : ``dict``
        Entity name mapping (unicode(name) -> unicode(value)). If
        omitted or ``None``, the `HTML5 entity list`_ is applied.

    .. _HTML5 entity list: http://www.w3.org/TR/html5/
        syntax.html#named-character-references

    :Return: The decoded content
    :Rtype: ``unicode``
    """
    # pylint: disable = redefined-outer-name

    # Work on unicode only: byte input is decoded up front, and the same
    # ``errors`` token later drives the handling of unresolved entities.
    if not isinstance_(value, unicode_):
        value = str_(value).decode(encoding, errors)
    if entities is None:
        entities = default_entities

    def subber(match):
        """
        Resolve one matched ``&...;`` reference

        :Parameters:
          `match` : regex match object
            Match of the entity pattern; group 1 is the text between
            ``&`` and ``;``, group 0 the complete reference.

        :Return: The replacement text: the mapped entity value, the
                 character for a numeric reference, the untouched
                 reference (``errors == 'ignore'``) or U+FFFD
                 (``errors == 'replace'``)
        :Rtype: ``unicode``

        :Exceptions:
          - `ValueError` : The reference could not be resolved and
            `errors` is neither ``ignore`` nor ``replace``
        """
        name = match.group(1)
        if not name.startswith(u'#'):
            # Named reference: plain lookup; a miss falls through to the
            # shared error handling at the bottom.
            try:
                return entities[name]
            except KeyError:
                pass
        else:
            # Numeric reference: ``#x``/``#X`` selects hexadecimal,
            # anything else after ``#`` is parsed as decimal.
            if name.startswith(u'#x') or name.startswith(u'#X'):
                base = 16
                name = name[2:]
            else:
                base = 10
                name = name[1:]
            # NOTE(review): ``int()`` also accepts a leading sign and, for
            # base 16, a ``0x`` prefix, so e.g. ``&#+65;`` resolves here --
            # confirm whether that leniency is intended.
            try:
                return unichr_(int_(name, base))
            except (ValueError, TypeError, OverflowError):
                # Unparsable number or a codepoint unichr() rejects:
                # treat it like an unknown entity.
                pass

        # Only unresolved references get here.
        if errors == 'ignore':
            return match.group(0)
        elif errors == 'replace':
            return u'\ufffd'
        else:
            raise ValueError(
                "Unresolved entity %r" % (match.group(0),)
            )

Source Code for Module tdi._htmldecode