1
2 r"""
3 :Copyright:
4
5 Copyright 2006 - 2015
6 Andr\xe9 Malo or his licensors, as applicable
7
8 :License:
9
10 Licensed under the Apache License, Version 2.0 (the "License");
11 you may not use this file except in compliance with the License.
12 You may obtain a copy of the License at
13
14 http://www.apache.org/licenses/LICENSE-2.0
15
16 Unless required by applicable law or agreed to in writing, software
17 distributed under the License is distributed on an "AS IS" BASIS,
18 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 See the License for the specific language governing permissions and
20 limitations under the License.
21
22 ==============
23 HTML Decoder
24 ==============
25
26 HTML Decoder.
27 """
# The module docstring and the author name are written as raw, ASCII-only
# string literals that contain a literal ``\xe9`` escape sequence. Encoding
# them to ASCII and decoding with ``unicode_escape`` turns that sequence into
# the real character at import time.
if __doc__:
    # Only convert when a docstring is present (``__doc__`` is ``None`` when
    # docstrings are stripped, e.g. under ``python -OO``).
    __doc__ = __doc__.encode('ascii').decode('unicode_escape')
__author__ = r"Andr\xe9 Malo".encode('ascii').decode('unicode_escape')
# Markup language of the docstrings in this module, for documentation tools.
__docformat__ = "restructuredtext en"
33
34 import re as _re
35
36 from . import _htmlentities
37
38
40 """ Make decoder """
41 from . import c
42 c = c.load('impl')
43 if c is not None:
44 return c.htmldecode
45
46 sub = _re.compile(ur'&([^& \t\n\r\f;]*);').sub
47 unicode_, unichr_, str_, int_ = unicode, unichr, str, int
48 isinstance_ = isinstance
49 default_entities = dict(_htmlentities.htmlentities)
50
51 def decode(value, encoding='latin-1', errors='strict', entities=None):
52 """
53 Decode HTML encoded text
54
55 :Parameters:
56 `value` : ``basestring``
57 HTML content to decode
58
59 `encoding` : ``str``
60 Unicode encoding to be applied before value is being processed
61 further. If value is already a unicode instance, the encoding is
62 ignored. If omitted, 'latin-1' is applied (because it can't fail
63 and maps bytes 1:1 to unicode codepoints).
64
65 `errors` : ``str``
66 Error handling, passed to .decode() and evaluated for entities.
67 If the entity name or character codepoint could not be found or
68 not be parsed then the error handler has the following semantics:
69
70 ``strict`` (or anything different from the other tokens below)
71 A ``ValueError`` is raised.
72
73 ``ignore``
74 The original entity is passed through
75
76 ``replace``
77 The character is replaced by the replacement character
78 (U+FFFD)
79
80 `entities` : ``dict``
81 Entity name mapping (unicode(name) -> unicode(value)). If
82 omitted or ``None``, the `HTML5 entity list`_ is applied.
83
84 .. _HTML5 entity list: http://www.w3.org/TR/html5/
85 syntax.html#named-character-references
86
87 :Return: The decoded content
88 :Rtype: ``unicode``
89 """
90
91
92 if not isinstance_(value, unicode_):
93 value = str_(value).decode(encoding, errors)
94 if entities is None:
95 entities = default_entities
96
97 def subber(match):
98 """ Substituter """
99 name = match.group(1)
100 if not name.startswith(u'#'):
101 try:
102 return entities[name]
103 except KeyError:
104 pass
105 else:
106 if name.startswith(u'#x') or name.startswith(u'#X'):
107 base = 16
108 name = name[2:]
109 else:
110 base = 10
111 name = name[1:]
112 try:
113 return unichr_(int_(name, base))
114 except (ValueError, TypeError, OverflowError):
115 pass
116
117 if errors == 'ignore':
118 return match.group(0)
119 elif errors == 'replace':
120 return u'\ufffd'
121 else:
122 raise ValueError(
123 "Unresolved entity %r" % (match.group(0),)
124 )
125
126 return sub(subber, value)
127 return decode
128
# Public entry point. The factory is called once at import time, so the
# choice of implementation it makes is not repeated on every decode call.
decode = _make_decode()
130