Package tdi :: Module _htmldecode
[frames | no frames]

Source Code for Module tdi._htmldecode

  1  # -*- coding: ascii -*- 
  2  r""" 
  3  :Copyright: 
  4   
  5   Copyright 2006 - 2015 
  6   Andr\xe9 Malo or his licensors, as applicable 
  7   
  8  :License: 
  9   
 10   Licensed under the Apache License, Version 2.0 (the "License"); 
 11   you may not use this file except in compliance with the License. 
 12   You may obtain a copy of the License at 
 13   
 14       http://www.apache.org/licenses/LICENSE-2.0 
 15   
 16   Unless required by applicable law or agreed to in writing, software 
 17   distributed under the License is distributed on an "AS IS" BASIS, 
 18   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
 19   See the License for the specific language governing permissions and 
 20   limitations under the License. 
 21   
 22  ============== 
 23   HTML Decoder 
 24  ============== 
 25   
 26  HTML Decoder. 
 27  """ 
 28  if __doc__:  # only when a module docstring is actually present
 29      # pylint: disable = redefined-builtin 
 30      __doc__ = __doc__.encode('ascii').decode('unicode_escape')  # expand the raw "\xe9" escape of the ASCII-only source into the real character
 31  __author__ = r"Andr\xe9 Malo".encode('ascii').decode('unicode_escape')  # same escape expansion as for __doc__
 32  __docformat__ = "restructuredtext en"  # declares the markup used by the docstrings in this module
 33   
 34  import re as _re 
 35   
 36  from . import _htmlentities 
 37   
 38   
39 -def _make_decode():
40 """ Make decoder """ 41 from . import c 42 c = c.load('impl') 43 if c is not None: 44 return c.htmldecode 45 46 sub = _re.compile(ur'&([^& \t\n\r\f;]*);').sub 47 unicode_, unichr_, str_, int_ = unicode, unichr, str, int 48 isinstance_ = isinstance 49 default_entities = dict(_htmlentities.htmlentities) 50 51 def decode(value, encoding='latin-1', errors='strict', entities=None): 52 """ 53 Decode HTML encoded text 54 55 :Parameters: 56 `value` : ``basestring`` 57 HTML content to decode 58 59 `encoding` : ``str`` 60 Unicode encoding to be applied before value is being processed 61 further. If value is already a unicode instance, the encoding is 62 ignored. If omitted, 'latin-1' is applied (because it can't fail 63 and maps bytes 1:1 to unicode codepoints). 64 65 `errors` : ``str`` 66 Error handling, passed to .decode() and evaluated for entities. 67 If the entity name or character codepoint could not be found or 68 not be parsed then the error handler has the following semantics: 69 70 ``strict`` (or anything different from the other tokens below) 71 A ``ValueError`` is raised. 72 73 ``ignore`` 74 The original entity is passed through 75 76 ``replace`` 77 The character is replaced by the replacement character 78 (U+FFFD) 79 80 `entities` : ``dict`` 81 Entity name mapping (unicode(name) -> unicode(value)). If 82 omitted or ``None``, the `HTML5 entity list`_ is applied. 83 84 .. 
_HTML5 entity list: http://www.w3.org/TR/html5/ 85 syntax.html#named-character-references 86 87 :Return: The decoded content 88 :Rtype: ``unicode`` 89 """ 90 # pylint: disable = redefined-outer-name 91 92 if not isinstance_(value, unicode_): 93 value = str_(value).decode(encoding, errors) 94 if entities is None: 95 entities = default_entities 96 97 def subber(match): 98 """ Substituter """ 99 name = match.group(1) 100 if not name.startswith(u'#'): 101 try: 102 return entities[name] 103 except KeyError: 104 pass 105 else: 106 if name.startswith(u'#x') or name.startswith(u'#X'): 107 base = 16 108 name = name[2:] 109 else: 110 base = 10 111 name = name[1:] 112 try: 113 return unichr_(int_(name, base)) 114 except (ValueError, TypeError, OverflowError): 115 pass 116 117 if errors == 'ignore': 118 return match.group(0) 119 elif errors == 'replace': 120 return u'\ufffd' 121 else: 122 raise ValueError( 123 "Unresolved entity %r" % (match.group(0),) 124 )
125 126 return sub(subber, value) 127 return decode 128 129 decode = _make_decode() 130