tdi.markup.soup.filters

1 # -*- coding: ascii -*- 2 r""" 3 :Copyright: 4 5 Copyright 2006 - 2015 6 Andr\xe9 Malo or his licensors, as applicable 7 8 :License: 9 10 Licensed under the Apache License, Version 2.0 (the "License"); 11 you may not use this file except in compliance with the License. 12 You may obtain a copy of the License at 13 14 http://www.apache.org/licenses/LICENSE-2.0 15 16 Unless required by applicable law or agreed to in writing, software 17 distributed under the License is distributed on an "AS IS" BASIS, 18 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 See the License for the specific language governing permissions and 20 limitations under the License. 21 22 ===================== 23 Soup Filter Classes 24 ===================== 25 26 Filters for soup templates. 27 """ 28 if __doc__: 29 # pylint: disable = redefined-builtin 30 __doc__ = __doc__.encode('ascii').decode('unicode_escape') 31 __author__ = r"Andr\xe9 Malo".encode('ascii').decode('unicode_escape') 32 __docformat__ = "restructuredtext en" 33 34 import re as _re 35 36 from ... import filters as _filters 37 38

def _make_parse_content_type():
    """
    Make content type parser

    The regular expressions are compiled once in this factory and are
    captured by the returned closure, so parsing a value does not rebuild
    them.

    :Return: parse_content_type
    :Rtype: ``callable``
    """
    # ``unicode`` only exists on Python 2. Resolve the text type once here,
    # so the parser neither depends on a missing builtin on Python 3 nor
    # repeats the lookup on every call.
    try:
        text_type = unicode  # noqa pylint: disable = undefined-variable
    except NameError:
        text_type = str

    # These are a bit more lenient than RFC 2045.
    tokenres = r'[^\000-\040()<>@,;:\\"/[\]?=]+'
    qcontent = r'[^\000\\"]'
    qsres = r'"%(qc)s*(?:\\"%(qc)s*)*"' % {'qc': qcontent}
    valueres = r'(?:%(token)s|%(quoted-string)s)' % {
        'token': tokenres, 'quoted-string': qsres,
    }

    # Whole header: "type/subtype" followed by any number of ";key=value"
    typere = _re.compile(
        r'\s*([^;/\s]+/[^;/\s]+)((?:\s*;\s*%(key)s\s*=\s*%(val)s)*)\s*$' % {
            'key': tokenres, 'val': valueres,
        }
    )
    # A single ";key=value" parameter
    pairre = _re.compile(r'\s*;\s*(%(key)s)\s*=\s*(%(val)s)' % {
        'key': tokenres, 'val': valueres
    })
    # Line breaks inside quoted values are dropped
    stripre = _re.compile(r'\r?\n')

    def _parse_content_type(value):
        """
        Parse a content type

        :Warning: comments are not recognized (yet?)

        :Parameters:
          `value` : ``basestring``
            The value to parse - must be ascii compatible

        :Return: The parsed header (``(value, {key: [value, value, ...]})``)
                 or ``None`` if the value is not ASCII, not a string at all
                 or does not look like a content type. The type and the
                 parameter keys are lowercased.
        :Rtype: ``tuple``
        """
        try:
            if isinstance(value, text_type):
                value.encode('ascii')
            else:
                decoded = value.decode('ascii')
                # The patterns above are native string patterns. On
                # Python 3 a byte string cannot be matched against them,
                # so continue with the decoded text. On Python 2 a byte
                # string *is* the native ``str`` and is left untouched.
                if not isinstance(value, str):
                    value = decoded
        except (AttributeError, UnicodeError):
            return None

        match = typere.match(value)
        if not match:
            return None

        parsed = (match.group(1).lower(), {})
        params = match.group(2)
        if params:
            for key, val in pairre.findall(params):
                if val[:1] == '"':
                    # quoted-string: drop the quotes, remove line breaks
                    # and unescape embedded quotes
                    val = stripre.sub(r'', val[1:-1]).replace(r'\"', '"')
                parsed[1].setdefault(key.lower(), []).append(val)

        return parsed

    return _parse_content_type

_parse_content_type = _make_parse_content_type()

class EncodingDetectFilter(_filters.BaseEventFilter):
    """
    Detect the template encoding and report it to the builder

    The encoding is taken from HTML ``<meta>`` elements and from the XML
    declaration. Every event is forwarded to the builder unchanged.
    """
    __slots__ = ('_normalize', '_meta')

    def __init__(self, builder):
        """
        Initialization

        :Parameters:
          `builder` : builder object
            Passed to the base class; its ``decoder.normalize`` callable
            is used to normalize tag and attribute names
        """
        super(EncodingDetectFilter, self).__init__(builder)
        normalize = self.builder.decoder.normalize
        self._normalize = normalize
        # normalized name of the element we are looking for
        self._meta = normalize('meta')

    @staticmethod
    def _unquote(value):
        """ Drop one surrounding quote pair (and inner padding), if any """
        if value.startswith('"') or value.startswith("'"):
            return value[1:-1].strip()
        return value

    def _meta_encoding(self, attrs):
        """
        Find the encoding announced by a meta element

        :Parameters:
          `attrs` : ``dict``
            Attributes of the meta element, keyed by normalized name

        :Return: The encoding or ``None`` if the element doesn't name one
        :Rtype: ``str``
        """
        normalize = self._normalize

        # <meta charset="...">
        charset = self._unquote(str((attrs.get(normalize('charset')) or '')))
        if charset:
            return charset

        # <meta http-equiv="Content-Type" content="...; charset=...">
        equiv = self._unquote(
            (attrs.get(normalize('http-equiv')) or '').lower()
        )
        if equiv != 'content-type':
            return None

        ctype = attrs.get(normalize('content'))
        if not ctype:
            return None

        parsed = _parse_content_type(self._unquote(ctype))
        if parsed is None:
            return None

        charsets = parsed[1].get('charset')
        if not charsets:
            return None
        return charsets[0].strip()

    def handle_starttag(self, name, attr, closed, data):
        """
        Extract encoding from HTML meta element

        Here are samples for the expected formats::

          <meta charset="utf-8">

          <meta http-equiv="Content-Type" content="text/html; charset=utf-8">

        The event is passed to the builder nevertheless.

        :See: `BuildingListenerInterface`
        """
        normalize = self._normalize

        if normalize(name) == self._meta:
            found = self._meta_encoding(
                dict((normalize(key), val) for key, val in attr)
            )
            if found is not None:
                self.builder.handle_encoding(found)

        self.builder.handle_starttag(name, attr, closed, data)

    #: Regex matcher to match xml declarations
    #:
    #: :Type: ``callable``
    _PI_MATCH = _re.compile(r'''
        <\? \s* [xX][mM][lL] \s+ (?P<attr>
            [^"'?]*
            (?:
                (?:
                    "[^"]*"
                  | '[^']*'
                )
                [^"'?]*
            )*
        )
        \s* \?>$
    ''', _re.X).match

    #: Iterator over the matched xml declaration attributes
    #:
    #: :Type: ``callable``
    _PI_ATT_ITER = _re.compile(r'''
        \s*
        (?P<name>[^\s=]*)   # attribute name
        \s*
        =
        (?P<value>          # value
            \s*"[^"]*"
          | \s*'[^']*'
        )
    ''', _re.X).finditer

    def handle_pi(self, data):
        """
        Extract encoding from xml declaration

        Here's a sample for the expected format::

          <?xml version="1.0" encoding="ascii" ?>

        If the declaration carries no (non-empty) encoding attribute,
        ``utf-8`` is reported. Processing instructions which are not xml
        declarations don't report anything.

        The event is passed to the builder nevertheless.

        :See: `BuildingListenerInterface`
        """
        declaration = self._PI_MATCH(str(data))
        if declaration:
            encoding = 'utf-8'  # xml default
            for item in self._PI_ATT_ITER(declaration.group('attr')):
                key, value = item.group('name', 'value')
                if not (key or value):
                    break
                if key != 'encoding':
                    continue

                # the encoding attribute ends the search, even if empty
                value = self._unquote(value.strip())
                if value:
                    encoding = value
                break
            self.builder.handle_encoding(encoding)

        self.builder.handle_pi(data)

# Optional replacement of the pure Python filter: ask the package's ``c``
# loader for the 'impl' module. ``load`` hands back ``None`` when it has
# nothing to offer; otherwise the module's ``SoupEncodingDetectFilter``
# takes over the public name.
# NOTE(review): presumably this is a C-accelerated drop-in - confirm.
from ... import c
c = c.load('impl')
if c is not None:
    EncodingDetectFilter = c.SoupEncodingDetectFilter  # noqa
# Don't leave the loader handle behind in the module namespace
del c

Source Code for Module tdi.markup.soup.filters