1
2 r"""
3 :Copyright:
4
5 Copyright 2006 - 2015
6 Andr\xe9 Malo or his licensors, as applicable
7
8 :License:
9
10 Licensed under the Apache License, Version 2.0 (the "License");
11 you may not use this file except in compliance with the License.
12 You may obtain a copy of the License at
13
14 http://www.apache.org/licenses/LICENSE-2.0
15
16 Unless required by applicable law or agreed to in writing, software
17 distributed under the License is distributed on an "AS IS" BASIS,
18 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 See the License for the specific language governing permissions and
20 limitations under the License.
21
22 =====================
23 Soup Filter Classes
24 =====================
25
26 Filters for soup templates.
27 """
if __doc__:
    # The module docstring is a raw string that spells non-ASCII characters
    # as backslash escapes (the author's name contains ``\xe9``). Resolve
    # the escapes here, so the source file itself can stay pure ASCII.
    # The guard skips the conversion when there is no docstring to convert.
    __doc__ = __doc__.encode('ascii').decode('unicode_escape')
# Same escape handling for the author's name.
__author__ = r"Andr\xe9 Malo".encode('ascii').decode('unicode_escape')
# Markup flavour used by the docstrings in this module.
__docformat__ = "restructuredtext en"
33
34 import re as _re
35
36 from ... import filters as _filters
37
38
def _make_parse_content_type():
    """
    Make content type parser

    All patterns are compiled once, here, and kept in the closure of the
    returned function.

    :Return: parse_content_type
    :Rtype: ``callable``
    """
    # ``unicode`` only exists on Python 2. Referencing it unconditionally
    # inside the parser raised an (uncaught) ``NameError`` on Python 3 for
    # every single call, so the text type is resolved once up front.
    try:
        text_type = unicode  # pylint: disable = undefined-variable
    except NameError:
        text_type = str
    # True where ``str`` is the text type (Python 3). Byte input has to be
    # decoded there, because the text patterns below cannot be applied to
    # bytes. On Python 2 the input is matched as passed in, as before.
    decode_bytes = text_type is str

    # token: anything but controls, space and the separator characters
    # (cf. the MIME ``tspecials``)
    tokenres = r'[^\000-\040()<>@,;:\\"/[\]?=]+'
    # quoted-string content: no NUL, no backslash, no double quote
    qcontent = r'[^\000\\"]'
    # quoted-string: the only escape sequence accepted inside is ``\"``
    qsres = r'"%(qc)s*(?:\\"%(qc)s*)*"' % {'qc': qcontent}
    valueres = r'(?:%(token)s|%(quoted-string)s)' % {
        'token': tokenres, 'quoted-string': qsres,
    }

    # Complete header: group 1 is ``type/subtype``, group 2 is the raw
    # (possibly empty) run of ``; key=value`` parameters.
    typere = _re.compile(
        r'\s*([^;/\s]+/[^;/\s]+)((?:\s*;\s*%(key)s\s*=\s*%(val)s)*)\s*$' % {
            'key': tokenres, 'val': valueres,
        }
    )
    # Single parameter: group 1 is the key, group 2 the value (still quoted)
    pairre = _re.compile(r'\s*;\s*(%(key)s)\s*=\s*(%(val)s)' % {
        'key': tokenres, 'val': valueres
    })
    # Line breaks are dropped from quoted values
    stripre = _re.compile(r'\r?\n')

    def _parse_content_type(value):
        """
        Parse a content type

        :Warning: comments are not recognized (yet?)

        :Parameters:
          `value` : ``basestring``
            The value to parse - must be ascii compatible. Text and byte
            strings are both accepted.

        :Return: The parsed header (``(value, {key, [value, value, ...]})``)
                 or ``None``. The media type and the parameter keys are
                 lowercased; parameter values are unquoted.
        :Rtype: ``tuple``
        """
        try:
            if isinstance(value, text_type):
                value.encode('ascii')
            else:
                # Not a string at all -> AttributeError -> ``None``
                decoded = value.decode('ascii')
                if decode_bytes:
                    value = decoded
        except (AttributeError, UnicodeError):
            return None

        match = typere.match(value)
        if not match:
            return None

        ctype, params = match.group(1).lower(), {}
        rest = match.group(2)
        if rest:
            for key, val in pairre.findall(rest):
                if val[:1] == '"':
                    # Quoted: drop the quotes, remove line breaks and
                    # resolve escaped quotes.
                    val = stripre.sub(r'', val[1:-1]).replace(r'\"', '"')
                # Repeated parameters accumulate in order of appearance
                params.setdefault(key.lower(), []).append(val)

        return ctype, params

    return _parse_content_type

_parse_content_type = _make_parse_content_type()
103
104
106 """ Extract template encoding and pass it properly to the builder """
107 __slots__ = ('_normalize', '_meta')
108
114
def handle_starttag(self, name, attr, closed, data):
    """
    Detect the encoding announced by an HTML meta element

    These are the notations looked for::

        <meta charset="utf-8"> <!-- HTML5 -->

        <meta http-equiv="Content-Type" content="text/html; charset=utf-8">

    A non-empty ``charset`` attribute takes precedence; the ``http-equiv``
    notation is only evaluated otherwise. Whatever is found is handed to
    the builder's ``handle_encoding``.

    The start tag event itself is always forwarded to the builder
    afterwards, with the arguments exactly as received.

    :Parameters:
      `name`
        The tag name; compared in normalized form against the normalized
        ``meta``
      `attr`
        Iterable of ``(key, value)`` pairs; the values may still carry
        their quotes
      `closed`
        Passed through to the builder
      `data`
        Passed through to the builder

    :See: `BuildingListenerInterface`
    """
    norm = self._normalize
    quotes = ('"', "'")

    if norm(name) == self._meta:
        attrs = dict((norm(key), val) for key, val in attr)

        charset = str(attrs.get(norm('charset')) or '')
        if charset.startswith(quotes):
            charset = charset[1:-1].strip()

        if not charset:
            # No usable charset attribute - try the http-equiv notation
            equiv = (attrs.get(norm('http-equiv')) or '').lower()
            if equiv.startswith(quotes):
                equiv = equiv[1:-1].strip()

            content = None
            if equiv == 'content-type':
                content = attrs.get(norm('content'))

            if content:
                if content.startswith(quotes):
                    content = content[1:-1].strip()

                parsed = _parse_content_type(content)
                if parsed is not None:
                    charsets = parsed[1].get('charset')
                    if charsets:
                        # The first charset parameter wins
                        self.builder.handle_encoding(charsets[0].strip())
        else:
            self.builder.handle_encoding(charset)

    self.builder.handle_starttag(name, attr, closed, data)
158
159
160
161
#: Matcher for an XML declaration. Anchored at the start of the string
#: (it is the pattern's ``match`` method) and, by ``$``, at its end.
#: After ``<?xml`` (any letter case) and mandatory whitespace, the ``attr``
#: group captures everything up to the closing ``?>``. Quoted sections are
#: consumed as a whole, so a ``?`` inside a quoted value does not end the
#: declaration.
#:
#: :Type: ``callable``
_PI_MATCH = _re.compile(r'''
<\? \s* [xX][mM][lL] \s+ (?P<attr>
[^"'?]*
(?:
(?:
"[^"]*"
| '[^']*'
)
[^"'?]*
)*
)
\s* \?>$
''', _re.X).match
175
176
177
178
#: Iterator over the ``name = "value"`` pairs of an XML declaration (it is
#: the pattern's ``finditer`` method). The ``name`` group may be empty. The
#: ``value`` group still contains the enclosing quotes and any whitespace
#: in front of them, so it is never empty for a match.
#:
#: :Type: ``callable``
_PI_ATT_ITER = _re.compile(r'''
\s*
(?P<name>[^\s=]*) # attribute name
\s*
=
(?P<value> # value
\s*"[^"]*"
| \s*'[^']*'
)
''', _re.X).finditer
189
def handle_pi(self, data):
    """
    Extract encoding from xml declaration

    Here's a sample for the expected format::

        <?xml version="1.0" encoding="ascii" ?>

    If `data` is an XML declaration, the builder's ``handle_encoding`` is
    called with the first non-empty ``encoding`` value found in it, or
    with ``utf-8`` if there is none. Any other processing instruction
    leaves the encoding alone.

    The event is passed to the builder nevertheless.

    :Parameters:
      `data`
        The processing instruction; matched in its ``str()`` form and
        forwarded to the builder unchanged

    :See: `BuildingListenerInterface`
    """
    declaration = self._PI_MATCH(str(data))
    if declaration:
        encoding = 'utf-8'
        for pair in self._PI_ATT_ITER(declaration.group('attr')):
            # Every match carries a non-empty (quoted) value, so there is
            # no "empty pair" case to bail out on - just skip all
            # attributes but ``encoding``.
            if pair.group('name') != 'encoding':
                continue

            value = pair.group('value').strip()
            if value.startswith(('"', "'")):
                value = value[1:-1].strip()
            if value:
                encoding = value
                break
            # An empty encoding value is ignored; keep looking.
        self.builder.handle_encoding(encoding)
    self.builder.handle_pi(data)
219
# Optional override: if ``c.load('impl')`` yields something other than
# ``None`` (presumably the compiled implementation - confirm against the
# ``c`` module), its ``SoupEncodingDetectFilter`` replaces the
# ``EncodingDetectFilter`` name bound earlier in this module. Otherwise
# that earlier binding stays in effect.
from ... import c
c = c.load('impl')
if c is not None:
    EncodingDetectFilter = c.SoupEncodingDetectFilter
# Remove the temporary name from the module namespace again.
del c
225