1
2 r"""
3 :Copyright:
4
5 Copyright 2006 - 2015
6 Andr\xe9 Malo or his licensors, as applicable
7
8 :License:
9
10 Licensed under the Apache License, Version 2.0 (the "License");
11 you may not use this file except in compliance with the License.
12 You may obtain a copy of the License at
13
14 http://www.apache.org/licenses/LICENSE-2.0
15
16 Unless required by applicable law or agreed to in writing, software
17 distributed under the License is distributed on an "AS IS" BASIS,
18 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 See the License for the specific language governing permissions and
20 limitations under the License.
21
22 ============
23 HTML Tools
24 ============
25
26 HTML Tools.
27 """
28 if __doc__:
29
30 __doc__ = __doc__.encode('ascii').decode('unicode_escape')
31 __author__ = r"Andr\xe9 Malo".encode('ascii').decode('unicode_escape')
32 __docformat__ = "restructuredtext en"
33 __all__ = [
34 'decode', 'entities', 'class_add', 'class_del', 'multiline',
35 'CommentStripFilter', 'MinifyFilter', 'minify'
36 ]
37
38 import codecs as _codecs
39 import re as _re
40 try:
41 import cStringIO as _string_io
42 except ImportError:
43 import StringIO as _string_io
44
45 from .._exceptions import LexerError
46 from .. import factory as _factory
47 from .. import filters as _filters
48 from .. import interfaces as _interfaces
49 from ..markup.soup import dtd as _dtd
50 from ..markup.soup import encoder as _encoder
51 from ..markup.soup import decoder as _decoder
52 from ..markup.soup import parser as _parser
53 from . import css as _css
54 from . import javascript as _javascript
55 from .._htmldecode import decode
56 from .._htmlentities import htmlentities as entities
57
58
59
60
61
62
63
64
65
66 entities = dict(entities)
67
68
70 """
71 Add class(es) to a node's class attribute
72
73 :Parameters:
74 `node` : TDI node
75 The node to modify
76
77 `class_` : ``tuple``
78 Class name(s) to add
79 """
80 try:
81 old = decode(node[u'class'], node.raw.encoder.encoding).split()
82 except KeyError:
83 class_ = u' '.join(class_)
84 else:
85 class_ = u' '.join(old + list(class_))
86 if class_:
87 node[u'class'] = class_
88 else:
89 del node[u'class']
90
91
93 """
94 Remove class(es) from node's class attribute
95
96 :Parameters:
97 `node` : TDI node
98 The node to modify
99
100 `class_` : ``tuple``
101 Class name(s) to remove. It is *not* an error if a class is not
102 defined before.
103 """
104 try:
105 old = decode(node[u'class'], node.raw.encoder.encoding).split()
106 except KeyError:
107 pass
108 else:
109 class_ = u' '.join([item for item in old if item not in class_])
110 if class_:
111 node[u'class'] = class_
112 else:
113 del node[u'class']
114
115
117 """ Make multiline encoder """
118
119 divmod_, len_ = divmod, len
120
def space_func(match):
    """
    Space filler

    Turn a run of whitespace into an equally wide sequence which is not
    collapsed by HTML renderers. A single whitespace character stays a
    plain space; longer runs are built from ``&nbsp;`` + space pairs
    (preceded by one extra ``&nbsp;`` for odd lengths), so line breaking
    remains possible between the pairs.

    :Parameters:
      `match` : regex match
        Match object; only ``match.group(0)`` (the whitespace run) is
        evaluated

    :Return: The replacement text
    :Rtype: ``unicode``
    """
    pairs, odd = divmod(len(match.group(0)), 2)
    if pairs == 0:
        # run of length 1: nothing to protect
        return u' '
    # The entity is spelled out as ASCII text on purpose: a literal
    # no-break space would be a non-ASCII character in the source file.
    return u'&nbsp;' * odd + u'&nbsp; ' * pairs
127 ws_sub = _re.compile(ur'\s+').sub
128 ws1_sub = _re.compile(ur'^\s(\S)').sub
129
def multiline(content, encoding='ascii', tabwidth=8, xhtml=True):
    """
    Encode multiline content to HTML, assignable to ``node.raw.content``

    The markup characters ``&``, ``<`` and ``>`` are escaped, trailing
    whitespace is removed from every line, whitespace runs are made
    visible by the surrounding ``space_func`` / ``ws_sub`` helpers and the
    lines are joined by line break elements.

    :Parameters:
      `content` : ``unicode``
        Content to encode

      `encoding` : ``str``
        Target encoding

      `tabwidth` : ``int``
        Tab width? Used to expand tabs. If ``None``, tabs are not
        expanded.

      `xhtml` : ``bool``
        XHTML? Only used to determine if <br> or <br /> is emitted.

    :Return: The multilined content
    :Rtype: ``str``
    """
    # Escape first (ampersand before the others), so that the entities
    # inserted further down are not escaped a second time.
    content = (
        content
        .replace(u'&', u'&amp;')
        .replace(u'<', u'&lt;')
        .replace(u'>', u'&gt;')
    )
    lines = []
    for line in content.splitlines():
        line = line.rstrip()
        if not line:
            # keep empty lines visible
            line = u'&nbsp;'
        else:
            if tabwidth is not None:
                line = line.expandtabs(tabwidth)
            # A single leading whitespace character would be dropped by
            # the renderer; u'...\\1' equals the raw form ur'...\1'.
            line = ws1_sub(u'&nbsp;\\1', line)
            line = ws_sub(space_func, line)
        lines.append(line)
    if xhtml:
        res = u'<br />'.join(lines)
    else:
        res = u'<br>'.join(lines)
    # characters not representable in `encoding` become numeric references
    return res.encode(encoding, 'xmlcharrefreplace')
175
176 return multiline
177
178 multiline = _make_multiline()
179
180
187
188
190 """
191 Strip unneeded whitespace and comments
192
193 :IVariables:
194 `_buffer` : ``list``
195 Current text buffer
196
197 `_stack` : ``list``
198 Current tag stack
199
200 `_last` : ``str``
201 Last seen endtag name (normalized) or ``None``
202
203 `_blocks` : ``dict``
204 List of block elements (in a dict for better lookup)
205 """
206
def __init__(self, builder, comment_filter=None):
    """
    Set up the filter state

    :Parameters:
      `builder` : `BuildingListenerInterface`
        The builder receiving the filtered events.

      `comment_filter` : callable
        Callable deciding what happens to a comment. It is given the
        comment data and returns either the (possibly modified) comment,
        which is then handed to the builder, or ``None`` in order to
        drop the comment. Example::

            def keep_ad_comments(data):
                if 'google_ad_section' in data:
                    return data
                return None

        Without a filter (``None``) every comment is dropped.
    """
    super(MinifyFilter, self).__init__(builder)

    # text collected since the last flush, open tags, last closed tag
    self._buffer = []
    self._stack = []
    self._last = None

    self._dtd = _dtd.HTMLDTD()
    self._normalize = self.builder.decoder.normalize
    self._comment_filter = (
        comment_filter if comment_filter is not None else (lambda x: None)
    )

    # Only the keys matter; the dict is used for membership tests.
    block_names = (
        'address', 'article', 'aside', 'blockquote', 'body', 'caption',
        'col', 'colgroup', 'dd', 'dir', 'div', 'dl', 'dt', 'fieldset',
        'figcaption', 'figure', 'footer', 'form', 'frame', 'frameset',
        'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'head', 'header', 'hgroup',
        'hr', 'html', 'isindex', 'layer', 'li', 'listing', 'map',
        'marquee', 'menu', 'multicol', 'nav', 'noframes', 'ol', 'option',
        'p', 'script', 'style', 'section', 'table', 'tbody', 'td',
        'title', 'tfoot', 'th', 'thead', 'tr', 'ul', 'xmp',
    )
    self._blocks = dict.fromkeys(block_names)
296
297
298
299
300 _WS_SUB = _re.compile(r'\s+').sub
301
302 - def _flush(self, endtag=False, starttag=None):
303 """
304 Flush the current text buffer to the builder
305
306 :Parameters:
307 `endtag` : ``bool``
308 Endtag flush?
309
310 `starttag` : ``str``
311 Next starttag (normalized) if starttag flush
312 """
313 if self._buffer:
314 self._buffer, buf, stack = [], ''.join(self._buffer), self._stack
315 if stack and \
316 (self._dtd.cdata(stack[-1]) or stack[-1] == 'pre'):
317 if stack[-1] == 'pre':
318 buf = [
319 line.rstrip()
320 for line in buf.rstrip().splitlines(False)
321 ]
322 elif stack[-1] in ('script', 'style'):
323 buf = buf.strip().splitlines(False)
324 else:
325 buf = buf.splitlines(False)
326 buf = '\n'.join(buf)
327 else:
328 buf = self._WS_SUB(' ', buf)
329 if self._last in self._blocks:
330 buf = buf.lstrip()
331 if (endtag and stack and stack[-1] in self._blocks) \
332 or starttag in self._blocks:
333 buf = buf.rstrip()
334 self.builder.handle_text(buf)
335
337 """
338 Flush the last chunk
339
340 :See: `tdi.interfaces.BuilderInterface`
341 """
342 self._flush(starttag=self._blocks.keys()[0])
343 return self.builder.finalize()
344
def handle_text(self, data):
    """
    Collect the text chunk in the buffer instead of passing it on

    :See: `tdi.interfaces.ListenerInterface`
    """
    pending = self._buffer
    pending.append(data)
352
354 """ :See: `tdi.interfaces.ListenerInterface` """
355 norm = self._normalize
356 norm_name = norm(name)
357 self._flush(False, norm_name)
358 if not closed:
359 self._stack.append(norm_name)
360 newattr = [(norm(key), value) for key, value in attr]
361 newattr.sort()
362 data = self.encoder.starttag(
363 norm_name, newattr, closed
364 )
365 self.builder.handle_starttag(norm_name, attr, closed, data)
366
368 """ :See: `tdi.interfaces.ListenerInterface` """
369 self._flush(True)
370 norm_name, stack = self._normalize(name), self._stack
371 if stack and norm_name == stack[-1]:
372 self._last = stack.pop()
373 if data:
374 data = self.encoder.endtag(norm_name)
375 self.builder.handle_endtag(norm_name, data)
376
382
387
389 """ :See: `tdi.interfaces.ListenerInterface` """
390 self._flush()
391 self.builder.handle_decl(name, value, data)
392
394 """ :See: `tdi.interfaces.ListenerInterface` """
395 self._flush()
396 self.builder.handle_pi(data)
397
398
def minify(html, encoding='ascii', fail_silently=False, comment_filter=None,
           cdata_containers=False):
    """
    Minify HTML

    Enclosed <script> and <style> blocks are minified as well.

    :Parameters:
      `html` : ``basestring``
        HTML to minify

      `encoding` : ``str``
        Initially assumed encoding. Only marginally interesting.

      `fail_silently` : ``bool``
        Swallow parse errors? If true, a `LexerError` raised while
        processing is suppressed and the input html is returned
        unmodified. Otherwise (the default) the error is passed on to
        the caller.

      `comment_filter` : callable
        HTML Comment filter. A function which takes the comment data and
        returns a filtered comment (which is passed through to the
        builder) or ``None`` (meaning the comment can be stripped
        completely). For example::

            def keep_ad_comments(data):
                if 'google_ad_section' in data:
                    return data
                return None

        If omitted or ``None``, all HTML comments are stripped.

      `cdata_containers` : ``bool``
        Add CDATA containers to enclosed <script> or <style> content? If
        true, these containers are added after minimization of the
        content. Default is false.

    :Return: the minified HTML - typed as input
    :Rtype: ``basestring``

    :Exceptions:
      - `LexerError` : The input could not be parsed and `fail_silently`
        is false
    """
    def js_minify(builder):
        """ Javascript minifier filter factory """
        return _javascript.MinifyFilter(builder, standalone=True)

    def js_cdata(builder):
        """ Javascript cdata container filter factory """
        return _javascript.CDATAFilter(builder, standalone=True)

    def css_minify(builder):
        """ CSS minifier filter factory """
        return _css.MinifyFilter(builder, standalone=True)

    def css_cdata(builder):
        """ CSS cdata container filter factory """
        return _css.CDATAFilter(builder, standalone=True)

    def html_minify(builder):
        """ HTML minifier filter factory """
        return MinifyFilter(builder, comment_filter=comment_filter)

    # The optional CDATA filters are listed before the minifiers, exactly
    # like in the list that was concatenated here before.
    if cdata_containers:
        filters = [js_cdata, css_cdata]
    else:
        filters = []
    filters.extend([js_minify, css_minify, html_minify])

    # The stream is processed as bytes; unicode input takes a UTF-8 round
    # trip so the result has the same type as the input.
    isuni = isinstance(html, unicode)
    if isuni:
        html = html.encode('utf-8')
    try:
        result = _factory.Loader(
            builder=_StringBuilder,
            parser=_parser.SoupParser.html,
            encoder=_encoder.SoupEncoder,
            decoder=_decoder.HTMLDecoder,
            eventfilters=filters,
        )(_string_io.StringIO(html), '<string>', encoding)
    except LexerError:
        if not fail_silently:
            raise
        # best effort: give back what we got
        result = html
    if isuni:
        return result.decode('utf-8')
    return result
481
482
484 """ String builder """
485 __implements__ = [_interfaces.BuilderInterface,
486 _interfaces.BuildingListenerInterface]
487
488 encoding = 'ascii'
489
491 """
492 Initialization
493
494 :Parameters:
495 `encoder` : ``callable``
496 Encoder factory
497
498 `decoder` : ``callable``
499 Decoder factory
500 """
501 self._result = []
502 self.encoder = encoder(self.encoding)
503 self.decoder = decoder(self.encoding)
504
def handle_text(self, data):
    """
    Record the text chunk as part of the result

    :see: `ListenerInterface`
    """
    chunks = self._result
    chunks.append(data)
508
510 """ :see: `ListenerInterface` """
511
512 self._result.append(data)
513
515 """ :see: `ListenerInterface` """
516
517 self._result.append(data)
518
520 """ :see: `ListenerInterface` """
521
522 self._result.append(data)
523
527
529 """ :see: `ListenerInterface` """
530
531 self._result.append(data)
532
534 """ :see: `ListenerInterface` """
535
536 self._result.append(data)
537
539 """ :see: `ListenerInterface` """
540 self._result.append(data)
541
553
555 """ :See: `tdi.interfaces.BuilderInterface` """
556 return ''.join(self._result)
557