tdi.tools.html

1 # -*- coding: ascii -*- 2 r""" 3 :Copyright: 4 5 Copyright 2006 - 2015 6 Andr\xe9 Malo or his licensors, as applicable 7 8 :License: 9 10 Licensed under the Apache License, Version 2.0 (the "License"); 11 you may not use this file except in compliance with the License. 12 You may obtain a copy of the License at 13 14 http://www.apache.org/licenses/LICENSE-2.0 15 16 Unless required by applicable law or agreed to in writing, software 17 distributed under the License is distributed on an "AS IS" BASIS, 18 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 19 See the License for the specific language governing permissions and 20 limitations under the License. 21 22 ============ 23 HTML Tools 24 ============ 25 26 HTML Tools. 27 """ 28 if __doc__: 29 # pylint: disable = redefined-builtin 30 __doc__ = __doc__.encode('ascii').decode('unicode_escape') 31 __author__ = r"Andr\xe9 Malo".encode('ascii').decode('unicode_escape') 32 __docformat__ = "restructuredtext en" 33 __all__ = [ 34 'decode', 'entities', 'class_add', 'class_del', 'multiline', 35 'CommentStripFilter', 'MinifyFilter', 'minify' 36 ] 37 38 import codecs as _codecs 39 import re as _re 40 try: 41 import cStringIO as _string_io 42 except ImportError: 43 import StringIO as _string_io 44 45 from .._exceptions import LexerError 46 from .. import factory as _factory 47 from .. import filters as _filters 48 from .. import interfaces as _interfaces 49 from ..markup.soup import dtd as _dtd 50 from ..markup.soup import encoder as _encoder 51 from ..markup.soup import decoder as _decoder 52 from ..markup.soup import parser as _parser 53 from . import css as _css 54 from . import javascript as _javascript 55 from .._htmldecode import decode 56 from .._htmlentities import htmlentities as entities 57 58 59 #: HTML named character references, generated from 60 #: `the HTML5 spec`_\. 61 #: 62 #: .. 
_the HTML5 spec: http://www.w3.org/TR/html5/ 63 #: syntax.html#named-character-references 64 #: 65 #: :Type: ``dict`` 66 entities = dict(entities) 67 68

def class_add(node, *class_):
    """
    Append class name(s) to a node's ``class`` attribute

    The current attribute value (if there is one) is decoded using the
    encoding of the node's encoder and split on whitespace. The passed names
    are appended behind the existing ones. If the resulting value is empty,
    the attribute is deleted instead of being assigned.

    :Parameters:
      `node` : TDI node
        The node to modify

      `class_` : ``tuple``
        Class name(s) to add
    """
    try:
        existing = decode(node[u'class'], node.raw.encoder.encoding).split()
    except KeyError:
        # No class attribute yet: start from scratch
        existing = []

    value = u' '.join(existing + list(class_))
    if not value:
        del node[u'class']
        return
    node[u'class'] = value

90 91

def class_del(node, *class_):
    """
    Drop class name(s) from a node's ``class`` attribute

    If the node has no ``class`` attribute, nothing happens. Otherwise the
    attribute value is decoded using the encoding of the node's encoder,
    split on whitespace and every name listed in `class_` is filtered out.
    If no name is left afterwards, the attribute is deleted.

    :Parameters:
      `node` : TDI node
        The node to modify

      `class_` : ``tuple``
        Class name(s) to remove. It is *not* an error if a class is not
        defined before.
    """
    try:
        existing = decode(node[u'class'], node.raw.encoder.encoding).split()
    except KeyError:
        # Nothing to remove from
        return

    remaining = u' '.join(name for name in existing if name not in class_)
    if not remaining:
        del node[u'class']
        return
    node[u'class'] = remaining

114 115

def _make_multiline():
    """
    Make multiline encoder

    The regex substitutors and the space filler are created once here and
    shared by the returned function through its closure.

    :Return: The ``multiline`` function
    :Rtype: ``callable``
    """
    divmod_, len_ = divmod, len

    def space_func(match):
        """
        Space filler

        Replaces a whitespace run by an equally long sequence of alternating
        non-breaking and regular spaces, so that the run is neither collapsed
        by the browser nor prevents line wrapping.

        :Parameters:
          `match` : regex match
            The matched whitespace run

        :Return: The replacement
        :Rtype: ``unicode``
        """
        length, rest = divmod_(len_(match.group(0)), 2)
        if length == 0:
            # A single whitespace character stays a plain space
            return u' '
        return u' ' * rest + u'&nbsp; ' * length

    # Same patterns as the raw-string spelling; written with escaped
    # backslashes so the literals are valid on Python 2 and Python 3 alike.
    ws_sub = _re.compile(u'\\s+').sub
    ws1_sub = _re.compile(u'^\\s(\\S)').sub

    def multiline(content, encoding='ascii', tabwidth=8, xhtml=True):
        """
        Encode multiline content to HTML, assignable to ``node.raw.content``

        Markup characters (``&``, ``<``, ``>``) are escaped, line breaks are
        turned into ``<br>`` elements, trailing whitespace of each line is
        dropped and remaining whitespace runs are preserved using
        ``&nbsp;``. Characters not representable in `encoding` are emitted
        as numeric character references.

        :Parameters:
          `content` : ``unicode``
            Content to encode

          `encoding` : ``str``
            Target encoding

          `tabwidth` : ``int``
            Tab width? Used to expand tabs. If ``None``, tabs are not
            expanded.

          `xhtml` : ``bool``
            XHTML? Only used to determine if <br> or <br /> is emitted.

        :Return: The multilined content
        :Rtype: ``str``
        """
        # pylint: disable = redefined-outer-name

        # The ampersand has to be escaped first, otherwise the entities
        # produced by the following replacements would be escaped again.
        content = (
            content
            .replace(u'&', u'&amp;')
            .replace(u'<', u'&lt;')
            .replace(u'>', u'&gt;')
        )
        lines = []
        for line in content.splitlines():
            line = line.rstrip()
            if not line:
                # Keep empty lines visible
                line = u'&nbsp;'
            else:
                if tabwidth is not None:
                    line = line.expandtabs(tabwidth)
                # A single leading whitespace would be dropped by the
                # browser, so make it non-breaking
                line = ws1_sub(u'&nbsp;\\1', line)
                line = ws_sub(space_func, line)
            lines.append(line)
        if xhtml:
            res = u'<br />'.join(lines)
        else:
            res = u'<br>'.join(lines)
        return res.encode(encoding, 'xmlcharrefreplace')

    return multiline

multiline = _make_multiline()

class CommentStripFilter(_filters.BaseEventFilter):
    """ Event filter which drops every comment from the event chain """

    def handle_comment(self, data):
        """
        Swallow the comment event

        The comment is deliberately not handed to anyone, the method simply
        does nothing.

        :See: `tdi.interfaces.ListenerInterface`
        """
        return None

187 188

class MinifyFilter(_filters.BaseEventFilter):
    """
    Strip unneeded whitespace and comments

    Text events are buffered and only passed to the builder when another
    event forces a flush. On flush the collected text is whitespace-minified
    depending on the element it is contained in and on the neighbouring
    tags.

    :IVariables:
      `_buffer` : ``list``
        Current text buffer

      `_stack` : ``list``
        Current tag stack

      `_last` : ``str``
        Last seen endtag name (normalized) or ``None``

      `_blocks` : ``dict``
        List of block elements (in a dict for better lookup)
    """

    def __init__(self, builder, comment_filter=None):
        """
        Initialization

        :Parameters:
          `builder` : `BuildingListenerInterface`
            Next level builder.

          `comment_filter` : callable
            Comment filter. A function which takes the comment data and
            returns a filtered comment (which is passed through to the
            builder) or ``None`` (meaning the comment can be stripped
            completely). For example::

                def keep_ad_comments(data):
                    if 'google_ad_section' in data:
                        return data
                    return None

            If omitted or ``None``, all comments are stripped.
        """
        super(MinifyFilter, self).__init__(builder)
        self._buffer = []
        self._stack = []
        self._last = None
        self._dtd = _dtd.HTMLDTD()
        self._normalize = self.builder.decoder.normalize
        if comment_filter is None:
            comment_filter = lambda x: None
        self._comment_filter = comment_filter
        self._blocks = dict.fromkeys((
            'address',
            'article',
            'aside',
            'blockquote',
            'body',
            'caption',
            'col',
            'colgroup',
            'dd',
            'dir',
            'div',
            'dl',
            'dt',
            'fieldset',
            'figcaption',
            'figure',
            'footer',
            'form',
            'frame',
            'frameset',
            'h1',
            'h2',
            'h3',
            'h4',
            'h5',
            'h6',
            'head',
            'header',
            'hgroup',
            'hr',
            'html',
            'isindex',
            'layer',
            'li',
            'listing',
            'map',
            'marquee',
            'menu',
            'multicol',
            'nav',
            'noframes',
            'ol',
            'option',
            'p',
            'script',
            'style',
            'section',
            'table',
            'tbody',
            'td',
            'title',
            'tfoot',
            'th',
            'thead',
            'tr',
            'ul',
            'xmp',
        ))

    #: Whitespace substitutor
    #:
    #: :Type: ``callable``
    _WS_SUB = _re.compile(r'\s+').sub

    def _flush(self, endtag=False, starttag=None):
        """
        Flush the current text buffer to the builder

        Nothing is emitted if the buffer is empty. Inside of CDATA elements
        and ``pre`` the text is only trimmed line-wise; everywhere else
        whitespace runs are collapsed to a single space and leading /
        trailing whitespace is removed next to block elements.

        :Parameters:
          `endtag` : ``bool``
            Endtag flush?

          `starttag` : ``str``
            Next starttag (normalized) if starttag flush
        """
        if self._buffer:
            self._buffer, buf, stack = [], ''.join(self._buffer), self._stack
            if stack and \
                    (self._dtd.cdata(stack[-1]) or stack[-1] == 'pre'):
                if stack[-1] == 'pre':
                    # Keep the line structure, drop trailing whitespace only
                    buf = [
                        line.rstrip()
                        for line in buf.rstrip().splitlines(False)
                    ]
                elif stack[-1] in ('script', 'style'):
                    buf = buf.strip().splitlines(False)
                else:
                    buf = buf.splitlines(False)
                # Normalizes all line endings to \n
                buf = '\n'.join(buf)
            else:
                buf = self._WS_SUB(' ', buf)
                # NOTE(review): _last is only updated in handle_endtag, so it
                # still names the previous end tag when text inside a later
                # opened inline element is flushed - confirm this is intended.
                if self._last in self._blocks:
                    buf = buf.lstrip()
                if (endtag and stack and stack[-1] in self._blocks) \
                        or starttag in self._blocks:
                    buf = buf.rstrip()
            self.builder.handle_text(buf)

    def finalize(self):
        """
        Flush the last chunk

        :See: `tdi.interfaces.BuilderInterface`
        """
        # Pretend a block element follows (any of them will do), so trailing
        # whitespace of the document is stripped.
        self._flush(starttag=next(iter(self._blocks)))
        return self.builder.finalize()

    def handle_text(self, data):
        """
        Buffer the text

        :See: `tdi.interfaces.ListenerInterface`
        """
        self._buffer.append(data)

    def handle_starttag(self, name, attr, closed, data):
        """
        Flush pending text and emit the normalized starttag

        The tag is re-encoded with normalized name and normalized, sorted
        attribute names.

        :See: `tdi.interfaces.ListenerInterface`
        """
        norm = self._normalize
        norm_name = norm(name)
        self._flush(False, norm_name)
        if not closed:
            self._stack.append(norm_name)
        newattr = [(norm(key), value) for key, value in attr]
        newattr.sort()
        data = self.encoder.starttag(
            norm_name, newattr, closed
        )
        # NOTE(review): the builder receives the original ``attr`` while
        # ``data`` was built from ``newattr`` - confirm this is intended.
        self.builder.handle_starttag(norm_name, attr, closed, data)

    def handle_endtag(self, name, data):
        """
        Flush pending text and emit the normalized endtag

        :See: `tdi.interfaces.ListenerInterface`
        """
        self._flush(True)
        norm_name, stack = self._normalize(name), self._stack
        if stack and norm_name == stack[-1]:
            self._last = stack.pop()
        if data:
            # Only re-encode endtags actually present in the source
            data = self.encoder.endtag(norm_name)
        self.builder.handle_endtag(norm_name, data)

    def handle_comment(self, data):
        """
        Filter the comment and pass it on if it is kept

        :See: `tdi.interfaces.ListenerInterface`
        """
        data = self._comment_filter(data)
        if data is not None:
            # Flush first, otherwise the kept comment would be emitted
            # before the text preceding it. Stripped comments do not flush,
            # so the whitespace around them can be collapsed.
            self._flush()
            self.builder.handle_comment(data)

    def handle_msection(self, name, value, data):
        """ :See: `tdi.interfaces.ListenerInterface` """
        self._flush()
        self.builder.handle_msection(name, value, data)

    def handle_decl(self, name, value, data):
        """ :See: `tdi.interfaces.ListenerInterface` """
        self._flush()
        self.builder.handle_decl(name, value, data)

    def handle_pi(self, data):
        """ :See: `tdi.interfaces.ListenerInterface` """
        self._flush()
        self.builder.handle_pi(data)

397 398

def minify(html, encoding='ascii', fail_silently=False, comment_filter=None,
           cdata_containers=False):
    """
    Minify HTML

    Enclosed <script> and <style> blocks are minified as well.

    Unicode input is encoded as UTF-8 before it is parsed and the result is
    decoded from UTF-8 again, so the return type matches the input type.

    :Parameters:
      `html` : ``basestring``
        HTML to minify

      `encoding` : ``str``
        Initially assumed encoding. Only marginally interesting.

      `fail_silently` : ``bool``
        Swallow parse errors? If true, a `LexerError` raised while parsing
        is swallowed and the input html is returned unmodified. Otherwise
        (the default) the error is passed to the caller.

      `comment_filter` : callable
        HTML Comment filter. A function which takes the comment data and
        returns a filtered comment (which is passed through to the
        builder) or ``None`` (meaning the comment can be stripped
        completely). For example::

            def keep_ad_comments(data):
                if 'google_ad_section' in data:
                    return data
                return None

        If omitted or ``None``, all HTML comments are stripped.

      `cdata_containers` : ``bool``
        Add CDATA containers to enclosed <script> or <style> content? If true,
        these containers are added after minimization of the content. Default
        is false.

    :Return: the minified HTML - typed as input
    :Rtype: ``basestring``

    :Exceptions:
      - `LexerError` : The input could not be parsed and `fail_silently` is
        false
    """
    def js_minify(builder):
        """ Javascript minifier filter factory """
        return _javascript.MinifyFilter(builder, standalone=True)

    def js_cdata(builder):
        """ Javascript cdata container filter factory """
        return _javascript.CDATAFilter(builder, standalone=True)

    def css_minify(builder):
        """ CSS minifier filter factory """
        return _css.MinifyFilter(builder, standalone=True)

    def css_cdata(builder):
        """ CSS cdata container filter factory """
        return _css.CDATAFilter(builder, standalone=True)

    def html_minify(builder):
        """ HTML minifier filter factory """
        return MinifyFilter(builder, comment_filter=comment_filter)

    filters = [js_cdata, css_cdata] if cdata_containers else []
    isuni = isinstance(html, unicode)
    if isuni:
        html = html.encode('utf-8')
    try:
        result = _factory.Loader(
            builder=_StringBuilder,
            parser=_parser.SoupParser.html,
            encoder=_encoder.SoupEncoder,
            decoder=_decoder.HTMLDecoder,
            eventfilters=filters + [
                js_minify,
                css_minify,
                html_minify,
            ]
        )(_string_io.StringIO(html), '<string>', encoding)
    except LexerError:
        if not fail_silently:
            raise
        # Best effort: hand back the (possibly utf-8 encoded) input
        result = html
    if isuni:
        return result.decode('utf-8')
    return result

class _StringBuilder(object):
    """
    String builder

    Collects the ``data`` of every event it receives, in arrival order, and
    joins the pieces into one string on finalization.
    """
    __implements__ = [_interfaces.BuilderInterface,
                      _interfaces.BuildingListenerInterface]

    encoding = 'ascii'

    def __init__(self, encoder, decoder):
        """
        Initialization

        :Parameters:
          `encoder` : ``callable``
            Encoder factory

          `decoder` : ``callable``
            Decoder factory
        """
        self._result = []
        self.encoder = encoder(self.encoding)
        self.decoder = decoder(self.encoding)

    def _collect(self, data):
        """ Store one piece of output """
        self._result.append(data)

    def handle_text(self, data):
        """ :see: `ListenerInterface` """
        self._collect(data)

    def handle_escape(self, escaped, data):
        """ :see: `ListenerInterface` """
        # pylint: disable = unused-argument
        self._collect(data)

    def handle_starttag(self, name, attr, closed, data):
        """ :see: `ListenerInterface` """
        # pylint: disable = unused-argument
        self._collect(data)

    def handle_endtag(self, name, data):
        """ :see: `ListenerInterface` """
        # pylint: disable = unused-argument
        self._collect(data)

    def handle_comment(self, data):
        """ :see: `ListenerInterface` """
        self._collect(data)

    def handle_msection(self, name, value, data):
        """ :see: `ListenerInterface` """
        # pylint: disable = unused-argument
        self._collect(data)

    def handle_decl(self, name, value, data):
        """ :see: `ListenerInterface` """
        # pylint: disable = unused-argument
        self._collect(data)

    def handle_pi(self, data):
        """ :see: `ListenerInterface` """
        self._collect(data)

    def handle_encoding(self, encoding):
        """
        Switch to a new encoding, if it is a known one

        Unknown encodings are ignored silently.

        :See: `tdi.interfaces.BuildingListenerInterface`
        """
        try:
            _codecs.lookup(encoding)
        except LookupError:
            return
        if self.encoding == encoding:
            return
        self.encoding = encoding
        self.encoder.encoding = encoding
        self.decoder.encoding = encoding

    def finalize(self):
        """ :See: `tdi.interfaces.BuilderInterface` """
        pieces = self._result
        return ''.join(pieces)

557

Source Code for Module tdi.tools.html