# -*- coding: ascii -*-
r"""
:Copyright:

 Copyright 2006 - 2015
 Andr\xe9 Malo or his licensors, as applicable

:License:

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.

=====================
 Markup Parser Logic
=====================

Soup Parser
~~~~~~~~~~~

This module provides a very lenient HTML/XML lexer. The `SoupLexer` class is
initialized with a listener object, which receives all low level events
(like starttag, endtag, text etc.). Listeners must implement the
`ListenerInterface`.

On top of the lexer there's the `SoupParser` class, which actually implements
the `ListenerInterface` itself (the parser listens to the lexer). The parser
adds HTML semantics to the lexed data and passes the events to a building
listener (`BuildingListenerInterface`). In addition to the events sent by the
lexer, the `SoupParser` class generates endtag events (with empty data
arguments) for implicitly closed elements. Furthermore it knows about CDATA
elements like ``<script>`` or ``<style>`` and modifies the lexer state
accordingly.

The actual semantics are provided by a DTD query class (implementing
`DTDInterface`).
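
Example (a minimal sketch; ``MyListener`` stands for a listener
implementation of your own, fulfilling the `BuildingListenerInterface`)::

    parser = SoupParser.html(MyListener())
    parser.feed('<p>Hello <br> world')
    parser.feed('</p>')
    parser.finalize()   # flushes pending events, closes open elements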
"""
if __doc__:
    __doc__ = __doc__.encode('ascii').decode('unicode_escape')
__author__ = r"Andr\xe9 Malo".encode('ascii').decode('unicode_escape')
__docformat__ = "restructuredtext en"

import re as _re

from ..._exceptions import LexerEOFError, LexerFinalizedError
from ... import interfaces as _interfaces
from . import dtd as _dtd


class SoupLexer(object):
    """
    (X)HTML Tagsoup Lexer

    The lexer works hard to preserve the original data. In order to achieve
    this goal, it does not validate the input and recognizes its input in a
    quite lenient way.

    :Groups:
      - `Lexer states` :
        `TEXT`,
        `CDATA`,
        `MARKUP`,
        `STARTTAG`,
        `ENDTAG`,
        `COMMENT`,
        `MSECTION`,
        `DECL`,
        `PI`,
        `EMPTY`,
        `FINAL`
      - `Regex Matchers` :
        `_START_MATCH`,
        `_ATT_ITER`,
        `_COMMENT_SEARCH`,
        `_MSECTION_MATCH`,
        `_MSECTIONINVALID_MATCH`,
        `_MEND_SEARCH`,
        `_MSEND_SEARCH`,
        `_DECL_MATCH`

    :CVariables:
      `TEXT` : ``int``
        Lexer state ``TEXT`` (between tags)

      `CDATA` : ``int``
        Lexer state ``CDATA`` (between (P)CDATA tags)

      `MARKUP` : ``int``
        Lexer state ``MARKUP`` (``<``)

      `STARTTAG` : ``int``
        Lexer state ``STARTTAG`` (``<[letter]``)

      `ENDTAG` : ``int``
        Lexer state ``ENDTAG`` (``</``)

      `COMMENT` : ``int``
        Lexer state ``COMMENT`` (``<!--``)

      `MSECTION` : ``int``
        Lexer state ``MSECTION`` (``<![``)

      `DECL` : ``int``
        Lexer state ``DECL`` (``<!``)

      `PI` : ``int``
        Lexer state ``PI`` (``<?``)

      `EMPTY` : ``int``
        Lexer state ``EMPTY`` (``<>``)

      `FINAL` : ``int``
        Lexer state ``FINAL``

      `_LEXERS` : ``tuple``
        The state lexer method names (``('method', ...)``)

      `_STATES` : ``tuple``
        The state names (``('name', ...)``)

    :IVariables:
      `_state` : ``int``
        The current lexer state

      `_lexers` : ``list``
        The state lexer methods (``[method, ...]``)

      `_listener` : `ListenerInterface`
        The listener the events shall be sent to

      `_buffer` : ``str``
        Current unprocessed buffer

      `_conditional_ie_comments` : ``bool``
        Handle conditional IE comments as text?
    """

    def __init__(self, listener, conditional_ie_comments=True):
        r"""
        Initialization

        :Parameters:
          `listener` : `ListenerInterface`
            The event listener

          `conditional_ie_comments` : ``bool``
            Handle conditional IE comments as text?

            Conditional comments are described in full detail
            at `MSDN`_\.

            .. _MSDN: http://msdn.microsoft.com/en-us/library/
                      ms537512%28v=vs.85%29.aspx
        """
        self._listener = listener
        self._normalize = None
        self._cdata_name = None

        self._state = self.TEXT
        self._lexers = [getattr(self, name) for name in self._LEXERS]
        self._buffer = ''
        self._conditional_ie_comments = bool(conditional_ie_comments)

    def feed(self, food):
        """
        Feed the lexer with new data

        :Parameters:
          `food` : ``str``
            The data to process
        """
        self._buffer += food
        self._lex()

    def finalize(self):
        """
        Finalize the lexer

        This processes the rest buffer (if any)

        :Exceptions:
          - `LexerEOFError` : The rest buffer could not be consumed
        """
        self._lex()
        if self._buffer:
            raise LexerEOFError(
                "Unfinished parser state %s" % self._STATES[self._state]
            )

        self._state = self.FINAL

    def cdata(self, normalize, name):
        """ Set CDATA state """
        if self._state != self.FINAL:
            self._state = self.CDATA
            self._normalize = normalize
            self._cdata_name = normalize(name)

    def _lex(self):
        """ Parse the current buffer """
        while self._buffer:
            if self._lexers[self._state]():
                break

    def _lex_text(self):
        """
        Text lexer

        State: We are between tags or at the very beginning of the document
        and look for a ``<``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('<')
        if pos == 0:
            self._state = self.MARKUP
            return False
        elif pos == -1:
            self._buffer = ''
        else:
            self._buffer, data = data[pos:], data[:pos]
            self._state = self.MARKUP

        self._listener.handle_text(data)
        return False

    def _lex_cdata(self):
        """
        (P)CDATA lexer

        State: We are inside a text element and looking for the end tag only

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        incomplete = False
        data, pos = self._buffer, 0
        while True:
            pos = data.find('<', pos)
            if pos == -1:
                pos = len(data)
                self._buffer = ''
                break
            else:
                char = data[pos + 1:pos + 2]
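                # Slicing instead of indexing cannot raise IndexError: at
                # the very end of the buffer it yields '', i.e. "incomplete".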
                if char == '/':
                    self._state = self.ENDTAG
                    break
                elif char == '':
                    incomplete = True
                    break
                else:
                    pos += 1

        if pos > 0:
            self._buffer, data = data[pos:], data[:pos]
            self._listener.handle_text(data)

        return incomplete

    #: Regex matcher for a tagname character
    #:
    #: :Type: ``callable``
    _TAGNAME_MATCH = _re.compile(r'[a-zA-Z0-9]').match

    def _lex_markup(self):
        """
        Markup lexer

        State: We've hit a ``<`` character and now find out what it's
        becoming

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 2:
            return True

        char = data[1]
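        # Dispatch on the character following '<': str.find() returns -1
        # for "none of these", which indexes the last tuple element (-1)
        # and sends us into the tagname/text check below.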
        state = (self.ENDTAG, self.DECL, self.PI, self.EMPTY, -1)[
            "/!?>".find(char)
        ]
        if state == -1:
            if self._TAGNAME_MATCH(char):
                state = self.STARTTAG
            else:
                state = self.TEXT
                self._buffer = data[1:]
                self._listener.handle_text(data[0])

        self._state = state
        return False

    #: Regex matcher for a complete start tag
    #:
    #: :Type: ``callable``
    _START_MATCH = _re.compile(r'''
        <
        (?P<name>[^ \t\r\n\f/>]+)
        (?P<attr>
            [^"'>]*
            (?:
                (?:
                    "[^"]*"
                  | '[^']*'
                )
                [^"'>]*
            )*
        )
        [ \t\r\n\f]*
        >
    ''', _re.X).match

    #: Regex iterator for extracting start tag attributes
    #:
    #: :Type: ``callable``
    _ATT_ITER = _re.compile(r'''
        [ \t\r\n\f]*
        (?P<name>(?:/|[^ \t\r\n\f/=>]*))    # attribute name
        [ \t\r\n\f]*
        (?:
            =
            (?P<value>                      # optional value
                [ \t\r\n\f]*"[^"]*"
              | [ \t\r\n\f]*'[^']*'
              | [^ \t\r\n\f/>]*
            )
        )?
    ''', _re.X).finditer

    def _lex_start(self):
        """
        Starttag lexer

        State: We've hit a ``<x`` and now look for the ``>``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        match = self._START_MATCH(data)
        if match is None:
            return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        name, attrstring = match.group('name', 'attr')
        attr, closed = [], False
        if attrstring:
            for match in self._ATT_ITER(attrstring):
                key, value = match.group('name', 'value')
                if key == '/' and value is None:
                    closed = True
                    continue
                if key or value is not None:
                    if value:
                        value = value.strip()
                    attr.append((key.strip(), value))
                else:
                    break

        self._state = self.TEXT
        self._listener.handle_starttag(name, attr, closed, data)
        return False

    def _lex_end(self):
        """
        Endtag lexer

        State: We've hit ``</``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('>') + 1
        if pos == 0:
            return True

        self._buffer, data = data[pos:], data[:pos]
        name = data[2:-1].strip()

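        # Inside a CDATA element only the matching end tag leaves the
        # element; any other end tag is passed through as plain text.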
        if self._cdata_name is not None and \
                self._normalize(name) != self._cdata_name:
            self._state = self.CDATA
            self._listener.handle_text(data)
        else:
            self._cdata_name = self._normalize = None
            self._state = self.TEXT
            self._listener.handle_endtag(name, data)
        return False

    #: Regex searcher for finding the end of a comment
    #:
    #: :Type: ``callable``
    _COMMENT_SEARCH = _re.compile(r'--[ \t\r\n\f]*>').search

    #: Regex matcher for matching an IE conditional comment
    #:
    #: :Type: ``callable``
    _IE_COMMENT_MATCH = _re.compile(r'''
        \[[ \t\r\n\f]* (?:
            [iI][fF] | [eE][lL][sS][eE] | [eE][nN][dD][iI][fF]
        ) [^\]]+]>
    ''', _re.X).match

    def _lex_comment(self):
        """
        Comment lexer

        State: We've hit a ``<!--``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 7:
            return True

        if self._conditional_ie_comments:
            match = iec = self._IE_COMMENT_MATCH(data, 4)
        else:
            match, iec = None, None

        if match is None:
            match = self._COMMENT_SEARCH(data, 4)
            if match is None:
                return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        if iec:
            # Conditional IE comments are passed through as text, just
            # like _lex_msection does for conditional marked sections.
            self._listener.handle_text(data)
        else:
            self._listener.handle_comment(data)
        return False

    #: List of MS-specific marked section names (lowercased)
    #:
    #: :Type: ``tuple``
    _MSSECTIONS = ('if', 'else', 'endif')

    #: Regex matcher for the start of a marked section
    #:
    #: :Type: ``callable``
    _MSECTION_MATCH = _re.compile(r'''
        <!\[[ \t\r\n\f]*(?P<name>[^\][ \t\r\n\f>]+)(?=[\][ \t\r\n\f>])
    ''', _re.X).match

    #: Regex matcher for an invalid marked section
    #:
    #: :Type: ``callable``
    _MSECTIONINVALID_MATCH = _re.compile(r'<!\[[ \t\r\n\f]*[\][>]').match

    #: Regex searcher for the end of a marked section
    #:
    #: :Type: ``callable``
    _MEND_SEARCH = _re.compile(r'][ \t\r\n\f]*][ \t\r\n\f]*>').search

    #: Regex searcher for the end of an MS-specific marked section
    #:
    #: :Type: ``callable``
    _MSEND_SEARCH = _re.compile(r'][ \t\r\n\f]*(?:--)?[ \t\r\n\f]*>').search

    def _lex_msection(self):
        """
        Marked section lexer

        State: We've hit a ``<![`` and now seek the end

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        match = self._MSECTION_MATCH(data)
        if match is None:
            match = self._MSECTIONINVALID_MATCH(data)
            if match is not None:
                # Invalid marked section: emit it as plain text
                pos = match.end()
                self._buffer = data[pos:]
                data = data[:pos]
                self._state = self.TEXT
                self._listener.handle_text(data)
                return False
            return True

        name = match.group('name')
        start = match.end()
        if self._conditional_ie_comments and name.lower() in self._MSSECTIONS:
            match = iec = self._MSEND_SEARCH(data, start)
        else:
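            # A regular marked section: its value starts after the second
            # '[' (as in '<![CDATA[...]]>'), if one is present.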
            pos = data.find('[', start)
            if pos >= 0:
                start = pos + 1
            match = self._MEND_SEARCH(data, start)
            iec = None
        if match is None:
            return True
        pos, end = match.end(), match.start()
        value = data[start:end]
        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        if iec:
            self._listener.handle_text(data)
        else:
            self._listener.handle_msection(name, value, data)
        return False

    #: Regex matcher for a complete declaration
    #:
    #: This one is pretty lenient: it accepts quoted strings, comments,
    #: nested marked sections and nested declarations (as they may occur
    #: in an internal DTD subset) before the closing ``>``.
    #:
    #: :Type: ``callable``
    _DECL_MATCH = _re.compile(r'''
        <!
        (?P<name>[^\][ \t\r\n\f>]*)
        (?P<value>
            [^"'<>-]*               # any nonspecial
            (?:
                (?:
                    "[^"]*"         # double quoted string
                  | '[^']*'         # single quoted string (valid?)
                  | <!\[            # marked section
                    [^\]]*
                    (?:
                        ](?![ \t\r\n\f]*][ \t\r\n\f]*>)
                        [^\]]*
                    )*
                    ][ \t\r\n\f]*][ \t\r\n\f]*>
                  | <(?!!\[)        # declaration
                                    # hopefully not a doctype
                                    # (but unlikely, because we are
                                    # probably already in a DT subset)
                    [^"'>-]*
                    (?:
                        (?:
                            "[^"]*"
                          | '[^']*'
                          | --      # comment
                            [^-]*
                            (?:-[^-]+)*
                            --
                          | -(?!-)  # just a hyphen
                        )
                        [^"'>-]*
                    )*
                    >
                  | --              # comment
                    [^-]*
                    (?:-[^-]+)*
                    --
                  | -(?!-)          # just a hyphen
                )
                [^"'<>-]*           # more non-specials
            )*
        )
        >
    ''', _re.X).match

    def _lex_decl(self):
        """
        Declaration lexer

        State: We've hit a ``<!`` and now peek inside

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 3:
            return True

        if data.startswith('<!--'):
            self._state = self.COMMENT
            return False
        elif data.startswith('<!['):
            self._state = self.MSECTION
            return False
        elif data == '<!-':
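            # Might still grow into a comment ('<!--'); wait for more input.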
            return True

        match = self._DECL_MATCH(data)
        if match is None:
            return True

        name, value = match.group('name', 'value')
        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        self._listener.handle_decl(name, value.strip(), data)
        return False

    def _lex_pi(self):
        """
        Processing instruction lexer

        State: We've hit a ``<?`` and now peek inside

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('?>', 2)
        if pos == -1:
            return True
        pos += 2

        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        self._listener.handle_pi(data)
        return False

    def _lex_empty(self):
        """
        Empty tag lexer

        State: We've hit a ``<>``

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        self._buffer, data = self._buffer[2:], self._buffer[:2]

        self._state = self.TEXT
        self._listener.handle_starttag('', [], False, data)
        return False

    def _lex_final(self):
        """
        Called after the lexer was finalized

        State: after all

        :Exceptions:
          - `LexerFinalizedError` : The lexer was already finalized
            (raised always)
        """
        raise LexerFinalizedError("The lexer was already finalized")


_LEXERS = []
_STATES = []
for _idx, (_statename, _funcname) in enumerate([
        ('FINAL', '_lex_final'),
        ('TEXT', '_lex_text'),
        ('CDATA', '_lex_cdata'),
        ('MARKUP', '_lex_markup'),
        ('STARTTAG', '_lex_start'),
        ('ENDTAG', '_lex_end'),
        ('COMMENT', '_lex_comment'),
        ('MSECTION', '_lex_msection'),
        ('DECL', '_lex_decl'),
        ('PI', '_lex_pi'),
        ('EMPTY', '_lex_empty'),
        ]):
    setattr(SoupLexer, _statename, _idx)
    _LEXERS.append(_funcname)
    _STATES.append(_statename)

SoupLexer._LEXERS = tuple(_LEXERS)
SoupLexer._STATES = tuple(_STATES)
del _idx, _statename, _funcname
del _LEXERS, _STATES


from ... import c
c = c.load('impl')
if c is not None:
    DEFAULT_LEXER = c.SoupLexer
else:
    DEFAULT_LEXER = SoupLexer
del c


class SoupParser(object):
    """
    =========================
     (X)HTML Tag Soup Parser
    =========================

    Overview
    ~~~~~~~~

    The parser is actually a tagsoup parser by design, in order to process
    most of the "HTML" that can be found out there. Of course, well-formed
    and valid HTML parses best. Only as much HTML syntax is applied as is
    necessary to parse the input. You can influence these syntax definitions
    by picking another lexer. You can change the semantics by picking
    another DTD query class.

    This parser guarantees that an endtag event is generated for each
    starttag event that is not self-closing (if the endtag is not actually
    there, the data parameter is an empty string). This also happens for
    empty tags (like ``br``). On the other hand, there may be more endtag
    events than starttag events, because of unbalanced or wrongly nested
    tags.

    Special constructs (comments, PIs, marked sections and declarations)
    may occur anywhere, i.e. they do not close elements implicitly.

    The default lexer does not deal with NET tags (``<h1/Heading/``).
    Neither does it handle unfinished starttags by SGML rules like
    ``<map<area>``. It *does* know about empty tags (``<>`` and ``</>``).

    CDATA elements and comments are handled in a simplified way: once the
    particular state is entered, it is only left when the accompanying end
    marker is found (``<script>...</script>``, ``<!-- ... -->``). Anything
    in between is text.

    How is it used?
    ~~~~~~~~~~~~~~~

    The parser API is "streamy" on the input side and event-based on the
    output side. So, what you need first is a building listener, which will
    receive all generated parser events and process them. Such a listener
    object is expected to implement the `BuildingListenerInterface`.

    Now you create a `SoupParser` instance, pass the listener object to the
    constructor, and the parser is ready to be fed. You can feed as many
    chunks of input data as you like into the parser by using the `feed`
    method. Every feed call may generate multiple events on the output side.
    When you're done feeding, call the parser's `finalize` method in order
    to clean up. This also flushes pending events to the listener.
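
    For example (a sketch; ``listener`` is your `BuildingListenerInterface`
    implementation, ``stream`` any object with a ``read`` method)::

        parser = SoupParser.html(listener)
        while True:
            chunk = stream.read(8192)
            if not chunk:
                break
            parser.feed(chunk)
        parser.finalize()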

    :IVariables:
      `listener` : `BuildingListenerInterface`
        The building listener to send the events to

      `_lexer` : `SoupLexer`
        The lexer instance

      `_tagstack` : ``list``
        The current tag stack

      `_inempty` : ``bool``
        Indicates whether the last tag on the stack is an empty one

      `_lastopen` : ``str``
        Stores the last seen open tag name
    """
    __implements__ = [
        _interfaces.ListenerInterface, _interfaces.ParserInterface
    ]

    def __init__(self, listener, dtd, lexer=None):
        """
        Initialization

        :Parameters:
          `listener` : `ListenerInterface`
            The building listener

          `dtd` : `DTDInterface`
            DTD query object

          `lexer` : ``callable``
            Lexer class/factory. This must be a callable taking an
            event listener and returning a lexer instance. If omitted or
            ``None``, the default lexer will be used (`DEFAULT_LEXER`).
        """
        self._tagstack, self._inempty, self._lastopen = [], False, ''
        self.listener = listener
        self._is_nestable = dtd.nestable
        self._is_cdata = dtd.cdata
        self._is_empty = dtd.empty
        if lexer is None:
            lexer = DEFAULT_LEXER
        self._lexer = lexer(self)
        self._normalize = listener.decoder.normalize

    @classmethod
    def html(cls, listener):
        """
        Construct a parser using the `HTMLDTD`

        :Parameters:
          `listener` : `BuildingListenerInterface`
            The building listener

        :Return: The new parser instance
        :Rtype: `SoupParser`
        """
        return cls(listener, _dtd.HTMLDTD())

    @classmethod
    def xml(cls, listener):
        """
        Construct a parser using the `XMLDTD`

        :Parameters:
          `listener` : `BuildingListenerInterface`
            The building listener

        :Return: The new parser instance
        :Rtype: `SoupParser`
        """
        return cls(listener, _dtd.XMLDTD())

    def _close_empty(self):
        """ Ensure we close the last empty tag """
        if self._inempty:
            self._inempty = False
            self.listener.handle_endtag(self._tagstack.pop()[1], '')

    def handle_text(self, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_text(data)

    def handle_starttag(self, name, attrs, closed, data):
        """ :See: `ListenerInterface` """
        self._close_empty()

        if name == '' and not attrs:
            name = self._lastopen
        else:
            self._lastopen = name
        tagstack = self._tagstack
        nestable = self._is_nestable
        starttag = self._normalize(name)
        while tagstack and not nestable(tagstack[-1][0], starttag):
            self.listener.handle_endtag(tagstack.pop()[1], '')

        if closed:
            self.listener.handle_starttag(name, attrs, closed, data)
        else:
            if self._is_cdata(starttag):
                self._lexer.cdata(self._normalize, starttag)
            self.listener.handle_starttag(name, attrs, closed, data)
            tagstack.append((starttag, name))
            if self._is_empty(starttag):
                self._inempty = True

    def handle_endtag(self, name, data):
        """ :See: `ListenerInterface` """
        tagstack = self._tagstack
        if tagstack:
            if name == '':
                name = tagstack[-1][1]
            endtag = self._normalize(name)
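            # Only unwind the stack if the tag is open at all; dict() over
            # the stack of (normalized, original) pairs gives a quick
            # membership test on the normalized names.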
            if endtag in dict(tagstack):
                toclose, original = tagstack.pop()
                self._inempty = False
                while toclose != endtag:
                    self.listener.handle_endtag(original, '')
                    toclose, original = tagstack.pop()

        self._close_empty()
        self.listener.handle_endtag(name, data)

    def handle_comment(self, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_comment(data)

    def handle_msection(self, name, value, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_msection(name, value, data)

    def handle_decl(self, name, value, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_decl(name, value, data)

    def handle_pi(self, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_pi(data)

    def handle_escape(self, escaped, data):
        """ :See: `ListenerInterface` """
        # The soup lexer never emits escape events.
        raise AssertionError()

    def feed(self, food):
        """ :See: `ParserInterface` """
        self._lexer.feed(food)

    def finalize(self):
        """
        :See: `ParserInterface`

        :Exceptions:
          - `LexerEOFError` : EOF in the middle of a state
        """
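        # The right-hand tuple is evaluated first: the lexer is finalized
        # and the reference dropped in one step, so a repeated finalize()
        # call does not touch the lexer again.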
        if self._lexer is not None:
            self._lexer, _ = None, self._lexer.finalize()

        tagstack = self._tagstack
        while tagstack:
            self.listener.handle_endtag(tagstack.pop()[1], '')


from ... import c
c = c.load('impl')
if c is not None:
    DEFAULT_PARSER = c.SoupParser
else:
    DEFAULT_PARSER = SoupParser
del c