# -*- coding: ascii -*-
r"""
:Copyright:

 Copyright 2006 - 2015
 Andr\xe9 Malo or his licensors, as applicable

:License:

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.

=====================
 Markup Parser Logic
=====================

Soup Parser
~~~~~~~~~~~

This module provides a very lenient HTML/XML lexer. The `SoupLexer` class is
initialized with a listener object, which receives all low level events
(like starttag, endtag, text etc.). Listeners must implement the
`ListenerInterface`.

On top of the lexer there's the `SoupParser` class, which actually implements
the `ListenerInterface` itself (the parser listens to the lexer). The parser
adds HTML semantics to the lexed data and passes the events to a building
listener (`BuildingListenerInterface`). In addition to the events sent by the
lexer, the `SoupParser` class generates endtag events (with empty data
arguments) for implicitly closed elements. Furthermore it knows about CDATA
elements like ``<script>`` or ``<style>`` and modifies the lexer state
accordingly.

The actual semantics are provided by a DTD query class (implementing
`DTDInterface`).
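
Example (a minimal sketch; ``MyListener`` stands for a listener
implementation of your own, fulfilling the `BuildingListenerInterface`)::

    parser = SoupParser.html(MyListener())
    parser.feed('<p>Hello <br> world')
    parser.feed('</p>')
    parser.finalize()   # flushes pending events, closes open elements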
"""
if __doc__:
    __doc__ = __doc__.encode('ascii').decode('unicode_escape')
__author__ = r"Andr\xe9 Malo".encode('ascii').decode('unicode_escape')
__docformat__ = "restructuredtext en"

import re as _re

from ..._exceptions import LexerEOFError, LexerFinalizedError
from ... import interfaces as _interfaces
from . import dtd as _dtd


class SoupLexer(object):
    """
    (X)HTML Tagsoup Lexer

    The lexer works hard to preserve the original data. In order to achieve
    this goal, it does not validate the input and recognizes its input in a
    quite lenient way.

    :Groups:
      - `Lexer states` :
        `TEXT`,
        `CDATA`,
        `MARKUP`,
        `STARTTAG`,
        `ENDTAG`,
        `COMMENT`,
        `MSECTION`,
        `DECL`,
        `PI`,
        `EMPTY`,
        `FINAL`
      - `Regex Matchers` :
        `_START_MATCH`,
        `_ATT_ITER`,
        `_COMMENT_SEARCH`,
        `_MSECTION_MATCH`,
        `_MSECTIONINVALID_MATCH`,
        `_MEND_SEARCH`,
        `_MSEND_SEARCH`,
        `_DECL_MATCH`

    :CVariables:
      `TEXT` : ``int``
        Lexer state ``TEXT`` (between tags)

      `CDATA` : ``int``
        Lexer state ``CDATA`` (between (P)CDATA tags)

      `MARKUP` : ``int``
        Lexer state ``MARKUP`` (``<``)

      `STARTTAG` : ``int``
        Lexer state ``STARTTAG`` (``<[letter]``)

      `ENDTAG` : ``int``
        Lexer state ``ENDTAG`` (``</``)

      `COMMENT` : ``int``
        Lexer state ``COMMENT`` (``<!--``)

      `MSECTION` : ``int``
        Lexer state ``MSECTION`` (``<![``)

      `DECL` : ``int``
        Lexer state ``DECL`` (``<!``)

      `PI` : ``int``
        Lexer state ``PI`` (``<?``)

      `EMPTY` : ``int``
        Lexer state ``EMPTY`` (``<>``)

      `FINAL` : ``int``
        Lexer state ``FINAL``

      `_LEXERS` : ``tuple``
        The state lexer method names (``('method', ...)``)

      `_STATES` : ``tuple``
        The state names (``('name', ...)``)

    :IVariables:
      `_state` : ``int``
        The current lexer state

      `_lexers` : ``list``
        The state lexer methods (``[method, ...]``)

      `_listener` : `ListenerInterface`
        The listener the events shall be sent to

      `_buffer` : ``str``
        Current unprocessed buffer

      `_conditional_ie_comments` : ``bool``
        Handle conditional IE comments as text?
    """

    def __init__(self, listener, conditional_ie_comments=True):
        r"""
        Initialization

        :Parameters:
          `listener` : `ListenerInterface`
            The event listener

          `conditional_ie_comments` : ``bool``
            Handle conditional IE comments as text?

            Conditional comments are described in full detail
            at `MSDN`_\.

            .. _MSDN: http://msdn.microsoft.com/en-us/library/
                      ms537512%28v=vs.85%29.aspx
        """
        self._listener = listener
        self._normalize = None
        self._cdata_name = None

        self._state = self.TEXT
        self._lexers = [getattr(self, name) for name in self._LEXERS]
        self._buffer = ''
        self._conditional_ie_comments = bool(conditional_ie_comments)

    def feed(self, food):
        """
        Feed the lexer with new data

        :Parameters:
          `food` : ``str``
            The data to process
        """
        self._buffer += food
        self._lex()

    def finalize(self):
        """
        Finalize the lexer

        This processes the rest buffer (if any)

        :Exceptions:
          - `LexerEOFError` : The rest buffer could not be consumed
        """
        self._lex()
        if self._buffer:
            raise LexerEOFError(
                "Unfinished parser state %s" % self._STATES[self._state]
            )

        self._state = self.FINAL

    def cdata(self, normalize, name):
        """ Set CDATA state """
        if self._state != self.FINAL:
            self._state = self.CDATA
            self._normalize = normalize
            self._cdata_name = normalize(name)

    def _lex(self):
        """ Parse the current buffer """
        while self._buffer:
            if self._lexers[self._state]():
                break

    def _lex_text(self):
        """
        Text lexer

        State: We are between tags or at the very beginning of the document
        and look for a ``<``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('<')
        if pos == 0:
            self._state = self.MARKUP
            return False
        elif pos == -1:
            self._buffer = ''
        else:
            self._buffer, data = data[pos:], data[:pos]
            self._state = self.MARKUP

        self._listener.handle_text(data)
        return False

    def _lex_cdata(self):
        """
        (P)CDATA lexer

        State: We are inside a text element and looking for the end tag only

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        incomplete = False
        data, pos = self._buffer, 0
        while True:
            pos = data.find('<', pos)
            if pos == -1:
                pos = len(data)
                self._buffer = ''
                break
            else:
                char = data[pos + 1:pos + 2]
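                # Slicing instead of indexing cannot raise IndexError: at
                # the very end of the buffer it yields '', i.e. "incomplete".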
                if char == '/':
                    self._state = self.ENDTAG
                    break
                elif char == '':
                    incomplete = True
                    break
                else:
                    pos += 1

        if pos > 0:
            self._buffer, data = data[pos:], data[:pos]
            self._listener.handle_text(data)

        return incomplete

    #: Regex matcher for a tagname character
    #:
    #: :Type: ``callable``
    _TAGNAME_MATCH = _re.compile(r'[a-zA-Z0-9]').match

    def _lex_markup(self):
        """
        Markup lexer

        State: We've hit a ``<`` character and now find out what it's
        becoming

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 2:
            return True

        char = data[1]
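        # Dispatch on the character following '<': str.find() returns -1
        # for "none of these", which indexes the last tuple element (-1)
        # and sends us into the tagname/text check below.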
        state = (self.ENDTAG, self.DECL, self.PI, self.EMPTY, -1)[
            "/!?>".find(char)
        ]
        if state == -1:
            if self._TAGNAME_MATCH(char):
                state = self.STARTTAG
            else:
                state = self.TEXT
                self._buffer = data[1:]
                self._listener.handle_text(data[0])

        self._state = state
        return False

    #: Regex matcher for a complete start tag
    #:
    #: :Type: ``callable``
    _START_MATCH = _re.compile(r'''
        <
        (?P<name>[^ \t\r\n\f/>]+)
        (?P<attr>
            [^"'>]*
            (?:
                (?:
                    "[^"]*"
                  | '[^']*'
                )
                [^"'>]*
            )*
        )
        [ \t\r\n\f]*
        >
    ''', _re.X).match

    #: Regex iterator for extracting start tag attributes
    #:
    #: :Type: ``callable``
    _ATT_ITER = _re.compile(r'''
        [ \t\r\n\f]*
        (?P<name>(?:/|[^ \t\r\n\f/=>]*))    # attribute name
        [ \t\r\n\f]*
        (?:
            =
            (?P<value>                      # optional value
                [ \t\r\n\f]*"[^"]*"
              | [ \t\r\n\f]*'[^']*'
              | [^ \t\r\n\f/>]*
            )
        )?
    ''', _re.X).finditer

    def _lex_start(self):
        """
        Starttag lexer

        State: We've hit a ``<x`` and now look for the ``>``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        match = self._START_MATCH(data)
        if match is None:
            return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        name, attrstring = match.group('name', 'attr')
        attr, closed = [], False
        if attrstring:
            for match in self._ATT_ITER(attrstring):
                key, value = match.group('name', 'value')
                if key == '/' and value is None:
                    closed = True
                    continue
                if key or value is not None:
                    if value:
                        value = value.strip()
                    attr.append((key.strip(), value))
                else:
                    break

        self._state = self.TEXT
        self._listener.handle_starttag(name, attr, closed, data)
        return False

    def _lex_end(self):
        """
        Endtag lexer

        State: We've hit ``</``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('>') + 1
        if pos == 0:
            return True

        self._buffer, data = data[pos:], data[:pos]
        name = data[2:-1].strip()

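        # Inside a CDATA element only the matching end tag leaves the
        # element; any other end tag is passed through as plain text.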
        if self._cdata_name is not None and \
                self._normalize(name) != self._cdata_name:
            self._state = self.CDATA
            self._listener.handle_text(data)
        else:
            self._cdata_name = self._normalize = None
            self._state = self.TEXT
            self._listener.handle_endtag(name, data)
        return False

    #: Regex searcher for finding the end of a comment
    #:
    #: :Type: ``callable``
    _COMMENT_SEARCH = _re.compile(r'--[ \t\r\n\f]*>').search

    #: Regex matcher for matching an IE conditional comment
    #:
    #: :Type: ``callable``
    _IE_COMMENT_MATCH = _re.compile(r'''
        \[[ \t\r\n\f]* (?:
            [iI][fF] | [eE][lL][sS][eE] | [eE][nN][dD][iI][fF]
        ) [^\]]+]>
    ''', _re.X).match

    def _lex_comment(self):
        """
        Comment lexer

        State: We've hit a ``<!--``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 7:
            return True

        if self._conditional_ie_comments:
            match = iec = self._IE_COMMENT_MATCH(data, 4)
        else:
            match, iec = None, None

        if match is None:
            match = self._COMMENT_SEARCH(data, 4)
            if match is None:
                return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        if iec:
            # Conditional IE comments are passed through as text, just
            # like _lex_msection does for conditional marked sections.
            self._listener.handle_text(data)
        else:
            self._listener.handle_comment(data)
        return False

    #: List of MS-specific marked section names (lowercased)
    #:
    #: :Type: ``tuple``
    _MSSECTIONS = ('if', 'else', 'endif')

    #: Regex matcher for the start of a marked section
    #:
    #: :Type: ``callable``
    _MSECTION_MATCH = _re.compile(r'''
        <!\[[ \t\r\n\f]*(?P<name>[^\][ \t\r\n\f>]+)(?=[\][ \t\r\n\f>])
    ''', _re.X).match

    #: Regex matcher for an invalid marked section
    #:
    #: :Type: ``callable``
    _MSECTIONINVALID_MATCH = _re.compile(r'<!\[[ \t\r\n\f]*[\][>]').match

    #: Regex searcher for the end of a marked section
    #:
    #: :Type: ``callable``
    _MEND_SEARCH = _re.compile(r'][ \t\r\n\f]*][ \t\r\n\f]*>').search

    #: Regex searcher for the end of an MS-specific marked section
    #:
    #: :Type: ``callable``
    _MSEND_SEARCH = _re.compile(r'][ \t\r\n\f]*(?:--)?[ \t\r\n\f]*>').search

    def _lex_msection(self):
        """
        Marked section lexer

        State: We've hit a ``<![`` and now seek the end

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        match = self._MSECTION_MATCH(data)
        if match is None:
            match = self._MSECTIONINVALID_MATCH(data)
            if match is not None:
                # Invalid marked section: emit it as plain text
                pos = match.end()
                self._buffer = data[pos:]
                data = data[:pos]
                self._state = self.TEXT
                self._listener.handle_text(data)
                return False
            return True

        name = match.group('name')
        start = match.end()
        if self._conditional_ie_comments and name.lower() in self._MSSECTIONS:
            match = iec = self._MSEND_SEARCH(data, start)
        else:
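            # A regular marked section: its value starts after the second
            # '[' (as in '<![CDATA[...]]>'), if one is present.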
            pos = data.find('[', start)
            if pos >= 0:
                start = pos + 1
            match = self._MEND_SEARCH(data, start)
            iec = None
        if match is None:
            return True
        pos, end = match.end(), match.start()
        value = data[start:end]
        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        if iec:
            self._listener.handle_text(data)
        else:
            self._listener.handle_msection(name, value, data)
        return False

    #: Regex matcher for a complete declaration
    #:
    #: This one is pretty lenient: it accepts quoted strings, comments,
    #: nested marked sections and nested declarations (as they may occur
    #: in an internal DTD subset) before the closing ``>``.
    #:
    #: :Type: ``callable``
    _DECL_MATCH = _re.compile(r'''
        <!
        (?P<name>[^\][ \t\r\n\f>]*)
        (?P<value>
            [^"'<>-]*               # any nonspecial
            (?:
                (?:
                    "[^"]*"         # double quoted string
                  | '[^']*'         # single quoted string (valid?)
                  | <!\[            # marked section
                    [^\]]*
                    (?:
                        ](?![ \t\r\n\f]*][ \t\r\n\f]*>)
                        [^\]]*
                    )*
                    ][ \t\r\n\f]*][ \t\r\n\f]*>
                  | <(?!!\[)        # declaration
                                    # hopefully not a doctype
                                    # (but unlikely, because we are
                                    # probably already in a DT subset)
                    [^"'>-]*
                    (?:
                        (?:
                            "[^"]*"
                          | '[^']*'
                          | --      # comment
                            [^-]*
                            (?:-[^-]+)*
                            --
                          | -(?!-)  # just a hyphen
                        )
                        [^"'>-]*
                    )*
                    >
                  | --              # comment
                    [^-]*
                    (?:-[^-]+)*
                    --
                  | -(?!-)          # just a hyphen
                )
                [^"'<>-]*           # more non-specials
            )*
        )
        >
    ''', _re.X).match

    def _lex_decl(self):
        """
        Declaration lexer

        State: We've hit a ``<!`` and now peek inside

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 3:
            return True

        if data.startswith('<!--'):
            self._state = self.COMMENT
            return False
        elif data.startswith('<!['):
            self._state = self.MSECTION
            return False
        elif data == '<!-':
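            # Might still grow into a comment ('<!--'); wait for more input.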
            return True

        match = self._DECL_MATCH(data)
        if match is None:
            return True

        name, value = match.group('name', 'value')
        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        self._listener.handle_decl(name, value.strip(), data)
        return False

    def _lex_pi(self):
        """
        Processing instruction lexer

        State: We've hit a ``<?`` and now peek inside

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('?>', 2)
        if pos == -1:
            return True
        pos += 2

        self._buffer, data = data[pos:], data[:pos]

        self._state = self.TEXT
        self._listener.handle_pi(data)
        return False

    def _lex_empty(self):
        """
        Empty tag lexer

        State: We've hit a ``<>``

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        self._buffer, data = self._buffer[2:], self._buffer[:2]

        self._state = self.TEXT
        self._listener.handle_starttag('', [], False, data)
        return False

    def _lex_final(self):
        """
        Called after the lexer was finalized

        State: after all

        :Exceptions:
          - `LexerFinalizedError` : The lexer was already finalized
            (raised always)
        """
        raise LexerFinalizedError("The lexer was already finalized")


_LEXERS = []
_STATES = []
for _idx, (_statename, _funcname) in enumerate([
        ('FINAL', '_lex_final'),
        ('TEXT', '_lex_text'),
        ('CDATA', '_lex_cdata'),
        ('MARKUP', '_lex_markup'),
        ('STARTTAG', '_lex_start'),
        ('ENDTAG', '_lex_end'),
        ('COMMENT', '_lex_comment'),
        ('MSECTION', '_lex_msection'),
        ('DECL', '_lex_decl'),
        ('PI', '_lex_pi'),
        ('EMPTY', '_lex_empty'),
        ]):
    setattr(SoupLexer, _statename, _idx)
    _LEXERS.append(_funcname)
    _STATES.append(_statename)

SoupLexer._LEXERS = tuple(_LEXERS)
SoupLexer._STATES = tuple(_STATES)
del _idx, _statename, _funcname
del _LEXERS, _STATES


from ... import c
c = c.load('impl')
if c is not None:
    DEFAULT_LEXER = c.SoupLexer
else:
    DEFAULT_LEXER = SoupLexer
del c


class SoupParser(object):
    """
    =========================
     (X)HTML Tag Soup Parser
    =========================

    Overview
    ~~~~~~~~

    The parser is actually a tagsoup parser by design, in order to process
    most of the "HTML" that can be found out there. Of course, well-formed
    and valid HTML parses best. Only as much HTML syntax is applied as is
    necessary to parse the input. You can influence these syntax definitions
    by picking another lexer. You can change the semantics by picking
    another DTD query class.

    This parser guarantees that an endtag event is generated for each
    starttag event that is not self-closing (if the endtag is not actually
    there, the data parameter is an empty string). This also happens for
    empty tags (like ``br``). On the other hand, there may be more endtag
    events than starttag events, because of unbalanced or wrongly nested
    tags.

    Special constructs (comments, PIs, marked sections and declarations)
    may occur anywhere, i.e. they do not close elements implicitly.

    The default lexer does not deal with NET tags (``<h1/Heading/``).
    Neither does it handle unfinished starttags by SGML rules like
    ``<map<area>``. It *does* know about empty tags (``<>`` and ``</>``).

    CDATA elements and comments are handled in a simplified way: once the
    particular state is entered, it is only left when the accompanying end
    marker is found (``<script>...</script>``, ``<!-- ... -->``). Anything
    in between is text.

    How is it used?
    ~~~~~~~~~~~~~~~

    The parser API is "streamy" on the input side and event-based on the
    output side. So, what you need first is a building listener, which will
    receive all generated parser events and process them. Such a listener
    object is expected to implement the `BuildingListenerInterface`.

    Now you create a `SoupParser` instance, pass the listener object to the
    constructor, and the parser is ready to be fed. You can feed as many
    chunks of input data as you like into the parser by using the `feed`
    method. Every feed call may generate multiple events on the output side.
    When you're done feeding, call the parser's `finalize` method in order
    to clean up. This also flushes pending events to the listener.
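
    For example (a sketch; ``listener`` is your `BuildingListenerInterface`
    implementation, ``stream`` any object with a ``read`` method)::

        parser = SoupParser.html(listener)
        while True:
            chunk = stream.read(8192)
            if not chunk:
                break
            parser.feed(chunk)
        parser.finalize()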

    :IVariables:
      `listener` : `BuildingListenerInterface`
        The building listener to send the events to

      `_lexer` : `SoupLexer`
        The lexer instance

      `_tagstack` : ``list``
        The current tag stack

      `_inempty` : ``bool``
        Indicates whether the last tag on the stack is an empty one

      `_lastopen` : ``str``
        Stores the last seen open tag name
    """
    __implements__ = [
        _interfaces.ListenerInterface, _interfaces.ParserInterface
    ]

    def __init__(self, listener, dtd, lexer=None):
        """
        Initialization

        :Parameters:
          `listener` : `ListenerInterface`
            The building listener

          `dtd` : `DTDInterface`
            DTD query object

          `lexer` : ``callable``
            Lexer class/factory. This must be a callable taking an
            event listener and returning a lexer instance. If omitted or
            ``None``, the default lexer will be used (`DEFAULT_LEXER`).
        """
        self._tagstack, self._inempty, self._lastopen = [], False, ''
        self.listener = listener
        self._is_nestable = dtd.nestable
        self._is_cdata = dtd.cdata
        self._is_empty = dtd.empty
        if lexer is None:
            lexer = DEFAULT_LEXER
        self._lexer = lexer(self)
        self._normalize = listener.decoder.normalize

    @classmethod
    def html(cls, listener):
        """
        Construct a parser using the `HTMLDTD`

        :Parameters:
          `listener` : `BuildingListenerInterface`
            The building listener

        :Return: The new parser instance
        :Rtype: `SoupParser`
        """
        return cls(listener, _dtd.HTMLDTD())

    @classmethod
    def xml(cls, listener):
        """
        Construct a parser using the `XMLDTD`

        :Parameters:
          `listener` : `BuildingListenerInterface`
            The building listener

        :Return: The new parser instance
        :Rtype: `SoupParser`
        """
        return cls(listener, _dtd.XMLDTD())

    def _close_empty(self):
        """ Ensure we close the last empty tag """
        if self._inempty:
            self._inempty = False
            self.listener.handle_endtag(self._tagstack.pop()[1], '')

    def handle_text(self, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_text(data)

    def handle_starttag(self, name, attrs, closed, data):
        """ :See: `ListenerInterface` """
        self._close_empty()

        if name == '' and not attrs:
            name = self._lastopen
        else:
            self._lastopen = name
        tagstack = self._tagstack
        nestable = self._is_nestable
        starttag = self._normalize(name)
        while tagstack and not nestable(tagstack[-1][0], starttag):
            self.listener.handle_endtag(tagstack.pop()[1], '')

        if closed:
            self.listener.handle_starttag(name, attrs, closed, data)
        else:
            if self._is_cdata(starttag):
                self._lexer.cdata(self._normalize, starttag)
            self.listener.handle_starttag(name, attrs, closed, data)
            tagstack.append((starttag, name))
            if self._is_empty(starttag):
                self._inempty = True

    def handle_endtag(self, name, data):
        """ :See: `ListenerInterface` """
        tagstack = self._tagstack
        if tagstack:
            if name == '':
                name = tagstack[-1][1]
            endtag = self._normalize(name)
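            # Only unwind the stack if the tag is open at all; dict() over
            # the stack of (normalized, original) pairs gives a quick
            # membership test on the normalized names.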
            if endtag in dict(tagstack):
                toclose, original = tagstack.pop()
                self._inempty = False
                while toclose != endtag:
                    self.listener.handle_endtag(original, '')
                    toclose, original = tagstack.pop()

        self._close_empty()
        self.listener.handle_endtag(name, data)

    def handle_comment(self, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_comment(data)

    def handle_msection(self, name, value, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_msection(name, value, data)

    def handle_decl(self, name, value, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_decl(name, value, data)

    def handle_pi(self, data):
        """ :See: `ListenerInterface` """
        self._close_empty()
        self.listener.handle_pi(data)

    def handle_escape(self, escaped, data):
        """ :See: `ListenerInterface` """
        # The soup lexer never emits escape events.
        raise AssertionError()

    def feed(self, food):
        """ :See: `ParserInterface` """
        self._lexer.feed(food)

    def finalize(self):
        """
        :See: `ParserInterface`

        :Exceptions:
          - `LexerEOFError` : EOF in the middle of a state
        """
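        # The right-hand tuple is evaluated first: the lexer is finalized
        # and the reference dropped in one step, so a repeated finalize()
        # call does not touch the lexer again.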
        if self._lexer is not None:
            self._lexer, _ = None, self._lexer.finalize()

        tagstack = self._tagstack
        while tagstack:
            self.listener.handle_endtag(tagstack.pop()[1], '')


from ... import c
c = c.load('impl')
if c is not None:
    DEFAULT_PARSER = c.SoupParser
else:
    DEFAULT_PARSER = SoupParser
del c