# -*- coding: ascii -*-
r"""
:Copyright:

 Copyright 2012 - 2015
 Andr\xe9 Malo or his licensors, as applicable

:License:

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.

===================
 Text Parser Logic
===================

Text Parser.
"""
# The source file is kept pure ASCII: non-ASCII characters are written
# as backslash escapes inside raw strings (r"Andr\xe9") and decoded to
# real unicode here via the unicode_escape codec.
if __doc__:
    # pylint: disable = redefined-builtin
    __doc__ = __doc__.encode('ascii').decode('unicode_escape')
__author__ = r"Andr\xe9 Malo".encode('ascii').decode('unicode_escape')
__docformat__ = "restructuredtext en"
33
34 import re as _re
35
36 from ..._exceptions import LexerEOFError, LexerFinalizedError
37 from ... import interfaces as _interfaces
38
39
class TextLexer(object):
    """
    Text Lexer

    A small state machine: ``state`` indexes into ``_lexers`` and each
    state handler consumes as much of ``_buffer`` as it can, either
    switching to a follow-up state or reporting that it needs more
    input. The state constants (``TEXT``, ``MARKUP``, ``STARTTAG``,
    ``ENDTAG``, ``PI``, ``COMMENT``, ``FINAL``) together with the
    ``_LEXERS`` and ``_STATES`` tuples are attached to the class right
    after the class definition.
    """

    def __init__(self, listener):
        """
        Initialization

        :Parameters:
          `listener` : `ListenerInterface`
            The event listener
        """
        self._listener = listener

        # Start in text mode; bind one handler method per state,
        # indexed by the state constant.
        self.state = self.TEXT
        self._lexers = [getattr(self, name) for name in self._LEXERS]
        self._buffer = ''

    def feed(self, food):
        """
        Feed the lexer with new data

        :Parameters:
          `food` : ``str``
            The data to process
        """
        self._buffer += food
        self._lex()

    def finalize(self):
        """
        Finalize the lexer

        This processes the rest buffer (if any)

        :Exceptions:
          - `LexerEOFError` : The rest buffer could not be consumed
        """
        self._lex()
        if self._buffer:
            raise LexerEOFError(
                "Unfinished parser state %s" % self._STATES[self.state]
            )

        self.state = self.FINAL

    def _lex(self):
        """ Parse the current buffer """
        # A handler returning True means: not enough data yet -> stop
        # and wait for the next feed() call.
        while self._buffer:
            if self._lexers[self.state]():
                break

    def _lex_text(self):
        """
        Text lexer

        State: We are between tags or at the very beginning of the
        document and look for a ``[``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('[')
        if pos == 0:
            # Buffer starts with markup - nothing to emit here.
            self.state = self.MARKUP
            return False
        elif pos == -1:
            # Pure text - consume the whole buffer.
            self._buffer = ''
        else:
            # Emit the text before the bracket, keep the rest.
            self._buffer, data = data[pos:], data[:pos]
            self.state = self.MARKUP

        self._listener.handle_text(data)
        return False

    def _lex_markup(self):
        """
        Markup lexer

        State: We've hit a ``[`` character and now find out, what it's
        becoming

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 2:
            # Need at least the character after the bracket.
            return True

        char = data[1]
        if char == '/':
            state = self.ENDTAG
        elif char == '#':
            state = self.COMMENT
        elif char == '?':
            state = self.PI
        elif char == ']':
            # "[]" is an escaped bracket - emit it right away.
            state = self.TEXT
            self._listener.handle_escape(data[0], data[:2])
            self._buffer = data[2:]
        else:
            state = self.STARTTAG

        self.state = state
        return False

    #: Matches a complete start tag; group 1 is the content between
    #: the brackets (quoted strings may contain brackets)
    #:
    #: :Type: ``callable``
    _START_MATCH = _re.compile(r'''
        \[
        (
            [^\\"'\[\]]*
            (?:
                (?:
                    "[^\\"]*(?:\\.[^\\"]*)*"
                  | '[^\\']*(?:\\.[^\\']*)*'
                )
                [^\\"'\[\]]*
            )*
        )
        \]
    ''', _re.X | _re.S).match

    #: Matches a complete empty (self-closed) start tag ``[[...]]``;
    #: group 1 is the inner ``[...]`` including the inner brackets
    #:
    #: :Type: ``callable``
    _EMPTY_START_MATCH = _re.compile(r'''
        \[
        (
            \[
            [^\\"'\[\]]*
            (?:
                (?:
                    "[^\\"]*(?:\\.[^\\"]*)*"
                  | '[^\\']*(?:\\.[^\\']*)*'
                )
                [^\\"'\[\]]*
            )*
            \]
        )
        \]
    ''', _re.X | _re.S).match

    #: Iterates over the attributes of a start tag
    #:
    #: :Type: ``callable``
    _ATT_ITER = _re.compile(r'''
        \s*
        (?P<name>[^\s=\]]*)             # attribute name
        \s*
        (?:
            =
            (?P<value>                  # optional value
                \s* "[^\\"]*(?:\\.[^\\"]*)*"
              | \s* '[^\\']*(?:\\.[^\\']*)*'
              | [^\\\s\]]*
            )
        )?
    ''', _re.X | _re.S).finditer

    def _lex_start(self):
        """
        Starttag lexer

        State: We've hit a ``[tag`` and now look for the ``]``

        :Return: Unfinished State?
        :Rtype: ``bool``
        """
        data = self._buffer
        match = self._EMPTY_START_MATCH(data) or self._START_MATCH(data)
        if match is None:
            # Closing bracket not yet in the buffer.
            return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        attrstring = match.group(1)
        quoted = attrstring.startswith('[')
        if quoted:
            # [[...]] tag -> strip the inner brackets.
            attrstring = attrstring[1:-1]

        splitted = attrstring.split(None, 1)
        if not splitted:
            # Nothing between the brackets -> treat as plain text.
            self._listener.handle_text(data)
            self.state = self.TEXT
            return False
        name = splitted[0]
        if '=' in name:
            # First token is already an attribute -> anonymous tag;
            # the whole attrstring is parsed for attributes below.
            name = ''
        elif len(splitted) == 1:
            attrstring = None
        else:
            attrstring = splitted[1]

        attr = []
        if attrstring:
            for match in self._ATT_ITER(attrstring):
                key, value = match.group('name', 'value')
                if key or value is not None:
                    if value:
                        value = value.strip()
                    attr.append((key.strip(), value))
                else:
                    # Empty match -> no more attributes.
                    break

        self.state = self.TEXT
        self._listener.handle_starttag(name, attr, quoted, data)
        return False

    def _lex_end(self):
        """
        Endtag lexer

        State: We've hit ``[/``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find(']') + 1
        if pos == 0:
            # No closing bracket yet.
            return True

        self._buffer, data = data[pos:], data[:pos]
        # Everything between "[/" and "]" is the tag name.
        name = data[2:-1].strip()

        self.state = self.TEXT
        self._listener.handle_endtag(name, data)
        return False

    #: Finds the comment end marker (``#]``)
    #:
    #: :Type: ``callable``
    _COMMENT_SEARCH = _re.compile(r'#\]').search

    def _lex_comment(self):
        """
        Comment lexer

        State: We've hit ``[#``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 4:
            # Shortest possible comment is "[##]" (4 characters).
            return True

        match = self._COMMENT_SEARCH(data, 2)
        if match is None:
            # End marker not yet in the buffer.
            return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        self.state = self.TEXT
        self._listener.handle_comment(data)
        return False

    def _lex_pi(self):
        """
        Processing instruction lexer

        State: We've hit a ``[?`` and now peek inside

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('?]', 2)
        if pos == -1:
            # End marker not yet in the buffer.
            return True
        pos += 2

        self._buffer, data = data[pos:], data[:pos]

        self.state = self.TEXT
        self._listener.handle_pi(data)
        return False

    def _lex_final(self):
        """
        Called after the lexer was finalized

        State: after all

        :Exceptions:
          - `LexerFinalizedError` : The lexer was already finalized
            (raised always)
        """
        raise LexerFinalizedError("The lexer was already finalized")
334
# Attach the state machine tables to TextLexer. Order matters: the
# position of each entry is the numeric value of the state constant.
_STATE_TABLE = (
    ('FINAL', '_lex_final'),
    ('TEXT', '_lex_text'),
    ('MARKUP', '_lex_markup'),
    ('STARTTAG', '_lex_start'),
    ('ENDTAG', '_lex_end'),
    ('PI', '_lex_pi'),
    ('COMMENT', '_lex_comment'),
)
for _idx, (_statename, _funcname) in enumerate(_STATE_TABLE):
    # e.g. TextLexer.TEXT = 1
    setattr(TextLexer, _statename, _idx)

# Handler method names by state number / state names for error messages
TextLexer._LEXERS = tuple(_funcname for _, _funcname in _STATE_TABLE)
TextLexer._STATES = tuple(_statename for _statename, _ in _STATE_TABLE)
del _idx, _statename, _funcname
del _STATE_TABLE
356
357
class TextParser(object):
    """
    Text Parser

    Listens to a `TextLexer` and forwards the events to a building
    listener, maintaining a stack of open tags so that tags left open
    are auto-closed when their enclosing tag (or the whole document)
    is closed.
    """
    __implements__ = [
        _interfaces.ListenerInterface, _interfaces.ParserInterface
    ]

    def __init__(self, listener, lexer=TextLexer):
        """
        Initialization

        :Parameters:
          `listener` : `BuildingListenerInterface`
            The building listener

          `lexer` : ``callable``
            Lexer class/factory. This must be a callable taking an
            event listener and returning a lexer instance
        """
        self._tagstack = []
        self.listener = listener
        self._lexer = lexer(self)
        self._normalize = self.listener.decoder.normalize

    #########################################################
    # ListenerInterface
    #########################################################

    def handle_text(self, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_text(data)

    def handle_escape(self, escaped, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_escape(escaped, data)

    def handle_starttag(self, name, attrs, closed, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_starttag(name, attrs, closed, data)
        if not closed:
            # Remember (normalized, original) so end tags can be
            # matched against the normalized form later.
            self._tagstack.append((self._normalize(name), name))

    def handle_endtag(self, name, data):
        """ :See: `ListenerInterface` """
        tagstack = self._tagstack
        if tagstack:
            if name == '':
                # Empty end tag ([/]) closes the innermost open tag.
                name = tagstack[-1][1]
            endtag = self._normalize(name)
            if endtag in dict(tagstack):
                # Auto-close tags left open inside the one being
                # closed. The stack holds normalized names, so the
                # comparison must use the normalized `endtag`, not the
                # raw `name` (which may differ in spelling).
                toclose, original = tagstack.pop()
                while toclose != endtag:
                    self.listener.handle_endtag(original, '')
                    toclose, original = tagstack.pop()
        self.listener.handle_endtag(name, data)

    def handle_comment(self, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_comment(data)

    def handle_pi(self, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_pi(data)

    def handle_msection(self, name, value, data):
        """ :See: `ListenerInterface` """
        # Text templates have no marked sections; the text lexer never
        # emits this event.
        raise AssertionError()

    def handle_decl(self, name, value, data):
        """ :See: `ListenerInterface` """
        # Text templates have no declarations; the text lexer never
        # emits this event.
        raise AssertionError()

    #########################################################
    # ParserInterface
    #########################################################

    def feed(self, food):
        """ :See: `ParserInterface` """
        self._lexer.feed(food)

    def finalize(self):
        """
        :See: `ParserInterface`

        :Exceptions:
          - `LexerEOFError` : EOF in the middle of a state
        """
        if self._lexer is not None:
            # The right-hand side is evaluated before the assignment,
            # so the lexer reference is only dropped if finalize()
            # succeeded.
            self._lexer, _ = None, self._lexer.finalize()

        tagstack = self._tagstack
        while tagstack:
            # Auto-close anything still open at EOF.
            self.listener.handle_endtag(tagstack.pop()[1], '')
454