
Source Code for Module tdi.markup.text.parser

# -*- coding: ascii -*-
r"""
:Copyright:

 Copyright 2012 - 2015
 Andr\xe9 Malo or his licensors, as applicable

:License:

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

     http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.

===================
 Text Parser Logic
===================

Text Parser.
"""
if __doc__:
    # pylint: disable = redefined-builtin
    __doc__ = __doc__.encode('ascii').decode('unicode_escape')
__author__ = r"Andr\xe9 Malo".encode('ascii').decode('unicode_escape')
__docformat__ = "restructuredtext en"

import re as _re

from ..._exceptions import LexerEOFError, LexerFinalizedError
from ... import interfaces as _interfaces


class TextLexer(object):
    """ Text Lexer """
    # pylint: disable = no-member

    def __init__(self, listener):
        """
        Initialization

        :Parameters:
          `listener` : `ListenerInterface`
            The event listener
        """
        self._listener = listener

        self.state = self.TEXT
        self._lexers = [getattr(self, name) for name in self._LEXERS]
        self._buffer = ''

    def feed(self, food):
        """
        Feed the lexer with new data

        :Parameters:
          `food` : ``str``
            The data to process
        """
        self._buffer += food
        self._lex()

    def finalize(self):
        """
        Finalize the lexer

        This processes the rest buffer (if any)

        :Exceptions:
          - `LexerEOFError` : The rest buffer could not be consumed
        """
        self._lex()
        if self._buffer:
            raise LexerEOFError(
                "Unfinished parser state %s" % self._STATES[self.state]
            )

        self.state = self.FINAL

    def _lex(self):
        """ Parse the current buffer """
        while self._buffer:
            if self._lexers[self.state]():
                break
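
    # Illustrative sketch (added annotation, not part of the original
    # module): each state method returns True while it still needs more
    # input, so `_lex` stops and waits for the next `feed` call. With a
    # hypothetical ``listener``, incremental feeding replays cleanly
    # across chunk boundaries:
    #
    #   lexer = TextLexer(listener)
    #   lexer.feed('[greeting]Hel')  # starttag and text 'Hel' are emitted
    #   lexer.feed('lo[/greet')      # text 'lo' emitted; '[/greet' buffered
    #   lexer.feed('ing]')           # '[/greeting]' is now complete
    #   lexer.finalize()             # raises LexerEOFError on leftovers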

    def _lex_text(self):
        """
        Text lexer

        State: We are between tags or at the very beginning of the document
        and look for a ``[``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('[')
        if pos == 0:
            self.state = self.MARKUP
            return False
        elif pos == -1:
            self._buffer = ''
        else:
            self._buffer, data = data[pos:], data[:pos]
            self.state = self.MARKUP

        self._listener.handle_text(data)
        return False

    def _lex_markup(self):
        """
        Markup lexer

        State: We've hit a ``[`` character and now find out what it's
        becoming

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 2:
            return True

        char = data[1]
        if char == '/':
            state = self.ENDTAG
        elif char == '#':
            state = self.COMMENT
        elif char == '?':
            state = self.PI
        elif char == ']':
            state = self.TEXT
            self._listener.handle_escape(data[0], data[:2])
            self._buffer = data[2:]
        else:
            state = self.STARTTAG

        self.state = state
        return False
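
    # Summary of the dispatch above (added annotation, not original code):
    #
    #   '[/'  -> ENDTAG    e.g. '[/name]'
    #   '[#'  -> COMMENT   e.g. '[# note #]'
    #   '[?'  -> PI        e.g. '[? instruction ?]'
    #   '[]'  -> TEXT      escape; emits handle_escape('[', '[]')
    #   else  -> STARTTAG  e.g. '[name]' or the doubled form '[[name]]'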

    #: Regex matcher for a start tag
    #:
    #: :Type: ``callable``
    _START_MATCH = _re.compile(r'''
        \[
        (
            [^\\"'\[\]]*
            (?:
                (?:
                    "[^\\"]*(?:\\.[^\\"]*)*"
                  | '[^\\']*(?:\\.[^\\']*)*'
                )
                [^\\"'\[\]]*
            )*
        )
        \]
    ''', _re.X | _re.S).match

    #: Regex matcher for an empty start tag
    #:
    #: :Type: ``callable``
    _EMPTY_START_MATCH = _re.compile(r'''
        \[
        (
            \[
            [^\\"'\[\]]*
            (?:
                (?:
                    "[^\\"]*(?:\\.[^\\"]*)*"
                  | '[^\\']*(?:\\.[^\\']*)*'
                )
                [^\\"'\[\]]*
            )*
            \]
        )
        \]
    ''', _re.X | _re.S).match

    #: Regex iterator for extracting start tag attributes
    #:
    #: :Type: ``callable``
    _ATT_ITER = _re.compile(r'''
        \s*
        (?P<name>[^\s=\]]*)            # attribute name
        \s*
        (?:
            =
            (?P<value>                 # optional value
                \s* "[^\\"]*(?:\\.[^\\"]*)*"
              | \s* '[^\\']*(?:\\.[^\\']*)*'
              | [^\\\s\]]*
            )
        )?
    ''', _re.X | _re.S).finditer
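
    # Examples of what these patterns accept (added annotation, not
    # original code):
    #
    #   _START_MATCH('[node attr="v"]')        group(1) == 'node attr="v"'
    #   _EMPTY_START_MATCH('[[node]]')         group(1) == '[node]'
    #   _ATT_ITER('attr="v" flag')             yields ('attr', '"v"'),
    #                                          then ('flag', None)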

    def _lex_start(self):
        """
        Starttag lexer

        State: We've hit a ``[tag`` and now look for the ``]``

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        match = self._EMPTY_START_MATCH(data) or self._START_MATCH(data)
        if match is None:
            return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        attrstring = match.group(1)
        quoted = attrstring.startswith('[')
        if quoted:
            attrstring = attrstring[1:-1]

        splitted = attrstring.split(None, 1)
        if not splitted:
            self._listener.handle_text(data)
            self.state = self.TEXT
            return False
        name = splitted[0]
        if '=' in name:
            name = ''
        elif len(splitted) == 1:
            attrstring = None
        else:
            attrstring = splitted[1]

        attr = []
        if attrstring:
            for match in self._ATT_ITER(attrstring):
                key, value = match.group('name', 'value')
                if key or value is not None:
                    if value:
                        value = value.strip()
                    attr.append((key.strip(), value))
                else:  # bug in Python < 2.3.5 (fixed in rev 37262)
                    break

        self.state = self.TEXT
        self._listener.handle_starttag(name, attr, quoted, data)
        return False
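
    # Event sketch for the method above (added annotation, not original
    # code): feeding '[node attr="v"]' emits
    #
    #   handle_starttag('node', [('attr', '"v"')], False, '[node attr="v"]')
    #
    # while the doubled form '[[node]]' emits
    #
    #   handle_starttag('node', [], True, '[[node]]')
    #
    # Attribute values keep their surrounding quotes here; decoding them
    # is left to the listener.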

    def _lex_end(self):
        """
        Endtag lexer

        State: We've hit ``[/``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find(']') + 1
        if pos == 0:
            return True

        self._buffer, data = data[pos:], data[:pos]
        name = data[2:-1].strip()

        self.state = self.TEXT
        self._listener.handle_endtag(name, data)
        return False

    #: Regex searcher for finding the end of a comment
    #:
    #: :Type: ``callable``
    _COMMENT_SEARCH = _re.compile(r'#\]').search

    def _lex_comment(self):
        """
        Comment lexer

        State: We've hit ``[#``.

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        if len(data) < 4:
            return True

        match = self._COMMENT_SEARCH(data, 2)
        if match is None:
            return True

        pos = match.end()
        self._buffer, data = data[pos:], data[:pos]

        self.state = self.TEXT
        self._listener.handle_comment(data)
        return False

    def _lex_pi(self):
        """
        Processing instruction lexer

        State: We've hit a ``[?`` and now peek inside

        :Return: Unfinished state?
        :Rtype: ``bool``
        """
        data = self._buffer
        pos = data.find('?]', 2)
        if pos == -1:
            return True
        pos += 2

        self._buffer, data = data[pos:], data[:pos]

        self.state = self.TEXT
        self._listener.handle_pi(data)
        return False

    def _lex_final(self):
        """
        Called after the lexer was finalized

        State: after all

        :Exceptions:
          - `LexerFinalizedError` : The lexer was already finalized
            (raised always)
        """
        raise LexerFinalizedError("The lexer was already finalized")


_LEXERS = []
_STATES = []
for _idx, (_statename, _funcname) in enumerate([
        # pylint: disable = bad-whitespace

        ('FINAL', '_lex_final'),
        ('TEXT', '_lex_text'),
        ('MARKUP', '_lex_markup'),
        ('STARTTAG', '_lex_start'),
        ('ENDTAG', '_lex_end'),
        ('PI', '_lex_pi'),
        ('COMMENT', '_lex_comment'),
]):  # noqa
    setattr(TextLexer, _statename, _idx)
    _LEXERS.append(_funcname)
    _STATES.append(_statename)

TextLexer._LEXERS = tuple(_LEXERS)  # pylint: disable = protected-access
TextLexer._STATES = tuple(_STATES)  # pylint: disable = protected-access
del _idx, _statename, _funcname  # pylint: disable = undefined-loop-variable
del _LEXERS, _STATES
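
# Minimal usage sketch (illustrative only; ``DebugListener`` is a
# hypothetical stand-in for a real `ListenerInterface` implementation,
# not part of this module):
#
#   class DebugListener(object):
#       def handle_text(self, data):
#           print('text     %r' % data)
#       def handle_escape(self, escaped, data):
#           print('escape   %r' % escaped)
#       def handle_starttag(self, name, attrs, closed, data):
#           print('starttag %r %r' % (name, attrs))
#       def handle_endtag(self, name, data):
#           print('endtag   %r' % name)
#       def handle_comment(self, data):
#           print('comment  %r' % data)
#       def handle_pi(self, data):
#           print('pi       %r' % data)
#
#   lexer = TextLexer(DebugListener())
#   lexer.feed('[# header #][node]hi[]![/node]')
#   lexer.finalize()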


class TextParser(object):
    """ Text Parser """
    __implements__ = [
        _interfaces.ListenerInterface, _interfaces.ParserInterface
    ]

    def __init__(self, listener, lexer=TextLexer):
        """
        Initialization

        :Parameters:
          `listener` : `BuildingListenerInterface`
            The building listener

          `lexer` : ``callable``
            Lexer class/factory. This must be a callable taking an
            event listener and returning a lexer instance
        """
        self._tagstack = []
        self.listener = listener
        self._lexer = lexer(self)
        self._normalize = self.listener.decoder.normalize

    #########################################################################
    # ListenerInterface #####################################################
    #########################################################################

    def handle_text(self, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_text(data)

    def handle_escape(self, escaped, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_escape(escaped, data)

    def handle_starttag(self, name, attrs, closed, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_starttag(name, attrs, closed, data)
        if not closed:
            self._tagstack.append((self._normalize(name), name))

    def handle_endtag(self, name, data):
        """ :See: `ListenerInterface` """
        tagstack = self._tagstack
        if tagstack:
            if name == '':
                name = tagstack[-1][1]
            endtag = self._normalize(name)
            if endtag in dict(tagstack):
                toclose, original = tagstack.pop()
                while toclose != endtag:
                    self.listener.handle_endtag(original, '')
                    toclose, original = tagstack.pop()
        self.listener.handle_endtag(name, data)
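
    # Behavior sketch (added annotation, not original code): given the
    # input '[a][b][c][/a]', the two unclosed inner tags are closed
    # implicitly before '[/a]' is forwarded, yielding the endtag events
    #
    #   handle_endtag('c', ''), handle_endtag('b', ''),
    #   handle_endtag('a', '[/a]')
    #
    # An empty endtag '[/]' closes the most recently opened tag.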

    def handle_comment(self, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_comment(data)

    def handle_pi(self, data):
        """ :See: `ListenerInterface` """
        self.listener.handle_pi(data)

    def handle_msection(self, name, value, data):
        """ :See: `ListenerInterface` """
        # pylint: disable = unused-argument

        raise AssertionError()

    def handle_decl(self, name, value, data):
        """ :See: `ListenerInterface` """
        # pylint: disable = unused-argument

        raise AssertionError()

    #########################################################################
    # ParserInterface #######################################################
    #########################################################################

    def feed(self, food):
        """ :See: `ParserInterface` """
        self._lexer.feed(food)

    def finalize(self):
        """
        :See: `ParserInterface`

        :Exceptions:
          - `LexerEOFError` : EOF in the middle of a state
        """
        if self._lexer is not None:
            self._lexer, _ = None, self._lexer.finalize()  # noqa

        tagstack = self._tagstack
        while tagstack:
            self.listener.handle_endtag(tagstack.pop()[1], '')
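
# Minimal end-to-end sketch (illustrative only): `TextParser` expects a
# `BuildingListenerInterface` object whose ``decoder.normalize`` attribute
# is a callable (e.g. ``str.lower``); ``builder`` below is such a
# hypothetical listener, not part of this module:
#
#   parser = TextParser(builder)
#   parser.feed('[node]content')
#   parser.finalize()  # closes the still-open 'node' tag automatically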