# Source code for lexor.core.parser

"""Parser Module

Provides the `Parser` object which defines the basic mechanism for
parsing character sequences. This involves using objects derived from
the abstract class `NodeParser`.

"""

import re
import sys
from lexor.command import config
from lexor.command.lang import get_style_module, map_explanations
LC = sys.modules['lexor.core']


class NodeParser(object):
    """Base object for node parsers used by a `Parser`.

    It exposes two hooks, `make_node` and `close`. Derived objects
    are required to overload `make_node`; `close` must be overloaded
    whenever `make_node` may return a node that is still open.

    """

    def __init__(self, parser):
        """Store the `Parser` that owns this node parser.

        `Parser` instantiates node parsers by calling them with
        itself as the only argument, so an overloaded `__init__`
        must keep accepting exactly one parameter: `parser`.

        """
        self.parser = parser

    def make_node(self):
        """Try to create a `Node` from the parser's current state.

        Derived node parsers must overload this method. Return
        `None` when no node can be created from the current
        information in the parser. Otherwise create the `Node` and
        return it in one of two ways:

        - wrapped in a list holding only that node, which tells the
          parser the node is already closed so `close` will not be
          called for it (e.g. an Element without children to parse);
        - bare, which is enough for nodes that have no children such
          as `ProcessingInstruction`, `RawText` or `Void`.

        The returned node needs a `pos` property: a list of two
        integers with the line and column where the node was found.
        The parser removes this property once it finishes all
        processing with the node.

        Raises `NotImplementedError` when not overloaded.

        """
        raise NotImplementedError(
            '%s did not implement `make_node`' % self.__class__
        )

    def close(self, _):
        """Decide whether the node made by `make_node` can be closed.

        Needs to be overloaded when `make_node` returns a bare
        `Node`; it is not called for nodes returned inside a list.
        It receives the `Node` that `make_node` returned. Return a
        list with the position where the node is being closed if it
        is time to close it, otherwise return `None`.

        Raises `NotImplementedError` when not overloaded.

        """
        raise NotImplementedError(
            '%s did not implement `close`' % self.__class__
        )

    def msg(self, code, pos, arg=None, uri=None):
        """Forward a message to the parser, tagged with the name of
        the module that defines this node parser. """
        origin = self.__module__
        self.parser.msg(origin, code, pos, arg, uri)


# The default of 7 attributes max in a class is too restrictive.
# pylint: disable=R0902
class Parser(object):
    """Parses character sequences using the node parsers of a given
    language and style.

    To see the languages that it is able to parse see the
    `lexor.lang` module.

    """

    def __init__(self, lang='xml', style='default', defaults=None):
        """Create a new parser by specifying the language and the
        style in which text will be parsed.

        `defaults` is stored as given (an empty dict when omitted)
        and handed to the style configuration when the node parsers
        are loaded.

        """
        if defaults is None:
            defaults = dict()
        self._lang = lang
        self._style = style
        self._np = None
        self._next_check = None
        self._in_progress = None
        self._uri = None
        # Node parsers are loaded lazily on the first call to `parse`.
        self._reload = True
        self.style_module = None
        self.text = None
        self.end = None
        self.pos = None
        self.caret = None
        self.doc = None
        self.log = None
        self.defaults = defaults
        self._node_parser = None

    def _set_node_parser(self, val):
        """Helper function to create a node parser and store it in
        the node parser dictionary.

        A string is treated as the name of an already registered
        node parser and that instance is returned. Anything else is
        called with this parser as its only argument and the result
        is stored under the callable's `__name__`.

        """
        if isinstance(val, str):
            return self._node_parser[val]
        name = val.__name__
        self._node_parser[name] = val(self)
        return self._node_parser[name]

    def __getitem__(self, name):
        """Return the node parser registered under `name`. """
        return self._node_parser[name]

    def _load_mapping(self, mapping):
        """Fill `_next_check` and `_np` from a style `MAPPING`.

        Each value of `mapping` is either a sequence whose first
        item is a string of characters (inserted verbatim in a regex
        character class) and whose second item is an iterable of
        node parsers, or a string naming another key of `mapping`
        whose settings are to be shared.

        Raises `KeyError` if an alias names a key that has not been
        registered with a sequence value.

        """
        aliases = []
        for key, val in mapping.items():
            if isinstance(val, str):
                # Resolve aliases after every real entry is loaded.
                aliases.append((key, val))
                continue
            self._next_check[key] = re.compile('.*?[%s]' % val[0])
            self._np[key] = [self._set_node_parser(p) for p in val[1]]
        for key, target in aliases:
            # An alias must share the checker of its target as well;
            # building one from `target[0]` would make the parser
            # look for the first letter of the target's name.
            self._next_check[key] = self._next_check[target]
            self._np[key] = self._np[target]

    def _set_node_parsers(self, lang, style, defaults=None):
        """Imports the correct module based on the language and
        style and builds the node parser tables from it.

        The style module may define `REPOSITORY` (node parsers to
        register up front) and `parser_setup` (a hook called with
        this parser); it must define `MAPPING`.

        """
        self.style_module = get_style_module('parser', lang, style)
        name = '%s-parser-%s' % (lang, style)
        config.set_style_cfg(self, name, defaults)
        self._next_check = dict()
        self._np = dict()
        self._node_parser = dict()
        if hasattr(self.style_module, 'REPOSITORY'):
            for val in self.style_module.REPOSITORY:
                self._set_node_parser(val)
        if hasattr(self.style_module, 'parser_setup'):
            self.style_module.parser_setup(self)
        self._load_mapping(self.style_module.MAPPING)

    def load_node_parsers(self):
        """Loads the node parsers. This function is called
        automatically when `parse` is called only if there was a
        change in the settings. """
        self._set_node_parsers(
            self._lang, self._style, self.defaults
        )
        self._reload = False

    def parse(self, text, uri=None):
        """Parses the given `text`. To see the results of this
        method see the `document` and `log` property.

        If no `uri` is given then a placeholder of the form
        `string@0x...` built from `id(text)` is used as the uri.
        The style module hooks `pre_process` and `post_process` are
        called, when defined, before and after the parsing.

        """
        if self._reload:
            self.load_node_parsers()
        self.text = text
        self.end = len(text)
        self.pos = [1, 1]
        self.caret = 0
        self.doc = LC.Document(self._lang)
        if uri:
            self._uri = uri
        else:
            self._uri = 'string@0x%x' % id(text)
        self.doc.uri_ = self._uri
        self.log = LC.Document("lexor", "log")
        self.log.modules = dict()
        self.log.explanation = dict()
        if hasattr(self.style_module, 'pre_process'):
            self.style_module.pre_process(self)
        self._parse()
        if hasattr(self.style_module, 'post_process'):
            self.style_module.post_process(self)
        map_explanations(self.log.modules, self.log.explanation)

    @property
    def cdata(self):
        """The character sequence data that was last processed by
        the `parse` method. You may use the attribute access `text`
        if performance is an issue. """
        return self.text

    @property
    def uri(self):
        """The Uniform Resource Identifier. This is the name that
        was given to the text that was last parsed. """
        return self._uri

    @property
    def position(self):
        """Position of caret in the text in terms of line and
        column. i.e. returns [line, column]. You may use the
        attribute access `pos` if performance is an issue. """
        return self.pos

    @property
    def caret_position(self):
        """The index in the text the parser is processing. You may
        use the attribute access `caret` if performance is an
        issue. """
        return self.caret

    @property
    def lexor_log(self):
        """The `lexor_log` document. See this document after each
        call to `parse` to see warnings and errors in the text that
        was parsed. """
        return self.log

    @property
    def document(self):
        """The parsed document. This is the `Document` created by
        the last call to the `parse` method. """
        return self.doc

    @property
    def language(self):
        """The language in which the `Parser` object will parse
        character sequences. """
        return self._lang

    @language.setter
    def language(self, value):
        """Setter function for language. Marks the node parsers to
        be reloaded on the next call to `parse`. """
        self._lang = value
        self._reload = True

    @property
    def parsing_style(self):
        """The style in which the `Parser` object will parse the
        character sequences. """
        return self._style

    @parsing_style.setter
    def parsing_style(self, value):
        """Setter function for style. Marks the node parsers to be
        reloaded on the next call to `parse`. """
        self._style = value
        self._reload = True

    def set(self, lang, style, defaults=None):
        """Set the language and style in one call. The stored
        `defaults` are only replaced when a value is given. """
        if defaults is not None:
            self.defaults = defaults
        self._lang = lang
        self._style = style
        self._reload = True

    def copy_pos(self):
        """Returns a copy of the current position. """
        return list(self.pos)

    def update(self, index):
        """Changes the position of the `caret` and updates `pos`.
        This function assumes that you are moving forward. Do not
        update to an index which is less than the current position
        of the caret. """
        if index == self.caret:
            return
        nlines = self.text.count('\n', self.caret, index)
        self.pos[0] += nlines
        if nlines > 0:
            # Column is the distance from the last newline crossed.
            self.pos[1] = index - self.text.rfind('\n', self.caret, index)
        else:
            self.pos[1] += index - self.caret
        self.caret = index

    def compute(self, index):
        """Returns a position in the text `[line, column]` given an
        index.

        Note: This does not modify anything in the parser. It only
        gives you the line and column where the caret would be given
        the index. The same applies as in update. Do not use compute
        with an index less than the current position of the caret.

        """
        nlines = self.text.count('\n', self.caret, index)
        tmpline = self.pos[0] + nlines
        if nlines > 0:
            tmpcolumn = index - self.text.rfind('\n', self.caret, index)
        else:
            tmpcolumn = self.pos[1] + index - self.caret
        return [tmpline, tmpcolumn]

    # pylint: disable=R0913
    def msg(self, mod_name, code, pos, arg=None, uri=None):
        """Provide the name of module issuing the message, the code
        number, the position of caret and optional arguments and
        uri. This information gets stored in the log.

        `uri` defaults to the uri of the text being parsed and `arg`
        to an empty tuple. The issuing module is looked up in
        `sys.modules` and recorded in `log.modules`.

        """
        if uri is None:
            uri = self._uri
        if arg is None:
            arg = ()
        node = LC.Void('msg')
        node['module'] = mod_name
        node['code'] = code
        node['position'] = list(pos)
        node['uri'] = uri
        node['arg'] = arg
        if mod_name not in self.log.modules:
            self.log.modules[mod_name] = sys.modules[mod_name]
        self.log.append_child(node)

    def _get_np(self, node):
        """Get the node parsers based on the name of the node,
        falling back to the `__default__` entry. """
        return self._np.get(node.name, self._np['__default__'])

    def _get_next_checker(self, node):
        """Get the checker based on the name of the node, falling
        back to the `__default__` entry. """
        return self._next_check.get(node.name,
                                    self._next_check['__default__'])

    def _get_next_check(self, node):
        """Locate the index where a processor might return Node. If
        there is no index then return -1."""
        match = self._get_next_checker(node).search(self.text, self.caret)
        if match is None:
            return -1
        # The pattern ends on the trigger character itself.
        return match.end(0)-1

    def _process_node(self, crt, node, processor):
        """Appends the node to crt.

        Returns `node` when it was left open (and pushed to the
        in-progress stack together with `processor`), otherwise
        `None`.

        """
        if isinstance(node, LC.Text):
            # Merge with a preceding text node instead of appending.
            if len(crt) > 0 and isinstance(crt[-1], LC.Text):
                crt[-1].data += node.data
            else:
                crt.append_child(node)
        elif isinstance(node, list):  # Empty Element
            crt.append_child(node[0])
        else:
            crt.append_child(node)
            if isinstance(node.child, list):
                self._in_progress.append((node, processor))
                return node
        return None

    def _process_text(self, crt):
        """When there is no node then we just read the text up to
        the next index where a node parser might succeed. """
        index = self._get_next_check(crt)
        if index == -1:
            # Nothing left to check: consume the rest of the text.
            content = self.text[self.caret:self.end]
            if len(crt) > 0 and isinstance(crt[-1], LC.Text):
                crt[-1].data += content
            else:
                crt.append_child(content)
            self.update(self.end)
            return
        elif index - self.caret == 0:
            # The caret sits on a trigger character that no node
            # parser accepted; consume it to guarantee progress.
            index += 1
        content = self.text[self.caret:index]
        self.update(index)
        if len(crt) > 0 and isinstance(crt[-1], LC.Text):
            crt[-1].data += content
        else:
            crt.append_child(content)

    def _close_node(self):
        """Checks and closes a node that is in self._in_progress.

        The in-progress nodes are asked, innermost first, whether
        they can be closed. When one agrees every node nested inside
        it is auto-closed with a `W100` warning. Returns the node
        that becomes current (or the document when nothing is left
        in progress), or `None` if no node was closed.

        """
        num = len(self._in_progress)
        autoclose = None
        for node, processor in reversed(self._in_progress):
            num -= 1
            autoclose = processor.close(node)
            if autoclose is not None:
                break
        if autoclose is not None:
            # Must go backwards since the list inprogress is
            # changing.
            for i in range(len(self._in_progress)-1, num, -1):
                name = self._in_progress[i][0].name
                self.msg(
                    self.__module__, 'W100', self._in_progress[i][0].pos,
                    (name, autoclose[0], autoclose[1])
                )
                del self._in_progress[i][0].pos
                del self._in_progress[i]
            # NOTE(review): `pos` of the node closed here is not
            # deleted by this method - confirm whether the node
            # parser's `close` is expected to take care of it.
            del self._in_progress[num]
            if self._in_progress:
                return self._in_progress[-1][0]
            else:
                return self.doc
        return None

    def _parse(self):
        """Main parsing function. This function depends on the node
        parsers of the language. """
        crt = self.doc
        self._in_progress = []
        while self.caret < self.end:
            tmp = self._close_node()
            if tmp is not None:
                crt = tmp
                continue
            match = False
            processor = None
            for processor in self._get_np(crt):
                node = processor.make_node()
                if node is not None:
                    match = True
                    break
                elif self.caret == self.end:
                    break
            if match is False:
                self._process_text(crt)
            elif self._process_node(crt, node, processor) is node:
                crt = node
        # Whatever is still open never found its closing string.
        for node, _ in self._in_progress:
            self.msg(self.__module__, 'E100', node.pos, [node.name])
            del node.pos
# Message templates keyed by message code. The placeholders are
# `str.format` style fields filled from the `arg` sequence given to
# `Parser.msg`: `E100` is issued by `Parser._parse` with the node name
# and `W100` by `Parser._close_node` with the node name followed by the
# first two items of the position returned by the node parser's `close`.
MSG = {
    'E100': 'closing string for `Node` of name "{0}" not found',
    'W100': 'auto-closing `Node` of name "{0}" at {1}:{2:2}',
}
# Long explanations for the messages above: the first entry describes
# `E100` and the second `W100`.
# NOTE(review): presumably consumed through `map_explanations`; the
# whitespace inside these strings looks flattened - confirm against the
# upstream file before relying on their layout.
MSG_EXPLANATION = [
    """ - The parser did not find a closing string for the given node. - This is a general error which is language dependent. Make sure to provide the required closing string for the node. The following are examples for HTML, LaTeX and Lexor: Okay: <node></node> Okay: \\begin{node}\\end{node} Okay: %%{node}%% E100: <node> E100: \\begin{node} E100: %%{node} """,
    """ - The parser was forced to automatically close the current node in progress due to the encounter of the closing sequence of a parent node. - This is a general warning which is language dependent. To get rid of this warning provide the closing sequence for the node before the closing sequence of the parent node. The following is an example in HTML: Okay: <a><p>stuff</p><p>stuff</p></a> Okay: <a><p>stuff<p>stuff</p></a> W100: <a><p>stuff</p><p>stuff</a> W100: <a><p>stuff<p>stuff</a> """,
]