"""Parser Module
Provides the `Parser` object which defines the basic mechanism for
parsing character sequences. This involves using objects derived from
the abstract class `NodeParser`.
"""
import re
import sys
from lexor.command import config
from lexor.command.lang import get_style_module, map_explanations
# Bind the `lexor.core` module through `sys.modules` instead of using an
# import statement; this lookup raises `KeyError` when `lexor.core` has
# not been imported yet.
# NOTE(review): presumably done to avoid a circular import between this
# module and `lexor.core` -- confirm.
LC = sys.modules['lexor.core']
class NodeParser(object):
    """Base class for the objects a `Parser` uses to create nodes.

    A node parser provides two hooks: `make_node` and `close`.
    Derived classes are required to overload `make_node`; `close`
    has to be overloaded as well whenever `make_node` returns a node
    that is not wrapped in a list.
    """

    def __init__(self, parser):
        """Store the `Parser` that owns this node parser.

        `Parser` instantiates node parsers by calling the class with
        itself as the only argument, so an overloaded `__init__`
        must keep accepting exactly one parameter: `parser`.
        """
        self.parser = parser

    def make_node(self):
        """Try to build a node from the current state of the parser.

        Derived node parsers are required to overload this method.
        The overloaded method returns:

        - `None` when no node can be created from the current
          information in the parser.
        - A list holding a single `Node` when the node is already
          complete (for instance an Element with no children left
          to parse). The parser then never calls `close` for it.
        - The `Node` itself otherwise. Nodes that cannot have
          children, say `ProcessingInstruction`, `RawText` or
          `Void`, do not need to be wrapped in a list.

        The returned `Node` must carry the attribute `pos`: a list
        of two integers with the line and column where the node was
        encountered in the text being parsed. The parser removes
        this attribute once it is done processing the node.

        This base implementation always raises
        `NotImplementedError`.
        """
        raise NotImplementedError(
            '%s did not implement `make_node`' % (self.__class__,)
        )

    def close(self, _):
        """Decide whether the node returned by `make_node` can close.

        The parser passes in the `Node` that `make_node` returned
        (never one that was returned inside a `list`). An overloaded
        method returns a list with the position where the `Node` is
        being closed when it is time to close it, and `None`
        otherwise.

        This base implementation always raises
        `NotImplementedError`.
        """
        raise NotImplementedError(
            '%s did not implement `close`' % (self.__class__,)
        )

    def msg(self, code, pos, arg=None, uri=None):
        """Forward a message to the parser, tagging it with the name
        of the module in which this node parser class is defined."""
        origin = self.__module__
        self.parser.msg(origin, code, pos, arg, uri)
# The default of 7 attributes max in a class is too restrictive.
# pylint: disable=R0902
class Parser(object):
    """Parse character sequences with the node parsers of a language.

    To see the languages that it is able to parse see the
    `lexor.lang` module. After calling `parse` the results are
    available through the `document` and `lexor_log` properties.
    """

    def __init__(self, lang='xml', style='default', defaults=None):
        """Create a new parser by specifying the language and the
        style in which text will be parsed.

        `defaults` is stored as given (an empty dictionary when it is
        omitted) and is passed to `config.set_style_cfg` when the
        node parsers are loaded.
        """
        if defaults is None:
            defaults = dict()
        self._lang = lang
        self._style = style
        # Maps a node name to the list of node parsers to try.
        self._np = None
        # Maps a node name to the compiled "next check" regex.
        self._next_check = None
        # Stack of (node, node parser) pairs still waiting to close.
        self._in_progress = None
        self._uri = None
        # Node parsers are (re)loaded lazily by `parse`.
        self._reload = True
        self.style_module = None
        self.text = None
        self.end = None
        self.pos = None
        self.caret = None
        self.doc = None
        self.log = None
        self.defaults = defaults
        # Repository of node parser instances keyed by class name.
        self._node_parser = None

    def _set_node_parser(self, val):
        """Return the node parser instance designated by `val`.

        If `val` is a string it is looked up in the repository of
        node parsers already created. Otherwise `val` is treated as
        a node parser class: it is instantiated with this parser,
        stored in the repository under the class name and returned.
        """
        if isinstance(val, str):
            return self._node_parser[val]
        name = val.__name__
        self._node_parser[name] = val(self)
        return self._node_parser[name]

    def __getitem__(self, name):
        """Return the node parser stored under `name`. """
        return self._node_parser[name]

    def _set_node_parsers(self, lang, style, defaults=None):
        """Import the style module for `lang` and `style` and build
        the node parser tables from it.

        The module attributes `REPOSITORY` and `parser_setup` are
        optional; `MAPPING` is always read.
        """
        self.style_module = get_style_module('parser', lang, style)
        name = '%s-parser-%s' % (lang, style)
        config.set_style_cfg(self, name, defaults)
        self._next_check = dict()
        self._np = dict()
        self._node_parser = dict()
        if hasattr(self.style_module, 'REPOSITORY'):
            for val in self.style_module.REPOSITORY:
                self._set_node_parser(val)
        if hasattr(self.style_module, 'parser_setup'):
            self.style_module.parser_setup(self)
        self._load_mapping(self.style_module.MAPPING)

    def _load_mapping(self, mapping):
        """Fill `_next_check` and `_np` from a style `MAPPING`.

        Each value of `mapping` is either a sequence whose first item
        holds the characters placed inside the regex character class
        and whose second item lists the node parsers, or a string
        naming another key whose entries are to be shared.

        Raises `KeyError` if a string value names a key that has no
        entry of its own.
        """
        aliases = list()
        # `items` is available on both Python 2 and 3 dictionaries.
        for key, val in mapping.items():
            if isinstance(val, str):
                # Resolve aliases after every real entry exists.
                aliases.append((key, val))
                continue
            self._next_check[key] = re.compile('.*?[%s]' % val[0])
            self._np[key] = [self._set_node_parser(p) for p in val[1]]
        for key, target in aliases:
            # An alias shares the checker as well as the node parsers
            # of its target; building a checker from the target's
            # name would search for the wrong characters.
            self._next_check[key] = self._next_check[target]
            self._np[key] = self._np[target]

    def load_node_parsers(self):
        """Loads the node parsers. This function is called
        automatically when `parse` is called only if there was a
        change in the settings. """
        self._set_node_parsers(
            self._lang, self._style, self.defaults
        )
        self._reload = False

    def parse(self, text, uri=None):
        """Parse the given `text`.

        The results are stored in the `document` and `lexor_log`
        properties. If no `uri` is given (or it is empty) the name
        `'string@0x...'` built from `id(text)` is used instead.

        The style module hooks `pre_process` and `post_process` are
        called, when present, right before and right after the main
        parsing loop.
        """
        if self._reload:
            self.load_node_parsers()
        self.text = text
        self.end = len(text)
        self.pos = [1, 1]
        self.caret = 0
        self.doc = LC.Document(self._lang)
        if uri:
            self._uri = uri
        else:
            self._uri = 'string@0x%x' % id(text)
        self.doc.uri_ = self._uri
        self.log = LC.Document("lexor", "log")
        self.log.modules = dict()
        self.log.explanation = dict()
        if hasattr(self.style_module, 'pre_process'):
            self.style_module.pre_process(self)
        self._parse()
        if hasattr(self.style_module, 'post_process'):
            self.style_module.post_process(self)
        map_explanations(self.log.modules, self.log.explanation)

    @property
    def cdata(self):
        """The character sequence data that was last processed by the
        `parse` method. You may use the attribute access `text` if
        performance is an issue. """
        return self.text

    @property
    def uri(self):
        """The Uniform Resource Identifier. This is the name that was
        given to the text that was last parsed. """
        return self._uri

    @property
    def position(self):
        """Position of caret in the text in terms of line and column.
        i.e. returns [line, column]. This is the parser's own list,
        not a copy (see `copy_pos`). You may use the attribute access
        `pos` if performance is an issue. """
        return self.pos

    @property
    def caret_position(self):
        """The index in the text the parser is processing. You may use
        the attribute access `caret` if performance is an issue. """
        return self.caret

    @property
    def lexor_log(self):
        """The `lexor_log` document. See this document after each
        call to `parse` to see warnings and errors in the text that
        was parsed. """
        return self.log

    @property
    def document(self):
        """The parsed document. This is the `Document` created by
        the `parse` method. """
        return self.doc

    @property
    def language(self):
        """The language in which the `Parser` object will parse
        character sequences. Assigning a new value makes the next
        call to `parse` reload the node parsers. """
        return self._lang

    @language.setter
    def language(self, value):
        """Setter function for language. """
        self._lang = value
        self._reload = True

    @property
    def parsing_style(self):
        """The style in which the `Parser` object will parse the
        character sequences. Assigning a new value makes the next
        call to `parse` reload the node parsers. """
        return self._style

    @parsing_style.setter
    def parsing_style(self, value):
        """Setter function for style. """
        self._style = value
        self._reload = True

    def set(self, lang, style, defaults=None):
        """Set the language and style in one call. The stored
        defaults are replaced only when `defaults` is not `None`. """
        if defaults is not None:
            self.defaults = defaults
        self._lang = lang
        self._style = style
        self._reload = True

    def copy_pos(self):
        """Returns a copy of the current position. """
        return list(self.pos)

    def update(self, index):
        """Changes the position of the `caret` and updates `pos`.
        This function assumes that you are moving forward. Do not
        update to an index which is less than the current position of
        the caret. """
        if index == self.caret:
            return
        nlines = self.text.count('\n', self.caret, index)
        self.pos[0] += nlines
        if nlines > 0:
            # Column is the distance from the last newline crossed.
            self.pos[1] = index - self.text.rfind('\n', self.caret, index)
        else:
            self.pos[1] += index - self.caret
        self.caret = index

    def compute(self, index):
        """Returns a position in the text `[line, column]` given an
        index. Note: This does not modify anything in the parser. It
        only gives you the line and column where the caret would be
        given the index. The same applies as in update. Do not use
        compute with an index less than the current position of the
        caret. """
        nlines = self.text.count('\n', self.caret, index)
        tmpline = self.pos[0] + nlines
        if nlines > 0:
            tmpcolumn = index - self.text.rfind('\n', self.caret, index)
        else:
            tmpcolumn = self.pos[1] + index - self.caret
        return [tmpline, tmpcolumn]

    # pylint: disable=R0913
    def msg(self, mod_name, code, pos, arg=None, uri=None):
        """Provide the name of module issuing the message, the code
        number, the position of caret and optional arguments and uri.
        This information gets stored in the log as a `Void` node
        named `msg`.

        `uri` defaults to the uri of the text being parsed and `arg`
        to an empty tuple. The module `mod_name` is looked up in
        `sys.modules` the first time it issues a message.
        """
        if uri is None:
            uri = self._uri
        if arg is None:
            arg = ()
        node = LC.Void('msg')
        node['module'] = mod_name
        node['code'] = code
        # Store a copy so later changes to `pos` do not alter the log.
        node['position'] = list(pos)
        node['uri'] = uri
        node['arg'] = arg
        if mod_name not in self.log.modules:
            self.log.modules[mod_name] = sys.modules[mod_name]
        self.log.append_child(node)

    def _get_np(self, node):
        """Get the node parsers based on the name of the node, falling
        back to the `'__default__'` entry. """
        return self._np.get(node.name, self._np['__default__'])

    def _get_next_checker(self, node):
        """Get the checker based on the name of the node, falling
        back to the `'__default__'` entry. """
        return self._next_check.get(
            node.name, self._next_check['__default__']
        )

    def _get_next_check(self, node):
        """Locate the index where a processor might return Node. If
        there is no index then return -1."""
        match = self._get_next_checker(node).search(self.text, self.caret)
        if match is None:
            return -1
        # The match ends right after the character of interest.
        return match.end(0) - 1

    def _process_node(self, crt, node, processor):
        """Appends the node to crt.

        Returns `node` when it was pushed on the in-progress stack
        (it still has to be closed by `processor`), `None` otherwise.
        """
        if isinstance(node, LC.Text):
            # Merge with a preceding text node instead of adding a
            # second one.
            if len(crt) > 0 and isinstance(crt[-1], LC.Text):
                crt[-1].data += node.data
            else:
                crt.append_child(node)
        elif isinstance(node, list):  # Empty Element
            crt.append_child(node[0])
        else:
            crt.append_child(node)
            if isinstance(node.child, list):
                self._in_progress.append((node, processor))
                return node
        return None

    @staticmethod
    def _append_text(crt, content):
        """Add the string `content` to `crt`, extending its last
        child when that child is a text node. """
        if len(crt) > 0 and isinstance(crt[-1], LC.Text):
            crt[-1].data += content
        else:
            crt.append_child(content)

    def _process_text(self, crt):
        """When there is no node then we just read the text up to the
        next index where a node parser might succeed. """
        index = self._get_next_check(crt)
        if index == -1:
            # Nothing of interest remains: consume the rest.
            self._append_text(crt, self.text[self.caret:self.end])
            self.update(self.end)
            return
        if index == self.caret:
            # Already sitting on a character of interest that no node
            # parser accepted: consume it so the parser advances.
            index += 1
        content = self.text[self.caret:index]
        self.update(index)
        self._append_text(crt, content)

    def _close_node(self):
        """Checks and closes a node that is in self._in_progress.

        The stack is examined from the innermost node outwards. When
        a node parser reports that its node closes, every node nested
        inside it is closed automatically with a `W100` warning.
        Returns the node that becomes the current one (the document
        when the stack empties) or `None` if nothing was closed.
        """
        num = len(self._in_progress)
        autoclose = None
        for node, processor in reversed(self._in_progress):
            num -= 1
            autoclose = processor.close(node)
            if autoclose is not None:
                break
        if autoclose is None:
            return None
        # Must go backwards since the list inprogress is changing.
        for i in range(len(self._in_progress) - 1, num, -1):
            pending = self._in_progress[i][0]
            self.msg(
                self.__module__, 'W100',
                pending.pos,
                (pending.name, autoclose[0], autoclose[1])
            )
            del pending.pos
            del self._in_progress[i]
        del self._in_progress[num]
        if self._in_progress:
            return self._in_progress[-1][0]
        return self.doc

    def _parse(self):
        """Main parsing function. This function depends on the
        node parsers of the language. """
        crt = self.doc
        self._in_progress = []
        while self.caret < self.end:
            tmp = self._close_node()
            if tmp is not None:
                crt = tmp
                continue
            match = False
            processor = None
            for processor in self._get_np(crt):
                node = processor.make_node()
                if node is not None:
                    match = True
                    break
                elif self.caret == self.end:
                    # A node parser consumed the remaining text
                    # without producing a node.
                    break
            if match is False:
                self._process_text(crt)
            elif self._process_node(crt, node, processor) is node:
                crt = node
        # Whatever is still open never found its closing string.
        for node, _ in self._in_progress:
            self.msg(self.__module__, 'E100', node.pos, [node.name])
            del node.pos
# Message templates for the codes that `Parser` itself reports through
# `Parser.msg`. The placeholders are filled from the `arg` stored with
# each message: `E100` receives `[node.name]` and `W100` receives the
# node name followed by the first two items returned by the `close`
# method of the node parser that triggered the auto-close.
MSG = {
'E100': 'closing string for `Node` of name "{0}" not found',
'W100': 'auto-closing `Node` of name "{0}" at {1}:{2:2}',
}
# Long-form explanations of the messages above, with `Okay:` examples
# and examples that trigger the code.
# NOTE(review): presumably consumed by `map_explanations` after parsing
# to attach an explanation to each logged code -- confirm the expected
# layout against `lexor.command.lang`.
MSG_EXPLANATION = [
# Explanation for E100 (closing string not found).
"""
- The parser did not find a closing string for the given node.
- This is a general error which is language dependent. Make sure
to provide the required closing string for the node.
The following are examples for HTML, LaTeX and Lexor:
Okay: <node></node>
Okay: \\begin{node}\\end{node}
Okay: %%{node}%%
E100: <node>
E100: \\begin{node}
E100: %%{node}
""",
# Explanation for W100 (node closed automatically).
"""
- The parser was forced to automatically close the current node
in progress due to the encounter of the closing sequence of a
parent node.
- This is a general warning which is language dependent. To get
rid of this warning provide the closing sequence for the node
before the closing sequence of the parent node.
The following is an example in HTML:
Okay: <a><p>stuff</p><p>stuff</p></a>
Okay: <a><p>stuff<p>stuff</p></a>
W100: <a><p>stuff</p><p>stuff</a>
W100: <a><p>stuff<p>stuff</a>
""",
]