| #!/usr/bin/python |
| # Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file |
| # for details. All rights reserved. Use of this source code is governed by a |
| # BSD-style license that can be found in the LICENSE file. |
| |
| import logging |
| import re |
| import weakref |
| |
| _logger = logging.getLogger('pegparser') |
| |
| # functions can refer to each other, hence creating infinite loops. The |
| # following hashmap is used to memoize functions that were already compiled. |
| _compiled_functions_memory = weakref.WeakKeyDictionary() |
| |
| _regex_type = type(re.compile(r'')) |
| _list_type = type([]) |
| _function_type = type(lambda func: 0) |
| |
| |
| class _PegParserState(object): |
| """Object for storing parsing state variables and options""" |
| |
| def __init__(self, text, whitespace_rule, strings_are_tokens): |
| # Parsing state: |
| self.text = text |
| self.is_whitespace_mode = False |
| |
| # Error message helpers: |
| self.max_pos = None |
| self.max_rule = None |
| |
| # Parsing options: |
| self.whitespace_rule = whitespace_rule |
| self.strings_are_tokens = strings_are_tokens |
| |
| |
| class _PegParserRule(object): |
| """Base class for all rules""" |
| |
| def __init__(self): |
| return |
| |
| def __str__(self): |
| return self.__class__.__name__ |
| |
| def _match_impl(self, state, pos): |
| """Default implementation of the matching algorithm. |
| Should be overwritten by sub-classes. |
| """ |
| raise RuntimeError('_match_impl not implemented') |
| |
| def match(self, state, pos): |
| """Matches the rule against the text in the given position. |
| |
| The actual rule evaluation is delegated to _match_impl, |
| while this function deals mostly with support tasks such as |
| skipping whitespace, debug information and data for exception. |
| |
| Args: |
| state -- the current parsing state and options. |
| pos -- the current offset in the text. |
| |
| Returns: |
| (next position, value) if the rule matches, or |
| (None, None) if it doesn't. |
| """ |
| if not state.is_whitespace_mode: |
| # Skip whitespace |
| pos = _skip_whitespace(state, pos) |
| |
| # Track position for possible error messaging |
| if pos > state.max_pos: |
| # Store position and the rule. |
| state.max_pos = pos |
| if isinstance(self, _StringRule): |
| state.max_rule = [self] |
| else: |
| state.max_rule = [] |
| elif pos == state.max_pos: |
| if isinstance(self, _StringRule): |
| state.max_rule.append(self) |
| |
| if _logger.isEnabledFor(logging.DEBUG): |
| # Used for debugging |
| _logger.debug('Try: pos=%s char=%s rule=%s' % \ |
| (pos, state.text[pos:pos + 1], self)) |
| |
| # Delegate the matching logic to the specialized function. |
| res = self._match_impl(state, pos) |
| |
| if not state.is_whitespace_mode \ |
| and _logger.isEnabledFor(logging.DEBUG): |
| # More debugging information |
| (nextPos, ast) = res |
| if nextPos is not None: |
| _logger.debug('Match! pos=%s char=%s rule=%s' % \ |
| (pos, state.text[pos:pos + 1], self)) |
| else: |
| _logger.debug('Fail. pos=%s char=%s rule=%s' % \ |
| (pos, state.text[pos:pos + 1], self)) |
| |
| return res |
| |
| |
| def _compile(rule): |
| """Recursively compiles user-defined rules into parser rules. |
| Compilation is performed by converting strings, regular expressions, lists |
| and functions into _StringRule, _RegExpRule, SEQUENCE and _FunctionRule |
| (respectively). Memoization is used to avoid infinite recursion as rules |
| may refer to each other.""" |
| if rule is None: |
| raise RuntimeError('None is not a valid rule') |
| elif isinstance(rule, str): |
| return _StringRule(rule) |
| elif isinstance(rule, _regex_type): |
| return _RegExpRule(rule) |
| elif isinstance(rule, _list_type): |
| return SEQUENCE(*rule) |
| elif isinstance(rule, _function_type): |
| # Memoize compiled functions to avoid infinite compliation loops. |
| if rule in _compiled_functions_memory: |
| return _compiled_functions_memory[rule] |
| else: |
| compiled_function = _FunctionRule(rule) |
| _compiled_functions_memory[rule] = compiled_function |
| compiled_function._sub_rule = _compile(rule()) |
| return compiled_function |
| elif isinstance(rule, _PegParserRule): |
| return rule |
| else: |
| raise RuntimeError('Invalid rule type %s: %s', (type(rule), rule)) |
| |
| |
| def _skip_whitespace(state, pos): |
| """Returns the next non-whitespace position. |
| This is done by matching the optional whitespace_rule with the current |
| text.""" |
| if not state.whitespace_rule: |
| return pos |
| state.is_whitespace_mode = True |
| nextPos = pos |
| while nextPos is not None: |
| pos = nextPos |
| (nextPos, ast) = state.whitespace_rule.match(state, pos) |
| state.is_whitespace_mode = False |
| return pos |
| |
| |
| class _StringRule(_PegParserRule): |
| """This rule tries to match a whole string.""" |
| |
| def __init__(self, string): |
| """Constructor. |
| Args: |
| string -- string to match. |
| """ |
| _PegParserRule.__init__(self) |
| self._string = string |
| |
| def __str__(self): |
| return '"%s"' % self._string |
| |
| def _match_impl(self, state, pos): |
| """Tries to match the string at the current position""" |
| if state.text.startswith(self._string, pos): |
| nextPos = pos + len(self._string) |
| if state.strings_are_tokens: |
| return (nextPos, None) |
| else: |
| return (nextPos, self._string) |
| return (None, None) |
| |
| |
| class _RegExpRule(_PegParserRule): |
| """This rule tries to matches a regular expression.""" |
| |
| def __init__(self, reg_exp): |
| """Constructor. |
| Args: |
| reg_exp -- a regular expression used in matching. |
| """ |
| _PegParserRule.__init__(self) |
| self.reg_exp = reg_exp |
| |
| def __str__(self): |
| return 'regexp' |
| |
| def _match_impl(self, state, pos): |
| """Tries to match the regular expression with current text""" |
| matchObj = self.reg_exp.match(state.text, pos) |
| if matchObj: |
| matchStr = matchObj.group() |
| return (pos + len(matchStr), matchStr) |
| return (None, None) |
| |
| |
| class _FunctionRule(_PegParserRule): |
| """Function rule wraps a rule defined via a Python function. |
| |
| Defining rules via functions helps break the grammar into parts, labeling |
| the ast, and supporting recursive definitions in the grammar |
| |
| Usage Example: |
| def Func(): return ['function', TOKEN('('), TOKEN(')')] |
| def Var(): return OR('x', 'y') |
| def Program(): return OR(Func, Var) |
| |
| When matched with 'function()', will return the tuple: |
| ('Program', ('Func', 'function')) |
| When matched with 'x', will return the tuple: |
| ('Program', ('Var', 'x')) |
| |
| Functions who's name begins with '_' will not be labelled. This is useful |
| for creating utility rules. Extending the example above: |
| |
| def _Program(): return OR(Func, Var) |
| |
| When matched with 'function()', will return the tuple: |
| ('Func', 'function') |
| """ |
| |
| def __init__(self, func): |
| """Constructor. |
| Args: |
| func -- the original function will be used for labeling output. |
| """ |
| _PegParserRule.__init__(self) |
| self._func = func |
| # Sub-rule is compiled by _compile to avoid infinite recursion. |
| self._sub_rule = None |
| |
| def __str__(self): |
| return self._func.__name__ |
| |
| def _match_impl(self, state, pos): |
| """Simply invokes the sub rule""" |
| (nextPos, ast) = self._sub_rule.match(state, pos) |
| if nextPos is not None: |
| if not self._func.__name__.startswith('_'): |
| ast = (self._func.__name__, ast) |
| return (nextPos, ast) |
| return (None, None) |
| |
| |
| class SEQUENCE(_PegParserRule): |
| """This rule expects all given rules to match in sequence. |
| Note that SEQUENCE is equivalent to a rule composed of a Python list of |
| rules. |
| Usage example: SEQUENCE('A', 'B', 'C') |
| or: ['A', 'B', 'C'] |
| Will match 'ABC' but not 'A', 'B' or ''. |
| """ |
| def __init__(self, *rules): |
| """Constructor. |
| Args: |
| rules -- one or more rules to match. |
| """ |
| _PegParserRule.__init__(self) |
| self._sub_rules = [] |
| for rule in rules: |
| self._sub_rules.append(_compile(rule)) |
| |
| def _match_impl(self, state, pos): |
| """Tries to match all the sub rules""" |
| sequence = [] |
| for rule in self._sub_rules: |
| (nextPos, ast) = rule.match(state, pos) |
| if nextPos is not None: |
| if ast: |
| if isinstance(ast, _list_type): |
| sequence.extend(ast) |
| else: |
| sequence.append(ast) |
| pos = nextPos |
| else: |
| return (None, None) |
| return (pos, sequence) |
| |
| |
| class OR(_PegParserRule): |
| """This rule matches one and only one of multiple sub-rules. |
| Usage example: OR('A', 'B', 'C') |
| Will match 'A', 'B' or 'C'. |
| """ |
| def __init__(self, *rules): |
| """Constructor. |
| Args: |
| rules -- rules to choose from. |
| """ |
| _PegParserRule.__init__(self) |
| self._sub_rules = [] |
| for rule in rules: |
| self._sub_rules.append(_compile(rule)) |
| |
| def _match_impl(self, state, pos): |
| """Tries to match at leat one of the sub rules""" |
| for rule in self._sub_rules: |
| (nextPos, ast) = rule.match(state, pos) |
| if nextPos is not None: |
| return (nextPos, ast) |
| return (None, None) |
| |
| |
| class MAYBE(_PegParserRule): |
| """Will try to match the given rule, tolerating absence. |
| Usage example: MAYBE('A') |
| Will match 'A' but also ''. |
| """ |
| def __init__(self, rule): |
| """Constructor. |
| Args: |
| rule -- the rule that may be absent. |
| """ |
| _PegParserRule.__init__(self) |
| self._sub_rule = _compile(rule) |
| |
| def _match_impl(self, state, pos): |
| """Tries to match at leat one of the sub rules""" |
| (nextPos, ast) = self._sub_rule.match(state, pos) |
| if nextPos is not None: |
| return (nextPos, ast) |
| return (pos, None) |
| |
| |
| class MANY(_PegParserRule): |
| """Will try to match the given rule one or more times. |
| Usage example 1: MANY('A') |
| Will match 'A', 'AAAAA' but not ''. |
| Usage example 2: MANY('A', separator=',') |
| Will match 'A', 'A,A' but not 'AA'. |
| """ |
| |
| def __init__(self, rule, separator=None): |
| """Constructor. |
| Args: |
| rule -- the rule to match multiple times. |
| separator -- this optional rule is used to match separators. |
| """ |
| _PegParserRule.__init__(self) |
| self._sub_rule = _compile(rule) |
| self._separator = _compile(separator) if separator else None |
| |
| def _match_impl(self, state, pos): |
| res = [] |
| count = 0 |
| while True: |
| if count > 0 and self._separator: |
| (nextPos, ast) = self._separator.match(state, pos) |
| if nextPos is not None: |
| pos = nextPos |
| if ast: |
| res.append(ast) |
| else: |
| break |
| (nextPos, ast) = self._sub_rule.match(state, pos) |
| if nextPos is None: |
| break |
| count += 1 |
| pos = nextPos |
| res.append(ast) |
| if count > 0: |
| return (pos, res) |
| return (None, None) |
| |
| |
| class TOKEN(_PegParserRule): |
| """The matched rule will not appear in the output. |
| Usage example: ['A', TOKEN('.'), 'B'] |
| When matching 'A.B', will return the sequence ['A', 'B']. |
| """ |
| |
| def __init__(self, rule): |
| """Constructor. |
| Args: |
| rule -- the rule to match. |
| """ |
| _PegParserRule.__init__(self) |
| self._sub_rule = _compile(rule) |
| |
| def _match_impl(self, state, pos): |
| (nextPos, ast) = self._sub_rule.match(state, pos) |
| if nextPos is not None: |
| return (nextPos, None) |
| return (None, None) |
| |
| |
| class LABEL(_PegParserRule): |
| """The matched rule will appear in the output with the given label. |
| Usage example: LABEL('number', re.compile(r'[0-9]+')) |
| When matched with '1234', will return ('number', '1234'). |
| |
| Keyword arguments: |
| label -- a string. |
| rule -- the rule to match. |
| """ |
| |
| def __init__(self, label, rule): |
| """Constructor. |
| Args: |
| rule -- the rule to match. |
| """ |
| _PegParserRule.__init__(self) |
| self._label = label |
| self._sub_rule = _compile(rule) |
| |
| def _match_impl(self, state, pos): |
| (nextPos, ast) = self._sub_rule.match(state, pos) |
| if nextPos is not None: |
| return (nextPos, (self._label, ast)) |
| return (None, None) |
| |
| |
| class RAISE(_PegParserRule): |
| """Raises a SyntaxError with a user-provided message. |
| Usage example: ['A','B', RAISE('should have not gotten here')] |
| Will not match 'A' but will raise an exception for 'AB'. |
| This rule is useful mostly for debugging grammars. |
| """ |
| def __init__(self, message): |
| """Constructor. |
| Args: |
| message -- the message for the raised exception. |
| """ |
| _PegParserRule.__init__(self) |
| self._message = message |
| |
| def _match_impl(self, state, pos): |
| raise RuntimeError(self._message) |
| |
| |
| class PegParser(object): |
| """PegParser class. |
| This generic parser can be configured with rules to parse a wide |
| range of inputs. |
| """ |
| |
| def __init__(self, root_rule, whitespace_rule=None, |
| strings_are_tokens=False): |
| """Initializes a PegParser with rules and parsing options. |
| |
| Args: |
| root_rule -- the top level rule to start matching at. Rule can be |
| a regular expression, a string, or one of the special rules |
| such as SEQUENCE, MANY, OR, etc. |
| whitespace_rule -- used to identify and strip whitespace. Default |
| isNone, configuring the parser to not tolerate whitespace. |
| strings_are_tokens -- by default string rules are not treated as |
| tokens. In many programming languages, strings are tokens, |
| so this should be set to True. |
| """ |
| self._strings_are_tokens = strings_are_tokens |
| self._root_rule = _compile(root_rule) |
| if whitespace_rule is None: |
| self._whitespace_rule = None |
| else: |
| self._whitespace_rule = _compile(whitespace_rule) |
| |
| def parse(self, text, start_pos=0): |
| """Parses the given text input |
| Args: |
| text -- data to parse. |
| start_pos -- the offset to start parsing at. |
| |
| Returns: |
| An abstract syntax tree, with nodes being pairs of the format |
| (label, value), where label is a string or a function, and value |
| is a string, a pair or a list of pairs. |
| """ |
| |
| def calculate_line_number_and_offset(globalOffset): |
| """Calculates the line number and in-line offset""" |
| i = 0 |
| lineNumber = 1 |
| lineOffset = 0 |
| lineData = [] |
| while i < globalOffset and i < len(text): |
| if text[i] == '\n': |
| lineNumber += 1 |
| lineOffset = 0 |
| lineData = [] |
| else: |
| lineData.append(text[i]) |
| lineOffset += 1 |
| i += 1 |
| while i < len(text) and text[i] != '\n': |
| lineData.append(text[i]) |
| i += 1 |
| return (lineNumber, lineOffset, ''.join(lineData)) |
| |
| def analyze_result(state, pos, ast): |
| """Analyze match output""" |
| if pos is not None: |
| # Its possible that matching is successful but trailing |
| # whitespace remains, so skip it. |
| pos = _skip_whitespace(state, pos) |
| if pos == len(state.text): |
| # End of intput reached. Success! |
| return ast |
| |
| # Failure - analyze and raise an error. |
| (lineNumber, lineOffset, lineData) = \ |
| calculate_line_number_and_offset(state.max_pos) |
| message = 'unexpected error' |
| if state.max_rule: |
| set = {} |
| map(set.__setitem__, state.max_rule, []) |
| |
| def to_str(item): |
| return item.__str__() |
| |
| expected = ' or '.join(map(to_str, set.keys())) |
| found = state.text[state.max_pos:state.max_pos + 1] |
| message = 'Expected %s but "%s" found: "%s"' % \ |
| (expected, found, lineData) |
| raise SyntaxError( |
| 'At line %s offset %s: %s' % \ |
| (lineNumber, lineOffset, message)) |
| |
| # Initialize state |
| state = _PegParserState(text, |
| whitespace_rule=self._whitespace_rule, |
| strings_are_tokens=self._strings_are_tokens) |
| |
| # Match and analyze result |
| (pos, ast) = self._root_rule.match(state, start_pos) |
| return analyze_result(state, pos, ast) |