|  | #!/usr/bin/python | 
|  | # Copyright (c) 2011, the Dart project authors.  Please see the AUTHORS file | 
|  | # for details. All rights reserved. Use of this source code is governed by a | 
|  | # BSD-style license that can be found in the LICENSE file. | 
|  |  | 
|  | import logging | 
|  | import re | 
|  | import weakref | 
|  |  | 
|  | _logger = logging.getLogger('pegparser') | 
|  |  | 
|  | # functions can refer to each other, hence creating infinite loops. The | 
|  | # following hashmap is used to memoize functions that were already compiled. | 
|  | _compiled_functions_memory = weakref.WeakKeyDictionary() | 
|  |  | 
|  | _regex_type = type(re.compile(r'')) | 
|  | _list_type = type([]) | 
|  | _function_type = type(lambda func: 0) | 
|  |  | 
|  |  | 
|  | class _PegParserState(object): | 
|  | """Object for storing parsing state variables and options""" | 
|  |  | 
|  | def __init__(self, text, whitespace_rule, strings_are_tokens): | 
|  | # Parsing state: | 
|  | self.text = text | 
|  | self.is_whitespace_mode = False | 
|  |  | 
|  | # Error message helpers: | 
|  | self.max_pos = None | 
|  | self.max_rule = None | 
|  |  | 
|  | # Parsing options: | 
|  | self.whitespace_rule = whitespace_rule | 
|  | self.strings_are_tokens = strings_are_tokens | 
|  |  | 
|  |  | 
|  | class _PegParserRule(object): | 
|  | """Base class for all rules""" | 
|  |  | 
|  | def __init__(self): | 
|  | return | 
|  |  | 
|  | def __str__(self): | 
|  | return self.__class__.__name__ | 
|  |  | 
|  | def _match_impl(self, state, pos): | 
|  | """Default implementation of the matching algorithm. | 
|  | Should be overwritten by sub-classes. | 
|  | """ | 
|  | raise RuntimeError('_match_impl not implemented') | 
|  |  | 
|  | def match(self, state, pos): | 
|  | """Matches the rule against the text in the given position. | 
|  |  | 
|  | The actual rule evaluation is delegated to _match_impl, | 
|  | while this function deals mostly with support tasks such as | 
|  | skipping whitespace, debug information and data for exception. | 
|  |  | 
|  | Args: | 
|  | state -- the current parsing state and options. | 
|  | pos -- the current offset in the text. | 
|  |  | 
|  | Returns: | 
|  | (next position, value) if the rule matches, or | 
|  | (None, None) if it doesn't. | 
|  | """ | 
|  | if not state.is_whitespace_mode: | 
|  | # Skip whitespace | 
|  | pos = _skip_whitespace(state, pos) | 
|  |  | 
|  | # Track position for possible error messaging | 
|  | if pos > state.max_pos: | 
|  | # Store position and the rule. | 
|  | state.max_pos = pos | 
|  | if isinstance(self, _StringRule): | 
|  | state.max_rule = [self] | 
|  | else: | 
|  | state.max_rule = [] | 
|  | elif pos == state.max_pos: | 
|  | if isinstance(self, _StringRule): | 
|  | state.max_rule.append(self) | 
|  |  | 
|  | if _logger.isEnabledFor(logging.DEBUG): | 
|  | # Used for debugging | 
|  | _logger.debug('Try:   pos=%s char=%s rule=%s' % \ | 
|  | (pos, state.text[pos:pos + 1], self)) | 
|  |  | 
|  | # Delegate the matching logic to the specialized function. | 
|  | res = self._match_impl(state, pos) | 
|  |  | 
|  | if not state.is_whitespace_mode \ | 
|  | and _logger.isEnabledFor(logging.DEBUG): | 
|  | # More debugging information | 
|  | (nextPos, ast) = res | 
|  | if nextPos is not None: | 
|  | _logger.debug('Match! pos=%s char=%s rule=%s' % \ | 
|  | (pos, state.text[pos:pos + 1], self)) | 
|  | else: | 
|  | _logger.debug('Fail.  pos=%s char=%s rule=%s' % \ | 
|  | (pos, state.text[pos:pos + 1], self)) | 
|  |  | 
|  | return res | 
|  |  | 
|  |  | 
|  | def _compile(rule): | 
|  | """Recursively compiles user-defined rules into parser rules. | 
|  | Compilation is performed by converting strings, regular expressions, lists | 
|  | and functions into _StringRule, _RegExpRule, SEQUENCE and _FunctionRule | 
|  | (respectively). Memoization is used to avoid infinite recursion as rules | 
|  | may refer to each other.""" | 
|  | if rule is None: | 
|  | raise RuntimeError('None is not a valid rule') | 
|  | elif isinstance(rule, str): | 
|  | return _StringRule(rule) | 
|  | elif isinstance(rule, _regex_type): | 
|  | return _RegExpRule(rule) | 
|  | elif isinstance(rule, _list_type): | 
|  | return SEQUENCE(*rule) | 
|  | elif isinstance(rule, _function_type): | 
|  | # Memoize compiled functions to avoid infinite compliation loops. | 
|  | if rule in _compiled_functions_memory: | 
|  | return _compiled_functions_memory[rule] | 
|  | else: | 
|  | compiled_function = _FunctionRule(rule) | 
|  | _compiled_functions_memory[rule] = compiled_function | 
|  | compiled_function._sub_rule = _compile(rule()) | 
|  | return compiled_function | 
|  | elif isinstance(rule, _PegParserRule): | 
|  | return rule | 
|  | else: | 
|  | raise RuntimeError('Invalid rule type %s: %s', (type(rule), rule)) | 
|  |  | 
|  |  | 
|  | def _skip_whitespace(state, pos): | 
|  | """Returns the next non-whitespace position. | 
|  | This is done by matching the optional whitespace_rule with the current | 
|  | text.""" | 
|  | if not state.whitespace_rule: | 
|  | return pos | 
|  | state.is_whitespace_mode = True | 
|  | nextPos = pos | 
|  | while nextPos is not None: | 
|  | pos = nextPos | 
|  | (nextPos, ast) = state.whitespace_rule.match(state, pos) | 
|  | state.is_whitespace_mode = False | 
|  | return pos | 
|  |  | 
|  |  | 
|  | class _StringRule(_PegParserRule): | 
|  | """This rule tries to match a whole string.""" | 
|  |  | 
|  | def __init__(self, string): | 
|  | """Constructor. | 
|  | Args: | 
|  | string -- string to match. | 
|  | """ | 
|  | _PegParserRule.__init__(self) | 
|  | self._string = string | 
|  |  | 
|  | def __str__(self): | 
|  | return '"%s"' % self._string | 
|  |  | 
|  | def _match_impl(self, state, pos): | 
|  | """Tries to match the string at the current position""" | 
|  | if state.text.startswith(self._string, pos): | 
|  | nextPos = pos + len(self._string) | 
|  | if state.strings_are_tokens: | 
|  | return (nextPos, None) | 
|  | else: | 
|  | return (nextPos, self._string) | 
|  | return (None, None) | 
|  |  | 
|  |  | 
|  | class _RegExpRule(_PegParserRule): | 
|  | """This rule tries to matches a regular expression.""" | 
|  |  | 
|  | def __init__(self, reg_exp): | 
|  | """Constructor. | 
|  | Args: | 
|  | reg_exp -- a regular expression used in matching. | 
|  | """ | 
|  | _PegParserRule.__init__(self) | 
|  | self.reg_exp = reg_exp | 
|  |  | 
|  | def __str__(self): | 
|  | return 'regexp' | 
|  |  | 
|  | def _match_impl(self, state, pos): | 
|  | """Tries to match the regular expression with current text""" | 
|  | matchObj = self.reg_exp.match(state.text, pos) | 
|  | if matchObj: | 
|  | matchStr = matchObj.group() | 
|  | return (pos + len(matchStr), matchStr) | 
|  | return (None, None) | 
|  |  | 
|  |  | 
|  | class _FunctionRule(_PegParserRule): | 
|  | """Function rule wraps a rule defined via a Python function. | 
|  |  | 
|  | Defining rules via functions helps break the grammar into parts, labeling | 
|  | the ast, and supporting recursive definitions in the grammar | 
|  |  | 
|  | Usage Example: | 
|  | def Func(): return ['function', TOKEN('('), TOKEN(')')] | 
|  | def Var(): return OR('x', 'y') | 
|  | def Program(): return OR(Func, Var) | 
|  |  | 
|  | When matched with 'function()', will return the tuple: | 
|  | ('Program', ('Func', 'function')) | 
|  | When matched with 'x', will return the tuple: | 
|  | ('Program', ('Var', 'x')) | 
|  |  | 
|  | Functions who's name begins with '_' will not be labelled. This is useful | 
|  | for creating utility rules. Extending the example above: | 
|  |  | 
|  | def _Program(): return OR(Func, Var) | 
|  |  | 
|  | When matched with 'function()', will return the tuple: | 
|  | ('Func', 'function') | 
|  | """ | 
|  |  | 
|  | def __init__(self, func): | 
|  | """Constructor. | 
|  | Args: | 
|  | func -- the original function will be used for labeling output. | 
|  | """ | 
|  | _PegParserRule.__init__(self) | 
|  | self._func = func | 
|  | # Sub-rule is compiled by _compile to avoid infinite recursion. | 
|  | self._sub_rule = None | 
|  |  | 
|  | def __str__(self): | 
|  | return self._func.__name__ | 
|  |  | 
|  | def _match_impl(self, state, pos): | 
|  | """Simply invokes the sub rule""" | 
|  | (nextPos, ast) = self._sub_rule.match(state, pos) | 
|  | if nextPos is not None: | 
|  | if not self._func.__name__.startswith('_'): | 
|  | ast = (self._func.__name__, ast) | 
|  | return (nextPos, ast) | 
|  | return (None, None) | 
|  |  | 
|  |  | 
|  | class SEQUENCE(_PegParserRule): | 
|  | """This rule expects all given rules to match in sequence. | 
|  | Note that SEQUENCE is equivalent to a rule composed of a Python list of | 
|  | rules. | 
|  | Usage example: SEQUENCE('A', 'B', 'C') | 
|  | or: ['A', 'B', 'C'] | 
|  | Will match 'ABC' but not 'A', 'B' or ''. | 
|  | """ | 
|  | def __init__(self, *rules): | 
|  | """Constructor. | 
|  | Args: | 
|  | rules -- one or more rules to match. | 
|  | """ | 
|  | _PegParserRule.__init__(self) | 
|  | self._sub_rules = [] | 
|  | for rule in rules: | 
|  | self._sub_rules.append(_compile(rule)) | 
|  |  | 
|  | def _match_impl(self, state, pos): | 
|  | """Tries to match all the sub rules""" | 
|  | sequence = [] | 
|  | for rule in self._sub_rules: | 
|  | (nextPos, ast) = rule.match(state, pos) | 
|  | if nextPos is not None: | 
|  | if ast: | 
|  | if isinstance(ast, _list_type): | 
|  | sequence.extend(ast) | 
|  | else: | 
|  | sequence.append(ast) | 
|  | pos = nextPos | 
|  | else: | 
|  | return (None, None) | 
|  | return (pos, sequence) | 
|  |  | 
|  |  | 
|  | class OR(_PegParserRule): | 
|  | """This rule matches one and only one of multiple sub-rules. | 
|  | Usage example: OR('A', 'B', 'C') | 
|  | Will match 'A', 'B' or 'C'. | 
|  | """ | 
|  | def __init__(self, *rules): | 
|  | """Constructor. | 
|  | Args: | 
|  | rules -- rules to choose from. | 
|  | """ | 
|  | _PegParserRule.__init__(self) | 
|  | self._sub_rules = [] | 
|  | for rule in rules: | 
|  | self._sub_rules.append(_compile(rule)) | 
|  |  | 
|  | def _match_impl(self, state, pos): | 
|  | """Tries to match at leat one of the sub rules""" | 
|  | for rule in self._sub_rules: | 
|  | (nextPos, ast) = rule.match(state, pos) | 
|  | if nextPos is not None: | 
|  | return (nextPos, ast) | 
|  | return (None, None) | 
|  |  | 
|  |  | 
|  | class MAYBE(_PegParserRule): | 
|  | """Will try to match the given rule, tolerating absence. | 
|  | Usage example: MAYBE('A') | 
|  | Will match 'A' but also ''. | 
|  | """ | 
|  | def __init__(self, rule): | 
|  | """Constructor. | 
|  | Args: | 
|  | rule -- the rule that may be absent. | 
|  | """ | 
|  | _PegParserRule.__init__(self) | 
|  | self._sub_rule = _compile(rule) | 
|  |  | 
|  | def _match_impl(self, state, pos): | 
|  | """Tries to match at leat one of the sub rules""" | 
|  | (nextPos, ast) = self._sub_rule.match(state, pos) | 
|  | if nextPos is not None: | 
|  | return (nextPos, ast) | 
|  | return (pos, None) | 
|  |  | 
|  |  | 
|  | class MANY(_PegParserRule): | 
|  | """Will try to match the given rule one or more times. | 
|  | Usage example 1: MANY('A') | 
|  | Will match 'A', 'AAAAA' but not ''. | 
|  | Usage example 2: MANY('A', separator=',') | 
|  | Will match 'A', 'A,A' but not 'AA'. | 
|  | """ | 
|  |  | 
|  | def __init__(self, rule, separator=None): | 
|  | """Constructor. | 
|  | Args: | 
|  | rule -- the rule to match multiple times. | 
|  | separator -- this optional rule is used to match separators. | 
|  | """ | 
|  | _PegParserRule.__init__(self) | 
|  | self._sub_rule = _compile(rule) | 
|  | self._separator = _compile(separator) if separator else None | 
|  |  | 
|  | def _match_impl(self, state, pos): | 
|  | res = [] | 
|  | count = 0 | 
|  | while True: | 
|  | if count > 0 and self._separator: | 
|  | (nextPos, ast) = self._separator.match(state, pos) | 
|  | if nextPos is not None: | 
|  | pos = nextPos | 
|  | if ast: | 
|  | res.append(ast) | 
|  | else: | 
|  | break | 
|  | (nextPos, ast) = self._sub_rule.match(state, pos) | 
|  | if nextPos is None: | 
|  | break | 
|  | count += 1 | 
|  | pos = nextPos | 
|  | res.append(ast) | 
|  | if count > 0: | 
|  | return (pos, res) | 
|  | return (None, None) | 
|  |  | 
|  |  | 
|  | class TOKEN(_PegParserRule): | 
|  | """The matched rule will not appear in the output. | 
|  | Usage example: ['A', TOKEN('.'), 'B'] | 
|  | When matching 'A.B', will return the sequence ['A', 'B']. | 
|  | """ | 
|  |  | 
|  | def __init__(self, rule): | 
|  | """Constructor. | 
|  | Args: | 
|  | rule -- the rule to match. | 
|  | """ | 
|  | _PegParserRule.__init__(self) | 
|  | self._sub_rule = _compile(rule) | 
|  |  | 
|  | def _match_impl(self, state, pos): | 
|  | (nextPos, ast) = self._sub_rule.match(state, pos) | 
|  | if nextPos is not None: | 
|  | return (nextPos, None) | 
|  | return (None, None) | 
|  |  | 
|  |  | 
|  | class LABEL(_PegParserRule): | 
|  | """The matched rule will appear in the output with the given label. | 
|  | Usage example: LABEL('number', re.compile(r'[0-9]+')) | 
|  | When matched with '1234', will return ('number', '1234'). | 
|  |  | 
|  | Keyword arguments: | 
|  | label -- a string. | 
|  | rule -- the rule to match. | 
|  | """ | 
|  |  | 
|  | def __init__(self, label, rule): | 
|  | """Constructor. | 
|  | Args: | 
|  | rule -- the rule to match. | 
|  | """ | 
|  | _PegParserRule.__init__(self) | 
|  | self._label = label | 
|  | self._sub_rule = _compile(rule) | 
|  |  | 
|  | def _match_impl(self, state, pos): | 
|  | (nextPos, ast) = self._sub_rule.match(state, pos) | 
|  | if nextPos is not None: | 
|  | return (nextPos, (self._label, ast)) | 
|  | return (None, None) | 
|  |  | 
|  |  | 
|  | class RAISE(_PegParserRule): | 
|  | """Raises a SyntaxError with a user-provided message. | 
|  | Usage example: ['A','B', RAISE('should have not gotten here')] | 
|  | Will not match 'A' but will raise an exception for 'AB'. | 
|  | This rule is useful mostly for debugging grammars. | 
|  | """ | 
|  | def __init__(self, message): | 
|  | """Constructor. | 
|  | Args: | 
|  | message -- the message for the raised exception. | 
|  | """ | 
|  | _PegParserRule.__init__(self) | 
|  | self._message = message | 
|  |  | 
|  | def _match_impl(self, state, pos): | 
|  | raise RuntimeError(self._message) | 
|  |  | 
|  |  | 
|  | class PegParser(object): | 
|  | """PegParser class. | 
|  | This generic parser can be configured with rules to parse a wide | 
|  | range of inputs. | 
|  | """ | 
|  |  | 
|  | def __init__(self, root_rule, whitespace_rule=None, | 
|  | strings_are_tokens=False): | 
|  | """Initializes a PegParser with rules and parsing options. | 
|  |  | 
|  | Args: | 
|  | root_rule -- the top level rule to start matching at. Rule can be | 
|  | a regular expression, a string, or one of the special rules | 
|  | such as SEQUENCE, MANY, OR, etc. | 
|  | whitespace_rule -- used to identify and strip whitespace. Default | 
|  | isNone, configuring the parser to not tolerate whitespace. | 
|  | strings_are_tokens -- by default string rules are not treated as | 
|  | tokens. In many programming languages, strings are tokens, | 
|  | so this should be set to True. | 
|  | """ | 
|  | self._strings_are_tokens = strings_are_tokens | 
|  | self._root_rule = _compile(root_rule) | 
|  | if whitespace_rule is None: | 
|  | self._whitespace_rule = None | 
|  | else: | 
|  | self._whitespace_rule = _compile(whitespace_rule) | 
|  |  | 
|  | def parse(self, text, start_pos=0): | 
|  | """Parses the given text input | 
|  | Args: | 
|  | text -- data to parse. | 
|  | start_pos -- the offset to start parsing at. | 
|  |  | 
|  | Returns: | 
|  | An abstract syntax tree, with nodes being pairs of the format | 
|  | (label, value), where label is a string or a function, and value | 
|  | is a string, a pair or a list of pairs. | 
|  | """ | 
|  |  | 
|  | def calculate_line_number_and_offset(globalOffset): | 
|  | """Calculates the line number and in-line offset""" | 
|  | i = 0 | 
|  | lineNumber = 1 | 
|  | lineOffset = 0 | 
|  | lineData = [] | 
|  | while i < globalOffset and i < len(text): | 
|  | if text[i] == '\n': | 
|  | lineNumber += 1 | 
|  | lineOffset = 0 | 
|  | lineData = [] | 
|  | else: | 
|  | lineData.append(text[i]) | 
|  | lineOffset += 1 | 
|  | i += 1 | 
|  | while i < len(text) and  text[i] != '\n': | 
|  | lineData.append(text[i]) | 
|  | i += 1 | 
|  | return (lineNumber, lineOffset, ''.join(lineData)) | 
|  |  | 
|  | def analyze_result(state, pos, ast): | 
|  | """Analyze match output""" | 
|  | if pos is not None: | 
|  | # Its possible that matching is successful but trailing | 
|  | # whitespace remains, so skip it. | 
|  | pos = _skip_whitespace(state, pos) | 
|  | if pos == len(state.text): | 
|  | # End of intput reached. Success! | 
|  | return ast | 
|  |  | 
|  | # Failure - analyze and raise an error. | 
|  | (lineNumber, lineOffset, lineData) = \ | 
|  | calculate_line_number_and_offset(state.max_pos) | 
|  | message = 'unexpected error' | 
|  | if state.max_rule: | 
|  | set = {} | 
|  | map(set.__setitem__, state.max_rule, []) | 
|  |  | 
|  | def to_str(item): | 
|  | return item.__str__() | 
|  |  | 
|  | expected = ' or '.join(map(to_str, set.keys())) | 
|  | found = state.text[state.max_pos:state.max_pos + 1] | 
|  | message = 'Expected %s but "%s" found: "%s"' % \ | 
|  | (expected, found, lineData) | 
|  | raise SyntaxError( | 
|  | 'At line %s offset %s: %s' % \ | 
|  | (lineNumber, lineOffset, message)) | 
|  |  | 
|  | # Initialize state | 
|  | state = _PegParserState(text, | 
|  | whitespace_rule=self._whitespace_rule, | 
|  | strings_are_tokens=self._strings_are_tokens) | 
|  |  | 
|  | # Match and analyze result | 
|  | (pos, ast) = self._root_rule.match(state, start_pos) | 
|  | return analyze_result(state, pos, ast) |