#!/usr/bin/python
# Copyright (c) 2011, the Dart project authors. Please see the AUTHORS file
# for details. All rights reserved. Use of this source code is governed by a
# BSD-style license that can be found in the LICENSE file.
import logging
import re
import weakref
_logger = logging.getLogger('pegparser')
# Rule functions can refer to each other, which can create infinite loops
# during compilation. The following map is used to memoize functions that have
# already been compiled.
_compiled_functions_memory = weakref.WeakKeyDictionary()
_regex_type = type(re.compile(r''))
_list_type = type([])
_function_type = type(lambda func: 0)


class _PegParserState(object):
  """Object for storing parsing state variables and options."""

  def __init__(self, text, whitespace_rule, strings_are_tokens):
    # Parsing state:
    self.text = text
    self.is_whitespace_mode = False
    # Error message helpers:
    # max_pos starts at -1 so that the integer comparisons in match() are
    # valid before any position has been examined.
    self.max_pos = -1
    self.max_rule = None
    # Parsing options:
    self.whitespace_rule = whitespace_rule
    self.strings_are_tokens = strings_are_tokens


class _PegParserRule(object):
  """Base class for all rules."""

  def __init__(self):
    return

  def __str__(self):
    return self.__class__.__name__

  def _match_impl(self, state, pos):
    """Default implementation of the matching algorithm.

    Should be overwritten by sub-classes.
    """
    raise RuntimeError('_match_impl not implemented')

  def match(self, state, pos):
    """Matches the rule against the text at the given position.

    The actual rule evaluation is delegated to _match_impl, while this
    function deals mostly with support tasks such as skipping whitespace,
    emitting debug information and collecting data for error reporting.

    Args:
      state -- the current parsing state and options.
      pos -- the current offset in the text.

    Returns:
      (next position, value) if the rule matches, or
      (None, None) if it doesn't.
    """
    if not state.is_whitespace_mode:
      # Skip whitespace
      pos = _skip_whitespace(state, pos)

    # Track the right-most position reached, for possible error messaging.
    if pos > state.max_pos:
      # Store the position and the rule.
      state.max_pos = pos
      if isinstance(self, _StringRule):
        state.max_rule = [self]
      else:
        state.max_rule = []
    elif pos == state.max_pos:
      if isinstance(self, _StringRule):
        state.max_rule.append(self)

    if _logger.isEnabledFor(logging.DEBUG):
      # Used for debugging
      _logger.debug('Try: pos=%s char=%s rule=%s' %
                    (pos, state.text[pos:pos + 1], self))

    # Delegate the matching logic to the specialized function.
    res = self._match_impl(state, pos)

    if not state.is_whitespace_mode and _logger.isEnabledFor(logging.DEBUG):
      # More debugging information
      (nextPos, ast) = res
      if nextPos is not None:
        _logger.debug('Match! pos=%s char=%s rule=%s' %
                      (pos, state.text[pos:pos + 1], self))
      else:
        _logger.debug('Fail. pos=%s char=%s rule=%s' %
                      (pos, state.text[pos:pos + 1], self))
    return res


def _compile(rule):
  """Recursively compiles user-defined rules into parser rules.

  Compilation is performed by converting strings, regular expressions, lists
  and functions into _StringRule, _RegExpRule, SEQUENCE and _FunctionRule
  (respectively). Memoization is used to avoid infinite recursion as rules
  may refer to each other."""
  if rule is None:
    raise RuntimeError('None is not a valid rule')
  elif isinstance(rule, str):
    return _StringRule(rule)
  elif isinstance(rule, _regex_type):
    return _RegExpRule(rule)
  elif isinstance(rule, _list_type):
    return SEQUENCE(*rule)
  elif isinstance(rule, _function_type):
    # Memoize compiled functions to avoid infinite compilation loops.
    if rule in _compiled_functions_memory:
      return _compiled_functions_memory[rule]
    else:
      compiled_function = _FunctionRule(rule)
      _compiled_functions_memory[rule] = compiled_function
      compiled_function._sub_rule = _compile(rule())
      return compiled_function
  elif isinstance(rule, _PegParserRule):
    return rule
  else:
    raise RuntimeError('Invalid rule type %s: %s' % (type(rule), rule))
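
# Illustrative examples (not from the original module) of what _compile yields
# for each supported rule type; `Digits` below is a hypothetical grammar
# function:
#   _compile('if')                  -> _StringRule matching the literal 'if'
#   _compile(re.compile(r'[0-9]+')) -> _RegExpRule for digit runs
#   _compile(['a', 'b'])            -> SEQUENCE('a', 'b')
#   def Digits(): return re.compile(r'[0-9]+')
#   _compile(Digits)                -> a memoized _FunctionRule wrapping Digits,
#                                      whose sub-rule is _compile(Digits())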


def _skip_whitespace(state, pos):
  """Returns the next non-whitespace position.

  This is done by matching the optional whitespace_rule with the current
  text."""
  if not state.whitespace_rule:
    return pos
  state.is_whitespace_mode = True
  nextPos = pos
  while nextPos is not None:
    pos = nextPos
    (nextPos, ast) = state.whitespace_rule.match(state, pos)
  state.is_whitespace_mode = False
  return pos
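
# Illustrative example (not from the original module): for a parser constructed
# with whitespace_rule=re.compile(r'\s+'), _skip_whitespace(state, 0) on the
# text '  foo' returns 2, the offset of 'f'; without a whitespace_rule it
# returns pos unchanged.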


class _StringRule(_PegParserRule):
  """This rule tries to match a whole string."""

  def __init__(self, string):
    """Constructor.

    Args:
      string -- string to match.
    """
    _PegParserRule.__init__(self)
    self._string = string

  def __str__(self):
    return '"%s"' % self._string

  def _match_impl(self, state, pos):
    """Tries to match the string at the current position"""
    if state.text.startswith(self._string, pos):
      nextPos = pos + len(self._string)
      if state.strings_are_tokens:
        return (nextPos, None)
      else:
        return (nextPos, self._string)
    return (None, None)


class _RegExpRule(_PegParserRule):
  """This rule tries to match a regular expression."""

  def __init__(self, reg_exp):
    """Constructor.

    Args:
      reg_exp -- a regular expression used in matching.
    """
    _PegParserRule.__init__(self)
    self.reg_exp = reg_exp

  def __str__(self):
    return 'regexp'

  def _match_impl(self, state, pos):
    """Tries to match the regular expression at the current position"""
    matchObj = self.reg_exp.match(state.text, pos)
    if matchObj:
      matchStr = matchObj.group()
      return (pos + len(matchStr), matchStr)
    return (None, None)


class _FunctionRule(_PegParserRule):
  """Function rule wraps a rule defined via a Python function.

  Defining rules via functions helps break the grammar into parts, label the
  ast, and support recursive definitions in the grammar.

  Usage Example:
    def Func(): return ['function', TOKEN('('), TOKEN(')')]
    def Var(): return OR('x', 'y')
    def Program(): return OR(Func, Var)

  When matched with 'function()', will return the tuple:
    ('Program', ('Func', 'function'))
  When matched with 'x', will return the tuple:
    ('Program', ('Var', 'x'))

  Functions whose name begins with '_' will not be labeled. This is useful
  for creating utility rules. Extending the example above:
    def _Program(): return OR(Func, Var)

  When matched with 'function()', will return the tuple:
    ('Func', 'function')
  """

  def __init__(self, func):
    """Constructor.

    Args:
      func -- the original function; its name is used for labeling output.
    """
    _PegParserRule.__init__(self)
    self._func = func
    # Sub-rule is compiled by _compile to avoid infinite recursion.
    self._sub_rule = None

  def __str__(self):
    return self._func.__name__

  def _match_impl(self, state, pos):
    """Simply invokes the sub rule"""
    (nextPos, ast) = self._sub_rule.match(state, pos)
    if nextPos is not None:
      if not self._func.__name__.startswith('_'):
        ast = (self._func.__name__, ast)
      return (nextPos, ast)
    return (None, None)


class SEQUENCE(_PegParserRule):
  """This rule expects all given rules to match in sequence.

  Note that SEQUENCE is equivalent to a rule composed of a Python list of
  rules.

  Usage example: SEQUENCE('A', 'B', 'C')
  or: ['A', 'B', 'C']
  Will match 'ABC' but not 'A', 'B' or ''.
  """

  def __init__(self, *rules):
    """Constructor.

    Args:
      rules -- one or more rules to match.
    """
    _PegParserRule.__init__(self)
    self._sub_rules = []
    for rule in rules:
      self._sub_rules.append(_compile(rule))

  def _match_impl(self, state, pos):
    """Tries to match all the sub rules"""
    sequence = []
    for rule in self._sub_rules:
      (nextPos, ast) = rule.match(state, pos)
      if nextPos is not None:
        if ast:
          if isinstance(ast, _list_type):
            sequence.extend(ast)
          else:
            sequence.append(ast)
        pos = nextPos
      else:
        return (None, None)
    return (pos, sequence)


class OR(_PegParserRule):
  """This rule matches the first of its sub-rules that succeeds.

  Usage example: OR('A', 'B', 'C')
  Will match 'A', 'B' or 'C'.
  """

  def __init__(self, *rules):
    """Constructor.

    Args:
      rules -- rules to choose from.
    """
    _PegParserRule.__init__(self)
    self._sub_rules = []
    for rule in rules:
      self._sub_rules.append(_compile(rule))

  def _match_impl(self, state, pos):
    """Tries the sub rules in order and returns the first match"""
    for rule in self._sub_rules:
      (nextPos, ast) = rule.match(state, pos)
      if nextPos is not None:
        return (nextPos, ast)
    return (None, None)


class MAYBE(_PegParserRule):
  """Will try to match the given rule, tolerating absence.

  Usage example: MAYBE('A')
  Will match 'A' but also ''.
  """

  def __init__(self, rule):
    """Constructor.

    Args:
      rule -- the rule that may be absent.
    """
    _PegParserRule.__init__(self)
    self._sub_rule = _compile(rule)

  def _match_impl(self, state, pos):
    """Tries to match the sub rule, succeeding even when it is absent"""
    (nextPos, ast) = self._sub_rule.match(state, pos)
    if nextPos is not None:
      return (nextPos, ast)
    return (pos, None)


class MANY(_PegParserRule):
  """Will try to match the given rule one or more times.

  Usage example 1: MANY('A')
  Will match 'A', 'AAAAA' but not ''.
  Usage example 2: MANY('A', separator=',')
  Will match 'A', 'A,A' but not 'AA'.
  """

  def __init__(self, rule, separator=None):
    """Constructor.

    Args:
      rule -- the rule to match multiple times.
      separator -- this optional rule is used to match separators.
    """
    _PegParserRule.__init__(self)
    self._sub_rule = _compile(rule)
    self._separator = _compile(separator) if separator else None

  def _match_impl(self, state, pos):
    """Matches the sub rule repeatedly, honoring the optional separator"""
    res = []
    count = 0
    while True:
      if count > 0 and self._separator:
        # After the first item, a separator must match before the next item.
        (nextPos, ast) = self._separator.match(state, pos)
        if nextPos is not None:
          pos = nextPos
          if ast:
            res.append(ast)
        else:
          break
      (nextPos, ast) = self._sub_rule.match(state, pos)
      if nextPos is None:
        break
      count += 1
      pos = nextPos
      res.append(ast)
    if count > 0:
      return (pos, res)
    return (None, None)


class TOKEN(_PegParserRule):
  """The matched rule will not appear in the output.

  Usage example: ['A', TOKEN('.'), 'B']
  When matching 'A.B', will return the sequence ['A', 'B'].
  """

  def __init__(self, rule):
    """Constructor.

    Args:
      rule -- the rule to match.
    """
    _PegParserRule.__init__(self)
    self._sub_rule = _compile(rule)

  def _match_impl(self, state, pos):
    (nextPos, ast) = self._sub_rule.match(state, pos)
    if nextPos is not None:
      return (nextPos, None)
    return (None, None)


class LABEL(_PegParserRule):
  """The matched rule will appear in the output with the given label.

  Usage example: LABEL('number', re.compile(r'[0-9]+'))
  When matched with '1234', will return ('number', '1234').
  """

  def __init__(self, label, rule):
    """Constructor.

    Args:
      label -- a string used to tag the output of the rule.
      rule -- the rule to match.
    """
    _PegParserRule.__init__(self)
    self._label = label
    self._sub_rule = _compile(rule)

  def _match_impl(self, state, pos):
    (nextPos, ast) = self._sub_rule.match(state, pos)
    if nextPos is not None:
      return (nextPos, (self._label, ast))
    return (None, None)


class RAISE(_PegParserRule):
  """Raises a RuntimeError with a user-provided message.

  Usage example: ['A', 'B', RAISE('should not have gotten here')]
  Will not match 'A' but will raise an exception for 'AB'.
  This rule is useful mostly for debugging grammars.
  """

  def __init__(self, message):
    """Constructor.

    Args:
      message -- the message for the raised exception.
    """
    _PegParserRule.__init__(self)
    self._message = message

  def _match_impl(self, state, pos):
    raise RuntimeError(self._message)


class PegParser(object):
  """PegParser class.

  This generic parser can be configured with rules to parse a wide
  range of inputs.
  """

  def __init__(self, root_rule, whitespace_rule=None,
               strings_are_tokens=False):
    """Initializes a PegParser with rules and parsing options.

    Args:
      root_rule -- the top level rule to start matching at. Rule can be
        a regular expression, a string, or one of the special rules
        such as SEQUENCE, MANY, OR, etc.
      whitespace_rule -- used to identify and strip whitespace. Defaults
        to None, which configures the parser to not tolerate whitespace.
      strings_are_tokens -- by default string rules are not treated as
        tokens. In many programming languages, strings are tokens,
        so this should be set to True.
    """
    self._strings_are_tokens = strings_are_tokens
    self._root_rule = _compile(root_rule)
    if whitespace_rule is None:
      self._whitespace_rule = None
    else:
      self._whitespace_rule = _compile(whitespace_rule)

  def parse(self, text, start_pos=0):
    """Parses the given text input.

    Args:
      text -- data to parse.
      start_pos -- the offset to start parsing at.

    Returns:
      An abstract syntax tree, with nodes being pairs of the format
      (label, value), where label is a string or a function, and value
      is a string, a pair or a list of pairs.
    """

    def calculate_line_number_and_offset(globalOffset):
      """Calculates the line number and in-line offset"""
      i = 0
      lineNumber = 1
      lineOffset = 0
      lineData = []
      while i < globalOffset and i < len(text):
        if text[i] == '\n':
          lineNumber += 1
          lineOffset = 0
          lineData = []
        else:
          lineData.append(text[i])
          lineOffset += 1
        i += 1
      # Collect the remainder of the offending line for the error message.
      while i < len(text) and text[i] != '\n':
        lineData.append(text[i])
        i += 1
      return (lineNumber, lineOffset, ''.join(lineData))

    def analyze_result(state, pos, ast):
      """Analyzes match output and raises a SyntaxError on failure"""
      if pos is not None:
        # It's possible that matching is successful but trailing
        # whitespace remains, so skip it.
        pos = _skip_whitespace(state, pos)
        if pos == len(state.text):
          # End of input reached. Success!
          return ast
      # Failure - analyze and raise an error.
      (lineNumber, lineOffset, lineData) = \
          calculate_line_number_and_offset(state.max_pos)
      message = 'unexpected error'
      if state.max_rule:
        # Deduplicate the candidate rules before reporting them.
        unique_rules = []
        for rule in state.max_rule:
          if rule not in unique_rules:
            unique_rules.append(rule)
        expected = ' or '.join(str(rule) for rule in unique_rules)
        found = state.text[state.max_pos:state.max_pos + 1]
        message = 'Expected %s but found "%s": "%s"' % \
            (expected, found, lineData)
      raise SyntaxError(
          'At line %s offset %s: %s' %
          (lineNumber, lineOffset, message))

    # Initialize state
    state = _PegParserState(text,
                            whitespace_rule=self._whitespace_rule,
                            strings_are_tokens=self._strings_are_tokens)

    # Match and analyze the result.
    (pos, ast) = self._root_rule.match(state, start_pos)
    return analyze_result(state, pos, ast)
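

# A minimal usage sketch, not part of the original module: it wires the
# Func/Var/Program grammar from the _FunctionRule docstring through PegParser.
# The rule names, the whitespace rule and the sample inputs are illustrative
# assumptions rather than an established interface of this file.
if __name__ == '__main__':
  def Func():
    return ['function', TOKEN('('), TOKEN(')')]

  def Var():
    return OR('x', 'y')

  def Program():
    return OR(Func, Var)

  demo_parser = PegParser(Program, whitespace_rule=re.compile(r'\s+'))
  # With the default strings_are_tokens=False this is expected to print
  # ('Program', ('Func', ['function'])) and then ('Program', ('Var', 'x')).
  print(demo_parser.parse('function ( )'))
  print(demo_parser.parse('x'))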