# This file is part of Patsy
# Copyright (C) 2011 Nathaniel Smith <njs@pobox.com>
# See file LICENSE.txt for license information.

# This file implements a simple "shunting yard algorithm" parser for infix
# languages with parentheses. It is used as the core of our parser for
# formulas, but is generic enough to be used for other purposes as well
# (e.g. parsing linear constraints). It just builds a parse tree; semantics
# are somebody else's problem.
#
# It also tracks where each item in the parse tree comes from, to allow
# high-quality error reporting.
#
# You are expected to provide a collection of Operators, a collection of
# atomic types, and an iterator that provides Tokens. Each Operator should
# have a unique token_type (which is an arbitrary Python object), and each
# Token should have a matching token_type, or one of the special types
# Token.LPAREN, Token.RPAREN. Each Token is required to have a valid Origin
# attached, for error reporting.
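#
# A minimal sketch of that setup (illustrative only; the token types and
# the mock_origin name below are hypothetical, not a fixed API):
#
#   ops = [Operator("+", 2, 10),    # binary "+", low precedence
#          Operator("-", 1, 30)]    # unary "-", binds tightly
#   atomic = ["NUM"]
#   tokens = [Token("NUM", mock_origin, "1"),
#             Token("+", mock_origin, "+"),
#             Token("NUM", mock_origin, "2")]
#   tree = infix_parse(tokens, ops, atomic)   # ParseNode for "1 + 2"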

# XX: still seriously consider putting the magic intercept handling into the
# tokenizer. we'd still need separate term-sets that get pasted together by ~
# to create the modeldesc, though... heck maybe we should just have a
# modeldesc be 1-or-more termsets, with the convention that if it's 1, then
# it's a rhs, and if it's 2, it's (lhs, rhs), and otherwise you're on your
# own. Test: would this be useful for multiple-group log-linear models,
# maybe? Answer: Perhaps. outcome ~ x1 + x2 ~ group. But lots of other
# plausible, maybe better ways to write this -- (outcome | group) ~ x1 + x2?
# "outcome ~ x1 + x2", group="group"? etc.

from __future__ import print_function

__all__ = ["Token", "ParseNode", "Operator", "infix_parse"]

from patsy import PatsyError
from patsy.origin import Origin
from patsy.util import (repr_pretty_delegate, repr_pretty_impl,
                        no_pickling, assert_no_pickling)

class _UniqueValue(object):
    def __init__(self, print_as):
        self._print_as = print_as

    def __repr__(self):
        return "%s(%r)" % (self.__class__.__name__, self._print_as)

    __getstate__ = no_pickling

class Token(object):
    """A token with possible payload.

    .. attribute:: type

       An arbitrary object indicating the type of this token. Should be
       :term:`hashable`, but otherwise it can be whatever you like.
    """
    LPAREN = _UniqueValue("LPAREN")
    RPAREN = _UniqueValue("RPAREN")

    def __init__(self, type, origin, extra=None):
        self.type = type
        self.origin = origin
        self.extra = extra

    __repr__ = repr_pretty_delegate
    def _repr_pretty_(self, p, cycle):
        assert not cycle
        kwargs = []
        if self.extra is not None:
            kwargs = [("extra", self.extra)]
        return repr_pretty_impl(p, self, [self.type, self.origin], kwargs)

    __getstate__ = no_pickling

class ParseNode(object):
    def __init__(self, type, token, args, origin):
        self.type = type
        self.token = token
        self.args = args
        self.origin = origin

    __repr__ = repr_pretty_delegate
    def _repr_pretty_(self, p, cycle):
        return repr_pretty_impl(p, self, [self.type, self.token, self.args])

    __getstate__ = no_pickling

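# An Operator pairs a token type with an arity (1 for prefix-unary, 2 for
# binary; infix_parse rejects anything else) and a numeric precedence;
# higher precedence binds more tightly.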
class Operator(object):
    def __init__(self, token_type, arity, precedence):
        self.token_type = token_type
        self.arity = arity
        self.precedence = precedence

    def __repr__(self):
        return "%s(%r, %r, %r)" % (self.__class__.__name__,
                                   self.token_type, self.arity, self.precedence)

    __getstate__ = no_pickling

class _StackOperator(object):
    def __init__(self, op, token):
        self.op = op
        self.token = token

    __getstate__ = no_pickling

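# Sentinel pushed when we see a '('. Its precedence is lower than any real
# operator's (infix_parse asserts this), so precedence comparisons never pop
# it; only a matching ')' removes it.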
_open_paren = Operator(Token.LPAREN, -1, -9999999)

class _ParseContext(object):
    def __init__(self, unary_ops, binary_ops, atomic_types, trace):
        self.op_stack = []
        self.noun_stack = []
        self.unary_ops = unary_ops
        self.binary_ops = binary_ops
        self.atomic_types = atomic_types
        self.trace = trace

    __getstate__ = no_pickling

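# The parser alternates between two states: wanting a "noun" (an atom, a
# unary operator, or an open paren) and wanting an operator (a binary
# operator or a close paren). Each handler consumes one token and returns
# the next state as want_noun.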
def _read_noun_context(token, c):
    if token.type == Token.LPAREN:
        if c.trace:
            print("Pushing open-paren")
        c.op_stack.append(_StackOperator(_open_paren, token))
        return True
    elif token.type in c.unary_ops:
        if c.trace:
            print("Pushing unary op %r" % (token.type,))
        c.op_stack.append(_StackOperator(c.unary_ops[token.type], token))
        return True
    elif token.type in c.atomic_types:
        if c.trace:
            print("Pushing noun %r (%r)" % (token.type, token.extra))
        c.noun_stack.append(ParseNode(token.type, token, [],
                                      token.origin))
        return False
    else:
        raise PatsyError("expected a noun, not '%s'"
                            % (token.origin.relevant_code(),),
                            token)

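# The shunting-yard "reduce" step: pop the top operator, pop its operands
# off the noun stack, and push the combined ParseNode back onto the noun
# stack.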
def _run_op(c):
    assert c.op_stack
    stackop = c.op_stack.pop()
    args = []
    for i in range(stackop.op.arity):
        args.append(c.noun_stack.pop())
    args.reverse()
    if c.trace:
        print("Reducing %r (%r)" % (stackop.op.token_type, args))
    node = ParseNode(stackop.op.token_type, stackop.token, args,
                     Origin.combine([stackop.token] + args))
    c.noun_stack.append(node)

def _read_op_context(token, c):
    if token.type == Token.RPAREN:
        if c.trace:
            print("Found close-paren")
        while c.op_stack and c.op_stack[-1].op.token_type != Token.LPAREN:
            _run_op(c)
        if not c.op_stack:
            raise PatsyError("missing '(' or extra ')'", token)
        assert c.op_stack[-1].op.token_type == Token.LPAREN
        # Expand the origin of the item on top of the noun stack to include
        # the open and close parens:
        combined = Origin.combine([c.op_stack[-1].token,
                                   c.noun_stack[-1].token,
                                   token])
        c.noun_stack[-1].origin = combined
        # Pop the open-paren
        c.op_stack.pop()
        return False
    elif token.type in c.binary_ops:
        if c.trace:
            print("Found binary operator %r" % (token.type))
        stackop = _StackOperator(c.binary_ops[token.type], token)
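        # Pop any stacked operators that bind at least as tightly as the
        # incoming one. Using <= rather than < here makes operators of equal
        # precedence left-associative.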
        while (c.op_stack
               and stackop.op.precedence <= c.op_stack[-1].op.precedence):
            _run_op(c)
        if c.trace:
            print("Pushing binary operator %r" % (token.type))
        c.op_stack.append(stackop)
        return True
    else:
        raise PatsyError("expected an operator, not '%s'"
                            % (token.origin.relevant_code(),),
                            token)

def infix_parse(tokens, operators, atomic_types, trace=False):
    token_source = iter(tokens)

    unary_ops = {}
    binary_ops = {}
    for op in operators:
        assert op.precedence > _open_paren.precedence
        if op.arity == 1:
            unary_ops[op.token_type] = op
        elif op.arity == 2:
            binary_ops[op.token_type] = op
        else:
            raise ValueError("operators must be unary or binary")

    c = _ParseContext(unary_ops, binary_ops, atomic_types, trace)

    # This is an implementation of Dijkstra's shunting yard algorithm:
    #   http://en.wikipedia.org/wiki/Shunting_yard_algorithm
    #   http://www.engr.mun.ca/~theo/Misc/exp_parsing.htm

    want_noun = True
    for token in token_source:
        if c.trace:
            print("Reading next token (want_noun=%r)" % (want_noun,))
        if want_noun:
            want_noun = _read_noun_context(token, c)
        else:
            want_noun = _read_op_context(token, c)
    if c.trace:
        print("End of token stream")

    if want_noun:
        # If the token stream was empty there is no stacked operator to
        # point at, so fall back to reporting the error with no origin.
        origin = c.op_stack[-1].token.origin if c.op_stack else None
        raise PatsyError("expected a noun, but instead the expression ended",
                         origin)

    while c.op_stack:
        if c.op_stack[-1].op.token_type == Token.LPAREN:
            raise PatsyError("Unmatched '('", c.op_stack[-1].token)
        _run_op(c)

    assert len(c.noun_stack) == 1
    return c.noun_stack.pop()
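
# A minimal sketch of consuming the resulting ParseNode tree: a recursive
# evaluator for the arithmetic-style operator table used in the smoke test
# below. It assumes numeric token payloads in .extra; this is illustrative
# only, not part of patsy's formula semantics.
def _example_eval(node):
    if node.type == "+":
        return _example_eval(node.args[0]) + _example_eval(node.args[1])
    elif node.type == "*":
        return _example_eval(node.args[0]) * _example_eval(node.args[1])
    elif node.type == "-":
        # Unary minus: exactly one argument.
        return -_example_eval(node.args[0])
    else:
        # Atomic node: interpret the token's payload as a number.
        return float(node.token.extra)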

# Much more thorough tests in parse_formula.py, this is just a smoke test:
def test_infix_parse():
    ops = [Operator("+", 2, 10),
           Operator("*", 2, 20),
           Operator("-", 1, 30)]
    atomic = ["ATOM1", "ATOM2"]
    # a + -b * (c + d)
    mock_origin = Origin("asdf", 2, 3)
    tokens = [Token("ATOM1", mock_origin, "a"),
              Token("+", mock_origin, "+"),
              Token("-", mock_origin, "-"),
              Token("ATOM2", mock_origin, "b"),
              Token("*", mock_origin, "*"),
              Token(Token.LPAREN, mock_origin, "("),
              Token("ATOM1", mock_origin, "c"),
              Token("+", mock_origin, "+"),
              Token("ATOM2", mock_origin, "d"),
              Token(Token.RPAREN, mock_origin, ")")]
    tree = infix_parse(tokens, ops, atomic)
    def te(tree, type, extra):
        assert tree.type == type
        assert tree.token.extra == extra
    te(tree, "+", "+")
    te(tree.args[0], "ATOM1", "a")
    assert tree.args[0].args == []
    te(tree.args[1], "*", "*")
    te(tree.args[1].args[0], "-", "-")
    assert len(tree.args[1].args[0].args) == 1
    te(tree.args[1].args[0].args[0], "ATOM2", "b")
    te(tree.args[1].args[1], "+", "+")
    te(tree.args[1].args[1].args[0], "ATOM1", "c")
    te(tree.args[1].args[1].args[1], "ATOM2", "d")

    from nose.tools import assert_raises
    # No ternary ops
    assert_raises(ValueError,
                  infix_parse, [], [Operator("+", 3, 10)], ["ATOMIC"])

    # smoke test just to make sure there are no egregious bugs in 'trace'
    infix_parse(tokens, ops, atomic, trace=True)