Why Gemfury? Push, build, and install  RubyGems npm packages Python packages Maven artifacts PHP packages Go Modules Debian packages RPM packages NuGet packages

Repository URL to install this package:

Details    
pandas / core / computation / expr.py
Size: Mime:
""":func:`~pandas.eval` parsers
"""

import ast
import tokenize

from functools import partial
import numpy as np

import pandas as pd
from pandas import compat
from pandas.compat import StringIO, lmap, zip, reduce, string_types
from pandas.core.base import StringMixin
from pandas.core import common as com
import pandas.io.formats.printing as printing
from pandas.core.reshape.util import compose
from pandas.core.computation.ops import (
    _cmp_ops_syms, _bool_ops_syms,
    _arith_ops_syms, _unary_ops_syms, is_term)
from pandas.core.computation.ops import _reductions, _mathops, _LOCAL_TAG
from pandas.core.computation.ops import Op, BinOp, UnaryOp, Term, Constant, Div
from pandas.core.computation.ops import UndefinedVariableError, FuncNode
from pandas.core.computation.scope import Scope


def tokenize_string(source):
    """Tokenize a Python source code string.

    Parameters
    ----------
    source : str
        A Python source code string
    """
    line_reader = StringIO(source).readline
    for toknum, tokval, _, _, _ in tokenize.generate_tokens(line_reader):
        yield toknum, tokval


def _rewrite_assign(tok):
    """Rewrite the assignment operator for PyTables expressions that use ``=``
    as a substitute for ``==``.

    Parameters
    ----------
    tok : tuple of int, str
        ints correspond to the all caps constants in the tokenize module

    Returns
    -------
    t : tuple of int, str
        Either the input or token or the replacement values
    """
    toknum, tokval = tok
    return toknum, '==' if tokval == '=' else tokval


def _replace_booleans(tok):
    """Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise
    precedence is changed to boolean precedence.

    Parameters
    ----------
    tok : tuple of int, str
        ints correspond to the all caps constants in the tokenize module

    Returns
    -------
    t : tuple of int, str
        Either the input or token or the replacement values
    """
    toknum, tokval = tok
    if toknum == tokenize.OP:
        if tokval == '&':
            return tokenize.NAME, 'and'
        elif tokval == '|':
            return tokenize.NAME, 'or'
        return toknum, tokval
    return toknum, tokval


def _replace_locals(tok):
    """Replace local variables with a syntactically valid name.

    Parameters
    ----------
    tok : tuple of int, str
        ints correspond to the all caps constants in the tokenize module

    Returns
    -------
    t : tuple of int, str
        Either the input or token or the replacement values

    Notes
    -----
    This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as
    ``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_``
    is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it.
    """
    toknum, tokval = tok
    if toknum == tokenize.OP and tokval == '@':
        return tokenize.OP, _LOCAL_TAG
    return toknum, tokval


def _preparse(source, f=compose(_replace_locals, _replace_booleans,
                                _rewrite_assign)):
    """Compose a collection of tokenization functions

    Parameters
    ----------
    source : str
        A Python source code string
    f : callable
        This takes a tuple of (toknum, tokval) as its argument and returns a
        tuple with the same structure but possibly different elements. Defaults
        to the composition of ``_rewrite_assign``, ``_replace_booleans``, and
        ``_replace_locals``.

    Returns
    -------
    s : str
        Valid Python source code

    Notes
    -----
    The `f` parameter can be any callable that takes *and* returns input of the
    form ``(toknum, tokval)``, where ``toknum`` is one of the constants from
    the ``tokenize`` module and ``tokval`` is a string.
    """
    assert callable(f), 'f must be callable'
    return tokenize.untokenize(lmap(f, tokenize_string(source)))


def _is_type(t):
    """Factory for a type checking function of type ``t`` or tuple of types."""
    return lambda x: isinstance(x.value, t)


_is_list = _is_type(list)
_is_str = _is_type(string_types)


# partition all AST nodes
_all_nodes = frozenset(filter(lambda x: isinstance(x, type) and
                              issubclass(x, ast.AST),
                              (getattr(ast, node) for node in dir(ast))))


def _filter_nodes(superclass, all_nodes=_all_nodes):
    """Filter out AST nodes that are subclasses of ``superclass``."""
    node_names = (node.__name__ for node in all_nodes
                  if issubclass(node, superclass))
    return frozenset(node_names)


_all_node_names = frozenset(map(lambda x: x.__name__, _all_nodes))
_mod_nodes = _filter_nodes(ast.mod)
_stmt_nodes = _filter_nodes(ast.stmt)
_expr_nodes = _filter_nodes(ast.expr)
_expr_context_nodes = _filter_nodes(ast.expr_context)
_slice_nodes = _filter_nodes(ast.slice)
_boolop_nodes = _filter_nodes(ast.boolop)
_operator_nodes = _filter_nodes(ast.operator)
_unary_op_nodes = _filter_nodes(ast.unaryop)
_cmp_op_nodes = _filter_nodes(ast.cmpop)
_comprehension_nodes = _filter_nodes(ast.comprehension)
_handler_nodes = _filter_nodes(ast.excepthandler)
_arguments_nodes = _filter_nodes(ast.arguments)
_keyword_nodes = _filter_nodes(ast.keyword)
_alias_nodes = _filter_nodes(ast.alias)


# nodes that we don't support directly but are needed for parsing
_hacked_nodes = frozenset(['Assign', 'Module', 'Expr'])


_unsupported_expr_nodes = frozenset(['Yield', 'GeneratorExp', 'IfExp',
                                     'DictComp', 'SetComp', 'Repr', 'Lambda',
                                     'Set', 'AST', 'Is', 'IsNot'])

# these nodes are low priority or won't ever be supported (e.g., AST)
_unsupported_nodes = ((_stmt_nodes | _mod_nodes | _handler_nodes |
                       _arguments_nodes | _keyword_nodes | _alias_nodes |
                       _expr_context_nodes | _unsupported_expr_nodes) -
                      _hacked_nodes)

# we're adding a different assignment in some cases to be equality comparison
# and we don't want `stmt` and friends in their so get only the class whose
# names are capitalized
_base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes
_msg = 'cannot both support and not support {intersection}'.format(
    intersection=_unsupported_nodes & _base_supported_nodes)
assert not _unsupported_nodes & _base_supported_nodes, _msg


def _node_not_implemented(node_name, cls):
    """Return a function that raises a NotImplementedError with a passed node
    name.
    """

    def f(self, *args, **kwargs):
        raise NotImplementedError("{name!r} nodes are not "
                                  "implemented".format(name=node_name))
    return f


def disallow(nodes):
    """Decorator to disallow certain nodes from parsing. Raises a
    NotImplementedError instead.

    Returns
    -------
    disallowed : callable
    """
    def disallowed(cls):
        cls.unsupported_nodes = ()
        for node in nodes:
            new_method = _node_not_implemented(node, cls)
            name = 'visit_{node}'.format(node=node)
            cls.unsupported_nodes += (name,)
            setattr(cls, name, new_method)
        return cls
    return disallowed


def _op_maker(op_class, op_symbol):
    """Return a function to create an op class with its symbol already passed.

    Returns
    -------
    f : callable
    """

    def f(self, node, *args, **kwargs):
        """Return a partial function with an Op subclass with an operator
        already passed.

        Returns
        -------
        f : callable
        """
        return partial(op_class, op_symbol, *args, **kwargs)
    return f


_op_classes = {'binary': BinOp, 'unary': UnaryOp}


def add_ops(op_classes):
    """Decorator to add default implementation of ops."""
    def f(cls):
        for op_attr_name, op_class in compat.iteritems(op_classes):
            ops = getattr(cls, '{name}_ops'.format(name=op_attr_name))
            ops_map = getattr(cls, '{name}_op_nodes_map'.format(
                name=op_attr_name))
            for op in ops:
                op_node = ops_map[op]
                if op_node is not None:
                    made_op = _op_maker(op_class, op)
                    setattr(cls, 'visit_{node}'.format(node=op_node), made_op)
        return cls
    return f


@disallow(_unsupported_nodes)
@add_ops(_op_classes)
class BaseExprVisitor(ast.NodeVisitor):

    """Custom ast walker. Parsers of other engines should subclass this class
    if necessary.

    Parameters
    ----------
    env : Scope
    engine : str
    parser : str
    preparser : callable
    """
    const_type = Constant
    term_type = Term

    binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms
    binary_op_nodes = ('Gt', 'Lt', 'GtE', 'LtE', 'Eq', 'NotEq', 'In', 'NotIn',
                       'BitAnd', 'BitOr', 'And', 'Or', 'Add', 'Sub', 'Mult',
                       None, 'Pow', 'FloorDiv', 'Mod')
    binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes))

    unary_ops = _unary_ops_syms
    unary_op_nodes = 'UAdd', 'USub', 'Invert', 'Not'
    unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes))

    rewrite_map = {
        ast.Eq: ast.In,
        ast.NotEq: ast.NotIn,
        ast.In: ast.In,
        ast.NotIn: ast.NotIn
    }

    def __init__(self, env, engine, parser, preparser=_preparse):
        self.env = env
        self.engine = engine
        self.parser = parser
        self.preparser = preparser
        self.assigner = None

    def visit(self, node, **kwargs):
        if isinstance(node, string_types):
            clean = self.preparser(node)
            try:
                node = ast.fix_missing_locations(ast.parse(clean))
            except SyntaxError as e:
                from keyword import iskeyword
                if any(iskeyword(x) for x in clean.split()):
                    e.msg = ("Python keyword not valid identifier"
                             " in numexpr query")
                raise e

        method = 'visit_' + node.__class__.__name__
        visitor = getattr(self, method)
        return visitor(node, **kwargs)

    def visit_Module(self, node, **kwargs):
        if len(node.body) != 1:
            raise SyntaxError('only a single expression is allowed')
        expr = node.body[0]
        return self.visit(expr, **kwargs)

    def visit_Expr(self, node, **kwargs):
        return self.visit(node.value, **kwargs)

    def _rewrite_membership_op(self, node, left, right):
        # the kind of the operator (is actually an instance)
        op_instance = node.op
        op_type = type(op_instance)

        # must be two terms and the comparison operator must be ==/!=/in/not in
        if is_term(left) and is_term(right) and op_type in self.rewrite_map:

            left_list, right_list = map(_is_list, (left, right))
            left_str, right_str = map(_is_str, (left, right))

            # if there are any strings or lists in the expression
            if left_list or right_list or left_str or right_str:
                op_instance = self.rewrite_map[op_type]()

            # pop the string variable out of locals and replace it with a list
            # of one string, kind of a hack
            if right_str:
                name = self.env.add_tmp([right.value])
                right = self.term_type(name, self.env)

            if left_str:
                name = self.env.add_tmp([left.value])
                left = self.term_type(name, self.env)

        op = self.visit(op_instance)
        return op, op_instance, left, right

    def _maybe_transform_eq_ne(self, node, left=None, right=None):
        if left is None:
            left = self.visit(node.left, side='left')
        if right is None:
            right = self.visit(node.right, side='right')
        op, op_class, left, right = self._rewrite_membership_op(node, left,
                                                                right)
        return op, op_class, left, right

    def _maybe_downcast_constants(self, left, right):
        f32 = np.dtype(np.float32)
        if left.is_scalar and not right.is_scalar and right.return_type == f32:
            # right is a float32 array, left is a scalar
            name = self.env.add_tmp(np.float32(left.value))
            left = self.term_type(name, self.env)
        if right.is_scalar and not left.is_scalar and left.return_type == f32:
            # left is a float32 array, right is a scalar
            name = self.env.add_tmp(np.float32(right.value))
            right = self.term_type(name, self.env)

        return left, right

    def _maybe_eval(self, binop, eval_in_python):
        # eval `in` and `not in` (for now) in "partial" python space
        # things that can be evaluated in "eval" space will be turned into
        # temporary variables. for example,
        # [1,2] in a + 2 * b
        # in that case a + 2 * b will be evaluated using numexpr, and the "in"
        # call will be evaluated using isin (in python space)
        return binop.evaluate(self.env, self.engine, self.parser,
                              self.term_type, eval_in_python)

    def _maybe_evaluate_binop(self, op, op_class, lhs, rhs,
                              eval_in_python=('in', 'not in'),
                              maybe_eval_in_python=('==', '!=', '<', '>',
                                                    '<=', '>=')):
        res = op(lhs, rhs)

        if res.has_invalid_return_type:
            raise TypeError("unsupported operand type(s) for {op}:"
                            " '{lhs}' and '{rhs}'".format(op=res.op,
                                                          lhs=lhs.type,
                                                          rhs=rhs.type))

        if self.engine != 'pytables':
            if (res.op in _cmp_ops_syms and
                    getattr(lhs, 'is_datetime', False) or
                    getattr(rhs, 'is_datetime', False)):
                # all date ops must be done in python bc numexpr doesn't work
                # well with NaT
                return self._maybe_eval(res, self.binary_ops)

        if res.op in eval_in_python:
            # "in"/"not in" ops are always evaluated in python
            return self._maybe_eval(res, eval_in_python)
        elif self.engine != 'pytables':
            if (getattr(lhs, 'return_type', None) == object or
                    getattr(rhs, 'return_type', None) == object):
                # evaluate "==" and "!=" in python if either of our operands
                # has an object return type
                return self._maybe_eval(res, eval_in_python +
                                        maybe_eval_in_python)
        return res

    def visit_BinOp(self, node, **kwargs):
        op, op_class, left, right = self._maybe_transform_eq_ne(node)
        left, right = self._maybe_downcast_constants(left, right)
        return self._maybe_evaluate_binop(op, op_class, left, right)

    def visit_Div(self, node, **kwargs):
        truediv = self.env.scope['truediv']
        return lambda lhs, rhs: Div(lhs, rhs, truediv)

    def visit_UnaryOp(self, node, **kwargs):
        op = self.visit(node.op)
        operand = self.visit(node.operand)
        return op(operand)

    def visit_Name(self, node, **kwargs):
        return self.term_type(node.id, self.env, **kwargs)

    def visit_NameConstant(self, node, **kwargs):
        return self.const_type(node.value, self.env)

    def visit_Num(self, node, **kwargs):
        return self.const_type(node.n, self.env)

    def visit_Str(self, node, **kwargs):
        name = self.env.add_tmp(node.s)
        return self.term_type(name, self.env)

    def visit_List(self, node, **kwargs):
        name = self.env.add_tmp([self.visit(e)(self.env) for e in node.elts])
        return self.term_type(name, self.env)

    visit_Tuple = visit_List

    def visit_Index(self, node, **kwargs):
        """ df.index[4] """
        return self.visit(node.value)

    def visit_Subscript(self, node, **kwargs):
        value = self.visit(node.value)
        slobj = self.visit(node.slice)
        result = pd.eval(slobj, local_dict=self.env, engine=self.engine,
                         parser=self.parser)
        try:
            # a Term instance
            v = value.value[result]
        except AttributeError:
            # an Op instance
            lhs = pd.eval(value, local_dict=self.env, engine=self.engine,
                          parser=self.parser)
            v = lhs[result]
        name = self.env.add_tmp(v)
        return self.term_type(name, env=self.env)

    def visit_Slice(self, node, **kwargs):
        """ df.index[slice(4,6)] """
        lower = node.lower
        if lower is not None:
            lower = self.visit(lower).value
        upper = node.upper
        if upper is not None:
            upper = self.visit(upper).value
        step = node.step
        if step is not None:
            step = self.visit(step).value

        return slice(lower, upper, step)

    def visit_Assign(self, node, **kwargs):
        """
        support a single assignment node, like

        c = a + b

        set the assigner at the top level, must be a Name node which
        might or might not exist in the resolvers

        """

        if len(node.targets) != 1:
            raise SyntaxError('can only assign a single expression')
        if not isinstance(node.targets[0], ast.Name):
            raise SyntaxError('left hand side of an assignment must be a '
                              'single name')
        if self.env.target is None:
            raise ValueError('cannot assign without a target object')

        try:
            assigner = self.visit(node.targets[0], **kwargs)
        except UndefinedVariableError:
            assigner = node.targets[0].id

        self.assigner = getattr(assigner, 'name', assigner)
        if self.assigner is None:
            raise SyntaxError('left hand side of an assignment must be a '
                              'single resolvable name')

        return self.visit(node.value, **kwargs)

    def visit_Attribute(self, node, **kwargs):
        attr = node.attr
        value = node.value

        ctx = node.ctx
        if isinstance(ctx, ast.Load):
            # resolve the value
            resolved = self.visit(value).value
            try:
                v = getattr(resolved, attr)
                name = self.env.add_tmp(v)
                return self.term_type(name, self.env)
            except AttributeError:
                # something like datetime.datetime where scope is overridden
                if isinstance(value, ast.Name) and value.id == attr:
                    return resolved

        raise ValueError("Invalid Attribute context {name}"
                         .format(name=ctx.__name__))

    def visit_Call_35(self, node, side=None, **kwargs):
        """ in 3.5 the starargs attribute was changed to be more flexible,
        #11097 """

        if isinstance(node.func, ast.Attribute):
            res = self.visit_Attribute(node.func)
        elif not isinstance(node.func, ast.Name):
            raise TypeError("Only named functions are supported")
        else:
            try:
                res = self.visit(node.func)
            except UndefinedVariableError:
                # Check if this is a supported function name
                try:
                    res = FuncNode(node.func.id)
                except ValueError:
                    # Raise original error
                    raise

        if res is None:
            raise ValueError("Invalid function call {func}"
                             .format(func=node.func.id))
        if hasattr(res, 'value'):
            res = res.value

        if isinstance(res, FuncNode):

            new_args = [self.visit(arg) for arg in node.args]

            if node.keywords:
                raise TypeError("Function \"{name}\" does not support keyword "
                                "arguments".format(name=res.name))

            return res(*new_args, **kwargs)

        else:

            new_args = [self.visit(arg).value for arg in node.args]

            for key in node.keywords:
                if not isinstance(key, ast.keyword):
                    raise ValueError("keyword error in function call "
                                     "'{func}'".format(func=node.func.id))

                if key.arg:
                    # TODO: bug?
                    kwargs.append(ast.keyword(
                        keyword.arg, self.visit(keyword.value)))  # noqa

            return self.const_type(res(*new_args, **kwargs), self.env)

    def visit_Call_legacy(self, node, side=None, **kwargs):

        # this can happen with: datetime.datetime
        if isinstance(node.func, ast.Attribute):
            res = self.visit_Attribute(node.func)
        elif not isinstance(node.func, ast.Name):
            raise TypeError("Only named functions are supported")
        else:
            try:
                res = self.visit(node.func)
            except UndefinedVariableError:
                # Check if this is a supported function name
                try:
                    res = FuncNode(node.func.id)
                except ValueError:
                    # Raise original error
                    raise

        if res is None:
            raise ValueError("Invalid function call {func}"
                             .format(func=node.func.id))
        if hasattr(res, 'value'):
            res = res.value

        if isinstance(res, FuncNode):
            args = [self.visit(targ) for targ in node.args]

            if node.starargs is not None:
                args += self.visit(node.starargs)

            if node.keywords or node.kwargs:
                raise TypeError("Function \"{name}\" does not support keyword "
                                "arguments".format(name=res.name))

            return res(*args, **kwargs)

        else:
            args = [self.visit(targ).value for targ in node.args]
            if node.starargs is not None:
                args += self.visit(node.starargs).value

            keywords = {}
            for key in node.keywords:
                if not isinstance(key, ast.keyword):
                    raise ValueError("keyword error in function call "
                                     "'{func}'".format(func=node.func.id))
                keywords[key.arg] = self.visit(key.value).value
            if node.kwargs is not None:
                keywords.update(self.visit(node.kwargs).value)

            return self.const_type(res(*args, **keywords), self.env)

    def translate_In(self, op):
        return op

    def visit_Compare(self, node, **kwargs):
        ops = node.ops
        comps = node.comparators

        # base case: we have something like a CMP b
        if len(comps) == 1:
            op = self.translate_In(ops[0])
            binop = ast.BinOp(op=op, left=node.left, right=comps[0])
            return self.visit(binop)

        # recursive case: we have a chained comparison, a CMP b CMP c, etc.
        left = node.left
        values = []
        for op, comp in zip(ops, comps):
            new_node = self.visit(ast.Compare(comparators=[comp], left=left,
                                              ops=[self.translate_In(op)]))
            left = comp
            values.append(new_node)
        return self.visit(ast.BoolOp(op=ast.And(), values=values))

    def _try_visit_binop(self, bop):
        if isinstance(bop, (Op, Term)):
            return bop
        return self.visit(bop)

    def visit_BoolOp(self, node, **kwargs):
        def visitor(x, y):
            lhs = self._try_visit_binop(x)
            rhs = self._try_visit_binop(y)

            op, op_class, lhs, rhs = self._maybe_transform_eq_ne(
                node, lhs, rhs)
            return self._maybe_evaluate_binop(op, node.op, lhs, rhs)

        operands = node.values
        return reduce(visitor, operands)


# ast.Call signature changed on 3.5,
# conditionally change which methods is named
# visit_Call depending on Python version, #11097
if compat.PY35:
    BaseExprVisitor.visit_Call = BaseExprVisitor.visit_Call_35
else:
    BaseExprVisitor.visit_Call = BaseExprVisitor.visit_Call_legacy

_python_not_supported = frozenset(['Dict', 'BoolOp', 'In', 'NotIn'])
_numexpr_supported_calls = frozenset(_reductions + _mathops)


@disallow((_unsupported_nodes | _python_not_supported) -
          (_boolop_nodes | frozenset(['BoolOp', 'Attribute', 'In', 'NotIn',
                                      'Tuple'])))
class PandasExprVisitor(BaseExprVisitor):

    def __init__(self, env, engine, parser,
                 preparser=partial(_preparse, f=compose(_replace_locals,
                                                        _replace_booleans))):
        super(PandasExprVisitor, self).__init__(env, engine, parser, preparser)


@disallow(_unsupported_nodes | _python_not_supported | frozenset(['Not']))
class PythonExprVisitor(BaseExprVisitor):

    def __init__(self, env, engine, parser, preparser=lambda x: x):
        super(PythonExprVisitor, self).__init__(env, engine, parser,
                                                preparser=preparser)


class Expr(StringMixin):

    """Object encapsulating an expression.

    Parameters
    ----------
    expr : str
    engine : str, optional, default 'numexpr'
    parser : str, optional, default 'pandas'
    env : Scope, optional, default None
    truediv : bool, optional, default True
    level : int, optional, default 2
    """

    def __init__(self, expr, engine='numexpr', parser='pandas', env=None,
                 truediv=True, level=0):
        self.expr = expr
        self.env = env or Scope(level=level + 1)
        self.engine = engine
        self.parser = parser
        self.env.scope['truediv'] = truediv
        self._visitor = _parsers[parser](self.env, self.engine, self.parser)
        self.terms = self.parse()

    @property
    def assigner(self):
        return getattr(self._visitor, 'assigner', None)

    def __call__(self):
        return self.terms(self.env)

    def __unicode__(self):
        return printing.pprint_thing(self.terms)

    def __len__(self):
        return len(self.expr)

    def parse(self):
        """Parse an expression"""
        return self._visitor.visit(self.expr)

    @property
    def names(self):
        """Get the names in an expression"""
        if is_term(self.terms):
            return frozenset([self.terms.name])
        return frozenset(term.name for term in com.flatten(self.terms))


_parsers = {'python': PythonExprVisitor, 'pandas': PandasExprVisitor}