""":func:`~pandas.eval` parsers
"""
import ast
from functools import partial, reduce
from io import StringIO
import itertools as it
import operator
import tokenize
from typing import Type
import numpy as np
import pandas as pd
from pandas.core import common as com
from pandas.core.base import StringMixin
from pandas.core.computation.common import (
_BACKTICK_QUOTED_STRING,
_remove_spaces_column_name,
)
from pandas.core.computation.ops import (
_LOCAL_TAG,
BinOp,
Constant,
Div,
FuncNode,
Op,
Term,
UnaryOp,
UndefinedVariableError,
_arith_ops_syms,
_bool_ops_syms,
_cmp_ops_syms,
_mathops,
_reductions,
_unary_ops_syms,
is_term,
)
from pandas.core.computation.scope import Scope
import pandas.io.formats.printing as printing
def tokenize_string(source):
"""Tokenize a Python source code string.
Parameters
----------
source : str
A Python source code string
"""
line_reader = StringIO(source).readline
token_generator = tokenize.generate_tokens(line_reader)
# Loop over all tokens till a backtick (`) is found.
# Then, take all tokens till the next backtick to form a backtick quoted
# string.
for toknum, tokval, _, _, _ in token_generator:
if tokval == "`":
tokval = " ".join(
it.takewhile(
lambda tokval: tokval != "`",
map(operator.itemgetter(1), token_generator),
)
)
toknum = _BACKTICK_QUOTED_STRING
yield toknum, tokval
def _rewrite_assign(tok):
"""Rewrite the assignment operator for PyTables expressions that use ``=``
as a substitute for ``==``.
Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module
Returns
-------
t : tuple of int, str
Either the input or token or the replacement values
"""
toknum, tokval = tok
return toknum, "==" if tokval == "=" else tokval
def _replace_booleans(tok):
"""Replace ``&`` with ``and`` and ``|`` with ``or`` so that bitwise
precedence is changed to boolean precedence.
Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module
Returns
-------
t : tuple of int, str
Either the input or token or the replacement values
"""
toknum, tokval = tok
if toknum == tokenize.OP:
if tokval == "&":
return tokenize.NAME, "and"
elif tokval == "|":
return tokenize.NAME, "or"
return toknum, tokval
return toknum, tokval
def _replace_locals(tok):
"""Replace local variables with a syntactically valid name.
Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module
Returns
-------
t : tuple of int, str
Either the input or token or the replacement values
Notes
-----
This is somewhat of a hack in that we rewrite a string such as ``'@a'`` as
``'__pd_eval_local_a'`` by telling the tokenizer that ``__pd_eval_local_``
is a ``tokenize.OP`` and to replace the ``'@'`` symbol with it.
"""
toknum, tokval = tok
if toknum == tokenize.OP and tokval == "@":
return tokenize.OP, _LOCAL_TAG
return toknum, tokval
def _clean_spaces_backtick_quoted_names(tok):
"""Clean up a column name if surrounded by backticks.
Backtick quoted string are indicated by a certain tokval value. If a string
is a backtick quoted token it will processed by
:func:`_remove_spaces_column_name` so that the parser can find this
string when the query is executed.
See also :meth:`NDFrame._get_space_character_free_column_resolver`.
Parameters
----------
tok : tuple of int, str
ints correspond to the all caps constants in the tokenize module
Returns
-------
t : tuple of int, str
Either the input or token or the replacement values
"""
toknum, tokval = tok
if toknum == _BACKTICK_QUOTED_STRING:
return tokenize.NAME, _remove_spaces_column_name(tokval)
return toknum, tokval
def _compose2(f, g):
"""Compose 2 callables"""
return lambda *args, **kwargs: f(g(*args, **kwargs))
def _compose(*funcs):
"""Compose 2 or more callables"""
assert len(funcs) > 1, "At least 2 callables must be passed to compose"
return reduce(_compose2, funcs)
def _preparse(
source,
f=_compose(
_replace_locals,
_replace_booleans,
_rewrite_assign,
_clean_spaces_backtick_quoted_names,
),
):
"""Compose a collection of tokenization functions
Parameters
----------
source : str
A Python source code string
f : callable
This takes a tuple of (toknum, tokval) as its argument and returns a
tuple with the same structure but possibly different elements. Defaults
to the composition of ``_rewrite_assign``, ``_replace_booleans``, and
``_replace_locals``.
Returns
-------
s : str
Valid Python source code
Notes
-----
The `f` parameter can be any callable that takes *and* returns input of the
form ``(toknum, tokval)``, where ``toknum`` is one of the constants from
the ``tokenize`` module and ``tokval`` is a string.
"""
assert callable(f), "f must be callable"
return tokenize.untokenize((f(x) for x in tokenize_string(source)))
def _is_type(t):
"""Factory for a type checking function of type ``t`` or tuple of types."""
return lambda x: isinstance(x.value, t)
_is_list = _is_type(list)
_is_str = _is_type(str)
# partition all AST nodes
_all_nodes = frozenset(
filter(
lambda x: isinstance(x, type) and issubclass(x, ast.AST),
(getattr(ast, node) for node in dir(ast)),
)
)
def _filter_nodes(superclass, all_nodes=_all_nodes):
"""Filter out AST nodes that are subclasses of ``superclass``."""
node_names = (node.__name__ for node in all_nodes if issubclass(node, superclass))
return frozenset(node_names)
_all_node_names = frozenset(map(lambda x: x.__name__, _all_nodes))
_mod_nodes = _filter_nodes(ast.mod)
_stmt_nodes = _filter_nodes(ast.stmt)
_expr_nodes = _filter_nodes(ast.expr)
_expr_context_nodes = _filter_nodes(ast.expr_context)
_slice_nodes = _filter_nodes(ast.slice)
_boolop_nodes = _filter_nodes(ast.boolop)
_operator_nodes = _filter_nodes(ast.operator)
_unary_op_nodes = _filter_nodes(ast.unaryop)
_cmp_op_nodes = _filter_nodes(ast.cmpop)
_comprehension_nodes = _filter_nodes(ast.comprehension)
_handler_nodes = _filter_nodes(ast.excepthandler)
_arguments_nodes = _filter_nodes(ast.arguments)
_keyword_nodes = _filter_nodes(ast.keyword)
_alias_nodes = _filter_nodes(ast.alias)
# nodes that we don't support directly but are needed for parsing
_hacked_nodes = frozenset(["Assign", "Module", "Expr"])
_unsupported_expr_nodes = frozenset(
[
"Yield",
"GeneratorExp",
"IfExp",
"DictComp",
"SetComp",
"Repr",
"Lambda",
"Set",
"AST",
"Is",
"IsNot",
]
)
# these nodes are low priority or won't ever be supported (e.g., AST)
_unsupported_nodes = (
_stmt_nodes
| _mod_nodes
| _handler_nodes
| _arguments_nodes
| _keyword_nodes
| _alias_nodes
| _expr_context_nodes
| _unsupported_expr_nodes
) - _hacked_nodes
# we're adding a different assignment in some cases to be equality comparison
# and we don't want `stmt` and friends in their so get only the class whose
# names are capitalized
_base_supported_nodes = (_all_node_names - _unsupported_nodes) | _hacked_nodes
_msg = "cannot both support and not support {intersection}".format(
intersection=_unsupported_nodes & _base_supported_nodes
)
assert not _unsupported_nodes & _base_supported_nodes, _msg
def _node_not_implemented(node_name, cls):
"""Return a function that raises a NotImplementedError with a passed node
name.
"""
def f(self, *args, **kwargs):
raise NotImplementedError(
"{name!r} nodes are not " "implemented".format(name=node_name)
)
return f
def disallow(nodes):
"""Decorator to disallow certain nodes from parsing. Raises a
NotImplementedError instead.
Returns
-------
disallowed : callable
"""
def disallowed(cls):
cls.unsupported_nodes = ()
for node in nodes:
new_method = _node_not_implemented(node, cls)
name = "visit_{node}".format(node=node)
cls.unsupported_nodes += (name,)
setattr(cls, name, new_method)
return cls
return disallowed
def _op_maker(op_class, op_symbol):
"""Return a function to create an op class with its symbol already passed.
Returns
-------
f : callable
"""
def f(self, node, *args, **kwargs):
"""Return a partial function with an Op subclass with an operator
already passed.
Returns
-------
f : callable
"""
return partial(op_class, op_symbol, *args, **kwargs)
return f
_op_classes = {"binary": BinOp, "unary": UnaryOp}
def add_ops(op_classes):
"""Decorator to add default implementation of ops."""
def f(cls):
for op_attr_name, op_class in op_classes.items():
ops = getattr(cls, "{name}_ops".format(name=op_attr_name))
ops_map = getattr(cls, "{name}_op_nodes_map".format(name=op_attr_name))
for op in ops:
op_node = ops_map[op]
if op_node is not None:
made_op = _op_maker(op_class, op)
setattr(cls, "visit_{node}".format(node=op_node), made_op)
return cls
return f
@disallow(_unsupported_nodes)
@add_ops(_op_classes)
class BaseExprVisitor(ast.NodeVisitor):
"""Custom ast walker. Parsers of other engines should subclass this class
if necessary.
Parameters
----------
env : Scope
engine : str
parser : str
preparser : callable
"""
const_type = Constant # type: Type[Term]
term_type = Term
binary_ops = _cmp_ops_syms + _bool_ops_syms + _arith_ops_syms
binary_op_nodes = (
"Gt",
"Lt",
"GtE",
"LtE",
"Eq",
"NotEq",
"In",
"NotIn",
"BitAnd",
"BitOr",
"And",
"Or",
"Add",
"Sub",
"Mult",
None,
"Pow",
"FloorDiv",
"Mod",
)
binary_op_nodes_map = dict(zip(binary_ops, binary_op_nodes))
unary_ops = _unary_ops_syms
unary_op_nodes = "UAdd", "USub", "Invert", "Not"
unary_op_nodes_map = dict(zip(unary_ops, unary_op_nodes))
rewrite_map = {
ast.Eq: ast.In,
ast.NotEq: ast.NotIn,
ast.In: ast.In,
ast.NotIn: ast.NotIn,
}
def __init__(self, env, engine, parser, preparser=_preparse):
self.env = env
self.engine = engine
self.parser = parser
self.preparser = preparser
self.assigner = None
def visit(self, node, **kwargs):
if isinstance(node, str):
clean = self.preparser(node)
try:
node = ast.fix_missing_locations(ast.parse(clean))
except SyntaxError as e:
from keyword import iskeyword
if any(iskeyword(x) for x in clean.split()):
e.msg = "Python keyword not valid identifier" " in numexpr query"
raise e
method = "visit_" + node.__class__.__name__
visitor = getattr(self, method)
return visitor(node, **kwargs)
def visit_Module(self, node, **kwargs):
if len(node.body) != 1:
raise SyntaxError("only a single expression is allowed")
expr = node.body[0]
return self.visit(expr, **kwargs)
def visit_Expr(self, node, **kwargs):
return self.visit(node.value, **kwargs)
def _rewrite_membership_op(self, node, left, right):
# the kind of the operator (is actually an instance)
op_instance = node.op
op_type = type(op_instance)
# must be two terms and the comparison operator must be ==/!=/in/not in
if is_term(left) and is_term(right) and op_type in self.rewrite_map:
left_list, right_list = map(_is_list, (left, right))
left_str, right_str = map(_is_str, (left, right))
# if there are any strings or lists in the expression
if left_list or right_list or left_str or right_str:
op_instance = self.rewrite_map[op_type]()
# pop the string variable out of locals and replace it with a list
# of one string, kind of a hack
if right_str:
name = self.env.add_tmp([right.value])
right = self.term_type(name, self.env)
if left_str:
name = self.env.add_tmp([left.value])
left = self.term_type(name, self.env)
op = self.visit(op_instance)
return op, op_instance, left, right
def _maybe_transform_eq_ne(self, node, left=None, right=None):
if left is None:
left = self.visit(node.left, side="left")
if right is None:
right = self.visit(node.right, side="right")
op, op_class, left, right = self._rewrite_membership_op(node, left, right)
return op, op_class, left, right
def _maybe_downcast_constants(self, left, right):
f32 = np.dtype(np.float32)
if (
left.is_scalar
and hasattr(left, "value")
and not right.is_scalar
and right.return_type == f32
):
# right is a float32 array, left is a scalar
name = self.env.add_tmp(np.float32(left.value))
left = self.term_type(name, self.env)
if (
right.is_scalar
and hasattr(right, "value")
and not left.is_scalar
and left.return_type == f32
):
# left is a float32 array, right is a scalar
name = self.env.add_tmp(np.float32(right.value))
right = self.term_type(name, self.env)
return left, right
def _maybe_eval(self, binop, eval_in_python):
# eval `in` and `not in` (for now) in "partial" python space
# things that can be evaluated in "eval" space will be turned into
# temporary variables. for example,
# [1,2] in a + 2 * b
# in that case a + 2 * b will be evaluated using numexpr, and the "in"
# call will be evaluated using isin (in python space)
return binop.evaluate(
self.env, self.engine, self.parser, self.term_type, eval_in_python
)
def _maybe_evaluate_binop(
self,
op,
op_class,
lhs,
rhs,
eval_in_python=("in", "not in"),
maybe_eval_in_python=("==", "!=", "<", ">", "<=", ">="),
):
res = op(lhs, rhs)
if res.has_invalid_return_type:
raise TypeError(
"unsupported operand type(s) for {op}:"
" '{lhs}' and '{rhs}'".format(op=res.op, lhs=lhs.type, rhs=rhs.type)
)
if self.engine != "pytables":
if (
res.op in _cmp_ops_syms
and getattr(lhs, "is_datetime", False)
or getattr(rhs, "is_datetime", False)
):
# all date ops must be done in python bc numexpr doesn't work
# well with NaT
return self._maybe_eval(res, self.binary_ops)
if res.op in eval_in_python:
# "in"/"not in" ops are always evaluated in python
return self._maybe_eval(res, eval_in_python)
elif self.engine != "pytables":
if (
getattr(lhs, "return_type", None) == object
or getattr(rhs, "return_type", None) == object
):
# evaluate "==" and "!=" in python if either of our operands
# has an object return type
return self._maybe_eval(res, eval_in_python + maybe_eval_in_python)
return res
def visit_BinOp(self, node, **kwargs):
op, op_class, left, right = self._maybe_transform_eq_ne(node)
left, right = self._maybe_downcast_constants(left, right)
return self._maybe_evaluate_binop(op, op_class, left, right)
def visit_Div(self, node, **kwargs):
truediv = self.env.scope["truediv"]
return lambda lhs, rhs: Div(lhs, rhs, truediv)
def visit_UnaryOp(self, node, **kwargs):
op = self.visit(node.op)
operand = self.visit(node.operand)
return op(operand)
def visit_Name(self, node, **kwargs):
return self.term_type(node.id, self.env, **kwargs)
def visit_NameConstant(self, node, **kwargs):
return self.const_type(node.value, self.env)
def visit_Num(self, node, **kwargs):
return self.const_type(node.n, self.env)
def visit_Constant(self, node, **kwargs):
return self.const_type(node.n, self.env)
def visit_Str(self, node, **kwargs):
name = self.env.add_tmp(node.s)
return self.term_type(name, self.env)
def visit_List(self, node, **kwargs):
name = self.env.add_tmp([self.visit(e)(self.env) for e in node.elts])
return self.term_type(name, self.env)
visit_Tuple = visit_List
def visit_Index(self, node, **kwargs):
""" df.index[4] """
return self.visit(node.value)
def visit_Subscript(self, node, **kwargs):
value = self.visit(node.value)
slobj = self.visit(node.slice)
result = pd.eval(
slobj, local_dict=self.env, engine=self.engine, parser=self.parser
)
try:
# a Term instance
v = value.value[result]
except AttributeError:
# an Op instance
lhs = pd.eval(
value, local_dict=self.env, engine=self.engine, parser=self.parser
)
v = lhs[result]
name = self.env.add_tmp(v)
return self.term_type(name, env=self.env)
def visit_Slice(self, node, **kwargs):
""" df.index[slice(4,6)] """
lower = node.lower
if lower is not None:
lower = self.visit(lower).value
upper = node.upper
if upper is not None:
upper = self.visit(upper).value
step = node.step
if step is not None:
step = self.visit(step).value
return slice(lower, upper, step)
def visit_Assign(self, node, **kwargs):
"""
support a single assignment node, like
c = a + b
set the assigner at the top level, must be a Name node which
might or might not exist in the resolvers
"""
if len(node.targets) != 1:
raise SyntaxError("can only assign a single expression")
if not isinstance(node.targets[0], ast.Name):
raise SyntaxError(
"left hand side of an assignment must be a " "single name"
)
if self.env.target is None:
raise ValueError("cannot assign without a target object")
try:
assigner = self.visit(node.targets[0], **kwargs)
except UndefinedVariableError:
assigner = node.targets[0].id
self.assigner = getattr(assigner, "name", assigner)
if self.assigner is None:
raise SyntaxError(
"left hand side of an assignment must be a " "single resolvable name"
)
return self.visit(node.value, **kwargs)
def visit_Attribute(self, node, **kwargs):
attr = node.attr
value = node.value
ctx = node.ctx
if isinstance(ctx, ast.Load):
# resolve the value
resolved = self.visit(value).value
try:
v = getattr(resolved, attr)
name = self.env.add_tmp(v)
return self.term_type(name, self.env)
except AttributeError:
# something like datetime.datetime where scope is overridden
if isinstance(value, ast.Name) and value.id == attr:
return resolved
raise ValueError("Invalid Attribute context {name}".format(name=ctx.__name__))
def visit_Call(self, node, side=None, **kwargs):
if isinstance(node.func, ast.Attribute):
res = self.visit_Attribute(node.func)
elif not isinstance(node.func, ast.Name):
raise TypeError("Only named functions are supported")
else:
try:
res = self.visit(node.func)
except UndefinedVariableError:
# Check if this is a supported function name
try:
res = FuncNode(node.func.id)
except ValueError:
# Raise original error
raise
if res is None:
raise ValueError("Invalid function call {func}".format(func=node.func.id))
if hasattr(res, "value"):
res = res.value
if isinstance(res, FuncNode):
new_args = [self.visit(arg) for arg in node.args]
if node.keywords:
raise TypeError(
'Function "{name}" does not support keyword '
"arguments".format(name=res.name)
)
return res(*new_args, **kwargs)
else:
new_args = [self.visit(arg).value for arg in node.args]
for key in node.keywords:
if not isinstance(key, ast.keyword):
raise ValueError(
"keyword error in function call "
"'{func}'".format(func=node.func.id)
)
if key.arg:
kwargs[key.arg] = self.visit(key.value).value
return self.const_type(res(*new_args, **kwargs), self.env)
def translate_In(self, op):
return op
def visit_Compare(self, node, **kwargs):
ops = node.ops
comps = node.comparators
# base case: we have something like a CMP b
if len(comps) == 1:
op = self.translate_In(ops[0])
binop = ast.BinOp(op=op, left=node.left, right=comps[0])
return self.visit(binop)
# recursive case: we have a chained comparison, a CMP b CMP c, etc.
left = node.left
values = []
for op, comp in zip(ops, comps):
new_node = self.visit(
ast.Compare(comparators=[comp], left=left, ops=[self.translate_In(op)])
)
left = comp
values.append(new_node)
return self.visit(ast.BoolOp(op=ast.And(), values=values))
def _try_visit_binop(self, bop):
if isinstance(bop, (Op, Term)):
return bop
return self.visit(bop)
def visit_BoolOp(self, node, **kwargs):
def visitor(x, y):
lhs = self._try_visit_binop(x)
rhs = self._try_visit_binop(y)
op, op_class, lhs, rhs = self._maybe_transform_eq_ne(node, lhs, rhs)
return self._maybe_evaluate_binop(op, node.op, lhs, rhs)
operands = node.values
return reduce(visitor, operands)
_python_not_supported = frozenset(["Dict", "BoolOp", "In", "NotIn"])
_numexpr_supported_calls = frozenset(_reductions + _mathops)
@disallow(
(_unsupported_nodes | _python_not_supported)
- (_boolop_nodes | frozenset(["BoolOp", "Attribute", "In", "NotIn", "Tuple"]))
)
class PandasExprVisitor(BaseExprVisitor):
def __init__(
self,
env,
engine,
parser,
preparser=partial(
_preparse,
f=_compose(
_replace_locals, _replace_booleans, _clean_spaces_backtick_quoted_names
),
),
):
super().__init__(env, engine, parser, preparser)
@disallow(_unsupported_nodes | _python_not_supported | frozenset(["Not"]))
class PythonExprVisitor(BaseExprVisitor):
def __init__(self, env, engine, parser, preparser=lambda x: x):
super().__init__(env, engine, parser, preparser=preparser)
class Expr(StringMixin):
"""Object encapsulating an expression.
Parameters
----------
expr : str
engine : str, optional, default 'numexpr'
parser : str, optional, default 'pandas'
env : Scope, optional, default None
truediv : bool, optional, default True
level : int, optional, default 2
"""
def __init__(
self, expr, engine="numexpr", parser="pandas", env=None, truediv=True, level=0
):
self.expr = expr
self.env = env or Scope(level=level + 1)
self.engine = engine
self.parser = parser
self.env.scope["truediv"] = truediv
self._visitor = _parsers[parser](self.env, self.engine, self.parser)
self.terms = self.parse()
@property
def assigner(self):
return getattr(self._visitor, "assigner", None)
def __call__(self):
return self.terms(self.env)
def __str__(self):
return printing.pprint_thing(self.terms)
def __len__(self):
return len(self.expr)
def parse(self):
"""Parse an expression"""
return self._visitor.visit(self.expr)
@property
def names(self):
"""Get the names in an expression"""
if is_term(self.terms):
return frozenset([self.terms.name])
return frozenset(term.name for term in com.flatten(self.terms))
_parsers = {"python": PythonExprVisitor, "pandas": PandasExprVisitor}