from io import StringIO
import operator
import numpy as np
import pytest
import pandas.util._test_decorators as td
import pandas as pd
from pandas import DataFrame, Index, MultiIndex, Series, date_range
from pandas.core.computation.check import _NUMEXPR_INSTALLED
from pandas.tests.frame.common import TestData
from pandas.util.testing import (
assert_frame_equal,
assert_series_equal,
makeCustomDataframe as mkdf,
)
PARSERS = "python", "pandas"
ENGINES = "python", pytest.param("numexpr", marks=td.skip_if_no_ne)
@pytest.fixture(params=PARSERS, ids=lambda x: x)
def parser(request):
return request.param
@pytest.fixture(params=ENGINES, ids=lambda x: x)
def engine(request):
return request.param
def skip_if_no_pandas_parser(parser):
if parser != "pandas":
pytest.skip("cannot evaluate with parser {0!r}".format(parser))
class TestCompat:
def setup_method(self, method):
self.df = DataFrame({"A": [1, 2, 3]})
self.expected1 = self.df[self.df.A > 0]
self.expected2 = self.df.A + 1
def test_query_default(self):
# GH 12749
# this should always work, whether _NUMEXPR_INSTALLED or not
df = self.df
result = df.query("A>0")
assert_frame_equal(result, self.expected1)
result = df.eval("A+1")
assert_series_equal(result, self.expected2, check_names=False)
def test_query_None(self):
df = self.df
result = df.query("A>0", engine=None)
assert_frame_equal(result, self.expected1)
result = df.eval("A+1", engine=None)
assert_series_equal(result, self.expected2, check_names=False)
def test_query_python(self):
df = self.df
result = df.query("A>0", engine="python")
assert_frame_equal(result, self.expected1)
result = df.eval("A+1", engine="python")
assert_series_equal(result, self.expected2, check_names=False)
def test_query_numexpr(self):
df = self.df
if _NUMEXPR_INSTALLED:
result = df.query("A>0", engine="numexpr")
assert_frame_equal(result, self.expected1)
result = df.eval("A+1", engine="numexpr")
assert_series_equal(result, self.expected2, check_names=False)
else:
with pytest.raises(ImportError):
df.query("A>0", engine="numexpr")
with pytest.raises(ImportError):
df.eval("A+1", engine="numexpr")
class TestDataFrameEval(TestData):
def test_ops(self):
# tst ops and reversed ops in evaluation
# GH7198
# smaller hits python, larger hits numexpr
for n in [4, 4000]:
df = DataFrame(1, index=range(n), columns=list("abcd"))
df.iloc[0] = 2
m = df.mean()
for op_str, op, rop in [
("+", "__add__", "__radd__"),
("-", "__sub__", "__rsub__"),
("*", "__mul__", "__rmul__"),
("/", "__truediv__", "__rtruediv__"),
]:
base = DataFrame( # noqa
np.tile(m.values, n).reshape(n, -1), columns=list("abcd")
)
expected = eval("base{op}df".format(op=op_str))
# ops as strings
result = eval("m{op}df".format(op=op_str))
assert_frame_equal(result, expected)
# these are commutative
if op in ["+", "*"]:
result = getattr(df, op)(m)
assert_frame_equal(result, expected)
# these are not
elif op in ["-", "/"]:
result = getattr(df, rop)(m)
assert_frame_equal(result, expected)
# GH7192
df = DataFrame(dict(A=np.random.randn(25000)))
df.iloc[0:5] = np.nan
expected = 1 - np.isnan(df.iloc[0:25])
result = (1 - np.isnan(df)).iloc[0:25]
assert_frame_equal(result, expected)
def test_query_non_str(self):
# GH 11485
df = pd.DataFrame({"A": [1, 2, 3], "B": ["a", "b", "b"]})
msg = "expr must be a string to be evaluated"
with pytest.raises(ValueError, match=msg):
df.query(lambda x: x.B == "b")
with pytest.raises(ValueError, match=msg):
df.query(111)
def test_query_empty_string(self):
# GH 13139
df = pd.DataFrame({"A": [1, 2, 3]})
msg = "expr cannot be an empty string"
with pytest.raises(ValueError, match=msg):
df.query("")
def test_eval_resolvers_as_list(self):
# GH 14095
df = DataFrame(np.random.randn(10, 2), columns=list("ab"))
dict1 = {"a": 1}
dict2 = {"b": 2}
assert df.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"]
assert pd.eval("a + b", resolvers=[dict1, dict2]) == dict1["a"] + dict2["b"]
class TestDataFrameQueryWithMultiIndex:
def test_query_with_named_multiindex(self, parser, engine):
skip_if_no_pandas_parser(parser)
a = np.random.choice(["red", "green"], size=10)
b = np.random.choice(["eggs", "ham"], size=10)
index = MultiIndex.from_arrays([a, b], names=["color", "food"])
df = DataFrame(np.random.randn(10, 2), index=index)
ind = Series(
df.index.get_level_values("color").values, index=index, name="color"
)
# equality
res1 = df.query('color == "red"', parser=parser, engine=engine)
res2 = df.query('"red" == color', parser=parser, engine=engine)
exp = df[ind == "red"]
assert_frame_equal(res1, exp)
assert_frame_equal(res2, exp)
# inequality
res1 = df.query('color != "red"', parser=parser, engine=engine)
res2 = df.query('"red" != color', parser=parser, engine=engine)
exp = df[ind != "red"]
assert_frame_equal(res1, exp)
assert_frame_equal(res2, exp)
# list equality (really just set membership)
res1 = df.query('color == ["red"]', parser=parser, engine=engine)
res2 = df.query('["red"] == color', parser=parser, engine=engine)
exp = df[ind.isin(["red"])]
assert_frame_equal(res1, exp)
assert_frame_equal(res2, exp)
res1 = df.query('color != ["red"]', parser=parser, engine=engine)
res2 = df.query('["red"] != color', parser=parser, engine=engine)
exp = df[~ind.isin(["red"])]
assert_frame_equal(res1, exp)
assert_frame_equal(res2, exp)
# in/not in ops
res1 = df.query('["red"] in color', parser=parser, engine=engine)
res2 = df.query('"red" in color', parser=parser, engine=engine)
exp = df[ind.isin(["red"])]
assert_frame_equal(res1, exp)
assert_frame_equal(res2, exp)
res1 = df.query('["red"] not in color', parser=parser, engine=engine)
res2 = df.query('"red" not in color', parser=parser, engine=engine)
exp = df[~ind.isin(["red"])]
assert_frame_equal(res1, exp)
assert_frame_equal(res2, exp)
def test_query_with_unnamed_multiindex(self, parser, engine):
skip_if_no_pandas_parser(parser)
a = np.random.choice(["red", "green"], size=10)
b = np.random.choice(["eggs", "ham"], size=10)
index = MultiIndex.from_arrays([a, b])
df = DataFrame(np.random.randn(10, 2), index=index)
ind = Series(df.index.get_level_values(0).values, index=index)
res1 = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
res2 = df.query('"red" == ilevel_0', parser=parser, engine=engine)
exp = df[ind == "red"]
assert_frame_equal(res1, exp)
assert_frame_equal(res2, exp)
# inequality
res1 = df.query('ilevel_0 != "red"', parser=parser, engine=engine)
res2 = df.query('"red" != ilevel_0', parser=parser, engine=engine)
exp = df[ind != "red"]
assert_frame_equal(res1, exp)
assert_frame_equal(res2, exp)
# list equality (really just set membership)
res1 = df.query('ilevel_0 == ["red"]', parser=parser, engine=engine)
res2 = df.query('["red"] == ilevel_0', parser=parser, engine=engine)
exp = df[ind.isin(["red"])]
assert_frame_equal(res1, exp)
assert_frame_equal(res2, exp)
res1 = df.query('ilevel_0 != ["red"]', parser=parser, engine=engine)
res2 = df.query('["red"] != ilevel_0', parser=parser, engine=engine)
exp = df[~ind.isin(["red"])]
assert_frame_equal(res1, exp)
assert_frame_equal(res2, exp)
# in/not in ops
res1 = df.query('["red"] in ilevel_0', parser=parser, engine=engine)
res2 = df.query('"red" in ilevel_0', parser=parser, engine=engine)
exp = df[ind.isin(["red"])]
assert_frame_equal(res1, exp)
assert_frame_equal(res2, exp)
res1 = df.query('["red"] not in ilevel_0', parser=parser, engine=engine)
res2 = df.query('"red" not in ilevel_0', parser=parser, engine=engine)
exp = df[~ind.isin(["red"])]
assert_frame_equal(res1, exp)
assert_frame_equal(res2, exp)
# ## LEVEL 1
ind = Series(df.index.get_level_values(1).values, index=index)
res1 = df.query('ilevel_1 == "eggs"', parser=parser, engine=engine)
res2 = df.query('"eggs" == ilevel_1', parser=parser, engine=engine)
exp = df[ind == "eggs"]
assert_frame_equal(res1, exp)
assert_frame_equal(res2, exp)
# inequality
res1 = df.query('ilevel_1 != "eggs"', parser=parser, engine=engine)
res2 = df.query('"eggs" != ilevel_1', parser=parser, engine=engine)
exp = df[ind != "eggs"]
assert_frame_equal(res1, exp)
assert_frame_equal(res2, exp)
# list equality (really just set membership)
res1 = df.query('ilevel_1 == ["eggs"]', parser=parser, engine=engine)
res2 = df.query('["eggs"] == ilevel_1', parser=parser, engine=engine)
exp = df[ind.isin(["eggs"])]
assert_frame_equal(res1, exp)
assert_frame_equal(res2, exp)
res1 = df.query('ilevel_1 != ["eggs"]', parser=parser, engine=engine)
res2 = df.query('["eggs"] != ilevel_1', parser=parser, engine=engine)
exp = df[~ind.isin(["eggs"])]
assert_frame_equal(res1, exp)
assert_frame_equal(res2, exp)
# in/not in ops
res1 = df.query('["eggs"] in ilevel_1', parser=parser, engine=engine)
res2 = df.query('"eggs" in ilevel_1', parser=parser, engine=engine)
exp = df[ind.isin(["eggs"])]
assert_frame_equal(res1, exp)
assert_frame_equal(res2, exp)
res1 = df.query('["eggs"] not in ilevel_1', parser=parser, engine=engine)
res2 = df.query('"eggs" not in ilevel_1', parser=parser, engine=engine)
exp = df[~ind.isin(["eggs"])]
assert_frame_equal(res1, exp)
assert_frame_equal(res2, exp)
def test_query_with_partially_named_multiindex(self, parser, engine):
skip_if_no_pandas_parser(parser)
a = np.random.choice(["red", "green"], size=10)
b = np.arange(10)
index = MultiIndex.from_arrays([a, b])
index.names = [None, "rating"]
df = DataFrame(np.random.randn(10, 2), index=index)
res = df.query("rating == 1", parser=parser, engine=engine)
ind = Series(
df.index.get_level_values("rating").values, index=index, name="rating"
)
exp = df[ind == 1]
assert_frame_equal(res, exp)
res = df.query("rating != 1", parser=parser, engine=engine)
ind = Series(
df.index.get_level_values("rating").values, index=index, name="rating"
)
exp = df[ind != 1]
assert_frame_equal(res, exp)
res = df.query('ilevel_0 == "red"', parser=parser, engine=engine)
ind = Series(df.index.get_level_values(0).values, index=index)
exp = df[ind == "red"]
assert_frame_equal(res, exp)
res = df.query('ilevel_0 != "red"', parser=parser, engine=engine)
ind = Series(df.index.get_level_values(0).values, index=index)
exp = df[ind != "red"]
assert_frame_equal(res, exp)
def test_query_multiindex_get_index_resolvers(self):
df = mkdf(10, 3, r_idx_nlevels=2, r_idx_names=["spam", "eggs"])
resolvers = df._get_index_resolvers()
def to_series(mi, level):
level_values = mi.get_level_values(level)
s = level_values.to_series()
s.index = mi
return s
col_series = df.columns.to_series()
expected = {
"index": df.index,
"columns": col_series,
"spam": to_series(df.index, "spam"),
"eggs": to_series(df.index, "eggs"),
"C0": col_series,
}
for k, v in resolvers.items():
if isinstance(v, Index):
assert v.is_(expected[k])
elif isinstance(v, Series):
assert_series_equal(v, expected[k])
else:
raise AssertionError("object must be a Series or Index")
@td.skip_if_no_ne
class TestDataFrameQueryNumExprPandas:
@classmethod
def setup_class(cls):
cls.engine = "numexpr"
cls.parser = "pandas"
@classmethod
def teardown_class(cls):
del cls.engine, cls.parser
def test_date_query_with_attribute_access(self):
engine, parser = self.engine, self.parser
skip_if_no_pandas_parser(parser)
df = DataFrame(np.random.randn(5, 3))
df["dates1"] = date_range("1/1/2012", periods=5)
df["dates2"] = date_range("1/1/2013", periods=5)
df["dates3"] = date_range("1/1/2014", periods=5)
res = df.query(
"@df.dates1 < 20130101 < @df.dates3", engine=engine, parser=parser
)
expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)]
assert_frame_equal(res, expec)
def test_date_query_no_attribute_access(self):
engine, parser = self.engine, self.parser
df = DataFrame(np.random.randn(5, 3))
df["dates1"] = date_range("1/1/2012", periods=5)
df["dates2"] = date_range("1/1/2013", periods=5)
df["dates3"] = date_range("1/1/2014", periods=5)
res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser)
expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)]
assert_frame_equal(res, expec)
def test_date_query_with_NaT(self):
engine, parser = self.engine, self.parser
n = 10
df = DataFrame(np.random.randn(n, 3))
df["dates1"] = date_range("1/1/2012", periods=n)
df["dates2"] = date_range("1/1/2013", periods=n)
df["dates3"] = date_range("1/1/2014", periods=n)
df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT
df.loc[np.random.rand(n) > 0.5, "dates3"] = pd.NaT
res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser)
expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)]
assert_frame_equal(res, expec)
def test_date_index_query(self):
engine, parser = self.engine, self.parser
n = 10
df = DataFrame(np.random.randn(n, 3))
df["dates1"] = date_range("1/1/2012", periods=n)
df["dates3"] = date_range("1/1/2014", periods=n)
df.set_index("dates1", inplace=True, drop=True)
res = df.query("index < 20130101 < dates3", engine=engine, parser=parser)
expec = df[(df.index < "20130101") & ("20130101" < df.dates3)]
assert_frame_equal(res, expec)
def test_date_index_query_with_NaT(self):
engine, parser = self.engine, self.parser
n = 10
df = DataFrame(np.random.randn(n, 3))
df["dates1"] = date_range("1/1/2012", periods=n)
df["dates3"] = date_range("1/1/2014", periods=n)
df.iloc[0, 0] = pd.NaT
df.set_index("dates1", inplace=True, drop=True)
res = df.query("index < 20130101 < dates3", engine=engine, parser=parser)
expec = df[(df.index < "20130101") & ("20130101" < df.dates3)]
assert_frame_equal(res, expec)
def test_date_index_query_with_NaT_duplicates(self):
engine, parser = self.engine, self.parser
n = 10
d = {}
d["dates1"] = date_range("1/1/2012", periods=n)
d["dates3"] = date_range("1/1/2014", periods=n)
df = DataFrame(d)
df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT
df.set_index("dates1", inplace=True, drop=True)
res = df.query("dates1 < 20130101 < dates3", engine=engine, parser=parser)
expec = df[(df.index.to_series() < "20130101") & ("20130101" < df.dates3)]
assert_frame_equal(res, expec)
def test_date_query_with_non_date(self):
engine, parser = self.engine, self.parser
n = 10
df = DataFrame(
{"dates": date_range("1/1/2012", periods=n), "nondate": np.arange(n)}
)
result = df.query("dates == nondate", parser=parser, engine=engine)
assert len(result) == 0
result = df.query("dates != nondate", parser=parser, engine=engine)
assert_frame_equal(result, df)
for op in ["<", ">", "<=", ">="]:
with pytest.raises(TypeError):
df.query(
"dates {op} nondate".format(op=op), parser=parser, engine=engine
)
def test_query_syntax_error(self):
engine, parser = self.engine, self.parser
df = DataFrame({"i": range(10), "+": range(3, 13), "r": range(4, 14)})
with pytest.raises(SyntaxError):
df.query("i - +", engine=engine, parser=parser)
def test_query_scope(self):
from pandas.core.computation.ops import UndefinedVariableError
engine, parser = self.engine, self.parser
skip_if_no_pandas_parser(parser)
df = DataFrame(np.random.randn(20, 2), columns=list("ab"))
a, b = 1, 2 # noqa
res = df.query("a > b", engine=engine, parser=parser)
expected = df[df.a > df.b]
assert_frame_equal(res, expected)
res = df.query("@a > b", engine=engine, parser=parser)
expected = df[a > df.b]
assert_frame_equal(res, expected)
# no local variable c
with pytest.raises(UndefinedVariableError):
df.query("@a > b > @c", engine=engine, parser=parser)
# no column named 'c'
with pytest.raises(UndefinedVariableError):
df.query("@a > b > c", engine=engine, parser=parser)
def test_query_doesnt_pickup_local(self):
from pandas.core.computation.ops import UndefinedVariableError
engine, parser = self.engine, self.parser
n = m = 10
df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc"))
# we don't pick up the local 'sin'
with pytest.raises(UndefinedVariableError):
df.query("sin > 5", engine=engine, parser=parser)
def test_query_builtin(self):
from pandas.core.computation.engines import NumExprClobberingError
engine, parser = self.engine, self.parser
n = m = 10
df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc"))
df.index.name = "sin"
msg = "Variables in expression.+"
with pytest.raises(NumExprClobberingError, match=msg):
df.query("sin > 5", engine=engine, parser=parser)
def test_query(self):
engine, parser = self.engine, self.parser
df = DataFrame(np.random.randn(10, 3), columns=["a", "b", "c"])
assert_frame_equal(
df.query("a < b", engine=engine, parser=parser), df[df.a < df.b]
)
assert_frame_equal(
df.query("a + b > b * c", engine=engine, parser=parser),
df[df.a + df.b > df.b * df.c],
)
def test_query_index_with_name(self):
engine, parser = self.engine, self.parser
df = DataFrame(
np.random.randint(10, size=(10, 3)),
index=Index(range(10), name="blob"),
columns=["a", "b", "c"],
)
res = df.query("(blob < 5) & (a < b)", engine=engine, parser=parser)
expec = df[(df.index < 5) & (df.a < df.b)]
assert_frame_equal(res, expec)
res = df.query("blob < b", engine=engine, parser=parser)
expec = df[df.index < df.b]
assert_frame_equal(res, expec)
def test_query_index_without_name(self):
engine, parser = self.engine, self.parser
df = DataFrame(
np.random.randint(10, size=(10, 3)),
index=range(10),
columns=["a", "b", "c"],
)
# "index" should refer to the index
res = df.query("index < b", engine=engine, parser=parser)
expec = df[df.index < df.b]
assert_frame_equal(res, expec)
# test against a scalar
res = df.query("index < 5", engine=engine, parser=parser)
expec = df[df.index < 5]
assert_frame_equal(res, expec)
def test_nested_scope(self):
engine = self.engine
parser = self.parser
skip_if_no_pandas_parser(parser)
df = DataFrame(np.random.randn(5, 3))
df2 = DataFrame(np.random.randn(5, 3))
expected = df[(df > 0) & (df2 > 0)]
result = df.query("(@df > 0) & (@df2 > 0)", engine=engine, parser=parser)
assert_frame_equal(result, expected)
result = pd.eval("df[df > 0 and df2 > 0]", engine=engine, parser=parser)
assert_frame_equal(result, expected)
result = pd.eval(
"df[df > 0 and df2 > 0 and df[df > 0] > 0]", engine=engine, parser=parser
)
expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
assert_frame_equal(result, expected)
result = pd.eval("df[(df>0) & (df2>0)]", engine=engine, parser=parser)
expected = df.query("(@df>0) & (@df2>0)", engine=engine, parser=parser)
assert_frame_equal(result, expected)
def test_nested_raises_on_local_self_reference(self):
from pandas.core.computation.ops import UndefinedVariableError
df = DataFrame(np.random.randn(5, 3))
# can't reference ourself b/c we're a local so @ is necessary
with pytest.raises(UndefinedVariableError):
df.query("df > 0", engine=self.engine, parser=self.parser)
def test_local_syntax(self):
skip_if_no_pandas_parser(self.parser)
engine, parser = self.engine, self.parser
df = DataFrame(np.random.randn(100, 10), columns=list("abcdefghij"))
b = 1
expect = df[df.a < b]
result = df.query("a < @b", engine=engine, parser=parser)
assert_frame_equal(result, expect)
expect = df[df.a < df.b]
result = df.query("a < b", engine=engine, parser=parser)
assert_frame_equal(result, expect)
def test_chained_cmp_and_in(self):
skip_if_no_pandas_parser(self.parser)
engine, parser = self.engine, self.parser
cols = list("abc")
df = DataFrame(np.random.randn(100, len(cols)), columns=cols)
res = df.query(
"a < b < c and a not in b not in c", engine=engine, parser=parser
)
ind = (
(df.a < df.b) & (df.b < df.c) & ~df.b.isin(df.a) & ~df.c.isin(df.b)
) # noqa
expec = df[ind]
assert_frame_equal(res, expec)
def test_local_variable_with_in(self):
engine, parser = self.engine, self.parser
skip_if_no_pandas_parser(parser)
a = Series(np.random.randint(3, size=15), name="a")
b = Series(np.random.randint(10, size=15), name="b")
df = DataFrame({"a": a, "b": b})
expected = df.loc[(df.b - 1).isin(a)]
result = df.query("b - 1 in a", engine=engine, parser=parser)
assert_frame_equal(expected, result)
b = Series(np.random.randint(10, size=15), name="b")
expected = df.loc[(b - 1).isin(a)]
result = df.query("@b - 1 in a", engine=engine, parser=parser)
assert_frame_equal(expected, result)
def test_at_inside_string(self):
engine, parser = self.engine, self.parser
skip_if_no_pandas_parser(parser)
c = 1 # noqa
df = DataFrame({"a": ["a", "a", "b", "b", "@c", "@c"]})
result = df.query('a == "@c"', engine=engine, parser=parser)
expected = df[df.a == "@c"]
assert_frame_equal(result, expected)
def test_query_undefined_local(self):
from pandas.core.computation.ops import UndefinedVariableError
engine, parser = self.engine, self.parser
skip_if_no_pandas_parser(parser)
df = DataFrame(np.random.rand(10, 2), columns=list("ab"))
msg = "local variable 'c' is not defined"
with pytest.raises(UndefinedVariableError, match=msg):
df.query("a == @c", engine=engine, parser=parser)
def test_index_resolvers_come_after_columns_with_the_same_name(self):
n = 1 # noqa
a = np.r_[20:101:20]
df = DataFrame({"index": a, "b": np.random.randn(a.size)})
df.index.name = "index"
result = df.query("index > 5", engine=self.engine, parser=self.parser)
expected = df[df["index"] > 5]
assert_frame_equal(result, expected)
df = DataFrame({"index": a, "b": np.random.randn(a.size)})
result = df.query("ilevel_0 > 5", engine=self.engine, parser=self.parser)
expected = df.loc[df.index[df.index > 5]]
assert_frame_equal(result, expected)
df = DataFrame({"a": a, "b": np.random.randn(a.size)})
df.index.name = "a"
result = df.query("a > 5", engine=self.engine, parser=self.parser)
expected = df[df.a > 5]
assert_frame_equal(result, expected)
result = df.query("index > 5", engine=self.engine, parser=self.parser)
expected = df.loc[df.index[df.index > 5]]
assert_frame_equal(result, expected)
def test_inf(self):
n = 10
df = DataFrame({"a": np.random.rand(n), "b": np.random.rand(n)})
df.loc[::2, 0] = np.inf
ops = "==", "!="
d = dict(zip(ops, (operator.eq, operator.ne)))
for op, f in d.items():
q = "a {op} inf".format(op=op)
expected = df[f(df.a, np.inf)]
result = df.query(q, engine=self.engine, parser=self.parser)
assert_frame_equal(result, expected)
@td.skip_if_no_ne
class TestDataFrameQueryNumExprPython(TestDataFrameQueryNumExprPandas):
@classmethod
def setup_class(cls):
super().setup_class()
cls.engine = "numexpr"
cls.parser = "python"
cls.frame = TestData().frame
def test_date_query_no_attribute_access(self):
engine, parser = self.engine, self.parser
df = DataFrame(np.random.randn(5, 3))
df["dates1"] = date_range("1/1/2012", periods=5)
df["dates2"] = date_range("1/1/2013", periods=5)
df["dates3"] = date_range("1/1/2014", periods=5)
res = df.query(
"(dates1 < 20130101) & (20130101 < dates3)", engine=engine, parser=parser
)
expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)]
assert_frame_equal(res, expec)
def test_date_query_with_NaT(self):
engine, parser = self.engine, self.parser
n = 10
df = DataFrame(np.random.randn(n, 3))
df["dates1"] = date_range("1/1/2012", periods=n)
df["dates2"] = date_range("1/1/2013", periods=n)
df["dates3"] = date_range("1/1/2014", periods=n)
df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT
df.loc[np.random.rand(n) > 0.5, "dates3"] = pd.NaT
res = df.query(
"(dates1 < 20130101) & (20130101 < dates3)", engine=engine, parser=parser
)
expec = df[(df.dates1 < "20130101") & ("20130101" < df.dates3)]
assert_frame_equal(res, expec)
def test_date_index_query(self):
engine, parser = self.engine, self.parser
n = 10
df = DataFrame(np.random.randn(n, 3))
df["dates1"] = date_range("1/1/2012", periods=n)
df["dates3"] = date_range("1/1/2014", periods=n)
df.set_index("dates1", inplace=True, drop=True)
res = df.query(
"(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser
)
expec = df[(df.index < "20130101") & ("20130101" < df.dates3)]
assert_frame_equal(res, expec)
def test_date_index_query_with_NaT(self):
engine, parser = self.engine, self.parser
n = 10
df = DataFrame(np.random.randn(n, 3))
df["dates1"] = date_range("1/1/2012", periods=n)
df["dates3"] = date_range("1/1/2014", periods=n)
df.iloc[0, 0] = pd.NaT
df.set_index("dates1", inplace=True, drop=True)
res = df.query(
"(index < 20130101) & (20130101 < dates3)", engine=engine, parser=parser
)
expec = df[(df.index < "20130101") & ("20130101" < df.dates3)]
assert_frame_equal(res, expec)
def test_date_index_query_with_NaT_duplicates(self):
engine, parser = self.engine, self.parser
n = 10
df = DataFrame(np.random.randn(n, 3))
df["dates1"] = date_range("1/1/2012", periods=n)
df["dates3"] = date_range("1/1/2014", periods=n)
df.loc[np.random.rand(n) > 0.5, "dates1"] = pd.NaT
df.set_index("dates1", inplace=True, drop=True)
with pytest.raises(NotImplementedError):
df.query("index < 20130101 < dates3", engine=engine, parser=parser)
def test_nested_scope(self):
from pandas.core.computation.ops import UndefinedVariableError
engine = self.engine
parser = self.parser
# smoke test
x = 1 # noqa
result = pd.eval("x + 1", engine=engine, parser=parser)
assert result == 2
df = DataFrame(np.random.randn(5, 3))
df2 = DataFrame(np.random.randn(5, 3))
# don't have the pandas parser
with pytest.raises(SyntaxError):
df.query("(@df>0) & (@df2>0)", engine=engine, parser=parser)
with pytest.raises(UndefinedVariableError):
df.query("(df>0) & (df2>0)", engine=engine, parser=parser)
expected = df[(df > 0) & (df2 > 0)]
result = pd.eval("df[(df > 0) & (df2 > 0)]", engine=engine, parser=parser)
assert_frame_equal(expected, result)
expected = df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]
result = pd.eval(
"df[(df > 0) & (df2 > 0) & (df[df > 0] > 0)]", engine=engine, parser=parser
)
assert_frame_equal(expected, result)
class TestDataFrameQueryPythonPandas(TestDataFrameQueryNumExprPandas):
@classmethod
def setup_class(cls):
super().setup_class()
cls.engine = "python"
cls.parser = "pandas"
cls.frame = TestData().frame
def test_query_builtin(self):
engine, parser = self.engine, self.parser
n = m = 10
df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc"))
df.index.name = "sin"
expected = df[df.index > 5]
result = df.query("sin > 5", engine=engine, parser=parser)
assert_frame_equal(expected, result)
class TestDataFrameQueryPythonPython(TestDataFrameQueryNumExprPython):
@classmethod
def setup_class(cls):
super().setup_class()
cls.engine = cls.parser = "python"
cls.frame = TestData().frame
def test_query_builtin(self):
engine, parser = self.engine, self.parser
n = m = 10
df = DataFrame(np.random.randint(m, size=(n, 3)), columns=list("abc"))
df.index.name = "sin"
expected = df[df.index > 5]
result = df.query("sin > 5", engine=engine, parser=parser)
assert_frame_equal(expected, result)
class TestDataFrameQueryStrings:
def test_str_query_method(self, parser, engine):
df = DataFrame(np.random.randn(10, 1), columns=["b"])
df["strings"] = Series(list("aabbccddee"))
expect = df[df.strings == "a"]
if parser != "pandas":
col = "strings"
lst = '"a"'
lhs = [col] * 2 + [lst] * 2
rhs = lhs[::-1]
eq, ne = "==", "!="
ops = 2 * ([eq] + [ne])
for lhs, op, rhs in zip(lhs, ops, rhs):
ex = "{lhs} {op} {rhs}".format(lhs=lhs, op=op, rhs=rhs)
msg = r"'(Not)?In' nodes are not implemented"
with pytest.raises(NotImplementedError, match=msg):
df.query(
ex,
engine=engine,
parser=parser,
local_dict={"strings": df.strings},
)
else:
res = df.query('"a" == strings', engine=engine, parser=parser)
assert_frame_equal(res, expect)
res = df.query('strings == "a"', engine=engine, parser=parser)
assert_frame_equal(res, expect)
assert_frame_equal(res, df[df.strings.isin(["a"])])
expect = df[df.strings != "a"]
res = df.query('strings != "a"', engine=engine, parser=parser)
assert_frame_equal(res, expect)
res = df.query('"a" != strings', engine=engine, parser=parser)
assert_frame_equal(res, expect)
assert_frame_equal(res, df[~df.strings.isin(["a"])])
def test_str_list_query_method(self, parser, engine):
df = DataFrame(np.random.randn(10, 1), columns=["b"])
df["strings"] = Series(list("aabbccddee"))
expect = df[df.strings.isin(["a", "b"])]
if parser != "pandas":
col = "strings"
lst = '["a", "b"]'
lhs = [col] * 2 + [lst] * 2
rhs = lhs[::-1]
eq, ne = "==", "!="
ops = 2 * ([eq] + [ne])
for lhs, op, rhs in zip(lhs, ops, rhs):
ex = "{lhs} {op} {rhs}".format(lhs=lhs, op=op, rhs=rhs)
with pytest.raises(NotImplementedError):
df.query(ex, engine=engine, parser=parser)
else:
res = df.query('strings == ["a", "b"]', engine=engine, parser=parser)
assert_frame_equal(res, expect)
res = df.query('["a", "b"] == strings', engine=engine, parser=parser)
assert_frame_equal(res, expect)
expect = df[~df.strings.isin(["a", "b"])]
res = df.query('strings != ["a", "b"]', engine=engine, parser=parser)
assert_frame_equal(res, expect)
res = df.query('["a", "b"] != strings', engine=engine, parser=parser)
assert_frame_equal(res, expect)
def test_query_with_string_columns(self, parser, engine):
df = DataFrame(
{
"a": list("aaaabbbbcccc"),
"b": list("aabbccddeeff"),
"c": np.random.randint(5, size=12),
"d": np.random.randint(9, size=12),
}
)
if parser == "pandas":
res = df.query("a in b", parser=parser, engine=engine)
expec = df[df.a.isin(df.b)]
assert_frame_equal(res, expec)
res = df.query("a in b and c < d", parser=parser, engine=engine)
expec = df[df.a.isin(df.b) & (df.c < df.d)]
assert_frame_equal(res, expec)
else:
with pytest.raises(NotImplementedError):
df.query("a in b", parser=parser, engine=engine)
with pytest.raises(NotImplementedError):
df.query("a in b and c < d", parser=parser, engine=engine)
def test_object_array_eq_ne(self, parser, engine):
df = DataFrame(
{
"a": list("aaaabbbbcccc"),
"b": list("aabbccddeeff"),
"c": np.random.randint(5, size=12),
"d": np.random.randint(9, size=12),
}
)
res = df.query("a == b", parser=parser, engine=engine)
exp = df[df.a == df.b]
assert_frame_equal(res, exp)
res = df.query("a != b", parser=parser, engine=engine)
exp = df[df.a != df.b]
assert_frame_equal(res, exp)
def test_query_with_nested_strings(self, parser, engine):
skip_if_no_pandas_parser(parser)
raw = """id event timestamp
1 "page 1 load" 1/1/2014 0:00:01
1 "page 1 exit" 1/1/2014 0:00:31
2 "page 2 load" 1/1/2014 0:01:01
2 "page 2 exit" 1/1/2014 0:01:31
3 "page 3 load" 1/1/2014 0:02:01
3 "page 3 exit" 1/1/2014 0:02:31
4 "page 1 load" 2/1/2014 1:00:01
4 "page 1 exit" 2/1/2014 1:00:31
5 "page 2 load" 2/1/2014 1:01:01
5 "page 2 exit" 2/1/2014 1:01:31
6 "page 3 load" 2/1/2014 1:02:01
6 "page 3 exit" 2/1/2014 1:02:31
"""
df = pd.read_csv(
StringIO(raw), sep=r"\s{2,}", engine="python", parse_dates=["timestamp"]
)
expected = df[df.event == '"page 1 load"']
res = df.query("""'"page 1 load"' in event""", parser=parser, engine=engine)
assert_frame_equal(expected, res)
def test_query_with_nested_special_character(self, parser, engine):
skip_if_no_pandas_parser(parser)
df = DataFrame({"a": ["a", "b", "test & test"], "b": [1, 2, 3]})
res = df.query('a == "test & test"', parser=parser, engine=engine)
expec = df[df.a == "test & test"]
assert_frame_equal(res, expec)
def test_query_lex_compare_strings(self, parser, engine):
import operator as opr
a = Series(np.random.choice(list("abcde"), 20))
b = Series(np.arange(a.size))
df = DataFrame({"X": a, "Y": b})
ops = {"<": opr.lt, ">": opr.gt, "<=": opr.le, ">=": opr.ge}
for op, func in ops.items():
res = df.query('X %s "d"' % op, engine=engine, parser=parser)
expected = df[func(df.X, "d")]
assert_frame_equal(res, expected)
def test_query_single_element_booleans(self, parser, engine):
columns = "bid", "bidsize", "ask", "asksize"
data = np.random.randint(2, size=(1, len(columns))).astype(bool)
df = DataFrame(data, columns=columns)
res = df.query("bid & ask", engine=engine, parser=parser)
expected = df[df.bid & df.ask]
assert_frame_equal(res, expected)
def test_query_string_scalar_variable(self, parser, engine):
skip_if_no_pandas_parser(parser)
df = pd.DataFrame(
{
"Symbol": ["BUD US", "BUD US", "IBM US", "IBM US"],
"Price": [109.70, 109.72, 183.30, 183.35],
}
)
e = df[df.Symbol == "BUD US"]
symb = "BUD US" # noqa
r = df.query("Symbol == @symb", parser=parser, engine=engine)
assert_frame_equal(e, r)
class TestDataFrameEvalWithFrame:
def setup_method(self, method):
self.frame = DataFrame(np.random.randn(10, 3), columns=list("abc"))
def teardown_method(self, method):
del self.frame
def test_simple_expr(self, parser, engine):
res = self.frame.eval("a + b", engine=engine, parser=parser)
expect = self.frame.a + self.frame.b
assert_series_equal(res, expect)
def test_bool_arith_expr(self, parser, engine):
res = self.frame.eval("a[a < 1] + b", engine=engine, parser=parser)
expect = self.frame.a[self.frame.a < 1] + self.frame.b
assert_series_equal(res, expect)
@pytest.mark.parametrize("op", ["+", "-", "*", "/"])
def test_invalid_type_for_operator_raises(self, parser, engine, op):
df = DataFrame({"a": [1, 2], "b": ["c", "d"]})
msg = r"unsupported operand type\(s\) for .+: '.+' and '.+'"
with pytest.raises(TypeError, match=msg):
df.eval("a {0} b".format(op), engine=engine, parser=parser)
class TestDataFrameQueryBacktickQuoting:
@pytest.fixture(scope="class")
def df(self):
yield DataFrame(
{
"A": [1, 2, 3],
"B B": [3, 2, 1],
"C C": [4, 5, 6],
"C_C": [8, 9, 10],
"D_D D": [11, 1, 101],
}
)
def test_single_backtick_variable_query(self, df):
res = df.query("1 < `B B`")
expect = df[1 < df["B B"]]
assert_frame_equal(res, expect)
def test_two_backtick_variables_query(self, df):
res = df.query("1 < `B B` and 4 < `C C`")
expect = df[(1 < df["B B"]) & (4 < df["C C"])]
assert_frame_equal(res, expect)
def test_single_backtick_variable_expr(self, df):
res = df.eval("A + `B B`")
expect = df["A"] + df["B B"]
assert_series_equal(res, expect)
def test_two_backtick_variables_expr(self, df):
res = df.eval("`B B` + `C C`")
expect = df["B B"] + df["C C"]
assert_series_equal(res, expect)
def test_already_underscore_variable(self, df):
res = df.eval("`C_C` + A")
expect = df["C_C"] + df["A"]
assert_series_equal(res, expect)
def test_same_name_but_underscores(self, df):
res = df.eval("C_C + `C C`")
expect = df["C_C"] + df["C C"]
assert_series_equal(res, expect)
def test_mixed_underscores_and_spaces(self, df):
res = df.eval("A + `D_D D`")
expect = df["A"] + df["D_D D"]
assert_series_equal(res, expect)
def backtick_quote_name_with_no_spaces(self, df):
res = df.eval("A + `C_C`")
expect = df["A"] + df["C_C"]
assert_series_equal(res, expect)