pandas
pandas copied to clipboard
BUG: `df.eval` can't concatenate string column and string via `+`
Pandas version checks
- [X] I have checked that this issue has not already been reported.
- [X] I have confirmed this bug exists on the latest version of pandas.
- [ ] I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
In [1]: import pandas as pd
In [2]: df = pd.DataFrame({'a': ['a', 'b'], 'b': ['1', '2']})
In [3]: df
Out[3]:
a b
0 a 1
1 b 2
In [4]: df.eval('a + "a"') # fail
TypeError: unsupported operand type(s) for +: 'object' and '<class 'str'>'
In [5]: df.convert_dtypes().eval('a + "a"') # fail too
TypeError: Cannot interpret 'string[python]' as a data type
In [6]: df.eval("a + b") # work
Out[6]:
0 a1
1 b2
dtype: object
In [7]: df.dtypes
Out[7]:
a object
b object
dtype: object
In [8]: df.convert_dtypes().dtypes
Out[8]:
a string
b string
dtype: object
`df.eval('a + "a"')` error messages
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-4-588814d68130> in <cell line: 1>()
----> 1 df.eval('a + "a"')
~\mambaforge\envs\work\lib\site-packages\pandas\core\frame.py in eval(self, expr, inplace, **kwargs)
4238 kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers
4239
-> 4240 return _eval(expr, inplace=inplace, **kwargs)
4241
4242 def select_dtypes(self, include=None, exclude=None) -> DataFrame:
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\eval.py in eval(expr, parser, engine, truediv, local_dict, global_dict, resolvers, level, target, inplace)
348 )
349
--> 350 parsed_expr = Expr(expr, engine=engine, parser=parser, env=env)
351
352 # construct the engine and evaluate the parsed expression
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\expr.py in __init__(self, expr, engine, parser, env, level)
809 self.parser = parser
810 self._visitor = PARSERS[parser](self.env, self.engine, self.parser)
--> 811 self.terms = self.parse()
812
813 @property
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\expr.py in parse(self)
828 Parse an expression.
829 """
--> 830 return self._visitor.visit(self.expr)
831
832 @property
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\expr.py in visit(self, node, **kwargs)
413 method = "visit_" + type(node).__name__
414 visitor = getattr(self, method)
--> 415 return visitor(node, **kwargs)
416
417 def visit_Module(self, node, **kwargs):
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\expr.py in visit_Module(self, node, **kwargs)
419 raise SyntaxError("only a single expression is allowed")
420 expr = node.body[0]
--> 421 return self.visit(expr, **kwargs)
422
423 def visit_Expr(self, node, **kwargs):
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\expr.py in visit(self, node, **kwargs)
413 method = "visit_" + type(node).__name__
414 visitor = getattr(self, method)
--> 415 return visitor(node, **kwargs)
416
417 def visit_Module(self, node, **kwargs):
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\expr.py in visit_Expr(self, node, **kwargs)
422
423 def visit_Expr(self, node, **kwargs):
--> 424 return self.visit(node.value, **kwargs)
425
426 def _rewrite_membership_op(self, node, left, right):
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\expr.py in visit(self, node, **kwargs)
413 method = "visit_" + type(node).__name__
414 visitor = getattr(self, method)
--> 415 return visitor(node, **kwargs)
416
417 def visit_Module(self, node, **kwargs):
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\expr.py in visit_BinOp(self, node, **kwargs)
536 op, op_class, left, right = self._maybe_transform_eq_ne(node)
537 left, right = self._maybe_downcast_constants(left, right)
--> 538 return self._maybe_evaluate_binop(op, op_class, left, right)
539
540 def visit_Div(self, node, **kwargs):
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\expr.py in _maybe_evaluate_binop(self, op, op_class, lhs, rhs, eval_in_python, maybe_eval_in_python)
506
507 if res.has_invalid_return_type:
--> 508 raise TypeError(
509 f"unsupported operand type(s) for {res.op}: "
510 f"'{lhs.type}' and '{rhs.type}'"
TypeError: unsupported operand type(s) for +: 'object' and '<class 'str'>'
`df.convert_dtypes().eval('a + "a"')` error messages
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-5-42cd31671fee> in <cell line: 1>()
----> 1 df.convert_dtypes().eval('a + "a"')
~\mambaforge\envs\work\lib\site-packages\pandas\core\frame.py in eval(self, expr, inplace, **kwargs)
4238 kwargs["resolvers"] = tuple(kwargs.get("resolvers", ())) + resolvers
4239
-> 4240 return _eval(expr, inplace=inplace, **kwargs)
4241
4242 def select_dtypes(self, include=None, exclude=None) -> DataFrame:
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\eval.py in eval(expr, parser, engine, truediv, local_dict, global_dict, resolvers, level, target, inplace)
348 )
349
--> 350 parsed_expr = Expr(expr, engine=engine, parser=parser, env=env)
351
352 # construct the engine and evaluate the parsed expression
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\expr.py in __init__(self, expr, engine, parser, env, level)
809 self.parser = parser
810 self._visitor = PARSERS[parser](self.env, self.engine, self.parser)
--> 811 self.terms = self.parse()
812
813 @property
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\expr.py in parse(self)
828 Parse an expression.
829 """
--> 830 return self._visitor.visit(self.expr)
831
832 @property
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\expr.py in visit(self, node, **kwargs)
413 method = "visit_" + type(node).__name__
414 visitor = getattr(self, method)
--> 415 return visitor(node, **kwargs)
416
417 def visit_Module(self, node, **kwargs):
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\expr.py in visit_Module(self, node, **kwargs)
419 raise SyntaxError("only a single expression is allowed")
420 expr = node.body[0]
--> 421 return self.visit(expr, **kwargs)
422
423 def visit_Expr(self, node, **kwargs):
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\expr.py in visit(self, node, **kwargs)
413 method = "visit_" + type(node).__name__
414 visitor = getattr(self, method)
--> 415 return visitor(node, **kwargs)
416
417 def visit_Module(self, node, **kwargs):
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\expr.py in visit_Expr(self, node, **kwargs)
422
423 def visit_Expr(self, node, **kwargs):
--> 424 return self.visit(node.value, **kwargs)
425
426 def _rewrite_membership_op(self, node, left, right):
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\expr.py in visit(self, node, **kwargs)
413 method = "visit_" + type(node).__name__
414 visitor = getattr(self, method)
--> 415 return visitor(node, **kwargs)
416
417 def visit_Module(self, node, **kwargs):
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\expr.py in visit_BinOp(self, node, **kwargs)
536 op, op_class, left, right = self._maybe_transform_eq_ne(node)
537 left, right = self._maybe_downcast_constants(left, right)
--> 538 return self._maybe_evaluate_binop(op, op_class, left, right)
539
540 def visit_Div(self, node, **kwargs):
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\expr.py in _maybe_evaluate_binop(self, op, op_class, lhs, rhs, eval_in_python, maybe_eval_in_python)
505 res = op(lhs, rhs)
506
--> 507 if res.has_invalid_return_type:
508 raise TypeError(
509 f"unsupported operand type(s) for {res.op}: "
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\ops.py in has_invalid_return_type(self)
243 types = self.operand_types
244 obj_dtype_set = frozenset([np.dtype("object")])
--> 245 return self.return_type == object and types - obj_dtype_set
246
247 @property
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\ops.py in return_type(self)
237 if self.op in (CMP_OPS_SYMS + BOOL_OPS_SYMS):
238 return np.bool_
--> 239 return result_type_many(*(term.type for term in com.flatten(self)))
240
241 @property
~\mambaforge\envs\work\lib\site-packages\pandas\core\computation\common.py in result_type_many(*arrays_and_dtypes)
21 """
22 try:
---> 23 return np.result_type(*arrays_and_dtypes)
24 except ValueError:
25 # we have > NPY_MAXARGS terms in our expression
~\mambaforge\envs\work\lib\site-packages\numpy\core\overrides.py in result_type(*args, **kwargs)
TypeError: Cannot interpret 'string[python]' as a data type
Issue Description
`df.eval` can't concatenate a string column and a string literal via the `+` operator, e.g. `df.eval("string_column + 'a string'")`.
I also tried some other cases, but they failed as well:
- `df.astype(str).eval("a + 'a'")` fails
- `df.eval("a + 'a'", engine="python")` fails
Expected Behavior
>>> df.eval("a + 'a'")
0 aa
1 ba
Name: a, dtype: object
Installed Versions
While running `pd.show_versions()`, I noticed that I didn't have 'numexpr' installed.
However, after installing 'numexpr', the above cases still didn't work.
INSTALLED VERSIONS
commit : e8093ba372f9adfe79439d90fe74b0b5b6dea9d6 python : 3.9.13.final.0 python-bits : 64 OS : Windows OS-release : 10 Version : 10.0.19044 machine : AMD64 processor : Intel64 Family 6 Model 158 Stepping 10, GenuineIntel byteorder : little LC_ALL : None LANG : None LOCALE : Chinese (Simplified)_China.936
pandas : 1.4.3 numpy : 1.22.4 pytz : 2022.1 dateutil : 2.8.2 setuptools : 62.6.0 pip : 22.1.2 Cython : None pytest : None hypothesis : None sphinx : 5.0.2 blosc : None feather : None xlsxwriter : None lxml.etree : None html5lib : None pymysql : None psycopg2 : None jinja2 : 3.1.2 IPython : 7.33.0 pandas_datareader: None bs4 : 4.11.1 bottleneck : None brotli : fastparquet : None fsspec : None gcsfs : None markupsafe : 2.1.1 matplotlib : 3.5.2 numba : None numexpr : None odfpy : None openpyxl : 3.0.9 pandas_gbq : None pyarrow : None pyreadstat : None pyxlsb : None s3fs : None scipy : 1.8.1 snappy : None sqlalchemy : None tables : None tabulate : None xarray : 2022.3.0 xlrd : 1.2.0 xlwt : 1.3.0 zstandard : None
take
This looks like a numexpr
issue as this works without it:
In [1]: import pandas as pd
In [2]: df = pd.DataFrame({'A': ['a', 'b'], 'B': ['1', '2']})
In [3]: df.eval("A + B", engine="python")
Out[20]:
0 a1
1 b2
dtype: object
In [4]: df2 = df.convert_dtypes()
In [5]: df2.eval("A + B", engine="python")
Out[22]:
0 a1
1 b2
dtype: string
Your example also works verbatim, if numexpr isn't installed.
Looks like an issue in pandas/core/computation/ops.py:105
judging by the traceback.
I have the same problem.
The problem appears to be with the Op.has_invalid_return_type
method. If one changes line 241 from:
obj_dtype_set = frozenset([np.dtype("object")])
To:
obj_dtype_set = frozenset([np.dtype("object"), str])
Then the problem appears fixed for me.