modin
modin copied to clipboard
BUG: Series.compare with differently named series raises ValueError, but should not
Modin version checks
- [X] I have checked that this issue has not already been reported.
- [X] I have confirmed this bug exists on the latest released version of Modin.
- [X] I have confirmed this bug exists on the main branch of Modin. (In order to do this you can follow this guide.)
Reproducible Example
import modin.pandas as pd
pd.Series(1, name='a').compare(pd.Series(2, name='b'))
Issue Description
`DataFrame.compare` requires the two frames to have the same columns, but `Series.compare` should not require the two series to have the same name.
Expected Behavior
`Series.compare` should match pandas and ignore the series name.
Error Logs
RayTaskError(ValueError) Traceback (most recent call last)
File ~/miniconda3/envs/modin-latest/lib/python3.9/site-packages/IPython/core/formatters.py:708, in PlainTextFormatter.__call__(self, obj)
701 stream = StringIO()
702 printer = pretty.RepresentationPrinter(stream, self.verbose,
703 self.max_width, self.newline,
704 max_seq_length=self.max_seq_length,
705 singleton_pprinters=self.singleton_printers,
706 type_pprinters=self.type_printers,
707 deferred_pprinters=self.deferred_printers)
--> 708 printer.pretty(obj)
709 printer.flush()
710 return stream.getvalue()
File ~/miniconda3/envs/modin-latest/lib/python3.9/site-packages/IPython/lib/pretty.py:410, in RepresentationPrinter.pretty(self, obj)
407 return meth(obj, self, cycle)
408 if cls is not object \
409 and callable(cls.__dict__.get('__repr__')):
--> 410 return _repr_pprint(obj, self, cycle)
412 return _default_pprint(obj, self, cycle)
413 finally:
File ~/miniconda3/envs/modin-latest/lib/python3.9/site-packages/IPython/lib/pretty.py:778, in _repr_pprint(obj, p, cycle)
776 """A pprint that just redirects to the normal repr function."""
777 # Find newlines and replace them with p.break_()
--> 778 output = repr(obj)
779 lines = output.splitlines()
780 with p.group():
File ~/sources/modin/modin/logging/logger_decorator.py:144, in enable_logging.<locals>.decorator.<locals>.run_and_log(*args, **kwargs)
129 """
130 Compute function with logging if Modin logging is enabled.
131
(...)
141 Any
142 """
143 if LogMode.get() == "disable":
--> 144 return obj(*args, **kwargs)
146 logger = get_logger()
147 logger.log(log_level, start_line)
File ~/sources/modin/modin/pandas/dataframe.py:273, in DataFrame.__repr__(self)
271 num_rows = pandas.get_option("display.max_rows") or len(self.index)
272 num_cols = pandas.get_option("display.max_columns") or len(self.columns)
--> 273 result = repr(self._build_repr_df(num_rows, num_cols))
274 if len(self.index) > num_rows or len(self.columns) > num_cols:
275 # The split here is so that we don't repr pandas row lengths.
276 return result.rsplit("\n\n", 1)[0] + "\n\n[{0} rows x {1} columns]".format(
277 len(self.index), len(self.columns)
278 )
File ~/sources/modin/modin/logging/logger_decorator.py:144, in enable_logging.<locals>.decorator.<locals>.run_and_log(*args, **kwargs)
129 """
130 Compute function with logging if Modin logging is enabled.
131
(...)
141 Any
142 """
143 if LogMode.get() == "disable":
--> 144 return obj(*args, **kwargs)
146 logger = get_logger()
147 logger.log(log_level, start_line)
File ~/sources/modin/modin/pandas/base.py:276, in BasePandasDataset._build_repr_df(self, num_rows, num_cols)
254 """
255 Build pandas DataFrame for string representation.
256
(...)
273 A pandas dataset with `num_rows` or fewer rows and `num_cols` or fewer columns.
274 """
275 # Fast track for empty dataframe.
--> 276 if len(self.index) == 0 or (self._is_dataframe and len(self.columns) == 0):
277 return pandas.DataFrame(
278 index=self.index,
279 columns=self.columns if self._is_dataframe else None,
280 )
281 row_indexer = _get_repr_axis_label_indexer(self.index, num_rows)
File ~/sources/modin/modin/pandas/base.py:4294, in BasePandasDataset.__getattribute__(self, item)
4280 @disable_logging
4281 def __getattribute__(self, item) -> Any:
4282 """
4283 Return item from the `BasePandasDataset`.
4284
(...)
4292 Any
4293 """
-> 4294 attr = super().__getattribute__(item)
4295 if item not in _DEFAULT_BEHAVIOUR and not self._query_compiler.lazy_execution:
4296 # We default to pandas on empty DataFrames. This avoids a large amount of
4297 # pain in underlying implementation and returns a result immediately rather
4298 # than dealing with the edge cases that empty DataFrames have.
4299 if callable(attr) and self.empty and hasattr(self._pandas_class, item):
File ~/sources/modin/modin/pandas/base.py:643, in BasePandasDataset._get_index(self)
634 def _get_index(self) -> pandas.Index:
635 """
636 Get the index for this DataFrame.
637
(...)
641 The union of all indexes across the partitions.
642 """
--> 643 return self._query_compiler.index
File ~/sources/modin/modin/core/storage_formats/pandas/query_compiler.py:102, in _get_axis.<locals>.<lambda>(self)
89 """
90 Build index labels getter of the specified axis.
91
(...)
99 callable(PandasQueryCompiler) -> pandas.Index
100 """
101 if axis == 0:
--> 102 return lambda self: self._modin_frame.index
103 else:
104 return lambda self: self._modin_frame.columns
File ~/sources/modin/modin/core/dataframe/pandas/dataframe/dataframe.py:709, in PandasDataframe._get_index(self)
700 """
701 Get the index from the cache object.
702
(...)
706 An index object containing the row labels.
707 """
708 if self.has_index_cache:
--> 709 index, row_lengths = self._index_cache.get(return_lengths=True)
710 else:
711 index, row_lengths = self._compute_axis_labels_and_lengths(0)
File ~/sources/modin/modin/core/dataframe/pandas/metadata/index.py:194, in ModinIndex.get(self, return_lengths)
192 if not self.is_materialized:
193 if callable(self._value):
--> 194 index, self._lengths_cache = self._value()
195 self._value = ensure_index(index)
196 elif self._value is None:
File ~/sources/modin/modin/core/dataframe/pandas/metadata/index.py:106, in ModinIndex._get_default_callable.<locals>.<lambda>()
91 @staticmethod
92 def _get_default_callable(dataframe_obj, axis):
93 """
94 Build a callable extracting index labels and partitions lengths for the specified axis.
95
(...)
104 callable() -> tuple(pandas.Index, list[ints])
105 """
--> 106 return lambda: dataframe_obj._compute_axis_labels_and_lengths(axis)
File ~/sources/modin/modin/logging/logger_decorator.py:144, in enable_logging.<locals>.decorator.<locals>.run_and_log(*args, **kwargs)
129 """
130 Compute function with logging if Modin logging is enabled.
131
(...)
141 Any
142 """
143 if LogMode.get() == "disable":
--> 144 return obj(*args, **kwargs)
146 logger = get_logger()
147 logger.log(log_level, start_line)
File ~/sources/modin/modin/core/dataframe/pandas/dataframe/dataframe.py:835, in PandasDataframe._compute_axis_labels_and_lengths(self, axis, partitions)
833 if partitions is None:
834 partitions = self._partitions
--> 835 new_index, internal_idx = self._partition_mgr_cls.get_indices(axis, partitions)
836 return new_index, list(map(len, internal_idx))
File ~/sources/modin/modin/logging/logger_decorator.py:144, in enable_logging.<locals>.decorator.<locals>.run_and_log(*args, **kwargs)
129 """
130 Compute function with logging if Modin logging is enabled.
131
(...)
141 Any
142 """
143 if LogMode.get() == "disable":
--> 144 return obj(*args, **kwargs)
146 logger = get_logger()
147 logger.log(log_level, start_line)
File ~/sources/modin/modin/core/dataframe/pandas/partitioning/partition_manager.py:1193, in PandasDataframePartitionManager.get_indices(cls, axis, partitions, index_func)
1191 if len(target):
1192 new_idx = [idx.apply(func) for idx in target[0]]
-> 1193 new_idx = cls.get_objects_from_partitions(new_idx)
1194 else:
1195 new_idx = [pandas.Index([])]
File ~/sources/modin/modin/logging/logger_decorator.py:144, in enable_logging.<locals>.decorator.<locals>.run_and_log(*args, **kwargs)
129 """
130 Compute function with logging if Modin logging is enabled.
131
(...)
141 Any
142 """
143 if LogMode.get() == "disable":
--> 144 return obj(*args, **kwargs)
146 logger = get_logger()
147 logger.log(log_level, start_line)
File ~/sources/modin/modin/core/dataframe/pandas/partitioning/partition_manager.py:1134, in PandasDataframePartitionManager.get_objects_from_partitions(cls, partitions)
1130 partitions[idx] = part.force_materialization()
1131 assert all(
1132 [len(partition.list_of_blocks) == 1 for partition in partitions]
1133 ), "Implementation assumes that each partition contains a single block."
-> 1134 return cls._execution_wrapper.materialize(
1135 [partition.list_of_blocks[0] for partition in partitions]
1136 )
1137 return [partition.get() for partition in partitions]
File ~/sources/modin/modin/core/execution/ray/common/engine_wrapper.py:139, in RayWrapper.materialize(cls, obj_id)
136 return ray.get(obj_id) if isinstance(obj_id, ray.ObjectRef) else obj_id
138 if all(isinstance(obj, ray.ObjectRef) for obj in obj_id):
--> 139 return ray.get(obj_id)
141 ids = {}
142 result = []
File ~/miniconda3/envs/modin-latest/lib/python3.9/site-packages/ray/_private/auto_init_hook.py:24, in wrap_auto_init.<locals>.auto_init_wrapper(*args, **kwargs)
21 @wraps(fn)
22 def auto_init_wrapper(*args, **kwargs):
23 auto_init_ray()
---> 24 return fn(*args, **kwargs)
File ~/miniconda3/envs/modin-latest/lib/python3.9/site-packages/ray/_private/client_mode_hook.py:103, in client_mode_hook.<locals>.wrapper(*args, **kwargs)
101 if func.__name__ != "init" or is_client_mode_enabled_by_default:
102 return getattr(ray, func.__name__)(*args, **kwargs)
--> 103 return func(*args, **kwargs)
File ~/miniconda3/envs/modin-latest/lib/python3.9/site-packages/ray/_private/worker.py:2563, in get(object_refs, timeout)
2561 worker.core_worker.dump_object_store_memory_usage()
2562 if isinstance(value, RayTaskError):
-> 2563 raise value.as_instanceof_cause()
2564 else:
2565 raise value
RayTaskError(ValueError): ray::remote_exec_func() (pid=58063, ip=127.0.0.1)
At least one of the input arguments for this task could not be computed:
ray.exceptions.RayTaskError: ray::_deploy_ray_func() (pid=58063, ip=127.0.0.1)
File "/Users/mvashishtha/sources/modin/modin/core/execution/ray/implementations/pandas_on_ray/partitioning/virtual_partition.py", line 335, in _deploy_ray_func
result = deployer(axis, f_to_deploy, f_args, f_kwargs, *deploy_args, **kwargs)
File "/Users/mvashishtha/sources/modin/modin/logging/logger_decorator.py", line 144, in run_and_log
return obj(*args, **kwargs)
File "/Users/mvashishtha/sources/modin/modin/core/dataframe/pandas/partitioning/axis_partition.py", line 575, in deploy_func_between_two_axis_partitions
result = func(lt_frame, rt_frame, *f_args, **f_kwargs)
File "/Users/mvashishtha/sources/modin/modin/core/dataframe/pandas/dataframe/dataframe.py", line 2078, in _tree_reduce_func
series_result = func(df, *args, **kwargs)
File "/Users/mvashishtha/sources/modin/modin/core/storage_formats/pandas/query_compiler.py", line 4663, in <lambda>
lambda left, right: pandas.DataFrame.compare(
File "/Users/mvashishtha/miniconda3/envs/modin-latest/lib/python3.9/site-packages/pandas/core/frame.py", line 8580, in compare
return super().compare(
File "/Users/mvashishtha/miniconda3/envs/modin-latest/lib/python3.9/site-packages/pandas/core/generic.py", line 10118, in compare
mask = ~((self == other) | (self.isna() & other.isna()))
File "/Users/mvashishtha/miniconda3/envs/modin-latest/lib/python3.9/site-packages/pandas/core/ops/common.py", line 76, in new_method
return method(self, other)
File "/Users/mvashishtha/miniconda3/envs/modin-latest/lib/python3.9/site-packages/pandas/core/arraylike.py", line 40, in __eq__
return self._cmp_method(other, operator.eq)
File "/Users/mvashishtha/miniconda3/envs/modin-latest/lib/python3.9/site-packages/pandas/core/frame.py", line 7884, in _cmp_method
self, other = self._align_for_op(other, axis, flex=False, level=None)
File "/Users/mvashishtha/miniconda3/envs/modin-latest/lib/python3.9/site-packages/pandas/core/frame.py", line 8183, in _align_for_op
raise ValueError(
ValueError: Can only compare identically-labeled (both index and columns) DataFrame objects
Installed Versions
INSTALLED VERSIONS
------------------
commit : 759d548814a6ac224e83e7531cf98e20b13d85cb
python : 3.9.18.final.0
python-bits : 64
OS : Darwin
OS-release : 23.5.0
Version : Darwin Kernel Version 23.5.0: Wed May 1 20:14:38 PDT 2024; root:xnu-10063.121.3~5/RELEASE_ARM64_T6020
machine : arm64
processor : arm
byteorder : little
LC_ALL : None
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
Modin dependencies
------------------
modin : 0.31.0+2.g759d5488
ray : 2.8.0
dask : 2024.3.1
distributed : 2024.3.1
pandas dependencies
-------------------
pandas : 2.2.1
numpy : 1.26.1
pytz : 2023.3.post1
dateutil : 2.8.2
setuptools : 68.0.0
pip : 23.3
Cython : None
pytest : 8.1.1
hypothesis : None
sphinx : 7.2.6
blosc : None
feather : None
xlsxwriter : None
lxml.etree : 5.1.0
html5lib : None
pymysql : None
psycopg2 : 2.9.9
jinja2 : 3.1.2
IPython : 8.17.2
pandas_datareader : None
adbc-driver-postgresql: None
adbc-driver-sqlite : None
bs4 : 4.12.3
bottleneck : None
dataframe-api-compat : None
fastparquet : 2024.2.0
fsspec : 2024.3.1
gcsfs : None
matplotlib : 3.8.1
numba : None
numexpr : 2.8.4
odfpy : None
openpyxl : 3.1.2
pandas_gbq : 0.22.0
pyarrow : 14.0.1
pyreadstat : None
python-calamine : None
pyxlsb : None
s3fs : 2024.3.1
scipy : 1.11.3
sqlalchemy : 2.0.29
tables : 3.9.2
tabulate : 0.9.0
xarray : 2024.2.0
xlrd : 2.0.1
zstandard : None
tzdata : 2023.3
qtpy : None
pyqt5 : None