polars
polars copied to clipboard
`test_streaming_joins`, `test_reproducible_hash_with_seeds` fail on M1
Freshly cloned from master. On the first run:
@typing.no_type_check
def test_streaming_joins() -> None:
n = 100
dfa = pd.DataFrame(
{
"a": np.random.randint(0, 40, n),
"b": np.arange(0, n),
}
)
n = 100
dfb = pd.DataFrame(
{
"a": np.random.randint(0, 40, n),
"b": np.arange(0, n),
}
)
dfa_pl = pl.from_pandas(dfa).sort("a")
dfb_pl = pl.from_pandas(dfb)
for how in ["inner", "left"]:
pd_result = dfa.merge(dfb, on="a", how=how)
pd_result.columns = ["a", "b", "b_right"]
pl_result = (
> dfa_pl.lazy()
.join(dfb_pl.lazy(), on="a", how=how)
.sort(["a", "b"])
.collect(streaming=True)
)
tests/unit/operations/test_join.py:427:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <polars.LazyFrame object at 0x12FF72130>
def collect(
self,
*,
type_coercion: bool = True,
predicate_pushdown: bool = True,
projection_pushdown: bool = True,
simplify_expression: bool = True,
no_optimization: bool = False,
slice_pushdown: bool = True,
common_subplan_elimination: bool = True,
streaming: bool = False,
) -> DataFrame:
if no_optimization:
predicate_pushdown = False
projection_pushdown = False
slice_pushdown = False
common_subplan_elimination = False
if streaming:
common_subplan_elimination = False
ldf = self._ldf.optimization_toggle(
type_coercion,
predicate_pushdown,
projection_pushdown,
simplify_expression,
slice_pushdown,
common_subplan_elimination,
streaming,
)
> return wrap_df(ldf.collect())
E OSError: No such file or directory (os error 2)
polars/lazyframe/frame.py:1475: OSError
======================================================================== short test summary info =========================================================================
FAILED tests/unit/operations/test_join.py::test_streaming_joins - OSError: No such file or directory (os error 2)
First seen this in #7537, where after rebase the same test throws this 🤔
E AssertionError: Series are different.
E
E Value mismatch
E [left]: [22, 46, 22, 46, 0, 44, 74, 44, 74, 44, 74, 51, 61, 80, 4, 10, 17, 38, 48, 64, 73, 75, 81, 48, 64, 73, 75, 81, 48, 64, 73, 75, 81, 48, 64, 73, 75, 81, 48, 64, 73, 75, 81, 48, 64, 73, 75, 81, 3, 14, 39, 58, 3, 14, 39, 58, 7, 35, 67, 7, 35, 67, 7, 35, 67, 7, 35, 67, 49, 55, 60, 6, 34, 6, 34, 6, 34, 11, 23, 57, 11, 23, 57, 11, 23, 57, 11, 23, 57, 2, 8, 45, 52, 2, 8, 45, 52, 2, 8, 45, 52, 32, 40, 42, 32, 40, 42, 32, 40, 42, 76, 77, 87, 24, 50, 50, 95, 15, 26, 79, 15, 26, 79, 15, 26, 79, 15, 26, 79, 15, 26, 79]
E [right]: [46, 22, 46, 22, 44, 74, 0, 74, 44, 74, 44, 61, 80, 51, 38, 17, 4, 10, 64, 73, 75, 81, 48, 73, 75, 81, 64, 48, 73, 75, 81, 64, 48, 73, 75, 81, 64, 48, 73, 75, 81, 64, 48, 73, 75, 81, 64, 48, 39, 3, 58, 14, 39, 3, 58, 14, 67, 7, 35, 67, 7, 35, 67, 7, 35, 67, 7, 35, 60, 49, 55, 34, 6, 34, 6, 34, 6, 23, 57, 11, 23, 57, 11, 23, 57, 11, 23, 57, 11, 45, 52, 2, 8, 45, 52, 2, 8, 45, 52, 2, 8, 40, 42, 32, 40, 42, 32, 40, 42, 32, 87, 76, 77, 50, 24, 95, 50, 79, 15, 26, 79, 15, 26, 79, 15, 26, 79, 15, 26, 79, 15, 26]
Reproducible example
# on M1 mac
git clone
cd py-polars
make test
Expected behavior
Installed versions
That's strange. I cannot reproduce this. Maybe it is something related to M1?
Can anybody else confirm having problems?
I also get a failure on my M1, but on a different test...
I get an `AssertionError: Possible import speed regression; took 773ms` for `test_polars_import` on my non-M1 machine, but I suspect that's not related.
def test_reproducible_hash_with_seeds() -> None:
"""
Test the reproducibility of DataFrame.hash_rows, Series.hash, and Expr.hash.
cf. issue #3966, hashes must always be reproducible across sessions when using
the same seeds.
"""
df = pl.DataFrame({"s": [1234, None, 5678]})
seeds = (11, 22, 33, 44)
# TODO: introduce a platform-stable string hash...
# in the meantime, try to account for arm64 (mac) hash values to reduce noise
expected = pl.Series(
"s",
[13477868900383131459, 988796329533502010, 16840582678788620208],
dtype=pl.UInt64,
)
result = df.hash_rows(*seeds)
> assert_series_equal(expected, result, check_names=False, check_exact=True)
tests/unit/test_df.py:1623:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
polars/utils/decorators.py:136: in wrapper
return function(*args, **kwargs)
polars/testing/asserts.py:244: in assert_series_equal
_assert_series_inner(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
left = shape: (3,)
Series: 's' [u64]
[
13477868900383131459
988796329533502010
16840582678788620208
]
right = shape: (3,)
Series: '' [u64]
[
13477868900383131459
15496313222292466864
16840582678788620208
], check_dtype = True, check_exact = True
nans_compare_equal = True, atol = 1e-08, rtol = 1e-05
def _assert_series_inner(
left: Series,
right: Series,
check_dtype: bool,
check_exact: bool,
nans_compare_equal: bool,
atol: float,
rtol: float,
) -> None:
"""Compare Series dtype + values."""
try:
can_be_subtracted = hasattr(dtype_to_py_type(left.dtype), "__sub__")
except NotImplementedError:
can_be_subtracted = False
check_exact = check_exact or not can_be_subtracted or left.dtype == Boolean
if check_dtype and left.dtype != right.dtype:
raise_assert_detail("Series", "Dtype mismatch", left.dtype, right.dtype)
# confirm that we can call 'is_nan' on both sides
left_is_float = left.dtype in (Float32, Float64)
right_is_float = right.dtype in (Float32, Float64)
comparing_float_dtypes = left_is_float and right_is_float
# create mask of which (if any) values are unequal
unequal = left != right
if unequal.any() and nans_compare_equal and comparing_float_dtypes:
# handle NaN values (which compare unequal to themselves)
unequal = unequal & ~((left.is_nan() & right.is_nan()).fill_null(F.lit(False)))
# assert exact, or with tolerance
if unequal.any():
if check_exact:
> raise_assert_detail(
"Series", "Exact value mismatch", left=list(left), right=list(right)
)
E AssertionError: Series are different.
E
E Exact value mismatch
E [left]: [13477868900383131459, 988796329533502010, 16840582678788620208]
E [right]: [13477868900383131459, 15496313222292466864, 16840582678788620208]
polars/testing/asserts.py:341: AssertionError
======================================================================= short test summary info =======================================================================
FAILED tests/unit/test_df.py::test_reproducible_hash_with_seeds - AssertionError: Series are different.
============================================================= 1 failed, 2005 passed, 1 skipped in 15.91s ==============================================================
make: *** [test] Error 1
That's strange. I cannot reproduce this. Maybe it is something related to M1?
I would think so. I specifically fresh-cloned the latest master and built it from scratch (`git clone` + `cd py-polars` + `make test`, nothing else).
@ritchie46 Asked someone to run a test independently on their M1 machine, same story:
📦 Built wheel for abi3 Python ≥ 3.7 to /var/folders/dr/72hf18qj4yn4zsqd2q9fwtvm0000gn/T/.tmp7M9b7w/polars-0.16.16-cp37-abi3-macosx_11_0_arm64.whl
🛠 Installed polars-0.16.16
.venv/bin/pytest -n auto --dist worksteal
=============================================================================================== test session starts ===============================================================================================
platform darwin -- Python 3.10.10, pytest-7.2.0, pluggy-1.0.0
________________________________________________________________________________________ test_reproducible_hash_with_seeds ________________________________________________________________________________________
[gw0] darwin -- Python 3.10.10 .../polars/polars/py-polars/.venv/bin/python
def test_reproducible_hash_with_seeds() -> None:
"""
Test the reproducibility of DataFrame.hash_rows, Series.hash, and Expr.hash.
cf. issue #3966, hashes must always be reproducible across sessions when using
the same seeds.
"""
df = pl.DataFrame({"s": [1234, None, 5678]})
seeds = (11, 22, 33, 44)
# TODO: introduce a platform-stable string hash...
# in the meantime, try to account for arm64 (mac) hash values to reduce noise
expected = pl.Series(
"s",
[13477868900383131459, 988796329533502010, 16840582678788620208],
dtype=pl.UInt64,
)
result = df.hash_rows(*seeds)
> assert_series_equal(expected, result, check_names=False, check_exact=True)
tests/unit/test_df.py:1623:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
polars/utils/decorators.py:136: in wrapper
return function(*args, **kwargs)
polars/testing/asserts.py:244: in assert_series_equal
_assert_series_inner(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
left = shape: (3,)
Series: 's' [u64]
[
13477868900383131459
988796329533502010
16840582678788620208
]
right = shape: (3,)
Series: '' [u64]
[
13477868900383131459
15496313222292466864
16840582678788620208
], check_dtype = True, check_exact = True, nans_compare_equal = True, atol = 1e-08, rtol = 1e-05
def _assert_series_inner(
left: Series,
right: Series,
check_dtype: bool,
check_exact: bool,
nans_compare_equal: bool,
atol: float,
rtol: float,
) -> None:
"""Compare Series dtype + values."""
try:
can_be_subtracted = hasattr(dtype_to_py_type(left.dtype), "__sub__")
except NotImplementedError:
can_be_subtracted = False
check_exact = check_exact or not can_be_subtracted or left.dtype == Boolean
if check_dtype and left.dtype != right.dtype:
raise_assert_detail("Series", "Dtype mismatch", left.dtype, right.dtype)
# confirm that we can call 'is_nan' on both sides
left_is_float = left.dtype in (Float32, Float64)
right_is_float = right.dtype in (Float32, Float64)
comparing_float_dtypes = left_is_float and right_is_float
# create mask of which (if any) values are unequal
unequal = left != right
if unequal.any() and nans_compare_equal and comparing_float_dtypes:
# handle NaN values (which compare unequal to themselves)
unequal = unequal & ~((left.is_nan() & right.is_nan()).fill_null(F.lit(False)))
# assert exact, or with tolerance
if unequal.any():
if check_exact:
> raise_assert_detail(
"Series", "Exact value mismatch", left=list(left), right=list(right)
)
E AssertionError: Series are different.
E
E Exact value mismatch
E [left]: [13477868900383131459, 988796329533502010, 16840582678788620208]
E [right]: [13477868900383131459, 15496313222292466864, 16840582678788620208]
polars/testing/asserts.py:341: AssertionError
============================================================================================= short test summary info =============================================================================================
FAILED tests/unit/test_df.py::test_reproducible_hash_with_seeds - AssertionError: Series are different.
==================================================================================== 1 failed, 2006 passed, 1 skipped in 6.26s ====================================================================================
make: *** [test] Error 1
There's a suspicious "in the meantime, try to account for arm64 (mac) hash values to reduce noise" comment there as well 🙂
Right, the `test_reproducible_hash_with_seeds` failure is expected. I must turn that test off for that architecture. We test that the hash is constant, but every time I change the hash function I miss the constants for the M1 architecture, so here we are.
But the person you asked did not experience the `test_streaming_joins` error?
I ran with
@pytest.mark.xfail(sys.platform == "darwin" and platform.machine() == 'arm64', reason="Does not work on Mac M1")
def test_reproducible_hash_with_seeds() -> None:
...
All tests passed