sgkit
sgkit copied to clipboard
Possible race condition in _concat_zarrs_optimized
================================== FAILURES ===================================
_________________ test_vcfzarr_to_zarr[None-True-True-False] __________________
shared_datadir = WindowsPath('C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/test_vcfzarr_to_zarr_None_True1/data')
tmpdir = local('C:\\Users\\runneradmin\\AppData\\Local\\Temp\\pytest-of-runneradmin\\pytest-0\\test_vcfzarr_to_zarr_None_True1')
grouped_by_contig = True, consolidated = True, has_variant_id = False
concat_algorithm = None
@pytest.mark.parametrize(
"grouped_by_contig, consolidated, has_variant_id",
[
(False, False, False),
(False, False, True),
(True, False, True),
(True, True, False),
],
)
@pytest.mark.parametrize(
"concat_algorithm",
[None, "xarray_internal"],
)
def test_vcfzarr_to_zarr(
shared_datadir,
tmpdir,
grouped_by_contig,
consolidated,
has_variant_id,
concat_algorithm,
):
if has_variant_id:
fields = None
else:
fields = [
"variants/CHROM",
"variants/POS",
"variants/REF",
"variants/ALT",
"calldata/GT",
"samples",
]
vcfzarr_path = create_vcfzarr( # type: ignore[no-untyped-call]
shared_datadir,
tmpdir,
fields=fields,
grouped_by_contig=grouped_by_contig,
consolidated=consolidated,
)
output = str(tmpdir / "vcf.zarr")
> vcfzarr_to_zarr(
vcfzarr_path,
output,
grouped_by_contig=grouped_by_contig,
concat_algorithm=concat_algorithm,
consolidated=consolidated,
)
sgkit\tests\test_vcfzarr_reader.py:135:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
sgkit\io\vcfzarr_reader.py:134: in vcfzarr_to_zarr
_concat_zarrs_optimized(
sgkit\io\vcfzarr_reader.py:259: in _concat_zarrs_optimized
output_zarr[var].attrs.update(attrs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
self = <zarr.hierarchy.Group '/'>, item = 'variant_allele'
def __getitem__(self, item):
"""Obtain a group member.
Parameters
----------
item : string
Member name or path.
Examples
--------
>>> import zarr
>>> g1 = zarr.group()
>>> d1 = g1.create_dataset('foo/bar/baz', shape=100, chunks=10)
>>> g1['foo']
<zarr.hierarchy.Group '/foo'>
>>> g1['foo/bar']
<zarr.hierarchy.Group '/foo/bar'>
>>> g1['foo/bar/baz']
<zarr.core.Array '/foo/bar/baz' (100,) float64>
"""
path = self._item_path(item)
if contains_array(self._store, path):
return Array(self._store, read_only=self._read_only, path=path,
chunk_store=self._chunk_store,
synchronizer=self._synchronizer, cache_attrs=self.attrs.cache)
elif contains_group(self._store, path):
return Group(self._store, read_only=self._read_only, path=path,
chunk_store=self._chunk_store, cache_attrs=self.attrs.cache,
synchronizer=self._synchronizer)
else:
> raise KeyError(item)
E KeyError: 'variant_allele'
c:\miniconda3\envs\test\lib\site-packages\zarr\hierarchy.py:349: KeyError
---------------------------- Captured stderr call -----------------------------
Exception ignored in: <_io.FileIO name='C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/test_vcfzarr_to_zarr_None_True1/vcf.zarr/call_genotype_mask/.zarray' mode='rb' closefd=True>
Traceback (most recent call last):
File "c:\miniconda3\envs\test\lib\site-packages\fsspec\spec.py", line 626, in cat_file
return self.open(path, "rb").read()
ResourceWarning: unclosed file <_io.BufferedReader name='C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/test_vcfzarr_to_zarr_None_True1/vcf.zarr/call_genotype_mask/.zarray'>
Exception ignored in: <_io.FileIO name='C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/test_vcfzarr_to_zarr_None_True1/vcf.zarr/variant_position/.zarray' mode='rb' closefd=True>
Traceback (most recent call last):
File "c:\miniconda3\envs\test\lib\site-packages\fsspec\spec.py", line 626, in cat_file
return self.open(path, "rb").read()
ResourceWarning: unclosed file <_io.BufferedReader name='C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/test_vcfzarr_to_zarr_None_True1/vcf.zarr/variant_position/.zarray'>
Exception ignored in: <_io.FileIO name='C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/test_vcfzarr_to_zarr_None_True1/vcf.zarr/call_genotype/.zarray' mode='rb' closefd=True>
Traceback (most recent call last):
File "c:\miniconda3\envs\test\lib\site-packages\fsspec\spec.py", line 626, in cat_file
return self.open(path, "rb").read()
ResourceWarning: unclosed file <_io.BufferedReader name='C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/test_vcfzarr_to_zarr_None_True1/vcf.zarr/call_genotype/.zarray'>
Exception ignored in: <_io.FileIO name='C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/test_vcfzarr_to_zarr_None_True1/vcf.zarr/variant_contig/.zarray' mode='rb' closefd=True>
Traceback (most recent call last):
File "c:\miniconda3\envs\test\lib\site-packages\fsspec\spec.py", line 626, in cat_file
return self.open(path, "rb").read()
ResourceWarning: unclosed file <_io.BufferedReader name='C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/test_vcfzarr_to_zarr_None_True1/vcf.zarr/variant_contig/.zarray'>
=========================== short test summary info ===========================
FAILED sgkit/tests/test_vcfzarr_reader.py::test_vcfzarr_to_zarr[None-True-True-False]
====== 1 failed, 480 passed, 13 skipped, 2 xfailed in 198.28s (0:03:18) =======
Error: Process completed with exit code 1.
Hmm, what do you think the race condition is on @tomwhite? Windows can be good for shaking out bugs like this - but on the other hand, its more strict permissions model around open files leads to some annoying quirks too.
I'm not sure exactly, but it looks like a race between the Zarr attributes for a variable being updated (variant_allele) the first and second time. Perhaps those two blocks should be in separate Zarr context managers?
I can't reproduce this locally, even if I run the test in a loop.
Tricky. I think we need @alimanfoo's eyes on this one...
Failed again here: https://github.com/ravwojdyla/sgkit/runs/1855182247?check_suite_focus=true
Another one: https://github.com/pystatgen/sgkit/runs/1891619188?check_suite_focus=true
So not just on Windows
I tried to reproduce this again, but no luck. I tried creating standalone test cases that I could run more frequently, but I never saw the error locally on my machine. One possibility is that it needs a more highly-contended machine.
So I had a look at what is happening in more detail. The problem boils down to two places in the code where writes to the same Zarr array take place:
Operation 1. is run by Dask delayed, but the contract of compute is such that the tasks should all be completed before the call to compute returns. Operation 2. is run after the call to compute returns, but in the main thread.
As an attempt to fix this, I have made a change so that both operations are run by Dask delayed, and such that the second uses the array object created by the first. (In the current code for 2. the Zarr array is read from the file system separately.) Since I haven't been able to reproduce the problem I don't know if this fix works, but since we see it fairly regularly in CI, it might be worth merging to see if it has an effect.
There was another race condition failure here: https://github.com/pystatgen/sgkit/runs/2157617424?check_suite_focus=true. The failure occurred in a different place though - between writing the zarr array, and reading it back in the test. So it might be mitigated in test code.
================================== FAILURES ===================================
_________________ test_vcfzarr_to_zarr[None-True-False-True] __________________
self = <xarray.Dataset>
Dimensions: (alleles: 4, ploidy: 2, samples: 3, variants: 9)
Dimensions without coordinat...:
contigs: ['19', '20', 'X']
max_length_variant_allele: 1
max_length_variant_id: 1
name = 'variant_position'
def _construct_dataarray(self, name: Hashable) -> "DataArray":
"""Construct a DataArray by indexing this dataset"""
from .dataarray import DataArray
try:
> variable = self._variables[name]
E KeyError: 'variant_position'
c:\miniconda3\envs\test\lib\site-packages\xarray\core\dataset.py:1305: KeyError
During handling of the above exception, another exception occurred:
shared_datadir = WindowsPath('C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/test_vcfzarr_to_zarr_None_True0/data')
tmpdir = local('C:\\Users\\runneradmin\\AppData\\Local\\Temp\\pytest-of-runneradmin\\pytest-0\\test_vcfzarr_to_zarr_None_True0')
grouped_by_contig = True, consolidated = False, has_variant_id = True
concat_algorithm = None
@pytest.mark.parametrize(
"grouped_by_contig, consolidated, has_variant_id",
[
(False, False, False),
(False, False, True),
(True, False, True),
(True, True, False),
],
)
@pytest.mark.parametrize(
"concat_algorithm",
[None, "xarray_internal"],
)
def test_vcfzarr_to_zarr(
shared_datadir,
tmpdir,
grouped_by_contig,
consolidated,
has_variant_id,
concat_algorithm,
):
if has_variant_id:
fields = None
else:
fields = [
"variants/CHROM",
"variants/POS",
"variants/REF",
"variants/ALT",
"calldata/GT",
"samples",
]
vcfzarr_path = create_vcfzarr( # type: ignore[no-untyped-call]
shared_datadir,
tmpdir,
fields=fields,
grouped_by_contig=grouped_by_contig,
consolidated=consolidated,
)
output = str(tmpdir / "vcf.zarr")
vcfzarr_to_zarr(
vcfzarr_path,
output,
grouped_by_contig=grouped_by_contig,
concat_algorithm=concat_algorithm,
consolidated=consolidated,
)
ds = xr.open_zarr(output) # type: ignore[no-untyped-call]
# Note that variant_allele values are byte strings, not unicode strings (unlike for read_vcfzarr)
# We should make the two consistent.
assert ds.attrs["contigs"] == ["19", "20", "X"]
assert_array_equal(ds["variant_contig"], [0, 0, 1, 1, 1, 1, 1, 1, 2])
assert_array_equal(
> ds["variant_position"],
[111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10],
)
sgkit\tests\test_vcfzarr_reader.py:151:
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
c:\miniconda3\envs\test\lib\site-packages\xarray\core\dataset.py:1409: in __getitem__
return self._construct_dataarray(key)
c:\miniconda3\envs\test\lib\site-packages\xarray\core\dataset.py:1307: in _construct_dataarray
_, name, variable = _get_virtual_variable(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
variables = {'call_genotype': <xarray.Variable (variants: 9, samples: 3, ploidy: 2)>
dask.array<xarray-call_genotype, shape=(9, 3,...e=|S3, chunksize=(2, 4), chunktype=numpy.ndarray>
Attributes:
comment: The possible alleles for the variant., ...}
key = 'variant_position', level_vars = {}
dim_sizes = Frozen(SortedKeysDict({'variants': 9, 'samples': 3, 'ploidy': 2, 'alleles': 4}))
def _get_virtual_variable(
variables, key: Hashable, level_vars: Mapping = None, dim_sizes: Mapping = None
) -> Tuple[Hashable, Hashable, Variable]:
"""Get a virtual variable (e.g., 'time.year' or a MultiIndex level)
from a dict of xarray.Variable objects (if possible)
"""
if level_vars is None:
level_vars = {}
if dim_sizes is None:
dim_sizes = {}
if key in dim_sizes:
data = pd.Index(range(dim_sizes[key]), name=key)
variable = IndexVariable((key,), data)
return key, key, variable
if not isinstance(key, str):
raise KeyError(key)
split_key = key.split(".", 1)
var_name: Optional[str]
if len(split_key) == 2:
ref_name, var_name = split_key
elif len(split_key) == 1:
ref_name, var_name = key, None
else:
raise KeyError(key)
if ref_name in level_vars:
dim_var = variables[level_vars[ref_name]]
ref_var = dim_var.to_index_variable().get_level_variable(ref_name)
else:
> ref_var = variables[ref_name]
E KeyError: 'variant_position'
c:\miniconda3\envs\test\lib\site-packages\xarray\core\dataset.py:171: KeyError
=========================== short test summary info ===========================
FAILED sgkit/tests/test_vcfzarr_reader.py::test_vcfzarr_to_zarr[None-True-False-True]