sgkit icon indicating copy to clipboard operation
sgkit copied to clipboard

Possible race condition in _concat_zarrs_optimized

Open tomwhite opened this issue 5 years ago • 8 comments

================================== FAILURES ===================================
_________________ test_vcfzarr_to_zarr[None-True-True-False] __________________

shared_datadir = WindowsPath('C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/test_vcfzarr_to_zarr_None_True1/data')
tmpdir = local('C:\\Users\\runneradmin\\AppData\\Local\\Temp\\pytest-of-runneradmin\\pytest-0\\test_vcfzarr_to_zarr_None_True1')
grouped_by_contig = True, consolidated = True, has_variant_id = False
concat_algorithm = None

    @pytest.mark.parametrize(
        "grouped_by_contig, consolidated, has_variant_id",
        [
            (False, False, False),
            (False, False, True),
            (True, False, True),
            (True, True, False),
        ],
    )
    @pytest.mark.parametrize(
        "concat_algorithm",
        [None, "xarray_internal"],
    )
    def test_vcfzarr_to_zarr(
        shared_datadir,
        tmpdir,
        grouped_by_contig,
        consolidated,
        has_variant_id,
        concat_algorithm,
    ):
        if has_variant_id:
            fields = None
        else:
            fields = [
                "variants/CHROM",
                "variants/POS",
                "variants/REF",
                "variants/ALT",
                "calldata/GT",
                "samples",
            ]
    
        vcfzarr_path = create_vcfzarr(  # type: ignore[no-untyped-call]
            shared_datadir,
            tmpdir,
            fields=fields,
            grouped_by_contig=grouped_by_contig,
            consolidated=consolidated,
        )
    
        output = str(tmpdir / "vcf.zarr")
>       vcfzarr_to_zarr(
            vcfzarr_path,
            output,
            grouped_by_contig=grouped_by_contig,
            concat_algorithm=concat_algorithm,
            consolidated=consolidated,
        )

sgkit\tests\test_vcfzarr_reader.py:135: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
sgkit\io\vcfzarr_reader.py:134: in vcfzarr_to_zarr
    _concat_zarrs_optimized(
sgkit\io\vcfzarr_reader.py:259: in _concat_zarrs_optimized
    output_zarr[var].attrs.update(attrs)
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

self = <zarr.hierarchy.Group '/'>, item = 'variant_allele'

    def __getitem__(self, item):
        """Obtain a group member.
    
        Parameters
        ----------
        item : string
            Member name or path.
    
        Examples
        --------
        >>> import zarr
        >>> g1 = zarr.group()
        >>> d1 = g1.create_dataset('foo/bar/baz', shape=100, chunks=10)
        >>> g1['foo']
        <zarr.hierarchy.Group '/foo'>
        >>> g1['foo/bar']
        <zarr.hierarchy.Group '/foo/bar'>
        >>> g1['foo/bar/baz']
        <zarr.core.Array '/foo/bar/baz' (100,) float64>
    
        """
        path = self._item_path(item)
        if contains_array(self._store, path):
            return Array(self._store, read_only=self._read_only, path=path,
                         chunk_store=self._chunk_store,
                         synchronizer=self._synchronizer, cache_attrs=self.attrs.cache)
        elif contains_group(self._store, path):
            return Group(self._store, read_only=self._read_only, path=path,
                         chunk_store=self._chunk_store, cache_attrs=self.attrs.cache,
                         synchronizer=self._synchronizer)
        else:
>           raise KeyError(item)
E           KeyError: 'variant_allele'

c:\miniconda3\envs\test\lib\site-packages\zarr\hierarchy.py:349: KeyError
---------------------------- Captured stderr call -----------------------------
Exception ignored in: <_io.FileIO name='C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/test_vcfzarr_to_zarr_None_True1/vcf.zarr/call_genotype_mask/.zarray' mode='rb' closefd=True>
Traceback (most recent call last):
  File "c:\miniconda3\envs\test\lib\site-packages\fsspec\spec.py", line 626, in cat_file
    return self.open(path, "rb").read()
ResourceWarning: unclosed file <_io.BufferedReader name='C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/test_vcfzarr_to_zarr_None_True1/vcf.zarr/call_genotype_mask/.zarray'>
Exception ignored in: <_io.FileIO name='C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/test_vcfzarr_to_zarr_None_True1/vcf.zarr/variant_position/.zarray' mode='rb' closefd=True>
Traceback (most recent call last):
  File "c:\miniconda3\envs\test\lib\site-packages\fsspec\spec.py", line 626, in cat_file
    return self.open(path, "rb").read()
ResourceWarning: unclosed file <_io.BufferedReader name='C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/test_vcfzarr_to_zarr_None_True1/vcf.zarr/variant_position/.zarray'>
Exception ignored in: <_io.FileIO name='C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/test_vcfzarr_to_zarr_None_True1/vcf.zarr/call_genotype/.zarray' mode='rb' closefd=True>
Traceback (most recent call last):
  File "c:\miniconda3\envs\test\lib\site-packages\fsspec\spec.py", line 626, in cat_file
    return self.open(path, "rb").read()
ResourceWarning: unclosed file <_io.BufferedReader name='C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/test_vcfzarr_to_zarr_None_True1/vcf.zarr/call_genotype/.zarray'>
Exception ignored in: <_io.FileIO name='C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/test_vcfzarr_to_zarr_None_True1/vcf.zarr/variant_contig/.zarray' mode='rb' closefd=True>
Traceback (most recent call last):
  File "c:\miniconda3\envs\test\lib\site-packages\fsspec\spec.py", line 626, in cat_file
    return self.open(path, "rb").read()
ResourceWarning: unclosed file <_io.BufferedReader name='C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/test_vcfzarr_to_zarr_None_True1/vcf.zarr/variant_contig/.zarray'>
=========================== short test summary info ===========================
FAILED sgkit/tests/test_vcfzarr_reader.py::test_vcfzarr_to_zarr[None-True-True-False]
====== 1 failed, 480 passed, 13 skipped, 2 xfailed in 198.28s (0:03:18) =======
Error: Process completed with exit code 1.

tomwhite avatar Nov 17 '20 13:11 tomwhite

Hmm, what do you think the race condition is on @tomwhite? Windows can be good for shaking out bugs like this - but on the other hand, its more strict permissions model around open files leads to some annoying quirks too.

jeromekelleher avatar Nov 17 '20 14:11 jeromekelleher

I'm not sure exactly, but it looks like a race between the Zarr attributes for a variable being updated (variant_allele) the first and second time. Perhaps those two blocks should be in separate Zarr context managers?

I can't reproduce this locally, even if I run the test in a loop.

tomwhite avatar Nov 17 '20 14:11 tomwhite

Tricky. I think we need @alimanfoo's eyes on this one...

jeromekelleher avatar Nov 17 '20 14:11 jeromekelleher

Failed again here: https://github.com/ravwojdyla/sgkit/runs/1855182247?check_suite_focus=true

ravwojdyla avatar Feb 08 '21 14:02 ravwojdyla

Another one: https://github.com/pystatgen/sgkit/runs/1891619188?check_suite_focus=true

ravwojdyla avatar Feb 18 '21 04:02 ravwojdyla

So not just on Windows

tomwhite avatar Feb 18 '21 15:02 tomwhite

I tried to reproduce this again, but no luck. I tried creating standalone test cases that I could run more frequently, but I never saw the error locally on my machine. One possibility is that it needs a more highly-contended machine.

So I had a look at what is happening in more detail. The problem boils down to two places in the code where writes to the same Zarr array take place:

  1. where the Zarr array is being created
  2. where the Zarr array metadata is being updated

Operation 1. is run by Dask delayed, but the contract of compute is such that the tasks should all be completed before the call to compute returns. Operation 2. is run after the call to compute returns, but in the main thread.

As an attempt to fix this, I have made a change so that both operations are run by Dask delayed, and such that the second uses the array object created by the first. (In the current code for 2. the Zarr array is read from the file system separately.) Since I haven't been able to reproduce the problem I don't know if this fix works, but since we see it fairly regularly in CI, it might be worth merging to see if it has an effect.

tomwhite avatar Mar 08 '21 18:03 tomwhite

There was another race condition failure here: https://github.com/pystatgen/sgkit/runs/2157617424?check_suite_focus=true. The failure occurred in a different place though - between writing the zarr array, and reading it back in the test. So it might be mitigated in test code.

================================== FAILURES ===================================
_________________ test_vcfzarr_to_zarr[None-True-False-True] __________________

self = <xarray.Dataset>
Dimensions:             (alleles: 4, ploidy: 2, samples: 3, variants: 9)
Dimensions without coordinat...:
    contigs:                    ['19', '20', 'X']
    max_length_variant_allele:  1
    max_length_variant_id:      1
name = 'variant_position'

    def _construct_dataarray(self, name: Hashable) -> "DataArray":
        """Construct a DataArray by indexing this dataset"""
        from .dataarray import DataArray
    
        try:
>           variable = self._variables[name]
E           KeyError: 'variant_position'

c:\miniconda3\envs\test\lib\site-packages\xarray\core\dataset.py:1305: KeyError

During handling of the above exception, another exception occurred:

shared_datadir = WindowsPath('C:/Users/runneradmin/AppData/Local/Temp/pytest-of-runneradmin/pytest-0/test_vcfzarr_to_zarr_None_True0/data')
tmpdir = local('C:\\Users\\runneradmin\\AppData\\Local\\Temp\\pytest-of-runneradmin\\pytest-0\\test_vcfzarr_to_zarr_None_True0')
grouped_by_contig = True, consolidated = False, has_variant_id = True
concat_algorithm = None

    @pytest.mark.parametrize(
        "grouped_by_contig, consolidated, has_variant_id",
        [
            (False, False, False),
            (False, False, True),
            (True, False, True),
            (True, True, False),
        ],
    )
    @pytest.mark.parametrize(
        "concat_algorithm",
        [None, "xarray_internal"],
    )
    def test_vcfzarr_to_zarr(
        shared_datadir,
        tmpdir,
        grouped_by_contig,
        consolidated,
        has_variant_id,
        concat_algorithm,
    ):
        if has_variant_id:
            fields = None
        else:
            fields = [
                "variants/CHROM",
                "variants/POS",
                "variants/REF",
                "variants/ALT",
                "calldata/GT",
                "samples",
            ]
    
        vcfzarr_path = create_vcfzarr(  # type: ignore[no-untyped-call]
            shared_datadir,
            tmpdir,
            fields=fields,
            grouped_by_contig=grouped_by_contig,
            consolidated=consolidated,
        )
    
        output = str(tmpdir / "vcf.zarr")
        vcfzarr_to_zarr(
            vcfzarr_path,
            output,
            grouped_by_contig=grouped_by_contig,
            concat_algorithm=concat_algorithm,
            consolidated=consolidated,
        )
    
        ds = xr.open_zarr(output)  # type: ignore[no-untyped-call]
    
        # Note that variant_allele values are byte strings, not unicode strings (unlike for read_vcfzarr)
        # We should make the two consistent.
    
        assert ds.attrs["contigs"] == ["19", "20", "X"]
        assert_array_equal(ds["variant_contig"], [0, 0, 1, 1, 1, 1, 1, 1, 2])
        assert_array_equal(
>           ds["variant_position"],
            [111, 112, 14370, 17330, 1110696, 1230237, 1234567, 1235237, 10],
        )

sgkit\tests\test_vcfzarr_reader.py:151: 
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
c:\miniconda3\envs\test\lib\site-packages\xarray\core\dataset.py:1409: in __getitem__
    return self._construct_dataarray(key)
c:\miniconda3\envs\test\lib\site-packages\xarray\core\dataset.py:1307: in _construct_dataarray
    _, name, variable = _get_virtual_variable(
_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _

variables = {'call_genotype': <xarray.Variable (variants: 9, samples: 3, ploidy: 2)>
dask.array<xarray-call_genotype, shape=(9, 3,...e=|S3, chunksize=(2, 4), chunktype=numpy.ndarray>
Attributes:
    comment:  The possible alleles for the variant., ...}
key = 'variant_position', level_vars = {}
dim_sizes = Frozen(SortedKeysDict({'variants': 9, 'samples': 3, 'ploidy': 2, 'alleles': 4}))

    def _get_virtual_variable(
        variables, key: Hashable, level_vars: Mapping = None, dim_sizes: Mapping = None
    ) -> Tuple[Hashable, Hashable, Variable]:
        """Get a virtual variable (e.g., 'time.year' or a MultiIndex level)
        from a dict of xarray.Variable objects (if possible)
        """
        if level_vars is None:
            level_vars = {}
        if dim_sizes is None:
            dim_sizes = {}
    
        if key in dim_sizes:
            data = pd.Index(range(dim_sizes[key]), name=key)
            variable = IndexVariable((key,), data)
            return key, key, variable
    
        if not isinstance(key, str):
            raise KeyError(key)
    
        split_key = key.split(".", 1)
        var_name: Optional[str]
        if len(split_key) == 2:
            ref_name, var_name = split_key
        elif len(split_key) == 1:
            ref_name, var_name = key, None
        else:
            raise KeyError(key)
    
        if ref_name in level_vars:
            dim_var = variables[level_vars[ref_name]]
            ref_var = dim_var.to_index_variable().get_level_variable(ref_name)
        else:
>           ref_var = variables[ref_name]
E           KeyError: 'variant_position'

c:\miniconda3\envs\test\lib\site-packages\xarray\core\dataset.py:171: KeyError
=========================== short test summary info ===========================
FAILED sgkit/tests/test_vcfzarr_reader.py::test_vcfzarr_to_zarr[None-True-False-True]

tomwhite avatar Mar 23 '21 09:03 tomwhite