zarr-python
Poor performance when slicing with steps
Minimal, reproducible code sample (a copy-pastable example if possible)
"""Demonstrate low performance of slicing zarr array with step size."""
import sys
import json
import time
from collections.abc import MutableMapping
import numpy
import zarr
import dask
import dask.array
shape = (149568, 66369, 3) # 27.7G RGB image
chunks = (1, 66369, 3) # one scanline
dtype = '|u1'
step = 512
repeat = 2
counter = 0 # global counter, not thread-safe
class ZarrStore(MutableMapping):
    """Read-only store that returns the same pre-allocated chunk for any chunk key."""

    def __init__(self, shape, chunks, dtype):
        self.chunk = numpy.zeros(chunks, dtype)
        self.store = {}
        self.store['.zattrs'] = json_dump({})
        self.store['.zarray'] = json_dump(
            {
                'zarr_format': 2,
                'shape': shape,
                'chunks': chunks,
                'dtype': dtype,
                'fill_value': 0,
                'order': 'C',
                'compressor': None,
                'filters': None,
            }
        )

    def flush(self):
        raise PermissionError

    def clear(self):
        raise PermissionError

    def keys(self):
        return self.store.keys()

    def items(self):
        return self.store.items()

    def values(self):
        return self.store.values()

    def __iter__(self):
        return iter(self.store.keys())

    def __len__(self):
        return len(self.store)

    def __delitem__(self, key):
        raise PermissionError

    def __setitem__(self, key, value):
        raise PermissionError

    def __contains__(self, key):
        return key in self.store

    def __getitem__(self, key):
        global counter
        if key in self.store:
            return self.store[key]
        counter += 1  # count chunk accesses
        return self.chunk


def json_dump(obj):
    return json.dumps(
        obj,
        indent=1,
        sort_keys=True,
        ensure_ascii=True,
        separators=(',', ': '),
    ).encode('ascii')


if 1:
    store = ZarrStore(shape, chunks, dtype)
    z = zarr.open(store, mode='r')
else:
    # use zarr MemoryStore
    z = zarr.zeros(
        shape=shape,
        chunks=chunks,
        dtype=dtype,
        compressor=None,
    )

print('Python', sys.version)
print('Platform', sys.platform)
print('numpy', numpy.__version__)
print('zarr', zarr.__version__)
print('dask', dask.__version__)
print()
print(z.info)

print('z[:] # slice all chunks')
for _ in range(repeat):
    counter = 0
    start = time.perf_counter()
    arr = z[:]
    print(f'read {counter} chunks in {time.perf_counter() - start:.3} s')
    del arr
print()

print('z[:293] # slice first 293 chunks')
for _ in range(repeat):
    counter = 0
    start = time.perf_counter()
    arr = z[: shape[0] // step + 1]
    print(f'read {counter} chunks in {time.perf_counter() - start:.3} s')
    del arr
print()

print('z[::512] # slice every 512th chunk, 293 total')
for _ in range(repeat):
    counter = 0
    start = time.perf_counter()
    arr = z[::step]
    print(f'read {counter} chunks in {time.perf_counter() - start:.3} s')
    del arr
print()

print('z[::149568] # slice every 149568th chunk, 1 total')
for _ in range(repeat):
    counter = 0
    start = time.perf_counter()
    arr = z[:: shape[0]]
    print(f'read {counter} chunks in {time.perf_counter() - start:.3} s')
    del arr
print()

print('z.oindex[[0, 512 .. 149504]] # index using list')
for _ in range(repeat):
    counter = 0
    start = time.perf_counter()
    arr = z.oindex[list(range(0, shape[0], step))]
    print(f'read {counter} chunks in {time.perf_counter() - start:.3} s')
    del arr
print()

print('dask.array.from_zarr(store)[::512].compute() # slice using dask')
da = dask.array.from_zarr(store)
for _ in range(repeat):
    counter = 0
    start = time.perf_counter()
    arr = da[::step].compute()
    print(f'read {counter} chunks in {time.perf_counter() - start:.3} s')
    del arr
print()

print('dask.array.from_zarr(store).compute() # slice all chunks using dask')
da = dask.array.from_zarr(store)
for _ in range(repeat):
    counter = 0
    start = time.perf_counter()
    arr = da.compute()
    print(f'read {counter} chunks in {time.perf_counter() - start:.3} s')
    del arr
print()

print('# baseline: manually copy all chunks from store to numpy array')
da = dask.array.from_zarr(store)
for _ in range(repeat):
    counter = 0
    start = time.perf_counter()
    arr = numpy.empty(shape, dtype)
    for i in range(arr.shape[0]):
        arr[i] = store[i]
    print(f'read {counter} chunks in {time.perf_counter() - start:.3} s')
    del arr
Output:
Python 3.9.7 (tags/v3.9.7:1016ef3, Aug 30 2021, 20:19:38) [MSC v.1929 64 bit (AMD64)]
Platform win32
numpy 1.20.3
zarr 2.10.1
dask 2021.09.1
Type : zarr.core.Array
Data type : uint8
Shape : (149568, 66369, 3)
Chunk shape : (1, 66369, 3)
Order : C
Read-only : True
Compressor : None
Store type : __main__.ZarrStore
No. bytes : 29780035776 (27.7G)
Chunks initialized : 0/149568
z[:] # slice all chunks
read 149568 chunks in 9.41 s
read 149568 chunks in 9.34 s
z[:293] # slice first 293 chunks
read 293 chunks in 0.0178 s
read 293 chunks in 0.0176 s
z[::512] # slice every 512th chunk, 293 total
read 149568 chunks in 2.33 s
read 149568 chunks in 2.23 s
z[::149568] # slice every 149568th chunk, 1 total
read 149568 chunks in 2.22 s
read 149568 chunks in 2.35 s
z.oindex[[0, 512 .. 149504]] # index using list
read 293 chunks in 2.07 s
read 293 chunks in 2.07 s
dask.array.from_zarr(store)[::512].compute() # slice using dask
read 293 chunks in 0.188 s
read 293 chunks in 0.18 s
dask.array.from_zarr(store).compute() # slice all chunks using dask
read 149568 chunks in 92.5 s
read 149564 chunks in 93.4 s
# baseline: manually copy all chunks from store to numpy array
read 149568 chunks in 6.26 s
read 149568 chunks in 6.44 s
Problem description
Consider the minimal Zarr store posted above, which models the storage of a ~27 GB RGB image as one chunk per scanline, as in a TIFF file (shape=(149568, 66369, 3), chunks=(1, 66369, 3), dtype='uint8'). The store does no I/O and returns a pre-allocated chunk from __getitem__ to keep the store's overhead to a minimum.
Slicing the whole array with z[:] takes ~9.3 s. OK.
Slicing the first 293 chunks with z[:293] takes ~0.02 s. OK.
Slicing every 512th chunk with z[::512] (293 chunks total) takes ~2.2 s. That is roughly a hundred times longer than expected: all 149568 chunks are read from the store, not just the 293 chunks needed. This becomes significant when __getitem__ performs real work such as I/O or decoding.
Even when a step selects only a single chunk, as in z[::149568], all chunks are read.
This behavior can be reproduced with a zarr MemoryStore.
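For reference, here is a rough sketch of such a reproduction (assuming zarr 2.x; the CountingStore subclass, the small array shape, and the expected counts are illustrative only, and simply count how many chunk keys pass through the store's __getitem__, as in the custom store above):

import zarr
from zarr.storage import MemoryStore

class CountingStore(MemoryStore):
    """MemoryStore that counts chunk reads (illustrative helper, not part of zarr)."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.reads = 0

    def __getitem__(self, key):
        if not key.startswith('.'):  # skip .zarray/.zattrs metadata keys
            self.reads += 1
        return super().__getitem__(key)

store = CountingStore()
z = zarr.zeros(
    shape=(10000, 100), chunks=(1, 100), dtype='u1', store=store, compressor=None
)
z[:] = 1  # initialize all 10000 chunks

store.reads = 0
arr = z[::512]  # only 20 chunks contain selected rows
print(store.reads)  # with the behavior reported above, prints 10000 rather than 20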
Is this behavior expected? Can the store be improved to work more efficiently with zarr slicing?
I tried two workarounds:
Manually indexing the chunks via a list, z.oindex[list(range(0, 149568, 512))] (a generic helper is sketched after this list). This reads only the expected number of chunks, but it is a burden and does not improve performance in this simple case.
Using dask to slice the store with steps (dask.array.from_zarr(store)[::512]) performs better than zarr and reads only the expected number of chunks. However, the overhead of dask+zarr is generally much larger than that of zarr alone (performance, memory, and dependencies).
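For completeness, the first workaround can be wrapped in a small helper along these lines (stepped_read is a hypothetical name, not part of zarr; it assumes a positive step along the first axis):

def stepped_read(zarray, sl):
    """Read zarray[sl] along the first axis via oindex so that only chunks
    containing selected rows are requested from the store."""
    start, stop, step = sl.indices(zarray.shape[0])
    return zarray.oindex[list(range(start, stop, step))]

# equivalent to z[::512], but reads ~293 chunks instead of all 149568
arr = stepped_read(z, slice(None, None, 512))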
@cgohlke, are there any updates on this?
> are there any updates on this?

Doesn't look like it. Timings are basically the same, maybe slightly worse:
Python 3.9.10 (tags/v3.9.10:f2f3f53, Jan 17 2022, 15:14:21) [MSC v.1929 64 bit (AMD64)]
Platform win32
numpy 1.21.5
zarr 2.11.0
dask 2022.02.1
Type : zarr.core.Array
Data type : uint8
Shape : (149568, 66369, 3)
Chunk shape : (1, 66369, 3)
Order : C
Read-only : True
Compressor : None
Store type : zarr.storage.KVStore
No. bytes : 29780035776 (27.7G)
No. bytes stored : 199107 (194.4K)
Storage ratio : 149568.0
Chunks initialized : 0/149568
z[:] # slice all chunks
read 149568 chunks in 9.47 s
read 149568 chunks in 9.49 s
z[:293] # slice first 293 chunks
read 293 chunks in 0.0204 s
read 293 chunks in 0.0205 s
z[::512] # slice every 512th chunk, 293 total
read 149568 chunks in 2.83 s
read 149568 chunks in 2.62 s
z[::149568] # slice every 149568th chunk, 1 total
read 149568 chunks in 2.6 s
read 149568 chunks in 2.99 s
z.oindex[[0, 512 .. 149504]] # index using list
read 293 chunks in 2.11 s
read 293 chunks in 2.08 s
dask.array.from_zarr(store)[::512].compute() # slice using dask
read 293 chunks in 0.286 s
read 293 chunks in 0.28 s
dask.array.from_zarr(store).compute() # slice all chunks using dask
read 149548 chunks in 95.7 s
read 149562 chunks in 95.1 s
# baseline: manually copy all chunks from store to numpy array
read 149568 chunks in 6.38 s
read 149568 chunks in 6.41 s
Hi @cgohlke! Thanks for reporting this issue, and sorry that no one has replied to you yet.
It sounds like you are suggesting that an optimization should be made such that strided indexing can pull only the chunks that are needed for the desired selection, rather than the entire range. This sounds reasonable to me. Some of the relevant code is here:
https://github.com/zarr-developers/zarr-python/blob/999593961a305ac7b15616f9b7df1826f1acf8f7/zarr/core.py#L1151
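To illustrate the idea only (this is a hypothetical sketch, not zarr's actual indexing code): for a stepped slice along one dimension, the set of chunks containing at least one selected index can be computed directly, so the chunk iteration could skip everything else.

import math

def chunks_needed(start, stop, step, chunk_len):
    """Return indices of chunks touched by range(start, stop, step)."""
    needed = []
    i = start
    while i < stop:
        c = i // chunk_len
        needed.append(c)
        # jump to the first selected index at or beyond the next chunk boundary
        next_chunk_start = (c + 1) * chunk_len
        i += math.ceil((next_chunk_start - i) / step) * step
    return needed

# for the example above: 149568 rows, chunk length 1 along axis 0, step 512
print(len(chunks_needed(0, 149568, 512, 1)))  # 293 chunks instead of 149568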
Right now a lot of Zarr dev effort is focused on a big refactor (#877) which should dramatically improve slicing performance. Hopefully this optimization can be included in that effort.
In the meantime, if you wanted to work on implementing this feature yourself, we would do our best to support you in a PR.
Thanks for your valuable feedback.
@rabernat @cgohlke this issue has just bitten me too. I've taken a stab at fixing it in PR #1154, which is successful from a performance perspective, but I'm very happy to accept feedback and make changes if you can think of better ways to fix it.