zarr-python
Poor performance when slicing with steps
Minimal, reproducible code sample (a copy-pastable example if possible)
"""Demonstrate low performance of slicing zarr array with step size."""
import sys
import json
import time
from collections.abc import MutableMapping
import numpy
import zarr
import dask
import dask.array
shape = (149568, 66369, 3) # 27.7G RGB image
chunks = (1, 66369, 3) # one scanline
dtype = '|u1'
step = 512
repeat = 2
counter = 0 # global counter, not thread-safe
class ZarrStore(MutableMapping):
    """Read-only store that returns the same pre-allocated chunk for any chunk key."""

    def __init__(self, shape, chunks, dtype):
        self.chunk = numpy.zeros(chunks, dtype)
        self.store = {}
        self.store['.zattrs'] = json_dump({})
        self.store['.zarray'] = json_dump(
            {
                'zarr_format': 2,
                'shape': shape,
                'chunks': chunks,
                'dtype': dtype,
                'fill_value': 0,
                'order': 'C',
                'compressor': None,
                'filters': None,
            }
        )

    def flush(self):
        raise PermissionError

    def clear(self):
        raise PermissionError

    def keys(self):
        return self.store.keys()

    def items(self):
        return self.store.items()

    def values(self):
        return self.store.values()

    def __iter__(self):
        return iter(self.store.keys())

    def __len__(self):
        return len(self.store)

    def __delitem__(self, key):
        raise PermissionError

    def __setitem__(self, key, value):
        raise PermissionError

    def __contains__(self, key):
        return key in self.store

    def __getitem__(self, key):
        global counter
        if key in self.store:
            return self.store[key]
        counter += 1  # count chunk accesses
        return self.chunk


def json_dump(obj):
    return json.dumps(
        obj,
        indent=1,
        sort_keys=True,
        ensure_ascii=True,
        separators=(',', ': '),
    ).encode('ascii')


if 1:
    store = ZarrStore(shape, chunks, dtype)
    z = zarr.open(store, mode='r')
else:
    # use zarr MemoryStore
    z = zarr.zeros(
        shape=shape,
        chunks=chunks,
        dtype=dtype,
        compressor=None,
    )

print('Python', sys.version)
print('Platform', sys.platform)
print('numpy', numpy.__version__)
print('zarr', zarr.__version__)
print('dask', dask.__version__)
print()
print(z.info)

print('z[:] # slice all chunks')
for _ in range(repeat):
    counter = 0
    start = time.perf_counter()
    arr = z[:]
    print(f'read {counter} chunks in {time.perf_counter() - start:.3} s')
    del arr
print()

print('z[:293] # slice first 293 chunks')
for _ in range(repeat):
    counter = 0
    start = time.perf_counter()
    arr = z[: shape[0] // step + 1]
    print(f'read {counter} chunks in {time.perf_counter() - start:.3} s')
    del arr
print()

print('z[::512] # slice every 512th chunk, 293 total')
for _ in range(repeat):
    counter = 0
    start = time.perf_counter()
    arr = z[::step]
    print(f'read {counter} chunks in {time.perf_counter() - start:.3} s')
    del arr
print()

print('z[::149568] # slice every 149568th chunk, 1 total')
for _ in range(repeat):
    counter = 0
    start = time.perf_counter()
    arr = z[:: shape[0]]
    print(f'read {counter} chunks in {time.perf_counter() - start:.3} s')
    del arr
print()

print('z.oindex[[0, 512 .. 149504]] # index using list')
for _ in range(repeat):
    counter = 0
    start = time.perf_counter()
    arr = z.oindex[list(range(0, shape[0], step))]
    print(f'read {counter} chunks in {time.perf_counter() - start:.3} s')
    del arr
print()

print('dask.array.from_zarr(store)[::512].compute() # slice using dask')
da = dask.array.from_zarr(store)
for _ in range(repeat):
    counter = 0
    start = time.perf_counter()
    arr = da[::step].compute()
    print(f'read {counter} chunks in {time.perf_counter() - start:.3} s')
    del arr
print()

print('dask.array.from_zarr(store).compute() # slice all chunks using dask')
da = dask.array.from_zarr(store)
for _ in range(repeat):
    counter = 0
    start = time.perf_counter()
    arr = da.compute()
    print(f'read {counter} chunks in {time.perf_counter() - start:.3} s')
    del arr
print()

print('# baseline: manually copy all chunks from store to numpy array')
da = dask.array.from_zarr(store)
for _ in range(repeat):
    counter = 0
    start = time.perf_counter()
    arr = numpy.empty(shape, dtype)
    for i in range(arr.shape[0]):
        arr[i] = store[i]
    print(f'read {counter} chunks in {time.perf_counter() - start:.3} s')
    del arr
Output:
Python 3.9.7 (tags/v3.9.7:1016ef3, Aug 30 2021, 20:19:38) [MSC v.1929 64 bit (AMD64)]
Platform win32
numpy 1.20.3
zarr 2.10.1
dask 2021.09.1
Type : zarr.core.Array
Data type : uint8
Shape : (149568, 66369, 3)
Chunk shape : (1, 66369, 3)
Order : C
Read-only : True
Compressor : None
Store type : __main__.ZarrStore
No. bytes : 29780035776 (27.7G)
Chunks initialized : 0/149568
z[:] # slice all chunks
read 149568 chunks in 9.41 s
read 149568 chunks in 9.34 s
z[:293] # slice first 293 chunks
read 293 chunks in 0.0178 s
read 293 chunks in 0.0176 s
z[::512] # slice every 512th chunk, 293 total
read 149568 chunks in 2.33 s
read 149568 chunks in 2.23 s
z[::149568] # slice every 149568th chunk, 1 total
read 149568 chunks in 2.22 s
read 149568 chunks in 2.35 s
z.oindex[[0, 512 .. 149504]] # index using list
read 293 chunks in 2.07 s
read 293 chunks in 2.07 s
dask.array.from_zarr(store)[::512].compute() # slice using dask
read 293 chunks in 0.188 s
read 293 chunks in 0.18 s
dask.array.from_zarr(store).compute() # slice all chunks using dask
read 149568 chunks in 92.5 s
read 149564 chunks in 93.4 s
# baseline: manually copy all chunks from store to numpy array
read 149568 chunks in 6.26 s
read 149568 chunks in 6.44 s
Problem description
Consider the minimal Zarr store posted above, which models the storage of a ~27 GB RGB image as one chunk per scanline, as in a TIFF file (shape=(149568, 66369, 3), chunks=(1, 66369, 3), dtype='uint8'). The store does no I/O and returns a pre-allocated chunk from __getitem__ to keep the store's overhead to a minimum.
Slicing the whole array with z[:] takes ~9.3 s. OK.
Slicing the first 293 chunks with z[:293] takes ~0.02 s. OK.
Slicing every 512th chunk with z[::512] (293 chunks total) takes ~2.2 s. That is roughly a hundred times longer than expected: all 149568 chunks are read from the store, not just the 293 chunks needed. This becomes significant when __getitem__ performs real work such as I/O or decoding.
Even when a step selects only a single chunk, as in z[::149568], all chunks are read.
This behavior can be reproduced with a zarr MemoryStore.
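For reference, here is a rough sketch of such a reproduction (assuming zarr 2.x; the CountingStore subclass, the small array shape, and the expected counts are illustrative only, and simply count how many chunk keys pass through the store's __getitem__, as in the custom store above):

import zarr
from zarr.storage import MemoryStore

class CountingStore(MemoryStore):
    """MemoryStore that counts chunk reads (illustrative helper, not part of zarr)."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.reads = 0

    def __getitem__(self, key):
        if not key.startswith('.'):  # skip .zarray/.zattrs metadata keys
            self.reads += 1
        return super().__getitem__(key)

store = CountingStore()
z = zarr.zeros(
    shape=(10000, 100), chunks=(1, 100), dtype='u1', store=store, compressor=None
)
z[:] = 1  # initialize all 10000 chunks

store.reads = 0
arr = z[::512]  # only 20 chunks contain selected rows
print(store.reads)  # with the behavior reported above, prints 10000 rather than 20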
Is this behavior expected? Can the store be improved to work more efficiently with zarr slicing?
I tried two workarounds:
Manually indexing the chunks via a list, z.oindex[list(range(0, 149568, 512))] (a generic helper is sketched after this list). This reads only the expected number of chunks, but it is a burden and does not improve performance in this simple case.
Using dask to slice the store with steps (dask.array.from_zarr(store)[::512]) performs better than zarr and reads only the expected number of chunks. However, the overhead of dask+zarr is generally much larger than that of zarr alone (performance, memory, and dependencies).
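For completeness, the first workaround can be wrapped in a small helper along these lines (stepped_read is a hypothetical name, not part of zarr; it assumes a positive step along the first axis):

def stepped_read(zarray, sl):
    """Read zarray[sl] along the first axis via oindex so that only chunks
    containing selected rows are requested from the store."""
    start, stop, step = sl.indices(zarray.shape[0])
    return zarray.oindex[list(range(start, stop, step))]

# equivalent to z[::512], but reads ~293 chunks instead of all 149568
arr = stepped_read(z, slice(None, None, 512))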
@cgohlke, are there any updates on this?
> are there any updates on this?

Doesn't look like it. Timings are basically the same, maybe slightly worse:
Python 3.9.10 (tags/v3.9.10:f2f3f53, Jan 17 2022, 15:14:21) [MSC v.1929 64 bit (AMD64)]
Platform win32
numpy 1.21.5
zarr 2.11.0
dask 2022.02.1
Type : zarr.core.Array
Data type : uint8
Shape : (149568, 66369, 3)
Chunk shape : (1, 66369, 3)
Order : C
Read-only : True
Compressor : None
Store type : zarr.storage.KVStore
No. bytes : 29780035776 (27.7G)
No. bytes stored : 199107 (194.4K)
Storage ratio : 149568.0
Chunks initialized : 0/149568
z[:] # slice all chunks
read 149568 chunks in 9.47 s
read 149568 chunks in 9.49 s
z[:293] # slice first 293 chunks
read 293 chunks in 0.0204 s
read 293 chunks in 0.0205 s
z[::512] # slice every 512th chunk, 293 total
read 149568 chunks in 2.83 s
read 149568 chunks in 2.62 s
z[::149568] # slice every 149568th chunk, 1 total
read 149568 chunks in 2.6 s
read 149568 chunks in 2.99 s
z.oindex[[0, 512 .. 149504]] # index using list
read 293 chunks in 2.11 s
read 293 chunks in 2.08 s
dask.array.from_zarr(store)[::512].compute() # slice using dask
read 293 chunks in 0.286 s
read 293 chunks in 0.28 s
dask.array.from_zarr(store).compute() # slice all chunks using dask
read 149548 chunks in 95.7 s
read 149562 chunks in 95.1 s
# baseline: manually copy all chunks from store to numpy array
read 149568 chunks in 6.38 s
read 149568 chunks in 6.41 s
Hi @cgohlke! Thanks for reporting this issue, and sorry that no one has replied to you yet.
It sounds like you are suggesting that an optimization should be made such that strided indexing can pull only the chunks that are needed for the desired selection, rather than the entire range. This sounds reasonable to me. Some of the relevant code is here:
https://github.com/zarr-developers/zarr-python/blob/999593961a305ac7b15616f9b7df1826f1acf8f7/zarr/core.py#L1151
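To illustrate the idea only (this is a hypothetical sketch, not zarr's actual indexing code): for a stepped slice along one dimension, the set of chunks containing at least one selected index can be computed directly, so the chunk iteration could skip everything else.

import math

def chunks_needed(start, stop, step, chunk_len):
    """Return indices of chunks touched by range(start, stop, step)."""
    needed = []
    i = start
    while i < stop:
        c = i // chunk_len
        needed.append(c)
        # jump to the first selected index at or beyond the next chunk boundary
        next_chunk_start = (c + 1) * chunk_len
        i += math.ceil((next_chunk_start - i) / step) * step
    return needed

# for the example above: 149568 rows, chunk length 1 along axis 0, step 512
print(len(chunks_needed(0, 149568, 512, 1)))  # 293 chunks instead of 149568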
Right now a lot of Zarr dev effort is focused on a big refactor (#877) which should dramatically improve slicing performance. Hopefully this optimization can be included in that effort.
In the meantime, if you wanted to work on implementing this feature yourself, we would do our best to support you in a PR.
Thanks for your valuable feedback.
@rabernat @cgohlke this issue has just bitten me too. I've taken a stab at fixing it in PR #1154, which is successful from a performance perspective, but I'm very happy to accept feedback and make changes if you can think of better ways to fix it.