Fix specifying chunks when reading 1D
Attempt to fix the following example:
import hyperspy.api as hs
import numpy as np
s = hs.signals.Signal1D(np.arange(100))
s.save("test.rpl", overwrite=True)
s2 = hs.load("test.rpl", lazy=True, chunks=(50,))
s2.compute(close_file=True)
which gives the error:
Error traceback
11 s = hs.signals.Signal1D(np.arange(100))
12 s.save("test.rpl", overwrite=True)
---> 14 s2 = hs.load("test.rpl", lazy=True, chunks=(50,))
15 s2.compute(close_file=True)
18 # from rsciio.utils.distributed import get_chunk_slice
19
20 # slices, chunks = get_chunk_slice(shape=(100, ), chunks=(50,), dtype=int)
File ~\Dev\hyperspy\hyperspy\io.py:550, in load(filenames, signal_type, stack, stack_axis, new_axis_name, lazy, convert_units, escape_square_brackets, stack_metadata, load_original_metadata, show_progressbar, **kwds)
546 objects.append(signal)
547 else:
548 # No stack, so simply we load all signals in all files separately
549 objects = [
--> 550 load_single_file(filename, lazy=lazy, **kwds) for filename in filenames
551 ]
553 if len(objects) == 1:
554 objects = objects[0]
File ~\Dev\hyperspy\hyperspy\io.py:609, in load_single_file(filename, **kwds)
603 raise ValueError(
604 "`reader` should be one of None, str, or a custom file reader object"
605 )
607 try:
608 # Try and load the file
--> 609 return load_with_reader(filename=filename, reader=reader, **kwds)
611 except BaseException:
612 _logger.error(
613 "If this file format is supported, please "
614 "report this error to the RosettaSciIO developers at "
615 "https://github.com/hyperspy/rosettasciio/issues"
616 )
File ~\Dev\hyperspy\hyperspy\io.py:631, in load_with_reader(filename, reader, signal_type, convert_units, load_original_metadata, **kwds)
629 lazy = kwds.get("lazy", False)
630 if isinstance(reader, dict):
--> 631 file_data_list = importlib.import_module(reader["api"]).file_reader(
632 filename, **kwds
633 )
634 else:
635 # We assume it is a module
636 file_data_list = reader.file_reader(filename, **kwds)
File ~\Dev\rosettasciio\rsciio\utils\_deprecated.py:154, in deprecated_argument.__call__.<locals>.wrapped(*args, **kwargs)
147 func_code = func.__code__
148 warnings.warn_explicit(
149 message=msg,
150 category=VisibleDeprecationWarning,
151 filename=func_code.co_filename,
152 lineno=func_code.co_firstlineno + 1,
153 )
--> 154 return func(*args, **kwargs)
File ~\Dev\rosettasciio\rsciio\utils\_deprecated.py:154, in deprecated_argument.__call__.<locals>.wrapped(*args, **kwargs)
147 func_code = func.__code__
148 warnings.warn_explicit(
149 message=msg,
150 category=VisibleDeprecationWarning,
151 filename=func_code.co_filename,
152 lineno=func_code.co_firstlineno + 1,
153 )
--> 154 return func(*args, **kwargs)
File ~\Dev\rosettasciio\rsciio\ripple\_api.py:308, in file_reader(filename, lazy, rpl_info, encoding, chunks)
305 if not rawfname:
306 raise IOError(f'RAW file "{rawfname}" does not exists')
--> 308 data = read_raw(rpl_info, rawfname, chunks=chunks)
310 if not lazy:
311 data = data.compute()
File ~\Dev\rosettasciio\rsciio\utils\_deprecated.py:154, in deprecated_argument.__call__.<locals>.wrapped(*args, **kwargs)
147 func_code = func.__code__
148 warnings.warn_explicit(
149 message=msg,
150 category=VisibleDeprecationWarning,
151 filename=func_code.co_filename,
152 lineno=func_code.co_firstlineno + 1,
153 )
--> 154 return func(*args, **kwargs)
File ~\Dev\rosettasciio\rsciio\utils\_deprecated.py:154, in deprecated_argument.__call__.<locals>.wrapped(*args, **kwargs)
147 func_code = func.__code__
148 warnings.warn_explicit(
149 message=msg,
150 category=VisibleDeprecationWarning,
151 filename=func_code.co_filename,
152 lineno=func_code.co_firstlineno + 1,
153 )
--> 154 return func(*args, **kwargs)
File ~\Dev\rosettasciio\rsciio\ripple\_api.py:253, in read_raw(rpl_info, filename, chunks)
250 elif record_by == "dont-care": # stack of images
251 shape = (height, width)
--> 253 data = memmap_distributed(
254 filename,
255 offset=offset,
256 shape=shape,
257 dtype=data_type,
258 chunks=chunks,
259 )
261 return data.squeeze()
File ~\Dev\rosettasciio\rsciio\utils\distributed.py:266, in memmap_distributed(filename, dtype, positions, offset, shape, order, chunks, block_size_limit, key)
263 shape = (len(positions),) + shape[-2:] # update the shape to be linear
264 else:
265 # Separates slices into appropriately sized chunks.
--> 266 chunked_slices, data_chunks = get_chunk_slice(
267 shape=shape + sub_array_shape,
268 chunks=chunks,
269 block_size_limit=block_size_limit,
270 dtype=array_dtype,
271 )
272 drop_axes = (
273 num_dim,
274 num_dim + 1,
275 ) # Dask 2021.10.0 minimum to use negative indexing
276 use_positions = False
File ~\Dev\rosettasciio\rsciio\utils\distributed.py:60, in get_chunk_slice(shape, chunks, block_size_limit, dtype)
25 def get_chunk_slice(
26 shape,
27 chunks="auto",
28 block_size_limit=None,
29 dtype=None,
30 ):
31 """
32 Get chunk slices for the :func:`rsciio.utils.distributed.slice_memmap` function.
33
(...)
57 Tuple of the chunks.
58 """
---> 60 chunks = da.core.normalize_chunks(
61 chunks=chunks, shape=shape, limit=block_size_limit, dtype=dtype
62 )
63 chunks_shape = tuple([len(c) for c in chunks])
64 slices = np.empty(
65 shape=chunks_shape + (len(chunks_shape), 2),
66 dtype=int,
67 )
File ~\miniforge3\Lib\site-packages\dask\array\core.py:3184, in normalize_chunks(chunks, shape, limit, dtype, previous_chunks)
3181 chunks = (chunks,)
3183 if shape and len(chunks) != len(shape):
-> 3184 raise ValueError(
3185 "Chunks and shape must be of the same length/dimension. "
3186 "Got chunks=%s, shape=%s" % (chunks, shape)
3187 )
3188 if -1 in chunks or None in chunks:
3189 chunks = tuple(s if c == -1 or c is None else c for c, s in zip(chunks, shape))
ValueError: Chunks and shape must be of the same length/dimension. Got chunks=(50,), shape=(1, 1, 100)
After fixing the shape (this PR), there is a different error, which I have seen before and I was actually trying to produce in the context of #418:
Error traceback
12 s.save("test.rpl", overwrite=True)
14 s2 = hs.load("test.rpl", lazy=True, chunks=(50,))
---> 15 s2.compute(close_file=True)
18 # from rsciio.utils.distributed import get_chunk_slice
19
20 # slices, chunks = get_chunk_slice(shape=(100, ), chunks=(50,), dtype=int)
File ~\Dev\hyperspy\hyperspy\_signals\lazy.py:244, in LazySignal.compute(self, close_file, show_progressbar, **kwargs)
196 def compute(self, close_file=False, show_progressbar=None, **kwargs):
197 """
198 Attempt to store the full signal in memory.
199
(...)
242
243 """
--> 244 self.data = _compute(self.data, show_progressbar=show_progressbar, **kwargs)
245 if close_file:
246 self.close_file()
File ~\Dev\hyperspy\hyperspy\misc\utils.py:1463, in _compute(array, store_to, show_progressbar, **kwargs)
1459 da.store(
1460 array, store_to, dtype=array.dtype, compute=True, lock=False, **kwargs
1461 )
1462 else:
-> 1463 return array.compute(**kwargs)
File ~\miniforge3\Lib\site-packages\dask\base.py:373, in DaskMethodsMixin.compute(self, **kwargs)
349 def compute(self, **kwargs):
350 """Compute this dask collection
351
352 This turns a lazy Dask collection into its in-memory equivalent.
(...)
371 dask.compute
372 """
--> 373 (result,) = compute(self, traverse=False, **kwargs)
374 return result
File ~\miniforge3\Lib\site-packages\dask\base.py:681, in compute(traverse, optimize_graph, scheduler, get, *args, **kwargs)
678 expr = expr.optimize()
679 keys = list(flatten(expr.__dask_keys__()))
--> 681 results = schedule(expr, keys, **kwargs)
683 return repack(results)
File ~\Dev\rosettasciio\rsciio\utils\distributed.py:177, in slice_memmap(slices, file, dtypes, shape, key, positions, **kwargs)
175 return data[slices_]
176 else:
--> 177 slices_ = tuple([slice(s[0], s[1]) for s in slices_])
178 return data[slices_]
IndexError: invalid index to scalar variable.
@CSSFrancis, it looks to me that in the case of an array with a single dimension, there is an empty slice that causes the error. Do you have an idea what a good fix for it would be?
Progress of the PR
- [ ] Change implemented (can be split into several points),
- [ ] update docstring (if appropriate),
- [ ] update user guide (if appropriate),
- [ ] add a changelog entry in the `upcoming_changes` folder (see `upcoming_changes/README.rst`),
- [ ] Check formatting of the changelog entry (and eventual user guide changes) in the `docs/readthedocs.org:rosettasciio` build of this PR (link in github checks),
- [ ] add tests,
- [ ] ready for review.
Codecov Report
:white_check_mark: All modified and coverable lines are covered by tests.
:white_check_mark: Project coverage is 87.90%. Comparing base (a647f2e) to head (63b6d3f).
:warning: Report is 58 commits behind head on main.
Additional details and impacted files
@@ Coverage Diff @@
## main #419 +/- ##
=======================================
Coverage 87.90% 87.90%
=======================================
Files 89 89
Lines 11464 11465 +1
Branches 2116 2116
=======================================
+ Hits 10077 10078 +1
Misses 878 878
Partials 509 509
:umbrella: View full report in Codecov by Sentry.
:loudspeaker: Have feedback on the report? Share it here.
:rocket: New features to boost your workflow:
- :snowflake: Test Analytics: Detect flaky tests, report on failures, and find test suite problems.
@ericpre hmmm sorry it's taken me a bit to get around to this. I've been sick the last couple of weeks.
I'd have to step through the code to see where this goes wrong. It's probably just something with `get_chunk_slice`; I don't know if I fully considered the 1D case.
@CSSFrancis, any chance, you could help with this PR? Thanks!