
Dataset I/O performance

thangleiter opened this issue 4 months ago

I frequently deal with fairly large datasets (some high-dimensional loops and a buffered get-parameter). Loading these datasets from the database into a usable format (i.e., xarray) takes an extremely long time.

A representative example:

>>> xds 
<xarray.Dataset> Size: 792MB
Dimensions: (... : 19,
             ... : 51,
             ... : 51,
             ... : 2000)
	...
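For context, here is a minimal sketch of the kind of measurement that produces such a dataset. Every parameter name, sweep range, and the database path below are hypothetical; only the shapes (19 × 51 × 51 setpoints, one 2000-point buffered readout stored with paramtype="array") mirror the dataset above:

import numpy as np
from qcodes.dataset import (
    Measurement,
    initialise_or_create_database_at,
    load_or_create_experiment,
)
from qcodes.parameters import ManualParameter, Parameter

# Hypothetical reproduction of the measurement shape described above:
# three swept setpoints (19 x 51 x 51) and a buffered readout returning
# a 2000-point array per call, stored with an explicit array paramtype.
initialise_or_create_database_at("./io_perf_test.db")      # placeholder path
exp = load_or_create_experiment("io_perf", sample_name="none")

gate1 = ManualParameter("gate1", unit="V")                  # placeholder parameters
gate2 = ManualParameter("gate2", unit="V")
gate3 = ManualParameter("gate3", unit="V")
readout = Parameter("readout", get_cmd=lambda: np.random.rand(2000))

meas = Measurement(exp=exp)
for g in (gate1, gate2, gate3):
    meas.register_parameter(g)
meas.register_parameter(readout, setpoints=(gate1, gate2, gate3), paramtype="array")

with meas.run() as datasaver:
    for v1 in np.linspace(-1, 1, 19):
        gate1(v1)
        for v2 in np.linspace(-1, 1, 51):
            gate2(v2)
            for v3 in np.linspace(-1, 1, 51):
                gate3(v3)
                datasaver.add_result(
                    (gate1, v1), (gate2, v2), (gate3, v3), (readout, readout())
                )
qcds = datasaver.dataset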

For reference, the HDF5 I/O performance for this dataset:

>>> %timeit xds.to_netcdf(file := tempfile.mktemp(), engine='h5netcdf')
6.72 s ± 832 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
>>> %timeit xr.load_dataset(file)
500 ms ± 43.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
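For completeness, this is roughly how I load the run back from the database for the comparison below; the database path and run id are placeholders for the actual run:

>>> from qcodes.dataset import initialise_or_create_database_at, load_by_run_spec
>>> initialise_or_create_database_at("./experiments.db")   # placeholder path
>>> qcds = load_by_run_spec(captured_run_id=123)           # placeholder run id
>>> %time xds = qcds.to_xarray_dataset()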

By contrast, loading this dataset from the database (~4 GB on disk) takes 60 seconds! During that minute, memory usage temporarily spikes by 8 GB (Windows 10). The runtime breaks down as follows (some functions cherry-picked for detail, profiled with %lprun -u 1e-3 -f ... xds = qcds.to_xarray_dataset(); timings in ms):

to_xarray_dataset()

Total time: 57.4706 s
File: ...\qcodes\dataset\data_set.py
Function: to_xarray_dataset at line 986

Line #    Hits       Time   Per Hit  % Time  Line Contents
  1042       2    28081.6   14040.8    48.9          data = self.get_parameter_data(*params,
  1043       1        0.0       0.0     0.0                                          start=start,
  1044       1        0.0       0.0     0.0                                          end=end)
  1045
  1046       1    29389.0   29389.0    51.1          return load_to_xarray_dataset(self, data, use_multi_index=use_multi_index)

export_to_xarray()

Total time: 29.3664 s
File: ...\qcodes\dataset\exporters\export_to_xarray.py
Function: _load_to_xarray_dataarray_dict_no_metadata at line 62

Line #    Hits       Time   Per Hit  % Time  Line Contents
    68       1        0.0       0.0     0.0      import pandas as pd
    69       1        0.0       0.0     0.0      import xarray as xr
    70
    71       1        0.0       0.0     0.0      if use_multi_index not in ("auto", "always", "never"):
    72                                               raise ValueError(
    73                                                   f"Invalid value for use_multi_index. Expected one of 'auto', 'always', 'never' but got {use_multi_index}"
    74                                               )
    75
    76       1        0.0       0.0     0.0      data_xrdarray_dict: dict[str, xr.DataArray] = {}
    77
    78       6        0.0       0.0     0.0      for name, subdict in datadict.items():
    79       5     8157.3    1631.5    27.8          index = _generate_pandas_index(subdict)
    80
    81       5        0.0       0.0     0.0          if index is None:
    82                                                   xrdarray: xr.DataArray = (
    83                                                       _data_to_dataframe(subdict, index=index)
    84                                                       .to_xarray()
    85                                                       .get(name, xr.DataArray())
    86                                                   )
    87                                                   data_xrdarray_dict[name] = xrdarray
    88                                               else:
    89       5    15452.9    3090.6    52.6              index_unique = len(index.unique()) == len(index)
    90
    91       5      248.4      49.7     0.8              df = _data_to_dataframe(subdict, index)
    92
    93       5        0.0       0.0     0.0              if not index_unique:
    94                                                       # index is not unique so we fallback to using a counter as index
    95                                                       # and store the index as a variable
    96                                                       xrdata_temp = df.reset_index().to_xarray()
    97                                                       for _name in subdict:
    98                                                           data_xrdarray_dict[_name] = xrdata_temp[_name]
    99                                                   else:
   100       5     1982.9     396.6     6.8                  calc_index = _calculate_index_shape(index)
   101       5        0.0       0.0     0.0                  index_prod = prod(calc_index.values())
   102                                                       # if the product of the len of individual index dims == len(total_index)
   103                                                       # we are on a grid
   104
   105       5        0.0       0.0     0.0                  on_grid = index_prod == len(index)
   106
   107      10        0.0       0.0     0.0                  export_with_multi_index = (
   108      10        0.0       0.0     0.0                      not on_grid
   109       5        0.0       0.0     0.0                      and dataset.description.shapes is None
   110                                                           and use_multi_index == "auto"
   111       5        0.0       0.0     0.0                  ) or use_multi_index == "always"
   112
   113       5        0.0       0.0     0.0                  if export_with_multi_index:
   114                                                           assert isinstance(df.index, pd.MultiIndex)
   115
   116                                                           if hasattr(xr, "Coordinates"):
   117                                                               coords = xr.Coordinates.from_pandas_multiindex(
   118                                                                   df.index, "multi_index"
   119                                                               )
   120                                                               xrdarray = xr.DataArray(df[name], coords=coords)
   121                                                           else:
   122                                                               # support xarray < 2023.8.0, can be removed when we drop support for that
   123                                                               xrdarray = xr.DataArray(df[name], [("multi_index", df.index)])
   124                                                       else:
   125       5     3524.9     705.0    12.0                      xrdarray = df.to_xarray().get(name, xr.DataArray())
   126
   127       5        0.0       0.0     0.0                  data_xrdarray_dict[name] = xrdarray
   128
   129       1        0.0       0.0     0.0      return data_xrdarray_dict
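If I read this profile right, more than half of the time in this function goes into the uniqueness check on line 89, which materializes every unique tuple of a ~49k-row MultiIndex. A hypothetical sketch of mine (not qcodes code) of an equivalent check via pandas' cached is_unique property; whether it is actually faster for this workload would of course need measuring:

import numpy as np
import pandas as pd

# Hypothetical sketch: a MultiIndex of roughly the shape seen above
# (19 x 51 x 51 setpoint combinations).
index = pd.MultiIndex.from_product(
    [np.linspace(-1, 1, 19), np.linspace(-1, 1, 51), np.linspace(-1, 1, 51)]
)

index_unique_old = len(index.unique()) == len(index)  # check currently used by qcodes
index_unique_new = index.is_unique                    # equivalent result, cached by pandas
assert index_unique_old == index_unique_new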

_expand_data_to_arrays()

Total time: 4.06941 s
File: ...\qcodes\dataset\sqlite\queries.py
Function: _expand_data_to_arrays at line 266

Line #    Hits       Time   Per Hit  % Time  Line Contents
   266                                       def _expand_data_to_arrays(
   267                                           data: list[tuple[Any, ...]], paramspecs: Sequence[ParamSpecBase]
   268                                       ) -> None:
   269      26        0.0       0.0     0.0      types = [param.type for param in paramspecs]
   270                                           # if we have array type parameters expand all other parameters
   271                                           # to arrays
   272       5        0.0       0.0     0.0      if 'array' in types:
   273
   274       1        0.0       0.0     0.0          if ('numeric' in types or 'text' in types
   275                                                       or 'complex' in types):
   276       1        0.0       0.0     0.0              first_array_element = types.index('array')
   277       1        0.0       0.0     0.0              types_mapping: dict[int, Callable[[str], np.dtype[Any]]] = {}
   278       6        0.0       0.0     0.0              for i, x in enumerate(types):
   279       5        0.0       0.0     0.0                  if x == "numeric":
   280       3        0.0       0.0     0.0                      types_mapping[i] = lambda _: np.dtype(np.float64)
   281       2        0.0       0.0     0.0                  elif x == "complex":
   282                                                           types_mapping[i] = lambda _: np.dtype(np.complex128)
   283       2        0.0       0.0     0.0                  elif x == "text":
   284                                                           types_mapping[i] = lambda array: np.dtype(f"U{len(array)}")
   285
   286   46253       30.0       0.0     0.7              for i_row, row in enumerate(data):
   287                                                       # todo should we handle int/float types here
   288                                                       # we would in practice have to perform another
   289                                                       # loop to check that all elements of a given can be cast to
   290                                                       # int without loosing precision before choosing an integer
   291                                                       # representation of the array
   292   92504     3070.6       0.0    75.5                  data[i_row] = tuple(
   293                                                           np.full_like(
   294                                                               row[first_array_element], array, dtype=types_mapping[i](array)
   295                                                           )
   296                                                           if i in types_mapping
   297                                                           else array
   298   46252       25.8       0.0     0.6                      for i, array in enumerate(row)
   299                                                       )
   300
   301       1        0.0       0.0     0.0          row_shape = None
   302   46253       25.8       0.0     0.6          for i_row, row in enumerate(data):
   303                                                   # now expand all one element arrays to match the expected size
   304                                                   # one element arrays are introduced if scalar values are stored
   305                                                   # with an explicit array storage type
   306   46252       16.1       0.0     0.4              max_size = 0
   307  277512      146.8       0.0     3.6              for i, array in enumerate(row):
   308  231260      110.2       0.0     2.7                  if array.size > max_size:
   309   46252       18.8       0.0     0.5                      if max_size > 1:
   310                                                               log.warning(
   311                                                                   f"Cannot expand array of size {max_size} "
   312                                                                   f"to size {array.size}"
   313                                                               )
   314   46252       23.4       0.0     0.6                      max_size, row_shape = array.size, array.shape
   316   46252       19.4       0.0     0.5              if max_size > 1:
   317   46252       18.2       0.0     0.4                  assert row_shape is not None
   318   92504      549.2       0.0    13.5                  data[i_row] = tuple(
   319                                                           np.full(row_shape, array, dtype=array.dtype)
   320                                                           if array.size == 1
   321                                                           else array
   322   46252       15.1       0.0     0.4                      for array in row
   323                                                       )
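The 75.5% line (292) rebuilds every row as a tuple in pure Python and expands each scalar setpoint into a full copy of the 2000-point array. A hypothetical micro-benchmark of mine that reproduces just this pattern at a tenth of the real row count (the redundant full-size copies may also be part of the memory spike mentioned above):

import numpy as np
from timeit import timeit

# Hypothetical reproduction of the expansion pattern at line 292 above.
# Each row: three scalar setpoints plus one 2000-point array; n_rows is
# ~1/10 of the real dataset to keep memory modest.
n_rows, n_points = 4_625, 2_000
rows = [(0.1, 0.2, 0.3, np.random.rand(n_points)) for _ in range(n_rows)]
types_mapping = {i: (lambda _: np.dtype(np.float64)) for i in range(3)}
first_array_element = 3

def expand(data):
    # same structure as the qcodes loop: every scalar becomes a full-size copy
    for i_row, row in enumerate(data):
        data[i_row] = tuple(
            np.full_like(row[first_array_element], array, dtype=types_mapping[i](array))
            if i in types_mapping
            else array
            for i, array in enumerate(row)
        )

print(timeit(lambda: expand(rows), number=1))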

many_many()

Total time: 21.7225 s
File: ...\qcodes\dataset\sqlite\query_helpers.py
Function: many_many at line 117

Line #    Hits       Time   Per Hit  % Time  Line Contents
   117                                       def many_many(curr: sqlite3.Cursor, *columns: str) -> list[tuple[Any, ...]]:
   118                                           """Get all values of many columns
   119                                           Args:
   120                                               curr: cursor to operate on
   121                                               columns: names of the columns
   122
   123                                           Returns:
   124                                               list of lists of values
   125                                           """
   126       5    21722.4    4344.5   100.0      res = curr.fetchall()
   127
   128       5        0.1       0.0     0.0      if _need_to_select(curr, *columns):
   129                                               raise RuntimeError(
   130                                                   "Expected consistent selection: cursor has columns "
   131                                                   f"{tuple(c[0] for c in curr.description)} but expected {columns}"
   132                                               )
   133
   134       5        0.0       0.0     0.0      return res
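I am not sure how much of the 21.7 s spent in fetchall is raw SQLite I/O versus Python-side conversion of the stored values. A rough, hypothetical way to separate the two would be to time a plain fetch with a bare sqlite3 connection (no qcodes type converters registered); the database path and per-run table name below are placeholders:

import sqlite3
import time

db_path = "./experiments.db"     # placeholder path
table_name = "results-1-1"       # placeholder for this run's results table

# Bare connection: if arrays are stored as typed blobs, they come back as raw
# bytes here, so this should mostly measure SQLite I/O itself.
conn = sqlite3.connect(db_path)
cur = conn.cursor()

t0 = time.perf_counter()
cur.execute(f'SELECT * FROM "{table_name}"')
rows = cur.fetchall()
print(f"{len(rows)} rows fetched in {time.perf_counter() - t0:.2f} s")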

Unfortunately, I am not familiar with SQLite, so I cannot say whether much can be gained in the actual database I/O, but the discrepancy compared to HDF5 makes me think that a lot could be gained at least in the pure-Python parts. Loading a factor of ~120 more slowly than xarray reads the same data from netcdf seems absurd.
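For now, my workaround is to pay the conversion cost once per run and reuse the exported file afterwards; a rough sketch (I am assuming the exported path is available via export_info, and path/prefix are placeholders):

import xarray as xr

# One-off: export the run to netcdf (slow, goes through to_xarray_dataset once).
qcds.export(export_type="netcdf", path=".", prefix="run_")
nc_path = qcds.export_info.export_paths["nc"]   # assuming this mapping is populated

# Subsequent sessions: load directly with xarray, which is fast (see timings above).
xds = xr.load_dataset(nc_path, engine="h5netcdf")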

thangleiter · Oct 18 '24 15:10