
Storing str-only numpy arrays in HDF fails

Open · niklassiemer opened this issue 3 years ago · 2 comments

MWE:

from pyiron_base import Project
import numpy as np
pr = Project('dummy')
hdf = pr.create_hdf(pr.path + 'any', 'any')
hdf['key'] = np.array(['list', 'of', 'str'])  # raises TypeError, see traceback below

results in

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-1-c0cd380030f9> in <module>
      3 pr = Project('dummy')
      4 hdf = pr.create_hdf(pr.path + 'any', 'any')
----> 5 hdf['key'] = np.array(['list', 'of', 'str'])

/mnt/c/Users/Siemer/pyiron_git/pyiron_base/pyiron_base/generic/hdfio.py in __setitem__(self, key, value)
    258         elif isinstance(value, tuple):
    259             value = list(value)
--> 260         h5io.write_hdf5(
    261             self.file_name,
    262             value,

~/anaconda3/envs/pyiron_git/lib/python3.8/site-packages/h5io/_h5io.py in write_hdf5(fname, data, overwrite, compression, title, slash, use_json)
    109             del fid[title]
    110         cleanup_data = []
--> 111         _triage_write(title, data, fid, comp_kw, str(type(data)),
    112                       cleanup_data, slash=slash, title=title,
    113                       use_json=use_json)

~/anaconda3/envs/pyiron_git/lib/python3.8/site-packages/h5io/_h5io.py in _triage_write(key, value, root, comp_kw, where, cleanup_data, slash, title, use_json)
    186         if not (value.dtype == np.dtype('object') and
    187                 len(set([sub.dtype for sub in value])) == 1):
--> 188             _create_titled_dataset(root, key, 'ndarray', value)
    189         else:
    190             ma_index, ma_data = multiarray_dump(value)

~/anaconda3/envs/pyiron_git/lib/python3.8/site-packages/h5io/_h5io.py in _create_titled_dataset(root, key, title, data, comp_kw)
     46     """Helper to create a titled dataset in h5py"""
     47     comp_kw = {} if comp_kw is None else comp_kw
---> 48     out = root.create_dataset(key, data=data, **comp_kw)
     49     out.attrs['TITLE'] = title
     50     return out

~/anaconda3/envs/pyiron_git/lib/python3.8/site-packages/h5py/_hl/group.py in create_dataset(self, name, shape, dtype, data, **kwds)
    146                     group = self.require_group(parent_path)
    147 
--> 148             dsid = dataset.make_new_dset(group, shape, dtype, data, name, **kwds)
    149             dset = dataset.Dataset(dsid)
    150             return dset

~/anaconda3/envs/pyiron_git/lib/python3.8/site-packages/h5py/_hl/dataset.py in make_new_dset(parent, shape, dtype, data, name, chunks, compression, shuffle, fletcher32, maxshape, compression_opts, fillvalue, scaleoffset, track_times, external, track_order, dcpl, allow_unknown_filter)
     87         else:
     88             dtype = numpy.dtype(dtype)
---> 89         tid = h5t.py_create(dtype, logical=1)
     90 
     91     # Legacy

h5py/h5t.pyx in h5py.h5t.py_create()

h5py/h5t.pyx in h5py.h5t.py_create()

h5py/h5t.pyx in h5py.h5t.py_create()

TypeError: No conversion path for dtype: dtype('<U4')

niklassiemer · Mar 04 '22 14:03

HDF5 wants bytes (dtype('S')); I have a work-around for this in FlattenedStorage here.

pmrv · Mar 05 '22 09:03
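For reference, a minimal sketch of the encode-to-bytes approach described above; the helper names (to_hdf_safe, from_hdf_str) are hypothetical, not the actual FlattenedStorage API:

import numpy as np

def to_hdf_safe(arr):
    # h5py has no conversion path for numpy unicode dtypes ('<U...'),
    # so encode str arrays to fixed-width bytes ('S...') before writing.
    if arr.dtype.kind == 'U':
        return np.char.encode(arr, 'utf-8')
    return arr

def from_hdf_str(arr):
    # Decode byte arrays back to str when reading.
    if arr.dtype.kind == 'S':
        return np.char.decode(arr, 'utf-8')
    return arr

hdf['key'] = to_hdf_safe(np.array(['list', 'of', 'str']))  # writes without error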

Wouldn't it be reasonable to move this solution upstream to FileHDFio?

niklassiemer · Jul 06 '22 07:07
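For illustration, one shape such an upstream change could take, sketched here as a hypothetical wrapper class rather than an actual patch to FileHDFio.__setitem__:

import numpy as np

class StrSafeHDF:
    """Hypothetical wrapper, not part of pyiron_base."""

    def __init__(self, hdf):
        self._hdf = hdf

    def __setitem__(self, key, value):
        # Encode unicode arrays to bytes before delegating to the normal
        # FileHDFio.__setitem__ path, mirroring the FlattenedStorage fix.
        if isinstance(value, np.ndarray) and value.dtype.kind == 'U':
            value = np.char.encode(value, 'utf-8')
        self._hdf[key] = value

With this, StrSafeHDF(hdf)['key'] = np.array(['list', 'of', 'str']) would store the bytes-encoded array; reading it back yields bytes, so the decode step from the sketch above would still be needed on read.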