arctic icon indicating copy to clipboard operation
arctic copied to clipboard

TickStore float32 (f4) support

Open fl4p-old opened this issue 6 years ago • 2 comments

Casting to float32 (f4) before LZ4 compression reduces storage size by 25–40%, and reads use half the memory. For testing I used prices and various other time series where 64-bit precision is not necessary.

I read somewhere in the code that TickStore only supports a limited set of types for compatibility with a Java version. Is that still relevant?

Here is drop-in code for writing and reading float32:

click to expand
def arctic_float32_extension():
    """Monkey-patch arctic's ``TickStore`` so float32 columns stay float32.

    Replaces three attributes on ``arctic.tickstore.tickstore.TickStore``
    (``_ensure_supported_dtypes``, ``_str_dtype`` and ``_empty``) with the
    float32-aware versions defined below. Call it once before reading or
    writing; the patch is applied to the class itself.

    NOTE(review): the first two replacements take no ``self``. This assumes
    the library invokes them through the class (as static helpers) -- confirm
    against the installed arctic version.
    """
    import numpy as np
    import arctic.tickstore.tickstore
    from arctic.exceptions import UnhandledDtypeException
    from pandas._libs.lib import infer_dtype

    def _ensure_supported_dtypes(array):
        """Return ``array`` cast to one of the storable little-endian dtypes.

        Integers become ``<i8``; 4-byte floats stay ``<f4``; every other float
        width becomes ``<f8``; object/bytes/str columns become unicode.

        Raises:
            UnhandledDtypeException: for any other dtype kind, or for string
                data that cannot be converted to unicode.
        """
        # We only support these types for now, as we need to read them in Java
        kind = array.dtype.kind
        if kind == 'i':
            array = array.astype('<i8')
        elif kind == 'f':
            # Only float32 is kept narrow. Passing other widths (float16,
            # long double) through unchanged would produce a dtype that
            # _str_dtype cannot name, so they are stored as float64 instead.
            array = array.astype('<f4' if array.dtype.itemsize == 4 else '<f8')
        elif kind in ('O', 'U', 'S'):
            if kind == 'O' and infer_dtype(array) not in ['unicode', 'string', 'bytes']:
                # `string` in python2 and `bytes` in python3
                raise UnhandledDtypeException("Casting object column to string failed")
            try:
                array = array.astype(np.unicode_)
            except (UnicodeDecodeError, SystemError):
                # `UnicodeDecodeError` in python2 and `SystemError` in python3
                array = np.array([s.decode('utf-8') for s in array])
            except Exception:
                # Narrower than a bare ``except`` so KeyboardInterrupt and
                # SystemExit are not turned into a dtype error.
                raise UnhandledDtypeException("Only unicode and utf8 strings are supported.")
        else:
            raise UnhandledDtypeException(
                "Unsupported dtype '%s' - only int64, float32, float64 and U are supported"
                % array.dtype)
        # Everything is little endian in tickstore
        if array.dtype.byteorder != '<':
            array = array.astype(array.dtype.newbyteorder('<'))
        return array

    def _str_dtype(dtype):
        """
        Represent dtypes without byte order, as earlier Java tickstore code doesn't support explicit byte order.

        Returns 'int64', 'float64', 'float32' or 'U<n>'.

        Raises:
            UnhandledDtypeException: for any other kind, and for float widths
                other than 4 or 8 bytes (instead of silently returning None).
        """
        assert dtype.byteorder != '>'
        if dtype.kind == 'i':
            assert dtype.itemsize == 8
            return 'int64'
        if dtype.kind == 'f':
            if dtype.itemsize == 8:
                return 'float64'
            if dtype.itemsize == 4:
                return 'float32'
            # No name exists for this width; fail loudly rather than fall
            # off the end of the function and hand back None.
            raise UnhandledDtypeException("Bad dtype '%s'" % dtype)
        if dtype.kind == 'U':
            # The code assumes 4 bytes per character, so itemsize // 4 is the
            # string length.
            return 'U%d' % (dtype.itemsize // 4)
        raise UnhandledDtypeException("Bad dtype '%s'" % dtype)

    def _empty(self, length, dtype):
        """Return a placeholder array of ``length`` elements.

        For float64/float32 the array has that dtype and is filled with NaN;
        for anything else (including ``dtype=None``) it is an uninitialised
        object array.
        """
        if dtype is not None and (dtype == np.float64 or dtype == np.float32):
            rtn = np.empty(length, dtype)
            rtn[:] = np.nan
            return rtn
        return np.empty(length, dtype=np.object_)

    tick_store = arctic.tickstore.tickstore.TickStore
    tick_store._ensure_supported_dtypes = _ensure_supported_dtypes
    tick_store._str_dtype = _str_dtype
    tick_store._empty = _empty

fl4p-old avatar Jan 16 '19 13:01 fl4p-old

Good point, let me check if @jamesblackburn might know about this.

shashank88 avatar Jan 22 '19 22:01 shashank88

Java compatibility is still required.

This shouldn't stop incremental improvements, but they should be done in a forwards and backwards compatible way, e.g. explicit feature flags and/or library metadata.

yschimke avatar Jan 23 '19 10:01 yschimke