arctic
arctic copied to clipboard
TickStore float32 (f4) support
Casting to float32 (f4) before LZ4 compression reduces storage size by 25–40%, and reads use half the memory. For testing I used prices and various other time series where 64-bit precision is not necessary.
I read somewhere in the code that TickStore only supports a limited set of types for compatibility with a Java version. Is that still relevant?
Here is the drop-in code for writing and reading float32:
click to expand
def arctic_float32_extension():
    """Monkey-patch arctic's ``TickStore`` so float32 columns stay float32.

    Replaces three attributes on ``arctic.tickstore.tickstore.TickStore``:
    ``_ensure_supported_dtypes``, ``_str_dtype`` and ``_empty``.  Call it
    once, before writing or reading tick data.  Returns ``None``.

    All imports are function-local so the snippet can be dropped into any
    module without touching that module's import block.
    """
    import numpy as np
    import arctic.tickstore.tickstore
    from arctic.exceptions import UnhandledDtypeException
    from pandas._libs.lib import infer_dtype

    def _to_unicode(array):
        """Convert a string-like array to a unicode array.

        Raises:
            UnhandledDtypeException: if the data is neither unicode nor
                UTF-8 decodable.
        """
        try:
            # The dtype string 'U' is spelled the same on every NumPy
            # version; the ``np.unicode_`` alias was removed in NumPy 2.0.
            return array.astype('U')
        except (UnicodeDecodeError, SystemError):
            # `UnicodeDecodeError` in python2 and `SystemError` in python3
            pass
        except Exception:
            raise UnhandledDtypeException("Only unicode and utf8 strings are supported.")
        try:
            return np.array([s.decode('utf-8') for s in array])
        except Exception:
            # A failed fallback decode is reported with the same documented
            # exception instead of leaking a raw decode/attribute error.
            raise UnhandledDtypeException("Only unicode and utf8 strings are supported.")

    def _ensure_supported_dtypes(array):
        """Return ``array`` cast to a dtype that ``_str_dtype`` can name.

        Integers become int64, 4-byte floats stay float32, every other float
        width becomes float64, and string-like data becomes unicode.

        Raises:
            UnhandledDtypeException: for any other dtype, for object arrays
                that do not hold strings, or for undecodable string data.
        """
        kind = array.dtype.kind
        # We only support these types for now, as we need to read them in Java
        if kind == 'i':
            array = array.astype('<i8')
        elif kind == 'f':
            # Only 4- and 8-byte floats have a name in _str_dtype, so any
            # other width (e.g. float16) is widened to float64.
            array = array.astype('<f4' if array.dtype.itemsize == 4 else '<f8')
        elif kind in ('O', 'U', 'S'):
            if kind == 'O' and infer_dtype(array) not in ('unicode', 'string', 'bytes'):
                # `string` in python2 and `bytes` in python3
                raise UnhandledDtypeException("Casting object column to string failed")
            array = _to_unicode(array)
        else:
            raise UnhandledDtypeException(
                "Unsupported dtype '%s' - only int64, float32, float64 and U are supported"
                % array.dtype)
        # Everything is little endian in tickstore
        if array.dtype.byteorder != '<':
            # NumPy reports native order as '=' (not '<') on little-endian
            # hosts, so this branch is taken there too; copy=False avoids a
            # second full copy when the data is already little endian.  Every
            # branch above produced a fresh array, so the caller's input is
            # never aliased.
            array = array.astype(array.dtype.newbyteorder('<'), copy=False)
        return array

    def _str_dtype(dtype):
        """
        Represent dtypes without byte order, as earlier Java tickstore code doesn't support explicit byte order.

        Returns 'int64', 'float64', 'float32' or 'U<n>' (n = number of
        characters).

        Raises:
            UnhandledDtypeException: for any other dtype, including float
                widths other than 4 or 8 bytes.
        """
        assert dtype.byteorder != '>'
        if dtype.kind == 'i':
            assert dtype.itemsize == 8
            return 'int64'
        if dtype.kind == 'f':
            if dtype.itemsize == 8:
                return 'float64'
            if dtype.itemsize == 4:
                return 'float32'
            # Previously fell through and returned None for other widths.
            raise UnhandledDtypeException("Bad dtype '%s'" % dtype)
        if dtype.kind == 'U':
            # NumPy stores unicode as 4 bytes per character.
            return 'U%d' % (dtype.itemsize // 4)
        raise UnhandledDtypeException("Bad dtype '%s'" % dtype)

    def _empty(self, length, dtype):
        """Return a placeholder column of ``length`` entries.

        float64/float32 dtypes get an array of that dtype filled with NaN;
        anything else (including ``None``) gets an uninitialised object array.
        """
        if dtype is not None and (dtype == np.float64 or dtype == np.float32):
            rtn = np.empty(length, dtype)
            rtn[:] = np.nan
            return rtn
        return np.empty(length, dtype=np.object_)

    tick_store = arctic.tickstore.tickstore.TickStore
    # These two replacements take no ``self``; wrapping them in staticmethod
    # keeps them callable both as TickStore.f(x) and as instance.f(x).
    tick_store._ensure_supported_dtypes = staticmethod(_ensure_supported_dtypes)
    tick_store._str_dtype = staticmethod(_str_dtype)
    tick_store._empty = _empty
Good point, let me check if @jamesblackburn might know about this.
Java compatibility is still required.
This shouldn't stop incremental improvements, but they should be done in a forwards and backwards compatible way, e.g. explicit feature flags and/or library metadata.