bcolz
bcolz copied to clipboard
Query datetime using `where` interface
Is it possible to query on datetime
columns?
The structure of ctable
is as follows
ctable((173185091,), [('medallion', 'S32'), ('hack_license', 'S32'), ('vendor_id', 'S3'), ('rate_code', '<i8'), ('store_and_fwd_flag', 'S1'), ('pickup_datetime', '<M8[ns]'), ('dropoff_datetime', '<M8[ns]'), ('passenger_count', '<i8'), ('trip_time_in_secs', '<i8'), ('trip_distance', '<f8'), ('pickup_longitude', '<f8'), ('pickup_latitude', '<f8'), ('dropoff_longitude', '<f8'), ('dropoff_latitude', '<f8'), ('payment_type', 'S3'), ('fare_amount', '<f8'), ('surcharge', '<f8'), ('mta_tax', '<f8'), ('tip_amount', '<f8'), ('tolls_amount', '<f8'), ('total_amount', '<f8')])
nbytes: 32.10 GB; cbytes: 16.55 GB; ratio: 1.94
cparams := cparams(clevel=5, shuffle=True, cname='blosclz')
rootdir := '/data/NYC/nyc.bcolz'
[ ('89D227B655E5C82AECF13C3F540D4CF4', 'BA96DE419E711691B9445D6A6307C170', 'CMT', 1, 'N', 1357053108000000000L, 1357053490000000000L, 4, 382, 1.0, -73.978165, 40.757977000000004, -73.989838, 40.751171, 'CSH', 6.5, 0.0, 0.5, 0.0, 0.0, 7.0)]
I would like to perform a query to subset the data on specific date range using something like:
result = bz.where('pickup_datetime > "2013-12-30"')
This produces the following error and I was wondering if something like this is even possible?
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-37-cf606f7701a3> in <module>()
----> 1 bz.where('pickup_datetime > "2013-12-30"')
/home/abisen/opt/anaconda/envs/py/lib/python2.7/site-packages/bcolz/ctable.pyc in where(self, expression, outcols, limit, skip)
824 if type(expression) is str:
825 # That must be an expression
--> 826 boolarr = self.eval(expression)
827 elif hasattr(expression, "dtype") and expression.dtype.kind == 'b':
828 boolarr = expression
/home/abisen/opt/anaconda/envs/py/lib/python2.7/site-packages/bcolz/ctable.pyc in eval(self, expression, **kwargs)
1191 # Call top-level eval with cols as user_dict
1192 return bcolz.eval(expression, user_dict=self.cols, depth=depth,
-> 1193 **kwargs)
1194
1195 def flush(self):
/home/abisen/opt/anaconda/envs/py/lib/python2.7/site-packages/bcolz/chunked_eval.pyc in eval(expression, vm, out_flavor, user_dict, **kwargs)
152
153 return _eval_blocks(expression, vars, vlen, typesize, vm, out_flavor,
--> 154 **kwargs)
155
156
/home/abisen/opt/anaconda/envs/py/lib/python2.7/site-packages/bcolz/chunked_eval.pyc in _eval_blocks(expression, vars, vlen, typesize, vm, out_flavor, **kwargs)
220 return _eval_blocks(
221 expression, vars, vlen, typesize, "python",
--> 222 out_flavor, **kwargs)
223
224 if i == 0:
/home/abisen/opt/anaconda/envs/py/lib/python2.7/site-packages/bcolz/chunked_eval.pyc in _eval_blocks(expression, vars, vlen, typesize, vm, out_flavor, **kwargs)
226 scalar = False
227 dim_reduction = False
--> 228 if len(res_block.shape) == 0:
229 scalar = True
230 result = res_block
AttributeError: 'bool' object has no attribute 'shape'
A small fyi: we also ran into numexpr issues (used by eval) with datetimes; we implemented a workaround by saving the date and time as separate integers
Hmm, this does not seem related with numexpr limitations (vm="python"), so this should be qualifyied as bug inside bcolz.
This also fails when trying to use the underlying datetime64
value.
t = np.datetime64("2015-04-25 23:59:11+0000")
t.view(np.int64)
# 1430006351
bt["time > 1430006351"]
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-117-3a1074cc5e4b> in <module>()
----> 1 bt["time > 1430006351"]
\bcolz\ctable.pyc in __getitem__(self, key)
1089 if key not in self.names:
1090 # key is not a column name, try to evaluate
-> 1091 arr = self.eval(key, depth=4)
1092 if arr.dtype.type != np.bool_:
1093 raise IndexError(
\bcolz\ctable.pyc in eval(self, expression, **kwargs)
1191 # Call top-level eval with cols as user_dict
1192 return bcolz.eval(expression, user_dict=self.cols, depth=depth,
-> 1193 **kwargs)
1194
1195 def flush(self):
\bcolz\chunked_eval.pyc in eval(expression, vm, out_flavor, user_dict, **kwargs)
152
153 return _eval_blocks(expression, vars, vlen, typesize, vm, out_flavor,
--> 154 **kwargs)
155
156
\bcolz\chunked_eval.pyc in _eval_blocks(expression, vars, vlen, typesize, vm, out_flavor, **kwargs)
220 return _eval_blocks(
221 expression, vars, vlen, typesize, "python",
--> 222 out_flavor, **kwargs)
223
224 if i == 0:
\bcolz\chunked_eval.pyc in _eval_blocks(expression, vars, vlen, typesize, vm, out_flavor, **kwargs)
211 # Perform the evaluation for this block
212 if vm == "python":
--> 213 res_block = _eval(expression, vars_)
214 else:
215 try:
<string> in <module>()
TypeError: invalid type promotion
import bcolz
import numpy as np
import datetime
a = bcolz.carray(np.array(['2007-07-13', '2006-01-13',
'2010-08-13'], dtype='datetime64'), mode="w")
b = bcolz.carray(np.array([1, 2,3], dtype='int'), mode="w")
this works!
eval("a < datetime.date(2013,01,01)",vm="python")
Out[48]:
carray((3,), bool)
nbytes: 3; cbytes: 16.00 KB; ratio: 0.00
cparams := cparams(clevel=5, shuffle=True, cname='blosclz')
[ True True True]
Seems like numexpr throws an AttributeError that is not caught anywhere to revert to python vm.
eval("a < datetime.date(2013,01,01)",vm="numexpr")
Out[48]:
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-46-f803302118d9> in <module>()
----> 1 bcolz.numexpr.evaluate("a< datetime.date('2013-01-01')",local_dict=user_locals)
/Applications/Anaconda/anaconda/envs/bcolzpr/lib/python2.7/site-packages/numexpr/necompiler.pyc in evaluate(ex, local_dict, global_dict, out, order, casting, **kwargs)
710 expr_key = (ex, tuple(sorted(context.items())))
711 if expr_key not in _names_cache:
--> 712 _names_cache[expr_key] = getExprNames(ex, context)
713 names, ex_uses_vml = _names_cache[expr_key]
714 # Get the arguments based on the names.
/Applications/Anaconda/anaconda/envs/bcolzpr/lib/python2.7/site-packages/numexpr/necompiler.pyc in getExprNames(text, context)
631
632 def getExprNames(text, context):
--> 633 ex = stringToExpression(text, {}, context)
634 ast = expressionToAST(ex)
635 input_order = getInputOrder(ast, None)
/Applications/Anaconda/anaconda/envs/bcolzpr/lib/python2.7/site-packages/numexpr/necompiler.pyc in stringToExpression(s, types, context)
243 names.update(expressions.functions)
244 # now build the expression
--> 245 ex = eval(c, names)
246 if expressions.isConstant(ex):
247 ex = expressions.ConstantNode(ex, expressions.getKind(ex))
<expr> in <module>()
AttributeError: 'VariableNode' object has no attribute 'date'
@esc @FrancescAlted is the fix here to catch the numexpr AttributeError to the try except block here, so it falls back to Python VM?
Please submit a pull-request with a test-case that exposes the issue.
any update on this front guys?