Problem with running plasticc benchmark using HDK
I am running the plasticc benchmark from the repo https://github.com/gshimansky/data-science-processing-workload. It always eventually fails with an exception like this:
Traceback (most recent call last):
File "/home/gregory/work/data-science-processing-workload/launcher.py", line 205, in <module>
main()
File "/home/gregory/work/data-science-processing-workload/launcher.py", line 201, in main
benchmark.run()
File "/home/gregory/work/data-science-processing-workload/launcher.py", line 98, in run
res = plasticc_run(*output_files)
File "/home/gregory/work/data-science-processing-workload/benchmarks/plasticc.py", line 268, in run
cpu_loss, res["ML"] = measure(ml, train_final, test_final)
File "/home/gregory/work/data-science-processing-workload/benchmarks/plasticc.py", line 79, in measure
res = func(*args, **kw)
File "/home/gregory/work/data-science-processing-workload/benchmarks/plasticc.py", line 233, in ml
clf = xgb.train(
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/xgboost/core.py", line 620, in inner_f
return func(**kwargs)
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/xgboost/training.py", line 186, in train
if cb_container.after_iteration(bst, i, dtrain, evals):
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/xgboost/callback.py", line 240, in after_iteration
score: str = model.eval_set(evals, epoch, self.metric, self._output_margin)
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/xgboost/core.py", line 2002, in eval_set
feval_ret = feval(
File "/home/gregory/work/data-science-processing-workload/benchmarks/plasticc.py", line 131, in xgb_multi_weighted_logloss
loss = multi_weighted_logloss(
File "/home/gregory/work/data-science-processing-workload/benchmarks/plasticc.py", line 122, in multi_weighted_logloss
nb_pos = y_ohe.sum(axis=0).values.astype(float)
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/logging/logger_decorator.py", line 128, in run_and_log
return obj(*args, **kwargs)
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/pandas/dataframe.py", line 2226, in sum
data._query_compiler.sum(
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/experimental/core/storage_formats/hdk/query_compiler.py", line 95, in method_wrapper
return method(self, *args, **kwargs)
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/logging/logger_decorator.py", line 128, in run_and_log
return obj(*args, **kwargs)
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/experimental/core/storage_formats/hdk/query_compiler.py", line 402, in sum
return self._agg("sum", **kwargs)
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/experimental/core/storage_formats/hdk/query_compiler.py", line 95, in method_wrapper
return method(self, *args, **kwargs)
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/logging/logger_decorator.py", line 128, in run_and_log
return obj(*args, **kwargs)
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/experimental/core/storage_formats/hdk/query_compiler.py", line 454, in _agg
new_frame = new_frame._set_index(
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/logging/logger_decorator.py", line 128, in run_and_log
return obj(*args, **kwargs)
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py", line 1923, in _set_index
self._execute()
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/logging/logger_decorator.py", line 128, in run_and_log
return obj(*args, **kwargs)
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py", line 1685, in _execute
new_partitions = self._partition_mgr_cls.run_exec_plan(
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/logging/logger_decorator.py", line 128, in run_and_log
return obj(*args, **kwargs)
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition_manager.py", line 280, in run_exec_plan
calcite_json = CalciteSerializer().serialize(calcite_plan)
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 72, in serialize
return json.dumps({"rels": [self.serialize_item(node) for node in plan]})
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 72, in <listcomp>
return json.dumps({"rels": [self.serialize_item(node) for node in plan]})
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 105, in serialize_item
return self.serialize_node(item)
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 142, in serialize_node
return self.serialize_obj(node)
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 168, in serialize_obj
res[k] = self.serialize_item(v)
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 111, in serialize_item
return [self.serialize_item(v) for v in item]
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 111, in <listcomp>
return [self.serialize_item(v) for v in item]
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 107, in serialize_item
return self.serialize_expr(item)
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 215, in serialize_expr
return self.serialize_typed_obj(expr)
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 189, in serialize_typed_obj
res["type"] = self.serialize_dtype(obj._dtype)
File "/home/gregory/work/miniconda3/envs/modin-0.18.0/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 331, in serialize_dtype
return {"type": type(self).dtype_strings[dtype.name], "nullable": True}
KeyError: 'uint8'
The problem is very similar to one that modin-hdk had some time ago, which is described in the bug https://github.com/modin-project/modin/issues/3368. However, that bug is closed and is supposed to be fixed, so this looks like a regression. The patch from comment https://github.com/modin-project/modin/issues/3368#issuecomment-916279557 works around this bug, and with this patch the benchmark completes successfully.
cc @AndreyPavlenko
Should be fixed by https://github.com/modin-project/modin/pull/5563
The fix has been merged to modin master.
Interestingly, I can reproduce the bug again on modin-hdk version 0.18.1. I am running the plasticc benchmark from my repo like this: `python launcher.py -m plasticc --hdk`, and I am getting this exception:
Traceback (most recent call last):
File "/home/gregory/work/data-science-processing-workload/launcher.py", line 247, in <module>
main()
File "/home/gregory/work/data-science-processing-workload/launcher.py", line 229, in main
results, total_time = benchmark.run()
File "/home/gregory/work/data-science-processing-workload/launcher.py", line 98, in run
res = plasticc_run(*output_files)
File "/home/gregory/work/data-science-processing-workload/benchmarks/plasticc.py", line 269, in run
cpu_loss, res["ML"] = measure(ml, train_final, test_final)
File "/home/gregory/work/data-science-processing-workload/benchmarks/plasticc.py", line 80, in measure
res = func(*args, **kw)
File "/home/gregory/work/data-science-processing-workload/benchmarks/plasticc.py", line 234, in ml
clf = xgb.train(
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/xgboost/core.py", line 620, in inner_f
return func(**kwargs)
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/xgboost/training.py", line 186, in train
if cb_container.after_iteration(bst, i, dtrain, evals):
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/xgboost/callback.py", line 240, in after_iteration
score: str = model.eval_set(evals, epoch, self.metric, self._output_margin)
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/xgboost/core.py", line 2002, in eval_set
feval_ret = feval(
File "/home/gregory/work/data-science-processing-workload/benchmarks/plasticc.py", line 132, in xgb_multi_weighted_logloss
loss = multi_weighted_logloss(
File "/home/gregory/work/data-science-processing-workload/benchmarks/plasticc.py", line 123, in multi_weighted_logloss
nb_pos = y_ohe.sum(axis=0).values.astype(float)
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/logging/logger_decorator.py", line 128, in run_and_log
return obj(*args, **kwargs)
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/pandas/dataframe.py", line 2231, in sum
data._query_compiler.sum(
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/experimental/core/storage_formats/hdk/query_compiler.py", line 95, in method_wrapper
return method(self, *args, **kwargs)
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/logging/logger_decorator.py", line 128, in run_and_log
return obj(*args, **kwargs)
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/experimental/core/storage_formats/hdk/query_compiler.py", line 402, in sum
return self._agg("sum", **kwargs)
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/experimental/core/storage_formats/hdk/query_compiler.py", line 95, in method_wrapper
return method(self, *args, **kwargs)
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/logging/logger_decorator.py", line 128, in run_and_log
return obj(*args, **kwargs)
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/experimental/core/storage_formats/hdk/query_compiler.py", line 454, in _agg
new_frame = new_frame._set_index(
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/logging/logger_decorator.py", line 128, in run_and_log
return obj(*args, **kwargs)
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py", line 1923, in _set_index
self._execute()
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/logging/logger_decorator.py", line 128, in run_and_log
return obj(*args, **kwargs)
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/dataframe/dataframe.py", line 1685, in _execute
new_partitions = self._partition_mgr_cls.run_exec_plan(
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/logging/logger_decorator.py", line 128, in run_and_log
return obj(*args, **kwargs)
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/partitioning/partition_manager.py", line 280, in run_exec_plan
calcite_json = CalciteSerializer().serialize(calcite_plan)
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 72, in serialize
return json.dumps({"rels": [self.serialize_item(node) for node in plan]})
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 72, in <listcomp>
return json.dumps({"rels": [self.serialize_item(node) for node in plan]})
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 105, in serialize_item
return self.serialize_node(item)
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 142, in serialize_node
return self.serialize_obj(node)
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 168, in serialize_obj
res[k] = self.serialize_item(v)
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 111, in serialize_item
return [self.serialize_item(v) for v in item]
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 111, in <listcomp>
return [self.serialize_item(v) for v in item]
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 107, in serialize_item
return self.serialize_expr(item)
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 215, in serialize_expr
return self.serialize_typed_obj(expr)
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 189, in serialize_typed_obj
res["type"] = self.serialize_dtype(obj._dtype)
File "/home/gregory/work/miniconda3/envs/modin-conda-0.18.1/lib/python3.10/site-packages/modin/experimental/core/execution/native/implementations/hdk_on_native/calcite_serializer.py", line 331, in serialize_dtype
return {"type": type(self).dtype_strings[dtype.name], "nullable": True}
KeyError: 'uint8'
@AndreyPavlenko @ienkovich @alexbaden - is this version released? If not, can you please provide Gregory with a conda package?
I'm unable to reproduce the failure on the latest modin master. @gshimansky try this:
git clone https://github.com/modin-project/modin.git
cd modin
mamba create -f requirements/env_hdk.yml
mamba activate modin_on_hdk
export PYTHONPATH=$PWD:$PYTHONPATH
cd ../data-science-processing-workload
python launcher.py -m plasticc --hdk -np
@gshimansky does it work for you?