Check column names before passing them on to xgboost
For example, if the column names are ints, xgboost will refuse them:
import pandas as pd
import numpy as np
import dask.dataframe as dd
import dask_xgboost as xgb
from distributed import Client
# Minimal reproducer: a frame whose column labels are the ints 0, 1 and 2
# (not strings).
df = pd.DataFrame({0: np.random.randint(0, 2, size=100),
1: np.random.uniform(0, 1, size=100),
2: np.random.uniform(0, 1, size=100)})
# Wrap the pandas frame as a dask DataFrame (second argument: 2 partitions).
a = dd.from_pandas(df, 2)
# Column 0 holds the 0/1 values used as labels; columns 1 onward are the
# features.
labels = a.loc[:, 0]
data = a.loc[:, 1:]
c = Client()
# This call fails with the ValueError shown in the traceback that follows.
xgb.train(c, {}, data, labels)
ValueError Traceback (most recent call last)
<ipython-input-6-ea984a812dfe> in <module>()
14 c = Client()
15
---> 16 xgb.train(c, {}, data, labels)
~/sandbox/dask-xgboost/dask_xgboost/core.py in train(client, params, data, labels, dmatrix_kwargs, **kwargs)
167 """
168 return sync(client.loop, _train, client, params, data,
--> 169 labels, dmatrix_kwargs, **kwargs)
170
171
~/Envs/dask-dev/lib/python3.6/site-packages/distributed/distributed/utils.py in sync(loop, func, *args, **kwargs)
252 e.wait(1000000)
253 if error[0]:
--> 254 six.reraise(*error[0])
255 else:
256 return result[0]
~/Envs/dask-dev/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
691 if value.__traceback__ is not tb:
692 raise value.with_traceback(tb)
--> 693 raise value
694 finally:
695 value = None
~/Envs/dask-dev/lib/python3.6/site-packages/distributed/distributed/utils.py in f()
236 yield gen.moment
237 thread_state.asynchronous = True
--> 238 result[0] = yield make_coro()
239 except Exception as exc:
240 logger.exception(exc)
~/.virtualenvs/dask-dev/lib/python3.6/site-packages/tornado/gen.py in run(self)
1053
1054 try:
-> 1055 value = future.result()
1056 except Exception:
1057 self.had_exception = True
~/.virtualenvs/dask-dev/lib/python3.6/site-packages/tornado/concurrent.py in result(self, timeout)
236 if self._exc_info is not None:
237 try:
--> 238 raise_exc_info(self._exc_info)
239 finally:
240 self = None
~/.virtualenvs/dask-dev/lib/python3.6/site-packages/tornado/util.py in raise_exc_info(exc_info)
~/.virtualenvs/dask-dev/lib/python3.6/site-packages/tornado/gen.py in run(self)
1061 if exc_info is not None:
1062 try:
-> 1063 yielded = self.gen.throw(*exc_info)
1064 finally:
1065 # Break up a reference to itself
~/sandbox/dask-xgboost/dask_xgboost/core.py in _train(client, params, data, labels, dmatrix_kwargs, **kwargs)
132
133 # Get the results, only one will be non-None
--> 134 results = yield client._gather(futures)
135 result = [v for v in results if v][0]
136 raise gen.Return(result)
~/.virtualenvs/dask-dev/lib/python3.6/site-packages/tornado/gen.py in run(self)
1053
1054 try:
-> 1055 value = future.result()
1056 except Exception:
1057 self.had_exception = True
~/.virtualenvs/dask-dev/lib/python3.6/site-packages/tornado/concurrent.py in result(self, timeout)
236 if self._exc_info is not None:
237 try:
--> 238 raise_exc_info(self._exc_info)
239 finally:
240 self = None
~/.virtualenvs/dask-dev/lib/python3.6/site-packages/tornado/util.py in raise_exc_info(exc_info)
~/.virtualenvs/dask-dev/lib/python3.6/site-packages/tornado/gen.py in run(self)
1061 if exc_info is not None:
1062 try:
-> 1063 yielded = self.gen.throw(*exc_info)
1064 finally:
1065 # Break up a reference to itself
~/Envs/dask-dev/lib/python3.6/site-packages/distributed/distributed/client.py in _gather(self, futures, errors, direct, local_worker)
1305 six.reraise(type(exception),
1306 exception,
-> 1307 traceback)
1308 if errors == 'skip':
1309 bad_keys.add(key)
~/Envs/dask-dev/lib/python3.6/site-packages/six.py in reraise(tp, value, tb)
690 value = tp()
691 if value.__traceback__ is not tb:
--> 692 raise value.with_traceback(tb)
693 raise value
694 finally:
~/sandbox/dask-xgboost/dask_xgboost/core.py in train_part()
66 labels = concat(labels)
67 dmatrix_kwargs["feature_names"] = getattr(data, 'columns', None)
---> 68 dtrain = xgb.DMatrix(data, labels, **dmatrix_kwargs)
69
70 args = [('%s=%s' % item).encode() for item in env.items()]
~/sandbox/xgboost/python-package/xgboost/core.py in __init__()
294 self.set_weight(weight)
295
--> 296 self.feature_names = feature_names
297 self.feature_types = feature_types
298
~/sandbox/xgboost/python-package/xgboost/core.py in feature_names()
663 not any(x in f for x in set(('[', ']', '<')))
664 for f in feature_names):
--> 665 raise ValueError('feature_names may not contain [, ] or <')
666 else:
667 # reset feature_types also
ValueError: feature_names may not contain [, ] or <
hello, is this issue resolved? @TomAugspurger
Seems like it's still open. Are you interested in working on it?
On Mon, Nov 26, 2018 at 6:10 AM chenzikun [email protected] wrote:
hello, is this issue resolved? @TomAugspurger https://github.com/TomAugspurger
— You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub https://github.com/dask/dask-xgboost/issues/15#issuecomment-441617700, or mute the thread https://github.com/notifications/unsubscribe-auth/ABQHIjYwQyef2OsIwHi8otcFsCfcGJJaks5uy9pTgaJpZM4Qq6IO .
I use xgboost on a single machine and it works well, but I get the same error with dask-xgboost. I am trying to figure out what happened.
~/sandbox/dask-xgboost/dask_xgboost/core.py in train_part()
66 labels = concat(labels)
67 dmatrix_kwargs["feature_names"] = getattr(data, 'columns', None)
---> 68 dtrain = xgb.DMatrix(data, labels, **dmatrix_kwargs)
why send "feature_names" to xgb.DMatrix?
The feature names are nice to preserve.
On Mon, Nov 26, 2018 at 7:07 AM chenzikun [email protected] wrote:
~/sandbox/dask-xgboost/dask_xgboost/core.py in train_part() 66 labels = concat(labels) 67 dmatrix_kwargs["feature_names"] = getattr(data, 'columns', None) ---> 68 dtrain = xgb.DMatrix(data, labels, **dmatrix_kwargs)
why send "feature_names" to xgb.DMatrix?
— You are receiving this because you were mentioned. Reply to this email directly, view it on GitHub https://github.com/dask/dask-xgboost/issues/15#issuecomment-441632498, or mute the thread https://github.com/notifications/unsubscribe-auth/ABQHInLdqxBRmI_ZPx4RvhjUg8fQkKcmks5uy-eGgaJpZM4Qq6IO .