Wordbatch
Wordbatch copied to clipboard
TypeError: object of type 'type' has no len()
My configuration: wordbatch 1.3.0, pandas 0.22, Python 3.6.2, Ubuntu 14.04. I am executing the Kaggle script without any changes: https://www.kaggle.com/anttip/wordbatch-ftrl-fm-lgb-lbl-0-42555
TypeError Traceback (most recent call last)
~/lal/Kaggle/kaggleme/input/bkup/wordbatch/wordbatch.py in fit_transform(self, texts, labels, extractor, cache_features, input_split) 239 240 def fit_transform(self, texts, labels=None, extractor= None, cache_features= None, input_split= False): --> 241 return self.transform(texts, labels, extractor, cache_features, input_split) 242 243 def partial_fit(self, texts, labels=None, input_split= False, merge_output= True):
~/lal/Kaggle/kaggleme/input/bkup/wordbatch/wordbatch.py in transform(self, texts, labels, extractor, cache_features, input_split) 248 if extractor== None: extractor= self.extractor 249 if cache_features != None and os.path.exists(cache_features): return extractor.load_features(cache_features) --> 250 if not(input_split): texts= self.split_batches(texts) 251 texts= self.fit(texts, return_texts=True, input_split=True, merge_output=False) 252 if extractor!= None:
~/lal/Kaggle/kaggleme/input/bkup/wordbatch/wordbatch.py in split_batches(self, *args, **kwargs) 265 266 def split_batches(self, *args, **kwargs): --> 267 return self.batcher.split_batches(*args, **kwargs) 268 269 def merge_batches(self, *args, **kwargs):
~/lal/Kaggle/kaggleme/input/bkup/wordbatch/batcher.py in split_batches(self, data, minibatch_size) 70 else: len_data= data.shape[0] 71 if minibatch_size> len_data: minibatch_size= len_data ---> 72 if data_type == pd.DataFrame: 73 data_split = [data.iloc[x * minibatch_size:(x + 1) * minibatch_size] for x in 74 range(int(ceil(len_data / minibatch_size)))]
~/anaconda2/envs/sdp/lib/python3.6/site-packages/pandas/core/ops.py in f(self, other) 1326 return self._compare_frame(other, func, str_rep) 1327 elif isinstance(other, ABCSeries): -> 1328 return self._combine_series_infer(other, func, try_cast=False) 1329 else: 1330
~/anaconda2/envs/sdp/lib/python3.6/site-packages/pandas/core/frame.py in _combine_series_infer(self, other, func, level, fill_value, try_cast) 3946 def _combine_series_infer(self, other, func, level=None, 3947 fill_value=None, try_cast=True): -> 3948 if len(other) == 0: 3949 return self * np.nan 3950
TypeError: object of type 'type' has no len()
This would need debugging. You're calling Python3.6 from ~/anaconda2? I suspect the Python installation could have issues.
I am running python 3.6 in a python virtual environment. All other kaggle scripts seem to run well in this environment. Please look at https://conda.io/docs/user-guide/tasks/manage-environments.html for how this environment was created.
I have the same problem:
wb = wordbatch.WordBatch(normalize_text, extractor=(WordBag, {"hash_ngrams": 2, "hash_ngrams_weights": [1.5, 1.0], "hash_size": 2 ** 29, "norm": None, "tf": 'binary', "idf": None, }), procs=4) wb.dictionary_freeze= True wb.fit(merge['name'])
TypeError Traceback (most recent call last)
in () ----> 1 wb.fit(merge['name']) /mnt/SSD/dzikr/env_python_3/env_p_3.6/lib/python3.6/site-packages/wordbatch/wordbatch.py in fit(self, texts, labels, return_texts, input_split, merge_output) 211 if self.verbose > 0: print("Normalize text") 212 if self.normalize_text != None: --> 213 texts= self.normalize_texts(texts, input_split= input_split, merge_output= False) 214 input_split= True 215 if self.spellcor_count> 0 or self.stemmer!=None:
/mnt/SSD/dzikr/env_python_3/env_p_3.6/lib/python3.6/site-packages/wordbatch/wordbatch.py in normalize_texts(self, texts, input_split, merge_output) 179 def normalize_texts(self, texts, input_split=False, merge_output=True): 180 texts2= self.parallelize_batches(self.procs, batch_normalize_texts, texts, [self.normalize_text], --> 181 input_split=input_split, merge_output=merge_output) 182 #if self.use_sc==False: return [item for sublist in texts2 for item in sublist] 183 return texts2
/mnt/SSD/dzikr/env_python_3/env_p_3.6/lib/python3.6/site-packages/wordbatch/wordbatch.py in parallelize_batches(self, procs, task, data, args, method, timeout, rdd_col, input_split, merge_output, minibatch_size) 301 attempt= 0 302 if not(input_split): --> 303 paral_params= [[data_batch]+ args for data_batch in self.split_batches(data, minibatch_size)] 304 else: 305 paral_params= [[data_batch]+ args for data_batch in data]
/mnt/SSD/dzikr/env_python_3/env_p_3.6/lib/python3.6/site-packages/wordbatch/wordbatch.py in split_batches(self, data, minibatch_size) 274 else: len_data= data.shape[0] 275 if minibatch_size> len_data: minibatch_size= len_data --> 276 if data_type == pd.DataFrame: 277 data_split = [data.iloc[x * minibatch_size:(x + 1) * minibatch_size] for x in 278 range(int(ceil(len_data / minibatch_size)))]
/mnt/SSD/dzikr/env_python_3/env_p_3.6/lib/python3.6/site-packages/pandas/core/ops.py in f(self, other) 1326 return self._compare_frame(other, func, str_rep) 1327 elif isinstance(other, ABCSeries): -> 1328 return self._combine_series_infer(other, func, try_cast=False) 1329 else: 1330
/mnt/SSD/dzikr/env_python_3/env_p_3.6/lib/python3.6/site-packages/pandas/core/frame.py in _combine_series_infer(self, other, func, level, fill_value, try_cast) 3946 def _combine_series_infer(self, other, func, level=None, 3947 fill_value=None, try_cast=True): -> 3948 if len(other) == 0: 3949 return self * np.nan 3950
TypeError: object of type 'type' has no len()
I'll need sample data to debug this case.
You can download a sample of the data from the Mercari Price Suggestion Challenge competition on Kaggle.