deeplake
deeplake copied to clipboard
[BUG] sample_by on view does not work
🐛🐛 Bug Report
Hi, when running a `sample by`
query on the result of another query (i.e. on a view), it fails with an exception.
ds = deeplake.load("hub://activeloop/mnist-train")
ds2 = ds.query("select * limit 1000")
ds2.query("select * sample by max_weight(contains(labels, '1'): 2, true: 1) limit 10").labels.numpy()
This fails with the following exception:
IndexError Traceback (most recent call last)
<ipython-input-147-431c4577b4bb> in <cell line: 1>()
----> 1 ds2.query("select * sample by max_weight(contains(labels, '1'): 2, true: 1) limit 10").labels.numpy()
~/src/Hub/deeplake/core/dataset/dataset.py in query(self, query_string)
1709 from deeplake.enterprise import query
1710
-> 1711 return query(self, query_string)
1712
1713 def sample_by(
~/.pyenv/versions/3.10.0/lib/python3.10/site-packages/humbug/report.py in wrapped_callable(*args, **kwargs)
443 self.feature_report(callable.__name__, parameters)
444
--> 445 return callable(*args, **kwargs)
446
447 return wrapped_callable
~/src/Hub/deeplake/enterprise/libdeeplake_query.py in query(dataset, query_string)
39 dsv = ds.query(query_string)
40 indexes = dsv.indexes
---> 41 return dataset[indexes]
42
43
~/src/Hub/deeplake/core/dataset/dataset.py in __getitem__(self, item, is_iteration)
456 ret = self.__class__(
457 storage=self.storage,
--> 458 index=self.index[item],
459 group_index=self.group_index,
460 read_only=self._read_only,
~/src/Hub/deeplake/core/index/index.py in __getitem__(self, item)
374 return new_index
375 elif isinstance(item, list):
--> 376 return self[(tuple(item),)] # type: ignore
377 elif isinstance(item, Index):
378 return self[tuple(v.value for v in item.values)] # type: ignore
~/src/Hub/deeplake/core/index/index.py in __getitem__(self, item)
371 for idx, sub_item in enumerate(item):
372 ax = new_index.find_axis(offset=idx)
--> 373 new_index = new_index.compose_at(sub_item, ax)
374 return new_index
375 elif isinstance(item, list):
~/src/Hub/deeplake/core/index/index.py in compose_at(self, item, i)
330 return Index(self.values + [IndexEntry(item)])
331 else:
--> 332 new_values = self.values[:i] + [self.values[i][item]] + self.values[i + 1 :]
333 return Index(new_values)
334
~/src/Hub/deeplake/core/index/index.py in __getitem__(self, item)
189 return IndexEntry(self.value[item])
190 elif isinstance(item, (tuple, list)):
--> 191 new_value = tuple(self.value[idx] for idx in item)
192 return IndexEntry(new_value)
193
~/src/Hub/deeplake/core/index/index.py in <genexpr>(.0)
189 return IndexEntry(self.value[item])
190 elif isinstance(item, (tuple, list)):
--> 191 new_value = tuple(self.value[idx] for idx in item)
192 return IndexEntry(new_value)
193
IndexError: tuple index out of range
Using versions:
deeplake==3.1.0
libdeeplake==0.0.29
It would also be nice if I could automatically sample to get a uniform distribution instead of using weights, because right now I need to do the query in three steps:
- Filter on any metadata that I am interested in
- Calculate the class imbalance
- Sample by the inverse of the class imbalance