deeplake
deeplake copied to clipboard
[BUG] deeplake.util.exceptions.ReadSampleFromChunkError
Severity
P0 - Critical breaking issue or missing functionality
Current Behavior
I am using torch.distributed.DistributedSampler(dataset, shuffle=shuffle) to write the dataloader where dataset needs to be read from deeplake, And I load deeplake dataset with def init() in the dataset class. But when I iteratively access the dataloader, I get the following error: deeplake.util.exceptions.ReadSampleFromChunkError: Unable to read sample at index 97 from chunk 'images/chunks/bc4c02f9eec3464e' in tensor images.
Steps to Reproduce
class LoadDeeplakeImagesAndLabels(Dataset):
# YOLOv5 train_loader/val_loader, loads images and labels for training and validation
cache_version = 0.6 # dataset labels *.cache version
rand_interp_methods = [cv2.INTER_NEAREST, cv2.INTER_LINEAR, cv2.INTER_CUBIC, cv2.INTER_AREA, cv2.INTER_LANCZOS4]
def __init__(self,
path,
img_size=640,
batch_size=16,
augment=False,
hyp=None,
rect=False,
image_weights=False,
cache_images=False,
single_cls=False,
stride=32,
pad=0.0,
min_items=0,
prefix=''):
self.img_size = img_size
self.augment = augment
self.hyp = hyp
self.image_weights = image_weights
self.rect = False if image_weights else rect
self.mosaic = self.augment and not self.rect # load 4 images at a time into a mosaic (only during training)
self.mosaic_border = [-img_size // 2, -img_size // 2]
self.stride = stride
self.path = path
self.albumentations = Albumentations(size=img_size) if augment else None
#print("self.img_size",self.img_size)
#print("self.augment",self.augment)
#print("self.image_weights",self.image_weights)
#print("self.rect",self.rect)
#print("self.mosaic",self.mosaic)
#a = 1/0
# 读取数据集
f = []
label_path_f = []
label_f = []
shape_f = []
username = 'zhanglisheng'
passwd = '***********'
if path.endswith("train"):
dest = f's3://{username}/yolomix-train'
else:
dest = f's3://{username}/yolomix-val'
creds = {
'aws_access_key_id': username,
'aws_secret_access_key': passwd,
'endpoint_url': 'http://172.24.**.**:9000'
}
#dest = 's3://admin/yolo-mix-train'
self.dest = dest
self.creds = creds
ds = deeplake.load(dest,creds=creds,read_only=True)
self.ds = ds
sample_num= 128
#sample_num = len(ds)
for i in tqdm(range(sample_num)):
labels = ds['labels'][i].numpy()
boxes = ds['boxes'][i].numpy()
c = np.vstack([labels,boxes.T]).T
label_f.append(c)
shapes = ds['shapes'][i].text()
w = int(shapes.split(':')[0])
h = int(shapes.split(':')[1])
shape_f.append([w, h])
f.append(str(i)+'.jpg')
label_path_f.append(str(i)+'.txt')
self.im_files = f
self.label_files = label_path_f
self.labels = label_f
self.shapes = np.array(shape_f)
list_of_empty_lists = [[] for _ in range(len(self.im_files))]
self.segments = tuple(list_of_empty_lists)
def __getitem__(self, index):
index = self.indices[index] # linear, shuffled, or image_weights
hyp = self.hyp
mosaic = self.mosaic and random.random() < hyp['mosaic']
if mosaic:
# Load mosaic
img, labels = self.load_mosaic(index)
shapes = None
# MixUp augmentation
if random.random() < hyp['mixup']:
img, labels = mixup(img, labels, *self.load_mosaic(random.randint(0, self.n - 1)))
else:
# Load image
img, (h0, w0), (h, w) = self.load_image(index)
# Letterbox
shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size # final letterboxed shape
img, ratio, pad = letterbox(img, shape, auto=False, scaleup=self.augment)
shapes = (h0, w0), ((h / h0, w / w0), pad) # for COCO mAP rescaling
labels = self.labels[index].copy()
if labels.size: # normalized xywh to pixel xyxy format
labels[:, 1:] = xywhn2xyxy(labels[:, 1:], ratio[0] * w, ratio[1] * h, padw=pad[0], padh=pad[1])
if self.augment:
img, labels = random_perspective(img,
labels,
degrees=hyp['degrees'],
translate=hyp['translate'],
scale=hyp['scale'],
shear=hyp['shear'],
perspective=hyp['perspective'])
nl = len(labels) # number of labels
if nl:
labels[:, 1:5] = xyxy2xywhn(labels[:, 1:5], w=img.shape[1], h=img.shape[0], clip=True, eps=1E-3)
if self.augment:
# Albumentations
img, labels = self.albumentations(img, labels)
nl = len(labels) # update after albumentations
# HSV color-space
augment_hsv(img, hgain=hyp['hsv_h'], sgain=hyp['hsv_s'], vgain=hyp['hsv_v'])
# Flip up-down
if random.random() < hyp['flipud']:
img = np.flipud(img)
if nl:
labels[:, 2] = 1 - labels[:, 2]
# Flip left-right
if random.random() < hyp['fliplr']:
img = np.fliplr(img)
if nl:
labels[:, 1] = 1 - labels[:, 1]
# Cutouts
# labels = cutout(img, labels, p=0.5)
# nl = len(labels) # update after cutout
labels_out = torch.zeros((nl, 6))
if nl:
labels_out[:, 1:] = torch.from_numpy(labels)
# Convert
img = img.transpose((2, 0, 1))[::-1] # HWC to CHW, BGR to RGB
img = np.ascontiguousarray(img)
return torch.from_numpy(img), labels_out, self.im_files[index], shapes
def load_image(self, i):
# Loads 1 image from dataset index 'i', returns (im, original hw, resized hw)
im, f, fn = self.ims[i], self.im_files[i], self.npy_files[i],
if im is None: # not cached in RAM
if fn.exists(): # load npy
im = np.load(fn)
else: # read image
image = self.ds['images'][i]
im = image.data()['value']
#im = cv2.imread(f) # BGR
assert im is not None, f'Image Not Found {f}'
h0, w0 = im.shape[:2] # orig hw
r = self.img_size / max(h0, w0) # ratio
if r != 1: # if sizes are not equal
interp = cv2.INTER_LINEAR if (self.augment or r > 1) else cv2.INTER_AREA
im = cv2.resize(im, (math.ceil(w0 * r), math.ceil(h0 * r)), interpolation=interp)
return im, (h0, w0), im.shape[:2] # im, hw_original, hw_resized
return self.ims[i], self.im_hw0[i], self.im_hw[i] # im, hw_original, hw_resized
If I load the deeplake dataset in the init__()__ function of the dataset, and then access it in the getitem function I have a problem.
Expected/Desired Behavior
Customize a dataloader that reads data from deeplake and supports distributed training.
Python Version
Python 3.10.9 (main, Mar 8 2023, 10:47:38) [GCC 11.2.0] on linux
OS
No response
IDE
No response
Packages
No response
Additional Context
No response
Possible Solution
No response
Are you willing to submit a PR?
- [ ] I'm willing to submit a PR (Thank you!)