ssd_keras
Bounding box predictions are concentrated in the top-left corner
Hi! I have a problem with the SSD300 implementation. I'm using a dataset of 1000 images: 750 for training and 250 for validation. My dataset has only one positive class.
My training code is the following:
```python
img_height = 300
img_width = 300
img_channels = 3
mean_color = [123, 117, 104]
swap_channels = [2, 1, 0]
n_classes = 1
scales_pascal = [0.1, 0.2, 0.37, 0.54, 0.71, 0.88, 1.05]
scales_coco = [0.07, 0.15, 0.33, 0.51, 0.69, 0.87, 1.05]
scales = scales_pascal
aspect_ratios = [[1.0, 2.0, 0.5],
                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                 [1.0, 2.0, 0.5, 3.0, 1.0/3.0],
                 [1.0, 2.0, 0.5],
                 [1.0, 2.0, 0.5]]
two_boxes_for_ar1 = True
steps = [8, 16, 32, 64, 100, 300]
offsets = [0.5, 0.5, 0.5, 0.5, 0.5, 0.5]
clip_boxes = False
variances = [0.1, 0.1, 0.2, 0.2]
normalize_coords = True

# Build the SSD300 in 'training' mode and load the reduced VGG-16 weights.
K.clear_session()

model = ssd_300(image_size=(img_height, img_width, img_channels), n_classes=n_classes, mode='training',
                l2_regularization=0.0005, scales=scales, aspect_ratios_per_layer=aspect_ratios,
                two_boxes_for_ar1=two_boxes_for_ar1, steps=steps, offsets=offsets, clip_boxes=clip_boxes,
                variances=variances, normalize_coords=normalize_coords, subtract_mean=mean_color,
                swap_channels=swap_channels)

weights_path = 'VGG_weights/VGG_ILSVRC_16_layers_fc_reduced.h5'
model.load_weights(weights_path, by_name=True)
adam = Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)
ssd_loss = SSDLoss(neg_pos_ratio=3, alpha=1.0)
model.compile(optimizer=adam, loss=ssd_loss.compute_loss)

# Data generators for the Pascal-VOC-style dataset (one positive class: 'Plant').
train_dataset = DataGenerator(load_images_into_memory=True, hdf5_dataset_path=None)
val_dataset = DataGenerator(load_images_into_memory=True, hdf5_dataset_path=None)

myDataSet_train_images_dir = 'myDatasets/Training/JPEGImages/'
myDataSet_train_annotations_dir = 'myDatasets/Training/Annotations/'
myDataSet_trainval_image_set_filename = 'myDatasets/Training/ImageSets/Main/default.txt'

myDataSet_test_images_dir = 'myDatasets/Testing/JPEGImages/'
myDataSet_test_annotations_dir = 'myDatasets/Testing/Annotations/'
myDataSet_test_image_set_filename = 'myDatasets/Testing/ImageSets/Main/default.txt'

classes = ['background', 'Plant']

train_dataset.parse_xml(images_dirs=[myDataSet_train_images_dir],
                        image_set_filenames=[myDataSet_trainval_image_set_filename],
                        annotations_dirs=[myDataSet_train_annotations_dir],
                        classes=classes, include_classes='all',
                        exclude_truncated=False, exclude_difficult=False, ret=False)

val_dataset.parse_xml(images_dirs=[myDataSet_test_images_dir],
                      image_set_filenames=[myDataSet_test_image_set_filename],
                      annotations_dirs=[myDataSet_test_annotations_dir],
                      classes=classes, include_classes='all',
                      exclude_truncated=False, exclude_difficult=True, ret=False)

batch_size = 5

# Augmentation chain for training, plain resizing for validation.
ssd_data_augmentation = SSDDataAugmentation(img_height=img_height, img_width=img_width, background=mean_color)
convert_to_3_channels = ConvertTo3Channels()
resize = Resize(height=img_height, width=img_width)

predictor_sizes = [model.get_layer('conv4_3_norm_mbox_conf').output_shape[1:3],
                   model.get_layer('fc7_mbox_conf').output_shape[1:3],
                   model.get_layer('conv6_2_mbox_conf').output_shape[1:3],
                   model.get_layer('conv7_2_mbox_conf').output_shape[1:3],
                   model.get_layer('conv8_2_mbox_conf').output_shape[1:3],
                   model.get_layer('conv9_2_mbox_conf').output_shape[1:3]]

ssd_input_encoder = SSDInputEncoder(img_height=img_height, img_width=img_width, n_classes=n_classes,
                                    predictor_sizes=predictor_sizes, scales=scales,
                                    aspect_ratios_per_layer=aspect_ratios, two_boxes_for_ar1=two_boxes_for_ar1,
                                    steps=steps, offsets=offsets, clip_boxes=clip_boxes, variances=variances,
                                    matching_type='multi', pos_iou_threshold=0.5, neg_iou_limit=0.5,
                                    normalize_coords=normalize_coords)

train_generator = train_dataset.generate(batch_size=batch_size, shuffle=True,
                                         transformations=[ssd_data_augmentation],
                                         label_encoder=ssd_input_encoder,
                                         returns={'processed_images', 'encoded_labels'},
                                         keep_images_without_gt=False)

val_generator = val_dataset.generate(batch_size=batch_size, shuffle=False,
                                     transformations=[convert_to_3_channels, resize],
                                     label_encoder=ssd_input_encoder,
                                     returns={'processed_images', 'encoded_labels'},
                                     keep_images_without_gt=False)

train_dataset_size = train_dataset.get_dataset_size()
val_dataset_size = val_dataset.get_dataset_size()

print("Number of images in the training dataset:\t{:>6}".format(train_dataset_size))
print("Number of images in the validation dataset:\t{:>6}".format(val_dataset_size))

def lr_schedule(epoch):
    if epoch < 300:
        return 0.0001
    elif epoch < 450:
        return 0.00001
    else:
        return 0.000001

model_checkpoint = ModelCheckpoint(filepath='ssd300_pascal_07+12_epoch-{epoch:02d}_loss-{loss:.4f}_val_loss-{val_loss:.4f}.h5',
                                   monitor='val_loss', verbose=1, save_best_only=True,
                                   save_weights_only=False, mode='auto', period=1)
csv_logger = CSVLogger(filename='ssd300_pascal_07+12_training_log.csv', separator=',', append=True)
learning_rate_scheduler = LearningRateScheduler(schedule=lr_schedule, verbose=1)
terminate_on_nan = TerminateOnNaN()
callbacks = [model_checkpoint, csv_logger, learning_rate_scheduler, terminate_on_nan]

initial_epoch = 0
final_epoch = 1000
steps_per_epoch = 1000

history = model.fit_generator(generator=train_generator,
                              steps_per_epoch=steps_per_epoch,
                              epochs=final_epoch,
                              callbacks=callbacks,
                              validation_data=val_generator,
                              validation_steps=ceil(val_dataset_size/batch_size),
                              initial_epoch=initial_epoch)
```
The inference code is the following:

```python
img_height = 300
img_width = 300

model_path = 'ssd300_pascal_07+12_epoch-180_loss-3.5966_val_loss-3.3306.h5'
ssd_loss = SSDLoss(neg_pos_ratio=3, n_neg_min=0, alpha=1.0)

K.clear_session()

model = load_model(model_path, custom_objects={'AnchorBoxes': AnchorBoxes,
                                               'L2Normalization': L2Normalization,
                                               'DecodeDetections': DecodeDetections,
                                               'compute_loss': ssd_loss.compute_loss})

orig_images = []   # original images for plotting
input_images = []  # resized images for the network input

img_path = 'myDatasets/Testing/JPEGImages/scene00371.png'

orig_images.append(imread(img_path))
img = image.load_img(img_path, target_size=(img_height, img_width))
img = image.img_to_array(img)
input_images.append(img)
input_images = np.array(input_images)

y_pred = model.predict(input_images)

confidence_threshold = 0.25

y_pred_thresh = [y_pred[k][y_pred[k,:,1] > confidence_threshold] for k in range(y_pred.shape[0])]

np.set_printoptions(precision=2, suppress=True, linewidth=90, threshold=sys.maxsize)
print("Predicted boxes:\n")
print('class conf xmin ymin xmax ymax')
print(y_pred_thresh[0])

colors = plt.cm.hsv(np.linspace(0, 1, 2)).tolist()
classes = ['background', 'Plant']

plt.figure(figsize=(20,12))
plt.imshow(orig_images[0])

current_axis = plt.gca()

for box in y_pred_thresh[0]:
    xmin = box[2] * orig_images[0].shape[1] / img_width
    ymin = box[3] * orig_images[0].shape[0] / img_height
    xmax = box[4] * orig_images[0].shape[1] / img_width
    ymax = box[5] * orig_images[0].shape[0] / img_height
    color = colors[round(box[0])]
    label = '{}: {:.2f}'.format(classes[round(box[0])], box[1])
    current_axis.add_patch(plt.Rectangle((xmin, ymin), xmax-xmin, ymax-ymin, color=color, fill=False, linewidth=2))
    current_axis.text(xmin, ymin, label, size='x-large', color='white', bbox={'facecolor':color, 'alpha':1.0})

plt.show()
```
That's the output from the network:
An example from the image annotation XML file:
```xml
<annotation>
    <folder/>
    <filename>scene00058.png</filename>
    <source>
        <database>Unknown</database>
        <annotation>Unknown</annotation>
        <image>Unknown</image>
    </source>
    <size>
        <width>1280</width>
        <height>720</height>
        <depth/>
    </size>
    <segmented>0</segmented>
    <object>
        <name>Plant</name>
        <occluded>0</occluded>
        <bndbox>
            <xmin>33.02</xmin>
            <ymin>13.54</ymin>
            <xmax>105.77000000000001</xmax>
            <ymax>610.24</ymax>
        </bndbox>
        <attributes>
            <attribute>
                <name>track_id</name>
                <value>1</value>
            </attribute>
            <attribute>
                <name>keyframe</name>
                <value>True</value>
            </attribute>
        </attributes>
    </object>
    <object>
        <name>Plant</name>
        <occluded>0</occluded>
        <bndbox>
            <xmin>189.63</xmin>
            <ymin>10.2</ymin>
            <xmax>262.43</xmax>
            <ymax>545.78</ymax>
        </bndbox>
        <attributes>
            <attribute>
                <name>track_id</name>
                <value>2</value>
            </attribute>
            <attribute>
                <name>keyframe</name>
                <value>True</value>
            </attribute>
        </attributes>
    </object>
    <object>
```
I really don't know why this isn't working; I only made parameter-tuning changes to the original code.
I think this is related to a difference between training vs. inference modes.
The predictions are correct, but the coordinates are scaled between 0 and 1 on both the x and y axes (consistent with normalize_coords=True in your setup), while the image keeps its original size. As a result, all of the predicted boxes end up close to the point (0, 0), which is the top-left corner of the image. You need to make sure the predictions and the image are on the same scale.
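To illustrate the mismatch, if you stayed with the manual route you would have to multiply the normalized coordinates by the original image size rather than by orig_shape / 300. A minimal sketch, assuming each row of y_pred_thresh[0] really holds [class, conf, xmin, ymin, xmax, ymax] with coordinates in the [0, 1] range as described above:

```python
# Sketch only: rescale normalized [0, 1] box coordinates to original-image pixels.
# Assumes the rows of y_pred_thresh[0] are [class, conf, xmin, ymin, xmax, ymax]
# with coordinates normalized to [0, 1].
orig_h, orig_w = orig_images[0].shape[:2]

for box in y_pred_thresh[0]:
    xmin = box[2] * orig_w  # multiply by the original width, don't divide by img_width
    ymin = box[3] * orig_h
    xmax = box[4] * orig_w
    ymax = box[5] * orig_h
```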
As a workaround you can decode the predictions from y_pred with decode_detections instead of doing it manually. Make sure to import decode_detections first:
```python
from ssd_encoder_decoder.ssd_output_decoder import decode_detections
```
Here's how you can do this:
```python
y_pred = model.predict(input_images)

confidence_threshold = 0.25

y_pred_decoded = decode_detections(y_pred,
                                   confidence_thresh=confidence_threshold,
                                   iou_threshold=0.5,
                                   top_k=1000,
                                   normalize_coords=True,
                                   img_height=orig_images[0].shape[0],
                                   img_width=orig_images[0].shape[1])

np.set_printoptions(precision=2, suppress=True, linewidth=90, threshold=sys.maxsize)
print("Predicted boxes:\n")
print('class conf xmin ymin xmax ymax')
print(y_pred_decoded[0])

colors = plt.cm.hsv(np.linspace(0, 1, 2)).tolist()
classes = ['background', 'Plant']

plt.figure(figsize=(20,12))
plt.imshow(orig_images[0])

current_axis = plt.gca()

# decode_detections already returns pixel coordinates for the original image,
# so the boxes can be drawn without any further rescaling.
for box in y_pred_decoded[0]:
    xmin = box[2]
    ymin = box[3]
    xmax = box[4]
    ymax = box[5]
    color = colors[round(box[0])]
    label = '{}: {:.2f}'.format(classes[round(box[0])], box[1])
    current_axis.add_patch(plt.Rectangle((xmin, ymin), xmax-xmin, ymax-ymin, color=color, fill=False, linewidth=2))
    current_axis.text(xmin, ymin, label, size='x-large', color='white', bbox={'facecolor':color, 'alpha':1.0})

plt.show()
```
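If you prefer to handle this at the model level instead (this is the training-vs-inference-mode difference mentioned above), you can rebuild the SSD300 in 'inference' mode and load only the weights from your checkpoint: the model then ends in a DecodeDetections layer and model.predict returns already-decoded boxes in 300x300 input-image pixels, so your original rescaling by orig_shape / img_width works as written. A rough sketch, reusing the configuration variables from the training script (the decoding thresholds below are illustrative, not values from your post):

```python
# Rough sketch: rebuild the model in 'inference' mode so decoding happens inside the graph.
# Reuses the configuration variables (scales, aspect_ratios, steps, ...) from the training script.
K.clear_session()

model = ssd_300(image_size=(img_height, img_width, 3), n_classes=1, mode='inference',
                l2_regularization=0.0005, scales=scales, aspect_ratios_per_layer=aspect_ratios,
                two_boxes_for_ar1=True, steps=steps, offsets=offsets, clip_boxes=False,
                variances=variances, normalize_coords=True, subtract_mean=[123, 117, 104],
                swap_channels=[2, 1, 0],
                confidence_thresh=0.25,  # illustrative decoding thresholds
                iou_threshold=0.45, top_k=200, nms_max_output_size=400)

# Load only the weights from the checkpoint saved during training.
model.load_weights('ssd300_pascal_07+12_epoch-180_loss-3.5966_val_loss-3.3306.h5', by_name=True)

# Each row of y_pred[0] is now [class, conf, xmin, ymin, xmax, ymax] in 300x300 input pixels,
# so rescaling by orig_images[0].shape[1] / img_width and shape[0] / img_height is correct.
y_pred = model.predict(input_images)
```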