AI-TOD
AI-TOD copied to clipboard
Test set contaminated with Trainval objects!
Hi there!
I was hoping to use the AI-TOD dataset, but noticed that both v1 and v2 have contaminated the test set with trainval objects and parts of trainval images.
This is happening because the overlapping crops from each xView dataset image are split across the train, val, and test sets.
Additionally, this inflates the total number of objects.
Here are the stats I’ve found:
V1: Total number of contaminating xview bboxes: 84704; Total number of unique xview bboxes: 308286; Total number of xview bboxes: 443943; Total number of bboxes: 700621
V2: Total number of contaminating xview bboxes: 44801; Total number of unique xview bboxes: 419747; Total number of xview bboxes: 475857; Total number of bboxes: 752746
Given this overlap on xView, I'm concerned there may be additional contamination in the non-xView images. But without access to the script that creates the non-xView image crops, I'm not able to check this.
I sent an email 2 months ago regarding this but haven't heard back yet. Let me know if I’m mistaken!
Here's the script I created to measure these numbers:
import json
import os
import numpy as np
from tqdm import tqdm
def process_annos(annos):
    """Group bounding boxes by source-image id, shifted by each crop's offsets.

    Only images whose ``file_name`` splits into exactly four ``'_'``-separated
    fields are considered (presumably the xView crop naming scheme -- confirm
    against the dataset). For those, the first field is used as the source
    image id and the last two fields (after dropping the file extension) are
    parsed as floats and added to the first two entries of every bbox.

    Args:
        annos: dict with an ``'images'`` list (entries providing ``'id'`` and
            ``'file_name'``) and an ``'annotations'`` list (entries providing
            ``'image_id'``, ``'bbox'`` and, optionally, ``'iscrowd'``).

    Returns:
        dict mapping source-image id (str) to a list of float numpy arrays,
        one per kept annotation, holding the offset-shifted bbox.
    """
    id_to_filename = dict()
    print('Process Images:')
    for img_ann in tqdm(annos['images']):
        filename = img_ann['file_name']
        # Keep only crops whose name has exactly four '_'-separated fields.
        if len(filename.split('_')) == 4:
            id_to_filename[img_ann['id']] = filename

    xview_id_to_ori_bboxes = dict()
    print('Process Annotations:')
    for ann in tqdm(annos['annotations']):
        # Treat a missing 'iscrowd' key as "not a crowd" instead of raising.
        if ann.get('iscrowd', 0):
            continue
        filename = id_to_filename.get(ann['image_id'])
        if filename is None:
            # Annotation belongs to an image that was filtered out above.
            continue
        fields = os.path.splitext(filename)[0].split('_')
        xview_id = fields[0]
        offsets = np.array([float(coord) for coord in fields[-2:]])
        # Force a float array: with integer bbox values np.array() would infer
        # an integer dtype and the in-place float addition below would raise
        # a casting error.
        bbox = np.array(ann['bbox'], dtype=float)
        # Shift the first two bbox entries by the crop offsets.
        bbox[:2] += offsets
        xview_id_to_ori_bboxes.setdefault(xview_id, []).append(bbox)
    return xview_id_to_ori_bboxes
# Annotation files to compare. Swap in the commented-out paths to analyse
# AI-TOD-v2 instead of v1.
aitod_test_filename = './aitod_test_v1_1.0.json'
# aitod_test_filename = './AI-TOD-v2/aitodv2_test.json'
with open(aitod_test_filename, 'r') as f:
    annos_test = json.load(f)
print('Process Test:')
xview_id_to_ori_bboxes_test = process_annos(annos_test)

aitod_trainval_filename = './aitod_trainval_v1_1.0.json'
# aitod_trainval_filename = './AI-TOD-v2/aitodv2_trainval.json'
with open(aitod_trainval_filename, 'r') as f:
    annos_trainval = json.load(f)
print('Process Trainval:')
xview_id_to_ori_bboxes_trainval = process_annos(annos_trainval)

# Every annotation in both files, including those process_annos() skips.
total_num_bboxes = len(annos_test['annotations']) + len(annos_trainval['annotations'])
total_num_contaminating_bboxes = 0
total_num_xview_bboxes = 0
total_num_unique_xview_bboxes = 0

for xv_id, bboxes_test in xview_id_to_ori_bboxes_test.items():
    # Count every test box, even when the source image has no trainval crops,
    # so the printed totals really cover all boxes returned by process_annos().
    total_num_xview_bboxes += len(bboxes_test)
    # duplicates also exist among test bboxes, remove these
    bboxes_test = np.unique(bboxes_test, axis=0)
    total_num_unique_xview_bboxes += len(bboxes_test)

    bboxes_train = xview_id_to_ori_bboxes_trainval.get(xv_id)
    if bboxes_train is None:
        # Source image only appears in the test split: nothing to compare.
        continue
    # duplicates also exist among trainval bboxes, remove these
    bboxes_train = np.unique(bboxes_train, axis=0)

    # Both inputs are de-duplicated, so a row occurring more than once in the
    # concatenation must be present in both the test and the trainval split.
    all_bboxes = np.concatenate((bboxes_test, bboxes_train))
    unique_bboxes, counts = np.unique(all_bboxes, axis=0, return_counts=True)
    contaminating_bboxes = unique_bboxes[counts > 1]
    num_contaminating_bboxes = len(contaminating_bboxes)
    if num_contaminating_bboxes > 0:
        print(f'Xview id: {xv_id} \t Contaminating bboxes: {num_contaminating_bboxes}')
    total_num_contaminating_bboxes += num_contaminating_bboxes

# Trainval boxes are tallied for every source image, including those with no
# test crops. The unique total is the sum of the per-split unique counts, so a
# box present in both splits contributes twice to it.
for bboxes_train in xview_id_to_ori_bboxes_trainval.values():
    total_num_xview_bboxes += len(bboxes_train)
    total_num_unique_xview_bboxes += len(np.unique(bboxes_train, axis=0))

print(f'Total number of contaminating xview bboxes: {total_num_contaminating_bboxes}')
print(f'Total number of unique xview bboxes: {total_num_unique_xview_bboxes}')
print(f'Total number of xview bboxes: {total_num_xview_bboxes}')
print(f'Total number of bboxes: {total_num_bboxes}')