R2GenCMN
R2GenCMN copied to clipboard
supplementary code/data
Hi, thank you for this interesting work. I would like to know whether the model weights are available and ~~whether there is a script to convert the XML files from the datasets (IU dataset) to an annotation.json file?~~
Are these the weights? https://github.com/cuhksz-nlp/R2GenCMN/blob/main/data/r2gencmn.md
Here is some code I wrote:
import pandas as pd
from os import listdir
from os.path import isfile, join
import xmltodict
from sklearn.model_selection import train_test_split
# Fixed seed so the train/val/test partition can be reproduced.
SEED = 0
path = 'path/to/ecgen-radiology/'
# os.listdir() returns entries in arbitrary order; sort them so the seeded
# split below selects the same files on every machine and filesystem.
annotationFiles = [
    join(path, f) for f in sorted(listdir(path)) if isfile(join(path, f))
]
# Hold out 30% of the files, then divide that hold-out 34/66 into
# validation and test (roughly 10% / 20% of all files).
train, tmp = train_test_split(annotationFiles, test_size=0.3, random_state=SEED)
val, test = train_test_split(tmp, test_size=0.66, random_state=SEED)
# Per-split accumulators for the annotation records.
train_lst = []
val_lst = []
test_lst = []
def get_ann_(lst, subset, subset_lst):
    """Parse report XML files and append one annotation per file to *subset_lst*.

    Args:
        lst: Iterable of paths to XML files to parse.
        subset: Value stored under the ``'split'`` key of every annotation.
        subset_lst: List that receives the annotation dicts (mutated in place).

    Each annotation has the keys ``'id'``, ``'report'``, ``'image_path'`` and
    ``'split'``. Files whose ``eCitation`` element has no ``parentImage``
    child are skipped.
    """
    for file in lst:
        # Only the read needs the file handle; close it before processing.
        with open(file) as xml_file:
            data_dict = xmltodict.parse(xml_file.read())

        citation = data_dict['eCitation']
        if 'parentImage' not in citation:
            continue

        # xmltodict returns a dict for a single element and a list for
        # repeated elements; normalise so both cases share one code path and
        # every image path gets the '.png' suffix.
        parent_images = citation['parentImage']
        if not isinstance(parent_images, list):
            parent_images = [parent_images]
        img_paths = [image['@id'] + '.png' for image in parent_images]
        ann_id = parent_images[0]['@id']

        # Same normalisation for the abstract sections: a lone AbstractText
        # element would otherwise be iterated key-by-key.
        sections = citation['MedlineCitation']['Article']['Abstract']['AbstractText']
        if not isinstance(sections, list):
            sections = [sections]
        # Sections without text contribute an empty string, so the joined
        # report keeps one slot per section.
        report = ' '.join(
            x['@Label'] + " " + x['#text'] if '#text' in x else ''
            for x in sections
        )

        subset_lst.append({
            'id': ann_id,
            'report': report,
            'image_path': img_paths,
            'split': subset,
        })
# Fill each split's accumulator from its file list, in train/val/test order.
for _files, _split_name, _bucket in (
    (train, 'train', train_lst),
    (val, 'val', val_lst),
    (test, 'test', test_lst),
):
    get_ann_(_files, _split_name, _bucket)

# Collect the three annotation lists under their split names.
res = dict(train=train_lst, val=val_lst, test=test_lst)