
Step 4. Format to Simpler Json Files

Open fatmalearning opened this issue 5 years ago • 6 comments

From Step 3 I ran the preprocessing on my own dataset and obtained the json files for Step 4 (Format to Simpler Json Files), but I get this error:

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/My Drive/Abstractive/pre-dataset/maping_data/mapping_valid.txt'

How can I get the urls/mapping files for my own dataset? Could you help me?

fatmalearning avatar Dec 09 '19 19:12 fatmalearning
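For context: in the CNN/DM pipeline the mapping_{train,valid,test}.txt files are just lists of story URLs, and format_to_lines matches hashhex(url) (a SHA1 hex digest) against the base names of the .json story files. So if you want to keep the stock pipeline, one option is to generate equivalent mapping files from your own document ids. A minimal sketch, assuming your step-3 output is one file per document id; write_mappings and the split ratios below are hypothetical helpers, not part of PreSumm:

import hashlib
import os
import random

def hashhex(s):
    # SHA1 hex digest, the same idea as hashhex in data_builder.py
    return hashlib.sha1(s.encode('utf-8')).hexdigest()

def write_mappings(doc_ids, map_dir, valid_ratio=0.05, test_ratio=0.05, seed=42):
    # Split your own document ids into train/valid/test and write the
    # mapping_*.txt files that format_to_lines expects. For the hash lookup
    # to match, your story/json files would then need to be named
    # hashhex(doc_id) + '.story' / '.json'.
    random.seed(seed)
    doc_ids = list(doc_ids)
    random.shuffle(doc_ids)
    n_valid = int(len(doc_ids) * valid_ratio)
    n_test = int(len(doc_ids) * test_ratio)
    splits = {'valid': doc_ids[:n_valid],
              'test': doc_ids[n_valid:n_valid + n_test],
              'train': doc_ids[n_valid + n_test:]}
    os.makedirs(map_dir, exist_ok=True)
    for corpus_type, ids in splits.items():
        with open(os.path.join(map_dir, 'mapping_%s.txt' % corpus_type), 'w') as f:
            f.write('\n'.join(ids))
    # return the id -> hashed file name map so you can rename your files to match
    return {doc_id: hashhex(doc_id) for doc_id in doc_ids}

The simpler route, though, is to bypass the mapping files entirely with a custom format_to_lines, as in the comment below.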

I have the same problem. I don't think they have coded it to preprocess any data except CNN/DailyMail.

Same problem here.

v-zmiycharov avatar Jan 09 '20 18:01 v-zmiycharov

def custom_format_to_lines(args):
    # Same idea as format_to_lines, but without the CNN/DM URL mapping files:
    # every *.json file in raw_path is treated as training data.
    train_files = glob.glob(pjoin(args.raw_path, '*.json'))

    corpora = {'train': train_files}
    for corpus_type in ['train']:
        a_lst = [(f, args) for f in corpora[corpus_type]]
        pool = Pool(args.n_cpus)
        dataset = []
        p_ct = 0
        for d in pool.imap_unordered(_format_to_lines, a_lst):
            dataset.append(d)
            if (len(dataset) > args.shard_size):
                # write a full shard and start the next one
                pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
                with open(pt_file, 'w') as save:
                    save.write(json.dumps(dataset))
                p_ct += 1
                dataset = []

        pool.close()
        pool.join()
        if (len(dataset) > 0):
            # write whatever is left as the final shard
            pt_file = "{:s}.{:s}.{:d}.json".format(args.save_path, corpus_type, p_ct)
            with open(pt_file, 'w') as save:
                save.write(json.dumps(dataset))
            p_ct += 1
            dataset = []
def custom_format_to_bert(args):
    # Same idea as format_to_bert, but defaults to the 'train' split only.
    # Note: the glob below assumes the shards were written with a save_path
    # ending in '/', i.e. files named '.train.0.json', and it only picks up
    # shard 0; the original format_to_bert matches every shard with
    # '*' + corpus_type + '.[0-9]*.json'.
    if (args.dataset != ''):
        datasets = [args.dataset]
    else:
        datasets = ['train']
    for corpus_type in datasets:
        a_lst = []
        print('.' + corpus_type + '.0.json')
        for json_f in glob.glob(pjoin(args.raw_path, '.' + corpus_type + '.0.json')):
            print(json_f)
            real_name = json_f.split('/')[-1]
            print(real_name)
            a_lst.append((corpus_type, json_f, args, pjoin(args.save_path, real_name.replace('json', 'bert.pt'))))
        print(a_lst)
        pool = Pool(args.n_cpus)
        for d in pool.imap(_format_to_bert, a_lst):
            pass

        pool.close()
        pool.join()

def _format_to_bert(params):
    corpus_type, json_file, args, save_file = params
    is_test = corpus_type == 'test'
    if (os.path.exists(save_file)):
        logger.info('Ignore %s' % save_file)
        return

    bert = BertData(args)

    logger.info('Processing %s' % json_file)
    jobs = json.load(open(json_file))
    datasets = []
    for d in jobs:
        source, tgt = d['src'], d['tgt']

        sent_labels = greedy_selection(source[:args.max_src_nsents], tgt, 3)
        if (args.lower):
            source = [' '.join(s).lower().split() for s in source]
            tgt = [' '.join(s).lower().split() for s in tgt]
        b_data = bert.preprocess(source, tgt, sent_labels, use_bert_basic_tokenizer=args.use_bert_basic_tokenizer, is_test=is_test)
        # b_data = bert.preprocess(source, tgt, sent_labels, use_bert_basic_tokenizer=args.use_bert_basic_tokenizer)

        if (b_data is None):
            continue
        src_subtoken_idxs, sent_labels, tgt_subtoken_idxs, segments_ids, cls_ids, src_txt, tgt_txt = b_data
        b_data_dict = {"src": src_subtoken_idxs, "tgt": tgt_subtoken_idxs,
                       "src_sent_labels": sent_labels, "segs": segments_ids, 'clss': cls_ids,
                       'src_txt': src_txt, "tgt_txt": tgt_txt}
        datasets.append(b_data_dict)
    logger.info('Processed instances %d' % len(datasets))
    logger.info('Saving to %s' % save_file)
    torch.save(datasets, save_file)
    datasets = []
    gc.collect()

I wrote some custom functions, just edited the existing ones to my own needs; you should add them to data_builder.py.

I also noticed an argument called args.dataset. I haven't personally used it, but see if it fits your use case. You can read the source code in data_builder.py to learn more.
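If I read preprocess.py right, it dispatches -mode through eval('data_builder.' + args.mode + '(args)'), so once the two functions above are in data_builder.py you should be able to call them from the command line; something like the following (all paths are placeholders for your own directories):

python preprocess.py -mode custom_format_to_lines -raw_path JSON_PATH -save_path MERGED_PATH/ -shard_size 2000 -n_cpus 4 -log_file ../logs/mydata.log
python preprocess.py -mode custom_format_to_bert -raw_path MERGED_PATH -save_path BERT_DATA_PATH -lower -n_cpus 4 -log_file ../logs/mydata.log

Note that -save_path in the first command is a file-name prefix, not a directory: with the trailing slash the shards come out as MERGED_PATH/.train.0.json, which is what the '.' + corpus_type + '.0.json' glob in custom_format_to_bert looks for.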

Hi, I'm trying to build a dataset in Swedish and I'm running into some problems with the preprocessing code and the changes above. Could you point me in the right direction as to where encoding might be an issue in the changes you made?

Tobias289 avatar Apr 15 '20 13:04 Tobias289
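On the encoding point: the snippets above rely on Python's default locale encoding when reading and writing the JSON shards, which is a common source of trouble with non-English text. A minimal, untested sketch of the places where you would pin UTF-8 (same functions as above, only the open/dumps calls change):

# in custom_format_to_lines: write shards as UTF-8 and keep å/ä/ö readable
with open(pt_file, 'w', encoding='utf-8') as save:
    save.write(json.dumps(dataset, ensure_ascii=False))

# in _format_to_bert: read each shard with an explicit encoding as well
jobs = json.load(open(json_file, encoding='utf-8'))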

same problem.

they only consider the cnn/dm...

Dod-o avatar Jul 06 '20 08:07 Dod-o

@shashankMadan-designEsthetics does your modified code only generate train files?

LusianaSiahaan avatar Mar 17 '22 01:03 LusianaSiahaan