MOSS
How to fix: cannot find "./sft_data/train.jsonl"
We can convert the dataset in SFT_data/conversations to ./sft_data/train.jsonl. Here is the code:
# prepare_moss_sft.py
"""
Prepare the train and val datasets for MOSS SFT.

This script does not generate train.jsonl; it saves the train/val datasets to
train_data and val_data, which can also be loaded by the SFTDataset class.
"""
import os
import json
import copy

import torch
from transformers import AutoTokenizer


def tokenize_dataset(file, tokenizer):
    """Read one json file and tokenize it."""
    data = []
    no_loss_spans = []
    with open(file, "r") as f:
        try:
            sample = json.load(f)

            chat = sample['chat']
            num_turns = int(sample['num_turns'])

            meta_instruction = sample['meta_instruction']
            instruction_ids = tokenizer.encode(meta_instruction)
            assert isinstance(instruction_ids, list) and len(instruction_ids) > 0

            input_ids = copy.deepcopy(instruction_ids)
            _no_loss_spans = [(0, len(instruction_ids))]

            for i in range(num_turns):
                cur_turn_ids = []
                cur_no_loss_spans = []
                cur_turn = chat[f'turn_{i+1}']
                for key, value in cur_turn.items():
                    cur_ids = tokenizer.encode(value)

                    if key == 'Tool Responses':
                        # The format tokens (<|Results|>:...<eor>\n) should have losses.
                        cur_no_loss_spans.append(
                            (len(input_ids + cur_turn_ids) + 5,
                             len(input_ids + cur_turn_ids + cur_ids) - 2))

                    assert isinstance(cur_ids, list) and len(cur_ids) > 0

                    cur_turn_ids.extend(cur_ids)

                if len(input_ids + cur_turn_ids) > 2048:
                    break

                input_ids.extend(cur_turn_ids)
                _no_loss_spans.extend(cur_no_loss_spans)

            assert len(input_ids) > 0 and len(input_ids) <= 2048

            data = input_ids
            no_loss_spans = _no_loss_spans
        except json.JSONDecodeError as e:
            print(f"Error parsing {file}: {e}")

    return data, no_loss_spans


def combine_json(dir, tokenizer, split):
    files = []
    # there may be several json files in dir
    for item in os.listdir(dir):
        item_path = os.path.join(dir, item)
        if os.path.isfile(item_path) and item_path.endswith(".json"):  # only json files
            files.append(item_path)

    data = []
    no_loss_spans = []
    for file in files:
        # each file becomes one sample
        _data, _no_loss_spans = tokenize_dataset(file, tokenizer)
        data.append(_data)
        no_loss_spans.append(_no_loss_spans)

    # split the data into train data and val data
    train_len = int(len(data) * split)
    train_data = data[:train_len]
    val_data = data[train_len:]

    train_len = int(len(no_loss_spans) * split)
    train_no_loss_spans = no_loss_spans[:train_len]
    val_no_loss_spans = no_loss_spans[train_len:]

    print("train len:", len(train_data), len(train_no_loss_spans))
    print("val len:", len(val_data), len(val_no_loss_spans))

    return train_data, train_no_loss_spans, val_data, val_no_loss_spans


def prepare_dataset(data_dir, out_dir, tokenizer, split=0.85):
    """Parse all json files under a directory tree."""
    train_data = []
    train_no_loss_spans = []
    val_data = []
    val_no_loss_spans = []

    for root, dirs, _ in os.walk(data_dir):
        for dir in dirs:
            dir = os.path.join(root, dir)
            print(dir)
            _train_data, _train_no_loss_spans, _val_data, _val_no_loss_spans = combine_json(dir, tokenizer, split)
            train_data += _train_data
            val_data += _val_data
            train_no_loss_spans += _train_no_loss_spans
            val_no_loss_spans += _val_no_loss_spans

    data_type = ['train', 'val']

    data_file = os.path.join(out_dir, f'{data_type[0]}_data')
    no_loss_spans_file = os.path.join(out_dir, f'{data_type[0]}_no_loss_spans')
    torch.save(train_data, data_file)
    torch.save(train_no_loss_spans, no_loss_spans_file)

    data_file = os.path.join(out_dir, f'{data_type[1]}_data')
    no_loss_spans_file = os.path.join(out_dir, f'{data_type[1]}_no_loss_spans')
    torch.save(val_data, data_file)
    torch.save(val_no_loss_spans, no_loss_spans_file)

    print("train samples:", len(train_data), len(train_no_loss_spans))
    print("val samples:", len(val_data), len(val_no_loss_spans))


if __name__ == "__main__":
    # tokenizer
    model_name_or_path = "fnlp/moss-moon-003-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, trust_remote_code=True)
    # The eos_token_id of the base model is 106028; we need to map the eos token to <eom> (token id 106068).
    tokenizer.eos_token_id = 106068

    indir = "./SFT_data/conversations/conversation_without_plugins/"
    outdir = "./sft_data/"
    prepare_dataset(indir, outdir, tokenizer)
This method does not generate train.jsonl; it saves the train and val datasets to train_data and val_data, which can also be loaded by the SFTDataset class.
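If you want to sanity-check the saved files, here is a minimal sketch of how they could be read back with torch.load. It only assumes the script above has already written ./sft_data/train_data and ./sft_data/train_no_loss_spans; how SFTDataset in finetune_moss.py actually consumes them may differ, so this is just for inspection.

# inspect_saved_sft_data.py -- minimal sketch, assuming prepare_moss_sft.py has run.
import torch

train_data = torch.load("./sft_data/train_data")                    # list of token-id lists
train_no_loss_spans = torch.load("./sft_data/train_no_loss_spans")  # list of (start, end) spans

print("train samples:", len(train_data))
print("first sample length:", len(train_data[0]))
print("no-loss spans of first sample:", train_no_loss_spans[0])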
you said 'we can convert dataset in SFT_data/conversations to ./sft_data/train.jsonl Here is the code:' in the beginning, and then you said 'This method will not generate the train.jsonl, and it save the train&val dataset to train_data and val_data. It also can be loaded by class SFTDataset.' wtf?
Maybe I didn't explain clearly. The load_data method in finetune_moss.py has two ways to load data: one reads the dataset from a jsonl file, extracts the content, and saves it to train_data and val_data; the other reads directly from the train_data and val_data saved earlier. The code above saves train_data and val_data, not train.jsonl. A sketch of the two paths follows.
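The branching I mean looks roughly like this. It is an illustrative sketch, not the real load_data in finetune_moss.py: the helper name, the jsonl handling, and the caching step are all assumptions, and the real function does full conversation tokenization rather than only encoding the meta_instruction.

# Hypothetical sketch of the two loading paths (not the actual finetune_moss.py code).
import os
import json
import torch

def load_data_sketch(data_dir, split, tokenizer):
    data_file = os.path.join(data_dir, f"{split}_data")
    spans_file = os.path.join(data_dir, f"{split}_no_loss_spans")

    if os.path.exists(data_file) and os.path.exists(spans_file):
        # Way 2: read directly from the files saved by prepare_moss_sft.py.
        return torch.load(data_file), torch.load(spans_file)

    # Way 1: read {split}.jsonl, tokenize each conversation, then cache the
    # result so the next run can take way 2. (Tokenization is simplified here.)
    data, no_loss_spans = [], []
    with open(os.path.join(data_dir, f"{split}.jsonl")) as f:
        for line in f:
            sample = json.loads(line)
            ids = tokenizer.encode(sample["meta_instruction"])  # simplified
            data.append(ids)
            no_loss_spans.append([(0, len(ids))])
    torch.save(data, data_file)
    torch.save(no_loss_spans, spans_file)
    return data, no_loss_spans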
I got you. I have successfully generated the train_data and val_data and finetuned my custom model. I would say that your shared code is extremely helpful! Thanks a lot!