setfit
setfit copied to clipboard
setfit can not get good result for chinese language?
The number of data labels is 4. the eval loss increase and the train loss decrease. I think it is overfit. Code is as below:
code
from setfit import SetFitModel, Trainer, TrainingArguments, sample_dataset, SetFitModelCardData
import ujson as json
from datasets import load_dataset, DatasetDict, Dataset
from transformers import EarlyStoppingCallback
import datetime
def convert_files_to_dataset(train_path, val_path):
# 定义一个函数来读取单个文件
def read_file(file_path):
with open(file_path, 'r', encoding='utf-8') as f:
lines = f.readlines()
data = [json.loads(line.strip()) for line in lines]
return data
# 读取训练集和验证集
train_data = read_file(train_path)
val_data = read_file(val_path)
# 将数据转换为Dataset
train_dataset = Dataset.from_list(train_data)
val_dataset = Dataset.from_list(val_data)
# 创建DatasetDict
dataset_dict = DatasetDict({'train': train_dataset, 'validation': val_dataset})
return dataset_dict
# 使用函数
train_path = 'train_cn.txt'
val_path = 'val_cn.txt'
dataset = convert_files_to_dataset(train_path, val_path)
from setfit import sample_dataset
train_dataset = sample_dataset(dataset["train"], num_samples=50)
print(train_dataset)
eval_dataset = dataset["validation"]
print(eval_dataset)
from setfit import SetFitModel
model = SetFitModel.from_pretrained('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2', model_card_data=SetFitModelCardData(
language=['en', 'de', 'nl'],
))
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import SetFitTrainer
trainer = SetFitTrainer(
model=model,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
loss_class=CosineSimilarityLoss,
num_iterations=20,
num_epochs=5
)
trainer.train()
metrics = trainer.evaluate()
print(metrics)```
log is as below:
Dataset({
features: ['text', 'label'],
num_rows: 200
})
Dataset({
features: ['text', 'label'],
num_rows: 40
})
/usr/local/matrix/conda3/envs/peft/lib/python3.8/site-packages/huggingface_hub/file_download.py:1132: FutureWarning: resume_download is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use force_download=True.
warnings.warn(
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
b.py:51: DeprecationWarning: SetFitTrainer has been deprecated and will be removed in v2.0.0 of SetFit. Please use Trainer instead.
trainer = SetFitTrainer(
Using evaluation_strategy="steps" as eval_steps is defined.
Map: 100%|████████████████████████████████████████████████████████████████████████████| 200/200 [00:00<00:00, 12981.44 examples/s]
***** Running training *****
Num unique pairs = 8000
Batch size = 16
Num epochs = 5
Total optimization steps = 2500
0%| | 0/2500 [00:00<?, ?it/s]
{'embedding_loss': 0.3173, 'learning_rate': 8e-08, 'epoch': 0.0} | 0/2500 [00:00<?, ?it/s]
{'embedding_loss': 0.2875, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.1}
{'eval_embedding_loss': 0.236, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.1}
{'embedding_loss': 0.2662, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.2}
{'eval_embedding_loss': 0.2351, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.2}
{'embedding_loss': 0.2741, 'learning_rate': 1.2e-05, 'epoch': 0.3}
{'eval_embedding_loss': 0.2319, 'learning_rate': 1.2e-05, 'epoch': 0.3}
{'embedding_loss': 0.2747, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.4}
{'eval_embedding_loss': 0.2331, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.4}
{'embedding_loss': 0.196, 'learning_rate': 2e-05, 'epoch': 0.5}
{'eval_embedding_loss': 0.2297, 'learning_rate': 2e-05, 'epoch': 0.5}
{'embedding_loss': 0.1512, 'learning_rate': 1.9555555555555557e-05, 'epoch': 0.6}
{'eval_embedding_loss': 0.2387, 'learning_rate': 1.9555555555555557e-05, 'epoch': 0.6}
{'embedding_loss': 0.0866, 'learning_rate': 1.9111111111111113e-05, 'epoch': 0.7}
{'eval_embedding_loss': 0.248, 'learning_rate': 1.9111111111111113e-05, 'epoch': 0.7}
{'embedding_loss': 0.0437, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.8}
{'eval_embedding_loss': 0.2427, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.8}
{'embedding_loss': 0.07, 'learning_rate': 1.8222222222222224e-05, 'epoch': 0.9}
{'eval_embedding_loss': 0.2474, 'learning_rate': 1.8222222222222224e-05, 'epoch': 0.9}
{'embedding_loss': 0.0332, 'learning_rate': 1.7777777777777777e-05, 'epoch': 1.0}
{'eval_embedding_loss': 0.2587, 'learning_rate': 1.7777777777777777e-05, 'epoch': 1.0}
{'embedding_loss': 0.0125, 'learning_rate': 1.7333333333333336e-05, 'epoch': 1.1}
{'eval_embedding_loss': 0.2573, 'learning_rate': 1.7333333333333336e-05, 'epoch': 1.1}
{'embedding_loss': 0.0023, 'learning_rate': 1.688888888888889e-05, 'epoch': 1.2}
{'eval_embedding_loss': 0.2648, 'learning_rate': 1.688888888888889e-05, 'epoch': 1.2}
{'embedding_loss': 0.0033, 'learning_rate': 1.6444444444444444e-05, 'epoch': 1.3}
{'eval_embedding_loss': 0.2659, 'learning_rate': 1.6444444444444444e-05, 'epoch': 1.3}
{'embedding_loss': 0.0011, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.4}
{'eval_embedding_loss': 0.2692, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.4}
{'embedding_loss': 0.0007, 'learning_rate': 1.555555555555556e-05, 'epoch': 1.5}
{'eval_embedding_loss': 0.2687, 'learning_rate': 1.555555555555556e-05, 'epoch': 1.5}
{'embedding_loss': 0.001, 'learning_rate': 1.5111111111111112e-05, 'epoch': 1.6}
{'eval_embedding_loss': 0.2739, 'learning_rate': 1.5111111111111112e-05, 'epoch': 1.6}
{'embedding_loss': 0.0012, 'learning_rate': 1.4666666666666666e-05, 'epoch': 1.7}
{'eval_embedding_loss': 0.2707, 'learning_rate': 1.4666666666666666e-05, 'epoch': 1.7}
{'embedding_loss': 0.0005, 'learning_rate': 1.4222222222222224e-05, 'epoch': 1.8}
{'eval_embedding_loss': 0.2684, 'learning_rate': 1.4222222222222224e-05, 'epoch': 1.8}
{'embedding_loss': 0.0006, 'learning_rate': 1.377777777777778e-05, 'epoch': 1.9}
{'eval_embedding_loss': 0.2756, 'learning_rate': 1.377777777777778e-05, 'epoch': 1.9}
{'embedding_loss': 0.0003, 'learning_rate': 1.3333333333333333e-05, 'epoch': 2.0}
{'eval_embedding_loss': 0.2698, 'learning_rate': 1.3333333333333333e-05, 'epoch': 2.0}
{'embedding_loss': 0.0007, 'learning_rate': 1.288888888888889e-05, 'epoch': 2.1}
{'eval_embedding_loss': 0.2745, 'learning_rate': 1.288888888888889e-05, 'epoch': 2.1}
{'embedding_loss': 0.0004, 'learning_rate': 1.2444444444444446e-05, 'epoch': 2.2}
{'eval_embedding_loss': 0.2771, 'learning_rate': 1.2444444444444446e-05, 'epoch': 2.2}
{'embedding_loss': 0.0005, 'learning_rate': 1.2e-05, 'epoch': 2.3}
{'eval_embedding_loss': 0.2742, 'learning_rate': 1.2e-05, 'epoch': 2.3}
{'embedding_loss': 0.0007, 'learning_rate': 1.1555555555555556e-05, 'epoch': 2.4}
{'eval_embedding_loss': 0.2719, 'learning_rate': 1.1555555555555556e-05, 'epoch': 2.4}
{'embedding_loss': 0.0002, 'learning_rate': 1.1111111111111113e-05, 'epoch': 2.5}
{'eval_embedding_loss': 0.2782, 'learning_rate': 1.1111111111111113e-05, 'epoch': 2.5}
{'embedding_loss': 0.0002, 'learning_rate': 1.0666666666666667e-05, 'epoch': 2.6}
{'eval_embedding_loss': 0.2721, 'learning_rate': 1.0666666666666667e-05, 'epoch': 2.6}
{'embedding_loss': 0.0002, 'learning_rate': 1.0222222222222223e-05, 'epoch': 2.7}
{'eval_embedding_loss': 0.2743, 'learning_rate': 1.0222222222222223e-05, 'epoch': 2.7}
{'embedding_loss': 0.0003, 'learning_rate': 9.777777777777779e-06, 'epoch': 2.8}
{'eval_embedding_loss': 0.2822, 'learning_rate': 9.777777777777779e-06, 'epoch': 2.8}
{'embedding_loss': 0.0003, 'learning_rate': 9.333333333333334e-06, 'epoch': 2.9}
{'eval_embedding_loss': 0.2758, 'learning_rate': 9.333333333333334e-06, 'epoch': 2.9}
{'embedding_loss': 0.0004, 'learning_rate': 8.888888888888888e-06, 'epoch': 3.0}
{'eval_embedding_loss': 0.2764, 'learning_rate': 8.888888888888888e-06, 'epoch': 3.0}
{'embedding_loss': 0.0004, 'learning_rate': 8.444444444444446e-06, 'epoch': 3.1}
{'eval_embedding_loss': 0.2798, 'learning_rate': 8.444444444444446e-06, 'epoch': 3.1}
{'embedding_loss': 0.0002, 'learning_rate': 8.000000000000001e-06, 'epoch': 3.2}
{'eval_embedding_loss': 0.2769, 'learning_rate': 8.000000000000001e-06, 'epoch': 3.2}
{'embedding_loss': 0.0004, 'learning_rate': 7.555555555555556e-06, 'epoch': 3.3}
{'eval_embedding_loss': 0.2766, 'learning_rate': 7.555555555555556e-06, 'epoch': 3.3}
{'embedding_loss': 0.0002, 'learning_rate': 7.111111111111112e-06, 'epoch': 3.4}
{'eval_embedding_loss': 0.2833, 'learning_rate': 7.111111111111112e-06, 'epoch': 3.4}
{'embedding_loss': 0.0002, 'learning_rate': 6.666666666666667e-06, 'epoch': 3.5}
{'eval_embedding_loss': 0.2755, 'learning_rate': 6.666666666666667e-06, 'epoch': 3.5}```