ChatGLM-Tuning

ValueError: 130004 is not in list

Open itxingqing opened this issue 2 years ago • 20 comments

When I am training the model with the following commands:

    !sed -i "s/THUDM\/chatglm-6b/\/openbayes\/home\/chatglm-6b/" finetune.py
    !cat finetune.py
    !python finetune.py \
        --dataset_path /output/train \
        --lora_rank 8 \
        --per_device_train_batch_size 6 \
        --gradient_accumulation_steps 1 \
        --max_steps 3000 \
        --save_steps 1000 \
        --save_total_limit 2 \
        --learning_rate 1e-4 \
        --fp16 \
        --remove_unused_columns false \
        --logging_steps 50 \
        --output_dir /output/lora

I get the error below. This is the finetune.py being run (after the sed substitution):

    from transformers.integrations import TensorBoardCallback
    from torch.utils.tensorboard import SummaryWriter
    from transformers import TrainingArguments
    from transformers import Trainer, HfArgumentParser
    from transformers import AutoTokenizer, AutoModel
    import torch
    import torch.nn as nn
    from peft import get_peft_model, LoraConfig, TaskType
    from dataclasses import dataclass, field
    import datasets
    import os


    tokenizer = AutoTokenizer.from_pretrained("/openbayes/home/chatglm-6b", trust_remote_code=True)


    @dataclass
    class FinetuneArguments:
        dataset_path: str = field(default="data/alpaca")
        model_path: str = field(default="output")
        lora_rank: int = field(default=8)


    class CastOutputToFloat(nn.Sequential):
        def forward(self, x):
            return super().forward(x).to(torch.float32)


    def data_collator(features: list) -> dict:
        len_ids = [len(feature["input_ids"]) for feature in features]
        longest = max(len_ids)
        input_ids = []
        labels_list = []
        for ids_l, feature in sorted(zip(len_ids, features), key=lambda x: -x[0]):
            ids = feature["input_ids"]
            seq_len = feature["seq_len"]
            labels = (
                [-100] * (seq_len - 1) + ids[(seq_len - 1) :] + [-100] * (longest - ids_l)
            )
            ids = ids + [tokenizer.pad_token_id] * (longest - ids_l)
            _ids = torch.LongTensor(ids)
            labels_list.append(torch.LongTensor(labels))
            input_ids.append(_ids)
        input_ids = torch.stack(input_ids)
        labels = torch.stack(labels_list)
        return {
            "input_ids": input_ids,
            "labels": labels,
        }


    class ModifiedTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False):
            return model(
                input_ids=inputs["input_ids"],
                labels=inputs["labels"],
            ).loss

        def save_model(self, output_dir=None, _internal_call=False):
            from transformers.trainer import TRAINING_ARGS_NAME

            os.makedirs(output_dir, exist_ok=True)
            torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))
            saved_params = {
                k: v.to("cpu") for k, v in self.model.named_parameters() if v.requires_grad
            }
            torch.save(saved_params, os.path.join(output_dir, "adapter_model.bin"))


    def main():
        writer = SummaryWriter()
        finetune_args, training_args = HfArgumentParser(
            (FinetuneArguments, TrainingArguments)
        ).parse_args_into_dataclasses()

        # init model
        model = AutoModel.from_pretrained(
            "/openbayes/home/chatglm-6b", load_in_8bit=True, trust_remote_code=True, device_map="auto"
        )
        model.gradient_checkpointing_enable()
        model.enable_input_require_grads()
        model.is_parallelizable = True
        model.model_parallel = True
        model.lm_head = CastOutputToFloat(model.lm_head)
        model.config.use_cache = (
            False  # silence the warnings. Please re-enable for inference!
        )

        # setup peft
        peft_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            inference_mode=False,
            r=finetune_args.lora_rank,
            lora_alpha=32,
            lora_dropout=0.1,
        )
        model = get_peft_model(model, peft_config)

        # load dataset
        dataset = datasets.load_from_disk(finetune_args.dataset_path)
        print(f"\n{len(dataset)=}\n")

        # start train
        trainer = ModifiedTrainer(
            model=model,
            train_dataset=dataset,
            args=training_args,
            callbacks=[TensorBoardCallback(writer)],
            data_collator=data_collator,
        )
        trainer.train()
        writer.close()
        # save model
        model.save_pretrained(training_args.output_dir)


    if __name__ == "__main__":
        main()

The run output:

    ===================================BUG REPORT===================================
    Welcome to bitsandbytes. For bug reports, please submit your error trace to:
    https://github.com/TimDettmers/bitsandbytes/issues

    /usr/local/lib/python3.8/site-packages/bitsandbytes/cuda_setup/main.py:136: UserWarning: WARNING: The following directories listed in your path were found to be non-existent: {PosixPath('/usr/local/nvidia/lib64'), PosixPath('/usr/local/nvidia/lib')}
      warn(msg)
    /usr/local/lib/python3.8/site-packages/bitsandbytes/cuda_setup/main.py:136: UserWarning: /usr/local/nvidia/lib:/usr/local/nvidia/lib64 did not contain libcudart.so as expected! Searching further paths...
      warn(msg)
    [... several more bitsandbytes "non-existent directory" warnings for other paths omitted ...]
    CUDA_SETUP: WARNING! libcudart.so not found in any environmental path. Searching /usr/local/cuda/lib64...
    CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
    CUDA SETUP: Highest compute capability among GPUs detected: 8.6
    CUDA SETUP: Detected CUDA version 117
    CUDA SETUP: Loading binary /usr/local/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...
    Explicitly passing a revision is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
    Explicitly passing a revision is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
    Explicitly passing a revision is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
    Overriding torch_dtype=None with torch_dtype=torch.float16 due to requirements of bitsandbytes to enable model loading in mixed int8. Either pass torch_dtype=torch.float16 or don't pass this argument at all to remove this warning.
    Loading checkpoint shards: 100%|██████████████████| 8/8 [00:06<00:00,  1.15it/s]

    len(dataset)=3438

    You are adding a <class 'transformers.integrations.TensorBoardCallback'> to the callbacks of this Trainer, but there is already one. The current list of callbacks is: DefaultFlowCallback TensorBoardCallback
    /usr/local/lib/python3.8/site-packages/transformers/optimization.py:391: FutureWarning: This implementation of AdamW is deprecated and will be removed in a future version. Use the PyTorch implementation torch.optim.AdamW instead, or set no_deprecation_warning=True to disable this warning
      warnings.warn(
      0%|          | 0/3000 [00:00<?, ?it/s]
    ../aten/src/ATen/native/cuda/Indexing.cu:1141: indexSelectLargeIndex: block: [150,0,0], thread: [96,0,0] Assertion srcIndex < srcSelectDimSize failed.
    [... the same assertion repeated for many other blocks and threads ...]
    Traceback (most recent call last):
      File "finetune.py", line 118, in <module>
        main()
      File "finetune.py", line 111, in main
        trainer.train()
      File "/usr/local/lib/python3.8/site-packages/transformers/trainer.py", line 1633, in train
        return inner_training_loop(
      File "/usr/local/lib/python3.8/site-packages/transformers/trainer.py", line 1902, in _inner_training_loop
        tr_loss_step = self.training_step(model, inputs)
      File "/usr/local/lib/python3.8/site-packages/transformers/trainer.py", line 2645, in training_step
        loss = self.compute_loss(model, inputs)
      File "finetune.py", line 54, in compute_loss
        return model(
      File "/usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
        return forward_call(*input, **kwargs)
      File "/usr/local/lib/python3.8/site-packages/peft/peft_model.py", line 663, in forward
        return self.base_model(
      File "/usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
        return forward_call(*input, **kwargs)
      File "/usr/local/lib/python3.8/site-packages/accelerate/hooks.py", line 165, in new_forward
        output = old_forward(*args, **kwargs)
      File "/root/.cache/huggingface/modules/transformers_modules/chatglm-6b/modeling_chatglm.py", line 1158, in forward
        transformer_outputs = self.transformer(
      File "/usr/local/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
        return forward_call(*input, **kwargs)
      File "/usr/local/lib/python3.8/site-packages/accelerate/hooks.py", line 165, in new_forward
        output = old_forward(*args, **kwargs)
      File "/root/.cache/huggingface/modules/transformers_modules/chatglm-6b/modeling_chatglm.py", line 915, in forward
        attention_mask = self.get_masks(
      File "/root/.cache/huggingface/modules/transformers_modules/chatglm-6b/modeling_chatglm.py", line 667, in get_masks
        context_lengths = [seq.tolist().index(self.config.bos_token_id) for seq in input_ids]
      File "/root/.cache/huggingface/modules/transformers_modules/chatglm-6b/modeling_chatglm.py", line 667, in <listcomp>
        context_lengths = [seq.tolist().index(self.config.bos_token_id) for seq in input_ids]
    ValueError: 130004 is not in list
      0%|          | 0/3000 [00:00<?, ?it/s]

The error is "ValueError: 130004 is not in list". How can I fix it?

itxingqing avatar Apr 10 '23 03:04 itxingqing

You can update to the latest code; the official side has updated the tokens.

https://github.com/mymusise/ChatGLM-Tuning/issues/138#issuecomment-1499869956

mymusise avatar Apr 10 '23 03:04 mymusise

You can update to the latest code; the official side has updated the tokens.

#138 (comment)

I just cloned the latest code this morning.

itxingqing avatar Apr 10 '23 03:04 itxingqing

You can update to the latest code; the official side has updated the tokens.

#138 (comment)

Could you please take a look?

itxingqing avatar Apr 10 '23 03:04 itxingqing

(screenshot)

itxingqing avatar Apr 10 '23 03:04 itxingqing

Don't the finetune data samples need a [MASK] token added? · Issue #79 · mymusise/ChatGLM-Tuning https://github.com/mymusise/ChatGLM-Tuning/issues/79

Aloha0424 commented 2 weeks ago: This problem shows up when the input is too long. The cause is that during encode the special tokens are appended first and truncation happens afterwards, so after truncation the appended special tokens 150001 and 150004 are lost. A quick fix is to patch prompt_ids in the preprocess function of tokenize_dataset_rows.py and set the last two IDs to 150001 and 150004: prompt_ids[-2] = 150001, prompt_ids[-1] = 150004

Then regenerate the data by re-running: python tokenize_dataset_rows.py --jsonl_path data/alpaca_data.jsonl --save_path data/alpaca --max_seq_length 200 --skip_overlength false

def preprocess(tokenizer, config, example, max_seq_length):
    prompt = example["context"]
    target = example["target"]
    prompt_ids = tokenizer.encode(prompt, max_length=max_seq_length, truncation=True)
    # added the two lines below .......
    prompt_ids[-2] = 150001
    prompt_ids[-1] = 150004
    # added the lines above, then regenerate the data .......
    target_ids = tokenizer.encode(
        target,
        max_length=max_seq_length,
        truncation=True,
        add_special_tokens=False)
    input_ids = prompt_ids + target_ids + [config.eos_token_id]
    
    return {"input_ids": input_ids, "seq_len": len(prompt_ids)}

starplatinum3 avatar Apr 11 '23 05:04 starplatinum3
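The failure mode described above (special tokens dropped by truncation) can be confirmed before training by scanning the tokenized dataset for the model's bos_token_id, which is exactly what get_masks() later searches for. This is a minimal sketch, not part of the repo's scripts; the checkpoint and dataset paths are the ones used earlier in this thread and may differ in your setup.

    from transformers import AutoConfig
    import datasets

    # Paths taken from the original report; adjust for your environment.
    config = AutoConfig.from_pretrained("/openbayes/home/chatglm-6b", trust_remote_code=True)
    dataset = datasets.load_from_disk("/output/train")

    # get_masks() in modeling_chatglm.py calls seq.tolist().index(config.bos_token_id),
    # so any example without that token raises "ValueError: ... is not in list".
    missing = [
        i for i, example in enumerate(dataset)
        if config.bos_token_id not in example["input_ids"]
    ]
    print(f"{len(missing)} of {len(dataset)} examples are missing bos_token_id={config.bos_token_id}")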

The ChatGLM-6B Hugging Face repo has been updated; you need to re-download the model and then run again (some of the official special token IDs have changed again).

ToSev7en avatar Apr 11 '23 07:04 ToSev7en

The ChatGLM-6B Hugging Face repo has been updated; you need to re-download the model and then run again (some of the official special token IDs have changed again).

This address: https://huggingface.co/THUDM/chatglm-6b ?

itxingqing avatar Apr 11 '23 07:04 itxingqing

The ChatGLM-6B Hugging Face repo has been updated; you need to re-download the model and then run again (some of the official special token IDs have changed again).

This address: https://huggingface.co/THUDM/chatglm-6b ?

Yes, and quite a few of the model and tokenizer files in it have changed.

ToSev7en avatar Apr 11 '23 07:04 ToSev7en
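If you are not sure whether the local copy already matches the updated Hugging Face repo, one quick check is to print the special-token IDs the local checkpoint actually defines and compare them with what the code expects. A sketch, not from this repo; the local path is the one used in this thread:

    from transformers import AutoConfig, AutoTokenizer

    path = "/openbayes/home/chatglm-6b"  # assumed local checkpoint path from this thread
    tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True)
    config = AutoConfig.from_pretrained(path, trust_remote_code=True)

    # modeling_chatglm.py looks for config.bos_token_id inside input_ids, so the IDs
    # produced by the tokenizer must come from the same checkpoint revision as the config.
    print("config.bos_token_id :", config.bos_token_id)
    print("config.eos_token_id :", config.eos_token_id)
    print("tokenizer bos/eos   :", tokenizer.bos_token_id, tokenizer.eos_token_id)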

The ChatGLM-6B Hugging Face repo has been updated; you need to re-download the model and then run again (some of the official special token IDs have changed again).

This address: https://huggingface.co/THUDM/chatglm-6b ?

Yes, and quite a few of the model and tokenizer files in it have changed.

Got it. Could I add you on WeChat?

itxingqing avatar Apr 11 '23 07:04 itxingqing

The ChatGLM-6B Hugging Face repo has been updated; you need to re-download the model and then run again (some of the official special token IDs have changed again).

This address: https://huggingface.co/THUDM/chatglm-6b ?

Yes, and quite a few of the model and tokenizer files in it have changed.

Got it. Could I add you on WeChat?

Have you solved this problem?

YSLLYW avatar Apr 30 '23 12:04 YSLLYW

I ran into this problem too. The root cause is that this code base is not aligned with the current open-source ChatGLM release. The fix is simple: follow the official p-tuning data preprocessing and just add one line: input_ids = tokenizer.build_inputs_with_special_tokens(a_ids, b_ids)

reference https://github.com/THUDM/ChatGLM-6B/blob/main/ptuning/main.py#L217

DBtxy avatar May 23 '23 12:05 DBtxy
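To see why this one-line change is robust against token-ID updates, you can compare the raw encodings with the combined result: build_inputs_with_special_tokens asks the currently loaded tokenizer to insert its own special tokens instead of hard-coding IDs. A minimal sketch (the example strings are arbitrary, and the model path is a placeholder):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm-6b", trust_remote_code=True)

    a_ids = tokenizer.encode("Some prompt text", add_special_tokens=False, truncation=True, max_length=200)
    b_ids = tokenizer.encode("Some target text", add_special_tokens=False, truncation=True, max_length=200)

    input_ids = tokenizer.build_inputs_with_special_tokens(a_ids, b_ids)
    # The length difference is the number of special tokens the current checkpoint inserts
    # (gMASK/bos/eos etc.), whatever their numeric values happen to be in that revision.
    print(len(a_ids), len(b_ids), len(input_ids))
    print(input_ids)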

I ran into this problem too. The root cause is that this code base is not aligned with the current open-source ChatGLM release. The fix is simple: follow the official p-tuning data preprocessing and just add one line: input_ids = tokenizer.build_inputs_with_special_tokens(a_ids, b_ids)

reference https://github.com/THUDM/ChatGLM-6B/blob/main/ptuning/main.py#L217

@DBtxy Which script exactly should the line input_ids = tokenizer.build_inputs_with_special_tokens(a_ids, b_ids) be added to? tokenization_chatglm.py?

briup1 avatar May 25 '23 08:05 briup1

I ran into this problem too. The root cause is that this code base is not aligned with the current open-source ChatGLM release. The fix is simple: follow the official p-tuning data preprocessing and just add one line: input_ids = tokenizer.build_inputs_with_special_tokens(a_ids, b_ids) reference https://github.com/THUDM/ChatGLM-6B/blob/main/ptuning/main.py#L217

@DBtxy Which script exactly should the line input_ids = tokenizer.build_inputs_with_special_tokens(a_ids, b_ids) be added to? tokenization_chatglm.py?

In tokenize_dataset_rows.py; just modify the preprocess function there. I'll paste my code for you as a reference:

def preprocess(tokenizer, config, example, max_source_length, max_target_length):
    prompt = example["context"]
    target = example["target"]
    # print("prompt: ", prompt)

    # print(">>>>>>>>>>>>>>>> ")

    # print("target: ", target)
    prompt_ids = tokenizer.encode(text=prompt, max_length=max_source_length, add_special_tokens=False, truncation=True)
    # added the lines below .......
    # prompt_ids[-2] = 150001
    # prompt_ids[-1] = 150004
    target_ids = tokenizer.encode(
        target,
        max_length=max_target_length,
        truncation=True,
        add_special_tokens=False)
    # input_ids = prompt_ids + target_ids + [config.eos_token_id]
    input_ids = tokenizer.build_inputs_with_special_tokens(prompt_ids, target_ids)
    return {"input_ids": input_ids, "seq_len": len(prompt_ids)}

DBtxy avatar May 25 '23 08:05 DBtxy
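For completeness, this is roughly how a preprocess function like the one above gets applied when regenerating the dataset. This is a hedged sketch, not the actual tokenize_dataset_rows.py: the argument parsing is omitted, the paths and lengths are placeholders, and it reuses the preprocess() defined in the comment above.

    import json
    import datasets
    from transformers import AutoConfig, AutoTokenizer

    model_path = "THUDM/chatglm-6b"  # placeholder; point this at your local checkpoint
    tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)

    def read_jsonl(path, max_source_length=200, max_target_length=200):
        # Yield one tokenized example per line of the jsonl file,
        # using the preprocess() function shown in the comment above.
        with open(path, "r", encoding="utf-8") as f:
            for line in f:
                example = json.loads(line)
                yield preprocess(tokenizer, config, example, max_source_length, max_target_length)

    dataset = datasets.Dataset.from_generator(lambda: read_jsonl("data/alpaca_data.jsonl"))
    dataset.save_to_disk("data/alpaca")  # then point finetune.py --dataset_path at this directory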

I ran into this problem too. The root cause is that this code base is not aligned with the current open-source ChatGLM release. The fix is simple: follow the official p-tuning data preprocessing and just add one line: input_ids = tokenizer.build_inputs_with_special_tokens(a_ids, b_ids) reference https://github.com/THUDM/ChatGLM-6B/blob/main/ptuning/main.py#L217

@DBtxy Which script exactly should the line input_ids = tokenizer.build_inputs_with_special_tokens(a_ids, b_ids) be added to? tokenization_chatglm.py?

In tokenize_dataset_rows.py; just modify the preprocess function there. I'll paste my code for you as a reference:

def preprocess(tokenizer, config, example, max_source_length, max_target_length):
    prompt = example["context"]
    target = example["target"]
    # print("prompt: ", prompt)

    # print(">>>>>>>>>>>>>>>> ")

    # print("target: ", target)
    prompt_ids = tokenizer.encode(text=prompt, max_length=max_source_length, add_special_tokens=False, truncation=True)
    # added the lines below .......
    # prompt_ids[-2] = 150001
    # prompt_ids[-1] = 150004
    target_ids = tokenizer.encode(
        target,
        max_length=max_target_length,
        truncation=True,
        add_special_tokens=False)
    # input_ids = prompt_ids + target_ids + [config.eos_token_id]
    input_ids = tokenizer.build_inputs_with_special_tokens(prompt_ids, target_ids)
    return {"input_ids": input_ids, "seq_len": len(prompt_ids)}

@DBtxy Brother, thanks for the quick reply, I saw it right away. Since I'm not running the p-tuning code I can't apply the change directly, but I understand your idea!! Thanks again!!!

briup1 avatar May 25 '23 09:05 briup1

tokenize_dataset_rows.py: bro, where is this tokenize_dataset_rows.py file? I can't find it.

254288008 avatar May 29 '23 13:05 254288008

tokenize_dataset_rows.py: bro, where is this tokenize_dataset_rows.py file? I can't find it.

The file I mentioned is right in this repo. In fact, whichever code you are using, just find the place where input_ids is constructed.

DBtxy avatar May 29 '23 13:05 DBtxy

tokenize_dataset_rows.py: bro, where is this tokenize_dataset_rows.py file? I can't find it.

The file I mentioned is right in this repo. In fact, whichever code you are using, just find the place where input_ids is constructed. (screenshot)

I am on the latest code, and it already has the line input_ids = tokenizer.build_inputs_with_special_tokens(a_ids, b_ids), but it still doesn't work. What could be going on here?

254288008 avatar May 29 '23 13:05 254288008

The error says 130004 is not in list, so why set prompt_ids[-2] = 150001 and prompt_ids[-1] = 150004?

guotong1988 avatar Jun 21 '23 01:06 guotong1988

tokenize_dataset_rows.py: bro, where is this tokenize_dataset_rows.py file? I can't find it.

The file I mentioned is right in this repo. In fact, whichever code you are using, just find the place where input_ids is constructed. (screenshot)

I am on the latest code, and it already has the line input_ids = tokenizer.build_inputs_with_special_tokens(a_ids, b_ids), but it still doesn't work. What could be going on here?

Hi, did you manage to solve it? Thanks.

origi6615 avatar Dec 21 '23 13:12 origi6615

I ran into this problem too. The root cause is that this code base is not aligned with the current open-source ChatGLM release. The fix is simple: follow the official p-tuning data preprocessing and just add one line: input_ids = tokenizer.build_inputs_with_special_tokens(a_ids, b_ids) reference https://github.com/THUDM/ChatGLM-6B/blob/main/ptuning/main.py#L217

@DBtxy Which script exactly should the line input_ids = tokenizer.build_inputs_with_special_tokens(a_ids, b_ids) be added to? tokenization_chatglm.py?

In tokenize_dataset_rows.py; just modify the preprocess function there. I'll paste my code for you as a reference:

def preprocess(tokenizer, config, example, max_source_length, max_target_length):
    prompt = example["context"]
    target = example["target"]
    # print("prompt: ", prompt)

    # print(">>>>>>>>>>>>>>>> ")

    # print("target: ", target)
    prompt_ids = tokenizer.encode(text=prompt, max_length=max_source_length, add_special_tokens=False, truncation=True)
    # added the lines below .......
    # prompt_ids[-2] = 150001
    # prompt_ids[-1] = 150004
    target_ids = tokenizer.encode(
        target,
        max_length=max_target_length,
        truncation=True,
        add_special_tokens=False)
    # input_ids = prompt_ids + target_ids + [config.eos_token_id]
    input_ids = tokenizer.build_inputs_with_special_tokens(prompt_ids, target_ids)
    return {"input_ids": input_ids, "seq_len": len(prompt_ids)}

Where is tokenize_dataset_rows.py located? I couldn't find it in the official code.

xihaofei avatar Dec 28 '23 07:12 xihaofei