CoLLiE
Error when expanding the llama-2-7b vocabulary
I'm using the dev branch with the script in examples/further_pretrain_llama; the launch command is:
torchrun --rdzv_backend=c10d --rdzv_endpoint=localhost:29402 --nnodes=1 --nproc_per_node=8 expand_vocab.py
The only changes are the LLaMA paths: the config, the tokenizer, and model.from_pretrained (roughly as sketched below).
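For reference, the edits amount to something like the following. This is a minimal sketch that reuses the variable names visible in the traceback (config, llama_tokenizer, model); the import paths and loader calls are assumptions about the example script, not a verbatim excerpt.

from transformers import LlamaTokenizer          # assumed tokenizer class
from collie import CollieConfig                  # assumed import path
from collie.models import LlamaForCausalLM       # assumed import path

llama_path = "../../../llama-2-7b"               # local llama-2-7b checkpoint

# the three edited spots all point at the local checkpoint
config = CollieConfig.from_pretrained(llama_path)
llama_tokenizer = LlamaTokenizer.from_pretrained(llama_path)
model = LlamaForCausalLM.from_pretrained(llama_path, config=config)

With only those changes, the run fails with the tracebacks below: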
╭──────────────────────────── Traceback (most recent call last) ────────────────────────────╮
│ /d2/data/chuxiong/collie/examples/further_pretrain_llama/expand_vocab.py:85 in <module> │
│ │
│ 82 │ model.get_input_embedding()[1].weight.requires_grad = True │
│ 83 if model.get_lm_head()[1] is not None: │
│ 84 │ model.get_lm_head()[1].weight.requires_grad = True │
│ ❱ 85 optimizer = torch.optim.AdamW( │
│ 86 │ filter(lambda p: p.requires_grad, model.parameters()), lr=2e-4) │
│ 87 lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( │
│ 88 │ optimizer, T_max=config.train_epochs * len(train_dataset), eta_min=0) │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/torch/optim/adamw.py:50 in __init__ │
│ │
│ 47 │ │ │ differentiable=differentiable, │
│ 48 │ │ │ fused=fused, │
│ 49 │ │ ) │
│ ❱ 50 │ │ super().__init__(params, defaults) │
│ 51 │ │ │
│ 52 │ │ if fused: │
│ 53 │ │ │ if differentiable: │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/torch/optim/optimizer.py:187 in │
│ __init__ │
│ │
│ 184 │ │ │
│ 185 │ │ param_groups = list(params) │
│ 186 │ │ if len(param_groups) == 0: │
│ ❱ 187 │ │ │ raise ValueError("optimizer got an empty parameter list") │
│ 188 │ │ if not isinstance(param_groups[0], dict): │
│ 189 │ │ │ param_groups = [{'params': param_groups}] │
│ 190 │
╰───────────────────────────────────────────────────────────────────────────────────────────╯
ValueError: optimizer got an empty parameter list
╭──────────────────────────── Traceback (most recent call last) ────────────────────────────╮
│ /d2/data/chuxiong/collie/examples/further_pretrain_llama/expand_vocab.py:77 in <module> │
│ │
│ 74 # 准备模型并调整 embedding 层大小,设置只训练 embedding 和 lm_head 层,加速收敛 │
│ 75 model = LlamaForCausalLM.from_pretrained( │
│ 76 │ "../../../llama-2-7b", config=config) │
│ ❱ 77 model.resize_token_embeddings(len(llama_tokenizer) + 7) # 取个整 │
│ 78 for p in model.parameters(): │
│ 79 │ p.requires_grad = False │
│ 80 # 因为 embedding 和 lm_head 在 pipeline 的情况下被分割到了不同的进程,所以要判断一 │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/collie/models/base.py:634 in │
│ resize_token_embeddings │
│ │
│ 631 │ │ │ │ │ │ = lm_head.bias.data[start_pos_old:end_pos_old] │
│ 632 │ │ │ │ if end_pos_new < (new_num_tokens // env.tp_size): │
│ 633 │ │ │ │ │ initization_method = self.collie_config.initization_method │
│ ❱ 634 │ │ │ │ │ if self.collie_config.initization_method_params is not None: │
│ 635 │ │ │ │ │ │ initization_method = initization_method(new_lm_head.weight[ │
│ 636 │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ **self.collie_confi │
│ 637 │ │ │ │ │ │ if lm_head.bias is not None: │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/collie/config.py:206 in __getattr__ │
│ │
│ 203 │ │ self.model_config.save_pretrained(path) │
│ 204 │ │
│ 205 │ def __getattr__(self, name): │
│ ❱ 206 │ │ return getattr(self.model_config, name) │
│ 207 │ │
│ 208 │ def __setattr__(self, name: str, value: Any) -> None: │
│ 209 │ │ if name in self.__annotations__.keys(): │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/transformers/configuration_utils.py: │
│ 261 in __getattribute__ │
│ │
│ 258 │ def __getattribute__(self, key): │
│ 259 │ │ if key != "attribute_map" and key in super().__getattribute__("attribute_ma │
│ 260 │ │ │ key = super().__getattribute__("attribute_map")[key] │
│ ❱ 261 │ │ return super().__getattribute__(key) │
│ 262 │ │
│ 263 │ def __init__(self, **kwargs): │
│ 264 │ │ # Attributes with defaults │
╰───────────────────────────────────────────────────────────────────────────────────────────╯
AttributeError: 'LlamaConfig' object has no attribute 'initization_method_params'
╭──────────────────────────── Traceback (most recent call last) ────────────────────────────╮
│ /d2/data/chuxiong/collie/examples/further_pretrain_llama/expand_vocab.py:77 in <module> │
│ │
│ 74 # 准备模型并调整 embedding 层大小,设置只训练 embedding 和 lm_head 层,加速收敛 │
│ 75 model = LlamaForCausalLM.from_pretrained( │
│ 76 │ "../../../llama-2-7b", config=config) │
│ ❱ 77 model.resize_token_embeddings(len(llama_tokenizer) + 7) # 取个整 │
│ 78 for p in model.parameters(): │
│ 79 │ p.requires_grad = False │
│ 80 # 因为 embedding 和 lm_head 在 pipeline 的情况下被分割到了不同的进程,所以要判断一 │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/collie/models/base.py:548 in │
│ resize_token_embeddings │
│ │
│ 545 │ │ │ │ │ = embedding.weight.data[start_pos_old:end_pos_old, :] │
│ 546 │ │ │ │ if end_pos_new < (new_num_tokens // env.tp_size): │
│ 547 │ │ │ │ │ initization_method = self.collie_config.initization_method │
│ ❱ 548 │ │ │ │ │ if self.collie_config.initization_method_params is not None: │
│ 549 │ │ │ │ │ │ initization_method = initization_method(new_embedding.weigh │
│ 550 │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ │ **self.collie_confi │
│ 551 │ │ │ │ │ else: │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/collie/config.py:206 in __getattr__ │
│ │
│ 203 │ │ self.model_config.save_pretrained(path) │
│ 204 │ │
│ 205 │ def __getattr__(self, name): │
│ ❱ 206 │ │ return getattr(self.model_config, name) │
│ 207 │ │
│ 208 │ def __setattr__(self, name: str, value: Any) -> None: │
│ 209 │ │ if name in self.__annotations__.keys(): │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/transformers/configuration_utils.py: │
│ 261 in __getattribute__ │
│ │
│ 258 │ def __getattribute__(self, key): │
│ 259 │ │ if key != "attribute_map" and key in super().__getattribute__("attribute_ma │
│ 260 │ │ │ key = super().__getattribute__("attribute_map")[key] │
│ ❱ 261 │ │ return super().__getattribute__(key) │
│ 262 │ │
│ 263 │ def __init__(self, **kwargs): │
│ 264 │ │ # Attributes with defaults │
╰───────────────────────────────────────────────────────────────────────────────────────────╯
AttributeError: 'LlamaConfig' object has no attribute 'initization_method_params'
Sorry about that, the initization_method_params parameter is obsolete; please update to the latest dev code and try again.
It still fails: the list of trainable parameters passed to the optimizer is empty, even though requires_grad=True was set on the input embedding and lm_head just before. Very strange.
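One way to narrow this down is to print what each rank would actually hand to the optimizer, right before the AdamW call. A quick diagnostic sketch, assuming collie's distributed-env object (the env used as env.tp_size in the tracebacks above) is importable as collie.utils.env:

from collie.utils import env   # assumed import path for collie's env helper

# everything that filter(lambda p: p.requires_grad, ...) would pass to AdamW on this rank
trainable = [name for name, p in model.named_parameters() if p.requires_grad]
print(f"[pp_rank={env.pp_rank}] trainable params: {trainable}")
# a rank whose list is empty here is the one that triggers
# "optimizer got an empty parameter list"

The log from that run: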
[2023-07-20 13:02:58,344] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2023-07-20 13:02:58,345] [INFO] [comm.py:594:init_distributed] cdb=None
[2023-07-20 13:02:58,702] [WARNING] [comm.py:152:init_deepspeed_backend] NCCL backend in DeepSpeed not yet implemented
[2023-07-20 13:02:58,702] [INFO] [comm.py:594:init_distributed] cdb=None
SEED_LAYERS=False BASE_SEED=42 SEED_FN=None
[2023-07-20 13:02:59,072] [INFO] [module.py:358:_partition_layers] Partitioning pipeline stages with method parameters
stage=0 layers=17
0: _inner
1: LlamaLayer
2: LlamaLayer
3: LlamaLayer
4: LlamaLayer
5: LlamaLayer
6: LlamaLayer
7: LlamaLayer
8: LlamaLayer
9: LlamaLayer
10: LlamaLayer
11: LlamaLayer
12: LlamaLayer
13: LlamaLayer
14: LlamaLayer
15: LlamaLayer
16: LlamaLayer
stage=1 layers=18
17: LlamaLayer
18: LlamaLayer
19: LlamaLayer
20: LlamaLayer
21: LlamaLayer
22: LlamaLayer
23: LlamaLayer
24: LlamaLayer
25: LlamaLayer
26: LlamaLayer
27: LlamaLayer
28: LlamaLayer
29: LlamaLayer
30: LlamaLayer
31: LlamaLayer
32: LlamaLayer
33: _inner
34: _inner
loss: GPTLMLoss
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /d2/data/chuxiong/collie/examples/further_pretrain_llama/expand_vocab.py:94 in <module> │
│ │
│ 91 # lr = 0.001, │
│ 92 # clip_grad_norm = 5.0 │
│ 93 # ) │
│ ❱ 94 optimizer = torch.optim.AdamW( │
│ 95 │ filter(lambda p: p.requires_grad, model.parameters()), lr=2e-4) │
│ 96 lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( │
│ 97 │ optimizer, T_max=config.train_epochs * len(train_dataset), eta_min=0) │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/torch/optim/adamw.py:50 in __init__ │
│ │
│ 47 │ │ │ differentiable=differentiable, │
│ 48 │ │ │ fused=fused, │
│ 49 │ │ ) │
│ ❱ 50 │ │ super().__init__(params, defaults) │
│ 51 │ │ │
│ 52 │ │ if fused: │
│ 53 │ │ │ if differentiable: │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/torch/optim/optimizer.py:187 in __init__ │
│ │
│ 184 │ │ │
│ 185 │ │ param_groups = list(params) │
│ 186 │ │ if len(param_groups) == 0: │
│ ❱ 187 │ │ │ raise ValueError("optimizer got an empty parameter list") │
│ 188 │ │ if not isinstance(param_groups[0], dict): │
│ 189 │ │ │ param_groups = [{'params': param_groups}] │
│ 190 │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
ValueError: optimizer got an empty parameter list
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 2165927) of binary: /d1/conda3/envs/scx_llm/bin/python
Traceback (most recent call last):
File "/d1/conda3/envs/scx_llm/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==2.0.1', 'console_scripts', 'torchrun')())
File "/d1/conda3/envs/scx_llm/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/d1/conda3/envs/scx_llm/lib/python3.10/site-packages/torch/distributed/run.py", line 794, in main
run(args)
File "/d1/conda3/envs/scx_llm/lib/python3.10/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/d1/conda3/envs/scx_llm/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/d1/conda3/envs/scx_llm/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
We found that the problem was in how the embedding layer gets replaced here. Could you pull the dev branch again and give it a try?
With the latest dev branch, the error is now:
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /d2/data/chuxiong/collie/examples/further_pretrain_llama/expand_vocab.py:115 in <module> │
│ │
│ 112 │ evaluators=[evaluator] │
│ 113 ) │
│ 114 │
│ ❱ 115 trainer.train() │
│ 116 │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/collie/controller/trainer.py:324 in train │
│ │
│ 321 │ │ │ │ │ self.engine.module.forward_type = "train" │
│ 322 │ │ │ │ if isinstance(self.engine.module, PeftModel) and isinstance(self.engine. │
│ 323 │ │ │ │ │ self.engine.module.get_base_model().forward_type = "train" │
│ ❱ 324 │ │ │ │ with self.monitor as item: │
│ 325 │ │ │ │ │ loss = self.train_fn(self, batch, self.global_batch_idx) │
│ 326 │ │ │ │ │ item.update({"loss": round(loss, 4), │
│ 327 │ │ │ │ │ │ │ │ │ "lr": self.lr, │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/collie/utils/monitor.py:232 in __exit__ │
│ │
│ 229 │ def __exit__(self, exc_type, exc_val, exc_tb): │
│ 230 │ │ for monitor in self.monitors: │
│ 231 │ │ │ monitor.item = self.item │
│ ❱ 232 │ │ │ monitor.__exit__(exc_type, exc_val, exc_tb) │
│ 233 │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/collie/utils/monitor.py:180 in __exit__ │
│ │
│ 177 │ │ return super().__enter__() │
│ 178 │ │
│ 179 │ def __exit__(self, exc_type, exc_val, exc_tb): │
│ ❱ 180 │ │ if self.item["mode"] == "train" and "batch" in self.item.keys(): │
│ 181 │ │ │ self.monitor.write_events([(f"TGS", reduce(lambda x, y: x * y, self.item["ba │
│ 182 │
│ 183 class CPUMemoryMonitor(BaseMonitor): │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
KeyError: 'mode'
╭─────────────────────────────── Traceback (most recent call last) ────────────────────────────────╮
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/collie/controller/trainer.py:325 in train │
│ │
│ 322 │ │ │ │ if isinstance(self.engine.module, PeftModel) and isinstance(self.engine. │
│ 323 │ │ │ │ │ self.engine.module.get_base_model().forward_type = "train" │
│ 324 │ │ │ │ with self.monitor as item: │
│ ❱ 325 │ │ │ │ │ loss = self.train_fn(self, batch, self.global_batch_idx) │
│ 326 │ │ │ │ │ item.update({"loss": round(loss, 4), │
│ 327 │ │ │ │ │ │ │ │ │ "lr": self.lr, │
│ 328 │ │ │ │ │ │ │ │ │ "batch": batch, │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/collie/controller/trainer.py:382 in │
│ train_fn │
│ │
│ 379 │ │ │ │ trainer.engine.module.forward_type = "train" │
│ 380 │ │ │ if isinstance(trainer.engine.module, PeftModel) and isinstance(trainer.engin │
│ 381 │ │ │ │ trainer.engine.module.get_base_model().forward_type = "train" │
│ ❱ 382 │ │ │ loss = trainer.engine.module(**batch)["loss"] │
│ 383 │ │ else: │
│ 384 │ │ │ # concat prompt labels for p-tuning │
│ 385 │ │ │ if trainer.config.peft_config and trainer.config.peft_config.peft_type in [" │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/torch/nn/modules/module.py:1501 in │
│ _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/collie/module.py:549 in forward │
│ │
│ 546 │ │ │ if self.forward_type == "generate": │
│ 547 │ │ │ │ return self.generate_forward(*args, **kwargs) │
│ 548 │ │ │ elif self.forward_type == "train": │
│ ❱ 549 │ │ │ │ return self.train_forward(*args, **kwargs) │
│ 550 │ │ │ elif self.forward_type == "eval": │
│ 551 │ │ │ │ return self.eval_forward(*args, **kwargs) │
│ 552 │ │ │ else: │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/collie/module.py:271 in train_forward │
│ │
│ 268 │ │ else: │
│ 269 │ │ │ self._set_past_key_values(past_key_values) │
│ 270 │ │ inputs["labels"] = labels │
│ ❱ 271 │ │ loss = self.engine_container[-1].train_batch(inputs) │
│ 272 │ │ return CausalLMOutputWithPast( │
│ 273 │ │ │ loss=loss, │
│ 274 │ │ │ logits=None, │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/collie/utils/pipeline_engine.py:212 in │
│ train_batch │
│ │
│ 209 │ │ batch = _split_batch(batch, self.train_micro_batch_size_per_gpu(), │
│ 210 │ │ │ │ │ │ │ self.gradient_accumulation_steps()) │
│ 211 │ │ data_iter = iter(batch) │
│ ❱ 212 │ │ result = super().train_batch(data_iter) │
│ 213 │ │ if isinstance(self.module, PipelineModel): │
│ 214 │ │ │ self.module.inner_forward = False │
│ 215 │ │ if isinstance(self.module, PeftModel) and isinstance(self.module.get_base_model( │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/deepspeed/runtime/pipe/engine.py:336 in │
│ train_batch │
│ │
│ 333 │ │ sched = schedule.TrainSchedule(micro_batches=self.micro_batches, │
│ 334 │ │ │ │ │ │ │ │ │ stages=self.num_stages, │
│ 335 │ │ │ │ │ │ │ │ │ stage_id=self.stage_id) │
│ ❱ 336 │ │ self._exec_schedule(sched) │
│ 337 │ │ self.agg_train_loss = self._aggregate_total_loss() │
│ 338 │ │ │
│ 339 │ │ self.timers('train_batch').stop() │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/deepspeed/runtime/pipe/engine.py:1307 in │
│ _exec_schedule │
│ │
│ 1304 │ │ │ │ │
│ 1305 │ │ │ │ # Equivalent to: self._exec_forward_pass(buffer_id=0) │
│ 1306 │ │ │ │ self._exec_instr = MethodType(self._INSTRUCTION_MAP[type(cmd)], self) │
│ ❱ 1307 │ │ │ │ self._exec_instr(**cmd.kwargs) │
│ 1308 │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/collie/utils/pipeline_engine.py:402 in │
│ _exec_forward_pass │
│ │
│ 399 │ │ self._zero_grads(inputs) │
│ 400 │ │ # if buffer_id >= 1: │
│ 401 │ │ # import pdb; pdb.set_trace() │
│ ❱ 402 │ │ outputs = super(PipelineEngine, self).forward(inputs) │
│ 403 │ │ │
│ 404 │ │ # Reset activation checkpointing buffers. │
│ 405 │ │ # Need to call this between evaluation iterations │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/deepspeed/utils/nvtx.py:15 in wrapped_fn │
│ │
│ 12 │ │
│ 13 │ def wrapped_fn(*args, **kwargs): │
│ 14 │ │ get_accelerator().range_push(func.__qualname__) │
│ ❱ 15 │ │ ret_val = func(*args, **kwargs) │
│ 16 │ │ get_accelerator().range_pop() │
│ 17 │ │ return ret_val │
│ 18 │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/deepspeed/runtime/engine.py:1735 in forward │
│ │
│ 1732 │ │ if self.fp16_auto_cast(): │
│ 1733 │ │ │ inputs = self._cast_inputs_half(inputs) │
│ 1734 │ │ │
│ ❱ 1735 │ │ loss = self.module(*inputs, **kwargs) │
│ 1736 │ │ │
│ 1737 │ │ if self.zero_optimization_partition_weights(): │
│ 1738 │ │ │ # Disable automated discovery of external parameters │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/torch/nn/modules/module.py:1501 in │
│ _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/collie/module.py:559 in forward │
│ │
│ 556 │ │ │ if "input_ids" in kwargs.keys() and isinstance(kwargs["input_ids"], dict): │
│ 557 │ │ │ │ return super(PipelineModel, self).forward(kwargs["input_ids"]) │
│ 558 │ │ │ else: │
│ ❱ 559 │ │ │ │ return super(PipelineModel, self).forward(*args, **kwargs) │
│ 560 │ │
│ 561 │ def get_input_embedding(self): │
│ 562 │ │ if env.pp_rank != 0: │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/deepspeed/runtime/pipe/module.py:334 in │
│ forward │
│ │
│ 331 │ │ │
│ 332 │ │ if self.activation_checkpoint_interval == 0: │
│ 333 │ │ │ func = exec_range_func(0, len(self.forward_funcs)) │
│ ❱ 334 │ │ │ x = func(forward_input) │
│ 335 │ │ else: │
│ 336 │ │ │ num_layers = len(self.forward_funcs) │
│ 337 │ │ │ x = forward_input │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/deepspeed/runtime/pipe/module.py:327 in │
│ exec_func │
│ │
│ 324 │ │ │ │ │ │ else: │
│ 325 │ │ │ │ │ │ │ ds_utils.set_random_seed(new_seed) │
│ 326 │ │ │ │ │ │
│ ❱ 327 │ │ │ │ │ inputs = layer(inputs) │
│ 328 │ │ │ │ return inputs │
│ 329 │ │ │ │
│ 330 │ │ │ return exec_func │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/torch/nn/modules/module.py:1501 in │
│ _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/collie/models/llama/model.py:232 in forward │
│ │
│ 229 │ │
│ 230 │ def forward(self, inputs: dict): │
│ 231 │ │ if self.config.checkpointing and self.training: │
│ ❱ 232 │ │ │ inputs["hidden_states"] = torch.utils.checkpoint.checkpoint( │
│ 233 │ │ │ │ self._forward, │
│ 234 │ │ │ │ inputs["hidden_states"], │
│ 235 │ │ │ │ inputs.get("attention_mask", None) │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/torch/utils/checkpoint.py:249 in checkpoint │
│ │
│ 246 │ │ raise ValueError("Unexpected keyword arguments: " + ",".join(arg for arg in kwar │
│ 247 │ │
│ 248 │ if use_reentrant: │
│ ❱ 249 │ │ return CheckpointFunction.apply(function, preserve, *args) │
│ 250 │ else: │
│ 251 │ │ return _checkpoint_without_reentrant( │
│ 252 │ │ │ function, │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/torch/autograd/function.py:506 in apply │
│ │
│ 503 │ │ if not torch._C._are_functorch_transforms_active(): │
│ 504 │ │ │ # See NOTE: [functorch vjp and autograd interaction] │
│ 505 │ │ │ args = _functorch.utils.unwrap_dead_wrappers(args) │
│ ❱ 506 │ │ │ return super().apply(*args, **kwargs) # type: ignore[misc] │
│ 507 │ │ │
│ 508 │ │ if cls.setup_context == _SingleLevelFunction.setup_context: │
│ 509 │ │ │ raise RuntimeError( │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/torch/utils/checkpoint.py:107 in forward │
│ │
│ 104 │ │ ctx.save_for_backward(*tensor_inputs) │
│ 105 │ │ │
│ 106 │ │ with torch.no_grad(): │
│ ❱ 107 │ │ │ outputs = run_function(*args) │
│ 108 │ │ return outputs │
│ 109 │ │
│ 110 │ @staticmethod │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/collie/models/llama/model.py:183 in │
│ _forward │
│ │
│ 180 │ │ │ start_pos = self.past_key_values.shape[3] │
│ 181 │ │ else: │
│ 182 │ │ │ start_pos = 0 │
│ ❱ 183 │ │ query, key = self.self_attn["rotary_emb"](query, key, seq_len, start_pos) │
│ 184 │ │ if self.past_key_values is not None: │
│ 185 │ │ │ query = torch.cat([self.past_key_values[0].permute([0, 2, 1, 3]), query], di │
│ 186 │ │ │ key = torch.cat([self.past_key_values[0].permute([0, 2, 1, 3]), key], dim=1) │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/torch/nn/modules/module.py:1501 in │
│ _call_impl │
│ │
│ 1498 │ │ if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks │
│ 1499 │ │ │ │ or _global_backward_pre_hooks or _global_backward_hooks │
│ 1500 │ │ │ │ or _global_forward_hooks or _global_forward_pre_hooks): │
│ ❱ 1501 │ │ │ return forward_call(*args, **kwargs) │
│ 1502 │ │ # Do not call functions when jit is used │
│ 1503 │ │ full_backward_hooks, non_full_backward_hooks = [], [] │
│ 1504 │ │ backward_pre_hooks = [] │
│ │
│ /d1/conda3/envs/scx_llm/lib/python3.10/site-packages/collie/models/llama/model.py:63 in forward │
│ │
│ 60 │ │ shape = [d if i == 1 or i == query.ndim - │
│ 61 │ │ │ │ 1 else 1 for i, d in enumerate(query.shape)] │
│ 62 │ │ freqs_cis = freqs_cis.view(*shape) │
│ ❱ 63 │ │ query = torch.view_as_real(query * freqs_cis).flatten(3) │
│ 64 │ │ key = torch.view_as_real(key * freqs_cis).flatten(3) │
│ 65 │ │ return query.type(t), key.type(t) │
│ 66 │
╰──────────────────────────────────────────────────────────────────────────────────────────────────╯
RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:7 and cpu!
The error only goes away if I set pp_size to 1, but then the loss is nan for most of the run.
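Two small sketches related to this last point; both are assumptions for illustration rather than code from the thread. The first shows the workaround as a config change (assuming pp_size is set on the CollieConfig, as in the example scripts); the second is a generic check for tensors left on the CPU, which is what produces the device-mismatch error above.

# (1) workaround: run without pipeline parallelism, so the embedding, the decoder
#     layers and the lm_head all stay in one process
config.pp_size = 1

# (2) diagnostic: list any parameter or buffer that is not on a CUDA device;
#     a tensor like this is what triggers the device-mismatch error above
for name, t in list(model.named_parameters()) + list(model.named_buffers()):
    if t.device.type != "cuda":
        print(f"{name}: shape={tuple(t.shape)} device={t.device}")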