
[BUG/Help] `!bash train.sh` fails

Open KAiWeN121381 opened this issue 2 years ago • 1 comment

Is there an existing issue for this?

  • [X] I have searched the existing issues

Current Behavior

Running this on Google Colab:

```
%cd /content/ChatGLM-6B/ptuning
!bash train.sh
```

I tried checking the JSON file and it seemed fine.

```
2023-06-19 06:45:25.687473: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
06/19/2023 06:45:26 - WARNING - __main__ - Process rank: -1, device: cuda:0, n_gpu: 1, distributed training: False, 16-bits training: False
06/19/2023 06:45:26 - INFO - __main__ - Training/evaluation parameters Seq2SeqTrainingArguments(
  _n_gpu=1, adafactor=False, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08,
  auto_find_batch_size=False, bf16=False, bf16_full_eval=False, data_seed=None,
  dataloader_drop_last=False, dataloader_num_workers=0, dataloader_pin_memory=True,
  ddp_bucket_cap_mb=None, ddp_find_unused_parameters=None, ddp_timeout=1800, debug=[],
  deepspeed=None, disable_tqdm=False, do_eval=False, do_predict=False, do_train=True,
  eval_accumulation_steps=None, eval_delay=0, eval_steps=None, evaluation_strategy=no,
  fp16=False, fp16_backend=auto, fp16_full_eval=False, fp16_opt_level=O1, fsdp=[],
  fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
  fsdp_min_num_params=0, fsdp_transformer_layer_cls_to_wrap=None, full_determinism=False,
  generation_max_length=None, generation_num_beams=None, gradient_accumulation_steps=16,
  gradient_checkpointing=False, greater_is_better=None, group_by_length=False,
  half_precision_backend=auto, hub_model_id=None, hub_private_repo=False,
  hub_strategy=every_save, hub_token=<HUB_TOKEN>, ignore_data_skip=False,
  include_inputs_for_metrics=False, jit_mode_eval=False, label_names=None,
  label_smoothing_factor=0.0, learning_rate=0.02, length_column_name=length,
  load_best_model_at_end=False, local_rank=-1, log_level=passive, log_level_replica=warning,
  log_on_each_node=True, logging_dir=output/adgen-chatglm-6b-pt-128-2e-2/runs/Jun19_06-45-26_44695fee915a,
  logging_first_step=False, logging_nan_inf_filter=True, logging_steps=10,
  logging_strategy=steps, lr_scheduler_type=linear, max_grad_norm=1.0, max_steps=3000,
  metric_for_best_model=None, mp_parameters=, no_cuda=False, num_train_epochs=3.0,
  optim=adamw_hf, optim_args=None, output_dir=output/adgen-chatglm-6b-pt-128-2e-2,
  overwrite_output_dir=True, past_index=-1, per_device_eval_batch_size=1,
  per_device_train_batch_size=1, predict_with_generate=True, prediction_loss_only=False,
  push_to_hub=False, push_to_hub_model_id=None, push_to_hub_organization=None,
  push_to_hub_token=<PUSH_TO_HUB_TOKEN>, ray_scope=last, remove_unused_columns=True,
  report_to=['tensorboard'], resume_from_checkpoint=None,
  run_name=output/adgen-chatglm-6b-pt-128-2e-2, save_on_each_node=False, save_steps=1000,
  save_strategy=steps, save_total_limit=None, seed=42, sharded_ddp=[],
  skip_memory_metrics=True, sortish_sampler=False, tf32=None, torch_compile=False,
  torch_compile_backend=None, torch_compile_mode=None, torchdynamo=None,
  tpu_metrics_debug=False, tpu_num_cores=None, use_ipex=False,
  use_legacy_prediction_loop=False, use_mps_device=False, warmup_ratio=0.0,
  warmup_steps=0, weight_decay=0.0, xpu_backend=None,
)
Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-b82a089ca7bf50bc/0.0.0/e347ab1c932092252e717ff3f949105a4dd28b27e842dd53157d2f72e276c2e4...
Downloading data files: 100% 2/2 [00:00<00:00, 17331.83it/s]
Extracting data files: 100% 2/2 [00:00<00:00, 188.11it/s]

Traceback (most recent call last):
  /usr/local/lib/python3.10/dist-packages/datasets/packaged_modules/json/json.py:113 in _generate_tables
    pa_table = paj.read_json(io.BytesIO(batch), read_option...
  in pyarrow._json.read_json:259
  in pyarrow.lib.pyarrow_internal_check_status:144
  in pyarrow.lib.check_status:100
ArrowInvalid: JSON parse error: Column(/summary) was specified twice in row 425

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  /usr/local/lib/python3.10/dist-packages/datasets/builder.py:1879 in _prepare_split_single
    for _, table in generator:
  /usr/local/lib/python3.10/dist-packages/datasets/packaged_modules/json/json.py:134 in _generate_tables
    dataset = json.load(f)
  /usr/lib/python3.10/json/__init__.py:293 in load
    return loads(fp.read(), ...)
  /usr/lib/python3.10/codecs.py:322 in decode
    (result, consumed) = self._buffer_decode(data, self.errors, ...)
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa4 in position 3145728: invalid start byte

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  /content/ChatGLM-6B/ptuning/main.py:430 in <module>
    main()
  /content/ChatGLM-6B/ptuning/main.py:99 in main
    raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, ...)
  /usr/local/lib/python3.10/dist-packages/datasets/load.py:1797 in load_dataset
    builder_instance.download_and_prepare(download_config=download_config, download_mode=download_mode, verification_mode=verification_mode, ...)
  /usr/local/lib/python3.10/dist-packages/datasets/builder.py:909 in download_and_prepare
    self._download_and_prepare(dl_manager=dl_manager, verification_mode=verification_mode, **prepare_split_kwargs)
  /usr/local/lib/python3.10/dist-packages/datasets/builder.py:1004 in _download_and_prepare
    self._prepare_split(split_generator, **prepare_split_kwargs)
  /usr/local/lib/python3.10/dist-packages/datasets/builder.py:1767 in _prepare_split
    for job_id, done, content in self._prepare_split_single(gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args):
  /usr/local/lib/python3.10/dist-packages/datasets/builder.py:1912 in _prepare_split_single
    raise DatasetGenerationError("An error occurred while gen...
DatasetGenerationError: An error occurred while generating the dataset
```
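Both failures above point at the training JSON itself: pyarrow rejects a line whose object repeats the `summary` key ("Column(/summary) was specified twice in row 425"), and the plain-`json` fallback then dies on a byte that is not valid UTF-8. A stdlib-only sketch that scans a JSONL file for both problems at once (the helper names `duplicate_keys` and `check_jsonl` are mine, not part of the repo):

```python
import json

def duplicate_keys(pairs):
    # json.loads passes the raw (key, value) pairs to this hook,
    # so repeated keys are still visible before dict() collapses them.
    seen, dupes = set(), []
    for key, _ in pairs:
        if key in seen:
            dupes.append(key)
        seen.add(key)
    if dupes:
        raise ValueError(f"duplicate keys: {dupes}")
    return dict(pairs)

def check_jsonl(path):
    """Return [(line_number, error_message)] for every bad line."""
    bad = []
    with open(path, "rb") as f:  # read bytes so a bad byte is caught per line
        for lineno, raw in enumerate(f, 1):
            if not raw.strip():
                continue
            try:
                json.loads(raw.decode("utf-8"), object_pairs_hook=duplicate_keys)
            except (UnicodeDecodeError, ValueError) as e:
                bad.append((lineno, str(e)))
    return bad
```

Running `check_jsonl("train.json")` should list the exact lines to fix (here, the line pyarrow reported as row 425 and whichever line carries the `0xa4` byte).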

Expected Behavior

No response

Steps To Reproduce

I followed the README.

Environment

- OS:
- Python:
- Transformers:
- PyTorch:
- CUDA Support (`python -c "import torch; print(torch.cuda.is_available())"`) :

Anything else?

No response

KAiWeN121381 avatar Jun 19 '23 06:06 KAiWeN121381

It's most likely a problem with the contents of your dataset; check the data files. The log already points to it: `UnicodeDecodeError: 'utf-8' codec can't decode byte 0xa4 in position 3145728`.
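To see exactly where that byte sits, you can read the file in binary and let `UnicodeDecodeError` report the offset; a small sketch (the helper name `find_bad_utf8` is hypothetical):

```python
def find_bad_utf8(path, context=20):
    """Return (offset, surrounding bytes) of the first invalid UTF-8
    byte in `path`, or None if the whole file decodes cleanly."""
    with open(path, "rb") as f:
        data = f.read()
    try:
        data.decode("utf-8")
        return None
    except UnicodeDecodeError as e:
        lo = max(e.start - context, 0)
        return e.start, data[lo:e.start + context]
```

The reported position (3145728 here) plus the surrounding bytes usually make it obvious whether the file was saved in GBK or another non-UTF-8 encoding and needs re-saving as UTF-8.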

woderchen avatar Jun 20 '23 03:06 woderchen