LLM-Finetuning-Toolkit UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 101: character maps to <undefined>

UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 101: character maps to <undefined>

Open jeetendraabvv opened this issue 6 months ago • 0 comments

How can we resolve this error? No error occurs when we read the file with pandas read_csv. Alternatively, how can I set the encoding? My data file contains text written in Hindi. │ 20 │ │ 21 class IncrementalDecoder(codecs.IncrementalDecoder): │ │ 22 │ def decode(self, input, final=False): │ │ ❱ 23 │ │ return codecs.charmap_decode(input,self.errors,decoding_table) │ │ 24 │ │ 25 class StreamWriter(Codec,codecs.StreamWriter): │ │ 26 │ pass │ │ │ │ ╭───────────────────────────────── locals ─────────────────────────────────╮ │ │ │ final = False │ │ │ │ input = b',output,input,instruction\r\n0,Worst Car Names of All Time │ │ │ │ \xe0\xa4\x95\xe0\xa4\xad\xe0\xa5\x80-\xe0\xa4\x95\xe0\xa4\xad\x… │ │ │ │ \xe0\xa4\xb5'+8112 │ │ │ │ self = <encodings.cp1252.IncrementalDecoder object at │ │ │ │ 0x00000266E8003D90> │ │ │ ╰──────────────────────────────────────────────────────────────────────────╯ │ ╰──────────────────────────────────────────────────────────────────────────────╯ UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 101: character maps to <undefined>

The above exception was the direct cause of the following exception:

╭───────────────────── Traceback (most recent call last) ──────────────────────╮ │ C:\ProgramData\anaconda3\envs\llm-ntkit\Lib\site-packages\llmtune\cli\toolki │ │ t.py:123 in run │ │ │ │ 120 │ │ │ config = yaml.safe_load(file) │ │ 121 │ │ │ config = Config(**config) │ │ 122 │ │ │ │ ❱ 123 │ │ run_one_experiment(config, config_path) │ │ 124 │ │ 125 │ │ 126 @generate_app.command("config") │ │ │ │ ╭───────────────────────────────── locals ─────────────────────────────────╮ │ │ │ config = Config( │ │ │ │ │ save_dir='./experiment/', │ │ │ │ │ ablation=AblationConfig( │ │ │ │ │ │ use_ablate=False, │ │ │ │ │ │ study_name='ablation' │ │ │ │ │ ), │ │ │ │ │ data=DataConfig( │ │ │ │ │ │ file_type='csv', │ │ │ │ │ │ │ │ │ │ path='C:\Users\Administrator\Pictures\lll_tk\jag_ful… │ │ │ │ │ │ prompt='Below is an instruction that describes a │ │ │ │ task. Write a response that appropriat'+89, │ │ │ │ │ │ prompt_stub='{output}', │ │ │ │ │ │ train_size=500, │ │ │ │ │ │ test_size=25, │ │ │ │ │ │ train_test_split_seed=42 │ │ │ │ │ ), │ │ │ │ │ model=ModelConfig( │ │ │ │ │ │ hf_model_ckpt='facebook/opt-125m', │ │ │ │ │ │ device_map='auto', │ │ │ │ │ │ torch_dtype='bfloat16', │ │ │ │ │ │ attn_implementation=None, │ │ │ │ │ │ quantize=True, │ │ │ │ │ │ bitsandbytes=BitsAndBytesConfig( │ │ │ │ │ │ │ load_in_8bit=False, │ │ │ │ │ │ │ llm_int8_threshold=6.0, │ │ │ │ │ │ │ llm_int8_skip_modules=None, │ │ │ │ │ │ │ llm_int8_enable_fp32_cpu_offload=False, │ │ │ │ │ │ │ llm_int8_has_fp16_weight=False, │ │ │ │ │ │ │ load_in_4bit=True, │ │ │ │ │ │ │ bnb_4bit_compute_dtype='bfloat16', │ │ │ │ │ │ │ bnb_4bit_quant_type='nf4', │ │ │ │ │ │ │ bnb_4bit_use_double_quant=True │ │ │ │ │ │ ) │ │ │ │ │ ), │ │ │ │ │ lora=LoraConfig( │ │ │ │ │ │ r=32, │ │ │ │ │ │ task_type='CAUSAL_LM', │ │ │ │ │ │ lora_alpha=64, │ │ │ │ │ │ bias='none', │ │ │ │ │ │ lora_dropout=0.1, │ │ │ │ │ │ target_modules='all-linear', │ │ │ │ │ │ fan_in_fan_out=False, │ │ │ │ │ │ modules_to_save=None, │ │ │ │ │ │ layers_to_transform=None, 
│ │ │ │ │ │ layers_pattern=None │ │ │ │ │ ), │ │ │ │ │ training=TrainingConfig( │ │ │ │ │ │ training_args=TrainingArgs( │ │ │ │ │ │ │ num_train_epochs=1, │ │ │ │ │ │ │ per_device_train_batch_size=4, │ │ │ │ │ │ │ gradient_accumulation_steps=4, │ │ │ │ │ │ │ gradient_checkpointing=True, │ │ │ │ │ │ │ optim='paged_adamw_32bit', │ │ │ │ │ │ │ logging_steps=1, │ │ │ │ │ │ │ learning_rate=0.0002, │ │ │ │ │ │ │ bf16=True, │ │ │ │ │ │ │ tf32=True, │ │ │ │ │ │ │ fp16=False, │ │ │ │ │ │ │ max_grad_norm=0.3, │ │ │ │ │ │ │ warmup_ratio=0.03, │ │ │ │ │ │ │ lr_scheduler_type='constant', │ │ │ │ │ │ │ save_steps=500 │ │ │ │ │ │ ), │ │ │ │ │ │ sft_args=SftArgs( │ │ │ │ │ │ │ max_seq_length=1024, │ │ │ │ │ │ │ neftune_noise_alpha=None │ │ │ │ │ │ ) │ │ │ │ │ ), │ │ │ │ │ inference=InferenceConfig( │ │ │ │ │ │ max_length=None, │ │ │ │ │ │ max_new_tokens=256, │ │ │ │ │ │ min_length=0, │ │ │ │ │ │ min_new_tokens=None, │ │ │ │ │ │ early_stopping=False, │ │ │ │ │ │ max_time=None, │ │ │ │ │ │ do_sample=True, │ │ │ │ │ │ num_beams=1, │ │ │ │ │ │ num_beam_groups=1, │ │ │ │ │ │ penalty_alpha=None, │ │ │ │ │ │ use_cache=True, │ │ │ │ │ │ temperature=0.8, │ │ │ │ │ │ top_k=50, │ │ │ │ │ │ top_p=0.9, │ │ │ │ │ │ typical_p=1.0, │ │ │ │ │ │ epsilon_cutoff=0.0, │ │ │ │ │ │ eta_cutoff=0.0, │ │ │ │ │ │ diversity_penalty=0.0, │ │ │ │ │ │ repetition_penalty=1.0, │ │ │ │ │ │ encoder_repetition_penalty=1.0, │ │ │ │ │ │ length_penalty=1.0, │ │ │ │ │ │ no_repeat_ngram_size=0, │ │ │ │ │ │ bad_words_ids=None, │ │ │ │ │ │ force_words_ids=None, │ │ │ │ │ │ renormalize_logits=False │ │ │ │ │ ), │ │ │ │ │ qa=QaConfig( │ │ │ │ │ │ llm_metrics=[ │ │ │ │ │ │ │ 'dot_product', │ │ │ │ │ │ │ 'rouge_score', │ │ │ │ │ │ │ 'word_overlap', │ │ │ │ │ │ │ 'verb_percent', │ │ │ │ │ │ │ 'adjective_percent', │ │ │ │ │ │ │ 'noun_percent', │ │ │ │ │ │ │ 'summary_length' │ │ │ │ │ │ ] │ │ │ │ │ ) │ │ │ │ ) │ │ │ │ config_path = './config.yml' │ │ │ │ configs = [ │ │ │ │ │ { │ │ │ │ │ │ 'save_dir': './experiment/', │ │ │ │ │ │ 
'ablation': {'use_ablate': False}, │ │ │ │ │ │ 'data': { │ │ │ │ │ │ │ 'file_type': 'csv', │ │ │ │ │ │ │ 'path': │ │ │ │ 'C:\Users\Administrator\Pictures\lll_tk\jag_full_ss.… │ │ │ │ │ │ │ 'prompt': 'Below is an instruction that │ │ │ │ describes a task. Write a response that appropriat'+89, │ │ │ │ │ │ │ 'prompt_stub': '{output}', │ │ │ │ │ │ │ 'test_size': 25, │ │ │ │ │ │ │ 'train_size': 500, │ │ │ │ │ │ │ 'train_test_split_seed': 42 │ │ │ │ │ │ }, │ │ │ │ │ │ 'model': { │ │ │ │ │ │ │ 'hf_model_ckpt': 'facebook/opt-125m', │ │ │ │ │ │ │ 'torch_dtype': 'bfloat16', │ │ │ │ │ │ │ 'quantize': True, │ │ │ │ │ │ │ 'bitsandbytes': { │ │ │ │ │ │ │ │ 'load_in_4bit': True, │ │ │ │ │ │ │ │ 'bnb_4bit_compute_dtype': 'bfloat16', │ │ │ │ │ │ │ │ 'bnb_4bit_quant_type': 'nf4' │ │ │ │ │ │ │ } │ │ │ │ │ │ }, │ │ │ │ │ │ 'lora': { │ │ │ │ │ │ │ 'task_type': 'CAUSAL_LM', │ │ │ │ │ │ │ 'r': 32, │ │ │ │ │ │ │ 'lora_alpha': 64, │ │ │ │ │ │ │ 'lora_dropout': 0.1, │ │ │ │ │ │ │ 'target_modules': 'all-linear' │ │ │ │ │ │ }, │ │ │ │ │ │ 'training': { │ │ │ │ │ │ │ 'training_args': { │ │ │ │ │ │ │ │ 'num_train_epochs': 1, │ │ │ │ │ │ │ │ 'per_device_train_batch_size': 4, │ │ │ │ │ │ │ │ 'gradient_accumulation_steps': 4, │ │ │ │ │ │ │ │ 'gradient_checkpointing': True, │ │ │ │ │ │ │ │ 'optim': 'paged_adamw_32bit', │ │ │ │ │ │ │ │ 'logging_steps': 1, │ │ │ │ │ │ │ │ 'learning_rate': 0.0002, │ │ │ │ │ │ │ │ 'bf16': True, │ │ │ │ │ │ │ │ 'tf32': True, │ │ │ │ │ │ │ │ 'max_grad_norm': 0.3, │ │ │ │ │ │ │ │ ... 
+2 │ │ │ │ │ │ │ }, │ │ │ │ │ │ │ 'sft_args': {'max_seq_length': 1024} │ │ │ │ │ │ }, │ │ │ │ │ │ 'inference': { │ │ │ │ │ │ │ 'max_new_tokens': 256, │ │ │ │ │ │ │ 'use_cache': True, │ │ │ │ │ │ │ 'do_sample': True, │ │ │ │ │ │ │ 'top_p': 0.9, │ │ │ │ │ │ │ 'temperature': 0.8 │ │ │ │ │ │ }, │ │ │ │ │ │ 'qa': { │ │ │ │ │ │ │ 'llm_metrics': [ │ │ │ │ │ │ │ │ 'dot_product', │ │ │ │ │ │ │ │ 'rouge_score', │ │ │ │ │ │ │ │ 'word_overlap', │ │ │ │ │ │ │ │ 'verb_percent', │ │ │ │ │ │ │ │ 'adjective_percent', │ │ │ │ │ │ │ │ 'noun_percent', │ │ │ │ │ │ │ │ 'summary_length' │ │ │ │ │ │ │ ] │ │ │ │ │ │ } │ │ │ │ │ } │ │ │ │ ] │ │ │ │ dir_helper = <llmtune.utils.save_utils.DirectoryHelper object at │ │ │ │ 0x00000266E7F23190> │ │ │ │ file = <_io.TextIOWrapper │ │ │ │ name='experiment\zYTcZ\config\config.yml' mode='r' │ │ │ │ encoding='cp1252'> │ │ │ ╰──────────────────────────────────────────────────────────────────────────╯ │ │ │ │ C:\ProgramData\anaconda3\envs\llm-ntkit\Lib\site-packages\llmtune\cli\toolki │ │ t.py:47 in run_one_experiment │ │ │ │ 44 │ RichUI.before_dataset_creation() │ │ 45 │ │ │ 46 │ with RichUI.during_dataset_creation("Injecting Values into Prompt" │ │ ❱ 47 │ │ dataset_generator = DatasetGenerator(**config.data.model_dump( │ │ 48 │ │ │ 49 │ _ = dataset_generator.train_columns │ │ 50 │ test_column = dataset_generator.test_column │ │ │ │ ╭───────────────────────────────── locals ─────────────────────────────────╮ │ │ │ config = Config( │ │ │ │ │ save_dir='./experiment/', │ │ │ │ │ ablation=AblationConfig( │ │ │ │ │ │ use_ablate=False, │ │ │ │ │ │ study_name='ablation' │ │ │ │ │ ), │ │ │ │ │ data=DataConfig( │ │ │ │ │ │ file_type='csv', │ │ │ │ │ │ │ │ │ │ path='C:\Users\Administrator\Pictures\lll_tk\jag_ful… │ │ │ │ │ │ prompt='Below is an instruction that describes a │ │ │ │ task. 
Write a response that appropriat'+89, │ │ │ │ │ │ prompt_stub='{output}', │ │ │ │ │ │ train_size=500, │ │ │ │ │ │ test_size=25, │ │ │ │ │ │ train_test_split_seed=42 │ │ │ │ │ ), │ │ │ │ │ model=ModelConfig( │ │ │ │ │ │ hf_model_ckpt='facebook/opt-125m', │ │ │ │ │ │ device_map='auto', │ │ │ │ │ │ torch_dtype='bfloat16', │ │ │ │ │ │ attn_implementation=None, │ │ │ │ │ │ quantize=True, │ │ │ │ │ │ bitsandbytes=BitsAndBytesConfig( │ │ │ │ │ │ │ load_in_8bit=False, │ │ │ │ │ │ │ llm_int8_threshold=6.0, │ │ │ │ │ │ │ llm_int8_skip_modules=None, │ │ │ │ │ │ │ llm_int8_enable_fp32_cpu_offload=False, │ │ │ │ │ │ │ llm_int8_has_fp16_weight=False, │ │ │ │ │ │ │ load_in_4bit=True, │ │ │ │ │ │ │ bnb_4bit_compute_dtype='bfloat16', │ │ │ │ │ │ │ bnb_4bit_quant_type='nf4', │ │ │ │ │ │ │ bnb_4bit_use_double_quant=True │ │ │ │ │ │ ) │ │ │ │ │ ), │ │ │ │ │ lora=LoraConfig( │ │ │ │ │ │ r=32, │ │ │ │ │ │ task_type='CAUSAL_LM', │ │ │ │ │ │ lora_alpha=64, │ │ │ │ │ │ bias='none', │ │ │ │ │ │ lora_dropout=0.1, │ │ │ │ │ │ target_modules='all-linear', │ │ │ │ │ │ fan_in_fan_out=False, │ │ │ │ │ │ modules_to_save=None, │ │ │ │ │ │ layers_to_transform=None, │ │ │ │ │ │ layers_pattern=None │ │ │ │ │ ), │ │ │ │ │ training=TrainingConfig( │ │ │ │ │ │ training_args=TrainingArgs( │ │ │ │ │ │ │ num_train_epochs=1, │ │ │ │ │ │ │ per_device_train_batch_size=4, │ │ │ │ │ │ │ gradient_accumulation_steps=4, │ │ │ │ │ │ │ gradient_checkpointing=True, │ │ │ │ │ │ │ optim='paged_adamw_32bit', │ │ │ │ │ │ │ logging_steps=1, │ │ │ │ │ │ │ learning_rate=0.0002, │ │ │ │ │ │ │ bf16=True, │ │ │ │ │ │ │ tf32=True, │ │ │ │ │ │ │ fp16=False, │ │ │ │ │ │ │ max_grad_norm=0.3, │ │ │ │ │ │ │ warmup_ratio=0.03, │ │ │ │ │ │ │ lr_scheduler_type='constant', │ │ │ │ │ │ │ save_steps=500 │ │ │ │ │ │ ), │ │ │ │ │ │ sft_args=SftArgs( │ │ │ │ │ │ │ max_seq_length=1024, │ │ │ │ │ │ │ neftune_noise_alpha=None │ │ │ │ │ │ ) │ │ │ │ │ ), │ │ │ │ │ inference=InferenceConfig( │ │ │ │ │ │ max_length=None, │ │ │ │ │ │ 
max_new_tokens=256, │ │ │ │ │ │ min_length=0, │ │ │ │ │ │ min_new_tokens=None, │ │ │ │ │ │ early_stopping=False, │ │ │ │ │ │ max_time=None, │ │ �� │ │ │ do_sample=True, │ │ │ │ │ │ num_beams=1, │ │ │ │ │ │ num_beam_groups=1, │ │ │ │ │ │ penalty_alpha=None, │ │ │ │ │ │ use_cache=True, │ │ │ │ │ │ temperature=0.8, │ │ │ │ │ │ top_k=50, │ │ │ │ │ │ top_p=0.9, │ │ │ │ │ │ typical_p=1.0, │ │ │ │ │ │ epsilon_cutoff=0.0, │ │ │ │ │ │ eta_cutoff=0.0, │ │ │ │ │ │ diversity_penalty=0.0, │ │ │ │ │ │ repetition_penalty=1.0, │ │ │ │ │ │ encoder_repetition_penalty=1.0, │ │ │ │ │ │ length_penalty=1.0, │ │ │ │ │ │ no_repeat_ngram_size=0, │ │ │ │ │ │ bad_words_ids=None, │ │ │ │ │ │ force_words_ids=None, │ │ │ │ │ │ renormalize_logits=False │ │ │ │ │ ), │ │ │ │ │ qa=QaConfig( │ │ │ │ │ │ llm_metrics=[ │ │ │ │ │ │ │ 'dot_product', │ │ │ │ │ │ │ 'rouge_score', │ │ │ │ │ │ │ 'word_overlap', │ │ │ │ │ │ │ 'verb_percent', │ │ │ │ │ │ │ 'adjective_percent', │ │ │ │ │ │ │ 'noun_percent', │ │ │ │ │ │ │ 'summary_length' │ │ │ │ │ │ ] │ │ │ │ │ ) │ │ │ │ ) │ │ │ │ config_path = './config.yml' │ │ │ │ dir_helper = <llmtune.utils.save_utils.DirectoryHelper object at │ │ │ │ 0x00000266E7A75C90> │ │ │ ╰──────────────────────────────────────────────────────────────────────────╯ │ │ │ │ C:\ProgramData\anaconda3\envs\llm-ntkit\Lib\site-packages\llmtune\data\datas │ │ et_generator.py:27 in init │ │ │ │ 24 │ │ self.ingestor: Ingestor = get_ingestor(file_type) │ │ 25 │ │ self.ingestor: Ingestor = self.ingestor(path) │ │ 26 │ │ │ │ ❱ 27 │ │ self.dataset: Dataset = self.ingestor.to_dataset() │ │ 28 │ │ self.prompt: str = prompt │ │ 29 │ │ self.prompt_stub: str = prompt_stub │ │ 30 │ │ self.test_size = test_size │ │ │ │ ╭───────────────────────────────── locals ─────────────────────────────────╮ │ │ │ file_type = 'csv' │ │ │ │ path = 'C:\Users\Administrator\Pictures\lll_tk\ja… │ │ │ │ prompt = 'Below is an instruction that describes a task. 
│ │ │ │ Write a response that appropriat'+89 │ │ │ │ prompt_stub = '{output}' │ │ │ │ self = <llmtune.data.dataset_generator.DatasetGenerator │ │ │ │ object at 0x00000266E7F54050> │ │ │ │ test_size = 25 │ │ │ │ train_size = 500 │ │ │ │ train_test_split_seed = 42 │ │ │ ╰──────────────────────────────────────────────────────────────────────────╯ │ │ │ │ C:\ProgramData\anaconda3\envs\llm-ntkit\Lib\site-packages\llmtune\data\inges │ │ tor.py:67 in to_dataset │ │ │ │ 64 │ │ │ │ yield row │ │ 65 │ │ │ 66 │ def to_dataset(self) -> Dataset: │ │ ❱ 67 │ │ return Dataset.from_generator(self._csv_generator) │ │ 68 │ │ 69 │ │ 70 class HuggingfaceIngestor(Ingestor): │ │ │ �� ╭──────────────────────────────── locals ─────────────────────────────────╮ │ │ │ self = <llmtune.data.ingestor.CsvIngestor object at 0x00000266E7F23E90> │ │ │ ╰─────────────────────────────────────────────────────────────────────────╯ │ │ │ │ C:\ProgramData\anaconda3\envs\llm-ntkit\Lib\site-packages\datasets\arrow_dat │ │ aset.py:1125 in from_generator │ │ │ │ 1122 │ │ │ gen_kwargs=gen_kwargs, │ │ 1123 │ │ │ num_proc=num_proc, │ │ 1124 │ │ │ **kwargs, │ │ ❱ 1125 │ │ ).read() │ │ 1126 │ │ │ 1127 │ @staticmethod │ │ 1128 │ def from_json( │ │ │ │ ╭───────────────────────────────── locals ─────────────────────────────────╮ │ │ │ cache_dir = None │ │ │ │ features = None │ │ │ │ gen_kwargs = None │ │ │ │ generator = <bound method CsvIngestor._csv_generator │ │ │ │ of <llmtune.data.ingestor.CsvIngestor │ │ │ │ object at 0x00000266E7F23E90>> │ │ │ │ GeneratorDatasetInputStream = <class │ │ │ │ 'datasets.io.generator.GeneratorDatasetIn… │ │ │ │ keep_in_memory = False │ │ │ │ kwargs = {} │ │ │ │ num_proc = None │ │ │ ╰──────────────────────────────────────────────────────────────────────────╯ │ │ │ │ C:\ProgramData\anaconda3\envs\llm-ntkit\Lib\site-packages\datasets\io\genera │ │ tor.py:47 in read │ │ │ │ 44 │ │ │ verification_mode = None │ │ 45 │ │ │ base_path = None │ │ 46 │ │ │ │ │ ❱ 47 │ │ │ 
self.builder.download_and_prepare( │ │ 48 │ │ │ │ download_config=download_config, │ │ 49 │ │ │ │ download_mode=download_mode, │ │ 50 │ │ │ │ verification_mode=verification_mode, │ │ │ │ ╭───────────────────────────────── locals ─────────────────────────────────╮ │ │ │ base_path = None │ │ │ │ download_config = None │ │ │ │ download_mode = None │ │ │ │ self = <datasets.io.generator.GeneratorDatasetInputStream │ │ │ │ object at 0x00000266E7F23B90> │ │ │ │ verification_mode = None │ │ │ ╰──────────────────────────────────────────────────────────────────────────╯ │ │ │ │ C:\ProgramData\anaconda3\envs\llm-ntkit\Lib\site-packages\datasets\builder.p │ │ y:1027 in download_and_prepare │ │ │ │ 1024 │ │ │ │ │ │ │ prepare_split_kwargs["max_shard_size"] = │ │ 1025 │ │ │ │ │ │ if num_proc is not None: │ │ 1026 │ │ │ │ │ │ │ prepare_split_kwargs["num_proc"] = num_pr │ │ ❱ 1027 │ │ │ │ │ │ self._download_and_prepare( │ │ 1028 │ │ │ │ │ │ │ dl_manager=dl_manager, │ │ 1029 │ │ │ │ │ │ │ verification_mode=verification_mode, │ │ 1030 │ │ │ │ │ │ │ **prepare_split_kwargs, │ │ │ │ ╭───────────────────────────────── locals ─────────────────────────────────╮ │ │ │ _dest = 'C:/Users/Administrator/.cache/huggingfac… │ │ │ │ base_path = None │ │ │ │ data_exists = False │ │ │ │ dl_manager = <datasets.download.download_manager.Downl… │ │ │ │ object at 0x00000266E8006650> │ │ │ │ download_and_prepare_kwargs = {} │ │ │ │ download_config = DownloadConfig( │ │ │ │ │ │ │ │ │ cache_dir='C:\Users\Administrator\.cac… │ │ │ │ │ force_download=False, │ │ │ │ │ resume_download=False, │ │ │ │ │ local_files_only=False, │ │ │ │ │ proxies=None, │ │ │ │ │ user_agent=None, │ │ │ │ │ extract_compressed_file=False, │ │ │ │ │ force_extract=False, │ │ │ │ │ delete_extracted=False, │ │ │ │ │ extract_on_the_fly=False, │ │ │ │ │ use_etag=False, │ │ │ │ │ num_proc=None, │ │ │ │ │ max_retries=1, │ │ │ │ │ token=None, │ │ │ │ │ ignore_url_params=False, │ │ │ │ │ storage_options={ │ │ │ │ │ │ 'hf': { │ │ │ │ │ │ │ 
'token': None, │ │ │ │ │ │ │ 'endpoint': │ │ │ │ 'https://huggingface.co' │ │ │ │ │ │ } │ │ │ │ │ }, │ │ │ │ │ download_desc=None, │ │ │ │ │ disable_tqdm=False │ │ │ │ ) │ │ │ │ download_mode = <DownloadMode.REUSE_DATASET_IF_EXISTS: │ │ │ │ 'reuse_dataset_if_exists'> │ │ │ │ downloaded_from_gcs = False │ │ │ │ file_format = 'arrow' │ │ │ │ fs = <fsspec.implementations.local.LocalFileSy… │ │ │ │ object at 0x00000266E7887D50> │ │ │ │ ignore_verifications = 'deprecated' │ │ │ │ incomplete_dir = <function │ │ │ │ DatasetBuilder.download_and_prepare.<loca… │ │ │ │ at 0x00000266E7FFE3E0> │ │ │ │ is_local = True │ │ │ │ lock_path = 'C:/Users/Administrator/.cache/huggingfac… │ │ │ │ max_shard_size = None │ │ │ │ num_proc = None │ │ │ │ output_dir = 'C:/Users/Administrator/.cache/huggingfac�� │ │ │ │ prepare_split_kwargs = {'file_format': 'arrow'} │ │ │ │ self = <datasets.packaged_modules.generator.gene… │ │ │ │ object at 0x00000266E7F23AD0> │ │ │ │ storage_options = None │ │ │ │ tmp_output_dir = 'C:/Users/Administrator/.cache/huggingfac… │ │ │ │ token = None │ │ │ │ try_from_hf_gcs = False │ │ │ │ use_auth_token = 'deprecated' │ │ │ │ verification_mode = <VerificationMode.BASIC_CHECKS: │ │ │ │ 'basic_checks'> │ │ │ ╰──────────────────────────────────────────────────────────────────────────╯ │ │ │ │ C:\ProgramData\anaconda3\envs\llm-ntkit\Lib\site-packages\datasets\builder.p │ │ y:1789 in _download_and_prepare │ │ │ │ 1786 │ │ yield job_id, True, (total_num_examples, total_num_bytes, wri │ │ 1787 │ │ │ 1788 │ def _download_and_prepare(self, dl_manager, verification_mode, ** │ │ ❱ 1789 │ │ super()._download_and_prepare( │ │ 1790 │ │ │ dl_manager, │ │ 1791 │ │ │ verification_mode, │ │ 1792 │ │ │ check_duplicate_keys=verification_mode == VerificationMod │ │ │ │ ╭───────────────────────────────── locals ─────────────────────────────────╮ │ │ │ dl_manager = <datasets.download.download_manager.DownloadMan… │ │ │ │ object at 0x00000266E8006650> │ │ │ │ prepare_splits_kwargs = 
{'file_format': 'arrow'} │ │ │ │ self = <datasets.packaged_modules.generator.generator.… │ │ │ │ object at 0x00000266E7F23AD0> │ │ │ │ verification_mode = <VerificationMode.BASIC_CHECKS: 'basic_checks'> │ │ │ ╰──────────────────────────────────────────────────────────────────────────╯ │ │ │ │ C:\ProgramData\anaconda3\envs\llm-ntkit\Lib\site-packages\datasets\builder.p │ │ y:1122 in _download_and_prepare │ │ │ │ 1119 │ │ │ │ │ 1120 │ │ │ try: │ │ 1121 │ │ │ │ # Prepare split will record examples associated to th │ │ ❱ 1122 │ │ │ │ self.prepare_split(split_generator, **prepare_split │ │ 1123 │ │ │ except OSError as e: │ │ 1124 │ │ │ │ raise OSError( │ │ 1125 │ │ │ │ │ "Cannot find data file. " │ │ │ │ ╭───────────────────────────────── locals ─────────────────────────────────╮ │ │ │ dl_manager = <datasets.download.download_manager.DownloadM… │ │ │ │ object at 0x00000266E8006650> │ │ │ │ prepare_split_kwargs = { │ │ │ │ │ 'check_duplicate_keys': True, │ │ │ │ │ 'file_format': 'arrow' │ │ │ │ } │ │ │ │ self = <datasets.packaged_modules.generator.generato… │ │ │ │ object at 0x00000266E7F23AD0> │ │ │ │ split_dict = { │ │ │ │ │ 'train': SplitInfo( │ │ │ │ │ │ name='train', │ │ │ │ │ │ num_bytes=0, │ │ │ │ │ │ num_examples=0, │ │ │ │ │ │ shard_lengths=None, │ │ │ │ │ │ dataset_name='generator' │ │ │ │ │ ) │ │ │ │ } │ │ │ │ split_generator = SplitGenerator( │ │ │ │ │ name='train', │ │ │ │ │ gen_kwargs={}, │ │ │ │ │ split_info=SplitInfo( │ │ │ │ │ │ name='train', │ │ │ │ │ │ num_bytes=0, │ │ │ │ │ │ num_examples=0, │ │ │ │ │ │ shard_lengths=None, │ │ │ │ │ │ dataset_name='generator' │ │ │ │ │ ) │ │ │ │ ) │ │ │ │ split_generators = [ │ │ │ │ │ SplitGenerator( │ │ │ │ │ │ name='train', │ │ │ │ │ │ gen_kwargs={}, │ │ │ │ │ │ split_info=SplitInfo( │ │ │ │ │ │ │ name='train', │ │ │ │ │ │ │ num_bytes=0, │ │ │ │ │ │ │ num_examples=0, │ │ │ │ │ │ │ shard_lengths=None, │ │ │ │ │ │ │ dataset_name='generator' │ │ │ │ │ │ ) │ │ │ │ │ ) │ │ │ │ ] │ │ │ │ split_generators_kwargs = {} │ │ 
│ │ verification_mode = <VerificationMode.BASIC_CHECKS: │ │ │ │ 'basic_checks'> │ │ │ ╰──────────────────────────────────────────────────────────────────────────╯ │ │ │ │ C:\ProgramData\anaconda3\envs\llm-ntkit\Lib\site-packages\datasets\builder.p │ │ y:1627 in _prepare_split │ │ │ │ 1624 │ │ │ gen_kwargs = split_generator.gen_kwargs │ │ 1625 │ │ │ job_id = 0 │ │ 1626 │ │ │ with pbar: │ │ ❱ 1627 │ │ │ │ for job_id, done, content in self._prepare_split_sing │ │ 1628 │ │ │ │ │ gen_kwargs=gen_kwargs, job_id=job_id, **prepare │ │ 1629 │ │ │ │ ): │ │ 1630 │ │ │ │ │ if done: │ │ │ │ ╭───────────────────────────────── locals ─────────────────────────────────╮ │ │ │ _prepare_split_args = { │ │ │ │ │ 'fpath': │ │ │ │ 'C:/Users/Administrator/.cache/huggingface/datas… │ │ │ │ │ 'file_format': 'arrow', │ �� │ │ │ 'max_shard_size': 500000000, │ │ │ │ │ 'split_info': SplitInfo( │ │ │ │ │ │ name='train', │ │ │ │ │ │ num_bytes=0, │ │ │ │ │ │ num_examples=0, │ │ │ │ │ │ shard_lengths=None, │ │ │ │ │ │ dataset_name='generator' │ │ │ │ │ ), │ │ │ │ │ 'check_duplicate_keys': True │ │ │ │ } │ │ │ │ check_duplicate_keys = True │ │ │ │ content = 0 │ │ │ │ done = False │ │ │ │ file_format = 'arrow' │ │ │ │ fname = 'generator-train-JJJJJ-SSSSS-of-NNNNN.arrow' │ │ │ │ fpath = 'C:/Users/Administrator/.cache/huggingface/datas… │ │ │ │ gen_kwargs = {} │ │ │ │ job_id = 0 │ │ │ │ max_shard_size = 500000000 │ │ │ │ num_proc = None │ │ │ │ pbar = <datasets.utils.tqdm.tqdm object at │ │ │ │ 0x00000266E8006190> │ │ │ │ result = None │ │ │ │ self = <datasets.packaged_modules.generator.generator.G… │ │ │ │ object at 0x00000266E7F23AD0> │ │ │ │ split_generator = SplitGenerator( │ │ │ │ │ name='train', │ │ │ │ │ gen_kwargs={}, │ │ │ │ │ split_info=SplitInfo( │ │ │ │ │ │ name='train', │ │ │ │ │ │ num_bytes=0, │ │ │ │ │ │ num_examples=0, │ │ │ │ │ │ shard_lengths=None, │ │ │ │ │ │ dataset_name='generator' │ │ │ │ │ ) │ │ │ │ ) │ │ │ │ split_info = SplitInfo( │ │ │ │ │ name='train', │ │ │ │ │ num_bytes=0, 
│ │ │ │ │ num_examples=0, │ │ │ │ │ shard_lengths=None, │ │ │ │ │ dataset_name='generator' │ │ │ │ ) │ │ │ │ SUFFIX = '-JJJJJ-SSSSS-of-NNNNN' │ │ │ ╰──────────────────────────────────────────────────────────────────────────╯ │ │ │ │ C:\ProgramData\anaconda3\envs\llm-ntkit\Lib\site-packages\datasets\builder.p │ │ y:1784 in _prepare_split_single │ │ │ │ 1781 │ │ │ # Ignore the writer's error for no examples written to th │ │ 1782 │ │ │ if isinstance(e, SchemaInferenceError) and e.context │ │ 1783 │ │ │ │ e = e.context │ │ ❱ 1784 │ │ │ raise DatasetGenerationError("An error occurred while gen │ │ 1785 │ │ │ │ 1786 │ │ yield job_id, True, (total_num_examples, total_num_bytes, wri │ │ 1787 │ │ │ │ ╭───────────────────────────────── locals ─────────────────────────────────╮ │ │ │ _time = 1723796203.1039772 │ │ │ │ check_duplicate_keys = True │ │ │ │ embed_local_files = False │ │ │ │ file_format = 'arrow' │ │ │ │ fpath = 'C:/Users/Administrator/.cache/huggingfa… │ │ │ │ gen_kwargs = {} │ │ │ │ generator = <generator object │ │ │ │ Generator._generate_examples at │ │ │ │ 0x00000266E8020440> │ │ │ │ job_id = 0 │ │ │ │ max_shard_size = 500000000 │ │ │ │ num_examples_progress_update = 0 │ │ │ │ num_shards = 1 │ │ │ │ self = <datasets.packaged_modules.generator.gen… │ │ │ │ object at 0x00000266E7F23AD0> │ │ │ │ shard_id = 0 │ │ │ │ shard_lengths = [] │ │ │ │ split_info = SplitInfo( │ │ │ │ │ name='train', │ │ │ │ │ num_bytes=0, │ │ │ │ │ num_examples=0, │ │ │ │ │ shard_lengths=None, │ │ │ │ │ dataset_name='generator' │ │ │ │ ) │ │ │ │ total_num_bytes = 0 │ │ │ │ total_num_examples = 0 │ │ │ │ writer = <datasets.arrow_writer.ArrowWriter object │ │ │ │ at 0x00000266E8004510> │ │ │ │ writer_class = <class │ │ │ │ 'datasets.arrow_writer.ArrowWriter'> │ │ │ ╰──────────────────────────────────────────────────────────────────────────╯ │ ╰──────────────────────────────────────────────────────────────────────────────╯ DatasetGenerationError: An error occurred while generating the 
dataset PS C:\Users\Administrator\Pictures\lll_tk>

Aug 16 '24 08:08 jeetendraabvv

LLM-Finetuning-Toolkit LLM-Finetuning-Toolkit copied to clipboard

UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 101: character maps to <undefined>

LLM-Finetuning-Toolkit
LLM-Finetuning-Toolkit copied to clipboard