OneTrainer
OneTrainer copied to clipboard
[Bug]: Error when starting training
I'm getting an error each time I try to run the training process. For context, I'm using Windows 11 with an RTX 3090, and both A1111 and ComfyUI work fine on my machine.
OneTrainer installs fine without any errors, but I get an error when I try to start the actual training process inside the GUI. I've tried reinstalling it, but that doesn't fix the issue. I've also tried upgrading transformers from 4.36.2 to 4.40.1 (per the attached pip freeze log), but that doesn't seem to fix it either, so I doubt the issue is related to that. I would really appreciate some guidance, as I've searched past bug reports and Google but can't find anything concrete that helps.
Config
I'm using the following config:
{
"__version": 3,
"training_method": "FINE_TUNE",
"model_type": "STABLE_DIFFUSION_XL_10_BASE",
"debug_mode": false,
"debug_dir": "debug",
"workspace_dir": "Z:/AI_Image_Generation/OneTrainer/OneTrainerWorkspace",
"cache_dir": "Z:/AI_Image_Generation/OneTrainer/OneTrainerWorkspace/cache",
"tensorboard": false,
"tensorboard_expose": false,
"continue_last_backup": false,
"include_train_config": "NONE",
"base_model_name": "Z:/AI_Image_Generation/Stability Matrix_V2/Data/Models/StableDiffusion/RealVisXL_V4.0.safetensors",
"weight_dtype": "BFLOAT_16",
"output_dtype": "BFLOAT_16",
"output_model_format": "SAFETENSORS",
"output_model_destination": "Z:/AI_Image_Generation/OneTrainer/OneTrainerWorkspace/tier1_fast_GH.safetensors",
"gradient_checkpointing": true,
"force_circular_padding": false,
"concept_file_name": "training_concepts/concepts.json",
"concepts": null,
"circular_mask_generation": false,
"random_rotate_and_crop": false,
"aspect_ratio_bucketing": false,
"latent_caching": true,
"clear_cache_before_training": false,
"learning_rate_scheduler": "CONSTANT",
"learning_rate": 1e-05,
"learning_rate_warmup_steps": 200,
"learning_rate_cycles": 1,
"epochs": 250,
"batch_size": 1,
"gradient_accumulation_steps": 1,
"ema": "OFF",
"ema_decay": 0.999,
"ema_update_step_interval": 1,
"dataloader_threads": 2,
"train_device": "cuda",
"temp_device": "cpu",
"train_dtype": "BFLOAT_16",
"fallback_train_dtype": "FLOAT_32",
"enable_autocast_cache": true,
"only_cache": false,
"resolution": "1024",
"attention_mechanism": "DEFAULT",
"align_prop": false,
"align_prop_probability": 0.1,
"align_prop_loss": "AESTHETIC",
"align_prop_weight": 0.01,
"align_prop_steps": 20,
"align_prop_truncate_steps": 0.5,
"align_prop_cfg_scale": 7.0,
"mse_strength": 1.0,
"mae_strength": 0.0,
"vb_loss_strength": 1.0,
"loss_weight_fn": "CONSTANT",
"loss_weight_strength": 5.0,
"dropout_probability": 0.0,
"loss_scaler": "NONE",
"learning_rate_scaler": "NONE",
"offset_noise_weight": 0.0,
"perturbation_noise_weight": 0.0,
"rescale_noise_scheduler_to_zero_terminal_snr": false,
"force_v_prediction": false,
"force_epsilon_prediction": false,
"min_noising_strength": 0.0,
"max_noising_strength": 1.0,
"noising_weight": 0.0,
"noising_bias": 0.5,
"unet": {
"__version": 0,
"model_name": "",
"train": true,
"stop_training_after": 10000,
"stop_training_after_unit": "EPOCH",
"learning_rate": 1e-05,
"weight_dtype": "BFLOAT_16"
},
"prior": {
"__version": 0,
"model_name": "",
"train": true,
"stop_training_after": 10000,
"stop_training_after_unit": "EPOCH",
"learning_rate": null,
"weight_dtype": "NONE"
},
"text_encoder": {
"__version": 0,
"model_name": "",
"train": true,
"stop_training_after": 250,
"stop_training_after_unit": "NEVER",
"learning_rate": 3e-06,
"weight_dtype": "BFLOAT_16"
},
"text_encoder_layer_skip": 0,
"text_encoder_2": {
"__version": 0,
"model_name": "",
"train": false,
"stop_training_after": 30,
"stop_training_after_unit": "EPOCH",
"learning_rate": null,
"weight_dtype": "BFLOAT_16"
},
"text_encoder_2_layer_skip": 0,
"vae": {
"__version": 0,
"model_name": "",
"train": true,
"stop_training_after": null,
"stop_training_after_unit": "NEVER",
"learning_rate": null,
"weight_dtype": "FLOAT_32"
},
"effnet_encoder": {
"__version": 0,
"model_name": "",
"train": true,
"stop_training_after": null,
"stop_training_after_unit": "NEVER",
"learning_rate": null,
"weight_dtype": "NONE"
},
"decoder": {
"__version": 0,
"model_name": "",
"train": true,
"stop_training_after": null,
"stop_training_after_unit": "NEVER",
"learning_rate": null,
"weight_dtype": "NONE"
},
"decoder_text_encoder": {
"__version": 0,
"model_name": "",
"train": true,
"stop_training_after": null,
"stop_training_after_unit": "NEVER",
"learning_rate": null,
"weight_dtype": "NONE"
},
"decoder_vqgan": {
"__version": 0,
"model_name": "",
"train": true,
"stop_training_after": null,
"stop_training_after_unit": "NEVER",
"learning_rate": null,
"weight_dtype": "NONE"
},
"masked_training": false,
"unmasked_probability": 0.1,
"unmasked_weight": 0.1,
"normalize_masked_area_loss": false,
"embedding_learning_rate": null,
"preserve_embedding_norm": false,
"embedding": {
"__version": 0,
"uuid": "f561c996-8a3f-4ff4-9b44-c819f14594f1",
"model_name": "",
"placeholder": "<embedding>",
"train": true,
"stop_training_after": null,
"stop_training_after_unit": "NEVER",
"token_count": 1,
"initial_embedding_text": "*"
},
"additional_embeddings": [],
"embedding_weight_dtype": "FLOAT_32",
"lora_model_name": "",
"lora_rank": 16,
"lora_alpha": 1.0,
"lora_weight_dtype": "FLOAT_32",
"optimizer": {
"__version": 0,
"optimizer": "ADAFACTOR",
"adam_w_mode": false,
"alpha": null,
"amsgrad": false,
"beta1": null,
"beta2": null,
"beta3": null,
"bias_correction": false,
"block_wise": false,
"capturable": false,
"centered": false,
"clip_threshold": 1.0,
"d0": null,
"d_coef": null,
"dampening": null,
"decay_rate": -0.8,
"decouple": false,
"differentiable": false,
"eps": 1e-30,
"eps2": 0.001,
"foreach": false,
"fsdp_in_use": false,
"fused": false,
"fused_back_pass": false,
"growth_rate": null,
"initial_accumulator_value": null,
"is_paged": false,
"log_every": null,
"lr_decay": null,
"max_unorm": null,
"maximize": false,
"min_8bit_size": null,
"momentum": null,
"nesterov": false,
"no_prox": false,
"optim_bits": null,
"percentile_clipping": null,
"relative_step": false,
"safeguard_warmup": false,
"scale_parameter": false,
"stochastic_rounding": false,
"use_bias_correction": false,
"use_triton": false,
"warmup_init": false,
"weight_decay": 0.01
},
"optimizer_defaults": {
"ADAFACTOR": {
"__version": 0,
"optimizer": "ADAFACTOR",
"adam_w_mode": false,
"alpha": null,
"amsgrad": false,
"beta1": null,
"beta2": null,
"beta3": null,
"bias_correction": false,
"block_wise": false,
"capturable": false,
"centered": false,
"clip_threshold": 1.0,
"d0": null,
"d_coef": null,
"dampening": null,
"decay_rate": -0.8,
"decouple": false,
"differentiable": false,
"eps": 1e-30,
"eps2": 0.001,
"foreach": false,
"fsdp_in_use": false,
"fused": false,
"fused_back_pass": false,
"growth_rate": null,
"initial_accumulator_value": null,
"is_paged": false,
"log_every": null,
"lr_decay": null,
"max_unorm": null,
"maximize": false,
"min_8bit_size": null,
"momentum": null,
"nesterov": false,
"no_prox": false,
"optim_bits": null,
"percentile_clipping": null,
"relative_step": false,
"safeguard_warmup": false,
"scale_parameter": false,
"stochastic_rounding": false,
"use_bias_correction": false,
"use_triton": false,
"warmup_init": false,
"weight_decay": 0.01
}
},
"sample_definition_file_name": "training_samples/samples.json",
"samples": null,
"sample_after": 10,
"sample_after_unit": "MINUTE",
"sample_image_format": "JPG",
"samples_to_tensorboard": false,
"non_ema_sampling": false,
"backup_after": 30,
"backup_after_unit": "NEVER",
"rolling_backup": false,
"rolling_backup_count": 3,
"backup_before_save": false,
"save_after": 25,
"save_after_unit": "EPOCH",
"save_filename_prefix": "gh_tier1_fast"
}
Error log output
activating venv Z:\AI_Image_Generation\OneTrainer\venv
Using Python "Z:\AI_Image_Generation\OneTrainer\venv\Scripts\python.exe"
Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\transformers\utils\generic.py:441: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
_torch_pytree._register_pytree_node(
Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\transformers\utils\generic.py:309: UserWarning: torch.utils._pytree._register_pytree_node is deprecated. Please use torch.utils._pytree.register_pytree_node instead.
_torch_pytree._register_pytree_node(
Traceback (most recent call last):
File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 189, in load
self.__load_internal(model, model_type, weight_dtypes, model_names.base_model, model_names.vae_model)
File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 26, in __load_internal
self.__load_diffusers(model, model_type, weight_dtypes, base_model_name, vae_model_name)
File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 36, in __load_diffusers
tokenizer_1 = CLIPTokenizer.from_pretrained(
File "Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\transformers\tokenization_utils_base.py", line 1925, in from_pretrained
raise ValueError(
ValueError: Calling CLIPTokenizer.from_pretrained() with the path to a single file or url is not supported for this tokenizer. Use a model identifier or the path to a directory instead.
Traceback (most recent call last):
File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 195, in load
self.__load_diffusers(model, model_type, weight_dtypes, model_names.base_model, model_names.vae_model)
File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 36, in __load_diffusers
tokenizer_1 = CLIPTokenizer.from_pretrained(
File "Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\transformers\tokenization_utils_base.py", line 1925, in from_pretrained
raise ValueError(
ValueError: Calling CLIPTokenizer.from_pretrained() with the path to a single file or url is not supported for this tokenizer. Use a model identifier or the path to a directory instead.
Traceback (most recent call last):
File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file_utils.py", line 1479, in create_text_encoders_and_tokenizers_from_ldm
tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
File "Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\transformers\tokenization_utils_base.py", line 1970, in from_pretrained
tokenizer_config = json.load(reader)
File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\__init__.py", line 293, in load
return loads(fp.read(),
File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\__init__.py", line 346, in loads
return _default_decoder.decode(s)
File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 201, in load
self.__load_safetensors(model, model_type, weight_dtypes, model_names.base_model, model_names.vae_model)
File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 145, in __load_safetensors
pipeline = StableDiffusionXLPipeline.from_single_file(
File "Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\huggingface_hub\utils\_validators.py", line 118, in _inner_fn
return fn(*args, **kwargs)
File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file.py", line 289, in from_single_file
components = build_sub_model_components(
File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file.py", line 102, in build_sub_model_components
text_encoder_components = create_text_encoders_and_tokenizers_from_ldm(
File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file_utils.py", line 1485, in create_text_encoders_and_tokenizers_from_ldm
raise ValueError(
ValueError: With local_files_only set to False, you must first locally save the text_encoder and tokenizer in the following path: 'openai/clip-vit-large-patch14'.
Traceback (most recent call last):
File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file_utils.py", line 1479, in create_text_encoders_and_tokenizers_from_ldm
tokenizer = CLIPTokenizer.from_pretrained(config_name, local_files_only=local_files_only)
File "Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\transformers\tokenization_utils_base.py", line 1970, in from_pretrained
tokenizer_config = json.load(reader)
File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\__init__.py", line 293, in load
return loads(fp.read(),
File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\__init__.py", line 346, in loads
return _default_decoder.decode(s)
File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\decoder.py", line 337, in decode
obj, end = self.raw_decode(s, idx=_w(s, 0).end())
File "C:\Users\GH\AppData\Local\Programs\Python\Python310\lib\json\decoder.py", line 355, in raw_decode
raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 207, in load
self.__load_ckpt(model, model_type, weight_dtypes, model_names.base_model, model_names.vae_model)
File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 104, in __load_ckpt
pipeline = StableDiffusionXLPipeline.from_single_file(
File "Z:\AI_Image_Generation\OneTrainer\venv\lib\site-packages\huggingface_hub\utils\_validators.py", line 118, in _inner_fn
return fn(*args, **kwargs)
File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file.py", line 289, in from_single_file
components = build_sub_model_components(
File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file.py", line 102, in build_sub_model_components
text_encoder_components = create_text_encoders_and_tokenizers_from_ldm(
File "Z:\AI_Image_Generation\OneTrainer\venv\src\diffusers\src\diffusers\loaders\single_file_utils.py", line 1485, in create_text_encoders_and_tokenizers_from_ldm
raise ValueError(
ValueError: With local_files_only set to False, you must first locally save the text_encoder and tokenizer in the following path: 'openai/clip-vit-large-patch14'.
Traceback (most recent call last):
File "Z:\AI_Image_Generation\OneTrainer\modules\ui\TrainUI.py", line 522, in __training_thread_function
trainer.start()
File "Z:\AI_Image_Generation\OneTrainer\modules\trainer\GenericTrainer.py", line 113, in start
self.model = self.model_loader.load(
File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\StableDiffusionXLFineTuneModelLoader.py", line 62, in load
base_model_loader.load(model, model_type, model_names, weight_dtypes)
File "Z:\AI_Image_Generation\OneTrainer\modules\modelLoader\stableDiffusionXL\StableDiffusionXLModelLoader.py", line 214, in load
raise Exception("could not load model: " + model_names.base_model)
Exception: could not load model: Z:/AI_Image_Generation/OneTrainer/RealVisXL_V4.0.safetensors
Output of pip freeze
absl-py==2.1.0 accelerate==0.27.2 aiofiles==23.2.1 aiohttp==3.9.5 aiosignal==1.3.1 altair==5.3.0 annotated-types==0.6.0 annoy-fixed==1.16.3 antlr4-python3-runtime==4.9.3 anyio==4.3.0 appdirs==1.4.4 astunparse==1.6.3 async-timeout==4.0.3 attrs==23.2.0 beautifulsoup4==4.12.2 bidict==0.23.1 bitsandbytes==0.43.0 blinker==1.7.0 braceexpand==0.1.7 cachetools==5.3.3 certifi==2024.2.2 charset-normalizer==3.3.2 clean-fid==0.1.35 click==8.1.7 clip-anytorch==2.6.0 colorama==0.4.6 coloredlogs==15.0.1 contourpy==1.2.1 cycler==0.12.1 dadaptation==3.1 dctorch==0.1.2 decorator==4.4.2 diffusers==0.27.2 distlib==0.3.8 distro==1.9.0 docker-pycreds==0.4.0 easydict==1.10 easygui==0.98.3 einops==0.7.0 einops-exts==0.0.4 entrypoints==0.4 exceptiongroup==1.2.1 face-alignment==1.4.1 facexlib==0.3.0 fairscale==0.4.13 faiss-cpu==1.7.4 fastapi==0.110.2 ffmpeg-progress-yield==0.7.8 ffmpy==0.3.2 filelock==3.13.4 filetype==1.2.0 filterpy==1.4.5 Flask==2.3.2 Flask-SocketIO==5.3.4 flatbuffers==24.3.25 fonttools==4.51.0 frozenlist==1.4.1 fsspec==2024.3.1 ftfy==6.2.0 gast==0.5.4 gitdb==4.0.11 GitPython==3.1.43 google-pasta==0.2.0 gradio==4.19.0 gradio_client==0.10.0 gradio_imageslider==0.0.20 grpcio==1.62.2 h11==0.14.0 h5py==3.11.0 httpcore==1.0.5 httpx==0.27.0 huggingface-hub==0.22.2 humanfriendly==10.0 idna==3.7 imageio==2.34.1 imageio-ffmpeg==0.4.9 imagesize==1.4.1 importlib_metadata==7.1.0 importlib_resources==6.4.0 invisible-watermark==0.2.0 itsdangerous==2.1.2 Jinja2==3.1.3 joblib==1.4.0 jsonmerge==1.9.2 jsonschema==4.21.1 jsonschema-specifications==2023.12.1 k-diffusion==0.1.1.post1 keras==3.2.1 kiwisolver==1.4.5 kornia==0.7.1 lazy_loader==0.4 libclang==18.1.1 lightning-utilities==0.11.2 lion-pytorch==0.0.6 llvmlite==0.42.0 lycoris_lora==2.2.0.post3 Markdown==3.5.2 markdown-it-py==3.0.0 MarkupSafe==2.1.5 matplotlib==3.8.3 mdurl==0.1.2 ml-dtypes==0.3.2 moviepy==1.0.3 mpmath==1.3.0 multidict==6.0.5 namex==0.0.8 networkx==3.3 ninja==1.11.1.1 numba==0.59.1 numpy==1.26.4 nvidia-ml-py==12.535.161 
nvitop==1.3.2 omegaconf==2.3.0 onnx==1.15.0 onnxruntime-gpu==1.17.1 open-clip-torch==2.24.0 openai==1.3.3 openai-clip==1.0.1 opencv-python==4.9.0.80 opt-einsum==3.3.0 optree==0.11.0 orjson==3.10.1 packaging==24.0 pandas==2.2.1 pathtools==0.1.2 pillow==10.2.0 platformdirs==3.11.0 prodigyopt==1.0 proglog==0.1.10 protobuf==4.25.3 psutil==5.9.8 pydantic==2.7.0 pydantic_core==2.18.1 pydub==0.25.1 Pygments==2.17.2 pyparsing==3.1.2 pypdfium2==4.27.0 pyreadline3==3.4.1 python-dateutil==2.9.0.post0 python-dotenv==1.0.1 python-engineio==4.9.0 python-multipart==0.0.9 python-socketio==5.11.2 pytorch-lightning==2.2.2 pytz==2024.1 PyWavelets==1.6.0 PyYAML==6.0.1 referencing==0.34.0 regex==2024.4.16 requests==2.31.0 rich==13.7.1 rpds-py==0.18.0 ruff==0.4.1 safetensors==0.4.3 scikit-image==0.23.2 scikit-learn==1.4.1.post1 scipy==1.12.0 semantic-version==2.10.0 semantra==0.1.8 sentencepiece==0.2.0 sentry-sdk==1.45.0 setproctitle==1.3.3 shellingham==1.5.4 simple-websocket==1.0.0 six==1.16.0 smmap==5.0.1 sniffio==1.3.1 soupsieve==2.5 starlette==0.37.2 sympy==1.12 tenacity==8.2.2 tensorboard==2.16.2 tensorboard-data-server==0.7.2 tensorflow==2.16.1 tensorflow-intel==2.16.1 tensorflow-io-gcs-filesystem==0.31.0 termcolor==2.4.0 threadpoolctl==3.4.0 tifffile==2024.4.18 tiktoken==0.4.0 timm==0.9.16 tk==0.1.0 tokenizers==0.19.1 toml==0.10.2 tomlkit==0.12.0 toolz==0.12.1 torch==2.2.0+cu121 torchaudio==2.2.0+cu121 torchdiffeq==0.2.3 torchmetrics==1.3.2 torchsde==0.2.6 torchvision==0.17.0+cu121 tqdm==4.66.2 trampoline==0.1.2 transformers==4.40.1 triton @ https://huggingface.co/MonsterMMORPG/SECourses/resolve/main/triton-2.1.0-cp310-cp310-win_amd64.whl typer==0.12.3 typing_extensions==4.11.0 tzdata==2024.1 urllib3==2.2.1 uvicorn==0.28.0 virtualenv==20.23.0 voluptuous==0.13.1 wandb==0.16.4 wcwidth==0.2.13 webdataset==0.2.86 websockets==11.0.3 Werkzeug==2.3.6 Wikipedia-API==0.6.0 windows-curses==2.3.2 wrapt==1.16.0 wsproto==1.2.0 xformers==0.0.24 yarl==1.9.4 zipp==3.18.1