fast-stable-diffusion
fast-stable-diffusion copied to clipboard
Training with fast-Dreambooth.py doesn't work anymore
I have been using the colab (with the Pro+ subscription on an A100 GPU) for about 5 months now. However, today, it is not working. Here is the error I get:
File "<string>", line 21, in _fwd_kernel
KeyError: ('2-.-0-.-0-83ca8b715a9dc5f32dc1110973485f64-d6252949da17ceb5f3a278a70250af13-3b85c7bef5f0a641282f3b73af50f599-3d2aedeb40d6d81c66a42791e268f98b-3498c340fd4b6ee7805fd54b882a04f5-e1f133f98d04093da2078dfc51c36b72-b26258bf01f839199e39d64851821f26-d7c06e3b46e708006c15224aac7a1378-f585402118c8a136948ce0a49cfe122c', (torch.float16, torch.float16, torch.float16, None, torch.float16, torch.float32, torch.float32, 'fp32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32', 'i32'), ('none', False, 64, False, False, True, 128, 128), (True, True, True, (False,), True, True, True, (False,), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (True, False), (False, True), (True, False), (True, False), (True, False), (True, False), (False, False), (False, False)))
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/usr/local/lib/python3.10/dist-packages/triton/compiler.py", line 937, in build_triton_ir
generator.visit(fn.parse())
File "/usr/local/lib/python3.10/dist-packages/triton/compiler.py", line 855, in visit
return super().visit(node)
File "/usr/lib/python3.10/ast.py", line 418, in visit
return visitor(node)
File "/usr/local/lib/python3.10/dist-packages/triton/compiler.py", line 183, in visit_Module
ast.NodeVisitor.generic_visit(self, node)
File "/usr/lib/python3.10/ast.py", line 426, in generic_visit
self.visit(item)
File "/usr/local/lib/python3.10/dist-packages/triton/compiler.py", line 855, in visit
return super().visit(node)
File "/usr/lib/python3.10/ast.py", line 418, in visit
return visitor(node)
File "/usr/local/lib/python3.10/dist-packages/triton/compiler.py", line 252, in visit_FunctionDef
has_ret = self.visit_compound_statement(node.body)
File "/usr/local/lib/python3.10/dist-packages/triton/compiler.py", line 177, in visit_compound_statement
self.last_ret_type = self.visit(stmt)
File "/usr/local/lib/python3.10/dist-packages/triton/compiler.py", line 855, in visit
return super().visit(node)
File "/usr/lib/python3.10/ast.py", line 418, in visit
return visitor(node)
File "/usr/local/lib/python3.10/dist-packages/triton/compiler.py", line 678, in visit_For
self.visit_compound_statement(node.body)
File "/usr/local/lib/python3.10/dist-packages/triton/compiler.py", line 177, in visit_compound_statement
self.last_ret_type = self.visit(stmt)
File "/usr/local/lib/python3.10/dist-packages/triton/compiler.py", line 855, in visit
return super().visit(node)
File "/usr/lib/python3.10/ast.py", line 418, in visit
return visitor(node)
File "/usr/local/lib/python3.10/dist-packages/triton/compiler.py", line 319, in visit_AugAssign
self.visit(assign)
File "/usr/local/lib/python3.10/dist-packages/triton/compiler.py", line 855, in visit
return super().visit(node)
File "/usr/lib/python3.10/ast.py", line 418, in visit
return visitor(node)
File "/usr/local/lib/python3.10/dist-packages/triton/compiler.py", line 301, in visit_Assign
values = self.visit(node.value)
File "/usr/local/lib/python3.10/dist-packages/triton/compiler.py", line 855, in visit
return super().visit(node)
File "/usr/lib/python3.10/ast.py", line 418, in visit
return visitor(node)
File "/usr/local/lib/python3.10/dist-packages/triton/compiler.py", line 339, in visit_BinOp
rhs = self.visit(node.right)
File "/usr/local/lib/python3.10/dist-packages/triton/compiler.py", line 855, in visit
return super().visit(node)
File "/usr/lib/python3.10/ast.py", line 418, in visit
return visitor(node)
File "/usr/local/lib/python3.10/dist-packages/triton/compiler.py", line 797, in visit_Call
return fn(*args, _builder=self.builder, **kws)
File "/usr/local/lib/python3.10/dist-packages/triton/impl/base.py", line 22, in wrapper
return fn(*args, **kwargs)
TypeError: dot() got an unexpected keyword argument 'trans_b'
Traceback (most recent call last):
File "/usr/local/bin/accelerate", line 8, in <module>
sys.exit(main())
File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/accelerate_cli.py", line 43, in main
args.func(args)
File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/launch.py", line 837, in launch_command
simple_launcher(args)
File "/usr/local/lib/python3.10/dist-packages/accelerate/commands/launch.py", line 354, in simple_launcher
raise subprocess.CalledProcessError(returncode=process.returncode, cmd=cmd)
subprocess.CalledProcessError: Command '['/usr/bin/python3', '/content/diffusers/examples/dreambooth/train_dreambooth.py', '--image_captions_filename', '--train_only_unet', '--save_starting_step=500', '--save_n_steps=0', '--Session_dir=/content/gdrive/MyDrive/Fast-Dreambooth/Sessions/clay-public', '--pretrained_model_name_or_path=/content/stable-diffusion-v2-512', '--instance_data_dir=/content/gdrive/MyDrive/Fast-Dreambooth/Sessions/clay-public/instance_images', '--output_dir=/content/models/clay-public', '--captions_dir=/content/gdrive/MyDrive/Fast-Dreambooth/Sessions/clay-public/captions', '--instance_prompt=', '--seed=174067', '--resolution=512', '--mixed_precision=fp16', '--train_batch_size=1', '--gradient_accumulation_steps=1', '--use_8bit_adam', '--learning_rate=2e-06', '--lr_scheduler=linear', '--lr_warmup_steps=0', '--max_train_steps=1500']' returned non-zero exit status 1.
Disconnect then reconnect, it should work fine
I've done this 3-4 times. It does not work.
Use the T4 and see if it works.
That works, thank you. Do you know why it doesn't work on the A100 though?
I'm not sure, I'll investigate it later
I have the same issue on A100s
@Mustaf2501 use the T4 while I fix it. I recommend not using the A100; it costs too much for a simple Dreambooth application.
@Mustaf2501 use the T4 while I fix it. I recommend not using the A100; it costs too much for a simple Dreambooth application.
It's not working on the T4. Do you have any fixes?