I ran the following command in the terminal and encountered an assertion error.
torchrun --standalone --nproc_per_node 3 scripts/inference.py configs/opensora/inference/16x512x512.py --ckpt-path OpenSora-v1-HQ-16x512x512.pth --prompt-path ./assets/texts/test_1.txt
Traceback (most recent call last):
File "/root/autodl-tmp/Open-Sora/scripts/inference.py", line 112, in
main()
File "/root/autodl-tmp/Open-Sora/scripts/inference.py", line 58, in main
model = build_module(
File "/root/miniconda3/lib/python3.10/site-packages/opensora/registry.py", line 22, in build_module
return builder.build(cfg)
File "/root/miniconda3/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build
return self.build_func(cfg, *args, **kwargs, registry=self)
File "/root/miniconda3/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
obj = obj_cls(**args) # type: ignore
File "/root/miniconda3/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 385, in STDiT_XL_2
model = STDiT(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 181, in init
[
File "/root/miniconda3/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 182, in
STDiTBlock(
File "/root/miniconda3/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 78, in init
assert d_t % sp_size == 0
AssertionError
[2024-04-10 08:02:23,334] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 1641) of binary: /root/miniconda3/bin/python
Traceback (most recent call last):
File "/root/miniconda3/bin/torchrun", line 8, in
sys.exit(main())
File "/root/miniconda3/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/init.py", line 347, in wrapper
return f(*args, **kwargs)
File "/root/miniconda3/lib/python3.10/site-packages/torch/distributed/run.py", line 812, in main
run(args)
File "/root/miniconda3/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run
elastic_launch(
File "/root/miniconda3/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in call
return launch_agent(self._config, self._entrypoint, list(args))
File "/root/miniconda3/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
I can run 16x256x256 on 1 GPU without issues, and I get the same error when running 16x256x256 on 3 GPUs.
Please stick to an nproc_per_node that evenly divides 16 — e.g. 1, 2, 4, 8, or 16 — for now (or set it to 1). The reason is that the temporal dimension of the DiT attention block is 16, which is not divisible by a sequence-parallelism degree of 3, hence the failed `assert d_t % sp_size == 0`.
Apr 14
'24 15:04
JThh