Inference ImportError
When I run inference, I get the following ImportError:
ImportError: /home/lz/anaconda3/envs/opensora/lib/python3.10/site-packages/fused_layer_norm_cuda.cpython-310-x86_64-linux-gnu.so: undefined symbol: _ZN2at4_ops19empty_memory_format4callEN3c108ArrayRefINS2_6SymIntEEENS2_8optionalINS2_10ScalarTypeEEENS6_INS2_6LayoutEEENS6_INS2_6DeviceEEENS6_IbEENS6_INS2_12MemoryFormatEEE
[2024-03-18 14:53:46,409] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 153812) of binary: /home/lz/anaconda3/envs/opensora/bin/python
Traceback (most recent call last):
File "/home/lz/anaconda3/envs/opensora/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==2.2.1+cu121', 'console_scripts', 'torchrun')())
File "/home/lz/anaconda3/envs/opensora/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
return f(*args, **kwargs)
File "/home/lz/anaconda3/envs/opensora/lib/python3.10/site-packages/torch/distributed/run.py", line 812, in main
run(args)
File "/home/lz/anaconda3/envs/opensora/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run
elastic_launch(
File "/home/lz/anaconda3/envs/opensora/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/lz/anaconda3/envs/opensora/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
scripts/inference.py FAILED
same problem here
Try `pip uninstall apex`; that worked for me.
Try `pip uninstall apex`; that worked for me.
I tried that and got: RuntimeError: FusedLayerNorm not available. Please install apex.
Traceback (most recent call last):
File "/usr/local/lib/python3.10/site-packages/opensora/models/layers/blocks.py", line 33, in get_layernorm
from apex.normalization import FusedLayerNorm
ModuleNotFoundError: No module named 'apex'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/output/Open-Sora/scripts/inference.py", line 112, in <module>
main()
File "/output/Open-Sora/scripts/inference.py", line 58, in main
model = build_module(
File "/usr/local/lib/python3.10/site-packages/opensora/registry.py", line 22, in build_module
return builder.build(cfg)
File "/usr/local/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build
return self.build_func(cfg, *args, **kwargs, registry=self)
File "/usr/local/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg
obj = obj_cls(**args) # type: ignore
File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 385, in STDiT_XL_2
model = STDiT(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs)
File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 181, in __init__
[
File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 182, in <listcomp>
STDiTBlock(
File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 56, in __init__
self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel)
File "/usr/local/lib/python3.10/site-packages/opensora/models/layers/blocks.py", line 37, in get_layernorm
raise RuntimeError("FusedLayerNorm not available. Please install apex.")
RuntimeError: FusedLayerNorm not available. Please install apex.
[2024-03-19 04:51:57,268] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 12284) of binary: /usr/local/bin/python
Traceback (most recent call last):
File "/usr/local/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/usr/local/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper
return f(*args, **kwargs)
File "/usr/local/lib/python3.10/site-packages/torch/distributed/run.py", line 812, in main
run(args)
File "/usr/local/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run
elastic_launch(
File "/usr/local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/usr/local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
scripts/inference.py FAILED
------------------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2024-03-19_04:51:57
host : openbayesalgo-t193qht565kb-main
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 12284)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
Try `pip uninstall apex`; that worked for me. (尝试用 pip 卸载 apex,对我有效。)
I tried that and got:
RuntimeError: FusedLayerNorm not available. Please install apex.尝试并获取RuntimeError: FusedLayerNorm not available. Please install apex.Traceback (most recent call last): File "/usr/local/lib/python3.10/site-packages/opensora/models/layers/blocks.py", line 33, in get_layernorm from apex.normalization import FusedLayerNorm ModuleNotFoundError: No module named 'apex' During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/output/Open-Sora/scripts/inference.py", line 112, in <module> main() File "/output/Open-Sora/scripts/inference.py", line 58, in main model = build_module( File "/usr/local/lib/python3.10/site-packages/opensora/registry.py", line 22, in build_module return builder.build(cfg) File "/usr/local/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build return self.build_func(cfg, *args, **kwargs, registry=self) File "/usr/local/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg obj = obj_cls(**args) # type: ignore File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 385, in STDiT_XL_2 model = STDiT(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs) File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 181, in __init__ [ File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 182, in <listcomp> STDiTBlock( File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 56, in __init__ self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel) File "/usr/local/lib/python3.10/site-packages/opensora/models/layers/blocks.py", line 37, in get_layernorm raise RuntimeError("FusedLayerNorm not available. Please install apex.") RuntimeError: FusedLayerNorm not available. Please install apex. 
[2024-03-19 04:51:57,268] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 12284) of binary: /usr/local/bin/python Traceback (most recent call last): File "/usr/local/bin/torchrun", line 8, in <module> sys.exit(main()) File "/usr/local/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper return f(*args, **kwargs) File "/usr/local/lib/python3.10/site-packages/torch/distributed/run.py", line 812, in main run(args) File "/usr/local/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run elastic_launch( File "/usr/local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in __call__ return launch_agent(self._config, self._entrypoint, list(args)) File "/usr/local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent raise ChildFailedError( torch.distributed.elastic.multiprocessing.errors.ChildFailedError: ============================================================ scripts/inference.py FAILED ------------------------------------------------------------ Failures: <NO_OTHER_FAILURES> ------------------------------------------------------------ Root Cause (first observed failure): [0]: time : 2024-03-19_04:51:57 host : openbayesalgo-t193qht565kb-main rank : 0 (local_rank: 0) exitcode : 1 (pid: 12284) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ============================================================
你把配置文件里的enable_layernorm_kernel设成False,就不需要apex了
Try `pip uninstall apex`; that worked for me. (尝试用 pip 卸载 apex,对我有效。)
I tried that and got:
RuntimeError: FusedLayerNorm not available. Please install apex.尝试并获取RuntimeError: FusedLayerNorm not available. Please install apex.Traceback (most recent call last): File "/usr/local/lib/python3.10/site-packages/opensora/models/layers/blocks.py", line 33, in get_layernorm from apex.normalization import FusedLayerNorm ModuleNotFoundError: No module named 'apex' During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/output/Open-Sora/scripts/inference.py", line 112, in <module> main() File "/output/Open-Sora/scripts/inference.py", line 58, in main model = build_module( File "/usr/local/lib/python3.10/site-packages/opensora/registry.py", line 22, in build_module return builder.build(cfg) File "/usr/local/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build return self.build_func(cfg, *args, **kwargs, registry=self) File "/usr/local/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg obj = obj_cls(**args) # type: ignore File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 385, in STDiT_XL_2 model = STDiT(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs) File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 181, in __init__ [ File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 182, in <listcomp> STDiTBlock( File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 56, in __init__ self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel) File "/usr/local/lib/python3.10/site-packages/opensora/models/layers/blocks.py", line 37, in get_layernorm raise RuntimeError("FusedLayerNorm not available. Please install apex.") RuntimeError: FusedLayerNorm not available. Please install apex. 
[2024-03-19 04:51:57,268] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 12284) of binary: /usr/local/bin/python Traceback (most recent call last): File "/usr/local/bin/torchrun", line 8, in <module> sys.exit(main()) File "/usr/local/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper return f(*args, **kwargs) File "/usr/local/lib/python3.10/site-packages/torch/distributed/run.py", line 812, in main run(args) File "/usr/local/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run elastic_launch( File "/usr/local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in __call__ return launch_agent(self._config, self._entrypoint, list(args)) File "/usr/local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent raise ChildFailedError( torch.distributed.elastic.multiprocessing.errors.ChildFailedError: ============================================================ scripts/inference.py FAILED ------------------------------------------------------------ Failures: <NO_OTHER_FAILURES> ------------------------------------------------------------ Root Cause (first observed failure): [0]: time : 2024-03-19_04:51:57 host : openbayesalgo-t193qht565kb-main rank : 0 (local_rank: 0) exitcode : 1 (pid: 12284) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ============================================================你把配置文件里的enable_layernorm_kernel设成False,就不需要apex了
我把这个改成False之后还是出现了undefined symbol的报错。
Try `pip uninstall apex`; that worked for me. (尝试用 pip 卸载 apex,对我有效。)
I tried that and got:
RuntimeError: FusedLayerNorm not available. Please install apex.尝试并获取RuntimeError: FusedLayerNorm not available. Please install apex.Traceback (most recent call last): File "/usr/local/lib/python3.10/site-packages/opensora/models/layers/blocks.py", line 33, in get_layernorm from apex.normalization import FusedLayerNorm ModuleNotFoundError: No module named 'apex' During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/output/Open-Sora/scripts/inference.py", line 112, in <module> main() File "/output/Open-Sora/scripts/inference.py", line 58, in main model = build_module( File "/usr/local/lib/python3.10/site-packages/opensora/registry.py", line 22, in build_module return builder.build(cfg) File "/usr/local/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build return self.build_func(cfg, *args, **kwargs, registry=self) File "/usr/local/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg obj = obj_cls(**args) # type: ignore File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 385, in STDiT_XL_2 model = STDiT(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs) File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 181, in __init__ [ File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 182, in <listcomp> STDiTBlock( File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 56, in __init__ self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel) File "/usr/local/lib/python3.10/site-packages/opensora/models/layers/blocks.py", line 37, in get_layernorm raise RuntimeError("FusedLayerNorm not available. Please install apex.") RuntimeError: FusedLayerNorm not available. Please install apex. 
[2024-03-19 04:51:57,268] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 12284) of binary: /usr/local/bin/python Traceback (most recent call last): File "/usr/local/bin/torchrun", line 8, in <module> sys.exit(main()) File "/usr/local/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper return f(*args, **kwargs) File "/usr/local/lib/python3.10/site-packages/torch/distributed/run.py", line 812, in main run(args) File "/usr/local/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run elastic_launch( File "/usr/local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in __call__ return launch_agent(self._config, self._entrypoint, list(args)) File "/usr/local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent raise ChildFailedError( torch.distributed.elastic.multiprocessing.errors.ChildFailedError: ============================================================ scripts/inference.py FAILED ------------------------------------------------------------ Failures: <NO_OTHER_FAILURES> ------------------------------------------------------------ Root Cause (first observed failure): [0]: time : 2024-03-19_04:51:57 host : openbayesalgo-t193qht565kb-main rank : 0 (local_rank: 0) exitcode : 1 (pid: 12284) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ============================================================你把配置文件里的enable_layernorm_kernel设成False,就不需要apex了
我把这个改成False之后还是出现了undefined symbol的报错。
你要不试试环境重新装一下,那个设成False后如果有装apex还是会去调用
Try `pip uninstall apex`; that worked for me. (尝试用 pip 卸载 apex,对我有效。)
I tried that and got:
RuntimeError: FusedLayerNorm not available. Please install apex.尝试并获取RuntimeError: FusedLayerNorm not available. Please install apex.Traceback (most recent call last): File "/usr/local/lib/python3.10/site-packages/opensora/models/layers/blocks.py", line 33, in get_layernorm from apex.normalization import FusedLayerNorm ModuleNotFoundError: No module named 'apex' During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/output/Open-Sora/scripts/inference.py", line 112, in <module> main() File "/output/Open-Sora/scripts/inference.py", line 58, in main model = build_module( File "/usr/local/lib/python3.10/site-packages/opensora/registry.py", line 22, in build_module return builder.build(cfg) File "/usr/local/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build return self.build_func(cfg, *args, **kwargs, registry=self) File "/usr/local/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg obj = obj_cls(**args) # type: ignore File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 385, in STDiT_XL_2 model = STDiT(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs) File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 181, in __init__ [ File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 182, in <listcomp> STDiTBlock( File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 56, in __init__ self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel) File "/usr/local/lib/python3.10/site-packages/opensora/models/layers/blocks.py", line 37, in get_layernorm raise RuntimeError("FusedLayerNorm not available. Please install apex.") RuntimeError: FusedLayerNorm not available. Please install apex. 
[2024-03-19 04:51:57,268] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 12284) of binary: /usr/local/bin/python Traceback (most recent call last): File "/usr/local/bin/torchrun", line 8, in <module> sys.exit(main()) File "/usr/local/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper return f(*args, **kwargs) File "/usr/local/lib/python3.10/site-packages/torch/distributed/run.py", line 812, in main run(args) File "/usr/local/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run elastic_launch( File "/usr/local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in __call__ return launch_agent(self._config, self._entrypoint, list(args)) File "/usr/local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent raise ChildFailedError( torch.distributed.elastic.multiprocessing.errors.ChildFailedError: ============================================================ scripts/inference.py FAILED ------------------------------------------------------------ Failures: <NO_OTHER_FAILURES> ------------------------------------------------------------ Root Cause (first observed failure): [0]: time : 2024-03-19_04:51:57 host : openbayesalgo-t193qht565kb-main rank : 0 (local_rank: 0) exitcode : 1 (pid: 12284) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ============================================================你把配置文件里的enable_layernorm_kernel设成False,就不需要apex了
我把这个改成False之后还是出现了undefined symbol的报错。
你要不试试环境重新装一下,那个设成False后如果有装apex还是会去调用
已确认环境里没有 apex,但还是报这个错,不清楚是否是其他地方有版本冲突。CUDA 用的是 12.1,Python 是 3.9.2,torch 是 2.2.1+cu121,系统信息:Distributor ID: Debian;Description: Debian GNU/Linux 11 (bullseye);Release: 11;Codename: bullseye
我重启 docker 之后,改为用 `MAX_JOBS=1 pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers` 安装 xformers,问题就解决了。
完整命令
pip install packaging ninja
pip install flash-attn --no-build-isolation
pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git
pip install -v .
apt-get update
apt-get install libgl1-mesa-glx
MAX_JOBS=1 pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
Try `pip uninstall apex`; that worked for me. (尝试用 pip 卸载 apex,对我有效。)
I tried that and got:
RuntimeError: FusedLayerNorm not available. Please install apex.尝试并获取RuntimeError: FusedLayerNorm not available. Please install apex.Traceback (most recent call last): File "/usr/local/lib/python3.10/site-packages/opensora/models/layers/blocks.py", line 33, in get_layernorm from apex.normalization import FusedLayerNorm ModuleNotFoundError: No module named 'apex' During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/output/Open-Sora/scripts/inference.py", line 112, in <module> main() File "/output/Open-Sora/scripts/inference.py", line 58, in main model = build_module( File "/usr/local/lib/python3.10/site-packages/opensora/registry.py", line 22, in build_module return builder.build(cfg) File "/usr/local/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build return self.build_func(cfg, *args, **kwargs, registry=self) File "/usr/local/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg obj = obj_cls(**args) # type: ignore File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 385, in STDiT_XL_2 model = STDiT(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs) File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 181, in __init__ [ File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 182, in <listcomp> STDiTBlock( File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 56, in __init__ self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel) File "/usr/local/lib/python3.10/site-packages/opensora/models/layers/blocks.py", line 37, in get_layernorm raise RuntimeError("FusedLayerNorm not available. Please install apex.") RuntimeError: FusedLayerNorm not available. Please install apex. 
[2024-03-19 04:51:57,268] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 12284) of binary: /usr/local/bin/python Traceback (most recent call last): File "/usr/local/bin/torchrun", line 8, in <module> sys.exit(main()) File "/usr/local/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper return f(*args, **kwargs) File "/usr/local/lib/python3.10/site-packages/torch/distributed/run.py", line 812, in main run(args) File "/usr/local/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run elastic_launch( File "/usr/local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in __call__ return launch_agent(self._config, self._entrypoint, list(args)) File "/usr/local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent raise ChildFailedError( torch.distributed.elastic.multiprocessing.errors.ChildFailedError: ============================================================ scripts/inference.py FAILED ------------------------------------------------------------ Failures: <NO_OTHER_FAILURES> ------------------------------------------------------------ Root Cause (first observed failure): [0]: time : 2024-03-19_04:51:57 host : openbayesalgo-t193qht565kb-main rank : 0 (local_rank: 0) exitcode : 1 (pid: 12284) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ============================================================你把配置文件里的enable_layernorm_kernel设成False,就不需要apex了
我把这个改成False之后还是出现了undefined symbol的报错。
你要不试试环境重新装一下,那个设成False后如果有装apex还是会去调用
确保环境里没有apex还是报这个错,不清楚是否是其他地方有版本冲突。cuda用的12.1,python是3.9.2,torch是2.2.1+cu121,系统是: Distributor ID: Debian Description: Debian GNU/Linux 11 (bullseye) Release: 11 Codename: bullseye
我也是同样的问题,同样的软件版本
Please try the latest version.
Try `pip uninstall apex`; that worked for me.
I tried that and got:
RuntimeError: FusedLayerNorm not available. Please install apex.Traceback (most recent call last): File "/usr/local/lib/python3.10/site-packages/opensora/models/layers/blocks.py", line 33, in get_layernorm from apex.normalization import FusedLayerNorm ModuleNotFoundError: No module named 'apex' During handling of the above exception, another exception occurred: Traceback (most recent call last): File "/output/Open-Sora/scripts/inference.py", line 112, in <module> main() File "/output/Open-Sora/scripts/inference.py", line 58, in main model = build_module( File "/usr/local/lib/python3.10/site-packages/opensora/registry.py", line 22, in build_module return builder.build(cfg) File "/usr/local/lib/python3.10/site-packages/mmengine/registry/registry.py", line 570, in build return self.build_func(cfg, *args, **kwargs, registry=self) File "/usr/local/lib/python3.10/site-packages/mmengine/registry/build_functions.py", line 121, in build_from_cfg obj = obj_cls(**args) # type: ignore File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 385, in STDiT_XL_2 model = STDiT(depth=28, hidden_size=1152, patch_size=(1, 2, 2), num_heads=16, **kwargs) File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 181, in __init__ [ File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 182, in <listcomp> STDiTBlock( File "/usr/local/lib/python3.10/site-packages/opensora/models/stdit/stdit.py", line 56, in __init__ self.norm1 = get_layernorm(hidden_size, eps=1e-6, affine=False, use_kernel=enable_layernorm_kernel) File "/usr/local/lib/python3.10/site-packages/opensora/models/layers/blocks.py", line 37, in get_layernorm raise RuntimeError("FusedLayerNorm not available. Please install apex.") RuntimeError: FusedLayerNorm not available. Please install apex. 
[2024-03-19 04:51:57,268] torch.distributed.elastic.multiprocessing.api: [ERROR] failed (exitcode: 1) local_rank: 0 (pid: 12284) of binary: /usr/local/bin/python Traceback (most recent call last): File "/usr/local/bin/torchrun", line 8, in <module> sys.exit(main()) File "/usr/local/lib/python3.10/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 347, in wrapper return f(*args, **kwargs) File "/usr/local/lib/python3.10/site-packages/torch/distributed/run.py", line 812, in main run(args) File "/usr/local/lib/python3.10/site-packages/torch/distributed/run.py", line 803, in run elastic_launch( File "/usr/local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 135, in __call__ return launch_agent(self._config, self._entrypoint, list(args)) File "/usr/local/lib/python3.10/site-packages/torch/distributed/launcher/api.py", line 268, in launch_agent raise ChildFailedError( torch.distributed.elastic.multiprocessing.errors.ChildFailedError: ============================================================ scripts/inference.py FAILED ------------------------------------------------------------ Failures: <NO_OTHER_FAILURES> ------------------------------------------------------------ Root Cause (first observed failure): [0]: time : 2024-03-19_04:51:57 host : openbayesalgo-t193qht565kb-main rank : 0 (local_rank: 0) exitcode : 1 (pid: 12284) error_file: <N/A> traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html ============================================================
Same error