fast-stable-diffusion
How to precompile xformers and save for other GPU?
I'm trying to run this on Paperspace Gradient to train DreamBooth, but I don't want to keep building xformers each time. Is there an easy way to compile and save that? I can even do it and make a pull request for others to use.
Thanks!
What type of platform and GPU?
Specifically, the P5000, RTX 5000, and A4000. Those are the only 16GB cards available for free on Paperspace Gradient. I'd also like to experiment with 16GB+ cards, but these are the priority.
Under Linux?
Yes, I believe so. So, I think I figured it out. Can I basically just do this to create the wheel?
I saw something on Reddit about adding "env" info, but I am not so clear on those parts.
git clone https://github.com/facebookresearch/xformers.git
cd xformers
git submodule update --init --recursive
pip install -r requirements.txt
pip install -e .
python setup.py sdist bdist_wheel
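For the "env" part, my guess is that it refers to build-time environment variables that xformers' setup.py reads; something like the sketch below (the arch numbers are for the cards above: 6.1 = P5000, 7.5 = RTX 5000, 8.6 = A4000 — treat this as a guess, not a confirmed recipe).

# Assumed build-time settings; TORCH_CUDA_ARCH_LIST and FORCE_CUDA are both read by xformers' setup.py.
# Note: the flash-attention extension only targets 7.5+, so the P5000 (6.1) won't get it.
export TORCH_CUDA_ARCH_LIST="6.1;7.5;8.6"
export FORCE_CUDA="1"   # build the CUDA extensions even if no GPU is visible at build time
python setup.py sdist bdist_wheel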
pip install git+https://github.com/facebookresearch/xformers@51dd119#egg=xformers
After around 40 minutes, once the installation is done, navigate to /usr/local/lib/python3.7/dist-packages/xformers.
Save the two files "_C_flashattention.so" and "_C.so", and put them back after installing xformers (without compiling).
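For example, something along these lines (the dist-packages path depends on the image's Python version, and /storage is just an example of a persistent folder on Gradient):

mkdir -p /storage/xformers-precompiled
cp /usr/local/lib/python3.7/dist-packages/xformers/_C.so \
   /usr/local/lib/python3.7/dist-packages/xformers/_C_flashattention.so \
   /storage/xformers-precompiled/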
Thanks! In your precompiled area you have .whl files. Am I easily able to create those?
I'm sorry, but I'm not sure how to install xformers without compiling... Is there a specific flag for pip?
To install without compiling: git clone xformers, then replace the setup.py with the one below, then pip install (path to the local xformers folder).
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates. All rights reserved.
#
# This source code is licensed under the BSD license found in the
# LICENSE file in the root directory of this source tree.

import distutils.command.clean
import glob
import os
import re
import shutil
import subprocess
import sys
from pathlib import Path

import setuptools
import torch
from torch.utils.cpp_extension import (
    CUDA_HOME,
    BuildExtension,
    CppExtension,
    CUDAExtension,
)

this_dir = os.path.dirname(os.path.abspath(__file__))


def fetch_requirements():
    with open("requirements.txt") as f:
        reqs = f.read().strip().split("\n")
    return reqs


# https://packaging.python.org/guides/single-sourcing-package-version/
def find_version(version_file_path):
    with open(version_file_path) as version_file:
        version_match = re.search(
            r"^__version__ = ['\"]([^'\"]*)['\"]", version_file.read(), re.M
        )
        # The following is used to build release packages.
        # Users should never use it.
        suffix = os.getenv("XFORMERS_VERSION_SUFFIX", "")
        if version_match:
            return version_match.group(1) + suffix
        raise RuntimeError("Unable to find version string.")


def get_cuda_version(cuda_dir) -> int:
    nvcc_bin = "nvcc" if cuda_dir is None else cuda_dir + "/bin/nvcc"
    raw_output = subprocess.check_output([nvcc_bin, "-V"], universal_newlines=True)
    output = raw_output.split()
    release_idx = output.index("release") + 1
    release = output[release_idx].split(".")
    bare_metal_major = int(release[0])
    bare_metal_minor = int(release[1][0])

    assert bare_metal_minor < 100
    return bare_metal_major * 100 + bare_metal_minor


def get_flash_attention_extensions(cuda_version: int, extra_compile_args):
    # Figure out default archs to target
    DEFAULT_ARCHS_LIST = ""
    if cuda_version > 1100:
        DEFAULT_ARCHS_LIST = "7.5;8.0;8.6"
    elif cuda_version == 1100:
        DEFAULT_ARCHS_LIST = "7.5;8.0"
    else:
        return []

    if os.getenv("XFORMERS_DISABLE_FLASH_ATTN", "0") != "0":
        return []

    archs_list = os.environ.get("TORCH_CUDA_ARCH_LIST", DEFAULT_ARCHS_LIST)
    nvcc_archs_flags = []
    for arch in archs_list.split(";"):
        assert len(arch) >= 3, f"Invalid sm version: {arch}"
        num = 10 * int(arch[0]) + int(arch[2])
        # Need at least 7.5
        if num < 75:
            continue
        nvcc_archs_flags.append(f"-gencode=arch=compute_{num},code=sm_{num}")
        if arch.endswith("+PTX"):
            nvcc_archs_flags.append(f"-gencode=arch=compute_{num},code=compute_{num}")
    if not nvcc_archs_flags:
        return []

    this_dir = os.path.dirname(os.path.abspath(__file__))
    flash_root = os.path.join(this_dir, "third_party", "flash-attention")
    if not os.path.exists(flash_root):
        raise RuntimeError(
            "flashattention submodule not found. Did you forget "
            "to run `git submodule update --init --recursive` ?"
        )

    return [
        CUDAExtension(
            name="xformers._C_flashattention",
            sources=[
                os.path.join(this_dir, "third_party", "flash-attention", path)
                for path in [
                    "csrc/flash_attn/fmha_api.cpp",
                    "csrc/flash_attn/src/fmha_fprop_fp16_kernel.sm80.cu",
                    "csrc/flash_attn/src/fmha_dgrad_fp16_kernel_loop.sm80.cu",
                    "csrc/flash_attn/src/fmha_block_fprop_fp16_kernel.sm80.cu",
                    "csrc/flash_attn/src/fmha_block_dgrad_fp16_kernel_loop.sm80.cu",
                ]
            ],
            extra_compile_args={
                **extra_compile_args,
                "nvcc": extra_compile_args.get("nvcc", [])
                + [
                    "-O3",
                    "-U__CUDA_NO_HALF_OPERATORS__",
                    "-U__CUDA_NO_HALF_CONVERSIONS__",
                    "--expt-relaxed-constexpr",
                    "--expt-extended-lambda",
                    "--use_fast_math",
                    "--ptxas-options=-v",
                    "-lineinfo",
                ]
                + nvcc_archs_flags,
            },
            include_dirs=[
                Path(flash_root) / "csrc" / "flash_attn",
                Path(flash_root) / "csrc" / "flash_attn" / "src",
                # Path(flash_root) / 'csrc' / 'flash_attn' / 'cutlass' / 'include',
                Path(this_dir) / "third_party" / "cutlass" / "include",
            ],
        )
    ]


def get_extensions():
    this_dir = os.path.dirname(os.path.abspath(__file__))
    extensions_dir = os.path.join(
        this_dir, "xformers", "components", "attention", "csrc"
    )

    main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))

    source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) + glob.glob(
        os.path.join(extensions_dir, "autograd", "*.cpp")
    )

    sources = main_file + source_cpu

    source_cuda = glob.glob(
        os.path.join(extensions_dir, "cuda", "**", "*.cu"), recursive=True
    )

    sputnik_dir = os.path.join(this_dir, "third_party", "sputnik")
    cutlass_dir = os.path.join(this_dir, "third_party", "cutlass", "include")
    if not os.path.exists(cutlass_dir):
        raise RuntimeError(
            "CUTLASS submodule not found. Did you forget "
            "to run `git submodule update --init --recursive` ?"
        )

    extension = CppExtension

    define_macros = []

    extra_compile_args = {"cxx": ["-O3"]}
    if sys.platform == "win32":
        define_macros += [("xformers_EXPORTS", None)]
        extra_compile_args["cxx"].append("/MP")
    elif "OpenMP not found" not in torch.__config__.parallel_info():
        extra_compile_args["cxx"].append("-fopenmp")

    include_dirs = [extensions_dir]
    ext_modules = []

    if (torch.cuda.is_available() and ((CUDA_HOME is not None))) or os.getenv(
        "FORCE_CUDA", "0"
    ) == "1":
        extension = CUDAExtension
        sources += source_cuda
        include_dirs += [sputnik_dir, cutlass_dir]
        nvcc_flags = os.getenv("NVCC_FLAGS", "")
        if nvcc_flags == "":
            nvcc_flags = ["--use_fast_math", "-DNDEBUG"]
        else:
            nvcc_flags = nvcc_flags.split(" ")
        cuda_version = get_cuda_version(CUDA_HOME)
        if cuda_version >= 1102:
            nvcc_flags += [
                "--threads",
                "4",
                "--ptxas-options=-v",
            ]
        extra_compile_args["nvcc"] = nvcc_flags

        ext_modules += get_flash_attention_extensions(
            cuda_version=cuda_version, extra_compile_args=extra_compile_args
        )

    sources = [os.path.join(extensions_dir, s) for s in sources]

    ext_modules.append(
        extension(
            "xformers._C",
            sorted(sources),
            include_dirs=include_dirs,
            define_macros=define_macros,
            extra_compile_args=extra_compile_args,
        )
    )

    return ext_modules


class clean(distutils.command.clean.clean):  # type: ignore
    def run(self):
        if os.path.exists(".gitignore"):
            with open(".gitignore", "r") as f:
                ignores = f.read()
                for wildcard in filter(None, ignores.split("\n")):
                    for filename in glob.glob(wildcard):
                        try:
                            os.remove(filename)
                        except OSError:
                            shutil.rmtree(filename, ignore_errors=True)

        # It's an old-style class in Python 2.7...
        distutils.command.clean.clean.run(self)


if __name__ == "__main__":
    setuptools.setup(
        name="xformers",
        description="XFormers: A collection of composable Transformer building blocks.",
        version=find_version(os.path.join(this_dir, "xformers", "__init__.py")),
        setup_requires=[],
        install_requires=fetch_requirements(),
        packages=setuptools.find_packages(exclude=("tests", "tests.*")),
        url="https://facebookresearch.github.io/xformers/",
        python_requires=">=3.6",
        author="Facebook AI Research",
        author_email="[email protected]",
        long_description="XFormers: A collection of composable Transformer building blocks."
        + "XFormers aims at being able to reproduce most architectures in the Transformer-family SOTA,"
        + "defined as compatible and combined building blocks as opposed to monolithic models",
        long_description_content_type="text/markdown",
        classifiers=[
            "Programming Language :: Python :: 3.7",
            "Programming Language :: Python :: 3.8",
            "Programming Language :: Python :: 3.9",
            "License :: OSI Approved :: BSD License",
            "Topic :: Scientific/Engineering :: Artificial Intelligence",
            "Operating System :: OS Independent",
        ],
        zip_safe=False,
    )
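Roughly, the whole install-without-compiling flow then looks something like this (a sketch with example paths; "modified_setup.py" stands for the file above, and /storage/xformers-precompiled for wherever you saved the compiled files):

# Clone and swap in the setup.py above -- it declares no CUDA extensions, so nothing gets compiled.
git clone https://github.com/facebookresearch/xformers
cp /path/to/modified_setup.py xformers/setup.py
pip install ./xformers

# Put the previously compiled files back into the installed package
# (the dist-packages path depends on the image's Python version).
cp /storage/xformers-precompiled/_C.so \
   /storage/xformers-precompiled/_C_flashattention.so \
   /usr/local/lib/python3.7/dist-packages/xformers/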
Thank you for that! Is this simpler than building a .whl file myself?
You can build a .whl file; I'll post the instructions later.
Okay cool. Once I build any wheels, I'll upload them for others to test/use.
@TheLastBen May I ask, is there any more information about building .whl files? I also have an unsupported GPU, a Tesla K80.
If you simply want to install xformers, use this:
!pip install xformers
However, if you want to build a wheel, you can generally try to follow these steps. I'm sure there is technically a better way, but it worked for me.
!pip install --upgrade setuptools
!git clone https://github.com/facebookresearch/xformers
%cd xformers
!git submodule update --init --recursive
!pip install -r requirements.txt
!python setup.py sdist bdist_wheel --universal
Once it's done, you'll find your .whl file in the /xformers/dist/ folder. In order to use it, you just write:
!pip install <path-to-whl>
Hope this helps!
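If you want to sanity-check the install afterwards, one rough way is to try importing the compiled modules directly (_C and _C_flashattention are the extension names from xformers' setup.py; the flash-attention one is only built for 7.5+ cards, so don't be surprised if it's missing on older GPUs):

!python -c "import xformers; print(xformers.__version__)"
!python -c "from xformers import _C"
!python -c "from xformers import _C_flashattention"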
Copy the compiled *.so files into the "xformers" folder, then run python setup.py bdist_wheel
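i.e. roughly this (example paths, assuming the repo is already cloned and the compiled files were saved somewhere persistent):

cd xformers
cp /storage/xformers-precompiled/_C.so \
   /storage/xformers-precompiled/_C_flashattention.so \
   xformers/                    # the inner "xformers" package folder
python setup.py bdist_wheel
ls dist/                        # the built wheel lands here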
Let me make sure I understand:
- Unzip the file you shared.
- Copy the two *.so files into the xformers subfolder.
- Then, run
python setup.py bdist_wheel
So far, my method worked and I'm training as I type this, but I'll go ahead and do it your way. I already forked this, so I'll do a pull request when I'm ready. Also, I'm working on a notebook for Paperspace if you'd like to add it to the repo.
if the wheel includes the compiled files, you don't need my method
Thank you both. I was trying to migrate the notebook to Azure ML.
I used @swcrazyfan's method and got a small .whl (<1MB).
Then I tried pip install git+https://github.com/facebookresearch/xformers@51dd119#egg=xformers, but after the installation completed, the process exited and I couldn't get any .so files.
BTW, Azure ML looks very different from Colab, so I'm considering giving up; just providing feedback here. No need for a solution. Thanks all.
I had the same problem when I tried to use RunPod. However, it works perfectly on Paperspace Gradient. Honestly, I'm not sure what the problem is.
Did everything work after you ran the regular pip install above? You just didn't get the wheel?
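One thing you could check is whether the compiled files actually ended up next to the installed package; something like this (just a rough convenience check, not an official test):

!pip show xformers   # the "Location:" line shows where pip put it
!ls "$(python -c 'import xformers, os; print(os.path.dirname(xformers.__file__))')"   # _C.so / _C_flashattention.so should appear here if they were built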
@swcrazyfan Using your method (python setup.py sdist bdist_wheel --universal), I got a wheel. However, due to other issues, I can't run the rest of the notebook, so I can't verify whether the .whl works properly.
To be honest, I'm not knowledgeable enough to say why this didn't work. It worked perfectly for me on one GPU/computer, but I also had issues on other machines. My suggestion is to try TheLastBen's advice. It's a bit more complicated, but it might work better.
Is there a precompiled wheel for the 3080 Ti on Windows? I'm having issues compiling one (I have VS 2022 and tried the CUDA 11.6 and 11.8 toolkits).