candle
candle copied to clipboard
Flash Attention not working on CUDA 12.1
I'm trying to use Flash Attention in an environment with CUDA 12.1, but it fails to compile. Is this expected?
Reproducing:
- Start a Docker container with CUDA version 12.1.1:
  docker run -it --gpus all nvidia/cuda:12.1.1-devel-ubuntu20.04 /bin/bash
- Install dependencies:
  curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
- Run an example with the "flash-attn" feature:
  cargo run --example llama --release --features "cuda flash-attn" -- --prompt "hi"
Here's the full error log:
Compiling candle-flash-attn v0.4.2 (/root/maritaca/testes/candle/candle-original/candle-flash-attn)
error: failed to run custom build command for candle-flash-attn v0.4.2 (/root/maritaca/testes/candle/candle-original/candle-flash-attn)
Caused by:
process didn't exit successfully: /root/maritaca/testes/candle/candle-original/target/release/build/candle-flash-attn-5a27d196c8d06b8f/build-script-build (exit status: 101)
--- stdout
cargo:rerun-if-changed=build.rs
cargo:rerun-if-changed=kernels/flash_api.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim128_fp16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim160_fp16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim192_fp16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim224_fp16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim256_fp16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim32_fp16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim64_fp16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim96_fp16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim128_bf16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim160_bf16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim192_bf16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim224_bf16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim256_bf16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim32_bf16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim64_bf16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim96_bf16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_kernel.h
cargo:rerun-if-changed=kernels/flash_fwd_launch_template.h
cargo:rerun-if-changed=kernels/flash.h
cargo:rerun-if-changed=kernels/philox.cuh
cargo:rerun-if-changed=kernels/softmax.h
cargo:rerun-if-changed=kernels/utils.h
cargo:rerun-if-changed=kernels/kernel_traits.h
cargo:rerun-if-changed=kernels/block_info.h
cargo:rerun-if-changed=kernels/static_switch.h
cargo:info=["/usr", "/usr/local/cuda", "/opt/cuda", "/usr/lib/cuda", "C:/Program Files/NVIDIA GPU Computing Toolkit", "C:/CUDA"]
cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP
cargo:rustc-env=CUDA_COMPUTE_CAP=80
--- stderr
#$ NVVM_BRANCH=nvvm
#$ SPACE=
#$ NVVM_BRANCH=nvvm
#$ CUDART=cudart
#$ SPACE=
#$ HERE=/usr/local/cuda/bin
#$ CUDART=cudart
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ TARGET_DIR=targets/x86_64-linux
#$ TARGET_DIR=targets/x86_64-linux
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ TOP=/usr/local/cuda/bin/..
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim192_fp16_sm80.cu" -o "/tmp/tmpxft_000005a8_00000000-5_flash_fwd_hdim192_fp16_sm80.cpp4.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_api.cu" -o "/tmp/tmpxft_00000599_00000000-5_flash_api.cpp4.ii"
#$ NVVM_BRANCH=nvvm
#$ SPACE=
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ TARGET_DIR=targets/x86_64-linux
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ NVVM_BRANCH=nvvm
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ SPACE=
#$ CUDAFE_FLAGS=
#$ CUDART=cudart
#$ PTXAS_FLAGS=
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ NVVM_BRANCH=nvvm
#$ TARGET_DIR=
#$ SPACE=
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_DIR=targets/x86_64-linux
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ TARGET_DIR=targets/x86_64-linux
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim224_fp16_sm80.cu" -o "/tmp/tmpxft_0000059a_00000000-5_flash_fwd_hdim224_fp16_sm80.cpp4.ii"
#$ TOP=/usr/local/cuda/bin/..
#$ TOP=/usr/local/cuda/bin/..
#$ NVVM_BRANCH=nvvm
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ SPACE=
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ CUDART=cudart
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ HERE=/usr/local/cuda/bin
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ THERE=/usr/local/cuda/bin
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ TARGET_SIZE=
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ TARGET_DIR=
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ NVVM_BRANCH=nvvm
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ SPACE=
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDART=cudart
#$ TARGET_DIR=targets/x86_64-linux
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ TARGET_DIR=targets/x86_64-linux
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim96_bf16_sm80.cu" -o "/tmp/tmpxft_000005a9_00000000-5_flash_fwd_hdim96_bf16_sm80.cpp4.ii"
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim64_fp16_sm80.cu" -o "/tmp/tmpxft_0000059e_00000000-5_flash_fwd_hdim64_fp16_sm80.cpp4.ii"
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim224_bf16_sm80.cu" -o "/tmp/tmpxft_0000059b_00000000-5_flash_fwd_hdim224_bf16_sm80.cpp4.ii"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim128_fp16_sm80.cu" -o "/tmp/tmpxft_0000059c_00000000-5_flash_fwd_hdim128_fp16_sm80.cpp4.ii"
#$ NVVM_BRANCH=nvvm
#$ SPACE=
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ TARGET_DIR=targets/x86_64-linux
#$ NVVM_BRANCH=nvvm
#$ NVVM_BRANCH=nvvm
#$ SPACE=
#$ CUDART=cudart
#$ SPACE=
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ HERE=/usr/local/cuda/bin
#$ TOP=/usr/local/cuda/bin/..
#$ TARGET_DIR=
#$ THERE=/usr/local/cuda/bin
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ NVVM_BRANCH=nvvm
#$ TARGET_DIR=targets/x86_64-linux
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ SPACE=
#$ TARGET_DIR=targets/x86_64-linux
#$ NVVM_BRANCH=nvvm
#$ CUDAFE_FLAGS=
#$ CUDART=cudart
#$ PTXAS_FLAGS=
#$ HERE=/usr/local/cuda/bin
#$ SPACE=
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ CUDART=cudart
#$ TARGET_DIR=
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ TARGET_DIR=targets/x86_64-linux
#$ TOP=/usr/local/cuda/bin/..
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ TARGET_DIR=targets/x86_64-linux
#$ TOP=/usr/local/cuda/bin/..
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim32_fp16_sm80.cu" -o "/tmp/tmpxft_000005a3_00000000-5_flash_fwd_hdim32_fp16_sm80.cpp4.ii"
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ CUDAFE_FLAGS=
#$ CUDAFE_FLAGS=
#$ TOP=/usr/local/cuda/bin/..
#$ PTXAS_FLAGS=
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ PTXAS_FLAGS=
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim256_bf16_sm80.cu" -o "/tmp/tmpxft_000005a7_00000000-5_flash_fwd_hdim256_bf16_sm80.cpp4.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim128_bf16_sm80.cu" -o "/tmp/tmpxft_000005a1_00000000-5_flash_fwd_hdim128_bf16_sm80.cpp4.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim256_fp16_sm80.cu" -o "/tmp/tmpxft_000005a5_00000000-5_flash_fwd_hdim256_fp16_sm80.cpp4.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim160_bf16_sm80.cu" -o "/tmp/tmpxft_0000059f_00000000-5_flash_fwd_hdim160_bf16_sm80.cpp4.ii"
#$ NVVM_BRANCH=nvvm
#$ SPACE=
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ TARGET_DIR=targets/x86_64-linux
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ NVVM_BRANCH=nvvm
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ SPACE=
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ TARGET_SIZE=
#$ CUDAFE_FLAGS=
#$ TARGET_DIR=
#$ PTXAS_FLAGS=
#$ NVVM_BRANCH=nvvm
#$ SPACE=
#$ CUDART=cudart
#$ TARGET_DIR=targets/x86_64-linux
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ TARGET_DIR=targets/x86_64-linux
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim160_fp16_sm80.cu" -o "/tmp/tmpxft_000005a0_00000000-5_flash_fwd_hdim160_fp16_sm80.cpp4.ii"
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ TOP=/usr/local/cuda/bin/..
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ CUDAFE_FLAGS=
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ PTXAS_FLAGS=
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim96_fp16_sm80.cu" -o "/tmp/tmpxft_0000059d_00000000-5_flash_fwd_hdim96_fp16_sm80.cpp4.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim64_bf16_sm80.cu" -o "/tmp/tmpxft_000005a6_00000000-5_flash_fwd_hdim64_bf16_sm80.cpp4.ii"
#$ NVVM_BRANCH=nvvm
#$ SPACE=
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ TARGET_DIR=targets/x86_64-linux
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim192_bf16_sm80.cu" -o "/tmp/tmpxft_000005a2_00000000-5_flash_fwd_hdim192_bf16_sm80.cpp4.ii"
#$ NVVM_BRANCH=nvvm
#$ SPACE=
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ TARGET_DIR=targets/x86_64-linux
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim32_bf16_sm80.cu" -o "/tmp/tmpxft_000005a4_00000000-5_flash_fwd_hdim32_bf16_sm80.cpp4.ii"
In file included from kernels/flash_fwd_launch_template.h:9,
from kernels/flash_fwd_hdim128_bf16_sm80.cu:5:
kernels/flash_fwd_kernel.h:7:10: fatal error: cute/algorithm/copy.hpp: No such file or directory
7 | #include <cute/algorithm/copy.hpp>
| ^~~~~~~~~~~~~~~~~~~~~~~~~
compilation terminated.
--error 0x1 --
thread '
stdout
stderr
note: run with RUST_BACKTRACE=1 environment variable to display a backtrace
In file included from kernels/flash_fwd_launch_template.h:9,
from kernels/flash_fwd_hdim32_bf16_sm80.cu:5:
kernels/flash_fwd_kernel.h:7:10: fatal error: cute/algorithm/copy.hpp: No such file or directory
7 | #include <cute/algorithm/copy.hpp>
| ^~~~~~~~~~~~~~~~~~~~~~~~~
compilation terminated.
In file included from kernels/flash_fwd_launch_template.h:9,
from kernels/flash_fwd_hdim64_fp16_sm80.cu:5:
kernels/flash_fwd_kernel.h:7:10: fatal error: cute/algorithm/copy.hpp: No such file or directory
7 | #include <cute/algorithm/copy.hpp>
| ^~~~~~~~~~~~~~~~~~~~~~~~~
compilation terminated.
In file included from kernels/flash_fwd_launch_template.h:9,
from kernels/flash_api.cu:1:
kernels/flash_fwd_kernel.h:7:10: fatal error: cute/algorithm/copy.hpp: No such file or directory
7 | #include <cute/algorithm/copy.hpp>
| ^~~~~~~~~~~~~~~~~~~~~~~~~
compilation terminated.
--error 0x1 --
thread '
stdout
stderr
--error 0x1 --
--error 0x1 --
thread '
stdout
stderr
thread '
stdout
stderr
In file included from kernels/flash_fwd_launch_template.h:9, from kernels/flash_fwd_hdim224_fp16_sm80.cu:5: kernels/flash_fwd_kernel.h:7:10: fatal error: cute/algorithm/copy.hpp: No such file or directory 7 | #include <cute/algorithm/copy.hpp> | ^~~~~~~~~~~~~~~~~~~~~~~~~ compilation terminated. In file included from kernels/flash_fwd_launch_template.h:9, from kernels/flash_fwd_hdim128_fp16_sm80.cu:5: kernels/flash_fwd_kernel.h:7:10: fatal error: cute/algorithm/copy.hpp: No such file or directory 7 | #include <cute/algorithm/copy.hpp> | ^~~~~~~~~~~~~~~~~~~~~~~~~ compilation terminated. In file included from kernels/flash_fwd_launch_template.h:9, from kernels/flash_fwd_hdim96_bf16_sm80.cu:5: kernels/flash_fwd_kernel.h:7:10: fatal error: cute/algorithm/copy.hpp: No such file or directory 7 | #include <cute/algorithm/copy.hpp> | ^~~~~~~~~~~~~~~~~~~~~~~~~ compilation terminated.
--error 0x1 --
In file included from kernels/flash_fwd_launch_template.h:9,
from kernels/flash_fwd_hdim192_fp16_sm80.cu:5:
kernels/flash_fwd_kernel.h:7:10: fatal error: cute/algorithm/copy.hpp: No such file or directory
7 | #include <cute/algorithm/copy.hpp>
| ^~~~~~~~~~~~~~~~~~~~~~~~~
compilation terminated.
thread '
stdout
stderr
--error 0x1 --
In file included from kernels/flash_fwd_launch_template.h:9,
from kernels/flash_fwd_hdim160_bf16_sm80.cu:5:
kernels/flash_fwd_kernel.h:7:10: fatal error: cute/algorithm/copy.hpp: No such file or directory
7 | #include <cute/algorithm/copy.hpp>
| ^~~~~~~~~~~~~~~~~~~~~~~~~
compilation terminated.
thread '
stdout
stderr
--error 0x1 --
thread '
stdout
stderr
--error 0x1 --
In file included from kernels/flash_fwd_launch_template.h:9,
from kernels/flash_fwd_hdim256_fp16_sm80.cu:5:
kernels/flash_fwd_kernel.h:7:10: fatal error: cute/algorithm/copy.hpp: No such file or directory
7 | #include <cute/algorithm/copy.hpp>
| ^~~~~~~~~~~~~~~~~~~~~~~~~
compilation terminated.
In file included from kernels/flash_fwd_launch_template.h:9,
from kernels/flash_fwd_hdim32_fp16_sm80.cu:5:
kernels/flash_fwd_kernel.h:7:10: fatal error: cute/algorithm/copy.hpp: No such file or directory
7 | #include <cute/algorithm/copy.hpp>
| ^~~~~~~~~~~~~~~~~~~~~~~~~
compilation terminated.
thread '
stdout
stderr
--error 0x1 --
--error 0x1 --
thread '
stdout
stderr
--error 0x1 --
thread '
stdout
stderr
In file included from kernels/flash_fwd_launch_template.h:9,
from kernels/flash_fwd_hdim192_bf16_sm80.cu:5:
kernels/flash_fwd_kernel.h:7:10: fatal error: cute/algorithm/copy.hpp: No such file or directory
7 | #include <cute/algorithm/copy.hpp>
| ^~~~~~~~~~~~~~~~~~~~~~~~~
compilation terminated.
thread '
stdout
stderr
In file included from kernels/flash_fwd_launch_template.h:9, from kernels/flash_fwd_hdim96_fp16_sm80.cu:5: kernels/flash_fwd_kernel.h:7:10: fatal error: cute/algorithm/copy.hpp: No such file or directory 7 | #include <cute/algorithm/copy.hpp> | ^~~~~~~~~~~~~~~~~~~~~~~~~ compilation terminated. In file included from kernels/flash_fwd_launch_template.h:9, from kernels/flash_fwd_hdim224_bf16_sm80.cu:5: kernels/flash_fwd_kernel.h:7:10: fatal error: cute/algorithm/copy.hpp: No such file or directory 7 | #include <cute/algorithm/copy.hpp> | ^~~~~~~~~~~~~~~~~~~~~~~~~ compilation terminated. In file included from kernels/flash_fwd_launch_template.h:9, from kernels/flash_fwd_hdim256_bf16_sm80.cu:5: kernels/flash_fwd_kernel.h:7:10: fatal error: cute/algorithm/copy.hpp: No such file or directory 7 | #include <cute/algorithm/copy.hpp> | ^~~~~~~~~~~~~~~~~~~~~~~~~ compilation terminated.
--error 0x1 --
thread '
stdout
stderr
--error 0x1 --
In file included from kernels/flash_fwd_launch_template.h:9,
from kernels/flash_fwd_hdim64_bf16_sm80.cu:5:
kernels/flash_fwd_kernel.h:7:10: fatal error: cute/algorithm/copy.hpp: No such file or directory
7 | #include <cute/algorithm/copy.hpp>
| ^~~~~~~~~~~~~~~~~~~~~~~~~
compilation terminated.
thread '
stdout
stderr
In file included from kernels/flash_fwd_launch_template.h:9, from kernels/flash_fwd_hdim160_fp16_sm80.cu:5: kernels/flash_fwd_kernel.h:7:10: fatal error: cute/algorithm/copy.hpp: No such file or directory 7 | #include <cute/algorithm/copy.hpp> | ^~~~~~~~~~~~~~~~~~~~~~~~~ compilation terminated.
--error 0x1 --
--error 0x1 --
thread '
stdout
stderr
thread '
stdout
stderr
--error 0x1 --
thread '
stdout
stderr
--error 0x1 --
thread '
stdout
stderr
</details>
Glancing quickly at the error message, I see several `kernels/flash_fwd_kernel.h:7:10: fatal error: cute/algorithm/copy.hpp: No such file or directory` errors. Just to be sure, could you check that the cutlass submodule is properly checked out?
The cutlass submodule was indeed missing (so I ran `git submodule update --init --recursive`), but the problem persists:
Caused by:
process didn't exit successfully: /root/maritaca/testes/candle/candle-original/target/release/build/candle-flash-attn-5a27d196c8d06b8f/build-script-build (exit status: 101)
--- stdout
cargo:rerun-if-changed=build.rs
cargo:rerun-if-changed=kernels/flash_api.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim128_fp16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim160_fp16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim192_fp16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim224_fp16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim256_fp16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim32_fp16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim64_fp16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim96_fp16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim128_bf16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim160_bf16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim192_bf16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim224_bf16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim256_bf16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim32_bf16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim64_bf16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_hdim96_bf16_sm80.cu
cargo:rerun-if-changed=kernels/flash_fwd_kernel.h
cargo:rerun-if-changed=kernels/flash_fwd_launch_template.h
cargo:rerun-if-changed=kernels/flash.h
cargo:rerun-if-changed=kernels/philox.cuh
cargo:rerun-if-changed=kernels/softmax.h
cargo:rerun-if-changed=kernels/utils.h
cargo:rerun-if-changed=kernels/kernel_traits.h
cargo:rerun-if-changed=kernels/block_info.h
cargo:rerun-if-changed=kernels/static_switch.h
cargo:info=["/usr", "/usr/local/cuda", "/opt/cuda", "/usr/lib/cuda", "C:/Program Files/NVIDIA GPU Computing Toolkit", "C:/CUDA"]
cargo:rerun-if-env-changed=CUDA_COMPUTE_CAP
cargo:rustc-env=CUDA_COMPUTE_CAP=80
--- stderr
#$ NVVM_BRANCH=nvvm
#$ SPACE=
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ TARGET_DIR=targets/x86_64-linux
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim128_bf16_sm80.cu" -o "/tmp/tmpxft_00000a28_00000000-5_flash_fwd_hdim128_bf16_sm80.cpp4.ii"
#$ NVVM_BRANCH=nvvm
#$ SPACE=
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ TARGET_DIR=targets/x86_64-linux
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim224_bf16_sm80.cu" -o "/tmp/tmpxft_00000a30_00000000-5_flash_fwd_hdim224_bf16_sm80.cpp4.ii"
#$ NVVM_BRANCH=nvvm
#$ SPACE=
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ NVVM_BRANCH=nvvm
#$ THERE=/usr/local/cuda/bin
#$ NVVM_BRANCH=nvvm
#$ TARGET_SIZE=
#$ SPACE=
#$ TARGET_DIR=
#$ SPACE=
#$ CUDART=cudart
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=targets/x86_64-linux
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ TARGET_DIR=
#$ TARGET_DIR=targets/x86_64-linux
#$ TARGET_DIR=targets/x86_64-linux
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ TOP=/usr/local/cuda/bin/..
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ TOP=/usr/local/cuda/bin/..
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim224_fp16_sm80.cu" -o "/tmp/tmpxft_00000a26_00000000-5_flash_fwd_hdim224_fp16_sm80.cpp4.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim96_bf16_sm80.cu" -o "/tmp/tmpxft_00000a34_00000000-5_flash_fwd_hdim96_bf16_sm80.cpp4.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim128_fp16_sm80.cu" -o "/tmp/tmpxft_00000a27_00000000-5_flash_fwd_hdim128_fp16_sm80.cpp4.ii"
#$ NVVM_BRANCH=nvvm
#$ NVVM_BRANCH=nvvm
#$ SPACE=
#$ SPACE=
#$ CUDART=cudart
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_SIZE=
#$ NVVM_BRANCH=nvvm
#$ TARGET_DIR=
#$ TARGET_DIR=
#$ SPACE=
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_DIR=targets/x86_64-linux
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ TARGET_DIR=targets/x86_64-linux
#$ TARGET_DIR=targets/x86_64-linux
#$ TOP=/usr/local/cuda/bin/..
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ TOP=/usr/local/cuda/bin/..
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ PTXAS_FLAGS=
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim192_fp16_sm80.cu" -o "/tmp/tmpxft_00000a2a_00000000-5_flash_fwd_hdim192_fp16_sm80.cpp4.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim256_fp16_sm80.cu" -o "/tmp/tmpxft_00000a2e_00000000-5_flash_fwd_hdim256_fp16_sm80.cpp4.ii"
#$ NVVM_BRANCH=nvvm
#$ SPACE=
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim192_bf16_sm80.cu" -o "/tmp/tmpxft_00000a2b_00000000-5_flash_fwd_hdim192_bf16_sm80.cpp4.ii"
#$ TARGET_DIR=targets/x86_64-linux
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ NVVM_BRANCH=nvvm
#$ SPACE=
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim64_bf16_sm80.cu" -o "/tmp/tmpxft_00000a33_00000000-5_flash_fwd_hdim64_bf16_sm80.cpp4.ii"
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ TARGET_DIR=targets/x86_64-linux
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim160_bf16_sm80.cu" -o "/tmp/tmpxft_00000a31_00000000-5_flash_fwd_hdim160_bf16_sm80.cpp4.ii"
#$ NVVM_BRANCH=nvvm
#$ SPACE=
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ TARGET_DIR=targets/x86_64-linux
#$ NVVM_BRANCH=nvvm
#$ TOP=/usr/local/cuda/bin/..
#$ NVVM_BRANCH=nvvm
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ SPACE=
#$ SPACE=
#$ CUDART=cudart
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ HERE=/usr/local/cuda/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ TARGET_DIR=
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ TARGET_DIR=targets/x86_64-linux
#$ TARGET_DIR=targets/x86_64-linux
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim256_bf16_sm80.cu" -o "/tmp/tmpxft_00000a32_00000000-5_flash_fwd_hdim256_bf16_sm80.cpp4.ii"
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ NVVM_BRANCH=nvvm
#$ TOP=/usr/local/cuda/bin/..
#$ SPACE=
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_api.cu" -o "/tmp/tmpxft_00000a24_00000000-5_flash_api.cpp4.ii"
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ TARGET_DIR=targets/x86_64-linux
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim160_fp16_sm80.cu" -o "/tmp/tmpxft_00000a2f_00000000-5_flash_fwd_hdim160_fp16_sm80.cpp4.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim32_fp16_sm80.cu" -o "/tmp/tmpxft_00000a29_00000000-5_flash_fwd_hdim32_fp16_sm80.cpp4.ii"
#$ NVVM_BRANCH=nvvm
#$ SPACE=
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ TARGET_DIR=targets/x86_64-linux
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim96_fp16_sm80.cu" -o "/tmp/tmpxft_00000a25_00000000-5_flash_fwd_hdim96_fp16_sm80.cpp4.ii"
#$ NVVM_BRANCH=nvvm
#$ SPACE=
#$ CUDART=cudart
#$ HERE=/usr/local/cuda/bin
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ TARGET_DIR=targets/x86_64-linux
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ NVVM_BRANCH=nvvm
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ SPACE=
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ CUDART=cudart
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ HERE=/usr/local/cuda/bin
#$ PTXAS_FLAGS=
#$ THERE=/usr/local/cuda/bin
#$ TARGET_SIZE=
#$ TARGET_DIR=
#$ TARGET_DIR=targets/x86_64-linux
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim32_bf16_sm80.cu" -o "/tmp/tmpxft_00000a2c_00000000-5_flash_fwd_hdim32_bf16_sm80.cpp4.ii"
#$ TOP=/usr/local/cuda/bin/..
#$ NVVMIR_LIBRARY_DIR=/usr/local/cuda/bin/../nvvm/libdevice
#$ LD_LIBRARY_PATH=/usr/local/cuda/bin/../lib:/root/maritaca/testes/candle/candle-original/target/release/deps:/root/maritaca/testes/candle/candle-original/target/release:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib/rustlib/x86_64-unknown-linux-gnu/lib:/root/.rustup/toolchains/stable-x86_64-unknown-linux-gnu/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
#$ PATH=/usr/local/cuda/bin/../nvvm/bin:/usr/local/cuda/bin:/root/.cargo/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
#$ INCLUDES="-I/usr/local/cuda/bin/../targets/x86_64-linux/include"
#$ LIBRARIES= "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib/stubs" "-L/usr/local/cuda/bin/../targets/x86_64-linux/lib"
#$ CUDAFE_FLAGS=
#$ PTXAS_FLAGS=
#$ gcc -std=c++17 -D__CUDA_ARCH_LIST__=800 -E -x c++ -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim64_fp16_sm80.cu" -o "/tmp/tmpxft_00000a2d_00000000-5_flash_fwd_hdim64_fp16_sm80.cpp4.ii"
#$ cudafe++ --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim128_fp16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim128_fp16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr --m64 --parse_templates --gen_c_file_name "/tmp/tmpxft_00000a27_00000000-6_flash_fwd_hdim128_fp16_sm80.cudafe1.cpp" --stub_file_name "tmpxft_00000a27_00000000-6_flash_fwd_hdim128_fp16_sm80.cudafe1.stub.c" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00000a27_00000000-4_flash_fwd_hdim128_fp16_sm80.module_id" "/tmp/tmpxft_00000a27_00000000-5_flash_fwd_hdim128_fp16_sm80.cpp4.ii"
#$ cudafe++ --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim256_fp16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim256_fp16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr --m64 --parse_templates --gen_c_file_name "/tmp/tmpxft_00000a2e_00000000-6_flash_fwd_hdim256_fp16_sm80.cudafe1.cpp" --stub_file_name "tmpxft_00000a2e_00000000-6_flash_fwd_hdim256_fp16_sm80.cudafe1.stub.c" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00000a2e_00000000-4_flash_fwd_hdim256_fp16_sm80.module_id" "/tmp/tmpxft_00000a2e_00000000-5_flash_fwd_hdim256_fp16_sm80.cpp4.ii"
#$ cudafe++ --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim32_fp16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim32_fp16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr --m64 --parse_templates --gen_c_file_name "/tmp/tmpxft_00000a29_00000000-6_flash_fwd_hdim32_fp16_sm80.cudafe1.cpp" --stub_file_name "tmpxft_00000a29_00000000-6_flash_fwd_hdim32_fp16_sm80.cudafe1.stub.c" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00000a29_00000000-4_flash_fwd_hdim32_fp16_sm80.module_id" "/tmp/tmpxft_00000a29_00000000-5_flash_fwd_hdim32_fp16_sm80.cpp4.ii"
#$ cudafe++ --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim96_bf16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim96_bf16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr --m64 --parse_templates --gen_c_file_name "/tmp/tmpxft_00000a34_00000000-6_flash_fwd_hdim96_bf16_sm80.cudafe1.cpp" --stub_file_name "tmpxft_00000a34_00000000-6_flash_fwd_hdim96_bf16_sm80.cudafe1.stub.c" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00000a34_00000000-4_flash_fwd_hdim96_bf16_sm80.module_id" "/tmp/tmpxft_00000a34_00000000-5_flash_fwd_hdim96_bf16_sm80.cpp4.ii"
#$ cudafe++ --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim224_bf16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim224_bf16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr --m64 --parse_templates --gen_c_file_name "/tmp/tmpxft_00000a30_00000000-6_flash_fwd_hdim224_bf16_sm80.cudafe1.cpp" --stub_file_name "tmpxft_00000a30_00000000-6_flash_fwd_hdim224_bf16_sm80.cudafe1.stub.c" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00000a30_00000000-4_flash_fwd_hdim224_bf16_sm80.module_id" "/tmp/tmpxft_00000a30_00000000-5_flash_fwd_hdim224_bf16_sm80.cpp4.ii"
#$ cudafe++ --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim128_bf16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim128_bf16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr --m64 --parse_templates --gen_c_file_name "/tmp/tmpxft_00000a28_00000000-6_flash_fwd_hdim128_bf16_sm80.cudafe1.cpp" --stub_file_name "tmpxft_00000a28_00000000-6_flash_fwd_hdim128_bf16_sm80.cudafe1.stub.c" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00000a28_00000000-4_flash_fwd_hdim128_bf16_sm80.module_id" "/tmp/tmpxft_00000a28_00000000-5_flash_fwd_hdim128_bf16_sm80.cpp4.ii"
#$ cudafe++ --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim256_bf16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim256_bf16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr --m64 --parse_templates --gen_c_file_name "/tmp/tmpxft_00000a32_00000000-6_flash_fwd_hdim256_bf16_sm80.cudafe1.cpp" --stub_file_name "tmpxft_00000a32_00000000-6_flash_fwd_hdim256_bf16_sm80.cudafe1.stub.c" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00000a32_00000000-4_flash_fwd_hdim256_bf16_sm80.module_id" "/tmp/tmpxft_00000a32_00000000-5_flash_fwd_hdim256_bf16_sm80.cpp4.ii"
#$ cudafe++ --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim192_bf16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim192_bf16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr --m64 --parse_templates --gen_c_file_name "/tmp/tmpxft_00000a2b_00000000-6_flash_fwd_hdim192_bf16_sm80.cudafe1.cpp" --stub_file_name "tmpxft_00000a2b_00000000-6_flash_fwd_hdim192_bf16_sm80.cudafe1.stub.c" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00000a2b_00000000-4_flash_fwd_hdim192_bf16_sm80.module_id" "/tmp/tmpxft_00000a2b_00000000-5_flash_fwd_hdim192_bf16_sm80.cpp4.ii"
#$ cudafe++ --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim32_bf16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim32_bf16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr --m64 --parse_templates --gen_c_file_name "/tmp/tmpxft_00000a2c_00000000-6_flash_fwd_hdim32_bf16_sm80.cudafe1.cpp" --stub_file_name "tmpxft_00000a2c_00000000-6_flash_fwd_hdim32_bf16_sm80.cudafe1.stub.c" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00000a2c_00000000-4_flash_fwd_hdim32_bf16_sm80.module_id" "/tmp/tmpxft_00000a2c_00000000-5_flash_fwd_hdim32_bf16_sm80.cpp4.ii"
#$ cudafe++ --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim96_fp16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim96_fp16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr --m64 --parse_templates --gen_c_file_name "/tmp/tmpxft_00000a25_00000000-6_flash_fwd_hdim96_fp16_sm80.cudafe1.cpp" --stub_file_name "tmpxft_00000a25_00000000-6_flash_fwd_hdim96_fp16_sm80.cudafe1.stub.c" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00000a25_00000000-4_flash_fwd_hdim96_fp16_sm80.module_id" "/tmp/tmpxft_00000a25_00000000-5_flash_fwd_hdim96_fp16_sm80.cpp4.ii"
#$ cudafe++ --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim160_fp16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim160_fp16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr --m64 --parse_templates --gen_c_file_name "/tmp/tmpxft_00000a2f_00000000-6_flash_fwd_hdim160_fp16_sm80.cudafe1.cpp" --stub_file_name "tmpxft_00000a2f_00000000-6_flash_fwd_hdim160_fp16_sm80.cudafe1.stub.c" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00000a2f_00000000-4_flash_fwd_hdim160_fp16_sm80.module_id" "/tmp/tmpxft_00000a2f_00000000-5_flash_fwd_hdim160_fp16_sm80.cpp4.ii"
#$ cudafe++ --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim224_fp16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim224_fp16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr --m64 --parse_templates --gen_c_file_name "/tmp/tmpxft_00000a26_00000000-6_flash_fwd_hdim224_fp16_sm80.cudafe1.cpp" --stub_file_name "tmpxft_00000a26_00000000-6_flash_fwd_hdim224_fp16_sm80.cudafe1.stub.c" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00000a26_00000000-4_flash_fwd_hdim224_fp16_sm80.module_id" "/tmp/tmpxft_00000a26_00000000-5_flash_fwd_hdim224_fp16_sm80.cpp4.ii"
#$ cudafe++ --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim192_fp16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim192_fp16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr --m64 --parse_templates --gen_c_file_name "/tmp/tmpxft_00000a2a_00000000-6_flash_fwd_hdim192_fp16_sm80.cudafe1.cpp" --stub_file_name "tmpxft_00000a2a_00000000-6_flash_fwd_hdim192_fp16_sm80.cudafe1.stub.c" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00000a2a_00000000-4_flash_fwd_hdim192_fp16_sm80.module_id" "/tmp/tmpxft_00000a2a_00000000-5_flash_fwd_hdim192_fp16_sm80.cpp4.ii"
#$ cudafe++ --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim64_bf16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim64_bf16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr --m64 --parse_templates --gen_c_file_name "/tmp/tmpxft_00000a33_00000000-6_flash_fwd_hdim64_bf16_sm80.cudafe1.cpp" --stub_file_name "tmpxft_00000a33_00000000-6_flash_fwd_hdim64_bf16_sm80.cudafe1.stub.c" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00000a33_00000000-4_flash_fwd_hdim64_bf16_sm80.module_id" "/tmp/tmpxft_00000a33_00000000-5_flash_fwd_hdim64_bf16_sm80.cpp4.ii"
#$ cudafe++ --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_api.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_api.cu" --allow_managed --extended-lambda --relaxed_constexpr --m64 --parse_templates --gen_c_file_name "/tmp/tmpxft_00000a24_00000000-6_flash_api.cudafe1.cpp" --stub_file_name "tmpxft_00000a24_00000000-6_flash_api.cudafe1.stub.c" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00000a24_00000000-4_flash_api.module_id" "/tmp/tmpxft_00000a24_00000000-5_flash_api.cpp4.ii"
#$ cudafe++ --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim160_bf16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim160_bf16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr --m64 --parse_templates --gen_c_file_name "/tmp/tmpxft_00000a31_00000000-6_flash_fwd_hdim160_bf16_sm80.cudafe1.cpp" --stub_file_name "tmpxft_00000a31_00000000-6_flash_fwd_hdim160_bf16_sm80.cudafe1.stub.c" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00000a31_00000000-4_flash_fwd_hdim160_bf16_sm80.module_id" "/tmp/tmpxft_00000a31_00000000-5_flash_fwd_hdim160_bf16_sm80.cpp4.ii"
#$ cudafe++ --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim64_fp16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim64_fp16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr --m64 --parse_templates --gen_c_file_name "/tmp/tmpxft_00000a2d_00000000-6_flash_fwd_hdim64_fp16_sm80.cudafe1.cpp" --stub_file_name "tmpxft_00000a2d_00000000-6_flash_fwd_hdim64_fp16_sm80.cudafe1.stub.c" --gen_module_id_file --module_id_file_name "/tmp/tmpxft_00000a2d_00000000-4_flash_fwd_hdim64_fp16_sm80.module_id" "/tmp/tmpxft_00000a2d_00000000-5_flash_fwd_hdim64_fp16_sm80.cpp4.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim256_fp16_sm80.cu" -o "/tmp/tmpxft_00000a2e_00000000-7_flash_fwd_hdim256_fp16_sm80.cpp1.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim32_fp16_sm80.cu" -o "/tmp/tmpxft_00000a29_00000000-7_flash_fwd_hdim32_fp16_sm80.cpp1.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim256_bf16_sm80.cu" -o "/tmp/tmpxft_00000a32_00000000-7_flash_fwd_hdim256_bf16_sm80.cpp1.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim192_bf16_sm80.cu" -o "/tmp/tmpxft_00000a2b_00000000-7_flash_fwd_hdim192_bf16_sm80.cpp1.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim96_bf16_sm80.cu" -o "/tmp/tmpxft_00000a34_00000000-7_flash_fwd_hdim96_bf16_sm80.cpp1.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim224_bf16_sm80.cu" -o "/tmp/tmpxft_00000a30_00000000-7_flash_fwd_hdim224_bf16_sm80.cpp1.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim32_bf16_sm80.cu" -o "/tmp/tmpxft_00000a2c_00000000-7_flash_fwd_hdim32_bf16_sm80.cpp1.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim128_fp16_sm80.cu" -o "/tmp/tmpxft_00000a27_00000000-7_flash_fwd_hdim128_fp16_sm80.cpp1.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim96_fp16_sm80.cu" -o "/tmp/tmpxft_00000a25_00000000-7_flash_fwd_hdim96_fp16_sm80.cpp1.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim192_fp16_sm80.cu" -o "/tmp/tmpxft_00000a2a_00000000-7_flash_fwd_hdim192_fp16_sm80.cpp1.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim128_bf16_sm80.cu" -o "/tmp/tmpxft_00000a28_00000000-7_flash_fwd_hdim128_bf16_sm80.cpp1.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim224_fp16_sm80.cu" -o "/tmp/tmpxft_00000a26_00000000-7_flash_fwd_hdim224_fp16_sm80.cpp1.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_api.cu" -o "/tmp/tmpxft_00000a24_00000000-7_flash_api.cpp1.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim160_bf16_sm80.cu" -o "/tmp/tmpxft_00000a31_00000000-7_flash_fwd_hdim160_bf16_sm80.cpp1.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim64_fp16_sm80.cu" -o "/tmp/tmpxft_00000a2d_00000000-7_flash_fwd_hdim64_fp16_sm80.cpp1.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim160_fp16_sm80.cu" -o "/tmp/tmpxft_00000a2f_00000000-7_flash_fwd_hdim160_fp16_sm80.cpp1.ii"
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -E -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -D__CUDACC__ -D__NVCC__ -D__CUDACC_EXTENDED_LAMBDA__ -D__CUDACC_RELAXED_CONSTEXPR__ -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -U "CUDA_NO_HALF_OPERATORS" -U "CUDA_NO_HALF_CONVERSIONS" -U "CUDA_NO_HALF2_OPERATORS" -U "CUDA_NO_BFLOAT16_CONVERSIONS" -D__CUDACC_VER_MAJOR__=12 -D__CUDACC_VER_MINOR__=1 -D__CUDACC_VER_BUILD__=105 -D__CUDA_API_VER_MAJOR__=12 -D__CUDA_API_VER_MINOR__=1 -DCUDA_API_PER_THREAD_DEFAULT_STREAM=1 -D__NVCC_DIAG_PRAGMA_SUPPORT__=1 -include "cuda_runtime.h" -m64 "kernels/flash_fwd_hdim64_bf16_sm80.cu" -o "/tmp/tmpxft_00000a33_00000000-7_flash_fwd_hdim64_bf16_sm80.cpp1.ii"
#$ cicc --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim256_fp16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim256_fp16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr -arch compute_80 -m64 --no-version-ident -ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 -fast-math --gen_div_approx_ftz --include_file_name "tmpxft_00000a2e_00000000-3_flash_fwd_hdim256_fp16_sm80.fatbin.c" -tused --module_id_file_name "/tmp/tmpxft_00000a2e_00000000-4_flash_fwd_hdim256_fp16_sm80.module_id" --gen_c_file_name "/tmp/tmpxft_00000a2e_00000000-6_flash_fwd_hdim256_fp16_sm80.cudafe1.c" --stub_file_name "/tmp/tmpxft_00000a2e_00000000-6_flash_fwd_hdim256_fp16_sm80.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00000a2e_00000000-6_flash_fwd_hdim256_fp16_sm80.cudafe1.gpu" "/tmp/tmpxft_00000a2e_00000000-7_flash_fwd_hdim256_fp16_sm80.cpp1.ii" -o "/tmp/tmpxft_00000a2e_00000000-6_flash_fwd_hdim256_fp16_sm80.ptx"
#$ cicc --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim32_fp16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim32_fp16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr -arch compute_80 -m64 --no-version-ident -ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 -fast-math --gen_div_approx_ftz --include_file_name "tmpxft_00000a29_00000000-3_flash_fwd_hdim32_fp16_sm80.fatbin.c" -tused --module_id_file_name "/tmp/tmpxft_00000a29_00000000-4_flash_fwd_hdim32_fp16_sm80.module_id" --gen_c_file_name "/tmp/tmpxft_00000a29_00000000-6_flash_fwd_hdim32_fp16_sm80.cudafe1.c" --stub_file_name "/tmp/tmpxft_00000a29_00000000-6_flash_fwd_hdim32_fp16_sm80.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00000a29_00000000-6_flash_fwd_hdim32_fp16_sm80.cudafe1.gpu" "/tmp/tmpxft_00000a29_00000000-7_flash_fwd_hdim32_fp16_sm80.cpp1.ii" -o "/tmp/tmpxft_00000a29_00000000-6_flash_fwd_hdim32_fp16_sm80.ptx"
#$ cicc --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim192_bf16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim192_bf16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr -arch compute_80 -m64 --no-version-ident -ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 -fast-math --gen_div_approx_ftz --include_file_name "tmpxft_00000a2b_00000000-3_flash_fwd_hdim192_bf16_sm80.fatbin.c" -tused --module_id_file_name "/tmp/tmpxft_00000a2b_00000000-4_flash_fwd_hdim192_bf16_sm80.module_id" --gen_c_file_name "/tmp/tmpxft_00000a2b_00000000-6_flash_fwd_hdim192_bf16_sm80.cudafe1.c" --stub_file_name "/tmp/tmpxft_00000a2b_00000000-6_flash_fwd_hdim192_bf16_sm80.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00000a2b_00000000-6_flash_fwd_hdim192_bf16_sm80.cudafe1.gpu" "/tmp/tmpxft_00000a2b_00000000-7_flash_fwd_hdim192_bf16_sm80.cpp1.ii" -o "/tmp/tmpxft_00000a2b_00000000-6_flash_fwd_hdim192_bf16_sm80.ptx"
#$ cicc --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim256_bf16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim256_bf16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr -arch compute_80 -m64 --no-version-ident -ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 -fast-math --gen_div_approx_ftz --include_file_name "tmpxft_00000a32_00000000-3_flash_fwd_hdim256_bf16_sm80.fatbin.c" -tused --module_id_file_name "/tmp/tmpxft_00000a32_00000000-4_flash_fwd_hdim256_bf16_sm80.module_id" --gen_c_file_name "/tmp/tmpxft_00000a32_00000000-6_flash_fwd_hdim256_bf16_sm80.cudafe1.c" --stub_file_name "/tmp/tmpxft_00000a32_00000000-6_flash_fwd_hdim256_bf16_sm80.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00000a32_00000000-6_flash_fwd_hdim256_bf16_sm80.cudafe1.gpu" "/tmp/tmpxft_00000a32_00000000-7_flash_fwd_hdim256_bf16_sm80.cpp1.ii" -o "/tmp/tmpxft_00000a32_00000000-6_flash_fwd_hdim256_bf16_sm80.ptx"
#$ cicc --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim96_bf16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim96_bf16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr -arch compute_80 -m64 --no-version-ident -ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 -fast-math --gen_div_approx_ftz --include_file_name "tmpxft_00000a34_00000000-3_flash_fwd_hdim96_bf16_sm80.fatbin.c" -tused --module_id_file_name "/tmp/tmpxft_00000a34_00000000-4_flash_fwd_hdim96_bf16_sm80.module_id" --gen_c_file_name "/tmp/tmpxft_00000a34_00000000-6_flash_fwd_hdim96_bf16_sm80.cudafe1.c" --stub_file_name "/tmp/tmpxft_00000a34_00000000-6_flash_fwd_hdim96_bf16_sm80.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00000a34_00000000-6_flash_fwd_hdim96_bf16_sm80.cudafe1.gpu" "/tmp/tmpxft_00000a34_00000000-7_flash_fwd_hdim96_bf16_sm80.cpp1.ii" -o "/tmp/tmpxft_00000a34_00000000-6_flash_fwd_hdim96_bf16_sm80.ptx"
#$ cicc --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim32_bf16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim32_bf16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr -arch compute_80 -m64 --no-version-ident -ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 -fast-math --gen_div_approx_ftz --include_file_name "tmpxft_00000a2c_00000000-3_flash_fwd_hdim32_bf16_sm80.fatbin.c" -tused --module_id_file_name "/tmp/tmpxft_00000a2c_00000000-4_flash_fwd_hdim32_bf16_sm80.module_id" --gen_c_file_name "/tmp/tmpxft_00000a2c_00000000-6_flash_fwd_hdim32_bf16_sm80.cudafe1.c" --stub_file_name "/tmp/tmpxft_00000a2c_00000000-6_flash_fwd_hdim32_bf16_sm80.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00000a2c_00000000-6_flash_fwd_hdim32_bf16_sm80.cudafe1.gpu" "/tmp/tmpxft_00000a2c_00000000-7_flash_fwd_hdim32_bf16_sm80.cpp1.ii" -o "/tmp/tmpxft_00000a2c_00000000-6_flash_fwd_hdim32_bf16_sm80.ptx"
#$ cicc --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim128_fp16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim128_fp16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr -arch compute_80 -m64 --no-version-ident -ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 -fast-math --gen_div_approx_ftz --include_file_name "tmpxft_00000a27_00000000-3_flash_fwd_hdim128_fp16_sm80.fatbin.c" -tused --module_id_file_name "/tmp/tmpxft_00000a27_00000000-4_flash_fwd_hdim128_fp16_sm80.module_id" --gen_c_file_name "/tmp/tmpxft_00000a27_00000000-6_flash_fwd_hdim128_fp16_sm80.cudafe1.c" --stub_file_name "/tmp/tmpxft_00000a27_00000000-6_flash_fwd_hdim128_fp16_sm80.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00000a27_00000000-6_flash_fwd_hdim128_fp16_sm80.cudafe1.gpu" "/tmp/tmpxft_00000a27_00000000-7_flash_fwd_hdim128_fp16_sm80.cpp1.ii" -o "/tmp/tmpxft_00000a27_00000000-6_flash_fwd_hdim128_fp16_sm80.ptx"
#$ cicc --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim224_bf16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim224_bf16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr -arch compute_80 -m64 --no-version-ident -ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 -fast-math --gen_div_approx_ftz --include_file_name "tmpxft_00000a30_00000000-3_flash_fwd_hdim224_bf16_sm80.fatbin.c" -tused --module_id_file_name "/tmp/tmpxft_00000a30_00000000-4_flash_fwd_hdim224_bf16_sm80.module_id" --gen_c_file_name "/tmp/tmpxft_00000a30_00000000-6_flash_fwd_hdim224_bf16_sm80.cudafe1.c" --stub_file_name "/tmp/tmpxft_00000a30_00000000-6_flash_fwd_hdim224_bf16_sm80.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00000a30_00000000-6_flash_fwd_hdim224_bf16_sm80.cudafe1.gpu" "/tmp/tmpxft_00000a30_00000000-7_flash_fwd_hdim224_bf16_sm80.cpp1.ii" -o "/tmp/tmpxft_00000a30_00000000-6_flash_fwd_hdim224_bf16_sm80.ptx"
#$ cicc --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim192_fp16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim192_fp16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr -arch compute_80 -m64 --no-version-ident -ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 -fast-math --gen_div_approx_ftz --include_file_name "tmpxft_00000a2a_00000000-3_flash_fwd_hdim192_fp16_sm80.fatbin.c" -tused --module_id_file_name "/tmp/tmpxft_00000a2a_00000000-4_flash_fwd_hdim192_fp16_sm80.module_id" --gen_c_file_name "/tmp/tmpxft_00000a2a_00000000-6_flash_fwd_hdim192_fp16_sm80.cudafe1.c" --stub_file_name "/tmp/tmpxft_00000a2a_00000000-6_flash_fwd_hdim192_fp16_sm80.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00000a2a_00000000-6_flash_fwd_hdim192_fp16_sm80.cudafe1.gpu" "/tmp/tmpxft_00000a2a_00000000-7_flash_fwd_hdim192_fp16_sm80.cpp1.ii" -o "/tmp/tmpxft_00000a2a_00000000-6_flash_fwd_hdim192_fp16_sm80.ptx"
#$ cicc --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_api.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_api.cu" --allow_managed --extended-lambda --relaxed_constexpr -arch compute_80 -m64 --no-version-ident -ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 -fast-math --gen_div_approx_ftz --include_file_name "tmpxft_00000a24_00000000-3_flash_api.fatbin.c" -tused --module_id_file_name "/tmp/tmpxft_00000a24_00000000-4_flash_api.module_id" --gen_c_file_name "/tmp/tmpxft_00000a24_00000000-6_flash_api.cudafe1.c" --stub_file_name "/tmp/tmpxft_00000a24_00000000-6_flash_api.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00000a24_00000000-6_flash_api.cudafe1.gpu" "/tmp/tmpxft_00000a24_00000000-7_flash_api.cpp1.ii" -o "/tmp/tmpxft_00000a24_00000000-6_flash_api.ptx"
#$ cicc --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim128_bf16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim128_bf16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr -arch compute_80 -m64 --no-version-ident -ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 -fast-math --gen_div_approx_ftz --include_file_name "tmpxft_00000a28_00000000-3_flash_fwd_hdim128_bf16_sm80.fatbin.c" -tused --module_id_file_name "/tmp/tmpxft_00000a28_00000000-4_flash_fwd_hdim128_bf16_sm80.module_id" --gen_c_file_name "/tmp/tmpxft_00000a28_00000000-6_flash_fwd_hdim128_bf16_sm80.cudafe1.c" --stub_file_name "/tmp/tmpxft_00000a28_00000000-6_flash_fwd_hdim128_bf16_sm80.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00000a28_00000000-6_flash_fwd_hdim128_bf16_sm80.cudafe1.gpu" "/tmp/tmpxft_00000a28_00000000-7_flash_fwd_hdim128_bf16_sm80.cpp1.ii" -o "/tmp/tmpxft_00000a28_00000000-6_flash_fwd_hdim128_bf16_sm80.ptx"
#$ cicc --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim160_bf16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim160_bf16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr -arch compute_80 -m64 --no-version-ident -ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 -fast-math --gen_div_approx_ftz --include_file_name "tmpxft_00000a31_00000000-3_flash_fwd_hdim160_bf16_sm80.fatbin.c" -tused --module_id_file_name "/tmp/tmpxft_00000a31_00000000-4_flash_fwd_hdim160_bf16_sm80.module_id" --gen_c_file_name "/tmp/tmpxft_00000a31_00000000-6_flash_fwd_hdim160_bf16_sm80.cudafe1.c" --stub_file_name "/tmp/tmpxft_00000a31_00000000-6_flash_fwd_hdim160_bf16_sm80.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00000a31_00000000-6_flash_fwd_hdim160_bf16_sm80.cudafe1.gpu" "/tmp/tmpxft_00000a31_00000000-7_flash_fwd_hdim160_bf16_sm80.cpp1.ii" -o "/tmp/tmpxft_00000a31_00000000-6_flash_fwd_hdim160_bf16_sm80.ptx"
#$ cicc --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim96_fp16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim96_fp16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr -arch compute_80 -m64 --no-version-ident -ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 -fast-math --gen_div_approx_ftz --include_file_name "tmpxft_00000a25_00000000-3_flash_fwd_hdim96_fp16_sm80.fatbin.c" -tused --module_id_file_name "/tmp/tmpxft_00000a25_00000000-4_flash_fwd_hdim96_fp16_sm80.module_id" --gen_c_file_name "/tmp/tmpxft_00000a25_00000000-6_flash_fwd_hdim96_fp16_sm80.cudafe1.c" --stub_file_name "/tmp/tmpxft_00000a25_00000000-6_flash_fwd_hdim96_fp16_sm80.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00000a25_00000000-6_flash_fwd_hdim96_fp16_sm80.cudafe1.gpu" "/tmp/tmpxft_00000a25_00000000-7_flash_fwd_hdim96_fp16_sm80.cpp1.ii" -o "/tmp/tmpxft_00000a25_00000000-6_flash_fwd_hdim96_fp16_sm80.ptx"
#$ cicc --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim64_fp16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim64_fp16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr -arch compute_80 -m64 --no-version-ident -ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 -fast-math --gen_div_approx_ftz --include_file_name "tmpxft_00000a2d_00000000-3_flash_fwd_hdim64_fp16_sm80.fatbin.c" -tused --module_id_file_name "/tmp/tmpxft_00000a2d_00000000-4_flash_fwd_hdim64_fp16_sm80.module_id" --gen_c_file_name "/tmp/tmpxft_00000a2d_00000000-6_flash_fwd_hdim64_fp16_sm80.cudafe1.c" --stub_file_name "/tmp/tmpxft_00000a2d_00000000-6_flash_fwd_hdim64_fp16_sm80.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00000a2d_00000000-6_flash_fwd_hdim64_fp16_sm80.cudafe1.gpu" "/tmp/tmpxft_00000a2d_00000000-7_flash_fwd_hdim64_fp16_sm80.cpp1.ii" -o "/tmp/tmpxft_00000a2d_00000000-6_flash_fwd_hdim64_fp16_sm80.ptx"
#$ cicc --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim160_fp16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim160_fp16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr -arch compute_80 -m64 --no-version-ident -ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 -fast-math --gen_div_approx_ftz --include_file_name "tmpxft_00000a2f_00000000-3_flash_fwd_hdim160_fp16_sm80.fatbin.c" -tused --module_id_file_name "/tmp/tmpxft_00000a2f_00000000-4_flash_fwd_hdim160_fp16_sm80.module_id" --gen_c_file_name "/tmp/tmpxft_00000a2f_00000000-6_flash_fwd_hdim160_fp16_sm80.cudafe1.c" --stub_file_name "/tmp/tmpxft_00000a2f_00000000-6_flash_fwd_hdim160_fp16_sm80.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00000a2f_00000000-6_flash_fwd_hdim160_fp16_sm80.cudafe1.gpu" "/tmp/tmpxft_00000a2f_00000000-7_flash_fwd_hdim160_fp16_sm80.cpp1.ii" -o "/tmp/tmpxft_00000a2f_00000000-6_flash_fwd_hdim160_fp16_sm80.ptx"
#$ cicc --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim64_bf16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim64_bf16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr -arch compute_80 -m64 --no-version-ident -ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 -fast-math --gen_div_approx_ftz --include_file_name "tmpxft_00000a33_00000000-3_flash_fwd_hdim64_bf16_sm80.fatbin.c" -tused --module_id_file_name "/tmp/tmpxft_00000a33_00000000-4_flash_fwd_hdim64_bf16_sm80.module_id" --gen_c_file_name "/tmp/tmpxft_00000a33_00000000-6_flash_fwd_hdim64_bf16_sm80.cudafe1.c" --stub_file_name "/tmp/tmpxft_00000a33_00000000-6_flash_fwd_hdim64_bf16_sm80.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00000a33_00000000-6_flash_fwd_hdim64_bf16_sm80.cudafe1.gpu" "/tmp/tmpxft_00000a33_00000000-7_flash_fwd_hdim64_bf16_sm80.cpp1.ii" -o "/tmp/tmpxft_00000a33_00000000-6_flash_fwd_hdim64_bf16_sm80.ptx"
#$ cicc --c++17 --gnu_version=90400 --display_error_number --orig_src_file_name "kernels/flash_fwd_hdim224_fp16_sm80.cu" --orig_src_path_name "/root/maritaca/testes/candle/candle-original/candle-flash-attn/kernels/flash_fwd_hdim224_fp16_sm80.cu" --allow_managed --extended-lambda --relaxed_constexpr -arch compute_80 -m64 --no-version-ident -ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 -fast-math --gen_div_approx_ftz --include_file_name "tmpxft_00000a26_00000000-3_flash_fwd_hdim224_fp16_sm80.fatbin.c" -tused --module_id_file_name "/tmp/tmpxft_00000a26_00000000-4_flash_fwd_hdim224_fp16_sm80.module_id" --gen_c_file_name "/tmp/tmpxft_00000a26_00000000-6_flash_fwd_hdim224_fp16_sm80.cudafe1.c" --stub_file_name "/tmp/tmpxft_00000a26_00000000-6_flash_fwd_hdim224_fp16_sm80.cudafe1.stub.c" --gen_device_file_name "/tmp/tmpxft_00000a26_00000000-6_flash_fwd_hdim224_fp16_sm80.cudafe1.gpu" "/tmp/tmpxft_00000a26_00000000-7_flash_fwd_hdim224_fp16_sm80.cpp1.ii" -o "/tmp/tmpxft_00000a26_00000000-6_flash_fwd_hdim224_fp16_sm80.ptx"
#$ ptxas -arch=sm_80 -m64 "/tmp/tmpxft_00000a24_00000000-6_flash_api.ptx" -o "/tmp/tmpxft_00000a24_00000000-8_flash_api.sm_80.cubin"
#$ fatbinary -64 --cicc-cmdline="-ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 " "--image3=kind=elf,sm=80,file=/tmp/tmpxft_00000a24_00000000-8_flash_api.sm_80.cubin" "--image3=kind=ptx,sm=80,file=/tmp/tmpxft_00000a24_00000000-6_flash_api.ptx" --embedded-fatbin="/tmp/tmpxft_00000a24_00000000-3_flash_api.fatbin.c"
#$ rm /tmp/tmpxft_00000a24_00000000-3_flash_api.fatbin
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -c -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -m64 "/tmp/tmpxft_00000a24_00000000-6_flash_api.cudafe1.cpp" -o "/root/maritaca/testes/candle/candle-original/target/release/build/candle-flash-attn-e3705818c6c58200/out/flash_api-59d12f2bec85f63.o"
#$ ptxas -arch=sm_80 -m64 "/tmp/tmpxft_00000a2b_00000000-6_flash_fwd_hdim192_bf16_sm80.ptx" -o "/tmp/tmpxft_00000a2b_00000000-8_flash_fwd_hdim192_bf16_sm80.sm_80.cubin"
#$ ptxas -arch=sm_80 -m64 "/tmp/tmpxft_00000a2c_00000000-6_flash_fwd_hdim32_bf16_sm80.ptx" -o "/tmp/tmpxft_00000a2c_00000000-8_flash_fwd_hdim32_bf16_sm80.sm_80.cubin"
#$ ptxas -arch=sm_80 -m64 "/tmp/tmpxft_00000a29_00000000-6_flash_fwd_hdim32_fp16_sm80.ptx" -o "/tmp/tmpxft_00000a29_00000000-8_flash_fwd_hdim32_fp16_sm80.sm_80.cubin"
#$ ptxas -arch=sm_80 -m64 "/tmp/tmpxft_00000a2a_00000000-6_flash_fwd_hdim192_fp16_sm80.ptx" -o "/tmp/tmpxft_00000a2a_00000000-8_flash_fwd_hdim192_fp16_sm80.sm_80.cubin"
#$ ptxas -arch=sm_80 -m64 "/tmp/tmpxft_00000a33_00000000-6_flash_fwd_hdim64_bf16_sm80.ptx" -o "/tmp/tmpxft_00000a33_00000000-8_flash_fwd_hdim64_bf16_sm80.sm_80.cubin"
#$ fatbinary -64 --cicc-cmdline="-ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 " "--image3=kind=elf,sm=80,file=/tmp/tmpxft_00000a2b_00000000-8_flash_fwd_hdim192_bf16_sm80.sm_80.cubin" "--image3=kind=ptx,sm=80,file=/tmp/tmpxft_00000a2b_00000000-6_flash_fwd_hdim192_bf16_sm80.ptx" --embedded-fatbin="/tmp/tmpxft_00000a2b_00000000-3_flash_fwd_hdim192_bf16_sm80.fatbin.c"
#$ rm /tmp/tmpxft_00000a2b_00000000-3_flash_fwd_hdim192_bf16_sm80.fatbin
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -c -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -m64 "/tmp/tmpxft_00000a2b_00000000-6_flash_fwd_hdim192_bf16_sm80.cudafe1.cpp" -o "/root/maritaca/testes/candle/candle-original/target/release/build/candle-flash-attn-e3705818c6c58200/out/flash_fwd_hdim192_bf16_sm80-f7453c8601d43b17.o"
#$ ptxas -arch=sm_80 -m64 "/tmp/tmpxft_00000a2d_00000000-6_flash_fwd_hdim64_fp16_sm80.ptx" -o "/tmp/tmpxft_00000a2d_00000000-8_flash_fwd_hdim64_fp16_sm80.sm_80.cubin"
#$ fatbinary -64 --cicc-cmdline="-ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 " "--image3=kind=elf,sm=80,file=/tmp/tmpxft_00000a2a_00000000-8_flash_fwd_hdim192_fp16_sm80.sm_80.cubin" "--image3=kind=ptx,sm=80,file=/tmp/tmpxft_00000a2a_00000000-6_flash_fwd_hdim192_fp16_sm80.ptx" --embedded-fatbin="/tmp/tmpxft_00000a2a_00000000-3_flash_fwd_hdim192_fp16_sm80.fatbin.c"
#$ rm /tmp/tmpxft_00000a2a_00000000-3_flash_fwd_hdim192_fp16_sm80.fatbin
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -c -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -m64 "/tmp/tmpxft_00000a2a_00000000-6_flash_fwd_hdim192_fp16_sm80.cudafe1.cpp" -o "/root/maritaca/testes/candle/candle-original/target/release/build/candle-flash-attn-e3705818c6c58200/out/flash_fwd_hdim192_fp16_sm80-3981fe996a7e8814.o"
#$ ptxas -arch=sm_80 -m64 "/tmp/tmpxft_00000a34_00000000-6_flash_fwd_hdim96_bf16_sm80.ptx" -o "/tmp/tmpxft_00000a34_00000000-8_flash_fwd_hdim96_bf16_sm80.sm_80.cubin"
Segmentation fault (core dumped)
--error 0x8b --
thread '
stdout
stderr
note: run with RUST_BACKTRACE=1 environment variable to display a backtrace
#$ fatbinary -64 --cicc-cmdline="-ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 " "--image3=kind=elf,sm=80,file=/tmp/tmpxft_00000a33_00000000-8_flash_fwd_hdim64_bf16_sm80.sm_80.cubin" "--image3=kind=ptx,sm=80,file=/tmp/tmpxft_00000a33_00000000-6_flash_fwd_hdim64_bf16_sm80.ptx" --embedded-fatbin="/tmp/tmpxft_00000a33_00000000-3_flash_fwd_hdim64_bf16_sm80.fatbin.c"
#$ rm /tmp/tmpxft_00000a33_00000000-3_flash_fwd_hdim64_bf16_sm80.fatbin
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -c -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -m64 "/tmp/tmpxft_00000a33_00000000-6_flash_fwd_hdim64_bf16_sm80.cudafe1.cpp" -o "/root/maritaca/testes/candle/candle-original/target/release/build/candle-flash-attn-e3705818c6c58200/out/flash_fwd_hdim64_bf16_sm80-eaa7ce7f57eb7351.o"
#$ ptxas -arch=sm_80 -m64 "/tmp/tmpxft_00000a25_00000000-6_flash_fwd_hdim96_fp16_sm80.ptx" -o "/tmp/tmpxft_00000a25_00000000-8_flash_fwd_hdim96_fp16_sm80.sm_80.cubin"
#$ fatbinary -64 --cicc-cmdline="-ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 " "--image3=kind=elf,sm=80,file=/tmp/tmpxft_00000a29_00000000-8_flash_fwd_hdim32_fp16_sm80.sm_80.cubin" "--image3=kind=ptx,sm=80,file=/tmp/tmpxft_00000a29_00000000-6_flash_fwd_hdim32_fp16_sm80.ptx" --embedded-fatbin="/tmp/tmpxft_00000a29_00000000-3_flash_fwd_hdim32_fp16_sm80.fatbin.c"
#$ rm /tmp/tmpxft_00000a29_00000000-3_flash_fwd_hdim32_fp16_sm80.fatbin
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -c -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -m64 "/tmp/tmpxft_00000a29_00000000-6_flash_fwd_hdim32_fp16_sm80.cudafe1.cpp" -o "/root/maritaca/testes/candle/candle-original/target/release/build/candle-flash-attn-e3705818c6c58200/out/flash_fwd_hdim32_fp16_sm80-3a7585e74a278dc3.o"
#$ ptxas -arch=sm_80 -m64 "/tmp/tmpxft_00000a27_00000000-6_flash_fwd_hdim128_fp16_sm80.ptx" -o "/tmp/tmpxft_00000a27_00000000-8_flash_fwd_hdim128_fp16_sm80.sm_80.cubin"
#$ fatbinary -64 --cicc-cmdline="-ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 " "--image3=kind=elf,sm=80,file=/tmp/tmpxft_00000a2d_00000000-8_flash_fwd_hdim64_fp16_sm80.sm_80.cubin" "--image3=kind=ptx,sm=80,file=/tmp/tmpxft_00000a2d_00000000-6_flash_fwd_hdim64_fp16_sm80.ptx" --embedded-fatbin="/tmp/tmpxft_00000a2d_00000000-3_flash_fwd_hdim64_fp16_sm80.fatbin.c"
#$ rm /tmp/tmpxft_00000a2d_00000000-3_flash_fwd_hdim64_fp16_sm80.fatbin
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -c -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -m64 "/tmp/tmpxft_00000a2d_00000000-6_flash_fwd_hdim64_fp16_sm80.cudafe1.cpp" -o "/root/maritaca/testes/candle/candle-original/target/release/build/candle-flash-attn-e3705818c6c58200/out/flash_fwd_hdim64_fp16_sm80-a93563dad84e2972.o"
#$ ptxas -arch=sm_80 -m64 "/tmp/tmpxft_00000a28_00000000-6_flash_fwd_hdim128_bf16_sm80.ptx" -o "/tmp/tmpxft_00000a28_00000000-8_flash_fwd_hdim128_bf16_sm80.sm_80.cubin"
#$ fatbinary -64 --cicc-cmdline="-ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 " "--image3=kind=elf,sm=80,file=/tmp/tmpxft_00000a34_00000000-8_flash_fwd_hdim96_bf16_sm80.sm_80.cubin" "--image3=kind=ptx,sm=80,file=/tmp/tmpxft_00000a34_00000000-6_flash_fwd_hdim96_bf16_sm80.ptx" --embedded-fatbin="/tmp/tmpxft_00000a34_00000000-3_flash_fwd_hdim96_bf16_sm80.fatbin.c"
#$ rm /tmp/tmpxft_00000a34_00000000-3_flash_fwd_hdim96_bf16_sm80.fatbin
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -c -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -m64 "/tmp/tmpxft_00000a34_00000000-6_flash_fwd_hdim96_bf16_sm80.cudafe1.cpp" -o "/root/maritaca/testes/candle/candle-original/target/release/build/candle-flash-attn-e3705818c6c58200/out/flash_fwd_hdim96_bf16_sm80-f51ba409eb93ce41.o"
#$ fatbinary -64 --cicc-cmdline="-ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 " "--image3=kind=elf,sm=80,file=/tmp/tmpxft_00000a25_00000000-8_flash_fwd_hdim96_fp16_sm80.sm_80.cubin" "--image3=kind=ptx,sm=80,file=/tmp/tmpxft_00000a25_00000000-6_flash_fwd_hdim96_fp16_sm80.ptx" --embedded-fatbin="/tmp/tmpxft_00000a25_00000000-3_flash_fwd_hdim96_fp16_sm80.fatbin.c"
#$ rm /tmp/tmpxft_00000a25_00000000-3_flash_fwd_hdim96_fp16_sm80.fatbin
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -c -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -m64 "/tmp/tmpxft_00000a25_00000000-6_flash_fwd_hdim96_fp16_sm80.cudafe1.cpp" -o "/root/maritaca/testes/candle/candle-original/target/release/build/candle-flash-attn-e3705818c6c58200/out/flash_fwd_hdim96_fp16_sm80-791226771e2c8c97.o"
#$ ptxas -arch=sm_80 -m64 "/tmp/tmpxft_00000a31_00000000-6_flash_fwd_hdim160_bf16_sm80.ptx" -o "/tmp/tmpxft_00000a31_00000000-8_flash_fwd_hdim160_bf16_sm80.sm_80.cubin"
#$ fatbinary -64 --cicc-cmdline="-ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 " "--image3=kind=elf,sm=80,file=/tmp/tmpxft_00000a27_00000000-8_flash_fwd_hdim128_fp16_sm80.sm_80.cubin" "--image3=kind=ptx,sm=80,file=/tmp/tmpxft_00000a27_00000000-6_flash_fwd_hdim128_fp16_sm80.ptx" --embedded-fatbin="/tmp/tmpxft_00000a27_00000000-3_flash_fwd_hdim128_fp16_sm80.fatbin.c"
#$ rm /tmp/tmpxft_00000a27_00000000-3_flash_fwd_hdim128_fp16_sm80.fatbin
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -c -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -m64 "/tmp/tmpxft_00000a27_00000000-6_flash_fwd_hdim128_fp16_sm80.cudafe1.cpp" -o "/root/maritaca/testes/candle/candle-original/target/release/build/candle-flash-attn-e3705818c6c58200/out/flash_fwd_hdim128_fp16_sm80-759fdfecd1f0ed1c.o"
#$ ptxas -arch=sm_80 -m64 "/tmp/tmpxft_00000a2f_00000000-6_flash_fwd_hdim160_fp16_sm80.ptx" -o "/tmp/tmpxft_00000a2f_00000000-8_flash_fwd_hdim160_fp16_sm80.sm_80.cubin"
#$ ptxas -arch=sm_80 -m64 "/tmp/tmpxft_00000a30_00000000-6_flash_fwd_hdim224_bf16_sm80.ptx" -o "/tmp/tmpxft_00000a30_00000000-8_flash_fwd_hdim224_bf16_sm80.sm_80.cubin"
#$ fatbinary -64 --cicc-cmdline="-ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 " "--image3=kind=elf,sm=80,file=/tmp/tmpxft_00000a28_00000000-8_flash_fwd_hdim128_bf16_sm80.sm_80.cubin" "--image3=kind=ptx,sm=80,file=/tmp/tmpxft_00000a28_00000000-6_flash_fwd_hdim128_bf16_sm80.ptx" --embedded-fatbin="/tmp/tmpxft_00000a28_00000000-3_flash_fwd_hdim128_bf16_sm80.fatbin.c"
#$ rm /tmp/tmpxft_00000a28_00000000-3_flash_fwd_hdim128_bf16_sm80.fatbin
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -c -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -m64 "/tmp/tmpxft_00000a28_00000000-6_flash_fwd_hdim128_bf16_sm80.cudafe1.cpp" -o "/root/maritaca/testes/candle/candle-original/target/release/build/candle-flash-attn-e3705818c6c58200/out/flash_fwd_hdim128_bf16_sm80-f1ff254233809e96.o"
#$ ptxas -arch=sm_80 -m64 "/tmp/tmpxft_00000a26_00000000-6_flash_fwd_hdim224_fp16_sm80.ptx" -o "/tmp/tmpxft_00000a26_00000000-8_flash_fwd_hdim224_fp16_sm80.sm_80.cubin"
#$ ptxas -arch=sm_80 -m64 "/tmp/tmpxft_00000a2e_00000000-6_flash_fwd_hdim256_fp16_sm80.ptx" -o "/tmp/tmpxft_00000a2e_00000000-8_flash_fwd_hdim256_fp16_sm80.sm_80.cubin"
#$ ptxas -arch=sm_80 -m64 "/tmp/tmpxft_00000a32_00000000-6_flash_fwd_hdim256_bf16_sm80.ptx" -o "/tmp/tmpxft_00000a32_00000000-8_flash_fwd_hdim256_bf16_sm80.sm_80.cubin"
#$ fatbinary -64 --cicc-cmdline="-ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 " "--image3=kind=elf,sm=80,file=/tmp/tmpxft_00000a30_00000000-8_flash_fwd_hdim224_bf16_sm80.sm_80.cubin" "--image3=kind=ptx,sm=80,file=/tmp/tmpxft_00000a30_00000000-6_flash_fwd_hdim224_bf16_sm80.ptx" --embedded-fatbin="/tmp/tmpxft_00000a30_00000000-3_flash_fwd_hdim224_bf16_sm80.fatbin.c"
#$ rm /tmp/tmpxft_00000a30_00000000-3_flash_fwd_hdim224_bf16_sm80.fatbin
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -c -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -m64 "/tmp/tmpxft_00000a30_00000000-6_flash_fwd_hdim224_bf16_sm80.cudafe1.cpp" -o "/root/maritaca/testes/candle/candle-original/target/release/build/candle-flash-attn-e3705818c6c58200/out/flash_fwd_hdim224_bf16_sm80-9b2b93dbac21043c.o"
#$ fatbinary -64 --cicc-cmdline="-ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 " "--image3=kind=elf,sm=80,file=/tmp/tmpxft_00000a31_00000000-8_flash_fwd_hdim160_bf16_sm80.sm_80.cubin" "--image3=kind=ptx,sm=80,file=/tmp/tmpxft_00000a31_00000000-6_flash_fwd_hdim160_bf16_sm80.ptx" --embedded-fatbin="/tmp/tmpxft_00000a31_00000000-3_flash_fwd_hdim160_bf16_sm80.fatbin.c"
#$ rm /tmp/tmpxft_00000a31_00000000-3_flash_fwd_hdim160_bf16_sm80.fatbin
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -c -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -m64 "/tmp/tmpxft_00000a31_00000000-6_flash_fwd_hdim160_bf16_sm80.cudafe1.cpp" -o "/root/maritaca/testes/candle/candle-original/target/release/build/candle-flash-attn-e3705818c6c58200/out/flash_fwd_hdim160_bf16_sm80-b8e226bc00ecbaf1.o"
#$ fatbinary -64 --cicc-cmdline="-ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 " "--image3=kind=elf,sm=80,file=/tmp/tmpxft_00000a2f_00000000-8_flash_fwd_hdim160_fp16_sm80.sm_80.cubin" "--image3=kind=ptx,sm=80,file=/tmp/tmpxft_00000a2f_00000000-6_flash_fwd_hdim160_fp16_sm80.ptx" --embedded-fatbin="/tmp/tmpxft_00000a2f_00000000-3_flash_fwd_hdim160_fp16_sm80.fatbin.c"
#$ rm /tmp/tmpxft_00000a2f_00000000-3_flash_fwd_hdim160_fp16_sm80.fatbin
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -c -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -m64 "/tmp/tmpxft_00000a2f_00000000-6_flash_fwd_hdim160_fp16_sm80.cudafe1.cpp" -o "/root/maritaca/testes/candle/candle-original/target/release/build/candle-flash-attn-e3705818c6c58200/out/flash_fwd_hdim160_fp16_sm80-17db6cdd19f7f98b.o"
#$ fatbinary -64 --cicc-cmdline="-ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 " "--image3=kind=elf,sm=80,file=/tmp/tmpxft_00000a26_00000000-8_flash_fwd_hdim224_fp16_sm80.sm_80.cubin" "--image3=kind=ptx,sm=80,file=/tmp/tmpxft_00000a26_00000000-6_flash_fwd_hdim224_fp16_sm80.ptx" --embedded-fatbin="/tmp/tmpxft_00000a26_00000000-3_flash_fwd_hdim224_fp16_sm80.fatbin.c"
#$ rm /tmp/tmpxft_00000a26_00000000-3_flash_fwd_hdim224_fp16_sm80.fatbin
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -c -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -m64 "/tmp/tmpxft_00000a26_00000000-6_flash_fwd_hdim224_fp16_sm80.cudafe1.cpp" -o "/root/maritaca/testes/candle/candle-original/target/release/build/candle-flash-attn-e3705818c6c58200/out/flash_fwd_hdim224_fp16_sm80-54d101fd022eab36.o"
#$ fatbinary -64 --cicc-cmdline="-ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 " "--image3=kind=elf,sm=80,file=/tmp/tmpxft_00000a2e_00000000-8_flash_fwd_hdim256_fp16_sm80.sm_80.cubin" "--image3=kind=ptx,sm=80,file=/tmp/tmpxft_00000a2e_00000000-6_flash_fwd_hdim256_fp16_sm80.ptx" --embedded-fatbin="/tmp/tmpxft_00000a2e_00000000-3_flash_fwd_hdim256_fp16_sm80.fatbin.c"
#$ rm /tmp/tmpxft_00000a2e_00000000-3_flash_fwd_hdim256_fp16_sm80.fatbin
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -c -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -m64 "/tmp/tmpxft_00000a2e_00000000-6_flash_fwd_hdim256_fp16_sm80.cudafe1.cpp" -o "/root/maritaca/testes/candle/candle-original/target/release/build/candle-flash-attn-e3705818c6c58200/out/flash_fwd_hdim256_fp16_sm80-6bbb415157454ca9.o"
#$ fatbinary -64 --cicc-cmdline="-ftz=1 -prec_div=0 -prec_sqrt=0 -fmad=1 " "--image3=kind=elf,sm=80,file=/tmp/tmpxft_00000a32_00000000-8_flash_fwd_hdim256_bf16_sm80.sm_80.cubin" "--image3=kind=ptx,sm=80,file=/tmp/tmpxft_00000a32_00000000-6_flash_fwd_hdim256_bf16_sm80.ptx" --embedded-fatbin="/tmp/tmpxft_00000a32_00000000-3_flash_fwd_hdim256_bf16_sm80.fatbin.c"
#$ rm /tmp/tmpxft_00000a32_00000000-3_flash_fwd_hdim256_bf16_sm80.fatbin
#$ gcc -std=c++17 -D__CUDA_ARCH__=800 -D__CUDA_ARCH_LIST__=800 -c -x c++ -DCUDA_DOUBLE_MATH_FUNCTIONS -O3 -I"cutlass/include" "-I/usr/local/cuda/bin/../targets/x86_64-linux/include" -m64 "/tmp/tmpxft_00000a32_00000000-6_flash_fwd_hdim256_bf16_sm80.cudafe1.cpp" -o "/root/maritaca/testes/candle/candle-original/target/release/build/candle-flash-attn-e3705818c6c58200/out/flash_fwd_hdim256_bf16_sm80-21dd0f7dd998e506.o"
Not sure what is going on here, and the error message is certainly pretty hard to read (not sure why it's so verbose). The culprit seems to be ptxas segfaulting. One possibility is that it runs out of memory because it compiles many kernels in parallel. You may want to check whether there is any trace of this happening and, if that's the case, reduce the number of parallel jobs (I think you should be able to do so via `RAYON_NUM_THREADS=...` as parallelism is controlled via rayon, but it's better to check on the box how many jobs are actually running to be sure).
It seems that there are 32 parallel jobs running the compilation (I checked with `ps aux | grep cicc | wc -l`; is this appropriate?). The machine I'm using has 48 cores and 252GB of RAM. I believe it's not an out-of-memory issue because the max memory usage was ~61GB.
Running with `RAYON_NUM_THREADS=1 cargo run --example llama --release --features "cuda flash-attn" -- --prompt "hi"` also didn't work - and took a while to break :sweat_smile:.
Agreed that it's unlikely to be an out-of-memory issue, though if you can check the output of `dmesg -T` to see if there is anything about the crash, that would help to be sure (there are also ptxas processes on top of cicc).
Besides this, if it's still breaking on the same segmentation fault in ptxas, I don't have many more thoughts on how to debug this. I just double-checked that the current version works properly on my side and it was all good; that's on CUDA 12.2, but I doubt that it would make much of a difference here.
I'm not familiar with the dmesg tool, but here's the output. The result of my last test is probably the last 2 lines:
[seg mar 25 18:10:40 2024] docker0: port 7(vethd17463b) entered disabled state
[seg mar 25 18:10:40 2024] device vethd17463b entered promiscuous mode
[seg mar 25 18:10:40 2024] eth0: renamed from veth5ca2d2a
[seg mar 25 18:10:40 2024] IPv6: ADDRCONF(NETDEV_CHANGE): vethd17463b: link becomes ready
[seg mar 25 18:10:40 2024] docker0: port 7(vethd17463b) entered blocking state
[seg mar 25 18:10:40 2024] docker0: port 7(vethd17463b) entered forwarding state
[seg mar 25 18:17:40 2024] ptxas[2267989]: segfault at 18 ip 000000000085642b sp 00007ffe1d0011e0 error 4 in ptxas[400000+117e000]
[seg mar 25 18:17:40 2024] Code: 83 ec 48 48 89 14 24 8b 57 50 89 74 24 1c 48 89 4c 24 08 85 d2 0f 85 d4 05 00 00 41 8b 47 78 45 31 e4 85 c0 0f 85 a5 05 00 00 <4d> 8b 75 18 49 8b 2f 4d 85 f6 0f 84 d8 00 00 00 49 8b 5e 18 8b 53
[seg mar 25 18:54:19 2024] ptxas[2273513]: segfault at 18 ip 000000000085642b sp 00007ffd2b8bee90 error 4 in ptxas[400000+117e000]
[seg mar 25 18:54:19 2024] Code: 83 ec 48 48 89 14 24 8b 57 50 89 74 24 1c 48 89 4c 24 08 85 d2 0f 85 d4 05 00 00 41 8b 47 78 45 31 e4 85 c0 0f 85 a5 05 00 00 <4d> 8b 75 18 49 8b 2f 4d 85 f6 0f 84 d8 00 00 00 49 8b 5e 18 8b 53
[seg mar 25 19:00:05 2024] ptxas[2273923]: segfault at 18 ip 000000000085642b sp 00007ffe12ea8770 error 4 in ptxas[400000+117e000]
[seg mar 25 19:00:05 2024] Code: 83 ec 48 48 89 14 24 8b 57 50 89 74 24 1c 48 89 4c 24 08 85 d2 0f 85 d4 05 00 00 41 8b 47 78 45 31 e4 85 c0 0f 85 a5 05 00 00 <4d> 8b 75 18 49 8b 2f 4d 85 f6 0f 84 d8 00 00 00 49 8b 5e 18 8b 53
[seg mar 25 19:46:27 2024] ptxas[2274415]: segfault at 18 ip 000000000085642b sp 00007fff8993e8a0 error 4 in ptxas[400000+117e000]
[seg mar 25 19:46:27 2024] Code: 83 ec 48 48 89 14 24 8b 57 50 89 74 24 1c 48 89 4c 24 08 85 d2 0f 85 d4 05 00 00 41 8b 47 78 45 31 e4 85 c0 0f 85 a5 05 00 00 <4d> 8b 75 18 49 8b 2f 4d 85 f6 0f 84 d8 00 00 00 49 8b 5e 18 8b 53
[seg mar 25 19:53:25 2024] ptxas[2278540]: segfault at 18 ip 000000000085642b sp 00007ffd83e74610 error 4 in ptxas[400000+117e000]
[seg mar 25 19:53:25 2024] Code: 83 ec 48 48 89 14 24 8b 57 50 89 74 24 1c 48 89 4c 24 08 85 d2 0f 85 d4 05 00 00 41 8b 47 78 45 31 e4 85 c0 0f 85 a5 05 00 00 <4d> 8b 75 18 49 8b 2f 4d 85 f6 0f 84 d8 00 00 00 49 8b 5e 18 8b 53
[seg mar 25 20:07:20 2024] ptxas[2279025]: segfault at 18 ip 000000000085642b sp 00007fff95457950 error 4 in ptxas[400000+117e000]
[seg mar 25 20:07:20 2024] Code: 83 ec 48 48 89 14 24 8b 57 50 89 74 24 1c 48 89 4c 24 08 85 d2 0f 85 d4 05 00 00 41 8b 47 78 45 31 e4 85 c0 0f 85 a5 05 00 00 <4d> 8b 75 18 49 8b 2f 4d 85 f6 0f 84 d8 00 00 00 49 8b 5e 18 8b 53
[seg mar 25 20:21:46 2024] ptxas[2279514]: segfault at 18 ip 000000000085642b sp 00007ffd55e74620 error 4 in ptxas[400000+117e000]
[seg mar 25 20:21:46 2024] Code: 83 ec 48 48 89 14 24 8b 57 50 89 74 24 1c 48 89 4c 24 08 85 d2 0f 85 d4 05 00 00 41 8b 47 78 45 31 e4 85 c0 0f 85 a5 05 00 00 <4d> 8b 75 18 49 8b 2f 4d 85 f6 0f 84 d8 00 00 00 49 8b 5e 18 8b 53
My use case involves generating two binaries compatible with CUDA versions 11.x and 12.x. To achieve this, I'm using Docker and an older version of CUDA 12. For the 11.x version, I'm using version 11.3.1, which works well in the environments I have tested. However, I'm having some compatibility problems with version 12. Is there a better approach to releasing a binary with the aim of maximizing compatibility?
In summary, I can't use Flash Attention with CUDA 12.0 (nvidia/cuda:12.0.1-devel-ubuntu22.04) and 12.1 (nvidia/cuda:12.1.1-devel-ubuntu22.04). With version 12.2 (nvidia/cuda:12.2.0-devel-ubuntu22.04) and beyond, it works fine.
Interesting, my guess is that the upstream version of flash attention has the same issue, as our code is taken almost directly from there.
The Flash Attention Python library works in these environments, maybe they're doing something more sophisticated during installation. I'm considering releasing a binary without Flash Attention for these specific CUDA versions, but it would be perfect if it worked seamlessly.
Getting the same problem on a fresh aws instance as well! @hugoabonizio any luck with this problem? Do I just upgrade CUDA?
Repro (the same problem occurs even with nvidia/cuda:12.1.1-devel-ubuntu20.04):
docker run -it --gpus all nvidia/cuda:12.1.1-devel-ubuntu20.04 /bin/bash
apt-get -y update; apt-get -y install curl; apt-get install git; apt install pkg-config; apt install -y libssl-dev
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
git submodule update --init --recursive
cargo run --example llama --release --features "cuda flash-attn" -- --prompt "hi"
Linked issue: https://github.com/Dao-AILab/flash-attention/issues/1004