Add warp_count to the procedural_name of TileDescription
Hi,
this commit adds warp_count to the procedural_name of TileDescription.
Please review, thanks : )
Would you please show me the kernel name of gemm and conv now?
https://github.com/NVIDIA/cutlass/issues/286
Would you please show me the kernel name of gemm and conv now?
I ran `python3 generator.py`; here is one of the generated files:
cutlass_tensorop_u8_i8816gemm_u8_256x128x64_4x2x1_2_n32t32_align16.cu
/*
Generated by gemm_operation.py - Do not edit.
*/
///////////////////////////////////////////////////////////////////////////////////////////////////
#include "cutlass/arch/wmma.h"
#include "cutlass/cutlass.h"
#include "cutlass/library/library.h"
#include "cutlass/library/manifest.h"
#include "library_internal.h"
#include "gemm_operation.h"
///////////////////////////////////////////////////////////////////////////////////////////////////
// Gemm operator cutlass_tensorop_u8_i8816gemm_u8_256x128x64_4x2x1_2_n32t32_align16
//
// Kernel type obtained from cutlass::gemm::kernel::DefaultGemmUniversal<...>::GemmKernel.
// The "256x128x64_4x2x1_2" portion of the name lines up with the arguments below:
// the first GemmShape is 256x128x64, dividing it element-wise by the second
// GemmShape (64x64x64) gives 4x2x1, and the integer argument near the end is 2.
// NOTE(review): the per-argument roles noted below are assumed from the kernel
// name, not shown in this file -- confirm against the DefaultGemmUniversal declaration.
using cutlass_tensorop_u8_i8816gemm_u8_256x128x64_4x2x1_2_n32t32_align16_base =
typename cutlass::gemm::kernel::DefaultGemmUniversal<
uint8_t, cutlass::layout::ColumnMajorInterleaved<32>, cutlass::ComplexTransform::kNone, 16, // presumably operand A ("n32", "align16" in the name)
uint8_t, cutlass::layout::RowMajorInterleaved<32>, cutlass::ComplexTransform::kNone, 16, // presumably operand B ("t32", "align16" in the name)
uint8_t, cutlass::layout::ColumnMajorInterleaved<32>, // presumably the output operand's element type and layout
int32_t, // presumably the accumulator type -- TODO confirm
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<256, 128, 64>, // "256x128x64" in the name; presumably the threadblock tile
cutlass::gemm::GemmShape<64, 64, 64>, // presumably the warp tile; 256/64 x 128/64 x 64/64 = "4x2x1"
cutlass::gemm::GemmShape<8, 8, 16>, // "8816" in the name; presumably the instruction shape
cutlass::epilogue::thread::LinearCombinationClamp<
uint8_t,
8,
int32_t,
float
>,
cutlass::gemm::threadblock::GemmIdentityThreadblockSwizzle<8>,
2, // the "_2_" in the name; presumably the pipeline stage count
cutlass::arch::OpMultiplyAddSaturate
>::GemmKernel;
// Named kernel type: an empty struct deriving from the alias above, so the
// kernel is referred to by its own type name instead of the full template
// instantiation. (Base access is public by default for a struct.)
struct cutlass_tensorop_u8_i8816gemm_u8_256x128x64_4x2x1_2_n32t32_align16
  : cutlass_tensorop_u8_i8816gemm_u8_256x128x64_4x2x1_2_n32t32_align16_base {};
///////////////////////////////////////////////////////////////////////////////////////////////////
namespace cutlass {
namespace library {
///////////////////////////////////////////////////////////////////////////////////////////////////
/// Appends a GemmUniversalOperation for this kernel to `manifest`.
///
/// The operation is heap-allocated and constructed with the kernel's
/// procedural name as its string argument.
/// NOTE(review): presumably `manifest` takes ownership of the pointer -- confirm
/// against Manifest::append.
void initialize_cutlass_tensorop_u8_i8816gemm_u8_256x128x64_4x2x1_2_n32t32_align16(Manifest &manifest) {
  using Kernel = cutlass_tensorop_u8_i8816gemm_u8_256x128x64_4x2x1_2_n32t32_align16;
  using DeviceAdapter = cutlass::gemm::device::GemmUniversalAdapter<Kernel>;
  using Operation = GemmUniversalOperation<DeviceAdapter>;

  manifest.append(
      new Operation("cutlass_tensorop_u8_i8816gemm_u8_256x128x64_4x2x1_2_n32t32_align16"));
}
///////////////////////////////////////////////////////////////////////////////////////////////////
} // namespace library
} // namespace cutlass
///////////////////////////////////////////////////////////////////////////////////////////////////
Thanks, would you please list the new kernel name of fprop, wgrad, and dgrad?
Filenames in generated/gemm
cutlass_simt_cgemm_128x128x8_4x2x1_2_nn_align1.cu
cutlass_simt_cgemm_128x128x8_4x2x1_2_nt_align1.cu
cutlass_simt_cgemm_128x128x8_4x2x1_2_tn_align1.cu
cutlass_simt_cgemm_128x128x8_4x2x1_2_tt_align1.cu
cutlass_simt_cgemm_128x128x8_4x2x1_5_cc_align1.cu
cutlass_simt_cgemm_128x128x8_4x2x1_5_ch_align1.cu
cutlass_simt_cgemm_128x128x8_4x2x1_5_cn_align1.cu
cutlass_simt_cgemm_128x128x8_4x2x1_5_ct_align1.cu
cutlass_simt_cgemm_128x128x8_4x2x1_5_hc_align1.cu
cutlass_simt_cgemm_128x128x8_4x2x1_5_hh_align1.cu
cutlass_simt_cgemm_128x128x8_4x2x1_5_hn_align1.cu
cutlass_simt_cgemm_128x128x8_4x2x1_5_ht_align1.cu
cutlass_simt_cgemm_128x128x8_4x2x1_5_nc_align1.cu
cutlass_simt_cgemm_128x128x8_4x2x1_5_nh_align1.cu
cutlass_simt_cgemm_128x128x8_4x2x1_5_nn_align1.cu
cutlass_simt_cgemm_128x128x8_4x2x1_5_nt_align1.cu
cutlass_simt_cgemm_128x128x8_4x2x1_5_tc_align1.cu
cutlass_simt_cgemm_128x128x8_4x2x1_5_th_align1.cu
cutlass_simt_cgemm_128x128x8_4x2x1_5_tn_align1.cu
cutlass_simt_cgemm_128x128x8_4x2x1_5_tt_align1.cu
cutlass_simt_dgemm_128x128x8_4x2x1_2_nn_align1.cu
cutlass_simt_dgemm_128x128x8_4x2x1_2_nt_align1.cu
cutlass_simt_dgemm_128x128x8_4x2x1_2_tn_align1.cu
cutlass_simt_dgemm_128x128x8_4x2x1_2_tt_align1.cu
cutlass_simt_dgemm_128x128x8_4x2x1_3_nn_align1.cu
cutlass_simt_dgemm_128x128x8_4x2x1_3_nt_align1.cu
cutlass_simt_dgemm_128x128x8_4x2x1_3_tn_align1.cu
cutlass_simt_dgemm_128x128x8_4x2x1_3_tt_align1.cu
cutlass_simt_hgemm_256x128x8_4x2x1_2_nn_align1.cu
cutlass_simt_hgemm_256x128x8_4x2x1_2_nt_align1.cu
cutlass_simt_hgemm_256x128x8_4x2x1_2_tn_align1.cu
cutlass_simt_hgemm_256x128x8_4x2x1_2_tt_align1.cu
cutlass_simt_igemm_s8_128x128x32_4x2x1_2_nn_align1.cu
cutlass_simt_igemm_s8_128x128x32_4x2x1_2_nt_align1.cu
cutlass_simt_igemm_s8_128x128x32_4x2x1_2_tn_align1.cu
cutlass_simt_igemm_s8_128x128x32_4x2x1_2_tt_align1.cu
cutlass_simt_s8_igemm_s8_128x128x32_4x2x1_2_nn_align1.cu
cutlass_simt_s8_igemm_s8_128x128x32_4x2x1_2_nt_align1.cu
cutlass_simt_s8_igemm_s8_128x128x32_4x2x1_2_tn_align1.cu
cutlass_simt_s8_igemm_s8_128x128x32_4x2x1_2_tt_align1.cu
cutlass_simt_sgemm_128x128x8_4x2x1_2_nn_align1.cu
cutlass_simt_sgemm_128x128x8_4x2x1_2_nt_align1.cu
cutlass_simt_sgemm_128x128x8_4x2x1_2_tn_align1.cu
cutlass_simt_sgemm_128x128x8_4x2x1_2_tt_align1.cu
cutlass_simt_sgemm_256x128x8_4x2x1_5_nn_align1.cu
cutlass_simt_sgemm_256x128x8_4x2x1_5_nt_align1.cu
cutlass_simt_sgemm_256x128x8_4x2x1_5_tn_align1.cu
cutlass_simt_sgemm_256x128x8_4x2x1_5_tt_align1.cu
cutlass_tensorop_bf16_s16816gemm_bf16_256x128x32_4x2x1_3_nn_align8.cu
cutlass_tensorop_bf16_s16816gemm_bf16_256x128x32_4x2x1_3_nt_align8.cu
cutlass_tensorop_bf16_s16816gemm_bf16_256x128x32_4x2x1_3_tn_align8.cu
cutlass_tensorop_bf16_s16816gemm_bf16_256x128x32_4x2x1_3_tt_align8.cu
cutlass_tensorop_c1688gemm_128x64x16_4x2x1_4_cc_align1.cu
cutlass_tensorop_c1688gemm_128x64x16_4x2x1_4_ch_align1.cu
cutlass_tensorop_c1688gemm_128x64x16_4x2x1_4_cn_align1.cu
cutlass_tensorop_c1688gemm_128x64x16_4x2x1_4_ct_align1.cu
cutlass_tensorop_c1688gemm_128x64x16_4x2x1_4_hc_align1.cu
cutlass_tensorop_c1688gemm_128x64x16_4x2x1_4_hh_align1.cu
cutlass_tensorop_c1688gemm_128x64x16_4x2x1_4_hn_align1.cu
cutlass_tensorop_c1688gemm_128x64x16_4x2x1_4_ht_align1.cu
cutlass_tensorop_c1688gemm_128x64x16_4x2x1_4_nc_align1.cu
cutlass_tensorop_c1688gemm_128x64x16_4x2x1_4_nh_align1.cu
cutlass_tensorop_c1688gemm_128x64x16_4x2x1_4_nn_align1.cu
cutlass_tensorop_c1688gemm_128x64x16_4x2x1_4_nt_align1.cu
cutlass_tensorop_c1688gemm_128x64x16_4x2x1_4_tc_align1.cu
cutlass_tensorop_c1688gemm_128x64x16_4x2x1_4_th_align1.cu
cutlass_tensorop_c1688gemm_128x64x16_4x2x1_4_tn_align1.cu
cutlass_tensorop_c1688gemm_128x64x16_4x2x1_4_tt_align1.cu
cutlass_tensorop_d884gemm_128x128x16_4x2x1_3_nn_align1.cu
cutlass_tensorop_d884gemm_128x128x16_4x2x1_3_nt_align1.cu
cutlass_tensorop_d884gemm_128x128x16_4x2x1_3_tn_align1.cu
cutlass_tensorop_d884gemm_128x128x16_4x2x1_3_tt_align1.cu
cutlass_tensorop_f16_s16816gemm_f16_256x128x32_4x2x1_3_nn_align8.cu
cutlass_tensorop_f16_s16816gemm_f16_256x128x32_4x2x1_3_nt_align8.cu
cutlass_tensorop_f16_s16816gemm_f16_256x128x32_4x2x1_3_tn_align8.cu
cutlass_tensorop_f16_s16816gemm_f16_256x128x32_4x2x1_3_tt_align8.cu
cutlass_tensorop_f16_s1688gemm_f16_256x128x32_4x2x1_2_nn_align8.cu
cutlass_tensorop_f16_s1688gemm_f16_256x128x32_4x2x1_2_nt_align8.cu
cutlass_tensorop_f16_s1688gemm_f16_256x128x32_4x2x1_2_tn_align8.cu
cutlass_tensorop_f16_s1688gemm_f16_256x128x32_4x2x1_2_tt_align8.cu
cutlass_tensorop_f16_s884gemm_f16_256x128x32_4x2x1_2_nn_align8.cu
cutlass_tensorop_f16_s884gemm_f16_256x128x32_4x2x1_2_nt_align8.cu
cutlass_tensorop_f16_s884gemm_f16_256x128x32_4x2x1_2_tn_align8.cu
cutlass_tensorop_f16_s884gemm_f16_256x128x32_4x2x1_2_tt_align8.cu
cutlass_tensorop_gz884gemm_64x64x8_4x2x1_3_cc_align1.cu
cutlass_tensorop_gz884gemm_64x64x8_4x2x1_3_ch_align1.cu
cutlass_tensorop_gz884gemm_64x64x8_4x2x1_3_cn_align1.cu
cutlass_tensorop_gz884gemm_64x64x8_4x2x1_3_ct_align1.cu
cutlass_tensorop_gz884gemm_64x64x8_4x2x1_3_hc_align1.cu
cutlass_tensorop_gz884gemm_64x64x8_4x2x1_3_hh_align1.cu
cutlass_tensorop_gz884gemm_64x64x8_4x2x1_3_hn_align1.cu
cutlass_tensorop_gz884gemm_64x64x8_4x2x1_3_ht_align1.cu
cutlass_tensorop_gz884gemm_64x64x8_4x2x1_3_nc_align1.cu
cutlass_tensorop_gz884gemm_64x64x8_4x2x1_3_nh_align1.cu
cutlass_tensorop_gz884gemm_64x64x8_4x2x1_3_nn_align1.cu
cutlass_tensorop_gz884gemm_64x64x8_4x2x1_3_nt_align1.cu
cutlass_tensorop_gz884gemm_64x64x8_4x2x1_3_tc_align1.cu
cutlass_tensorop_gz884gemm_64x64x8_4x2x1_3_th_align1.cu
cutlass_tensorop_gz884gemm_64x64x8_4x2x1_3_tn_align1.cu
cutlass_tensorop_gz884gemm_64x64x8_4x2x1_3_tt_align1.cu
cutlass_tensorop_h16816gemm_256x128x32_4x2x1_3_nn_align8.cu
cutlass_tensorop_h16816gemm_256x128x32_4x2x1_3_nt_align8.cu
cutlass_tensorop_h16816gemm_256x128x32_4x2x1_3_tn_align8.cu
cutlass_tensorop_h16816gemm_256x128x32_4x2x1_3_tt_align8.cu
cutlass_tensorop_h1688gemm_256x128x32_4x2x1_2_nn_align8.cu
cutlass_tensorop_h1688gemm_256x128x32_4x2x1_2_nt_align8.cu
cutlass_tensorop_h1688gemm_256x128x32_4x2x1_2_tn_align8.cu
cutlass_tensorop_h1688gemm_256x128x32_4x2x1_2_tt_align8.cu
cutlass_tensorop_h884gemm_256x128x32_4x2x1_2_nn_align8.cu
cutlass_tensorop_h884gemm_256x128x32_4x2x1_2_nt_align8.cu
cutlass_tensorop_h884gemm_256x128x32_4x2x1_2_tn_align8.cu
cutlass_tensorop_h884gemm_256x128x32_4x2x1_2_tt_align8.cu
cutlass_tensorop_i168256xorgemm_b1_256x128x512_4x2x1_3_tn_align128.cu
cutlass_tensorop_i16832gemm_s8_256x128x64_4x2x1_3_tn_align16.cu
cutlass_tensorop_i16832gemm_u8_256x128x64_4x2x1_3_tn_align16.cu
cutlass_tensorop_i16864gemm_s4_256x128x128_4x2x1_3_tn_align32.cu
cutlass_tensorop_i16864gemm_u4_256x128x128_4x2x1_3_tn_align32.cu
cutlass_tensorop_i88128xorgemm_b1_256x128x512_4x2x1_2_tn_align128.cu
cutlass_tensorop_i8816gemm_s8_256x128x64_4x2x1_2_tn_align16.cu
cutlass_tensorop_i8816gemm_u8_256x128x64_4x2x1_2_tn_align16.cu
cutlass_tensorop_i8832gemm_s4_256x128x128_4x2x1_2_tn_align32.cu
cutlass_tensorop_i8832gemm_u4_256x128x128_4x2x1_2_tn_align32.cu
cutlass_tensorop_s16816gemm_bf16_256x128x32_4x2x1_3_nn_align8.cu
cutlass_tensorop_s16816gemm_bf16_256x128x32_4x2x1_3_nt_align8.cu
cutlass_tensorop_s16816gemm_bf16_256x128x32_4x2x1_3_tn_align8.cu
cutlass_tensorop_s16816gemm_bf16_256x128x32_4x2x1_3_tt_align8.cu
cutlass_tensorop_s16816gemm_f16_256x128x32_4x2x1_3_nn_align8.cu
cutlass_tensorop_s16816gemm_f16_256x128x32_4x2x1_3_nt_align8.cu
cutlass_tensorop_s16816gemm_f16_256x128x32_4x2x1_3_tn_align8.cu
cutlass_tensorop_s16816gemm_f16_256x128x32_4x2x1_3_tt_align8.cu
cutlass_tensorop_s1688bf16gemm_256x128x16_4x2x1_3_nn_align4.cu
cutlass_tensorop_s1688bf16gemm_256x128x16_4x2x1_3_nt_align4.cu
cutlass_tensorop_s1688bf16gemm_256x128x16_4x2x1_3_tn_align4.cu
cutlass_tensorop_s1688bf16gemm_256x128x16_4x2x1_3_tt_align4.cu
cutlass_tensorop_s1688f16gemm_256x128x16_4x2x1_3_nn_align4.cu
cutlass_tensorop_s1688f16gemm_256x128x16_4x2x1_3_nt_align4.cu
cutlass_tensorop_s1688f16gemm_256x128x16_4x2x1_3_tn_align4.cu
cutlass_tensorop_s1688f16gemm_256x128x16_4x2x1_3_tt_align4.cu
cutlass_tensorop_s1688gemm_f16_256x128x32_4x2x1_2_nn_align8.cu
cutlass_tensorop_s1688gemm_f16_256x128x32_4x2x1_2_nt_align8.cu
cutlass_tensorop_s1688gemm_f16_256x128x32_4x2x1_2_tn_align8.cu
cutlass_tensorop_s1688gemm_f16_256x128x32_4x2x1_2_tt_align8.cu
cutlass_tensorop_s1688gemm_tf32_256x128x16_4x2x1_3_nn_align4.cu
cutlass_tensorop_s1688gemm_tf32_256x128x16_4x2x1_3_nt_align4.cu
cutlass_tensorop_s1688gemm_tf32_256x128x16_4x2x1_3_tn_align4.cu
cutlass_tensorop_s1688gemm_tf32_256x128x16_4x2x1_3_tt_align4.cu
cutlass_tensorop_s1688tf32gemm_256x128x16_4x2x1_3_nn_align4.cu
cutlass_tensorop_s1688tf32gemm_256x128x16_4x2x1_3_nt_align4.cu
cutlass_tensorop_s1688tf32gemm_256x128x16_4x2x1_3_tn_align4.cu
cutlass_tensorop_s1688tf32gemm_256x128x16_4x2x1_3_tt_align4.cu
cutlass_tensorop_s4_i16864gemm_s4_256x128x128_4x2x1_3_n64t64_align32.cu
cutlass_tensorop_s4_i16864gemm_s4_256x128x128_4x2x1_3_tn_align32.cu
cutlass_tensorop_s4_i8832gemm_s4_256x128x128_4x2x1_2_n64t64_align32.cu
cutlass_tensorop_s4_i8832gemm_s4_256x128x128_4x2x1_2_tn_align32.cu
cutlass_tensorop_s884gemm_f16_256x128x32_4x2x1_2_nn_align8.cu
cutlass_tensorop_s884gemm_f16_256x128x32_4x2x1_2_nt_align8.cu
cutlass_tensorop_s884gemm_f16_256x128x32_4x2x1_2_tn_align8.cu
cutlass_tensorop_s884gemm_f16_256x128x32_4x2x1_2_tt_align8.cu
cutlass_tensorop_s8_i16832gemm_s8_256x128x64_4x2x1_3_n32t32_align16.cu
cutlass_tensorop_s8_i16832gemm_s8_256x128x64_4x2x1_3_tn_align16.cu
cutlass_tensorop_s8_i8816gemm_s8_256x128x64_4x2x1_2_n32t32_align16.cu
cutlass_tensorop_s8_i8816gemm_s8_256x128x64_4x2x1_2_tn_align16.cu
cutlass_tensorop_tf32_s1688gemm_tf32_256x128x16_4x2x1_3_nn_align4.cu
cutlass_tensorop_tf32_s1688gemm_tf32_256x128x16_4x2x1_3_nt_align4.cu
cutlass_tensorop_tf32_s1688gemm_tf32_256x128x16_4x2x1_3_tn_align4.cu
cutlass_tensorop_tf32_s1688gemm_tf32_256x128x16_4x2x1_3_tt_align4.cu
cutlass_tensorop_u4_i16864gemm_u4_256x128x128_4x2x1_3_n64t64_align32.cu
cutlass_tensorop_u4_i16864gemm_u4_256x128x128_4x2x1_3_tn_align32.cu
cutlass_tensorop_u4_i8832gemm_u4_256x128x128_4x2x1_2_n64t64_align32.cu
cutlass_tensorop_u4_i8832gemm_u4_256x128x128_4x2x1_2_tn_align32.cu
cutlass_tensorop_u8_i16832gemm_u8_256x128x64_4x2x1_3_n32t32_align16.cu
cutlass_tensorop_u8_i16832gemm_u8_256x128x64_4x2x1_3_tn_align16.cu
cutlass_tensorop_u8_i8816gemm_u8_256x128x64_4x2x1_2_n32t32_align16.cu
cutlass_tensorop_u8_i8816gemm_u8_256x128x64_4x2x1_2_tn_align16.cu
cutlass_tensorop_z884gemm_128x64x8_4x2x1_3_cc_align1.cu
cutlass_tensorop_z884gemm_128x64x8_4x2x1_3_ch_align1.cu
cutlass_tensorop_z884gemm_128x64x8_4x2x1_3_cn_align1.cu
cutlass_tensorop_z884gemm_128x64x8_4x2x1_3_ct_align1.cu
cutlass_tensorop_z884gemm_128x64x8_4x2x1_3_hc_align1.cu
cutlass_tensorop_z884gemm_128x64x8_4x2x1_3_hh_align1.cu
cutlass_tensorop_z884gemm_128x64x8_4x2x1_3_hn_align1.cu
cutlass_tensorop_z884gemm_128x64x8_4x2x1_3_ht_align1.cu
cutlass_tensorop_z884gemm_128x64x8_4x2x1_3_nc_align1.cu
cutlass_tensorop_z884gemm_128x64x8_4x2x1_3_nh_align1.cu
cutlass_tensorop_z884gemm_128x64x8_4x2x1_3_nn_align1.cu
cutlass_tensorop_z884gemm_128x64x8_4x2x1_3_nt_align1.cu
cutlass_tensorop_z884gemm_128x64x8_4x2x1_3_tc_align1.cu
cutlass_tensorop_z884gemm_128x64x8_4x2x1_3_th_align1.cu
cutlass_tensorop_z884gemm_128x64x8_4x2x1_3_tn_align1.cu
cutlass_tensorop_z884gemm_128x64x8_4x2x1_3_tt_align1.cu
Filenames in generated/conv2d
cutlass_simt_cf32_cdgrad_analytic_cf32_128x128_8x2_nhwc.cu
cutlass_simt_cf32_cdgrad_analytic_cf32_128x128_8x5_nhwc.cu
cutlass_simt_cf32_cdgrad_optimized_cf32_128x128_8x2_nhwc_unity_stride.cu
cutlass_simt_cf32_cdgrad_optimized_cf32_128x128_8x5_nhwc_unity_stride.cu
cutlass_simt_cf32_cfprop_analytic_cf32_128x128_8x2_nhwc.cu
cutlass_simt_cf32_cfprop_analytic_cf32_128x128_8x5_nhwc.cu
cutlass_simt_cf32_cfprop_optimized_cf32_128x128_8x2_nhwc.cu
cutlass_simt_cf32_cfprop_optimized_cf32_128x128_8x5_nhwc.cu
cutlass_simt_cf32_cwgrad_analytic_cf32_128x128_8x2_nhwc.cu
cutlass_simt_cf32_cwgrad_analytic_cf32_128x128_8x5_nhwc.cu
cutlass_simt_cf32_cwgrad_optimized_cf32_128x128_8x2_nhwc.cu
cutlass_simt_cf32_cwgrad_optimized_cf32_128x128_8x5_nhwc.cu
cutlass_simt_sdgrad_analytic_128x128_8x2_nhwc.cu
cutlass_simt_sdgrad_analytic_256x128_8x5_nhwc.cu
cutlass_simt_sdgrad_optimized_128x128_8x2_nhwc_unity_stride.cu
cutlass_simt_sdgrad_optimized_256x128_8x5_nhwc_unity_stride.cu
cutlass_simt_sfprop_analytic_128x128_8x2_nhwc.cu
cutlass_simt_sfprop_analytic_256x128_8x5_nhwc.cu
cutlass_simt_sfprop_optimized_128x128_8x2_nhwc.cu
cutlass_simt_sfprop_optimized_256x128_8x5_nhwc.cu
cutlass_simt_swgrad_analytic_128x128_8x2_nhwc.cu
cutlass_simt_swgrad_analytic_256x128_8x5_nhwc.cu
cutlass_simt_swgrad_optimized_128x128_8x2_nhwc.cu
cutlass_simt_swgrad_optimized_256x128_8x5_nhwc.cu
cutlass_tensorop_bf16_s16816dgrad_analytic_bf16_256x128_32x3_nhwc.cu
cutlass_tensorop_bf16_s16816dgrad_optimized_bf16_256x128_32x3_nhwc_unity_stride.cu
cutlass_tensorop_bf16_s16816fprop_analytic_bf16_256x128_32x3_nhwc.cu
cutlass_tensorop_bf16_s16816fprop_optimized_bf16_256x128_32x3_nhwc.cu
cutlass_tensorop_bf16_s16816wgrad_analytic_bf16_256x128_32x3_nhwc.cu
cutlass_tensorop_bf16_s16816wgrad_optimized_bf16_256x128_32x3_nhwc.cu
cutlass_tensorop_f16_s16816dgrad_analytic_f16_256x128_32x3_nhwc.cu
cutlass_tensorop_f16_s16816dgrad_optimized_f16_256x128_32x3_nhwc_unity_stride.cu
cutlass_tensorop_f16_s16816fprop_analytic_f16_256x128_32x3_nhwc.cu
cutlass_tensorop_f16_s16816fprop_optimized_f16_256x128_32x3_nhwc.cu
cutlass_tensorop_f16_s16816wgrad_analytic_f16_256x128_32x3_nhwc.cu
cutlass_tensorop_f16_s16816wgrad_optimized_f16_256x128_32x3_nhwc.cu
cutlass_tensorop_f16_s1688dgrad_analytic_f16_256x128_32x2_nhwc.cu
cutlass_tensorop_f16_s1688dgrad_optimized_f16_256x128_32x2_nhwc_unity_stride.cu
cutlass_tensorop_f16_s1688fprop_analytic_f16_256x128_32x2_nhwc.cu
cutlass_tensorop_f16_s1688fprop_optimized_f16_256x128_32x2_nhwc.cu
cutlass_tensorop_f16_s1688wgrad_analytic_f16_256x128_32x2_nhwc.cu
cutlass_tensorop_f16_s1688wgrad_optimized_f16_256x128_32x2_nhwc.cu
cutlass_tensorop_f16_s884dgrad_analytic_f16_256x128_32x2_nhwc.cu
cutlass_tensorop_f16_s884dgrad_optimized_f16_256x128_32x2_nhwc_unity_stride.cu
cutlass_tensorop_f16_s884fprop_analytic_f16_256x128_32x2_nhwc.cu
cutlass_tensorop_f16_s884fprop_optimized_f16_256x128_32x2_nhwc.cu
cutlass_tensorop_f16_s884wgrad_analytic_f16_256x128_32x2_nhwc.cu
cutlass_tensorop_f16_s884wgrad_optimized_f16_256x128_32x2_nhwc.cu
cutlass_tensorop_h16816dgrad_analytic_256x128_32x3_nhwc.cu
cutlass_tensorop_h16816dgrad_optimized_256x128_32x3_nhwc_unity_stride.cu
cutlass_tensorop_h16816fprop_analytic_256x128_32x3_nhwc.cu
cutlass_tensorop_h16816fprop_optimized_256x128_32x3_nhwc.cu
cutlass_tensorop_h16816wgrad_analytic_256x128_32x3_nhwc.cu
cutlass_tensorop_h16816wgrad_optimized_256x128_32x3_nhwc.cu
cutlass_tensorop_h1688dgrad_analytic_256x128_32x2_nhwc.cu
cutlass_tensorop_h1688dgrad_optimized_256x128_32x2_nhwc_unity_stride.cu
cutlass_tensorop_h1688fprop_analytic_256x128_32x2_nhwc.cu
cutlass_tensorop_h1688fprop_optimized_256x128_32x2_nhwc.cu
cutlass_tensorop_h1688wgrad_analytic_256x128_32x2_nhwc.cu
cutlass_tensorop_h1688wgrad_optimized_256x128_32x2_nhwc.cu
cutlass_tensorop_h884dgrad_analytic_256x128_32x2_nhwc.cu
cutlass_tensorop_h884dgrad_optimized_256x128_32x2_nhwc_unity_stride.cu
cutlass_tensorop_h884fprop_analytic_256x128_32x2_nhwc.cu
cutlass_tensorop_h884fprop_optimized_256x128_32x2_nhwc.cu
cutlass_tensorop_h884wgrad_analytic_256x128_32x2_nhwc.cu
cutlass_tensorop_h884wgrad_optimized_256x128_32x2_nhwc.cu
cutlass_tensorop_i16832fprop_analytic_s8_256x128_64x3_nhwc.cu
cutlass_tensorop_i16832fprop_analytic_u8_256x128_64x3_nhwc.cu
cutlass_tensorop_i16832fprop_optimized_s8_256x128_64x3_nhwc.cu
cutlass_tensorop_i16832fprop_optimized_u8_256x128_64x3_nhwc.cu
cutlass_tensorop_i16864fprop_analytic_s4_256x128_128x3_nhwc.cu
cutlass_tensorop_i16864fprop_analytic_u4_256x128_128x3_nhwc.cu
cutlass_tensorop_i16864fprop_optimized_s4_256x128_128x3_nhwc.cu
cutlass_tensorop_i16864fprop_optimized_u4_256x128_128x3_nhwc.cu
cutlass_tensorop_i8816fprop_analytic_s8_256x128_64x2_nhwc.cu
cutlass_tensorop_i8816fprop_analytic_u8_256x128_64x2_nhwc.cu
cutlass_tensorop_i8816fprop_optimized_s8_256x128_64x2_nhwc.cu
cutlass_tensorop_i8816fprop_optimized_u8_256x128_64x2_nhwc.cu
cutlass_tensorop_i8832fprop_analytic_s4_256x128_128x2_nhwc.cu
cutlass_tensorop_i8832fprop_analytic_u4_256x128_128x2_nhwc.cu
cutlass_tensorop_i8832fprop_optimized_s4_256x128_128x2_nhwc.cu
cutlass_tensorop_i8832fprop_optimized_u4_256x128_128x2_nhwc.cu
cutlass_tensorop_s16816dgrad_analytic_bf16_256x128_32x3_nhwc.cu
cutlass_tensorop_s16816dgrad_analytic_f16_256x128_32x3_nhwc.cu
cutlass_tensorop_s16816dgrad_optimized_bf16_256x128_32x3_nhwc_unity_stride.cu
cutlass_tensorop_s16816dgrad_optimized_f16_256x128_32x3_nhwc_unity_stride.cu
cutlass_tensorop_s16816fprop_analytic_bf16_256x128_32x3_nhwc.cu
cutlass_tensorop_s16816fprop_analytic_f16_256x128_32x3_nhwc.cu
cutlass_tensorop_s16816fprop_optimized_bf16_256x128_32x3_nhwc.cu
cutlass_tensorop_s16816fprop_optimized_f16_256x128_32x3_nhwc.cu
cutlass_tensorop_s16816wgrad_analytic_bf16_256x128_32x3_nhwc.cu
cutlass_tensorop_s16816wgrad_analytic_f16_256x128_32x3_nhwc.cu
cutlass_tensorop_s16816wgrad_optimized_bf16_256x128_32x3_nhwc.cu
cutlass_tensorop_s16816wgrad_optimized_f16_256x128_32x3_nhwc.cu
cutlass_tensorop_s1688bf16dgrad_analytic_256x128_16x3_nhwc.cu
cutlass_tensorop_s1688bf16dgrad_optimized_256x128_16x3_nhwc_unity_stride.cu
cutlass_tensorop_s1688bf16fprop_analytic_256x128_16x3_nhwc.cu
cutlass_tensorop_s1688bf16fprop_optimized_256x128_16x3_nhwc.cu
cutlass_tensorop_s1688bf16wgrad_analytic_256x128_16x3_nhwc.cu
cutlass_tensorop_s1688bf16wgrad_optimized_256x128_16x3_nhwc.cu
cutlass_tensorop_s1688dgrad_analytic_f16_256x128_32x2_nhwc.cu
cutlass_tensorop_s1688dgrad_analytic_tf32_256x128_16x3_nhwc.cu
cutlass_tensorop_s1688dgrad_optimized_f16_256x128_32x2_nhwc_unity_stride.cu
cutlass_tensorop_s1688dgrad_optimized_tf32_256x128_16x3_nhwc_unity_stride.cu
cutlass_tensorop_s1688f16dgrad_analytic_256x128_16x3_nhwc.cu
cutlass_tensorop_s1688f16dgrad_optimized_256x128_16x3_nhwc_unity_stride.cu
cutlass_tensorop_s1688f16fprop_analytic_256x128_16x3_nhwc.cu
cutlass_tensorop_s1688f16fprop_optimized_256x128_16x3_nhwc.cu
cutlass_tensorop_s1688f16wgrad_analytic_256x128_16x3_nhwc.cu
cutlass_tensorop_s1688f16wgrad_optimized_256x128_16x3_nhwc.cu
cutlass_tensorop_s1688fprop_analytic_f16_256x128_32x2_nhwc.cu
cutlass_tensorop_s1688fprop_analytic_tf32_256x128_16x3_nhwc.cu
cutlass_tensorop_s1688fprop_optimized_f16_256x128_32x2_nhwc.cu
cutlass_tensorop_s1688fprop_optimized_tf32_256x128_16x3_nhwc.cu
cutlass_tensorop_s1688tf32dgrad_analytic_256x128_16x3_nhwc.cu
cutlass_tensorop_s1688tf32dgrad_optimized_256x128_16x3_nhwc_unity_stride.cu
cutlass_tensorop_s1688tf32fprop_analytic_256x128_16x3_nhwc.cu
cutlass_tensorop_s1688tf32fprop_optimized_256x128_16x3_nhwc.cu
cutlass_tensorop_s1688tf32wgrad_analytic_256x128_16x3_nhwc.cu
cutlass_tensorop_s1688tf32wgrad_optimized_256x128_16x3_nhwc.cu
cutlass_tensorop_s1688wgrad_analytic_f16_256x128_32x2_nhwc.cu
cutlass_tensorop_s1688wgrad_analytic_tf32_256x128_16x3_nhwc.cu
cutlass_tensorop_s1688wgrad_optimized_f16_256x128_32x2_nhwc.cu
cutlass_tensorop_s1688wgrad_optimized_tf32_256x128_16x3_nhwc.cu
cutlass_tensorop_s4_i16864fprop_analytic_s4_256x128_128x3_nc64hw64.cu
cutlass_tensorop_s4_i16864fprop_analytic_s4_256x128_128x3_nhwc.cu
cutlass_tensorop_s4_i16864fprop_optimized_s4_256x128_128x3_nc64hw64.cu
cutlass_tensorop_s4_i16864fprop_optimized_s4_256x128_128x3_nhwc.cu
cutlass_tensorop_s4_i8832fprop_analytic_s4_256x128_128x2_nc64hw64.cu
cutlass_tensorop_s4_i8832fprop_analytic_s4_256x128_128x2_nhwc.cu
cutlass_tensorop_s4_i8832fprop_optimized_s4_256x128_128x2_nc64hw64.cu
cutlass_tensorop_s4_i8832fprop_optimized_s4_256x128_128x2_nhwc.cu
cutlass_tensorop_s884dgrad_analytic_f16_256x128_32x2_nhwc.cu
cutlass_tensorop_s884dgrad_optimized_f16_256x128_32x2_nhwc_unity_stride.cu
cutlass_tensorop_s884fprop_analytic_f16_256x128_32x2_nhwc.cu
cutlass_tensorop_s884fprop_optimized_f16_256x128_32x2_nhwc.cu
cutlass_tensorop_s884wgrad_analytic_f16_256x128_32x2_nhwc.cu
cutlass_tensorop_s884wgrad_optimized_f16_256x128_32x2_nhwc.cu
cutlass_tensorop_s8_i16832fprop_analytic_s8_256x128_64x3_nc32hw32.cu
cutlass_tensorop_s8_i16832fprop_analytic_s8_256x128_64x3_nhwc.cu
cutlass_tensorop_s8_i16832fprop_optimized_s8_256x128_64x3_nc32hw32.cu
cutlass_tensorop_s8_i16832fprop_optimized_s8_256x128_64x3_nhwc.cu
cutlass_tensorop_s8_i8816fprop_analytic_s8_256x128_64x2_nc32hw32.cu
cutlass_tensorop_s8_i8816fprop_analytic_s8_256x128_64x2_nhwc.cu
cutlass_tensorop_s8_i8816fprop_optimized_s8_256x128_64x2_nc32hw32.cu
cutlass_tensorop_s8_i8816fprop_optimized_s8_256x128_64x2_nhwc.cu
cutlass_tensorop_tf32_s1688dgrad_analytic_tf32_256x128_16x3_nhwc.cu
cutlass_tensorop_tf32_s1688dgrad_optimized_tf32_256x128_16x3_nhwc_unity_stride.cu
cutlass_tensorop_tf32_s1688fprop_analytic_tf32_256x128_16x3_nhwc.cu
cutlass_tensorop_tf32_s1688fprop_optimized_tf32_256x128_16x3_nhwc.cu
cutlass_tensorop_tf32_s1688wgrad_analytic_tf32_256x128_16x3_nhwc.cu
cutlass_tensorop_tf32_s1688wgrad_optimized_tf32_256x128_16x3_nhwc.cu
cutlass_tensorop_u4_i16864fprop_analytic_u4_256x128_128x3_nc64hw64.cu
cutlass_tensorop_u4_i16864fprop_analytic_u4_256x128_128x3_nhwc.cu
cutlass_tensorop_u4_i16864fprop_optimized_u4_256x128_128x3_nc64hw64.cu
cutlass_tensorop_u4_i16864fprop_optimized_u4_256x128_128x3_nhwc.cu
cutlass_tensorop_u4_i8832fprop_analytic_u4_256x128_128x2_nc64hw64.cu
cutlass_tensorop_u4_i8832fprop_analytic_u4_256x128_128x2_nhwc.cu
cutlass_tensorop_u4_i8832fprop_optimized_u4_256x128_128x2_nc64hw64.cu
cutlass_tensorop_u4_i8832fprop_optimized_u4_256x128_128x2_nhwc.cu
cutlass_tensorop_u8_i16832fprop_analytic_u8_256x128_64x3_nc32hw32.cu
cutlass_tensorop_u8_i16832fprop_analytic_u8_256x128_64x3_nhwc.cu
cutlass_tensorop_u8_i16832fprop_optimized_u8_256x128_64x3_nc32hw32.cu
cutlass_tensorop_u8_i16832fprop_optimized_u8_256x128_64x3_nhwc.cu
cutlass_tensorop_u8_i8816fprop_analytic_u8_256x128_64x2_nc32hw32.cu
cutlass_tensorop_u8_i8816fprop_analytic_u8_256x128_64x2_nhwc.cu
cutlass_tensorop_u8_i8816fprop_optimized_u8_256x128_64x2_nc32hw32.cu
cutlass_tensorop_u8_i8816fprop_optimized_u8_256x128_64x2_nhwc.cu
Filenames in generated/conv3d
cutlass_tensorop_bf16_s16816dgrad3d_analytic_bf16_256x128_32x3.cu
cutlass_tensorop_bf16_s16816fprop3d_analytic_bf16_256x128_32x3.cu
cutlass_tensorop_bf16_s16816wgrad3d_analytic_bf16_256x128_32x3.cu
cutlass_tensorop_bf16_s16816wgrad3d_optimized_bf16_256x128_32x3.cu
cutlass_tensorop_f16_s16816dgrad3d_analytic_f16_256x128_32x3.cu
cutlass_tensorop_f16_s16816fprop3d_analytic_f16_256x128_32x3.cu
cutlass_tensorop_f16_s16816wgrad3d_analytic_f16_256x128_32x3.cu
cutlass_tensorop_f16_s16816wgrad3d_optimized_f16_256x128_32x3.cu
cutlass_tensorop_h16816dgrad3d_analytic_256x128_32x3.cu
cutlass_tensorop_h16816fprop3d_analytic_256x128_32x3.cu
cutlass_tensorop_h16816wgrad3d_analytic_256x128_32x3.cu
cutlass_tensorop_h16816wgrad3d_optimized_256x128_32x3.cu
cutlass_tensorop_s16816dgrad3d_analytic_bf16_256x128_32x3.cu
cutlass_tensorop_s16816dgrad3d_analytic_f16_256x128_32x3.cu
cutlass_tensorop_s16816fprop3d_analytic_bf16_256x128_32x3.cu
cutlass_tensorop_s16816fprop3d_analytic_f16_256x128_32x3.cu
cutlass_tensorop_s16816wgrad3d_analytic_bf16_256x128_32x3.cu
cutlass_tensorop_s16816wgrad3d_analytic_f16_256x128_32x3.cu
cutlass_tensorop_s16816wgrad3d_optimized_bf16_256x128_32x3.cu
cutlass_tensorop_s16816wgrad3d_optimized_f16_256x128_32x3.cu
The conv kernel names are still the old ones. Did you intend to keep the conv kernel names unchanged?
The conv kernel names are still the old ones. Did you intend to keep the conv kernel names unchanged?
Oh, my mistake :sweat_smile:
I just updated this commit. Please take a look again : )
Filenames generated for conv2d:
cutlass_simt_cf32_cdgrad_analytic_cf32_128x128x8_4x2x1_2_nhwc.cu
cutlass_simt_cf32_cdgrad_analytic_cf32_128x128x8_4x2x1_5_nhwc.cu
cutlass_simt_cf32_cdgrad_optimized_cf32_128x128x8_4x2x1_2_nhwc_unity_stride.cu
cutlass_simt_cf32_cdgrad_optimized_cf32_128x128x8_4x2x1_5_nhwc_unity_stride.cu
cutlass_simt_cf32_cfprop_analytic_cf32_128x128x8_4x2x1_2_nhwc.cu
cutlass_simt_cf32_cfprop_analytic_cf32_128x128x8_4x2x1_5_nhwc.cu
cutlass_simt_cf32_cfprop_optimized_cf32_128x128x8_4x2x1_2_nhwc.cu
cutlass_simt_cf32_cfprop_optimized_cf32_128x128x8_4x2x1_5_nhwc.cu
cutlass_simt_cf32_cwgrad_analytic_cf32_128x128x8_4x2x1_2_nhwc.cu
cutlass_simt_cf32_cwgrad_analytic_cf32_128x128x8_4x2x1_5_nhwc.cu
cutlass_simt_cf32_cwgrad_optimized_cf32_128x128x8_4x2x1_2_nhwc.cu
cutlass_simt_cf32_cwgrad_optimized_cf32_128x128x8_4x2x1_5_nhwc.cu
cutlass_simt_sdgrad_analytic_128x128x8_4x2x1_2_nhwc.cu
cutlass_simt_sdgrad_analytic_256x128x8_4x2x1_5_nhwc.cu
cutlass_simt_sdgrad_optimized_128x128x8_4x2x1_2_nhwc_unity_stride.cu
cutlass_simt_sdgrad_optimized_256x128x8_4x2x1_5_nhwc_unity_stride.cu
cutlass_simt_sfprop_analytic_128x128x8_4x2x1_2_nhwc.cu
cutlass_simt_sfprop_analytic_256x128x8_4x2x1_5_nhwc.cu
cutlass_simt_sfprop_optimized_128x128x8_4x2x1_2_nhwc.cu
cutlass_simt_sfprop_optimized_256x128x8_4x2x1_5_nhwc.cu
cutlass_simt_swgrad_analytic_128x128x8_4x2x1_2_nhwc.cu
cutlass_simt_swgrad_analytic_256x128x8_4x2x1_5_nhwc.cu
cutlass_simt_swgrad_optimized_128x128x8_4x2x1_2_nhwc.cu
cutlass_simt_swgrad_optimized_256x128x8_4x2x1_5_nhwc.cu
cutlass_tensorop_bf16_s16816dgrad_analytic_bf16_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_bf16_s16816dgrad_optimized_bf16_256x128x32_4x2x1_3_nhwc_unity_stride.cu
cutlass_tensorop_bf16_s16816fprop_analytic_bf16_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_bf16_s16816fprop_optimized_bf16_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_bf16_s16816wgrad_analytic_bf16_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_bf16_s16816wgrad_optimized_bf16_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_f16_s16816dgrad_analytic_f16_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_f16_s16816dgrad_optimized_f16_256x128x32_4x2x1_3_nhwc_unity_stride.cu
cutlass_tensorop_f16_s16816fprop_analytic_f16_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_f16_s16816fprop_optimized_f16_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_f16_s16816wgrad_analytic_f16_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_f16_s16816wgrad_optimized_f16_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_f16_s1688dgrad_analytic_f16_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_f16_s1688dgrad_optimized_f16_256x128x32_4x2x1_2_nhwc_unity_stride.cu
cutlass_tensorop_f16_s1688fprop_analytic_f16_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_f16_s1688fprop_optimized_f16_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_f16_s1688wgrad_analytic_f16_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_f16_s1688wgrad_optimized_f16_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_f16_s884dgrad_analytic_f16_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_f16_s884dgrad_optimized_f16_256x128x32_4x2x1_2_nhwc_unity_stride.cu
cutlass_tensorop_f16_s884fprop_analytic_f16_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_f16_s884fprop_optimized_f16_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_f16_s884wgrad_analytic_f16_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_f16_s884wgrad_optimized_f16_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_h16816dgrad_analytic_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_h16816dgrad_optimized_256x128x32_4x2x1_3_nhwc_unity_stride.cu
cutlass_tensorop_h16816fprop_analytic_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_h16816fprop_optimized_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_h16816wgrad_analytic_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_h16816wgrad_optimized_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_h1688dgrad_analytic_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_h1688dgrad_optimized_256x128x32_4x2x1_2_nhwc_unity_stride.cu
cutlass_tensorop_h1688fprop_analytic_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_h1688fprop_optimized_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_h1688wgrad_analytic_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_h1688wgrad_optimized_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_h884dgrad_analytic_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_h884dgrad_optimized_256x128x32_4x2x1_2_nhwc_unity_stride.cu
cutlass_tensorop_h884fprop_analytic_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_h884fprop_optimized_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_h884wgrad_analytic_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_h884wgrad_optimized_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_i16832fprop_analytic_s8_256x128x64_4x2x1_3_nhwc.cu
cutlass_tensorop_i16832fprop_analytic_u8_256x128x64_4x2x1_3_nhwc.cu
cutlass_tensorop_i16832fprop_optimized_s8_256x128x64_4x2x1_3_nhwc.cu
cutlass_tensorop_i16832fprop_optimized_u8_256x128x64_4x2x1_3_nhwc.cu
cutlass_tensorop_i16864fprop_analytic_s4_256x128x128_4x2x1_3_nhwc.cu
cutlass_tensorop_i16864fprop_analytic_u4_256x128x128_4x2x1_3_nhwc.cu
cutlass_tensorop_i16864fprop_optimized_s4_256x128x128_4x2x1_3_nhwc.cu
cutlass_tensorop_i16864fprop_optimized_u4_256x128x128_4x2x1_3_nhwc.cu
cutlass_tensorop_i8816fprop_analytic_s8_256x128x64_4x2x1_2_nhwc.cu
cutlass_tensorop_i8816fprop_analytic_u8_256x128x64_4x2x1_2_nhwc.cu
cutlass_tensorop_i8816fprop_optimized_s8_256x128x64_4x2x1_2_nhwc.cu
cutlass_tensorop_i8816fprop_optimized_u8_256x128x64_4x2x1_2_nhwc.cu
cutlass_tensorop_i8832fprop_analytic_s4_256x128x128_4x2x1_2_nhwc.cu
cutlass_tensorop_i8832fprop_analytic_u4_256x128x128_4x2x1_2_nhwc.cu
cutlass_tensorop_i8832fprop_optimized_s4_256x128x128_4x2x1_2_nhwc.cu
cutlass_tensorop_i8832fprop_optimized_u4_256x128x128_4x2x1_2_nhwc.cu
cutlass_tensorop_s16816dgrad_analytic_bf16_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_s16816dgrad_analytic_f16_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_s16816dgrad_optimized_bf16_256x128x32_4x2x1_3_nhwc_unity_stride.cu
cutlass_tensorop_s16816dgrad_optimized_f16_256x128x32_4x2x1_3_nhwc_unity_stride.cu
cutlass_tensorop_s16816fprop_analytic_bf16_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_s16816fprop_analytic_f16_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_s16816fprop_optimized_bf16_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_s16816fprop_optimized_f16_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_s16816wgrad_analytic_bf16_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_s16816wgrad_analytic_f16_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_s16816wgrad_optimized_bf16_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_s16816wgrad_optimized_f16_256x128x32_4x2x1_3_nhwc.cu
cutlass_tensorop_s1688bf16dgrad_analytic_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_s1688bf16dgrad_optimized_256x128x16_4x2x1_3_nhwc_unity_stride.cu
cutlass_tensorop_s1688bf16fprop_analytic_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_s1688bf16fprop_optimized_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_s1688bf16wgrad_analytic_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_s1688bf16wgrad_optimized_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_s1688dgrad_analytic_f16_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_s1688dgrad_analytic_tf32_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_s1688dgrad_optimized_f16_256x128x32_4x2x1_2_nhwc_unity_stride.cu
cutlass_tensorop_s1688dgrad_optimized_tf32_256x128x16_4x2x1_3_nhwc_unity_stride.cu
cutlass_tensorop_s1688f16dgrad_analytic_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_s1688f16dgrad_optimized_256x128x16_4x2x1_3_nhwc_unity_stride.cu
cutlass_tensorop_s1688f16fprop_analytic_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_s1688f16fprop_optimized_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_s1688f16wgrad_analytic_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_s1688f16wgrad_optimized_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_s1688fprop_analytic_f16_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_s1688fprop_analytic_tf32_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_s1688fprop_optimized_f16_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_s1688fprop_optimized_tf32_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_s1688tf32dgrad_analytic_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_s1688tf32dgrad_optimized_256x128x16_4x2x1_3_nhwc_unity_stride.cu
cutlass_tensorop_s1688tf32fprop_analytic_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_s1688tf32fprop_optimized_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_s1688tf32wgrad_analytic_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_s1688tf32wgrad_optimized_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_s1688wgrad_analytic_f16_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_s1688wgrad_analytic_tf32_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_s1688wgrad_optimized_f16_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_s1688wgrad_optimized_tf32_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_s4_i16864fprop_analytic_s4_256x128x128_4x2x1_3_nc64hw64.cu
cutlass_tensorop_s4_i16864fprop_analytic_s4_256x128x128_4x2x1_3_nhwc.cu
cutlass_tensorop_s4_i16864fprop_optimized_s4_256x128x128_4x2x1_3_nc64hw64.cu
cutlass_tensorop_s4_i16864fprop_optimized_s4_256x128x128_4x2x1_3_nhwc.cu
cutlass_tensorop_s4_i8832fprop_analytic_s4_256x128x128_4x2x1_2_nc64hw64.cu
cutlass_tensorop_s4_i8832fprop_analytic_s4_256x128x128_4x2x1_2_nhwc.cu
cutlass_tensorop_s4_i8832fprop_optimized_s4_256x128x128_4x2x1_2_nc64hw64.cu
cutlass_tensorop_s4_i8832fprop_optimized_s4_256x128x128_4x2x1_2_nhwc.cu
cutlass_tensorop_s884dgrad_analytic_f16_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_s884dgrad_optimized_f16_256x128x32_4x2x1_2_nhwc_unity_stride.cu
cutlass_tensorop_s884fprop_analytic_f16_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_s884fprop_optimized_f16_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_s884wgrad_analytic_f16_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_s884wgrad_optimized_f16_256x128x32_4x2x1_2_nhwc.cu
cutlass_tensorop_s8_i16832fprop_analytic_s8_256x128x64_4x2x1_3_nc32hw32.cu
cutlass_tensorop_s8_i16832fprop_analytic_s8_256x128x64_4x2x1_3_nhwc.cu
cutlass_tensorop_s8_i16832fprop_optimized_s8_256x128x64_4x2x1_3_nc32hw32.cu
cutlass_tensorop_s8_i16832fprop_optimized_s8_256x128x64_4x2x1_3_nhwc.cu
cutlass_tensorop_s8_i8816fprop_analytic_s8_256x128x64_4x2x1_2_nc32hw32.cu
cutlass_tensorop_s8_i8816fprop_analytic_s8_256x128x64_4x2x1_2_nhwc.cu
cutlass_tensorop_s8_i8816fprop_optimized_s8_256x128x64_4x2x1_2_nc32hw32.cu
cutlass_tensorop_s8_i8816fprop_optimized_s8_256x128x64_4x2x1_2_nhwc.cu
cutlass_tensorop_tf32_s1688dgrad_analytic_tf32_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_tf32_s1688dgrad_optimized_tf32_256x128x16_4x2x1_3_nhwc_unity_stride.cu
cutlass_tensorop_tf32_s1688fprop_analytic_tf32_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_tf32_s1688fprop_optimized_tf32_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_tf32_s1688wgrad_analytic_tf32_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_tf32_s1688wgrad_optimized_tf32_256x128x16_4x2x1_3_nhwc.cu
cutlass_tensorop_u4_i16864fprop_analytic_u4_256x128x128_4x2x1_3_nc64hw64.cu
cutlass_tensorop_u4_i16864fprop_analytic_u4_256x128x128_4x2x1_3_nhwc.cu
cutlass_tensorop_u4_i16864fprop_optimized_u4_256x128x128_4x2x1_3_nc64hw64.cu
cutlass_tensorop_u4_i16864fprop_optimized_u4_256x128x128_4x2x1_3_nhwc.cu
cutlass_tensorop_u4_i8832fprop_analytic_u4_256x128x128_4x2x1_2_nc64hw64.cu
cutlass_tensorop_u4_i8832fprop_analytic_u4_256x128x128_4x2x1_2_nhwc.cu
cutlass_tensorop_u4_i8832fprop_optimized_u4_256x128x128_4x2x1_2_nc64hw64.cu
cutlass_tensorop_u4_i8832fprop_optimized_u4_256x128x128_4x2x1_2_nhwc.cu
cutlass_tensorop_u8_i16832fprop_analytic_u8_256x128x64_4x2x1_3_nc32hw32.cu
cutlass_tensorop_u8_i16832fprop_analytic_u8_256x128x64_4x2x1_3_nhwc.cu
cutlass_tensorop_u8_i16832fprop_optimized_u8_256x128x64_4x2x1_3_nc32hw32.cu
cutlass_tensorop_u8_i16832fprop_optimized_u8_256x128x64_4x2x1_3_nhwc.cu
cutlass_tensorop_u8_i8816fprop_analytic_u8_256x128x64_4x2x1_2_nc32hw32.cu
cutlass_tensorop_u8_i8816fprop_analytic_u8_256x128x64_4x2x1_2_nhwc.cu
cutlass_tensorop_u8_i8816fprop_optimized_u8_256x128x64_4x2x1_2_nc32hw32.cu
cutlass_tensorop_u8_i8816fprop_optimized_u8_256x128x64_4x2x1_2_nhwc.cu
Filenames generated for conv3d:
cutlass_tensorop_bf16_s16816dgrad3d_analytic_bf16_256x128x32_4x2x1_3.cu
cutlass_tensorop_bf16_s16816fprop3d_analytic_bf16_256x128x32_4x2x1_3.cu
cutlass_tensorop_bf16_s16816wgrad3d_analytic_bf16_256x128x32_4x2x1_3.cu
cutlass_tensorop_bf16_s16816wgrad3d_optimized_bf16_256x128x32_4x2x1_3.cu
cutlass_tensorop_f16_s16816dgrad3d_analytic_f16_256x128x32_4x2x1_3.cu
cutlass_tensorop_f16_s16816fprop3d_analytic_f16_256x128x32_4x2x1_3.cu
cutlass_tensorop_f16_s16816wgrad3d_analytic_f16_256x128x32_4x2x1_3.cu
cutlass_tensorop_f16_s16816wgrad3d_optimized_f16_256x128x32_4x2x1_3.cu
cutlass_tensorop_h16816dgrad3d_analytic_256x128x32_4x2x1_3.cu
cutlass_tensorop_h16816fprop3d_analytic_256x128x32_4x2x1_3.cu
cutlass_tensorop_h16816wgrad3d_analytic_256x128x32_4x2x1_3.cu
cutlass_tensorop_h16816wgrad3d_optimized_256x128x32_4x2x1_3.cu
cutlass_tensorop_s16816dgrad3d_analytic_bf16_256x128x32_4x2x1_3.cu
cutlass_tensorop_s16816dgrad3d_analytic_f16_256x128x32_4x2x1_3.cu
cutlass_tensorop_s16816fprop3d_analytic_bf16_256x128x32_4x2x1_3.cu
cutlass_tensorop_s16816fprop3d_analytic_f16_256x128x32_4x2x1_3.cu
cutlass_tensorop_s16816wgrad3d_analytic_bf16_256x128x32_4x2x1_3.cu
cutlass_tensorop_s16816wgrad3d_analytic_f16_256x128x32_4x2x1_3.cu
cutlass_tensorop_s16816wgrad3d_optimized_bf16_256x128x32_4x2x1_3.cu
cutlass_tensorop_s16816wgrad3d_optimized_f16_256x128x32_4x2x1_3.cu
Thank you. I will ask the different stakeholders to review this. Since CUTLASS is used everywhere, it will take a while.
This PR has been labeled inactive-30d due to no recent activity in the past 30 days. Please close this PR if it is no longer required. Otherwise, please respond with a comment indicating any updates. This PR will be labeled inactive-90d if there is no activity in the next 60 days.
This PR has been labeled inactive-90d due to no recent activity in the past 90 days. Please close this PR if it is no longer required. Otherwise, please respond with a comment indicating any updates.