[BUG] cutlass.cute.nvgpu.common.OpError: OpError: expects arch to be one of ['sm_100a', 'sm_100f'], but got sm_121a
Which component has the problem?
CuTe DSL
Bug Report
Describe the bug
With nvidia-cutlass and nvidia-cutlass-dsl 4.2.0.0, running
python cutlass/examples/python/CuTeDSL/blackwell/tutorial_gemm/fp16_gemm_1.py
fails with:
nvidia_cutlass_dsl/python_packages/cutlass/cute/nvgpu/tcgen05/mma.py", line 153, in __post_init__
raise OpError(
cutlass.cute.nvgpu.common.OpError: OpError: expects arch to be one of ['sm_100a', 'sm_100f'], but got sm_121a
Error Code: MmaF16BF16Op error
Steps/Code to reproduce bug
See the command above.
Expected behavior
The example should compile and run.
Environment details (please complete the following information):
Additional context
DGX Spark (NVIDIA GB10).
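For reference, a reduced sketch of the construct that trips the arch check, taken from the tutorial's host_function visible in the traceback below. The dtype, instruction shape, and import spelling here are illustrative assumptions; on a GB10 the DSL detects sm_121a and MmaF16BF16Op rejects it.

import cutlass
import cutlass.cute as cute
from cutlass.cute.nvgpu import tcgen05

@cute.jit
def build_tiled_mma():
    # Constructing the tcgen05 MMA atom is where the arch check fires.
    op = tcgen05.MmaF16BF16Op(
        cutlass.Float16,             # A/B dtype
        cutlass.Float32,             # accumulator dtype
        (128, 256, 16),              # MMA instruction shape (M, N, K)
        tcgen05.CtaGroup.ONE,
        tcgen05.OperandSource.SMEM,
        tcgen05.OperandMajorMode.K,
        tcgen05.OperandMajorMode.K,
    )
    cute.make_tiled_mma(op)

build_tiled_mma()  # raises cutlass.cute.nvgpu.common.OpError on sm_121a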
+1
---------------------------------------------------------------------------
OpError Traceback (most recent call last)
Cell In[19], line 2
1 # Compile the kernel for the specific input types
----> 2 naive_kernel = cute.compile(host_function, a_tensor, b_tensor, c_tensor, kernel)
4 # Run the kernel
5 benchmark(naive_kernel, a_tensor, b_tensor, c_tensor)
File ~/miniconda3/envs/cutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/compiler.py:566, in CompileCallable.__call__(self, *args, **kwargs)
565 def __call__(self, *args, **kwargs):
--> 566 return self._compile(*args, **kwargs)
File ~/miniconda3/envs/cutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/compiler.py:640, in CompileCallable._compile(self, func, *args, **kwargs)
638 if hasattr(func, "_decorator_frame"):
639 kwargs["_decorator_frame"] = func._decorator_frame
--> 640 return func._dsl_object._func(fcn_ptr, *args, **kwargs)
File ~/miniconda3/envs/cutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/dsl.py:1555, in BaseDSL._func(self, funcBody, *args, **kwargs)
1553 # Generate MLIR Context and start generating IR
1554 log().debug(f"Generating MLIR for function '{function_name}'")
-> 1555 result = self.generate_mlir(
1556 funcBody,
1557 canonicalized_kwargs,
1558 function_name,
1559 gpu_module_attrs,
1560 canonicalized_args,
1561 args_spec,
1562 pipeline,
1563 no_cache,
1564 compile_only,
1565 frame=decorator_frame,
1566 )
1567 return result
File ~/miniconda3/envs/cutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/dsl.py:1361, in BaseDSL.generate_mlir(self, funcBody, kwargs, function_name, gpu_module_attrs, args, args_spec, pipeline, no_cache, compile_only, loc, frame)
1358 original_function_name = funcBody.__name__
1360 # Generate original ir module and its hash value.
-> 1361 module, module_hash, result = self.generate_original_ir(
1362 ir,
1363 func,
1364 funcBody,
1365 kwargs,
1366 function_name,
1367 func_types,
1368 gpu_module_attrs,
1369 args,
1370 args_spec,
1371 frame=frame,
1372 )
1374 # dryrun is used to only generate IR
1375 if self.envar.dryrun:
File ~/miniconda3/envs/cutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/dsl.py:1204, in BaseDSL.generate_original_ir(self, ir, func, funcBody, kwargs, function_name, func_types, gpu_module_attrs, args, args_spec, frame)
1202 # Build IR module
1203 profiler = timer(enable=self.envar.jit_time_profiling)
-> 1204 module, result = profiler(build_ir_module)()
1205 module_hash = self.get_module_hash(module, function_name)
1207 module = self.build_module(module, function_name)
File ~/miniconda3/envs/cutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/utils/timer.py:29, in timer.<locals>.decorator.<locals>.func_wrapper(*args, **kwargs)
26 @wraps(func)
27 def func_wrapper(*args, **kwargs):
28 if not enable:
---> 29 return func(*args, **kwargs)
30 from time import time
32 start = time()
File ~/miniconda3/envs/cutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/dsl.py:1186, in BaseDSL.generate_original_ir.<locals>.build_ir_module()
1184 # Call user function body
1185 try:
-> 1186 result = funcBody(*ir_args, **ir_kwargs)
1187 default_ret_values = self.generate_default_return_values(
1188 ir.InsertionPoint.current
1189 )
1190 func.ReturnOp(default_ret_values, loc=loc)
File ~/miniconda3/envs/cutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/dsl.py:255, in DSLCallable.__call__(self, *args, **kwargs)
254 def __call__(self, *args, **kwargs):
--> 255 ret = self.__func__(*args, **kwargs)
256 self.func = None
257 return ret
Cell In[16], line 9, in host_function(a, b, c, kernel)
1 @cute.jit
2 def host_function(
3 a: cute.Tensor,
(...) 7 ):
8 # Construct tiled MMA
----> 9 op = tcgen05.MmaF16BF16Op(
10 io_dtype,
11 acc_dtype,
12 mma_inst_shape_mnk,
13 tcgen05.CtaGroup.ONE,
14 tcgen05.OperandSource.SMEM,
15 tcgen05.OperandMajorMode.K,
16 tcgen05.OperandMajorMode.K,
17 )
18 tiled_mma = cute.make_tiled_mma(op)
20 # Construct SMEM layouts for A and B
File ~/miniconda3/envs/cutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/cute/nvgpu/tcgen05/mma.py:707, in MmaF16BF16Op.__init__(self, ab_dtype, acc_dtype, instruction_shape, cta_group, a_src, a_major_mode, b_major_mode)
697 def __init__(
698 self,
699 ab_dtype: Type[Numeric],
(...) 705 b_major_mode: OperandMajorMode,
706 ) -> None:
--> 707 super().__init__(
708 ab_dtype,
709 ab_dtype,
710 acc_dtype,
711 instruction_shape,
712 cta_group,
713 a_src,
714 a_major_mode,
715 b_major_mode,
716 )
717 self._verify()
File <string>:11, in __init__(self, a_dtype, b_dtype, acc_dtype, shape_mnk, cta_group, a_src, a_major_mode, b_major_mode)
File ~/miniconda3/envs/cutlass/lib/python3.12/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/cute/nvgpu/tcgen05/mma.py:167, in MmaOp.__post_init__(self)
165 arch = CuTeDSL._get_dsl().get_arch_enum()
166 if arch not in self.admissible_archs:
--> 167 raise OpError(
168 self,
169 f"expects arch to be one of {self.admissible_archs}, but got {arch}",
170 suggestion="Ensure env CUTE_DSL_ARCH matches your GPU architecture",
171 )
172 # Verify that the user provided enum values
173 if not isinstance(self.cta_group, CtaGroup):
OpError: OpError: expects arch to be one of [Arch.sm_100a, Arch.sm_100f, Arch.sm_101a, Arch.sm_101f, Arch.sm_110a, Arch.sm_110f], but got Arch.sm_121a
Error Code: MmaF16BF16Op error
💡 Suggestions:
Ensure env CUTE_DSL_ARCH matches your GPU architecture
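For what it's worth, applying that suggestion looks roughly like the sketch below. CUTE_DSL_ARCH is the env var named in the error message; that it is picked up at import/DSL-init time is an assumption. On a GB10, overriding it only moves the failure further down the pipeline, as shown in the next comment.

import os

# Override the auto-detected architecture before the DSL initializes (assumed).
os.environ["CUTE_DSL_ARCH"] = "sm_100a"

import cutlass.cute as cute  # noqa: E402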
After forcing CuTe to ignore the architecture checks (1 and 2), I hit the following MLIR issue, so it looks like tcgen05 is not supported on DGX Spark. Is there a plan and timeline for its support?
Found GPU0 NVIDIA GB10 which is of cuda capability 12.1.
Minimum and Maximum cuda capability supported by this version of PyTorch is
(8.0) - (12.0)
queued_call()
===================================================================
Running Blackwell fp16 GEMM example 0 with:
mnk: [8192, 8192, 8192]
tolerance: 0.1
===================================================================
Traceback (most recent call last):
File " .../lib/python3.10/site-packages/nvidia_cutlass_dsl/python_packages/cutlass/base_dsl/compiler.py", line 148, in compile
pm.run(module.operation)
cutlass._mlir._mlir_libs.MLIRError: Failure while executing pass pipeline:
error: unknown: Failed to generate PTX from the compilation unit) (error 9: NVVM_ERROR_COMPILATION), libNVVM extra log: error: tcgen05.alloc supported only on arch-conditional or family-conditional variants from SM100 onwards.
note: unknown: see current operation:
"gpu.module"() <{sym_name = "kernels", targets = [#nvvm.target<O = 3, chip = "sm_121a", flags = {"ptx-cmd-options" = []}>]}> ({
"llvm.mlir.global"() <{addr_space = 3 : i32, alignment = 1024 : i64, dso_local, global_type = !llvm.array<0 x i8>, linkage = #llvm.linkage<external>, sym_name = "__dynamic_shmem__0", visibility_ = 0 : i64}> ({
}) : () -> ()
"llvm.func"() <{CConv = #llvm.cconv<ccc>, arg_attrs = [{}, {llvm.align = 64 : i64, llvm.byval = !llvm.struct<(struct<(array<16 x i64>)>)>, nvvm.grid_constant}, {}, {llvm.align = 64 : i64, llvm.byval = !llvm.struct<(str
uct<(array<16 x i64>)>)>, nvvm.grid_constant}, {}, {}], function_type = !llvm.func<void (struct<(i1, i1, i1)>, ptr, struct<(struct<()>, struct<(struct<(i32, i32)>, struct<()>)>)>, ptr, struct<(struct<()>, struct<(struct<(i
32, i32)>, struct<()>)>)>, struct<(ptr<1>, struct<(struct<(i32, i32)>, i64)>)>)>, linkage = #llvm.linkage<external>, sym_name = "kernel_cutlass_kernel_TiledMMA_ThrLayoutVMNK11110000_PermutationMNK____MMAAtom_ThrID10_ShapeM
NK12825616_TVLayoutA1128161281128_TVLayoutB1256162561256_TVLayoutC11282561281128_CopyAtom_ThrI_0", visibility_ = 0 : i64}> ({
^bb0(%arg0: !llvm.struct<(i1, i1, i1)>, %arg1: !llvm.ptr, %arg2: !llvm.struct<(struct<()>, struct<(struct<(i32, i32)>, struct<()>)>)>, %arg3: !llvm.ptr, %arg4: !llvm.struct<(struct<()>, struct<(struct<(i32, i32)>, stru
ct<()>)>)>, %arg5: !llvm.struct<(ptr<1>, struct<(struct<(i32, i32)>, i64)>)>):
.
.
^bb119: // pred: ^bb118
%368 = "llvm.inttoptr"(%161) : (i32) -> !llvm.ptr<6>
"nvvm.tcgen05.dealloc"(%368, %22) <{group = #nvvm.tcgen05_group<cta_1>}> : (!llvm.ptr<6>, i32) -> ()
"llvm.br"()[^bb120] : () -> ()
^bb120: // 2 preds: ^bb118, ^bb119
"llvm.return"() : () -> ()
}) {cu_attrs = {max_dynamic_shared_size_bytes = #cuda.dev_max_shared_memory_optin, non_portable_cluster_size_allowed = 1 : i32}, gpu.kernel, nvvm.kernel, nvvm.reqntid = array<i32: 128, 1, 1>} : () -> ()
}) {compute_targets = [#cuda.compute_target<sass, conditional, [sm_121]>]} : () -> ()
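Until this is supported, a guard along the following lines can at least fail early with a clear message instead of deep inside PTX generation. This is a sketch: it assumes PyTorch is available and that the SM100/SM101/SM110 families listed in the admissible_archs error above are exactly the tcgen05-capable ones.

import torch

# GB10 reports compute capability (12, 1), i.e. sm_121.
major, minor = torch.cuda.get_device_capability(0)

# Per the OpError above, tcgen05 MMA ops accept only the SM100/SM101/SM110 families.
if major not in (10, 11):
    raise RuntimeError(
        f"sm_{major}{minor}: tcgen05-based CuTe DSL kernels are not supported on this GPU; "
        "use a non-tcgen05 GEMM path instead"
    )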
Any comment or update on this?