[gpu] 'func.func' op uses 401920 bytes of shared memory; exceeded the limit of 65536 bytes
What happened?
For the given IR:
module {
func.func @main_graph(%arg0: !torch.vtensor<[1,3,224,224],f32>, %arg1: !torch.vtensor<[1,288,56,56],f32>, %arg2: !torch.vtensor<[288,72,3,3],f32>, %arg3: !torch.vtensor<[288],f32>, %arg4: !torch.vtensor<[36,288,1,1],f32>, %arg5: !torch.vtensor<[36],f32>, %arg6: !torch.vtensor<[1,288,1,1],f32>, %arg7: !torch.vtensor<[288,36,1,1],f32>, %arg8: !torch.vtensor<[288],f32>, %arg9: !torch.vtensor<[288,288,1,1],f32>, %arg10 : !torch.vtensor<[288],f32>, %arg11: !torch.vtensor<[1,288,28,28],si8> ) -> !torch.vtensor<[1,288,28,28],f32> attributes {torch.onnx_meta.ir_version = 8 : si64, torch.onnx_meta.opset_version = 21 : si64, torch.onnx_meta.opset_versions = {ai.onnx.contrib = 1 : si64, ai.onnx.ml = 4 : si64, ai.onnx.preview.training = 1 : si64, ai.onnx.training = 1 : si64, com.microsoft = 1 : si64, com.microsoft.experimental = 1 : si64, com.microsoft.nchwc = 1 : si64, org.pytorch.aten = 1 : si64}, torch.onnx_meta.producer_name = "vai_q_onnx", torch.onnx_meta.producer_version = "1.17.0+43059a7"} {
%136 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0> : tensor<si8>} : () -> !torch.vtensor<[],si8>
%137 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<6.250000e-02> : tensor<f32>} : () -> !torch.vtensor<[],f32>
%1627 = torch.operator "onnx.DequantizeLinear"(%arg11, %137, %136) : (!torch.vtensor<[1,288,28,28],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,28,28],f32>
%1628 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<[2, 3]> : tensor<2xsi64>} : () -> !torch.vtensor<[2],si64>
%1629 = torch.operator "onnx.ReduceMean"(%1627, %1628) {torch.onnx.keepdims = 1 : si64} : (!torch.vtensor<[1,288,28,28],f32>, !torch.vtensor<[2],si64>) -> !torch.vtensor<[1,288,1,1],f32>
%1257 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<1.00488281> : tensor<f32>} : () -> !torch.vtensor<[],f32>
%1630 = torch.operator "onnx.Mul"(%1629, %1257) : (!torch.vtensor<[1,288,1,1],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[1,288,1,1],f32>
%138 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0> : tensor<si8>} : () -> !torch.vtensor<[],si8>
%139 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<1.562500e-02> : tensor<f32>} : () -> !torch.vtensor<[],f32>
%1631 = torch.operator "onnx.QuantizeLinear"(%1630, %139, %138) : (!torch.vtensor<[1,288,1,1],f32>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],si8>
%1632 = torch.operator "onnx.DequantizeLinear"(%1631, %139, %138) : (!torch.vtensor<[1,288,1,1],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],f32>
%1633 = torch.operator "onnx.Conv"(%1632, %arg4, %arg5) {torch.onnx.dilations = [1 : si64, 1 : si64], torch.onnx.group = 1 : si64, torch.onnx.kernel_shape = [1 : si64, 1 : si64], torch.onnx.pads = [0 : si64, 0 : si64, 0 : si64, 0 : si64], torch.onnx.strides = [1 : si64, 1 : si64]} : (!torch.vtensor<[1,288,1,1],f32>, !torch.vtensor<[36,288,1,1],f32>, !torch.vtensor<[36],f32>) -> !torch.vtensor<[1,36,1,1],f32>
%1634 = torch.operator "onnx.Relu"(%1633) : (!torch.vtensor<[1,36,1,1],f32>) -> !torch.vtensor<[1,36,1,1],f32>
%146 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0> : tensor<si8>} : () -> !torch.vtensor<[],si8>
%147 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<3.125000e-02> : tensor<f32>} : () -> !torch.vtensor<[],f32>
%1635 = torch.operator "onnx.QuantizeLinear"(%1634, %147, %146) : (!torch.vtensor<[1,36,1,1],f32>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,36,1,1],si8>
%1636 = torch.operator "onnx.DequantizeLinear"(%1635, %147, %146) : (!torch.vtensor<[1,36,1,1],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,36,1,1],f32>
%1637 = torch.operator "onnx.Conv"(%1636, %arg7, %arg8) {torch.onnx.dilations = [1 : si64, 1 : si64], torch.onnx.group = 1 : si64, torch.onnx.kernel_shape = [1 : si64, 1 : si64], torch.onnx.pads = [0 : si64, 0 : si64, 0 : si64, 0 : si64], torch.onnx.strides = [1 : si64, 1 : si64]} : (!torch.vtensor<[1,36,1,1],f32>, !torch.vtensor<[288,36,1,1],f32>, !torch.vtensor<[288],f32>) -> !torch.vtensor<[1,288,1,1],f32>
%148 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0> : tensor<si8>} : () -> !torch.vtensor<[],si8>
%149 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<3.125000e-02> : tensor<f32>} : () -> !torch.vtensor<[],f32>
%1638 = torch.operator "onnx.QuantizeLinear"(%1637, %149, %148) : (!torch.vtensor<[1,288,1,1],f32>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],si8>
%1639 = torch.operator "onnx.DequantizeLinear"(%1638, %149, %148) : (!torch.vtensor<[1,288,1,1],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],f32>
%1640 = torch.operator "onnx.Sigmoid"(%1639) : (!torch.vtensor<[1,288,1,1],f32>) -> !torch.vtensor<[1,288,1,1],f32>
%156 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0> : tensor<si8>} : () -> !torch.vtensor<[],si8>
%157 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<7.812500e-03> : tensor<f32>} : () -> !torch.vtensor<[],f32>
%1641 = torch.operator "onnx.QuantizeLinear"(%1640, %157, %156) : (!torch.vtensor<[1,288,1,1],f32>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],si8>
%1642 = torch.operator "onnx.DequantizeLinear"(%1641, %157, %156) : (!torch.vtensor<[1,288,1,1],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],f32>
%1643 = torch.operator "onnx.Mul"(%1627, %1642) : (!torch.vtensor<[1,288,28,28],f32>, !torch.vtensor<[1,288,1,1],f32>) -> !torch.vtensor<[1,288,28,28],f32>
%158 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0> : tensor<si8>} : () -> !torch.vtensor<[],si8>
%159 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<7.812500e-03> : tensor<f32>} : () -> !torch.vtensor<[],f32>
%1644 = torch.operator "onnx.QuantizeLinear"(%1643, %159, %158) : (!torch.vtensor<[1,288,28,28],f32>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,28,28],si8>
%1645 = torch.operator "onnx.DequantizeLinear"(%1644, %159, %158) : (!torch.vtensor<[1,288,28,28],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,28,28],f32>
%1646 = torch.operator "onnx.Conv"(%1645, %arg9, %arg10) {torch.onnx.dilations = [1 : si64, 1 : si64], torch.onnx.group = 1 : si64, torch.onnx.kernel_shape = [1 : si64, 1 : si64], torch.onnx.pads = [0 : si64, 0 : si64, 0 : si64, 0 : si64], torch.onnx.strides = [1 : si64, 1 : si64]} : (!torch.vtensor<[1,288,28,28],f32>, !torch.vtensor<[288,288,1,1],f32>, !torch.vtensor<[288],f32>) -> !torch.vtensor<[1,288,28,28],f32>
return %1646 : !torch.vtensor<[1,288,28,28],f32>
}
}
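For scale, the dequantized f32 intermediates in this graph are large relative to the 64 KiB workgroup shared memory budget reported in the error below. A back-of-the-envelope sketch in plain Python (my own arithmetic, not IREE's actual tiling logic):

```python
# Byte footprints of the f32 intermediates in the graph above
# (back-of-the-envelope only; IREE's tiling decides what actually
# lands in workgroup shared memory).
from math import prod

def nbytes(shape, bytes_per_elem=4):  # f32 = 4 bytes per element
    return prod(shape) * bytes_per_elem

print(nbytes([1, 288, 28, 28]))  # 903168 -> dequantized activation (%1627)
print(nbytes([1, 288, 1, 1]))    # 1152   -> squeeze-and-excite branch tensors
# 903168 bytes is ~13.8x the 65536-byte limit, so any schedule that stages
# a large slice of the 1x288x28x28 tensor in shared memory will trip it
# (the diagnostic below reports 401920 bytes).
```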
I get the following error:
failed to translate executables
issue2/model.torch_onnx.mlir:14:13: error: 'func.func' op uses 401920 bytes of shared memory; exceeded the limit of 65536 bytes
%1632 = torch.operator "onnx.DequantizeLinear"(%1631, %139, %138) : (!torch.vtensor<[1,288,1,1],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],f32>
^
issue2/model.torch_onnx.mlir:14:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
%1632 = torch.operator "onnx.DequantizeLinear"(%1631, %139, %138) : (!torch.vtensor<[1,288,1,1],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],f32>
^
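For what it's worth, the 65536-byte limit in the diagnostic matches max_workgroup_memory_bytes = 65536 in the #hal.executable.target attribute above (64 KiB of LDS per workgroup on gfx942), and the selected configuration requests roughly six times that. Quick sanity arithmetic on the reported figures (my reading of the message, not taken from IREE internals):

```python
requested = 401920  # shared memory bytes the lowered func.func uses
limit = 65536       # max_workgroup_memory_bytes from the gfx942 target
print(requested // 4)               # 100480 f32 elements per workgroup
print(round(requested / limit, 2))  # 6.13 -> ~6x over the limit
```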
dump.log was generated with the following flags enabled: '--mlir-print-ir-after-all --mlir-print-ir-before-all --mlir-disable-threading --mlir-elide-elementsattrs-if-larger=4'.
Steps to reproduce your issue
This issue is present only on the GPU (ROCm) backend; inference passes on the CPU backend.
Command to reproduce the issue:
iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=rocm model.mlir
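For convenience, a hypothetical wrapper that reruns the same command with the IR-dump flags listed above and captures stderr (where MLIR's IR dumps are printed) into dump.log; it assumes iree-compile is on PATH and the IR above is saved as model.mlir:

```python
import subprocess

cmd = [
    "iree-compile",
    "--iree-input-type=auto",
    "--iree-vm-bytecode-module-output-format=flatbuffer-binary",
    "--iree-hal-target-backends=rocm",
    "--mlir-print-ir-after-all",
    "--mlir-print-ir-before-all",
    "--mlir-disable-threading",
    "--mlir-elide-elementsattrs-if-larger=4",
    "model.mlir",
]
# check=False: the compile is expected to fail with the shared-memory error.
with open("dump.log", "w") as log:
    subprocess.run(cmd, stderr=log, check=False)
```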
What component(s) does this issue relate to?
Compiler
Version information
No response
Additional context
No response