iree icon indicating copy to clipboard operation
iree copied to clipboard

[gpu] 'func.func' op uses 401920 bytes of shared memory; exceeded the limit of 65536 bytes

Open pdhirajkumarprasad opened this issue 4 months ago • 7 comments

What happened?

For the given IR:

module {
  func.func @main_graph(%arg0: !torch.vtensor<[1,3,224,224],f32>, %arg1: !torch.vtensor<[1,288,56,56],f32>, %arg2: !torch.vtensor<[288,72,3,3],f32>, %arg3: !torch.vtensor<[288],f32>, %arg4: !torch.vtensor<[36,288,1,1],f32>, %arg5: !torch.vtensor<[36],f32>, %arg6: !torch.vtensor<[1,288,1,1],f32>, %arg7: !torch.vtensor<[288,36,1,1],f32>, %arg8: !torch.vtensor<[288],f32>, %arg9: !torch.vtensor<[288,288,1,1],f32>, %arg10 : !torch.vtensor<[288],f32>, %arg11: !torch.vtensor<[1,288,28,28],si8>   ) -> !torch.vtensor<[1,288,28,28],f32>  attributes {torch.onnx_meta.ir_version = 8 : si64, torch.onnx_meta.opset_version = 21 : si64, torch.onnx_meta.opset_versions = {ai.onnx.contrib = 1 : si64, ai.onnx.ml = 4 : si64, ai.onnx.preview.training = 1 : si64, ai.onnx.training = 1 : si64, com.microsoft = 1 : si64, com.microsoft.experimental = 1 : si64, com.microsoft.nchwc = 1 : si64, org.pytorch.aten = 1 : si64}, torch.onnx_meta.producer_name = "vai_q_onnx", torch.onnx_meta.producer_version = "1.17.0+43059a7"} {

    %136 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0> : tensor<si8>} : () -> !torch.vtensor<[],si8> 
    %137 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<6.250000e-02> : tensor<f32>} : () -> !torch.vtensor<[],f32> 
    %1627 = torch.operator "onnx.DequantizeLinear"(%arg11, %137, %136) : (!torch.vtensor<[1,288,28,28],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,28,28],f32> 
    %1628 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<[2, 3]> : tensor<2xsi64>} : () -> !torch.vtensor<[2],si64> 
    %1629 = torch.operator "onnx.ReduceMean"(%1627, %1628) {torch.onnx.keepdims = 1 : si64} : (!torch.vtensor<[1,288,28,28],f32>, !torch.vtensor<[2],si64>) -> !torch.vtensor<[1,288,1,1],f32> 
    %1257 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<1.00488281> : tensor<f32>} : () -> !torch.vtensor<[],f32> 
    %1630 = torch.operator "onnx.Mul"(%1629, %1257) : (!torch.vtensor<[1,288,1,1],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[1,288,1,1],f32> 
    %138 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0> : tensor<si8>} : () -> !torch.vtensor<[],si8> 
    %139 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<1.562500e-02> : tensor<f32>} : () -> !torch.vtensor<[],f32> 
    %1631 = torch.operator "onnx.QuantizeLinear"(%1630, %139, %138) : (!torch.vtensor<[1,288,1,1],f32>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],si8> 
    %1632 = torch.operator "onnx.DequantizeLinear"(%1631, %139, %138) : (!torch.vtensor<[1,288,1,1],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],f32> 
    %1633 = torch.operator "onnx.Conv"(%1632, %arg4, %arg5) {torch.onnx.dilations = [1 : si64, 1 : si64], torch.onnx.group = 1 : si64, torch.onnx.kernel_shape = [1 : si64, 1 : si64], torch.onnx.pads = [0 : si64, 0 : si64, 0 : si64, 0 : si64], torch.onnx.strides = [1 : si64, 1 : si64]} : (!torch.vtensor<[1,288,1,1],f32>, !torch.vtensor<[36,288,1,1],f32>, !torch.vtensor<[36],f32>) -> !torch.vtensor<[1,36,1,1],f32> 
    %1634 = torch.operator "onnx.Relu"(%1633) : (!torch.vtensor<[1,36,1,1],f32>) -> !torch.vtensor<[1,36,1,1],f32> 
    %146 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0> : tensor<si8>} : () -> !torch.vtensor<[],si8> 
    %147 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<3.125000e-02> : tensor<f32>} : () -> !torch.vtensor<[],f32> 
    %1635 = torch.operator "onnx.QuantizeLinear"(%1634, %147, %146) : (!torch.vtensor<[1,36,1,1],f32>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,36,1,1],si8> 
    %1636 = torch.operator "onnx.DequantizeLinear"(%1635, %147, %146) : (!torch.vtensor<[1,36,1,1],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,36,1,1],f32> 
    %1637 = torch.operator "onnx.Conv"(%1636, %arg7, %arg8) {torch.onnx.dilations = [1 : si64, 1 : si64], torch.onnx.group = 1 : si64, torch.onnx.kernel_shape = [1 : si64, 1 : si64], torch.onnx.pads = [0 : si64, 0 : si64, 0 : si64, 0 : si64], torch.onnx.strides = [1 : si64, 1 : si64]} : (!torch.vtensor<[1,36,1,1],f32>, !torch.vtensor<[288,36,1,1],f32>, !torch.vtensor<[288],f32>) -> !torch.vtensor<[1,288,1,1],f32> 
    %148 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0> : tensor<si8>} : () -> !torch.vtensor<[],si8> 
    %149 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<3.125000e-02> : tensor<f32>} : () -> !torch.vtensor<[],f32> 
    %1638 = torch.operator "onnx.QuantizeLinear"(%1637, %149, %148) : (!torch.vtensor<[1,288,1,1],f32>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],si8> 
    %1639 = torch.operator "onnx.DequantizeLinear"(%1638, %149, %148) : (!torch.vtensor<[1,288,1,1],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],f32> 
    %1640 = torch.operator "onnx.Sigmoid"(%1639) : (!torch.vtensor<[1,288,1,1],f32>) -> !torch.vtensor<[1,288,1,1],f32> 
    %156 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0> : tensor<si8>} : () -> !torch.vtensor<[],si8> 
    %157 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<7.812500e-03> : tensor<f32>} : () -> !torch.vtensor<[],f32> 
    %1641 = torch.operator "onnx.QuantizeLinear"(%1640, %157, %156) : (!torch.vtensor<[1,288,1,1],f32>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],si8> 
    %1642 = torch.operator "onnx.DequantizeLinear"(%1641, %157, %156) : (!torch.vtensor<[1,288,1,1],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],f32> 
    %1643 = torch.operator "onnx.Mul"(%1627, %1642) : (!torch.vtensor<[1,288,28,28],f32>, !torch.vtensor<[1,288,1,1],f32>) -> !torch.vtensor<[1,288,28,28],f32> 
    %158 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0> : tensor<si8>} : () -> !torch.vtensor<[],si8> 
    %159 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<7.812500e-03> : tensor<f32>} : () -> !torch.vtensor<[],f32> 
    %1644 = torch.operator "onnx.QuantizeLinear"(%1643, %159, %158) : (!torch.vtensor<[1,288,28,28],f32>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,28,28],si8> 
    %1645 = torch.operator "onnx.DequantizeLinear"(%1644, %159, %158) : (!torch.vtensor<[1,288,28,28],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,28,28],f32> 
    %1646 = torch.operator "onnx.Conv"(%1645, %arg9, %arg10) {torch.onnx.dilations = [1 : si64, 1 : si64], torch.onnx.group = 1 : si64, torch.onnx.kernel_shape = [1 : si64, 1 : si64], torch.onnx.pads = [0 : si64, 0 : si64, 0 : si64, 0 : si64], torch.onnx.strides = [1 : si64, 1 : si64]} : (!torch.vtensor<[1,288,28,28],f32>, !torch.vtensor<[288,288,1,1],f32>, !torch.vtensor<[288],f32>) -> !torch.vtensor<[1,288,28,28],f32> 
    return %1646 : !torch.vtensor<[1,288,28,28],f32>
  }
}

I am getting the following error:

failed to translate executables
issue2/model.torch_onnx.mlir:14:13: error: 'func.func' op uses 401920 bytes of shared memory; exceeded the limit of 65536 bytes
    %1632 = torch.operator "onnx.DequantizeLinear"(%1631, %139, %138) : (!torch.vtensor<[1,288,1,1],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],f32> 
            ^
issue2/model.torch_onnx.mlir:14:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
    %1632 = torch.operator "onnx.DequantizeLinear"(%1631, %139, %138) : (!torch.vtensor<[1,288,1,1],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],f32> 
            ^

dump.log generated with the following flags enabled: '--mlir-print-ir-after-all --mlir-print-ir-before-all --mlir-disable-threading --mlir-elide-elementsattrs-if-larger=4'

dump.log

Steps to reproduce your issue

This issue is present only on the GPU (ROCm) backend; inference passes on the CPU backend.

Command to reproduce the issue:

iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=rocm model.mlir

What component(s) does this issue relate to?

Compiler

Version information

No response

Additional context

No response

pdhirajkumarprasad avatar Sep 26 '24 08:09 pdhirajkumarprasad