iree icon indicating copy to clipboard operation
iree copied to clipboard

[gpu] 'func.func' op uses 401920 bytes of shared memory; exceeded the limit of 65536 bytes

Open pdhirajkumarprasad opened this issue 4 months ago • 7 comments

What happened?

For the given IR:

module {
  func.func @main_graph(%arg0: !torch.vtensor<[1,3,224,224],f32>, %arg1: !torch.vtensor<[1,288,56,56],f32>, %arg2: !torch.vtensor<[288,72,3,3],f32>, %arg3: !torch.vtensor<[288],f32>, %arg4: !torch.vtensor<[36,288,1,1],f32>, %arg5: !torch.vtensor<[36],f32>, %arg6: !torch.vtensor<[1,288,1,1],f32>, %arg7: !torch.vtensor<[288,36,1,1],f32>, %arg8: !torch.vtensor<[288],f32>, %arg9: !torch.vtensor<[288,288,1,1],f32>, %arg10 : !torch.vtensor<[288],f32>, %arg11: !torch.vtensor<[1,288,28,28],si8>   ) -> !torch.vtensor<[1,288,28,28],f32>  attributes {torch.onnx_meta.ir_version = 8 : si64, torch.onnx_meta.opset_version = 21 : si64, torch.onnx_meta.opset_versions = {ai.onnx.contrib = 1 : si64, ai.onnx.ml = 4 : si64, ai.onnx.preview.training = 1 : si64, ai.onnx.training = 1 : si64, com.microsoft = 1 : si64, com.microsoft.experimental = 1 : si64, com.microsoft.nchwc = 1 : si64, org.pytorch.aten = 1 : si64}, torch.onnx_meta.producer_name = "vai_q_onnx", torch.onnx_meta.producer_version = "1.17.0+43059a7"} {

    %136 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0> : tensor<si8>} : () -> !torch.vtensor<[],si8> 
    %137 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<6.250000e-02> : tensor<f32>} : () -> !torch.vtensor<[],f32> 
    %1627 = torch.operator "onnx.DequantizeLinear"(%arg11, %137, %136) : (!torch.vtensor<[1,288,28,28],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,28,28],f32> 
    %1628 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<[2, 3]> : tensor<2xsi64>} : () -> !torch.vtensor<[2],si64> 
    %1629 = torch.operator "onnx.ReduceMean"(%1627, %1628) {torch.onnx.keepdims = 1 : si64} : (!torch.vtensor<[1,288,28,28],f32>, !torch.vtensor<[2],si64>) -> !torch.vtensor<[1,288,1,1],f32> 
    %1257 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<1.00488281> : tensor<f32>} : () -> !torch.vtensor<[],f32> 
    %1630 = torch.operator "onnx.Mul"(%1629, %1257) : (!torch.vtensor<[1,288,1,1],f32>, !torch.vtensor<[],f32>) -> !torch.vtensor<[1,288,1,1],f32> 
    %138 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0> : tensor<si8>} : () -> !torch.vtensor<[],si8> 
    %139 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<1.562500e-02> : tensor<f32>} : () -> !torch.vtensor<[],f32> 
    %1631 = torch.operator "onnx.QuantizeLinear"(%1630, %139, %138) : (!torch.vtensor<[1,288,1,1],f32>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],si8> 
    %1632 = torch.operator "onnx.DequantizeLinear"(%1631, %139, %138) : (!torch.vtensor<[1,288,1,1],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],f32> 
    %1633 = torch.operator "onnx.Conv"(%1632, %arg4, %arg5) {torch.onnx.dilations = [1 : si64, 1 : si64], torch.onnx.group = 1 : si64, torch.onnx.kernel_shape = [1 : si64, 1 : si64], torch.onnx.pads = [0 : si64, 0 : si64, 0 : si64, 0 : si64], torch.onnx.strides = [1 : si64, 1 : si64]} : (!torch.vtensor<[1,288,1,1],f32>, !torch.vtensor<[36,288,1,1],f32>, !torch.vtensor<[36],f32>) -> !torch.vtensor<[1,36,1,1],f32> 
    %1634 = torch.operator "onnx.Relu"(%1633) : (!torch.vtensor<[1,36,1,1],f32>) -> !torch.vtensor<[1,36,1,1],f32> 
    %146 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0> : tensor<si8>} : () -> !torch.vtensor<[],si8> 
    %147 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<3.125000e-02> : tensor<f32>} : () -> !torch.vtensor<[],f32> 
    %1635 = torch.operator "onnx.QuantizeLinear"(%1634, %147, %146) : (!torch.vtensor<[1,36,1,1],f32>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,36,1,1],si8> 
    %1636 = torch.operator "onnx.DequantizeLinear"(%1635, %147, %146) : (!torch.vtensor<[1,36,1,1],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,36,1,1],f32> 
    %1637 = torch.operator "onnx.Conv"(%1636, %arg7, %arg8) {torch.onnx.dilations = [1 : si64, 1 : si64], torch.onnx.group = 1 : si64, torch.onnx.kernel_shape = [1 : si64, 1 : si64], torch.onnx.pads = [0 : si64, 0 : si64, 0 : si64, 0 : si64], torch.onnx.strides = [1 : si64, 1 : si64]} : (!torch.vtensor<[1,36,1,1],f32>, !torch.vtensor<[288,36,1,1],f32>, !torch.vtensor<[288],f32>) -> !torch.vtensor<[1,288,1,1],f32> 
    %148 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0> : tensor<si8>} : () -> !torch.vtensor<[],si8> 
    %149 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<3.125000e-02> : tensor<f32>} : () -> !torch.vtensor<[],f32> 
    %1638 = torch.operator "onnx.QuantizeLinear"(%1637, %149, %148) : (!torch.vtensor<[1,288,1,1],f32>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],si8> 
    %1639 = torch.operator "onnx.DequantizeLinear"(%1638, %149, %148) : (!torch.vtensor<[1,288,1,1],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],f32> 
    %1640 = torch.operator "onnx.Sigmoid"(%1639) : (!torch.vtensor<[1,288,1,1],f32>) -> !torch.vtensor<[1,288,1,1],f32> 
    %156 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0> : tensor<si8>} : () -> !torch.vtensor<[],si8> 
    %157 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<7.812500e-03> : tensor<f32>} : () -> !torch.vtensor<[],f32> 
    %1641 = torch.operator "onnx.QuantizeLinear"(%1640, %157, %156) : (!torch.vtensor<[1,288,1,1],f32>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],si8> 
    %1642 = torch.operator "onnx.DequantizeLinear"(%1641, %157, %156) : (!torch.vtensor<[1,288,1,1],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],f32> 
    %1643 = torch.operator "onnx.Mul"(%1627, %1642) : (!torch.vtensor<[1,288,28,28],f32>, !torch.vtensor<[1,288,1,1],f32>) -> !torch.vtensor<[1,288,28,28],f32> 
    %158 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0> : tensor<si8>} : () -> !torch.vtensor<[],si8> 
    %159 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<7.812500e-03> : tensor<f32>} : () -> !torch.vtensor<[],f32> 
    %1644 = torch.operator "onnx.QuantizeLinear"(%1643, %159, %158) : (!torch.vtensor<[1,288,28,28],f32>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,28,28],si8> 
    %1645 = torch.operator "onnx.DequantizeLinear"(%1644, %159, %158) : (!torch.vtensor<[1,288,28,28],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,28,28],f32> 
    %1646 = torch.operator "onnx.Conv"(%1645, %arg9, %arg10) {torch.onnx.dilations = [1 : si64, 1 : si64], torch.onnx.group = 1 : si64, torch.onnx.kernel_shape = [1 : si64, 1 : si64], torch.onnx.pads = [0 : si64, 0 : si64, 0 : si64, 0 : si64], torch.onnx.strides = [1 : si64, 1 : si64]} : (!torch.vtensor<[1,288,28,28],f32>, !torch.vtensor<[288,288,1,1],f32>, !torch.vtensor<[288],f32>) -> !torch.vtensor<[1,288,28,28],f32> 
    return %1646 : !torch.vtensor<[1,288,28,28],f32>
  }
}

I am getting the following error:

failed to translate executables
issue2/model.torch_onnx.mlir:14:13: error: 'func.func' op uses 401920 bytes of shared memory; exceeded the limit of 65536 bytes
    %1632 = torch.operator "onnx.DequantizeLinear"(%1631, %139, %138) : (!torch.vtensor<[1,288,1,1],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],f32> 
            ^
issue2/model.torch_onnx.mlir:14:13: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"rocm", "rocm-hsaco-fb", {iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647]>>, ukernels = "none"}>
    %1632 = torch.operator "onnx.DequantizeLinear"(%1631, %139, %138) : (!torch.vtensor<[1,288,1,1],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,288,1,1],f32> 
            ^

dump.log generated with the following flags enabled: '--mlir-print-ir-after-all --mlir-print-ir-before-all --mlir-disable-threading --mlir-elide-elementsattrs-if-larger=4'

dump.log

Steps to reproduce your issue

This issue is present only on the GPU (ROCm) backend; inference passes on the CPU backend.

Command to reproduce the issue:

iree-compile --iree-input-type=auto --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=rocm model.mlir

What component(s) does this issue relate to?

Compiler

Version information

No response

Additional context

No response

pdhirajkumarprasad avatar Sep 26 '24 08:09 pdhirajkumarprasad