Vectorize missing static 2D and depthwise convolutions
Some statically-shaped convolutions currently remain scalar, at least on RISC-V. The RISC-V models most impacted by this issue are EfficientNet and PersonDetect.
Some dispatches to repro:
EfficientNet:
hal.executable private @main_dispatch_2 {
  hal.executable.variant public @embedded_elf_riscv_64, target = <"llvm-cpu", "embedded-elf-riscv_64", {cpu_features = "+m,+a,+f,+d,+v", data_layout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128", native_vector_size = 64 : index, target_triple = "riscv64-unknown-unknown-eabi-elf"}> {
    hal.executable.export public @main_dispatch_2_conv_2d_nhwc_hwcf_q_1x112x112x32x3x3x3 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index):
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_2_conv_2d_nhwc_hwcf_q_1x112x112x32x3x3x3() {
        %c150528 = arith.constant 150528 : index
        %c4695744 = arith.constant 4695744 : index
        %c302464 = arith.constant 302464 : index
        %c0_i32 = arith.constant 0 : i32
        %c3_i32 = arith.constant 3 : i32
        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c150528) alignment(64) : !flow.dispatch.tensor<readonly:1x225x225x3xi8>
        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c4695744) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x32xi8>
        %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c302464) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x32xi32>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xi8> -> tensor<1x225x225x3xi8>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xi8> -> tensor<3x3x3x32xi8>
        %5 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xi32>
        %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1x112x112x32xi32>) -> tensor<1x112x112x32xi32>
        %7 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4, %c3_i32, %c0_i32 : tensor<1x225x225x3xi8>, tensor<3x3x3x32xi8>, i32, i32) outs(%6 : tensor<1x112x112x32xi32>) -> tensor<1x112x112x32xi32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : tensor<1x112x112x32xi32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xi32>
        return
      }
    }
  }
}
PersonDetect:
hal.executable private @main_dispatch_34 {
  hal.executable.variant public @embedded_elf_riscv_64, target = <"llvm-cpu", "embedded-elf-riscv_64", {cpu_features = "+m,+a,+f,+d,+v", data_layout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128", native_vector_size = 64 : index, target_triple = "riscv64-unknown-unknown-eabi-elf"}> {
    hal.executable.export public @main_dispatch_34_depthwise_conv_2d_nhwc_hwc_q_1x6x6x128x3x3 ordinal(0) layout(#hal.pipeline.layout<push_constants = 3, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg1, %arg2, %arg3, %arg4, %arg5, %arg6
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_34_depthwise_conv_2d_nhwc_hwc_q_1x6x6x128x3x3() {
        %c-128_i32 = arith.constant -128 : i32
        %c0_i32 = arith.constant 0 : i32
        %0 = hal.interface.constant.load[0] values([9728 : i32, 12032 : i32]) : i32
        %1 = hal.interface.constant.load[1] values([1792 : i32, 2944 : i32, 4096 : i32, 5248 : i32, 6400 : i32]) : i32
        %2 = hal.interface.constant.load[2] values([17920 : i32, 20224 : i32]) : i32
        %3 = arith.index_cast %0 : i32 to index
        %4 = arith.index_cast %1 : i32 to index
        %5 = arith.index_cast %2 : i32 to index
        %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%3) alignment(64) : !flow.dispatch.tensor<readonly:1x8x8x128xi8>
        %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%4) alignment(64) : !flow.dispatch.tensor<readonly:3x3x128xi8>
        %8 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%5) alignment(64) : !flow.dispatch.tensor<writeonly:1x6x6x128xi32>
        %9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [1, 8, 8, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x8x8x128xi8> -> tensor<1x8x8x128xi8>
        %10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0], sizes = [3, 3, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x128xi8> -> tensor<3x3x128xi8>
        %11 = linalg.init_tensor [1, 6, 6, 128] : tensor<1x6x6x128xi32>
        %12 = linalg.fill ins(%c0_i32 : i32) outs(%11 : tensor<1x6x6x128xi32>) -> tensor<1x6x6x128xi32>
        %13 = linalg.depthwise_conv_2d_nhwc_hwc_q {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%9, %10, %c-128_i32, %c0_i32 : tensor<1x8x8x128xi8>, tensor<3x3x128xi8>, i32, i32) outs(%12 : tensor<1x6x6x128xi32>) -> tensor<1x6x6x128xi32>
        flow.dispatch.tensor.store %13, %8, offsets = [0, 0, 0, 0], sizes = [1, 6, 6, 128], strides = [1, 1, 1, 1] : tensor<1x6x6x128xi32> -> !flow.dispatch.tensor<writeonly:1x6x6x128xi32>
        return
      }
    }
  }
}
These are quantized convolution ops (i.e., linalg.*_q). I filed https://github.com/iree-org/iree/issues/8905 for this earlier. We need a pattern that rewrites a quantized convolution into a plain convolution plus linalg.generic ops, like what we've done for quantized matmul.
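For illustration, here is a rough, unverified sketch of what that decomposition could look like for the EfficientNet dispatch above. Since the filter zero point is 0 there, the quantized op reduces to out[n,h,w,f] = conv(x, w)[n,h,w,f] - izp * wsum[f], with wsum[f] = sum over (kh, kw, c) of w[kh,kw,c,f]; the plain convolution becomes vectorizable and the correction is a cheap elementwise generic. Function and value names below are made up:

// Sketch only: decompose conv_2d_nhwc_hwcf_q (filter zero point == 0) into a
// plain conv plus an input-zero-point correction that varies only with the
// output channel f.
func.func @conv_decomposed(%x: tensor<1x225x225x3xi8>,
                           %w: tensor<3x3x3x32xi8>,
                           %izp: i32) -> tensor<1x112x112x32xi32> {
  %c0_i32 = arith.constant 0 : i32
  // Plain convolution; the named op sign-extends the i8 operands to the i32
  // accumulator. This is the vectorizable part.
  %init = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xi32>
  %zero = linalg.fill ins(%c0_i32 : i32) outs(%init : tensor<1x112x112x32xi32>) -> tensor<1x112x112x32xi32>
  %conv = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
      ins(%x, %w : tensor<1x225x225x3xi8>, tensor<3x3x3x32xi8>)
      outs(%zero : tensor<1x112x112x32xi32>) -> tensor<1x112x112x32xi32>
  // wsum[f] = sum_{kh, kw, c} w[kh, kw, c, f].
  %wsinit = linalg.init_tensor [32] : tensor<32xi32>
  %wszero = linalg.fill ins(%c0_i32 : i32) outs(%wsinit : tensor<32xi32>) -> tensor<32xi32>
  %wsum = linalg.generic {
      indexing_maps = [affine_map<(kh, kw, c, f) -> (kh, kw, c, f)>,
                       affine_map<(kh, kw, c, f) -> (f)>],
      iterator_types = ["reduction", "reduction", "reduction", "parallel"]}
      ins(%w : tensor<3x3x3x32xi8>) outs(%wszero : tensor<32xi32>) {
  ^bb0(%wv: i8, %acc: i32):
    %ext = arith.extsi %wv : i8 to i32
    %add = arith.addi %acc, %ext : i32
    linalg.yield %add : i32
  } -> tensor<32xi32>
  // out[n, h, w, f] = conv[n, h, w, f] - izp * wsum[f].
  %out = linalg.generic {
      indexing_maps = [affine_map<(n, h, w, f) -> (n, h, w, f)>,
                       affine_map<(n, h, w, f) -> (f)>,
                       affine_map<(n, h, w, f) -> (n, h, w, f)>],
      iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
      ins(%conv, %wsum : tensor<1x112x112x32xi32>, tensor<32xi32>)
      outs(%init : tensor<1x112x112x32xi32>) {
  ^bb0(%cv: i32, %wsv: i32, %unused: i32):
    %corr = arith.muli %izp, %wsv : i32
    %sub = arith.subi %cv, %corr : i32
    linalg.yield %sub : i32
  } -> tensor<1x112x112x32xi32>
  return %out : tensor<1x112x112x32xi32>
}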
Thanks for the pointer! Ok, let me keep this issue open to make sure the specific dispatches that I see on RISC-V are addressed. Assigning this to @pzread as he is working on #8905.
I wasn't actively working on #8905 since it was low priority. I'm taking another look to see if I can fix it, and will reassign it if I can't.
I don't think I'll have enough time to work on #8905 in the near future. I've unassigned myself; feel free to take it over.
> We need a pattern to rewrite quantized convolution to generic + convolution, like what we've done for quantized matmul.

I think @bjacob and @rsuderman can also help here. As we discussed, we may even want to consider removing the _q operation altogether.
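If the _q ops were removed, importers would emit the decomposed form directly. For the PersonDetect depthwise dispatch above (filter zero point again 0), that form might look like the following equally unverified sketch, mirroring the one above; the correction term here varies only with the channel c:

// Sketch only: depthwise_conv_2d_nhwc_hwc_q (filter zero point == 0) becomes
// a plain depthwise conv plus a per-channel correction:
//   out[n, h, w, c] = dwconv[n, h, w, c] - izp * sum_{kh, kw} w[kh, kw, c]
func.func @dwconv_decomposed(%x: tensor<1x8x8x128xi8>,
                             %w: tensor<3x3x128xi8>,
                             %izp: i32) -> tensor<1x6x6x128xi32> {
  %c0_i32 = arith.constant 0 : i32
  %init = linalg.init_tensor [1, 6, 6, 128] : tensor<1x6x6x128xi32>
  %zero = linalg.fill ins(%c0_i32 : i32) outs(%init : tensor<1x6x6x128xi32>) -> tensor<1x6x6x128xi32>
  %conv = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
      ins(%x, %w : tensor<1x8x8x128xi8>, tensor<3x3x128xi8>)
      outs(%zero : tensor<1x6x6x128xi32>) -> tensor<1x6x6x128xi32>
  // wsum[c] = sum_{kh, kw} w[kh, kw, c].
  %wsinit = linalg.init_tensor [128] : tensor<128xi32>
  %wszero = linalg.fill ins(%c0_i32 : i32) outs(%wsinit : tensor<128xi32>) -> tensor<128xi32>
  %wsum = linalg.generic {
      indexing_maps = [affine_map<(kh, kw, c) -> (kh, kw, c)>,
                       affine_map<(kh, kw, c) -> (c)>],
      iterator_types = ["reduction", "reduction", "parallel"]}
      ins(%w : tensor<3x3x128xi8>) outs(%wszero : tensor<128xi32>) {
  ^bb0(%wv: i8, %acc: i32):
    %ext = arith.extsi %wv : i8 to i32
    %add = arith.addi %acc, %ext : i32
    linalg.yield %add : i32
  } -> tensor<128xi32>
  // out[n, h, w, c] = conv[n, h, w, c] - izp * wsum[c].
  %out = linalg.generic {
      indexing_maps = [affine_map<(n, h, w, c) -> (n, h, w, c)>,
                       affine_map<(n, h, w, c) -> (c)>,
                       affine_map<(n, h, w, c) -> (n, h, w, c)>],
      iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
      ins(%conv, %wsum : tensor<1x6x6x128xi32>, tensor<128xi32>)
      outs(%init : tensor<1x6x6x128xi32>) {
  ^bb0(%cv: i32, %wsv: i32, %unused: i32):
    %corr = arith.muli %izp, %wsv : i32
    %sub = arith.subi %cv, %corr : i32
    linalg.yield %sub : i32
  } -> tensor<1x6x6x128xi32>
  return %out : tensor<1x6x6x128xi32>
}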
Adding @vmurali to this issue so that he can coordinate with Rob in case something is needed at the codegen level or Rob needs some help.
Hey Rob, could you please comment on the current state of this? I think you mentioned you were not seeing the expected performance in some cases. What about the depthwise ones? Are they also covered by #10789?