Vectorize missing static 2D and depthwise convolutions
Some statically-shaped convolutions currently remain scalar, at least on RISC-V. The RISC-V models most impacted by this issue are EfficientNet and PersonDetect.
Some dispatches to repro:
EfficientNet:
hal.executable private @main_dispatch_2 {
  hal.executable.variant public @embedded_elf_riscv_64, target = <"llvm-cpu", "embedded-elf-riscv_64", {cpu_features = "+m,+a,+f,+d,+v", data_layout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128", native_vector_size = 64 : index, target_triple = "riscv64-unknown-unknown-eabi-elf"}> {
    hal.executable.export public @main_dispatch_2_conv_2d_nhwc_hwcf_q_1x112x112x32x3x3x3 ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index, %arg7: index):
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg1, %arg2, %arg3, %arg4, %arg5, %arg6, %arg7
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_2_conv_2d_nhwc_hwcf_q_1x112x112x32x3x3x3() {
        %c150528 = arith.constant 150528 : index
        %c4695744 = arith.constant 4695744 : index
        %c302464 = arith.constant 302464 : index
        %c0_i32 = arith.constant 0 : i32
        %c3_i32 = arith.constant 3 : i32
        %0 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%c150528) alignment(64) : !flow.dispatch.tensor<readonly:1x225x225x3xi8>
        %1 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%c4695744) alignment(64) : !flow.dispatch.tensor<readonly:3x3x3x32xi8>
        %2 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%c302464) alignment(64) : !flow.dispatch.tensor<writeonly:1x112x112x32xi32>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 225, 225, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x225x225x3xi8> -> tensor<1x225x225x3xi8>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 3, 32], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x3x32xi8> -> tensor<3x3x3x32xi8>
        %5 = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xi32>
        %6 = linalg.fill ins(%c0_i32 : i32) outs(%5 : tensor<1x112x112x32xi32>) -> tensor<1x112x112x32xi32>
        %7 = linalg.conv_2d_nhwc_hwcf_q {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} ins(%3, %4, %c3_i32, %c0_i32 : tensor<1x225x225x3xi8>, tensor<3x3x3x32xi8>, i32, i32) outs(%6 : tensor<1x112x112x32xi32>) -> tensor<1x112x112x32xi32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0, 0, 0], sizes = [1, 112, 112, 32], strides = [1, 1, 1, 1] : tensor<1x112x112x32xi32> -> !flow.dispatch.tensor<writeonly:1x112x112x32xi32>
        return
      }
    }
  }
}
PersonDetect:
hal.executable private @main_dispatch_34 {
  hal.executable.variant public @embedded_elf_riscv_64, target = <"llvm-cpu", "embedded-elf-riscv_64", {cpu_features = "+m,+a,+f,+d,+v", data_layout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128", native_vector_size = 64 : index, target_triple = "riscv64-unknown-unknown-eabi-elf"}> {
    hal.executable.export public @main_dispatch_34_depthwise_conv_2d_nhwc_hwc_q_1x6x6x128x3x3 ordinal(0) layout(#hal.pipeline.layout<push_constants = 3, sets = [<0, bindings = [<0, storage_buffer, ReadOnly>, <1, storage_buffer, ReadOnly>, <2, storage_buffer>]>]>) {
    ^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index, %arg4: index, %arg5: index, %arg6: index):
      %x, %y, %z = flow.dispatch.default_workgroup_count %arg1, %arg2, %arg3, %arg4, %arg5, %arg6
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @main_dispatch_34_depthwise_conv_2d_nhwc_hwc_q_1x6x6x128x3x3() {
        %c-128_i32 = arith.constant -128 : i32
        %c0_i32 = arith.constant 0 : i32
        %0 = hal.interface.constant.load[0] values([9728 : i32, 12032 : i32]) : i32
        %1 = hal.interface.constant.load[1] values([1792 : i32, 2944 : i32, 4096 : i32, 5248 : i32, 6400 : i32]) : i32
        %2 = hal.interface.constant.load[2] values([17920 : i32, 20224 : i32]) : i32
        %3 = arith.index_cast %0 : i32 to index
        %4 = arith.index_cast %1 : i32 to index
        %5 = arith.index_cast %2 : i32 to index
        %6 = hal.interface.binding.subspan set(0) binding(0) type(storage_buffer) offset(%3) alignment(64) : !flow.dispatch.tensor<readonly:1x8x8x128xi8>
        %7 = hal.interface.binding.subspan set(0) binding(1) type(storage_buffer) offset(%4) alignment(64) : !flow.dispatch.tensor<readonly:3x3x128xi8>
        %8 = hal.interface.binding.subspan set(0) binding(2) type(storage_buffer) offset(%5) alignment(64) : !flow.dispatch.tensor<writeonly:1x6x6x128xi32>
        %9 = flow.dispatch.tensor.load %6, offsets = [0, 0, 0, 0], sizes = [1, 8, 8, 128], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:1x8x8x128xi8> -> tensor<1x8x8x128xi8>
        %10 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0], sizes = [3, 3, 128], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:3x3x128xi8> -> tensor<3x3x128xi8>
        %11 = linalg.init_tensor [1, 6, 6, 128] : tensor<1x6x6x128xi32>
        %12 = linalg.fill ins(%c0_i32 : i32) outs(%11 : tensor<1x6x6x128xi32>) -> tensor<1x6x6x128xi32>
        %13 = linalg.depthwise_conv_2d_nhwc_hwc_q {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%9, %10, %c-128_i32, %c0_i32 : tensor<1x8x8x128xi8>, tensor<3x3x128xi8>, i32, i32) outs(%12 : tensor<1x6x6x128xi32>) -> tensor<1x6x6x128xi32>
        flow.dispatch.tensor.store %13, %8, offsets = [0, 0, 0, 0], sizes = [1, 6, 6, 128], strides = [1, 1, 1, 1] : tensor<1x6x6x128xi32> -> !flow.dispatch.tensor<writeonly:1x6x6x128xi32>
        return
      }
    }
  }
}
These are quantized convolution ops (i.e., linalg.*_q). I filed https://github.com/iree-org/iree/issues/8905 for this earlier. We need a pattern that rewrites a quantized convolution into a plain convolution plus linalg.generic ops, like what we've done for quantized matmul.
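For illustration, here is a rough, unverified sketch of what that decomposition could look like for the EfficientNet dispatch above. Since the filter zero point is 0 there, the quantized op reduces to out[n,h,w,f] = conv(x, w)[n,h,w,f] - izp * wsum[f], with wsum[f] = sum over (kh, kw, c) of w[kh,kw,c,f]; the plain convolution becomes vectorizable and the correction is a cheap elementwise generic. Function and value names below are made up:

// Sketch only: decompose conv_2d_nhwc_hwcf_q (filter zero point == 0) into a
// plain conv plus an input-zero-point correction that varies only with the
// output channel f.
func.func @conv_decomposed(%x: tensor<1x225x225x3xi8>,
                           %w: tensor<3x3x3x32xi8>,
                           %izp: i32) -> tensor<1x112x112x32xi32> {
  %c0_i32 = arith.constant 0 : i32
  // Plain convolution; the named op sign-extends the i8 operands to the i32
  // accumulator. This is the vectorizable part.
  %init = linalg.init_tensor [1, 112, 112, 32] : tensor<1x112x112x32xi32>
  %zero = linalg.fill ins(%c0_i32 : i32) outs(%init : tensor<1x112x112x32xi32>) -> tensor<1x112x112x32xi32>
  %conv = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>}
      ins(%x, %w : tensor<1x225x225x3xi8>, tensor<3x3x3x32xi8>)
      outs(%zero : tensor<1x112x112x32xi32>) -> tensor<1x112x112x32xi32>
  // wsum[f] = sum_{kh, kw, c} w[kh, kw, c, f].
  %wsinit = linalg.init_tensor [32] : tensor<32xi32>
  %wszero = linalg.fill ins(%c0_i32 : i32) outs(%wsinit : tensor<32xi32>) -> tensor<32xi32>
  %wsum = linalg.generic {
      indexing_maps = [affine_map<(kh, kw, c, f) -> (kh, kw, c, f)>,
                       affine_map<(kh, kw, c, f) -> (f)>],
      iterator_types = ["reduction", "reduction", "reduction", "parallel"]}
      ins(%w : tensor<3x3x3x32xi8>) outs(%wszero : tensor<32xi32>) {
  ^bb0(%wv: i8, %acc: i32):
    %ext = arith.extsi %wv : i8 to i32
    %add = arith.addi %acc, %ext : i32
    linalg.yield %add : i32
  } -> tensor<32xi32>
  // out[n, h, w, f] = conv[n, h, w, f] - izp * wsum[f].
  %out = linalg.generic {
      indexing_maps = [affine_map<(n, h, w, f) -> (n, h, w, f)>,
                       affine_map<(n, h, w, f) -> (f)>,
                       affine_map<(n, h, w, f) -> (n, h, w, f)>],
      iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
      ins(%conv, %wsum : tensor<1x112x112x32xi32>, tensor<32xi32>)
      outs(%init : tensor<1x112x112x32xi32>) {
  ^bb0(%cv: i32, %wsv: i32, %unused: i32):
    %corr = arith.muli %izp, %wsv : i32
    %sub = arith.subi %cv, %corr : i32
    linalg.yield %sub : i32
  } -> tensor<1x112x112x32xi32>
  return %out : tensor<1x112x112x32xi32>
}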
Thanks for the pointer! Ok, let me keep this issue open to make sure the specific dispatches that I see on RISC-V are addressed. Assigning this to @pzread as he is working on #8905.
I wasn't actively working on #8905 since it was low priority. I'm taking another look to see if I can fix it, and will reassign it if I can't.
I don't think I'll have enough time to work on #8905 in the near future. I've unassigned myself; feel free to take it over.
> We need a pattern to rewrite quantized convolution to generic + convolution, like what we've done for quantized matmul.

I think @bjacob and @rsuderman can also help here. As we discussed, we may even want to consider removing the _q operation altogether.
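If the _q ops were removed, importers would emit the decomposed form directly. For the PersonDetect depthwise dispatch above (filter zero point again 0), that form might look like the following equally unverified sketch, mirroring the one above; the correction term here varies only with the channel c:

// Sketch only: depthwise_conv_2d_nhwc_hwc_q (filter zero point == 0) becomes
// a plain depthwise conv plus a per-channel correction:
//   out[n, h, w, c] = dwconv[n, h, w, c] - izp * sum_{kh, kw} w[kh, kw, c]
func.func @dwconv_decomposed(%x: tensor<1x8x8x128xi8>,
                             %w: tensor<3x3x128xi8>,
                             %izp: i32) -> tensor<1x6x6x128xi32> {
  %c0_i32 = arith.constant 0 : i32
  %init = linalg.init_tensor [1, 6, 6, 128] : tensor<1x6x6x128xi32>
  %zero = linalg.fill ins(%c0_i32 : i32) outs(%init : tensor<1x6x6x128xi32>) -> tensor<1x6x6x128xi32>
  %conv = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>}
      ins(%x, %w : tensor<1x8x8x128xi8>, tensor<3x3x128xi8>)
      outs(%zero : tensor<1x6x6x128xi32>) -> tensor<1x6x6x128xi32>
  // wsum[c] = sum_{kh, kw} w[kh, kw, c].
  %wsinit = linalg.init_tensor [128] : tensor<128xi32>
  %wszero = linalg.fill ins(%c0_i32 : i32) outs(%wsinit : tensor<128xi32>) -> tensor<128xi32>
  %wsum = linalg.generic {
      indexing_maps = [affine_map<(kh, kw, c) -> (kh, kw, c)>,
                       affine_map<(kh, kw, c) -> (c)>],
      iterator_types = ["reduction", "reduction", "parallel"]}
      ins(%w : tensor<3x3x128xi8>) outs(%wszero : tensor<128xi32>) {
  ^bb0(%wv: i8, %acc: i32):
    %ext = arith.extsi %wv : i8 to i32
    %add = arith.addi %acc, %ext : i32
    linalg.yield %add : i32
  } -> tensor<128xi32>
  // out[n, h, w, c] = conv[n, h, w, c] - izp * wsum[c].
  %out = linalg.generic {
      indexing_maps = [affine_map<(n, h, w, c) -> (n, h, w, c)>,
                       affine_map<(n, h, w, c) -> (c)>,
                       affine_map<(n, h, w, c) -> (n, h, w, c)>],
      iterator_types = ["parallel", "parallel", "parallel", "parallel"]}
      ins(%conv, %wsum : tensor<1x6x6x128xi32>, tensor<128xi32>)
      outs(%init : tensor<1x6x6x128xi32>) {
  ^bb0(%cv: i32, %wsv: i32, %unused: i32):
    %corr = arith.muli %izp, %wsv : i32
    %sub = arith.subi %cv, %corr : i32
    linalg.yield %sub : i32
  } -> tensor<1x6x6x128xi32>
  return %out : tensor<1x6x6x128xi32>
}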
Adding @vmurali to this issue so that he can coordinate with Rob in case something is needed at the codegen level or Rob needs some help.
Hey Rob, could you please comment on the current state of this? I think you mentioned you were not seeing the expected performance in some cases. What about the depthwise ones? Are they also covered by #10789?