[compile][cpu]: 'func.func' op exceeded stack allocation limit of 32768 bytes for function
What happened?
For the given IR:
module {
  func.func @torch_jit(%arg3: !torch.vtensor<[8,12,128,128],f32>, %arg4: !torch.vtensor<[8,12,128,64],f32>, %arg5: !torch.vtensor<[768,768],f32>, %arg6: !torch.vtensor<[3],si64>, %arg7: !torch.vtensor<[768],f32>) -> !torch.vtensor<[8,128,768],f32> attributes {torch.onnx_meta.ir_version = 7 : si64, torch.onnx_meta.opset_version = 21 : si64, torch.onnx_meta.producer_name = "pytorch", torch.onnx_meta.producer_version = "1.13.0"} {
    %258 = torch.operator "onnx.MatMul"(%arg3, %arg4) : (!torch.vtensor<[8,12,128,128],f32>, !torch.vtensor<[8,12,128,64],f32>) -> !torch.vtensor<[8,12,128,64],f32>
    %259 = torch.operator "onnx.Transpose"(%258) {torch.onnx.perm = [0 : si64, 2 : si64, 1 : si64, 3 : si64]} : (!torch.vtensor<[8,12,128,64],f32>) -> !torch.vtensor<[8,128,12,64],f32>
    %260 = torch.operator "onnx.Constant"() {torch.onnx.value = dense_resource<__16> : tensor<3xsi64>} : () -> !torch.vtensor<[3],si64>
    %261 = torch.operator "onnx.Reshape"(%259, %arg6) {torch.onnx.allowzero = 0 : si64} : (!torch.vtensor<[8,128,12,64],f32>, !torch.vtensor<[3],si64>) -> !torch.vtensor<[8,128,768],f32>
    %262 = torch.operator "onnx.MatMul"(%261, %arg5) : (!torch.vtensor<[8,128,768],f32>, !torch.vtensor<[768,768],f32>) -> !torch.vtensor<[8,128,768],f32>
    %263 = torch.operator "onnx.Add"(%arg7, %262) : (!torch.vtensor<[768],f32>, !torch.vtensor<[8,128,768],f32>) -> !torch.vtensor<[8,128,768],f32>
    return %263 : !torch.vtensor<[8,128,768],f32>
  }
}
I get the following error:
model.torch_onnx.mlir:7:12: error: 'func.func' op exceeded stack allocation limit of 32768 bytes for function. Got 65664 bytes
%262 = torch.operator "onnx.MatMul"(%261, %arg5) : (!torch.vtensor<[8,128,768],f32>, !torch.vtensor<[768,768],f32>) -> !torch.vtensor<[8,128,768],f32>
^
model.torch_onnx.mlir:7:12: note: see current operation:
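For reference, the reported 65664 bytes lines up with the two memref.alloca buffers in the failing dispatch shown below: the 8x1x32x64xf32 scratch buffer is 8 * 1 * 32 * 64 elements * 4 bytes (f32) = 65536 bytes, and the 1x1x8x4xf32 buffer adds another 128 bytes, giving 65664 bytes in total, just over twice the 32768-byte limit checked by iree-llvmcpu-check-ir-before-llvm-conversion.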
IR after failure:
// -----// IR Dump After LLVMCPUCheckIRBeforeLLVMConversionPass Failed (iree-llvmcpu-check-ir-before-llvm-conversion) //----- //
func.func @torch_jit$async_dispatch_3_unpack_transpose_8x128x12x64_f32_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
%cst = arith.constant dense<0.000000e+00> : vector<8xf32>
%c7 = arith.constant 7 : index
%c6 = arith.constant 6 : index
%c5 = arith.constant 5 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c9437184 = arith.constant 9437184 : index
%c16 = arith.constant 16 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x8x4xf32>
%alloca_0 = memref.alloca() {alignment = 64 : i64} : memref<8x1x32x64xf32>
%0 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c9437184) flags("ReadOnly|Indirect") : memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>
memref.assume_alignment %0, 64 : memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>
%1 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags(Indirect) : memref<8x16x12x64x8x1xf32>
memref.assume_alignment %1, 64 : memref<8x16x12x64x8x1xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg0 = %2 to %c16 step %3 {
scf.for %arg1 = %workgroup_id_x to %c12 step %workgroup_count_x {
%subview = memref.subview %1[0, %arg0, %arg1, 0, 0, 0] [8, 4, 1, 64, 8, 1] [1, 1, 1, 1, 1, 1] : memref<8x16x12x64x8x1xf32> to memref<8x4x1x64x8x1xf32, strided<[98304, 6144, 512, 8, 1, 1], offset: ?>>
%subview_1 = memref.subview %0[0, %arg1, %arg0, 0, 0, 0] [8, 1, 4, 16, 8, 4] [1, 1, 1, 1, 1, 1] : memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>> to memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>
scf.for %arg2 = %c0 to %c8 step %c1 {
scf.for %arg3 = %c0 to %c32 step %c1 {
%4 = affine.apply affine_map<(d0) -> (d0 floordiv 8)>(%arg3)
%5 = affine.apply affine_map<(d0) -> (d0 mod 8)>(%arg3)
scf.for %arg4 = %c0 to %c64 step %c1 {
%6 = affine.apply affine_map<(d0) -> (d0 floordiv 4)>(%arg4)
%7 = affine.apply affine_map<(d0) -> (d0 mod 4)>(%arg4)
%8 = vector.load %subview_1[%arg2, %c0, %4, %6, %c0, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%9 = vector.load %subview_1[%arg2, %c0, %4, %6, %c1, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%10 = vector.load %subview_1[%arg2, %c0, %4, %6, %c2, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%11 = vector.load %subview_1[%arg2, %c0, %4, %6, %c3, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%12 = vector.load %subview_1[%arg2, %c0, %4, %6, %c4, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%13 = vector.load %subview_1[%arg2, %c0, %4, %6, %c5, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%14 = vector.load %subview_1[%arg2, %c0, %4, %6, %c6, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%15 = vector.load %subview_1[%arg2, %c0, %4, %6, %c7, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 8, 4] [1, 1, 1, 1] : memref<1x1x8x4xf32> to memref<8x4xf32>
vector.store %8, %subview_2[%c0, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %9, %subview_2[%c1, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %10, %subview_2[%c2, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %11, %subview_2[%c3, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %12, %subview_2[%c4, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %13, %subview_2[%c5, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %14, %subview_2[%c6, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %15, %subview_2[%c7, %c0] : memref<8x4xf32>, vector<4xf32>
%subview_3 = memref.subview %alloca[0, 0, %5, %7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x8x4xf32> to memref<1x1x1x1xf32, strided<[32, 32, 4, 1], offset: ?>>
%subview_4 = memref.subview %alloca_0[%arg2, 0, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<8x1x32x64xf32> to memref<1x1x1x1xf32, strided<[2048, 2048, 64, 1], offset: ?>>
%16 = memref.load %subview_3[%c0, %c0, %c0, %c0] : memref<1x1x1x1xf32, strided<[32, 32, 4, 1], offset: ?>>
memref.store %16, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x1x1xf32, strided<[2048, 2048, 64, 1], offset: ?>>
}
}
}
scf.for %arg2 = %c0 to %c8 step %c1 {
scf.for %arg3 = %c0 to %c4 step %c1 {
scf.for %arg4 = %c0 to %c64 step %c1 {
%4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg3)
%5 = memref.load %alloca_0[%arg2, %c0, %4, %arg4] : memref<8x1x32x64xf32>
%6 = vector.broadcast %5 : f32 to vector<1xf32>
%7 = affine.apply affine_map<(d0) -> (d0 * 8 + 1)>(%arg3)
%8 = memref.load %alloca_0[%arg2, %c0, %7, %arg4] : memref<8x1x32x64xf32>
%9 = vector.broadcast %8 : f32 to vector<1xf32>
%10 = affine.apply affine_map<(d0) -> (d0 * 8 + 2)>(%arg3)
%11 = memref.load %alloca_0[%arg2, %c0, %10, %arg4] : memref<8x1x32x64xf32>
%12 = vector.broadcast %11 : f32 to vector<1xf32>
%13 = affine.apply affine_map<(d0) -> (d0 * 8 + 3)>(%arg3)
%14 = memref.load %alloca_0[%arg2, %c0, %13, %arg4] : memref<8x1x32x64xf32>
%15 = vector.broadcast %14 : f32 to vector<1xf32>
%16 = affine.apply affine_map<(d0) -> (d0 * 8 + 4)>(%arg3)
%17 = memref.load %alloca_0[%arg2, %c0, %16, %arg4] : memref<8x1x32x64xf32>
%18 = vector.broadcast %17 : f32 to vector<1xf32>
%19 = affine.apply affine_map<(d0) -> (d0 * 8 + 5)>(%arg3)
%20 = memref.load %alloca_0[%arg2, %c0, %19, %arg4] : memref<8x1x32x64xf32>
%21 = vector.broadcast %20 : f32 to vector<1xf32>
%22 = affine.apply affine_map<(d0) -> (d0 * 8 + 6)>(%arg3)
%23 = memref.load %alloca_0[%arg2, %c0, %22, %arg4] : memref<8x1x32x64xf32>
%24 = vector.broadcast %23 : f32 to vector<1xf32>
%25 = affine.apply affine_map<(d0) -> (d0 * 8 + 7)>(%arg3)
%26 = memref.load %alloca_0[%arg2, %c0, %25, %arg4] : memref<8x1x32x64xf32>
%27 = vector.broadcast %26 : f32 to vector<1xf32>
%28 = vector.insert_strided_slice %6, %cst {offsets = [0], strides = [1]} : vector<1xf32> into vector<8xf32>
%29 = vector.insert_strided_slice %9, %28 {offsets = [1], strides = [1]} : vector<1xf32> into vector<8xf32>
%30 = vector.insert_strided_slice %12, %29 {offsets = [2], strides = [1]} : vector<1xf32> into vector<8xf32>
%31 = vector.insert_strided_slice %15, %30 {offsets = [3], strides = [1]} : vector<1xf32> into vector<8xf32>
%32 = vector.insert_strided_slice %18, %31 {offsets = [4], strides = [1]} : vector<1xf32> into vector<8xf32>
%33 = vector.insert_strided_slice %21, %32 {offsets = [5], strides = [1]} : vector<1xf32> into vector<8xf32>
%34 = vector.insert_strided_slice %24, %33 {offsets = [6], strides = [1]} : vector<1xf32> into vector<8xf32>
%35 = vector.insert_strided_slice %27, %34 {offsets = [7], strides = [1]} : vector<1xf32> into vector<8xf32>
%subview_2 = memref.subview %subview[0, 0, 0, 0, 0, 0] [8, 4, 1, 64, 8, 1] [1, 1, 1, 1, 1, 1] : memref<8x4x1x64x8x1xf32, strided<[98304, 6144, 512, 8, 1, 1], offset: ?>> to memref<8x4x1x64x8xf32, strided<[98304, 6144, 512, 8, 1], offset: ?>>
vector.store %35, %subview_2[%arg2, %arg3, %c0, %arg4, %c0] : memref<8x4x1x64x8xf32, strided<[98304, 6144, 512, 8, 1], offset: ?>>, vector<8xf32>
}
}
}
}
}
return
}
// -----// IR Dump After TranslateTargetExecutableVariantsPass Failed (iree-hal-translate-target-executable-variants) //----- //
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
hal.executable.export public @torch_jit$async_dispatch_3_unpack_transpose_8x128x12x64_f32_pack ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} {
^bb0(%arg0: !hal.device):
%c12 = arith.constant 12 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
hal.return %c12, %c4, %c1 : index, index, index
}
builtin.module {
func.func @torch_jit$async_dispatch_3_unpack_transpose_8x128x12x64_f32_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
%cst = arith.constant dense<0.000000e+00> : vector<8xf32>
%c7 = arith.constant 7 : index
%c6 = arith.constant 6 : index
%c5 = arith.constant 5 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c9437184 = arith.constant 9437184 : index
%c16 = arith.constant 16 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x8x4xf32>
%alloca_0 = memref.alloca() {alignment = 64 : i64} : memref<8x1x32x64xf32>
%0 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c9437184) flags("ReadOnly|Indirect") : memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>
memref.assume_alignment %0, 64 : memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>
%1 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags(Indirect) : memref<8x16x12x64x8x1xf32>
memref.assume_alignment %1, 64 : memref<8x16x12x64x8x1xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg0 = %2 to %c16 step %3 {
scf.for %arg1 = %workgroup_id_x to %c12 step %workgroup_count_x {
%subview = memref.subview %1[0, %arg0, %arg1, 0, 0, 0] [8, 4, 1, 64, 8, 1] [1, 1, 1, 1, 1, 1] : memref<8x16x12x64x8x1xf32> to memref<8x4x1x64x8x1xf32, strided<[98304, 6144, 512, 8, 1, 1], offset: ?>>
%subview_1 = memref.subview %0[0, %arg1, %arg0, 0, 0, 0] [8, 1, 4, 16, 8, 4] [1, 1, 1, 1, 1, 1] : memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>> to memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>
scf.for %arg2 = %c0 to %c8 step %c1 {
scf.for %arg3 = %c0 to %c32 step %c1 {
%4 = affine.apply affine_map<(d0) -> (d0 floordiv 8)>(%arg3)
%5 = affine.apply affine_map<(d0) -> (d0 mod 8)>(%arg3)
scf.for %arg4 = %c0 to %c64 step %c1 {
%6 = affine.apply affine_map<(d0) -> (d0 floordiv 4)>(%arg4)
%7 = affine.apply affine_map<(d0) -> (d0 mod 4)>(%arg4)
%8 = vector.load %subview_1[%arg2, %c0, %4, %6, %c0, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%9 = vector.load %subview_1[%arg2, %c0, %4, %6, %c1, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%10 = vector.load %subview_1[%arg2, %c0, %4, %6, %c2, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%11 = vector.load %subview_1[%arg2, %c0, %4, %6, %c3, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%12 = vector.load %subview_1[%arg2, %c0, %4, %6, %c4, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%13 = vector.load %subview_1[%arg2, %c0, %4, %6, %c5, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%14 = vector.load %subview_1[%arg2, %c0, %4, %6, %c6, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%15 = vector.load %subview_1[%arg2, %c0, %4, %6, %c7, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 8, 4] [1, 1, 1, 1] : memref<1x1x8x4xf32> to memref<8x4xf32>
vector.store %8, %subview_2[%c0, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %9, %subview_2[%c1, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %10, %subview_2[%c2, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %11, %subview_2[%c3, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %12, %subview_2[%c4, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %13, %subview_2[%c5, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %14, %subview_2[%c6, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %15, %subview_2[%c7, %c0] : memref<8x4xf32>, vector<4xf32>
%subview_3 = memref.subview %alloca[0, 0, %5, %7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x8x4xf32> to memref<1x1x1x1xf32, strided<[32, 32, 4, 1], offset: ?>>
%subview_4 = memref.subview %alloca_0[%arg2, 0, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<8x1x32x64xf32> to memref<1x1x1x1xf32, strided<[2048, 2048, 64, 1], offset: ?>>
%16 = memref.load %subview_3[%c0, %c0, %c0, %c0] : memref<1x1x1x1xf32, strided<[32, 32, 4, 1], offset: ?>>
memref.store %16, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x1x1xf32, strided<[2048, 2048, 64, 1], offset: ?>>
}
}
}
scf.for %arg2 = %c0 to %c8 step %c1 {
scf.for %arg3 = %c0 to %c4 step %c1 {
scf.for %arg4 = %c0 to %c64 step %c1 {
%4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg3)
%5 = memref.load %alloca_0[%arg2, %c0, %4, %arg4] : memref<8x1x32x64xf32>
%6 = vector.broadcast %5 : f32 to vector<1xf32>
%7 = affine.apply affine_map<(d0) -> (d0 * 8 + 1)>(%arg3)
%8 = memref.load %alloca_0[%arg2, %c0, %7, %arg4] : memref<8x1x32x64xf32>
%9 = vector.broadcast %8 : f32 to vector<1xf32>
%10 = affine.apply affine_map<(d0) -> (d0 * 8 + 2)>(%arg3)
%11 = memref.load %alloca_0[%arg2, %c0, %10, %arg4] : memref<8x1x32x64xf32>
%12 = vector.broadcast %11 : f32 to vector<1xf32>
%13 = affine.apply affine_map<(d0) -> (d0 * 8 + 3)>(%arg3)
%14 = memref.load %alloca_0[%arg2, %c0, %13, %arg4] : memref<8x1x32x64xf32>
%15 = vector.broadcast %14 : f32 to vector<1xf32>
%16 = affine.apply affine_map<(d0) -> (d0 * 8 + 4)>(%arg3)
%17 = memref.load %alloca_0[%arg2, %c0, %16, %arg4] : memref<8x1x32x64xf32>
%18 = vector.broadcast %17 : f32 to vector<1xf32>
%19 = affine.apply affine_map<(d0) -> (d0 * 8 + 5)>(%arg3)
%20 = memref.load %alloca_0[%arg2, %c0, %19, %arg4] : memref<8x1x32x64xf32>
%21 = vector.broadcast %20 : f32 to vector<1xf32>
%22 = affine.apply affine_map<(d0) -> (d0 * 8 + 6)>(%arg3)
%23 = memref.load %alloca_0[%arg2, %c0, %22, %arg4] : memref<8x1x32x64xf32>
%24 = vector.broadcast %23 : f32 to vector<1xf32>
%25 = affine.apply affine_map<(d0) -> (d0 * 8 + 7)>(%arg3)
%26 = memref.load %alloca_0[%arg2, %c0, %25, %arg4] : memref<8x1x32x64xf32>
%27 = vector.broadcast %26 : f32 to vector<1xf32>
%28 = vector.insert_strided_slice %6, %cst {offsets = [0], strides = [1]} : vector<1xf32> into vector<8xf32>
%29 = vector.insert_strided_slice %9, %28 {offsets = [1], strides = [1]} : vector<1xf32> into vector<8xf32>
%30 = vector.insert_strided_slice %12, %29 {offsets = [2], strides = [1]} : vector<1xf32> into vector<8xf32>
%31 = vector.insert_strided_slice %15, %30 {offsets = [3], strides = [1]} : vector<1xf32> into vector<8xf32>
%32 = vector.insert_strided_slice %18, %31 {offsets = [4], strides = [1]} : vector<1xf32> into vector<8xf32>
%33 = vector.insert_strided_slice %21, %32 {offsets = [5], strides = [1]} : vector<1xf32> into vector<8xf32>
%34 = vector.insert_strided_slice %24, %33 {offsets = [6], strides = [1]} : vector<1xf32> into vector<8xf32>
%35 = vector.insert_strided_slice %27, %34 {offsets = [7], strides = [1]} : vector<1xf32> into vector<8xf32>
%subview_2 = memref.subview %subview[0, 0, 0, 0, 0, 0] [8, 4, 1, 64, 8, 1] [1, 1, 1, 1, 1, 1] : memref<8x4x1x64x8x1xf32, strided<[98304, 6144, 512, 8, 1, 1], offset: ?>> to memref<8x4x1x64x8xf32, strided<[98304, 6144, 512, 8, 1], offset: ?>>
vector.store %35, %subview_2[%arg2, %arg3, %c0, %arg4, %c0] : memref<8x4x1x64x8xf32, strided<[98304, 6144, 512, 8, 1], offset: ?>>, vector<8xf32>
}
}
}
}
}
return
}
}
}
failed to translate executables
// -----// IR Dump After TranslateExecutablesPass Failed (iree-hal-translate-executables) //----- //
hal.executable private @torch_jit$async_dispatch_3 {
hal.executable.variant public @embedded_elf_x86_64 target(<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>) {
hal.executable.export public @torch_jit$async_dispatch_3_unpack_transpose_8x128x12x64_f32_pack ordinal(0) layout(#hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>) attributes {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>]} {
^bb0(%arg0: !hal.device):
%c12 = arith.constant 12 : index
%c4 = arith.constant 4 : index
%c1 = arith.constant 1 : index
hal.return %c12, %c4, %c1 : index, index, index
}
builtin.module {
func.func @torch_jit$async_dispatch_3_unpack_transpose_8x128x12x64_f32_pack() attributes {translation_info = #iree_codegen.translation_info<CPUDataTiling>} {
%cst = arith.constant dense<0.000000e+00> : vector<8xf32>
%c7 = arith.constant 7 : index
%c6 = arith.constant 6 : index
%c5 = arith.constant 5 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c0 = arith.constant 0 : index
%c9437184 = arith.constant 9437184 : index
%c16 = arith.constant 16 : index
%c12 = arith.constant 12 : index
%c8 = arith.constant 8 : index
%c1 = arith.constant 1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c4 = arith.constant 4 : index
%alloca = memref.alloca() {alignment = 64 : i64} : memref<1x1x8x4xf32>
%alloca_0 = memref.alloca() {alignment = 64 : i64} : memref<8x1x32x64xf32>
%0 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(0) alignment(64) offset(%c9437184) flags("ReadOnly|Indirect") : memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>
memref.assume_alignment %0, 64 : memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>
%1 = hal.interface.binding.subspan layout(<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>) set(0) binding(1) alignment(64) offset(%c0) flags(Indirect) : memref<8x16x12x64x8x1xf32>
memref.assume_alignment %1, 64 : memref<8x16x12x64x8x1xf32>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%2 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_id_y]
%3 = affine.apply affine_map<()[s0] -> (s0 * 4)>()[%workgroup_count_y]
scf.for %arg0 = %2 to %c16 step %3 {
scf.for %arg1 = %workgroup_id_x to %c12 step %workgroup_count_x {
%subview = memref.subview %1[0, %arg0, %arg1, 0, 0, 0] [8, 4, 1, 64, 8, 1] [1, 1, 1, 1, 1, 1] : memref<8x16x12x64x8x1xf32> to memref<8x4x1x64x8x1xf32, strided<[98304, 6144, 512, 8, 1, 1], offset: ?>>
%subview_1 = memref.subview %0[0, %arg1, %arg0, 0, 0, 0] [8, 1, 4, 16, 8, 4] [1, 1, 1, 1, 1, 1] : memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>> to memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>
scf.for %arg2 = %c0 to %c8 step %c1 {
scf.for %arg3 = %c0 to %c32 step %c1 {
%4 = affine.apply affine_map<(d0) -> (d0 floordiv 8)>(%arg3)
%5 = affine.apply affine_map<(d0) -> (d0 mod 8)>(%arg3)
scf.for %arg4 = %c0 to %c64 step %c1 {
%6 = affine.apply affine_map<(d0) -> (d0 floordiv 4)>(%arg4)
%7 = affine.apply affine_map<(d0) -> (d0 mod 4)>(%arg4)
%8 = vector.load %subview_1[%arg2, %c0, %4, %6, %c0, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%9 = vector.load %subview_1[%arg2, %c0, %4, %6, %c1, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%10 = vector.load %subview_1[%arg2, %c0, %4, %6, %c2, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%11 = vector.load %subview_1[%arg2, %c0, %4, %6, %c3, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%12 = vector.load %subview_1[%arg2, %c0, %4, %6, %c4, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%13 = vector.load %subview_1[%arg2, %c0, %4, %6, %c5, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%14 = vector.load %subview_1[%arg2, %c0, %4, %6, %c6, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%15 = vector.load %subview_1[%arg2, %c0, %4, %6, %c7, %c0] : memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, vector<4xf32>
%subview_2 = memref.subview %alloca[0, 0, 0, 0] [1, 1, 8, 4] [1, 1, 1, 1] : memref<1x1x8x4xf32> to memref<8x4xf32>
vector.store %8, %subview_2[%c0, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %9, %subview_2[%c1, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %10, %subview_2[%c2, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %11, %subview_2[%c3, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %12, %subview_2[%c4, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %13, %subview_2[%c5, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %14, %subview_2[%c6, %c0] : memref<8x4xf32>, vector<4xf32>
vector.store %15, %subview_2[%c7, %c0] : memref<8x4xf32>, vector<4xf32>
%subview_3 = memref.subview %alloca[0, 0, %5, %7] [1, 1, 1, 1] [1, 1, 1, 1] : memref<1x1x8x4xf32> to memref<1x1x1x1xf32, strided<[32, 32, 4, 1], offset: ?>>
%subview_4 = memref.subview %alloca_0[%arg2, 0, %arg3, %arg4] [1, 1, 1, 1] [1, 1, 1, 1] : memref<8x1x32x64xf32> to memref<1x1x1x1xf32, strided<[2048, 2048, 64, 1], offset: ?>>
%16 = memref.load %subview_3[%c0, %c0, %c0, %c0] : memref<1x1x1x1xf32, strided<[32, 32, 4, 1], offset: ?>>
memref.store %16, %subview_4[%c0, %c0, %c0, %c0] : memref<1x1x1x1xf32, strided<[2048, 2048, 64, 1], offset: ?>>
}
}
}
scf.for %arg2 = %c0 to %c8 step %c1 {
scf.for %arg3 = %c0 to %c4 step %c1 {
scf.for %arg4 = %c0 to %c64 step %c1 {
%4 = affine.apply affine_map<(d0) -> (d0 * 8)>(%arg3)
%5 = memref.load %alloca_0[%arg2, %c0, %4, %arg4] : memref<8x1x32x64xf32>
%6 = vector.broadcast %5 : f32 to vector<1xf32>
%7 = affine.apply affine_map<(d0) -> (d0 * 8 + 1)>(%arg3)
%8 = memref.load %alloca_0[%arg2, %c0, %7, %arg4] : memref<8x1x32x64xf32>
%9 = vector.broadcast %8 : f32 to vector<1xf32>
%10 = affine.apply affine_map<(d0) -> (d0 * 8 + 2)>(%arg3)
%11 = memref.load %alloca_0[%arg2, %c0, %10, %arg4] : memref<8x1x32x64xf32>
%12 = vector.broadcast %11 : f32 to vector<1xf32>
%13 = affine.apply affine_map<(d0) -> (d0 * 8 + 3)>(%arg3)
%14 = memref.load %alloca_0[%arg2, %c0, %13, %arg4] : memref<8x1x32x64xf32>
%15 = vector.broadcast %14 : f32 to vector<1xf32>
%16 = affine.apply affine_map<(d0) -> (d0 * 8 + 4)>(%arg3)
%17 = memref.load %alloca_0[%arg2, %c0, %16, %arg4] : memref<8x1x32x64xf32>
%18 = vector.broadcast %17 : f32 to vector<1xf32>
%19 = affine.apply affine_map<(d0) -> (d0 * 8 + 5)>(%arg3)
%20 = memref.load %alloca_0[%arg2, %c0, %19, %arg4] : memref<8x1x32x64xf32>
%21 = vector.broadcast %20 : f32 to vector<1xf32>
%22 = affine.apply affine_map<(d0) -> (d0 * 8 + 6)>(%arg3)
%23 = memref.load %alloca_0[%arg2, %c0, %22, %arg4] : memref<8x1x32x64xf32>
%24 = vector.broadcast %23 : f32 to vector<1xf32>
%25 = affine.apply affine_map<(d0) -> (d0 * 8 + 7)>(%arg3)
%26 = memref.load %alloca_0[%arg2, %c0, %25, %arg4] : memref<8x1x32x64xf32>
%27 = vector.broadcast %26 : f32 to vector<1xf32>
%28 = vector.insert_strided_slice %6, %cst {offsets = [0], strides = [1]} : vector<1xf32> into vector<8xf32>
%29 = vector.insert_strided_slice %9, %28 {offsets = [1], strides = [1]} : vector<1xf32> into vector<8xf32>
%30 = vector.insert_strided_slice %12, %29 {offsets = [2], strides = [1]} : vector<1xf32> into vector<8xf32>
%31 = vector.insert_strided_slice %15, %30 {offsets = [3], strides = [1]} : vector<1xf32> into vector<8xf32>
%32 = vector.insert_strided_slice %18, %31 {offsets = [4], strides = [1]} : vector<1xf32> into vector<8xf32>
%33 = vector.insert_strided_slice %21, %32 {offsets = [5], strides = [1]} : vector<1xf32> into vector<8xf32>
%34 = vector.insert_strided_slice %24, %33 {offsets = [6], strides = [1]} : vector<1xf32> into vector<8xf32>
%35 = vector.insert_strided_slice %27, %34 {offsets = [7], strides = [1]} : vector<1xf32> into vector<8xf32>
%subview_2 = memref.subview %subview[0, 0, 0, 0, 0, 0] [8, 4, 1, 64, 8, 1] [1, 1, 1, 1, 1, 1] : memref<8x4x1x64x8x1xf32, strided<[98304, 6144, 512, 8, 1, 1], offset: ?>> to memref<8x4x1x64x8xf32, strided<[98304, 6144, 512, 8, 1], offset: ?>>
vector.store %35, %subview_2[%arg2, %arg3, %c0, %arg4, %c0] : memref<8x4x1x64x8xf32, strided<[98304, 6144, 512, 8, 1], offset: ?>>, vector<8xf32>
}
}
}
}
}
return
}
}
}
}
model.torch_onnx.mlir:7:12: error: 'func.func' op exceeded stack allocation limit of 32768 bytes for function. Got 65664 bytes
%262 = torch.operator "onnx.MatMul"(%261, %arg5) : (!torch.vtensor<[8,128,768],f32>, !torch.vtensor<[768,768],f32>) -> !torch.vtensor<[8,128,768],f32>
^
model.torch_onnx.mlir:7:12: note: see current operation:
"func.func"() <{function_type = () -> (), sym_name = "torch_jit$async_dispatch_3_unpack_transpose_8x128x12x64_f32_pack"}> ({
%0 = "arith.constant"() <{value = dense<0.000000e+00> : vector<8xf32>}> : () -> vector<8xf32>
%1 = "arith.constant"() <{value = 7 : index}> : () -> index
%2 = "arith.constant"() <{value = 6 : index}> : () -> index
%3 = "arith.constant"() <{value = 5 : index}> : () -> index
%4 = "arith.constant"() <{value = 3 : index}> : () -> index
%5 = "arith.constant"() <{value = 2 : index}> : () -> index
%6 = "arith.constant"() <{value = 0 : index}> : () -> index
%7 = "arith.constant"() <{value = 9437184 : index}> : () -> index
%8 = "arith.constant"() <{value = 16 : index}> : () -> index
%9 = "arith.constant"() <{value = 12 : index}> : () -> index
%10 = "arith.constant"() <{value = 8 : index}> : () -> index
%11 = "arith.constant"() <{value = 1 : index}> : () -> index
%12 = "arith.constant"() <{value = 32 : index}> : () -> index
%13 = "arith.constant"() <{value = 64 : index}> : () -> index
%14 = "arith.constant"() <{value = 4 : index}> : () -> index
%15 = "memref.alloca"() <{alignment = 64 : i64, operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x8x4xf32>
%16 = "memref.alloca"() <{alignment = 64 : i64, operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<8x1x32x64xf32>
%17 = "hal.interface.binding.subspan"(%7) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>
"memref.assume_alignment"(%17) <{alignment = 64 : i32}> : (memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>) -> ()
%18 = "hal.interface.binding.subspan"(%6) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<8x16x12x64x8x1xf32>
"memref.assume_alignment"(%18) <{alignment = 64 : i32}> : (memref<8x16x12x64x8x1xf32>) -> ()
%19 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
%20 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
%21 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
%22 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index
%23 = "affine.apply"(%21) <{map = affine_map<()[s0] -> (s0 * 4)>}> : (index) -> index
%24 = "affine.apply"(%22) <{map = affine_map<()[s0] -> (s0 * 4)>}> : (index) -> index
"scf.for"(%23, %8, %24) ({
^bb0(%arg0: index):
"scf.for"(%19, %9, %20) ({
^bb0(%arg1: index):
%25 = "memref.subview"(%18, %arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 8, 4, 1, 64, 8, 1>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<8x16x12x64x8x1xf32>, index, index) -> memref<8x4x1x64x8x1xf32, strided<[98304, 6144, 512, 8, 1, 1], offset: ?>>
%26 = "memref.subview"(%17, %arg1, %arg0) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 8, 1, 4, 16, 8, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>, index, index) -> memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>
"scf.for"(%6, %10, %11) ({
^bb0(%arg5: index):
"scf.for"(%6, %12, %11) ({
^bb0(%arg6: index):
%60 = "affine.apply"(%arg6) <{map = affine_map<(d0) -> (d0 floordiv 8)>}> : (index) -> index
%61 = "affine.apply"(%arg6) <{map = affine_map<(d0) -> (d0 mod 8)>}> : (index) -> index
"scf.for"(%6, %13, %11) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 floordiv 4)>}> : (index) -> index
%63 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 mod 4)>}> : (index) -> index
%64 = "vector.load"(%26, %arg5, %6, %60, %62, %6, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
%65 = "vector.load"(%26, %arg5, %6, %60, %62, %11, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
%66 = "vector.load"(%26, %arg5, %6, %60, %62, %5, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
%67 = "vector.load"(%26, %arg5, %6, %60, %62, %4, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
%68 = "vector.load"(%26, %arg5, %6, %60, %62, %14, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
%69 = "vector.load"(%26, %arg5, %6, %60, %62, %3, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
%70 = "vector.load"(%26, %arg5, %6, %60, %62, %2, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
%71 = "vector.load"(%26, %arg5, %6, %60, %62, %1, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
%72 = "memref.subview"(%15) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0, 0, 0>, static_sizes = array<i64: 1, 1, 8, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<1x1x8x4xf32>) -> memref<8x4xf32>
"vector.store"(%64, %72, %6, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
"vector.store"(%65, %72, %11, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
"vector.store"(%66, %72, %5, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
"vector.store"(%67, %72, %4, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
"vector.store"(%68, %72, %14, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
"vector.store"(%69, %72, %3, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
"vector.store"(%70, %72, %2, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
"vector.store"(%71, %72, %1, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
%73 = "memref.subview"(%15, %61, %63) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: 1, 1, 1, 1>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<1x1x8x4xf32>, index, index) -> memref<1x1x1x1xf32, strided<[32, 32, 4, 1], offset: ?>>
%74 = "memref.subview"(%16, %arg5, %arg6, %arg7) <{operandSegmentSizes = array<i32: 1, 3, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: 1, 1, 1, 1>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<8x1x32x64xf32>, index, index, index) -> memref<1x1x1x1xf32, strided<[2048, 2048, 64, 1], offset: ?>>
%75 = "memref.load"(%73, %6, %6, %6, %6) <{nontemporal = false}> : (memref<1x1x1x1xf32, strided<[32, 32, 4, 1], offset: ?>>, index, index, index, index) -> f32
"memref.store"(%75, %74, %6, %6, %6, %6) <{nontemporal = false}> : (f32, memref<1x1x1x1xf32, strided<[2048, 2048, 64, 1], offset: ?>>, index, index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.for"(%6, %10, %11) ({
^bb0(%arg2: index):
"scf.for"(%6, %14, %11) ({
^bb0(%arg3: index):
"scf.for"(%6, %13, %11) ({
^bb0(%arg4: index):
%27 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8)>}> : (index) -> index
%28 = "memref.load"(%16, %arg2, %6, %27, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
%29 = "vector.broadcast"(%28) : (f32) -> vector<1xf32>
%30 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 1)>}> : (index) -> index
%31 = "memref.load"(%16, %arg2, %6, %30, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
%32 = "vector.broadcast"(%31) : (f32) -> vector<1xf32>
%33 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 2)>}> : (index) -> index
%34 = "memref.load"(%16, %arg2, %6, %33, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
%35 = "vector.broadcast"(%34) : (f32) -> vector<1xf32>
%36 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 3)>}> : (index) -> index
%37 = "memref.load"(%16, %arg2, %6, %36, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
%38 = "vector.broadcast"(%37) : (f32) -> vector<1xf32>
%39 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 4)>}> : (index) -> index
%40 = "memref.load"(%16, %arg2, %6, %39, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
%41 = "vector.broadcast"(%40) : (f32) -> vector<1xf32>
%42 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 5)>}> : (index) -> index
%43 = "memref.load"(%16, %arg2, %6, %42, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
%44 = "vector.broadcast"(%43) : (f32) -> vector<1xf32>
%45 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 6)>}> : (index) -> index
%46 = "memref.load"(%16, %arg2, %6, %45, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
%47 = "vector.broadcast"(%46) : (f32) -> vector<1xf32>
%48 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 7)>}> : (index) -> index
%49 = "memref.load"(%16, %arg2, %6, %48, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
%50 = "vector.broadcast"(%49) : (f32) -> vector<1xf32>
%51 = "vector.insert_strided_slice"(%29, %0) <{offsets = [0], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
%52 = "vector.insert_strided_slice"(%32, %51) <{offsets = [1], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
%53 = "vector.insert_strided_slice"(%35, %52) <{offsets = [2], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
%54 = "vector.insert_strided_slice"(%38, %53) <{offsets = [3], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
%55 = "vector.insert_strided_slice"(%41, %54) <{offsets = [4], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
%56 = "vector.insert_strided_slice"(%44, %55) <{offsets = [5], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
%57 = "vector.insert_strided_slice"(%47, %56) <{offsets = [6], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
%58 = "vector.insert_strided_slice"(%50, %57) <{offsets = [7], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
%59 = "memref.subview"(%25) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0, 0, 0, 0, 0>, static_sizes = array<i64: 8, 4, 1, 64, 8, 1>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<8x4x1x64x8x1xf32, strided<[98304, 6144, 512, 8, 1, 1], offset: ?>>) -> memref<8x4x1x64x8xf32, strided<[98304, 6144, 512, 8, 1], offset: ?>>
"vector.store"(%58, %59, %arg2, %arg3, %6, %arg4, %6) <{nontemporal = false}> : (vector<8xf32>, memref<8x4x1x64x8xf32, strided<[98304, 6144, 512, 8, 1], offset: ?>>, index, index, index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<CPUDataTiling>} : () -> ()
model.torch_onnx.mlir:7:12: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>
%262 = torch.operator "onnx.MatMul"(%261, %arg5) : (!torch.vtensor<[8,128,768],f32>, !torch.vtensor<[768,768],f32>) -> !torch.vtensor<[8,128,768],f32>
^
model.torch_onnx.mlir:7:12: note: see current operation:
"hal.executable.variant"() ({
"hal.executable.export"() ({
^bb0(%arg8: !hal.device):
%76 = "arith.constant"() <{value = 12 : index}> : () -> index
%77 = "arith.constant"() <{value = 4 : index}> : () -> index
%78 = "arith.constant"() <{value = 1 : index}> : () -> index
"hal.return"(%76, %77, %78) : (index, index, index) -> ()
}) {hal.interface.bindings = [#hal.interface.binding<0, 0>, #hal.interface.binding<0, 1>], layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>, ordinal = 0 : index, sym_name = "torch_jit$async_dispatch_3_unpack_transpose_8x128x12x64_f32_pack"} : () -> ()
"builtin.module"() ({
"func.func"() <{function_type = () -> (), sym_name = "torch_jit$async_dispatch_3_unpack_transpose_8x128x12x64_f32_pack"}> ({
%0 = "arith.constant"() <{value = dense<0.000000e+00> : vector<8xf32>}> : () -> vector<8xf32>
%1 = "arith.constant"() <{value = 7 : index}> : () -> index
%2 = "arith.constant"() <{value = 6 : index}> : () -> index
%3 = "arith.constant"() <{value = 5 : index}> : () -> index
%4 = "arith.constant"() <{value = 3 : index}> : () -> index
%5 = "arith.constant"() <{value = 2 : index}> : () -> index
%6 = "arith.constant"() <{value = 0 : index}> : () -> index
%7 = "arith.constant"() <{value = 9437184 : index}> : () -> index
%8 = "arith.constant"() <{value = 16 : index}> : () -> index
%9 = "arith.constant"() <{value = 12 : index}> : () -> index
%10 = "arith.constant"() <{value = 8 : index}> : () -> index
%11 = "arith.constant"() <{value = 1 : index}> : () -> index
%12 = "arith.constant"() <{value = 32 : index}> : () -> index
%13 = "arith.constant"() <{value = 64 : index}> : () -> index
%14 = "arith.constant"() <{value = 4 : index}> : () -> index
%15 = "memref.alloca"() <{alignment = 64 : i64, operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<1x1x8x4xf32>
%16 = "memref.alloca"() <{alignment = 64 : i64, operandSegmentSizes = array<i32: 0, 0>}> : () -> memref<8x1x32x64xf32>
%17 = "hal.interface.binding.subspan"(%7) {alignment = 64 : index, binding = 0 : index, descriptor_flags = 3 : i32, layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>
"memref.assume_alignment"(%17) <{alignment = 64 : i32}> : (memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>) -> ()
%18 = "hal.interface.binding.subspan"(%6) {alignment = 64 : index, binding = 1 : index, descriptor_flags = 2 : i32, layout = #hal.pipeline.layout<push_constants = 0, sets = [<0, bindings = [<0, storage_buffer, "ReadOnly|Indirect">, <1, storage_buffer, Indirect>], flags = Indirect>]>, operandSegmentSizes = array<i32: 1, 0>, set = 0 : index} : (index) -> memref<8x16x12x64x8x1xf32>
"memref.assume_alignment"(%18) <{alignment = 64 : i32}> : (memref<8x16x12x64x8x1xf32>) -> ()
%19 = "hal.interface.workgroup.id"() {dimension = 0 : index} : () -> index
%20 = "hal.interface.workgroup.count"() {dimension = 0 : index} : () -> index
%21 = "hal.interface.workgroup.id"() {dimension = 1 : index} : () -> index
%22 = "hal.interface.workgroup.count"() {dimension = 1 : index} : () -> index
%23 = "affine.apply"(%21) <{map = affine_map<()[s0] -> (s0 * 4)>}> : (index) -> index
%24 = "affine.apply"(%22) <{map = affine_map<()[s0] -> (s0 * 4)>}> : (index) -> index
"scf.for"(%23, %8, %24) ({
^bb0(%arg0: index):
"scf.for"(%19, %9, %20) ({
^bb0(%arg1: index):
%25 = "memref.subview"(%18, %arg0, %arg1) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 8, 4, 1, 64, 8, 1>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<8x16x12x64x8x1xf32>, index, index) -> memref<8x4x1x64x8x1xf32, strided<[98304, 6144, 512, 8, 1, 1], offset: ?>>
%26 = "memref.subview"(%17, %arg1, %arg0) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, -9223372036854775808, -9223372036854775808, 0, 0, 0>, static_sizes = array<i64: 8, 1, 4, 16, 8, 4>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<8x12x16x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: 2359296>>, index, index) -> memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>
"scf.for"(%6, %10, %11) ({
^bb0(%arg5: index):
"scf.for"(%6, %12, %11) ({
^bb0(%arg6: index):
%60 = "affine.apply"(%arg6) <{map = affine_map<(d0) -> (d0 floordiv 8)>}> : (index) -> index
%61 = "affine.apply"(%arg6) <{map = affine_map<(d0) -> (d0 mod 8)>}> : (index) -> index
"scf.for"(%6, %13, %11) ({
^bb0(%arg7: index):
%62 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 floordiv 4)>}> : (index) -> index
%63 = "affine.apply"(%arg7) <{map = affine_map<(d0) -> (d0 mod 4)>}> : (index) -> index
%64 = "vector.load"(%26, %arg5, %6, %60, %62, %6, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
%65 = "vector.load"(%26, %arg5, %6, %60, %62, %11, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
%66 = "vector.load"(%26, %arg5, %6, %60, %62, %5, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
%67 = "vector.load"(%26, %arg5, %6, %60, %62, %4, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
%68 = "vector.load"(%26, %arg5, %6, %60, %62, %14, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
%69 = "vector.load"(%26, %arg5, %6, %60, %62, %3, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
%70 = "vector.load"(%26, %arg5, %6, %60, %62, %2, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
%71 = "vector.load"(%26, %arg5, %6, %60, %62, %1, %6) <{nontemporal = false}> : (memref<8x1x4x16x8x4xf32, strided<[98304, 8192, 512, 32, 4, 1], offset: ?>>, index, index, index, index, index, index) -> vector<4xf32>
%72 = "memref.subview"(%15) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0, 0, 0>, static_sizes = array<i64: 1, 1, 8, 4>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<1x1x8x4xf32>) -> memref<8x4xf32>
"vector.store"(%64, %72, %6, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
"vector.store"(%65, %72, %11, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
"vector.store"(%66, %72, %5, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
"vector.store"(%67, %72, %4, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
"vector.store"(%68, %72, %14, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
"vector.store"(%69, %72, %3, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
"vector.store"(%70, %72, %2, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
"vector.store"(%71, %72, %1, %6) <{nontemporal = false}> : (vector<4xf32>, memref<8x4xf32>, index, index) -> ()
%73 = "memref.subview"(%15, %61, %63) <{operandSegmentSizes = array<i32: 1, 2, 0, 0>, static_offsets = array<i64: 0, 0, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: 1, 1, 1, 1>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<1x1x8x4xf32>, index, index) -> memref<1x1x1x1xf32, strided<[32, 32, 4, 1], offset: ?>>
%74 = "memref.subview"(%16, %arg5, %arg6, %arg7) <{operandSegmentSizes = array<i32: 1, 3, 0, 0>, static_offsets = array<i64: -9223372036854775808, 0, -9223372036854775808, -9223372036854775808>, static_sizes = array<i64: 1, 1, 1, 1>, static_strides = array<i64: 1, 1, 1, 1>}> : (memref<8x1x32x64xf32>, index, index, index) -> memref<1x1x1x1xf32, strided<[2048, 2048, 64, 1], offset: ?>>
%75 = "memref.load"(%73, %6, %6, %6, %6) <{nontemporal = false}> : (memref<1x1x1x1xf32, strided<[32, 32, 4, 1], offset: ?>>, index, index, index, index) -> f32
"memref.store"(%75, %74, %6, %6, %6, %6) <{nontemporal = false}> : (f32, memref<1x1x1x1xf32, strided<[2048, 2048, 64, 1], offset: ?>>, index, index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.for"(%6, %10, %11) ({
^bb0(%arg2: index):
"scf.for"(%6, %14, %11) ({
^bb0(%arg3: index):
"scf.for"(%6, %13, %11) ({
^bb0(%arg4: index):
%27 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8)>}> : (index) -> index
%28 = "memref.load"(%16, %arg2, %6, %27, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
%29 = "vector.broadcast"(%28) : (f32) -> vector<1xf32>
%30 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 1)>}> : (index) -> index
%31 = "memref.load"(%16, %arg2, %6, %30, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
%32 = "vector.broadcast"(%31) : (f32) -> vector<1xf32>
%33 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 2)>}> : (index) -> index
%34 = "memref.load"(%16, %arg2, %6, %33, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
%35 = "vector.broadcast"(%34) : (f32) -> vector<1xf32>
%36 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 3)>}> : (index) -> index
%37 = "memref.load"(%16, %arg2, %6, %36, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
%38 = "vector.broadcast"(%37) : (f32) -> vector<1xf32>
%39 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 4)>}> : (index) -> index
%40 = "memref.load"(%16, %arg2, %6, %39, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
%41 = "vector.broadcast"(%40) : (f32) -> vector<1xf32>
%42 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 5)>}> : (index) -> index
%43 = "memref.load"(%16, %arg2, %6, %42, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
%44 = "vector.broadcast"(%43) : (f32) -> vector<1xf32>
%45 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 6)>}> : (index) -> index
%46 = "memref.load"(%16, %arg2, %6, %45, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
%47 = "vector.broadcast"(%46) : (f32) -> vector<1xf32>
%48 = "affine.apply"(%arg3) <{map = affine_map<(d0) -> (d0 * 8 + 7)>}> : (index) -> index
%49 = "memref.load"(%16, %arg2, %6, %48, %arg4) <{nontemporal = false}> : (memref<8x1x32x64xf32>, index, index, index, index) -> f32
%50 = "vector.broadcast"(%49) : (f32) -> vector<1xf32>
%51 = "vector.insert_strided_slice"(%29, %0) <{offsets = [0], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
%52 = "vector.insert_strided_slice"(%32, %51) <{offsets = [1], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
%53 = "vector.insert_strided_slice"(%35, %52) <{offsets = [2], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
%54 = "vector.insert_strided_slice"(%38, %53) <{offsets = [3], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
%55 = "vector.insert_strided_slice"(%41, %54) <{offsets = [4], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
%56 = "vector.insert_strided_slice"(%44, %55) <{offsets = [5], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
%57 = "vector.insert_strided_slice"(%47, %56) <{offsets = [6], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
%58 = "vector.insert_strided_slice"(%50, %57) <{offsets = [7], strides = [1]}> : (vector<1xf32>, vector<8xf32>) -> vector<8xf32>
%59 = "memref.subview"(%25) <{operandSegmentSizes = array<i32: 1, 0, 0, 0>, static_offsets = array<i64: 0, 0, 0, 0, 0, 0>, static_sizes = array<i64: 8, 4, 1, 64, 8, 1>, static_strides = array<i64: 1, 1, 1, 1, 1, 1>}> : (memref<8x4x1x64x8x1xf32, strided<[98304, 6144, 512, 8, 1, 1], offset: ?>>) -> memref<8x4x1x64x8xf32, strided<[98304, 6144, 512, 8, 1], offset: ?>>
"vector.store"(%58, %59, %arg2, %arg3, %6, %arg4, %6) <{nontemporal = false}> : (vector<8xf32>, memref<8x4x1x64x8xf32, strided<[98304, 6144, 512, 8, 1], offset: ?>>, index, index, index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"scf.yield"() : () -> ()
}) : (index, index, index) -> ()
"func.return"() : () -> ()
}) {translation_info = #iree_codegen.translation_info<CPUDataTiling>} : () -> ()
}) : () -> ()
"hal.executable.variant_end"() : () -> ()
}) {sym_name = "embedded_elf_x86_64", target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>} : () -> ()
Steps to reproduce your issue
Command to reproduce:
iree-compile model.torch_onnx.mlir --iree-hal-target-backends=llvm-cpu --iree-input-demote-i64-to-i32
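If a fuller IR log is needed (the maintainers request one further down in the thread), the same command can be re-run with MLIR's IR-printing flags and the output captured from stderr, for example (a sketch; dump.log is just an illustrative file name):
iree-compile model.torch_onnx.mlir --iree-hal-target-backends=llvm-cpu --iree-input-demote-i64-to-i32 --mlir-print-ir-before-all --mlir-print-ir-after-all --mlir-disable-threading --mlir-elide-elementsattrs-if-larger=4 2> dump.log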
IREE version: IREE compiler version 20240819.990 @ aeda14995f16ed1302db616adf0c03acf80f27ee LLVM version 20.0.0git
What component(s) does this issue relate to?
Compiler
@pdhirajkumarprasad For such small examples it will also be helpful if you can attach the log produced with --mlir-print-ir-after-all --mlir-print-ir-before-all --mlir-disable-threading --mlir-elide-elementsattrs-if-larger=4 (the log can get large for a bigger program, but for a program of this size it is fine). I can redirect it easily just by looking at the log.
@pashu123 Please attach the log when you get to this.
@MaheshRavishankar See the dump file: out_mlir.txt
This is related to #18297 as well.
Confirmed the issue no longer persists on the main branch at 56ecef7f.
Verified with the latest build; I don't see this issue anymore.