
[CPU] TileRootAndFuseProducerConsumer causes redundant stack allocation

Open hanhanW opened this issue 7 months ago • 44 comments

The current implementation yields two tensors when tiling on parallel loops; see the lit test. (I will replace the link with the file after the change lands on main.)

It causes an additional stack allocation. To repro:

Run: iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))' /repro.mlir. Then you will find a memref.alloca op in the resulting IR.

#executable_target_embedded_elf_x86_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d2)>
#pipeline_layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
module {
  func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64} {
    %c0 = arith.constant 0 : index
    %c32_i64 = arith.constant 32 : i64
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
    %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
    %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
    %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
    %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
    %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
    %6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
    %7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
    %8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32
    %9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : i32
    %10 = arith.extui %0 : i32 to i64
    %11 = arith.extui %1 : i32 to i64
    %12 = arith.shli %11, %c32_i64 : i64
    %13 = arith.ori %10, %12 : i64
    %14 = arith.index_castui %13 : i64 to index
    %15 = arith.extui %2 : i32 to i64
    %16 = arith.extui %3 : i32 to i64
    %17 = arith.shli %16, %c32_i64 : i64
    %18 = arith.ori %15, %17 : i64
    %19 = arith.index_castui %18 : i64 to index
    %20 = arith.extui %4 : i32 to i64
    %21 = arith.extui %5 : i32 to i64
    %22 = arith.shli %21, %c32_i64 : i64
    %23 = arith.ori %20, %22 : i64
    %24 = arith.index_castui %23 : i64 to index
    %25 = arith.extui %6 : i32 to i64
    %26 = arith.extui %7 : i32 to i64
    %27 = arith.shli %26, %c32_i64 : i64
    %28 = arith.ori %25, %27 : i64
    %29 = arith.index_castui %28 : i64 to index
    %30 = arith.extui %8 : i32 to i64
    %31 = arith.extui %9 : i32 to i64
    %32 = arith.shli %31, %c32_i64 : i64
    %33 = arith.ori %30, %32 : i64
    %34 = arith.index_castui %33 : i64 to index
    %35:5 = util.assume.int 
        %14<umin = 0, umax = 9007199254740991>, 
        %19<umin = 0, umax = 9007199254740991>, 
        %24<umin = 0, umax = 9007199254740991>, 
        %29<umin = 0, umax = 9007199254740991>, 
        %34<umin = 0, umax = 9007199254740991>
      : index, index, index, index, index
    %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
    %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
    %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
    %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
    %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
    %41 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
    %42 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
    %43 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
    %44 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
    %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
    %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
    %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
    %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
    %49 = linalg.fill ins(%cst : f32) outs(%48 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
    %50 = linalg.mmt4d ins(%45, %46 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%49 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
    %51 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%50, %47 : tensor<?x?x16x16xf32>, tensor<?x16xf32>) outs(%48 : tensor<?x?x16x16xf32>) {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %52 = arith.addf %in, %in_0 : f32
      %53 = arith.maximumf %52, %cst : f32
      linalg.yield %53 : f32
    } -> tensor<?x?x16x16xf32>
    iree_tensor_ext.dispatch.tensor.store %51, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
    return
  }
}
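
Schematically, the problem is that after tiling on the parallel loops, the tiled loop carries one iter_arg per fused op and therefore yields two tensors, while only the last one feeds the final store. A hand-written, heavily simplified sketch of that shape (not actual compiler output; linalg.fill/linalg.copy stand in for the tiled mmt4d and bias+relu ops, and the function name is made up):

func.func @two_results_sketch(%init: tensor<8x16xf32>) -> tensor<8x16xf32> {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c8 = arith.constant 8 : index
  %cst = arith.constant 0.000000e+00 : f32
  %empty = tensor.empty() : tensor<8x16xf32>
  // One iter_arg per fused op: %acc for the producer's destination,
  // %out for the elementwise consumer's destination.
  %r:2 = scf.for %i = %c0 to %c8 step %c1 iter_args(%acc = %empty, %out = %init) -> (tensor<8x16xf32>, tensor<8x16xf32>) {
    %a = tensor.extract_slice %acc[%i, 0] [1, 16] [1, 1] : tensor<8x16xf32> to tensor<1x16xf32>
    %f = linalg.fill ins(%cst : f32) outs(%a : tensor<1x16xf32>) -> tensor<1x16xf32>
    %o = tensor.extract_slice %out[%i, 0] [1, 16] [1, 1] : tensor<8x16xf32> to tensor<1x16xf32>
    %g = linalg.copy ins(%f : tensor<1x16xf32>) outs(%o : tensor<1x16xf32>) -> tensor<1x16xf32>
    %acc_next = tensor.insert_slice %f into %acc[%i, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<8x16xf32>
    %out_next = tensor.insert_slice %g into %out[%i, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<8x16xf32>
    scf.yield %acc_next, %out_next : tensor<8x16xf32>, tensor<8x16xf32>
  }
  // Only %r#1 is used by the final store; the dead %r#0 is the kind of value
  // that bufferization turns into a memref.alloca.
  return %r#1 : tensor<8x16xf32>
}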

There is a path showing that mmt4d fusion codegen does not need any stack allocation when we use the TileAndFuse path with a preset configuration.

Replace the path with TileAndFuse in the mmt4d pipeline, and run: iree-opt --pass-pipeline='builtin.module(func.func(iree-llvmcpu-lower-executable-target))' ~/repro2.mlir

#config = #iree_codegen.lowering_config<tile_sizes = [[1, 2, 0, 0, 0, 0], [1, 1, 0, 16, 16, 0], [0, 0, 1, 0, 0, 1]]>
#config1 = #iree_codegen.lowering_config<tile_sizes = [[1, 2, 0, 0], [1, 1, 16, 16], [0, 0, 0, 0], [0, 0, 0, 0]]>
#executable_target_embedded_elf_x86_64 = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>
#map = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#map1 = affine_map<(d0, d1, d2, d3) -> (d0, d2)>
#pipeline_layout = #hal.pipeline.layout<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#translation = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>
module {
  func.func @mmt4d_bias_relu_fusion_dispatch_0_generic_DxDx16x16_f32() attributes {hal.executable.target = #executable_target_embedded_elf_x86_64, translation_info = #translation} {
    %c0 = arith.constant 0 : index
    %c32_i64 = arith.constant 32 : i64
    %cst = arith.constant 0.000000e+00 : f32
    %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
    %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
    %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
    %3 = hal.interface.constant.load layout(#pipeline_layout) ordinal(3) : i32
    %4 = hal.interface.constant.load layout(#pipeline_layout) ordinal(4) : i32
    %5 = hal.interface.constant.load layout(#pipeline_layout) ordinal(5) : i32
    %6 = hal.interface.constant.load layout(#pipeline_layout) ordinal(6) : i32
    %7 = hal.interface.constant.load layout(#pipeline_layout) ordinal(7) : i32
    %8 = hal.interface.constant.load layout(#pipeline_layout) ordinal(8) : i32
    %9 = hal.interface.constant.load layout(#pipeline_layout) ordinal(9) : i32
    %10 = arith.extui %0 : i32 to i64
    %11 = arith.extui %1 : i32 to i64
    %12 = arith.shli %11, %c32_i64 : i64
    %13 = arith.ori %10, %12 : i64
    %14 = arith.index_castui %13 : i64 to index
    %15 = arith.extui %2 : i32 to i64
    %16 = arith.extui %3 : i32 to i64
    %17 = arith.shli %16, %c32_i64 : i64
    %18 = arith.ori %15, %17 : i64
    %19 = arith.index_castui %18 : i64 to index
    %20 = arith.extui %4 : i32 to i64
    %21 = arith.extui %5 : i32 to i64
    %22 = arith.shli %21, %c32_i64 : i64
    %23 = arith.ori %20, %22 : i64
    %24 = arith.index_castui %23 : i64 to index
    %25 = arith.extui %6 : i32 to i64
    %26 = arith.extui %7 : i32 to i64
    %27 = arith.shli %26, %c32_i64 : i64
    %28 = arith.ori %25, %27 : i64
    %29 = arith.index_castui %28 : i64 to index
    %30 = arith.extui %8 : i32 to i64
    %31 = arith.extui %9 : i32 to i64
    %32 = arith.shli %31, %c32_i64 : i64
    %33 = arith.ori %30, %32 : i64
    %34 = arith.index_castui %33 : i64 to index
    %35 = iree_tensor_ext.dispatch.workload.ordinal %14, 0 : index
    %36 = iree_tensor_ext.dispatch.workload.ordinal %19, 1 : index
    %37 = iree_tensor_ext.dispatch.workload.ordinal %24, 2 : index
    %38 = iree_tensor_ext.dispatch.workload.ordinal %29, 3 : index
    %39 = iree_tensor_ext.dispatch.workload.ordinal %34, 4 : index
    %40 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%38, %35}
    %41 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
    %42 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%37}
    %43 = hal.interface.binding.subspan layout(#pipeline_layout) binding(3) alignment(64) offset(%c0) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%38, %39}
    %44 = iree_tensor_ext.dispatch.tensor.load %40, offsets = [0, 0, 0, 0], sizes = [%38, %35, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%38, %35} -> tensor<?x?x16x1xf32>
    %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
    %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0], sizes = [%37, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%37} -> tensor<?x16xf32>
    %47 = tensor.empty(%38, %39) : tensor<?x?x16x16xf32>
    %48 = linalg.fill ins(%cst : f32) outs(%47 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
    %49 = linalg.mmt4d {lowering_config = #config} ins(%44, %45 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%48 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
    %50 = linalg.generic {indexing_maps = [#map, #map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%49, %46 : tensor<?x?x16x16xf32>, tensor<?x16xf32>) outs(%47 : tensor<?x?x16x16xf32>) attrs =  {lowering_config = #config1} {
    ^bb0(%in: f32, %in_0: f32, %out: f32):
      %51 = arith.addf %in, %in_0 : f32
      %52 = arith.maximumf %51, %cst : f32
      linalg.yield %52 : f32
    } -> tensor<?x?x16x16xf32>
    iree_tensor_ext.dispatch.tensor.store %50, %43, offsets = [0, 0, 0, 0], sizes = [%38, %39, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%38, %39}
    return
  }
}

hanhanW, May 13 '25

Hi, I'm also hitting the same issue. Correct me if I'm wrong, but I believe the redundant stack buffer allocations here are an artifact of (iterative) consumer fusion generating loops with multiple results, some of which are unused; I added the IR below for this. They then get bufferized into stack allocations. Switching TileRootAndFuseProducerConsumer to TileAndFuse actually does get rid of the redundant stack buffer allocations, since - I believe - it works with producer fusion instead of producer+consumer fusion. Although, in the example below, I guess due to the unpack op in the middle and the current tile size selection mechanism, we get a rather suboptimal tiling that results in redundant computations; see the tensor<1x1x1xf32> extracted from the unpack and used for the generic.

// -----// IR Dump Before LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- //

func.func @main$async_dispatch_10_batch_matmul_transpose_b_9x140x140x64_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %cst = arith.constant 0.000000e+00 : f32 
  %c537919616 = arith.constant 537919616 : index 
  %c1622016 = arith.constant 1622016 : index 
  %c967680 = arith.constant 967680 : index 
  %c1953792 = arith.constant 1953792 : index 
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c537919616) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1x140x141xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c1622016) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<9x9x64x16x1xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c967680) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<9x9x64x16x1xf32>>
  %3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c1953792) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<9x140x140xf32>>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [9, 9, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<9x9x64x16x1xf32>> -> tensor<9x9x64x16x1xf32>
  %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0], sizes = [9, 9, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<9x9x64x16x1xf32>> -> tensor<9x9x64x16x1xf32>
  %6 = tensor.empty() : tensor<9x140x140xf32>
  %7 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 1, 140, 140], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1x140x141xf32>> -> tensor<140x140xf32>
  %8 = scf.forall (%arg0) in (9) shared_outs(%arg1 = %6) -> (tensor<9x140x140xf32>) {
    %extracted_slice = tensor.extract_slice %4[%arg0, 0, 0, 0, 0] [1, 9, 64, 16, 1] [1, 1, 1, 1, 1] : tensor<9x9x64x16x1xf32> to tensor<1x9x64x16x1xf32>
    %extracted_slice_0 = tensor.extract_slice %5[%arg0, 0, 0, 0, 0] [1, 9, 64, 16, 1] [1, 1, 1, 1, 1] : tensor<9x9x64x16x1xf32> to tensor<1x9x64x16x1xf32>
    %9 = tensor.empty() : tensor<1x9x9x16x16xf32>
    %10 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 9, 9, 0, 0], [1, 1, 1, 0, 16], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]>} ins(%cst : f32) outs(%9 : tensor<1x9x9x16x16xf32>) -> tensor<1x9x9x16x16xf32>
    %11 = linalg.batch_mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 9, 9, 0, 0, 0, 0], [1, 1, 1, 0, 16, 16, 0], [0, 0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<1x9x64x16x1xf32>, tensor<1x9x64x16x1xf32>) outs(%10 : tensor<1x9x9x16x16xf32>) -> tensor<1x9x9x16x16xf32>
    %12 = tensor.empty() : tensor<1x140x140xf32> 
    %unpack = linalg.unpack %11 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %12 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 9, 9], [1, 1, 1], [0, 0, 0], [0, 0, 0]]>} : tensor<1x9x9x16x16xf32> -> tensor<1x140x140xf32> 
    %extracted_slice_1 = tensor.extract_slice %arg1[%arg0, 0, 0] [1, 140, 140] [1, 1, 1] : tensor<9x140x140xf32> to tensor<1x140x140xf32> 
    %13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack, %7 : tensor<1x140x140xf32>, tensor<140x140xf32>) outs(%extracted_slice_1 : tensor<1x140x140xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 9, 9], [1, 1, 1], [0, 0, 0], [0, 0, 0]]>} {
    ^bb0(%in: f32, %in_2: f32, %out: f32):
      %14 = arith.addf %in, %in_2 : f32
      linalg.yield %14 : f32
    } -> tensor<1x140x140xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %13 into %arg1[%arg0, 0, 0] [1, 140, 140] [1, 1, 1] : tensor<1x140x140xf32> into tensor<9x140x140xf32>
    } 
  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [9, 140, 140], strides = [1, 1, 1] : tensor<9x140x140xf32> -> !flow.dispatch.tensor<writeonly:tensor<9x140x140xf32>> 
  return
}

// -----// IR Dump After LLVMCPUTileAndFusePass (iree-llvmcpu-tile-and-fuse) //----- //

func.func @main$async_dispatch_10_batch_matmul_transpose_b_9x140x140x64_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %c140 = arith.constant 140 : index  
  %c1 = arith.constant 1 : index  
  %c0 = arith.constant 0 : index  
  %cst = arith.constant 0.000000e+00 : f32  
  %c537919616 = arith.constant 537919616 : index  
  %c1622016 = arith.constant 1622016 : index  
  %c967680 = arith.constant 967680 : index  
  %c1953792 = arith.constant 1953792 : index  
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c537919616) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1x140x141xf32>>  
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c1622016) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<9x9x64x16x1xf32>>  
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c967680) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<9x9x64x16x1xf32>>  
  %3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c1953792) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<9x140x140xf32>>  
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [9, 9, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<9x9x64x16x1xf32>> -> tensor<9x9x64x16x1xf32>  
  %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0], sizes = [9, 9, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<9x9x64x16x1xf32>> -> tensor<9x9x64x16x1xf32>  
  %6 = tensor.empty() : tensor<9x140x140xf32>  
  %7 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 1, 140, 140], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1x140x141xf32>> -> tensor<140x140xf32>  
  %8 = scf.forall (%arg0) in (9) shared_outs(%arg1 = %6) -> (tensor<9x140x140xf32>) {
    %extracted_slice = tensor.extract_slice %4[%arg0, 0, 0, 0, 0] [1, 9, 64, 16, 1] [1, 1, 1, 1, 1] : tensor<9x9x64x16x1xf32> to tensor<1x9x64x16x1xf32>  
    %extracted_slice_0 = tensor.extract_slice %5[%arg0, 0, 0, 0, 0] [1, 9, 64, 16, 1] [1, 1, 1, 1, 1] : tensor<9x9x64x16x1xf32> to tensor<1x9x64x16x1xf32>  
    %extracted_slice_1 = tensor.extract_slice %arg1[%arg0, 0, 0] [1, 140, 140] [1, 1, 1] : tensor<9x140x140xf32> to tensor<1x140x140xf32>  
    %9 = scf.for %arg2 = %c0 to %c140 step %c1 iter_args(%arg3 = %extracted_slice_1) -> (tensor<1x140x140xf32>) {
      %10 = scf.for %arg4 = %c0 to %c140 step %c1 iter_args(%arg5 = %arg3) -> (tensor<1x140x140xf32>) {
        %11 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg2)  
        %12 = affine.apply affine_map<(d0) -> (d0 mod 16)>(%arg4)  
        %13 = affine.apply affine_map<(d0) -> (d0 floordiv 16)>(%arg2)  
        %14 = affine.apply affine_map<(d0) -> (d0 floordiv 16)>(%arg4)  
        %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %13, 0, 0, 0] [1, 1, 64, 16, 1] [1, 1, 1, 1, 1] : tensor<1x9x64x16x1xf32> to tensor<1x1x64x16x1xf32>  
        %extracted_slice_3 = tensor.extract_slice %extracted_slice_0[0, %14, 0, 0, 0] [1, 1, 64, 16, 1] [1, 1, 1, 1, 1] : tensor<1x9x64x16x1xf32> to tensor<1x1x64x16x1xf32>  
        %15 = tensor.empty() : tensor<1x1x1x16x16xf32>  
        %16 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 9, 9, 0, 0], [1, 1, 1, 0, 16], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]>} ins(%cst : f32) outs(%15 : tensor<1x1x1x16x16xf32>) -> tensor<1x1x1x16x16xf32>  
        %17 = linalg.batch_mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 9, 9, 0, 0, 0, 0], [1, 1, 1, 0, 16, 16, 0], [0, 0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice_2, %extracted_slice_3 : tensor<1x1x64x16x1xf32>, tensor<1x1x64x16x1xf32>) outs(%16 : tensor<1x1x1x16x16xf32>) -> tensor<1x1x1x16x16xf32>  
        %18 = tensor.empty() : tensor<1x16x16xf32>  
        %unpack = linalg.unpack %17 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %18 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 9, 9], [1, 1, 1], [0, 0, 0], [0, 0, 0]]>} : tensor<1x1x1x16x16xf32> -> tensor<1x16x16xf32>  
        %extracted_slice_4 = tensor.extract_slice %unpack[0, %11, %12] [1, 1, 1] [1, 1, 1] : tensor<1x16x16xf32> to tensor<1x1x1xf32>  
        %extracted_slice_5 = tensor.extract_slice %7[%arg2, %arg4] [1, 1] [1, 1] : tensor<140x140xf32> to tensor<1x1xf32>  
        %extracted_slice_6 = tensor.extract_slice %arg5[0, %arg2, %arg4] [1, 1, 1] [1, 1, 1] : tensor<1x140x140xf32> to tensor<1x1x1xf32>  
        %19 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_4, %extracted_slice_5 : tensor<1x1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_6 : tensor<1x1x1xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 9, 9], [1, 1, 1], [0, 0, 0], [0, 0, 0]]>} {
        ^bb0(%in: f32, %in_7: f32, %out: f32):
          %20 = arith.addf %in, %in_7 : f32  
          linalg.yield %20 : f32  
        } -> tensor<1x1x1xf32>  
        %inserted_slice = tensor.insert_slice %19 into %arg5[0, %arg2, %arg4] [1, 1, 1] [1, 1, 1] : tensor<1x1x1xf32> into tensor<1x140x140xf32>  
        scf.yield %inserted_slice : tensor<1x140x140xf32>  
      }  
      scf.yield %10 : tensor<1x140x140xf32>  
    }  
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %9 into %arg1[%arg0, 0, 0] [1, 140, 140] [1, 1, 1] : tensor<1x140x140xf32> into tensor<9x140x140xf32>  
    }  
  } {mapping = [#iree_codegen.workgroup_mapping<x>]}  
  flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [9, 140, 140], strides = [1, 1, 1] : tensor<9x140x140xf32> -> !flow.dispatch.tensor<writeonly:tensor<9x140x140xf32>>  
  return  
}  

Below is the result of running TileRootAndFuseProducerConsumer on the same input IR shown above (the one before TileAndFuse):

// -----// IR Dump After LLVMCPUTileRootAndFuseProducerConsumerPass (iree-llvmcpu-tile-root-and-fuse-producer-consumer) //----- //
func.func @main$async_dispatch_10_batch_matmul_transpose_b_9x140x140x64_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %c9 = arith.constant 9 : index
  %c1 = arith.constant 1 : index
  %c0 = arith.constant 0 : index
  %cst = arith.constant 0.000000e+00 : f32
  %c537919616 = arith.constant 537919616 : index
  %c1622016 = arith.constant 1622016 : index
  %c967680 = arith.constant 967680 : index
  %c1953792 = arith.constant 1953792 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c537919616) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<1x1x140x141xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c1622016) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<9x9x64x16x1xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c967680) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<9x9x64x16x1xf32>>
  %3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c1953792) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<9x140x140xf32>>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0, 0], sizes = [9, 9, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<9x9x64x16x1xf32>> -> tensor<9x9x64x16x1xf32>
  %5 = flow.dispatch.tensor.load %2, offsets = [0, 0, 0, 0, 0], sizes = [9, 9, 64, 16, 1], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<9x9x64x16x1xf32>> -> tensor<9x9x64x16x1xf32>
  %6 = tensor.empty() : tensor<9x140x140xf32>
  %7 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 1, 140, 140], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x1x140x141xf32>> -> tensor<140x140xf32>
  %8 = scf.forall (%arg0) in (9) shared_outs(%arg1 = %6) -> (tensor<9x140x140xf32>) {
    %extracted_slice = tensor.extract_slice %4[%arg0, 0, 0, 0, 0] [1, 9, 64, 16, 1] [1, 1, 1, 1, 1] : tensor<9x9x64x16x1xf32> to tensor<1x9x64x16x1xf32>
    %extracted_slice_0 = tensor.extract_slice %5[%arg0, 0, 0, 0, 0] [1, 9, 64, 16, 1] [1, 1, 1, 1, 1] : tensor<9x9x64x16x1xf32> to tensor<1x9x64x16x1xf32>
    %9 = tensor.empty() : tensor<1x9x9x16x16xf32>
    %10 = tensor.empty() : tensor<1x140x140xf32>
    %extracted_slice_1 = tensor.extract_slice %arg1[%arg0, 0, 0] [1, 140, 140] [1, 1, 1] : tensor<9x140x140xf32> to tensor<1x140x140xf32>
    %11:3 = scf.for %arg2 = %c0 to %c9 step %c1 iter_args(%arg3 = %9, %arg4 = %10, %arg5 = %extracted_slice_1) -> (tensor<1x9x9x16x16xf32>, tensor<1x140x140xf32>, tensor<1x140x140xf32>) {
      %12:3 = scf.for %arg6 = %c0 to %c9 step %c1 iter_args(%arg7 = %arg3, %arg8 = %arg4, %arg9 = %arg5) -> (tensor<1x9x9x16x16xf32>, tensor<1x140x140xf32>, tensor<1x140x140xf32>) {
        %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %arg2, 0, 0, 0] [1, 1, 64, 16, 1] [1, 1, 1, 1, 1] : tensor<1x9x64x16x1xf32> to tensor<1x1x64x16x1xf32>
        %extracted_slice_3 = tensor.extract_slice %extracted_slice_0[0, %arg6, 0, 0, 0] [1, 1, 64, 16, 1] [1, 1, 1, 1, 1] : tensor<1x9x64x16x1xf32> to tensor<1x1x64x16x1xf32>
        %extracted_slice_4 = tensor.extract_slice %arg7[0, %arg2, %arg6, 0, 0] [1, 1, 1, 16, 16] [1, 1, 1, 1, 1] : tensor<1x9x9x16x16xf32> to tensor<1x1x1x16x16xf32>
        %13 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 9, 9, 0, 0], [1, 1, 1, 0, 16], [0, 0, 0, 0, 0], [0, 0, 0, 0, 0]]>} ins(%cst : f32) outs(%extracted_slice_4 : tensor<1x1x1x16x16xf32>) -> tensor<1x1x1x16x16xf32>
        %14 = linalg.batch_mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 9, 9, 0, 0, 0, 0], [1, 1, 1, 0, 16, 16, 0], [0, 0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice_2, %extracted_slice_3 : tensor<1x1x64x16x1xf32>, tensor<1x1x64x16x1xf32>) outs(%13 : tensor<1x1x1x16x16xf32>) -> tensor<1x1x1x16x16xf32>
        %15 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
        %16 = affine.min affine_map<(d0) -> (d0 * -16 + 140, 16)>(%arg2)
        %17 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg6)
        %18 = affine.min affine_map<(d0) -> (d0 * -16 + 140, 16)>(%arg6)
        %extracted_slice_5 = tensor.extract_slice %arg8[0, %15, %17] [1, %16, %18] [1, 1, 1] : tensor<1x140x140xf32> to tensor<1x?x?xf32>
        %unpack = linalg.unpack %14 outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [16, 16] into %extracted_slice_5 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 9, 9], [1, 1, 1], [0, 0, 0], [0, 0, 0]]>} : tensor<1x1x1x16x16xf32> -> tensor<1x?x?xf32>
        %inserted_slice = tensor.insert_slice %14 into %arg7[0, %arg2, %arg6, 0, 0] [1, 1, 1, 16, 16] [1, 1, 1, 1, 1] : tensor<1x1x1x16x16xf32> into tensor<1x9x9x16x16xf32>
        %19 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
        %20 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg6)
        %extracted_slice_6 = tensor.extract_slice %7[%19, %20] [%16, %18] [1, 1] : tensor<140x140xf32> to tensor<?x?xf32>
        %21 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg2)
        %22 = affine.apply affine_map<(d0) -> (d0 * 16)>(%arg6)
        %extracted_slice_7 = tensor.extract_slice %arg9[0, %21, %22] [1, %16, %18] [1, 1, 1] : tensor<1x140x140xf32> to tensor<1x?x?xf32>
        %23 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%unpack, %extracted_slice_6 : tensor<1x?x?xf32>, tensor<?x?xf32>) outs(%extracted_slice_7 : tensor<1x?x?xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 9, 9], [1, 1, 1], [0, 0, 0], [0, 0, 0]]>} {
        ^bb0(%in: f32, %in_10: f32, %out: f32):
          %24 = arith.addf %in, %in_10 : f32
          linalg.yield %24 : f32
        } -> tensor<1x?x?xf32>
        %inserted_slice_8 = tensor.insert_slice %unpack into %arg8[0, %15, %17] [1, %16, %18] [1, 1, 1] : tensor<1x?x?xf32> into tensor<1x140x140xf32>
        %inserted_slice_9 = tensor.insert_slice %23 into %arg9[0, %21, %22] [1, %16, %18] [1, 1, 1] : tensor<1x?x?xf32> into tensor<1x140x140xf32>
        scf.yield %inserted_slice, %inserted_slice_8, %inserted_slice_9 : tensor<1x9x9x16x16xf32>, tensor<1x140x140xf32>, tensor<1x140x140xf32>
      }
      scf.yield %12#0, %12#1, %12#2 : tensor<1x9x9x16x16xf32>, tensor<1x140x140xf32>, tensor<1x140x140xf32>
    }
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %11#2 into %arg1[%arg0, 0, 0] [1, 140, 140] [1, 1, 1] : tensor<1x140x140xf32> into tensor<9x140x140xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %8, %3, offsets = [0, 0, 0], sizes = [9, 140, 140], strides = [1, 1, 1] : tensor<9x140x140xf32> -> !flow.dispatch.tensor<writeonly:tensor<9x140x140xf32>>
  return
}

As I stated above, I believe these unused results of the tiled-and-fused loop get bufferized to large stack buffer allocations. I was wondering whether the plan is to switch the CPU pipeline to use TileAndFuse instead of TileRootAndFuseProducerConsumer (or whether the plan is to support both consumer+producer fusion and producer-only fusion, or to stick with producer fusion), or whether the iterative consumer fusion mechanism in the latter pass might also be adjusted so that it does not result in redundant stack allocations. I guess we could clean up the unused results of the loop (and possibly add tensor.empty() ops in the innermost loops for the output buffers of the compute ops) to achieve this; see the sketch below. Do you think that would be sensible to do?
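
For illustration, a minimal sketch of what that cleanup could look like, assuming the dead loop results are simply dropped and the intermediate destination is re-created with tensor.empty inside the loop body (hand-written with stand-in ops and a made-up function name; not the output of any existing pass):

func.func @cleaned_sketch(%init: tensor<8x16xf32>) -> tensor<8x16xf32> {
  %c0 = arith.constant 0 : index
  %c1 = arith.constant 1 : index
  %c8 = arith.constant 8 : index
  %cst = arith.constant 0.000000e+00 : f32
  // Only the value that is actually stored stays an iter_arg; the producer's
  // destination is a loop-local tensor.empty, so nothing dead is yielded.
  %r = scf.for %i = %c0 to %c8 step %c1 iter_args(%out = %init) -> (tensor<8x16xf32>) {
    %scratch = tensor.empty() : tensor<1x16xf32>
    %f = linalg.fill ins(%cst : f32) outs(%scratch : tensor<1x16xf32>) -> tensor<1x16xf32>
    %o = tensor.extract_slice %out[%i, 0] [1, 16] [1, 1] : tensor<8x16xf32> to tensor<1x16xf32>
    %g = linalg.copy ins(%f : tensor<1x16xf32>) outs(%o : tensor<1x16xf32>) -> tensor<1x16xf32>
    %out_next = tensor.insert_slice %g into %out[%i, 0] [1, 16] [1, 1] : tensor<1x16xf32> into tensor<8x16xf32>
    scf.yield %out_next : tensor<8x16xf32>
  }
  return %r : tensor<8x16xf32>
}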

Also, since the above dispatch occurs in the DT-fusion path, I was curious whether there are any plans to improve DT/codegen for these cases - so that they would not end up with redundant computations :) This is probably not exactly the right place for that question, but I have some observations and ideas for this part and would love to hear your thoughts/plans as well :)

EDIT: I guess the problem I mentioned with the redundant computations is the same issue as in: https://github.com/iree-org/iree/issues/20786#issuecomment-2873657151 - the extract slice op of the generic.

egebeysel, May 13 '25

I see two problems:

  1. The tile selection logic for tensor.unpack is wrong. (If you fix that, the tile and fuse pipeline would work fine.)

  2. Yes, you're right, consumer fusion creates extra buffers; that needs to be fixed for tileRootFuseConsumerProducerPipeline. I think we want to avoid using tileAndFuse because we don't want an extra config propagation strategy.

pashu123, May 14 '25

As mentioned in the other issue https://github.com/iree-org/iree/issues/20785, we want to get rid of lowering config propagation. It implies that we will switch to TileRootAndFuseProducerConsumer.

The issue mainly demonstrates that it should be doable in the pass. I think something is off/missing in the pass, and we need to fix it.

hanhanW, May 15 '25

@hanhanW and I had an offline discussion about the issue. The current problem is that we don't have control over intermediate yields during consumer fusion, and they always get attached as iter_args. For tileConsumerFuseProducer (producer fusion) we do have a control to enable/disable the intermediate yields, see https://github.com/iree-org/iree/blob/65b6541605f78b1dfec4b2cdbf3c22eb0f09cfa4/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileRootAndFuseProducerConsumer.cpp#L102 .

To tackle this, we have two options:

  1. Use scf.forall fusion for parallel iterators. It's easy to enable this in the TileRootFuseConsumerProducer pass; we only have to set tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp); here https://github.com/iree-org/iree/blob/65b6541605f78b1dfec4b2cdbf3c22eb0f09cfa4/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileRootAndFuseProducerConsumer.cpp#L95 (a minimal sketch follows below this list). There is a bool in the pass, onlyFuseProducerInputOperands, that determines whether we do parallel or reduction tiling; we should only switch the loop type for parallel tiling. For reduction tiling we don't fuse the consumers, so it's not a problem: https://github.com/iree-org/iree/blob/65b6541605f78b1dfec4b2cdbf3c22eb0f09cfa4/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileRootAndFuseProducerConsumer.cpp#L207. scf.forall naturally gets rid of intermediate iter_args because it only keeps what is finally stored in the scf.forall.in_parallel block (https://mlir.llvm.org/docs/Dialects/SCFDialect/#scfforallin_parallel-scfinparallelop) and removes all other iter_args. We can then lower scf.forall to scf.for; the upstream pattern for this is https://github.com/llvm/llvm-project/blob/main/mlir/lib/Dialect/SCF/Transforms/ForallToFor.cpp. (Caveat: don't lower scf.forall ops that have workgroup ids attached -- they are handled differently.)

  2. Analyze the scf.for to remove the unused independent iter_args, or add a control function to the consumer fusion.

I would take the first route; it makes IR clearer in terms of semantics—we have scf.forall, so it's parallel tiling.
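For concreteness, here is a minimal sketch of what option 1 could look like inside the pass, reusing the existing onlyFuseProducerInputOperands flag mentioned above; the helper name is made up for illustration and this is not the actual IREE code:

#include "mlir/Dialect/SCF/Transforms/TileUsingInterface.h"

// Sketch: use scf.forall for the parallel-tiling invocation of the pass and
// keep scf.for for reduction tiling, where consumers are not fused anyway.
static void selectTilingLoopType(mlir::scf::SCFTilingOptions &tilingOptions,
                                 bool onlyFuseProducerInputOperands) {
  if (!onlyFuseProducerInputOperands) {
    // scf.forall only yields what is written in scf.forall.in_parallel, so
    // fused consumers do not leave dead iter_args (and, after bufferization,
    // extra stack buffers) behind.
    tilingOptions.setLoopType(mlir::scf::SCFTilingOptions::LoopType::ForallOp);
  } else {
    tilingOptions.setLoopType(mlir::scf::SCFTilingOptions::LoopType::ForOp);
  }
}

The scf.forall loops produced this way (the ones without a workgroup mapping) would still need to be lowered back to scf.for before vectorization, as discussed further down in the thread.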

pashu123 avatar May 15 '25 02:05 pashu123

Hey, thanks for your responses!

As mentioned in the other issue #20785, we want to get rid of lowering config propagation. It implies that we will switch to TileRootAndFuseProducerConsumer.

"Get rid of" as in not having it at all, and instead using the same mechanisms (or modified versions of them) that set the lowering config in the first place, on the fly?

  1. The tile selection logic for tensor.unpack is wrong. (If you fix that, tile and fuse pipeline would work fine)

I also have an example in which a generic -> pack dispatch fails to fuse the ~generic~ pack at the workgroup level, which then fails compilation with "large vector sizes". I'll post that one tomorrow. ~But generally, you say that although the tile selection logic and the lowering config propagation might cause some issues, it's better to tackle them at the points where they actually cause the failures and not burden the lowering configs even more, do I understand that correctly?~

I would take the first route; it makes IR clearer in terms of semantics—we have scf.forall, so it's parallel tiling.

I actually do have a prototype for the second route, but the first one seems clearer to me as well, to be honest.

egebeysel avatar May 15 '25 17:05 egebeysel

The example I was talking about above does not occur today; it only shows up if we switch the TileAndDistributeToWorkgroupsUsingForallOpPass to use the root op instead of the last compute op over here.

At first, I thought this one was caused by the tile selection logic, but I now think it's rather consumer fusion and the odd shapes; please correct me if I'm wrong here. I guess consumer fusion could maybe be improved to handle these cases as well, since producer fusion starting from the pack op seems to be able to fuse the generic. One other question I had was about the selection criteria for the root operation: what's the reason that element-wise ops have precedence over packs/unpacks? Would giving precedence to pack/unpack ops be a feasible way to force producer fusion in these cases? Would forcing producer fusion in such cases even make sense, or would it be (more) feasible to improve consumer fusion? I'd love to hear your thoughts on that one as well :)


// -----// IR Dump Before TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- //
func.func @main$async_dispatch_10_elementwise_transpose_197x12x64_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = CPUDoubleTilingExpert>} {
    %cst = arith.constant 0.000000e+00 : f32 
    %c3035136 = arith.constant 3035136 : index 
    %cst_0 = arith.constant 0.353553385 : f32 
    %c1210368 = arith.constant 1210368 : index 
    %0 = hal.interface.constant.load layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32 
    %1 = arith.index_castui %0 : i32 to index 
    %2 = util.assume.int %1[<umin = 346249152, umax = 346249152, udiv = 346249152>, <umin = 346221504, umax = 346221504, udiv = 346221504>, <umin = 346193856, umax = 346193856, udiv = 346193856>, <umin = 346166208, umax = 346166208, udiv = 346166208>, <umin = 346138560, umax = 346138560, udiv = 346138560>, <umin = 346110912, umax = 346110912, udiv = 346110912>, <umin = 346083264, umax = 346083264, udiv = 346083264>, <umin = 346055616, umax = 346055616, udiv = 346055616>, <umin = 346027968, umax = 346027968, udiv = 346027968>, <umin = 346000320, umax = 346000320, udiv = 346000320>, <umin = 345972672, umax = 345972672, udiv = 345972672>, <umin = 345945024, umax = 345945024, udiv = 345945024>] : index 
    %3 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c3035136) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<197x12x64xf32>> 
    %4 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%2) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<12x64xf32>> 
    %5 = hal.interface.binding.subspan layout(<constants = 1, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c1210368) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<12x25x64x8x1xf32>> 
    %6 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [197, 12, 64], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<197x12x64xf32>> -> tensor<197x12x64xf32> 
    %7 = iree_tensor_ext.dispatch.tensor.load %4, offsets = [0, 0], sizes = [12, 64], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<12x64xf32>> -> tensor<12x64xf32> 
    %8 = tensor.empty() : tensor<12x197x64xf32> 
    %9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d2)>, affine_map<(d0, d1, d2) -> (d1, d0, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%6, %7 : tensor<197x12x64xf32>, tensor<12x64xf32>) outs(%8 : tensor<12x197x64xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 16, 64], [1, 8, 1], [0, 0, 0], [0, 0, 0]]>} {
    ^bb0(%in: f32 , %in_1: f32 , %out: f32):
        %11 = arith.addf %in, %in_1 : f32 
        %12 = arith.mulf %11, %cst_0 : f32 
        linalg.yield %12 : f32 
    } -> tensor<12x197x64xf32> 
    %10 = tensor.empty() : tensor<12x25x64x8x1xf32> 
    %pack = linalg.pack %9 padding_value(%cst : f32) outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [8, 1] into %10 {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[8, 2, 64], [1, 2, 1], [0, 0, 0], [0, 0, 0]]>} : tensor<12x197x64xf32> -> tensor<12x25x64x8x1xf32> 
    iree_tensor_ext.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0, 0], sizes = [12, 25, 64, 8, 1], strides = [1, 1, 1, 1, 1] : tensor<12x25x64x8x1xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<12x25x64x8x1xf32>> 
    return 
}


// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- //
#config2 = #iree_codegen.lowering_config<tile_sizes = [[8, 16, 64], [1, 8, 1], [0, 0, 0], [0, 0, 0]]>
#config3 = #iree_codegen.lowering_config<tile_sizes = [[8, 2, 64], [1, 1, 1], [0, 0, 0], [0, 0, 0]]>
func.func @main$async_dispatch_10_elementwise_transpose_197x12x64_f32() attributes {translation_info = #translation} {
    %c12 = arith.constant 12 : index
    %c0 = arith.constant 0 : index
    %cst = arith.constant 0.000000e+00 : f32
    %c3035136 = arith.constant 3035136 : index
    %cst_0 = arith.constant 0.353553385 : f32
    %c1210368 = arith.constant 1210368 : index
    %0 = hal.interface.constant.load layout(#pipeline_layout1) ordinal(0) : i32
    %1 = arith.index_castui %0 : i32 to index
    %2 = util.assume.int %1[<umin = 346249152, umax = 346249152, udiv = 346249152>, <umin = 346221504, umax = 346221504, udiv = 346221504>, <umin = 346193856, umax = 346193856, udiv = 346193856>, <umin = 346166208, umax = 346166208, udiv = 346166208>, <umin = 346138560, umax = 346138560, udiv = 346138560>, <umin = 346110912, umax = 346110912, udiv = 346110912>, <umin = 346083264, umax = 346083264, udiv = 346083264>, <umin = 346055616, umax = 346055616, udiv = 346055616>, <umin = 346027968, umax = 346027968, udiv = 346027968>, <umin = 346000320, umax = 346000320, udiv = 346000320>, <umin = 345972672, umax = 345972672, udiv = 345972672>, <umin = 345945024, umax = 345945024, udiv = 345945024>] : index
    %3 = hal.interface.binding.subspan layout(#pipeline_layout1) binding(0) alignment(64) offset(%c3035136) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<197x12x64xf32>>
    %4 = hal.interface.binding.subspan layout(#pipeline_layout1) binding(1) alignment(64) offset(%2) flags(ReadOnly) : !iree_tensor_ext.dispatch.tensor<readonly:tensor<12x64xf32>>
    %5 = hal.interface.binding.subspan layout(#pipeline_layout1) binding(2) alignment(64) offset(%c1210368) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<12x25x64x8x1xf32>>
    %6 = iree_tensor_ext.dispatch.tensor.load %3, offsets = [0, 0, 0], sizes = [197, 12, 64], strides = [1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<197x12x64xf32>> -> tensor<197x12x64xf32>
    %7 = iree_tensor_ext.dispatch.tensor.load %4, offsets = [0, 0], sizes = [12, 64], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<12x64xf32>> -> tensor<12x64xf32>
    %8 = tensor.empty() : tensor<12x197x64xf32>
    %9 = tensor.empty() : tensor<12x25x64x8x1xf32>
    %10 = scf.forall (%arg0) = (0) to (197) step (8) shared_outs(%arg1 = %8) -> (tensor<12x197x64xf32>) {
      %11 = affine.min #map3(%arg0)
      %extracted_slice = tensor.extract_slice %arg1[0, %arg0, 0] [12, %11, 64] [1, 1, 1] : tensor<12x197x64xf32> to tensor<12x?x64xf32>
      %extracted_slice_1 = tensor.extract_slice %6[%arg0, 0, 0] [%11, 12, 64] [1, 1, 1] : tensor<197x12x64xf32> to tensor<?x12x64xf32>
      %12 = linalg.generic {indexing_maps = [#map4, #map5, #map6], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice_1, %7 : tensor<?x12x64xf32>, tensor<12x64xf32>) outs(%extracted_slice : tensor<12x?x64xf32>) attrs =  {lowering_config = #config2} {
      ^bb0(%in: f32, %in_2: f32, %out: f32):
        %13 = arith.addf %in, %in_2 : f32
        %14 = arith.mulf %13, %cst_0 : f32
        linalg.yield %14 : f32
      } -> tensor<12x?x64xf32>
      %cast = tensor.cast %12 : tensor<12x?x64xf32> to tensor<?x?x64xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %cast into %arg1[%c0, %arg0, 0] [%c12, %11, 64] [1, 1, 1] : tensor<?x?x64xf32> into tensor<12x197x64xf32>
      }
    } {mapping = [#iree_codegen.workgroup_mapping<x>]}
    %pack = linalg.pack %10 padding_value(%cst : f32) outer_dims_perm = [0, 1, 2] inner_dims_pos = [1, 2] inner_tiles = [8, 1] into %9 {lowering_config = #config3} : tensor<12x197x64xf32> -> tensor<12x25x64x8x1xf32>
    iree_tensor_ext.dispatch.tensor.store %pack, %5, offsets = [0, 0, 0, 0, 0], sizes = [12, 25, 64, 8, 1], strides = [1, 1, 1, 1, 1] : tensor<12x25x64x8x1xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<12x25x64x8x1xf32>>
    return
  }
}

I'll also open another issue for this one next week, since I believe this isn't entirely relevant to this one, but does concern consumer fusion.

egebeysel avatar May 16 '25 17:05 egebeysel

Get rid of as in not have it at all and use the same mechanisms (or modified versions) used for setting the lowering config on the first place on-the-fly?

Yes, we use the same mechanism to set the lowering_config and translation_info on the root op. We only disable the propagation, which drops some "hints" for other ops. (We can set the hints if needed; I just don't see an example for now.)

What's the reason that element-wise ops have precedence over packs/unpacks? Would giving precedence to pack/unpack ops be a feasible way to force producer fusion in these cases?

I'll need to refresh my memory. :) The short answer is that it's mostly legacy. The thing I need to rethink is that pack/unpack ops are more like "storing the tiled results to a different position"; the core computation is still the same, so the root op does not change in this context. However, I've been thinking about breaking this rule in element->pack cases, because the transposition needs to be efficient. I think I proposed the idea in a casual chat before, but I don't remember the details now. The switch is not a bad decision, IMO; we just need to reason about it.

The other thought is that we should explore whether we can improve the indexing mapping on CPU, but it requires some POC. Unfortunately, I don't have enough cycles to invest in this part, so it is what it is today.

A side note is that @Max191 has been pushing this on the GPU side. We recently learned that we can explore an early-bufferization approach for codegen while still keeping the core computation ops on tensors. It uses the MapScatter op, which helps with remapping indices. We are still building the transformations for the op, so it is not ready yet.

https://github.com/iree-org/iree/blob/e66fc1503b9fa145fea92e707252e65bc67dbe27/compiler/src/iree/compiler/Dialect/LinalgExt/IR/LinalgExtOps.td#L316-L344

There were potential issues on the CPU side the last time I reviewed the early-bufferization idea, though. My study notes are outdated, so I'm not going to open that Pandora's box here. :)

I'll also open another issue for this one next week, since I believe this isn't entirely relevant to this one, but does concern consumer fusion.

We seem to have a similar issue here: https://github.com/iree-org/iree/issues/20723

I don't have cycles to take a look at it, so I don't know what's happening. IMO, there is a bug somewhere, and we need to fix it. I'm not convinced that the fusion can't happen atm; it looks like a bug to me, and someone needs to fix it.

hanhanW avatar May 16 '25 17:05 hanhanW

However, I've been thinking to break this rule in element->pack cases because the transposition needs to be efficient. I guess I proposed the idea in some casual chat before, but I don't remember the details now. The switch is not a bad decision, IMO. We need to reason it.

Do you mean special-casing on that order, or giving precedence to pack (and possibly also unpack) ops over elementwise ops in general? I agree that the core of the computation is the element-wise ops and that the pack/unpacks are more like utility ops, so to say, but does being the root op have any semantics other than being the starting point for tiling and fusion? Also, I know that it's theoretically possible to have pack->generic cases, but is that really a pattern that is observed often? With the fusion data-tiling path, I guess one almost never observes such dispatches.

I don't have cycles to take a look at it, so I don't know what's happening. IMO, there is a bug somewhere, and we need to fix it. I'm not convinced that the fusion can't happen atm. It is a bug to me, and someone needs to fix it.

Nevertheless, I also agree that consumer fusion should be (as) symmetric (as possible) to producer fusion and that it should be able to fuse as well. I'd be happy to investigate this further.

We recently learned that we can explore the early bufferization approach for codegen, while still keep the core computation ops on tensors.

Is there an RFC-like document, an example, or even a Discord thread I can read for this one? It's not easy to deduce the general idea from the operation alone, and I'd appreciate some pointers :)

The other thinking is that we should explore if we can improve the indexing mapping on CPU or not, but it requires some POC. Unfortunately, I don't have enough cycles to invest this part. So it is waht it is today.

I'd also be happy to invest some time in this one if you have time to explain what exactly you would want to improve and what you expect to change as a result. But I guess I would prioritize the blockers first.

  1. Use scf.forall fusion for parallel iterators. It's easy to enable this in the TileRootFuseConsumerProducer pass, we only have to set tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp); here

I'd also like to take a stab at this option if @pashu123 is not working on it already :)

egebeysel avatar May 19 '25 11:05 egebeysel

We seem to have a similar issue here: #20723

Btw, I also just took a look at this one, and this sort of dispatch also makes me think that the pack operation might have to take precedence over some other ops. There, you have something along the lines of:

...
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (8, 64) step (1, 32) shared_outs(%arg2 = %4) -> (tensor<8x64x1x1xf32>) {
    %extracted_slice = tensor.extract_slice %2[%arg0, %arg1, 0, 0] [1, 32, 1, 256] [1, 1, 1, 1] : tensor<8x64x1x256xf32> to tensor<1x32x1x256xf32>
    %extracted_slice_0 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 32, 1, 1] [1, 1, 1, 1] : tensor<8x64x1x1xf32> to tensor<1x32x1x1xf32>
    %7 = linalg.fill ins(%cst : f32) outs(%extracted_slice_0 : tensor<1x32x1x1xf32>) -> tensor<1x32x1x1xf32>
    %8 = linalg.pooling_nchw_max {dilations = dense<1> : vector<2xi64>, lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 32, 0, 0, 0, 0], [1, 8, 1, 1, 0, 0], [0, 0, 0, 0, 1, 8], [0, 0, 0, 0, 0, 0]]>, strides = dense<[1, 256]> : vector<2xi64>} ins(%extracted_slice, %5 : tensor<1x32x1x256xf32>, tensor<1x256xf32>) outs(%7 : tensor<1x32x1x1xf32>) -> tensor<1x32x1x1xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1, 0, 0] [1, 32, 1, 1] [1, 1, 1, 1] : tensor<1x32x1x1xf32> into tensor<8x64x1x1xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  %pack = linalg.pack %6 outer_dims_perm = [0, 1, 2, 3] inner_dims_pos = [0, 1] inner_tiles = [8, 1] into %3 : tensor<8x64x1x1xf32> -> tensor<1x64x1x1x8x1xf32>
...

Regardless of the pack op not having a lowering config attached to it: since the pooling op has precedence over the pack and the tiling starts from it, fusion gets blocked once you tile the outermost dimension M with tile size 1, because the pack op operates on M=8 and cannot use the intermediate 1x32x1x1xf32 result from the pooling op. In that regard, I would assume that either the information "Hey, if you want to fuse me in, you need the outermost tile size to be 8!" should be propagated to the tile sizes of the pooling op, or the tiling should start with the pack as the root and fuse in the pooling. I know that the pooling is not an element-wise op, but still, ending up with these tile sizes points to something missing, IMO. Please do correct me if I'm missing something here.

cc @qedawkins, because I asked something in #20723 about this as well.

egebeysel avatar May 19 '25 12:05 egebeysel

On that regard, I would assume that either that information of "Hey, if you want to fuse me in, you would need the outermost tile size to be 8!" to be propagated to the tile sizes of the pooling op, or the tiling to start with the pack as root and fuse in the pooling

Correct, today this is the only way we can set configs that will work, e.g. #20455 (reshapes and packs have the same problem), but that's a WAR (workaround). The correct fix is to add support for the fusion; however, that's not possible with the current incarnations of scf.forall + TilingInterface. That's what this WIP branch is supposed to fix, but I have been context-switched away from it.

qedawkins avatar May 19 '25 13:05 qedawkins

Also, I know that it's theoretically possible to have pack->generic cases, but is that really a pattern that is observed often? Because with the fusion data-tiling path, I guess one almost never observes such dispatches.

It depends on whether you want to do local propagation within a dispatch.

Use scf.forall fusion for parallel iterators. It's easy to enable this in the TileRootFuseConsumerProducer pass; we only have to set tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp); here.

I'd also like to take a stab at this option if @pashu123 is not working on it already :)

@AaronStGeorge is going to look at this. @AaronStGeorge this is the issue that we talked about in our 1:1. Please take a look, thanks!

EDIT: to be more specific, see the comment at https://github.com/iree-org/iree/issues/20792#issuecomment-2882065945. Feel free to reach out to me or @pashu123 if you have any questions.

hanhanW avatar May 19 '25 17:05 hanhanW

@egebeysel let's move the pack fusion discussion to https://github.com/iree-org/iree/issues/20723, and scope this issue to solving the redundant buffer issue in the TileRootAndFuseProducerConsumer approach.

hanhanW avatar May 19 '25 18:05 hanhanW

// -----// IR Dump After LLVMCPUTileRootAndFuseProducerConsumerPass (iree-llvmcpu-tile-root-and-fuse-producer-consumer) //----- //
func.func @mmt4d_bias_relu_dispatch_0_mmt4d_DxDxDx16x16x1_f32() attributes {hal.executable.target = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", iree.encoding.resolver = #iree_cpu.cpu_encoding_layout<>, max_stack_allocation_size = 32768 : i64, native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf", ukernels = "none"}>, translation_info = #iree_codegen.translation_info<pipeline = Mmt4dTilingExpert>} {
  %c0 = arith.constant 0 : index
  %c32_i64 = arith.constant 32 : i64
  %cst = arith.constant 0.000000e+00 : f32
  %0 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
  %1 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
  %2 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
  %3 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
  %4 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
  %5 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
  %6 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
  %7 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
  %8 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
  %9 = hal.interface.constant.load layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(9) : i32
  %10 = arith.extui %0 : i32 to i64
  %11 = arith.extui %1 : i32 to i64
  %12 = arith.shli %11, %c32_i64 : i64
  %13 = arith.ori %10, %12 : i64
  %14 = arith.index_castui %13 : i64 to index
  %15 = arith.extui %2 : i32 to i64
  %16 = arith.extui %3 : i32 to i64
  %17 = arith.shli %16, %c32_i64 : i64
  %18 = arith.ori %15, %17 : i64
  %19 = arith.index_castui %18 : i64 to index
  %20 = arith.extui %4 : i32 to i64
  %21 = arith.extui %5 : i32 to i64
  %22 = arith.shli %21, %c32_i64 : i64
  %23 = arith.ori %20, %22 : i64
  %24 = arith.index_castui %23 : i64 to index
  %25 = arith.extui %6 : i32 to i64
  %26 = arith.extui %7 : i32 to i64
  %27 = arith.shli %26, %c32_i64 : i64
  %28 = arith.ori %25, %27 : i64
  %29 = arith.index_castui %28 : i64 to index
  %30 = arith.extui %8 : i32 to i64
  %31 = arith.extui %9 : i32 to i64
  %32 = arith.shli %31, %c32_i64 : i64
  %33 = arith.ori %30, %32 : i64
  %34 = arith.index_castui %33 : i64 to index
  %35:5 = util.assume.int 
      %14<umin = 0, umax = 9007199254740991>, 
      %19<umin = 0, umax = 9007199254740991>, 
      %24<umin = 0, umax = 9007199254740991>, 
      %29<umin = 0, umax = 9007199254740991>, 
      %34<umin = 0, umax = 9007199254740991>
    : index, index, index, index, index
  %36 = iree_tensor_ext.dispatch.workload.ordinal %35#0, 0 : index
  %37 = iree_tensor_ext.dispatch.workload.ordinal %35#1, 1 : index
  %38 = iree_tensor_ext.dispatch.workload.ordinal %35#2, 2 : index
  %39 = iree_tensor_ext.dispatch.workload.ordinal %35#3, 3 : index
  %40 = iree_tensor_ext.dispatch.workload.ordinal %35#4, 4 : index
  %41 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36}
  %42 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40}
  %43 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38}
  %44 = hal.interface.binding.subspan layout(<constants = 10, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%c0) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  %45 = iree_tensor_ext.dispatch.tensor.load %41, offsets = [0, 0, 0, 0], sizes = [%39, %36, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%39, %36} -> tensor<?x?x16x1xf32>
  %46 = iree_tensor_ext.dispatch.tensor.load %42, offsets = [0, 0, 0, 0], sizes = [%37, %40, 16, 1], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x?x16x1xf32>>{%37, %40} -> tensor<?x?x16x1xf32>
  %47 = iree_tensor_ext.dispatch.tensor.load %43, offsets = [0, 0], sizes = [%38, 16], strides = [1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<?x16xf32>>{%38} -> tensor<?x16xf32>
  %48 = tensor.empty(%39, %40) : tensor<?x?x16x16xf32>
  %49 = scf.forall (%arg0, %arg1) in (%39, %37) shared_outs(%arg2 = %48) -> (tensor<?x?x16x16xf32>) {
    %extracted_slice = tensor.extract_slice %45[%arg0, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
    %extracted_slice_0 = tensor.extract_slice %46[%arg1, 0, 0, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x?x16x1xf32>
    %50 = tensor.empty() : tensor<1x1x16x16xf32>
    %51 = scf.forall (%arg3, %arg4, %arg5, %arg6) = (0, 0, 0, 0) to (1, 1, 16, 16) step (1, 1, 2, 16) shared_outs(%arg7 = %50) -> (tensor<1x1x16x16xf32>) {
      %extracted_slice_3 = tensor.extract_slice %extracted_slice[%arg3, 0, %arg5, 0] [1, %36, 2, 1] [1, 1, 1, 1] : tensor<1x?x16x1xf32> to tensor<1x?x2x1xf32>
      %extracted_slice_4 = tensor.extract_slice %extracted_slice_0[%arg4, 0, %arg6, 0] [1, %36, 16, 1] [1, 1, 1, 1] : tensor<1x?x16x1xf32> to tensor<1x?x16x1xf32>
      %extracted_slice_5 = tensor.extract_slice %arg7[%arg3, %arg4, %arg5, %arg6] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
      %53 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%extracted_slice_5 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
      %54 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice_3, %extracted_slice_4 : tensor<1x?x2x1xf32>, tensor<1x?x16x1xf32>) outs(%53 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %54 into %arg7[%arg3, %arg4, %arg5, %arg6] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
      }
    }
    %extracted_slice_1 = tensor.extract_slice %47[%arg0, 0] [1, 16] [1, 1] : tensor<?x16xf32> to tensor<1x16xf32>
    %extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<?x?x16x16xf32> to tensor<1x1x16x16xf32>
    %52 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%51, %extracted_slice_1 : tensor<1x1x16x16xf32>, tensor<1x16xf32>) outs(%extracted_slice_2 : tensor<1x1x16x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
    ^bb0(%in: f32, %in_3: f32, %out: f32):
      %53 = arith.addf %in, %in_3 : f32
      %54 = arith.maximumf %53, %cst : f32
      linalg.yield %54 : f32
    } -> tensor<1x1x16x16xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %52 into %arg2[%arg0, %arg1, 0, 0] [1, 1, 16, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> into tensor<?x?x16x16xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  iree_tensor_ext.dispatch.tensor.store %49, %44, offsets = [0, 0, 0, 0], sizes = [%39, %40, 16, 16], strides = [1, 1, 1, 1] : tensor<?x?x16x16xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<?x?x16x16xf32>>{%39, %40}
  return
}

Use scf.forall fusion for parallel iterators. It's easy to enable this in the TileRootFuseConsumerProducer pass, we only have to set tilingOptions.setLoopType(scf::SCFTilingOptions::LoopType::ForallOp); here https://github.com/iree-org/iree/blob/65b6541605f78b1dfec4b2cdbf3c22eb0f09cfa4/compiler/src/iree/compiler/Codegen/LLVMCPU/LLVMCPUTileRootAndFuseProducerConsumer.cpp#L95

@pashu123 when fusing with the above option, it looks like the bias + relu is no longer fused into the loop.

AaronStGeorge avatar May 22 '25 18:05 AaronStGeorge

@AaronStGeorge could you share your branch and the changes?

pashu123 avatar May 22 '25 19:05 pashu123

@pashu123 I just added the tiling options change, nothing else.

AaronStGeorge avatar May 22 '25 19:05 AaronStGeorge

@pashu123 I shared how the CPU codegen pipeline works, the lowering_config details, and the strategy selection idea with @AaronStGeorge this week. He now has a better understanding and has started playing with the IR based on your suggestions. Can you share context and pointers with him?

hanhanW avatar May 22 '25 19:05 hanhanW

@pashu123 I shared how the CPU codegen pipeline works, the lowering_config details, and the strategy selection idea with @AaronStGeorge this week. He now has a better understanding and has started playing with the IR based on your suggestions. Can you share context and pointers with him?

Sure.

pashu123 avatar May 22 '25 20:05 pashu123

One thing to note: if you change the input IR so that the bias + relu takes the result of the mmt4d as a destination operand, the issue seems to go away

    %49 = linalg.fill ins(%cst : f32) outs(%48 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
    %50 = linalg.mmt4d ins(%45, %46 : tensor<?x?x16x1xf32>, tensor<?x?x16x1xf32>) outs(%49 : tensor<?x?x16x16xf32>) -> tensor<?x?x16x16xf32>
    %51 = linalg.generic {indexing_maps = [#map1, #map], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%47 : tensor<?x16xf32>) outs(%50 : tensor<?x?x16x16xf32>) {
    ^bb0(%in: f32, %out: f32):
      %52 = arith.addf %in, %out : f32
      %53 = arith.maximumf %52, %cst : f32
      linalg.yield %53 : f32
    } -> tensor<?x?x16x16xf32>

So this could also be fixed upstream, in how the input IR is formed.

AaronStGeorge avatar May 22 '25 20:05 AaronStGeorge

Branch: https://github.com/pashu123/iree/tree/test_cpu

Test IR: https://gist.github.com/pashu123/d96ccfb9a16111c3d55acff461b38233
Command: ~/iree-build/tools/iree-opt --pass-pipeline='builtin.module(iree-llvmcpu-select-lowering-strategy, func.func(iree-llvmcpu-lower-executable-target))' test_forall.mlir
IR dump: https://gist.github.com/pashu123/45e933052f0d767d822790a5c86c508a

You can see that it's perfectly fused at the parallel level. (I had to borrow changes from tileDispatchUsingForall.) So, my take is that if tileDispatchUsingForall can fuse all the computation inside the scf.forall, tileRootFuseConsumerProducer should also be able to do that.

As mentioned earlier, it eliminates unnecessary yields and doesn't create extra allocs.

pashu123 avatar May 22 '25 20:05 pashu123

The plan of work should be:

  1. Make sure tileDispatchUsingForall has two modes for selecting the first tiled operation (i.e., where it starts tiling from, where tileUsingScf is applied): the last operation that has workgroup tileSizes, or the root operation. (This can be done by passing a flag to the pass.) We'll default to the root operation on the CPU side.
  2. All the common functions used by tileDispatchUsingForall and tileRootAndFuseConsumerProducer should be moved to a common utils file. See https://github.com/iree-org/iree/pull/19821#pullrequestreview-2577318550

pashu123 avatar May 22 '25 21:05 pashu123

Branch: https://github.com/pashu123/iree/tree/test_cpu

Nice, everything looks right on your branch. I see the producer and consumer operations get fused into the scf.forall and the redundant output of the linalg.generic isn't included in the shared_outs (avoiding the allocation).

One question: @hanhanW mentioned that there are some places later in the pipeline that expect an scf.for. Do you know where a good place would be to swap the scf.forall back to an scf.for? Would there be any problems with swapping the scf.forall to an scf.for in the same pass, right after the fusion is done?

  1. Make sure tileDispatchUsingForall has two modes for selecting the first tile operation(where it starts tiling from, i.e., apply tileUsingScf): the last operation that has workgroup tileSizes or the root operation. (This can be done by passing flags to the pass.) We'll default to the root operation on the CPU side.

I'm a little confused here. Why does tiling using scf.forall vs. scf.for require a change in the operation we start tiling from? I didn't see anything in the utility functions you pulled from tileDispatchUsingForall that assumed tiling on the last operation and needed updating; did I miss something?

AaronStGeorge avatar May 27 '25 21:05 AaronStGeorge

@pashu123 and I synced up out of band; updating here.

One question: @hanhanW mentioned that there are some places later in the pipeline that expect an scf.for. Do you know where a good place would be to swap the scf.forall back to an scf.for? Would there be any problems with swapping the scf.forall to an scf.for in the same pass, right after the fusion is done?

This should probably be done in a separate pass to be run after tiling is complete in this (and possibly other) pipelines:

https://github.com/iree-org/iree/blob/bb906c1eed975743ea9e22e2189e307807155644/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp#L529-L542

The pass should convert all scf.forall without a workgroup mapping ({mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}) to an scf.for.
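A minimal sketch of what that pass body could look like, assuming the upstream scf::forallToForLoop helper (which, as noted further down, does not handle shared_outs today) and assuming that the presence of a mapping attribute is a good enough way to recognize workgroup loops; the pass scaffolding itself is elided:

#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/SCF/Transforms/Transforms.h"
#include "mlir/IR/PatternMatch.h"

// Convert every scf.forall that is not mapped to workgroups into nested
// scf.for loops; workgroup-mapped foralls are handled by distribution and
// are left untouched.
static mlir::LogicalResult
convertNonWorkgroupForallToFor(mlir::func::FuncOp funcOp) {
  mlir::IRRewriter rewriter(funcOp.getContext());
  llvm::SmallVector<mlir::scf::ForallOp> candidates;
  funcOp.walk([&](mlir::scf::ForallOp forallOp) {
    if (!forallOp.getMapping())
      candidates.push_back(forallOp);
  });
  for (mlir::scf::ForallOp forallOp : candidates) {
    rewriter.setInsertionPoint(forallOp);
    if (mlir::failed(mlir::scf::forallToForLoop(rewriter, forallOp)))
      return mlir::failure();
  }
  return mlir::success();
}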

Make sure tileDispatchUsingForall has two modes for selecting the first tile operation(where it starts tiling from, i.e., apply tileUsingScf): the last operation that has workgroup tileSizes or the root operation. (This can be done by passing flags to the pass.) We'll default to the root operation on the CPU side.

I'm a little confused here. Why does tiling using scf.forall vs scf.for require a change in the operation we're starting the tiling from? I didn't see anything in the utility functions you pulled from tileDispatchUsingForall that assumed tiling on the last operation that needed updating, did I miss something?

There's no dependency between the flag and the work to change tiling from using scf.for to scf.forall. However, we do need the flag. We have three levels of tiling:

  • Workgroup https://github.com/iree-org/iree/blob/bb906c1eed975743ea9e22e2189e307807155644/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp#L532
  • Parallel https://github.com/iree-org/iree/blob/bb906c1eed975743ea9e22e2189e307807155644/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp#L534-L535
  • Reduction https://github.com/iree-org/iree/blob/bb906c1eed975743ea9e22e2189e307807155644/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp#L541-L542

All tiling passes must agree on the op they're tiling + fusing with respect to. The GPU and CPU codegen pipelines both use the same passes at the workgroup level. The GPU codegen path expects the tiled op to be the last operation with a workgroup tile-size attribute, while the CPU path will choose the "root" operation (root operations are those around which the graph is broken into workgroups). The distribution passes currently assume the last-operation-with-workgroup-tile-sizes case, which will often be the same as the root operation, but not always. A flag will let us direct workgroup tiling to always choose the right op based on what the rest of the current pipeline expects.

AaronStGeorge avatar May 27 '25 23:05 AaronStGeorge

One question: @hanhanW mentioned that there are some places later in the pipeline that expect an scf.for. Do you know where a good place would be to swap the scf.forall back to an scf.for? Would there be any problems with swapping the scf.forall to an scf.for in the same pass, right after the fusion is done?

It should happen before vectorization. However, there could be some optimizations that run as pre-vectorization passes, e.g., peeling. If there is peeling, you should convert the scf.forall to scf.for before it; otherwise, peeling might be broken.

https://github.com/iree-org/iree/blob/bb906c1eed975743ea9e22e2189e307807155644/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp#L419-L447

For this work, you can add it right before vectorization because it is scoped within the mmt4d pipeline for now. We will need the switch for other pipelines in the future, I think.

https://github.com/iree-org/iree/blob/bb906c1eed975743ea9e22e2189e307807155644/compiler/src/iree/compiler/Codegen/LLVMCPU/Passes.cpp#L529-L556

The distribution passes currently assume the last operation with workgroup tilesize attribute case, which will often be the same as the root operation, but not always

I just want to point out that this statement is not accurate. Because of legacy issues, all the backends propagate the lowering_config, so the last compute op could have a lowering_config that was derived from the root op. To be more specific, the config is chosen based on the root op, and the lowering_config propagation broadcasts the config to the other compute ops. In the CPU world, I'd like to remove the propagation in the future; then the flag is no longer needed. I'm not asking you to do this now, but I'd like to share my thoughts. It is okay to do it this way to make progress.

hanhanW avatar May 28 '25 01:05 hanhanW

For this work, you can add it right before vectorization because it is scoped within mmt4d pipeline for now. We will need the switch for other pipelines in the future, I think.

I guess this pipeline occurs somewhat less frequently, but don't we need it here as well?

  • GPU codegen path expects the tiling op to be the last operation with workgroup tilesize attribute

Also, may I ask why this is the case? Is choosing the root op infeasible for some reason here, or is it because producer fusion is preferred over producer+consumer fusion?

egebeysel avatar May 28 '25 13:05 egebeysel

For this work, you can add it right before vectorization because it is scoped within mmt4d pipeline for now. We will need the switch for other pipelines in the future, I think.

I guess this pipeline occurs somewhat less frequently, but don't we need it here as well?

We'll need it for all the pipelines, IMO.

  • GPU codegen path expects the tiling op to be the last operation with workgroup tilesize attribute

Also, may I ask why this is the case? Is choosing the root op infeasible for some reason over here or is it because producer fusion is more favorable to producer+consumer fusion?

I think it is just what's happening today. I don't know what the plan is for GPU, and it may take much more time if they want to catch up with the same strategy.

hanhanW avatar May 28 '25 14:05 hanhanW

We'll need it for all the pipelines, IMO.

Alright, it's because of the flag that it's scoped to just the mmt4d pipelines.

I meant that the pipeline over there was also using the TileRootAndFuseProducerConsumer pass at the vector parallel level with consumer fusion enabled. In that case, this is just an FYI in case you decide not to use the flag and/or if switching to scf.forall is also effective for that pipeline 👍

egebeysel avatar May 28 '25 14:05 egebeysel

Looks like upstream mlir::scf::forallToForLoop doesn't deal with shared_outs -> iter_args.

It seems like a reasonable addition to write, and maybe contribute upstream. The only thing that seems a little tricky is moving the ops inside the scf.forall.in_parallel region into the serial scf.for. There's a bit of an impedance mismatch between the yield-based way scf.for handles results and the region-based way scf.forall handles them. For example, below, the parallel tensor.parallel_insert_slice must be converted to its serial counterpart, tensor.insert_slice. I'm not sure if there's an interface that will give you the "serial dual" of an op?

example:

%50 = scf.forall (%arg3) = (0) to (16) step (2) shared_outs(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
  %51 = tensor.empty() : tensor<1x1x2x16xf32>
  %52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
  %53 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %52) -> (tensor<1x1x2x16xf32>) {
    %extracted_slice_2 = tensor.extract_slice %45[%arg0, %arg5, %arg3, 0] [1, 1, 2, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x1x2x1xf32>
    %extracted_slice_3 = tensor.extract_slice %46[%arg1, %arg5, 0, 0] [1, 1, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x1x16x1xf32>
    %55 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice_2, %extracted_slice_3 : tensor<1x1x2x1xf32>, tensor<1x1x16x1xf32>) outs(%arg6 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
    scf.yield %55 : tensor<1x1x2x16xf32>
  }
  %extracted_slice_0 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
  %extracted_slice_1 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
  %54 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%53, %extracted_slice_0 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_1 : tensor<1x1x2x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
  ^bb0(%in: f32, %in_2: f32, %out: f32):
    %55 = arith.addf %in, %in_2 : f32
    %56 = arith.maximumf %55, %cst : f32
    linalg.yield %56 : f32
  } -> tensor<1x1x2x16xf32>
  scf.forall.in_parallel {
    tensor.parallel_insert_slice %54 into %arg4[%c0, %c0, %arg3, %c0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
  }
}

becomes

%50 = scf.for %arg3 = 0 to 16 step 2 iter_args(%arg4 = %extracted_slice) -> (tensor<1x1x16x16xf32>) {
  %51 = tensor.empty() : tensor<1x1x2x16xf32>
  %52 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} ins(%cst : f32) outs(%51 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
  %53 = scf.for %arg5 = %c0 to %36 step %c1 iter_args(%arg6 = %52) -> (tensor<1x1x2x16xf32>) {
    %extracted_slice_2 = tensor.extract_slice %45[%arg0, %arg5, %arg3, 0] [1, 1, 2, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x1x2x1xf32>
    %extracted_slice_3 = tensor.extract_slice %46[%arg1, %arg5, 0, 0] [1, 1, 16, 1] [1, 1, 1, 1] : tensor<?x?x16x1xf32> to tensor<1x1x16x1xf32>
    %55 = linalg.mmt4d {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0, 0, 0], [1, 1, 0, 2, 16, 0], [0, 0, 1, 0, 0, 1]]>} ins(%extracted_slice_2, %extracted_slice_3 : tensor<1x1x2x1xf32>, tensor<1x1x16x1xf32>) outs(%arg6 : tensor<1x1x2x16xf32>) -> tensor<1x1x2x16xf32>
    scf.yield %55 : tensor<1x1x2x16xf32>
  }
  %extracted_slice_0 = tensor.extract_slice %47[%arg0, %arg3] [1, 2] [1, 1] : tensor<?x16xf32> to tensor<1x2xf32>
  %extracted_slice_1 = tensor.extract_slice %arg4[0, 0, %arg3, 0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x16x16xf32> to tensor<1x1x2x16xf32>
  %54 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d2)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%53, %extracted_slice_0 : tensor<1x1x2x16xf32>, tensor<1x2xf32>) outs(%extracted_slice_1 : tensor<1x1x2x16xf32>) attrs =  {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1, 0, 0], [1, 1, 0, 2], [0, 0, 0, 0], [0, 0, 1, 0]]>} {
  ^bb0(%in: f32, %in_2: f32, %out: f32):
    %55 = arith.addf %in, %in_2 : f32
    %56 = arith.maximumf %55, %cst : f32
    linalg.yield %56 : f32
  } -> tensor<1x1x2x16xf32>
  // the tensor.parallel_insert_slice from the scf.forall.in_parallel region
  // becomes a plain tensor.insert_slice feeding the scf.yield:
  %55 = tensor.insert_slice %54 into %arg4[%c0, %c0, %arg3, %c0] [1, 1, 2, 16] [1, 1, 1, 1] : tensor<1x1x2x16xf32> into tensor<1x1x16x16xf32>
  scf.yield %55 : tensor<1x1x16x16xf32>
}

AaronStGeorge avatar May 29 '25 00:05 AaronStGeorge

Looks like upstream mlir::scf::forallToForLoop doesn't deal with shared_outs -> iter_args.

It seems like a reasonable addition to write, and maybe contribute upstream. The only thing that seems a little tricky is moving the ops inside the scf.forall.in_parallel region into the serial scf.for. There's a bit of an impedance mismatch between the yield-based way scf.for handles results and the region-based way scf.forall handles them. For example, below, the parallel tensor.parallel_insert_slice must be converted to its serial counterpart, tensor.insert_slice. I'm not sure if there's an interface that will give you the "serial dual" of an op?

I am unable to follow. Is the above IR what it currently outputs? If so, it's mapping correctly to the iter_args. The only problem I see is with the tensor.parallel_insert_slice, which will be lowered to a memref.subview operation. Additionally, could you paste the full IR dump (-mlir-print-ir-after-all)?

pashu123 avatar May 29 '25 04:05 pashu123

I think you just need to replace the parallel_insert_slice with a tensor.insert_slice? The upstream method clones the ops unconditionally, which is wrong in this context. That is understandable, because it does not support this case at all.

I may be wrong, because I did not check the parallel ops' semantics and constraints. It is just the idea I get when looking at the IR.
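To make the suggested replacement concrete, here is a rough sketch of turning the ops in the scf.forall.in_parallel region into their serial counterparts. It assumes the enclosing scf.for and its iter_args have already been created, that replacementDests holds the value standing in for each shared_out, and that the parallel inserts appear in the same order as those dests; all of that plumbing is elided:

#include "mlir/Dialect/SCF/IR/SCF.h"
#include "mlir/Dialect/Tensor/IR/Tensor.h"
#include "mlir/IR/PatternMatch.h"

// For every tensor.parallel_insert_slice in the forall terminator, build the
// serial tensor.insert_slice with the same source/offsets/sizes/strides and
// collect the results; these become the operands of the scf.yield.
static llvm::SmallVector<mlir::Value>
serializeInParallelRegion(mlir::RewriterBase &rewriter,
                          mlir::scf::ForallOp forallOp,
                          mlir::ValueRange replacementDests) {
  llvm::SmallVector<mlir::Value> yieldedValues;
  mlir::scf::InParallelOp terminator = forallOp.getTerminator();
  unsigned destIdx = 0;
  for (mlir::Operation &op : terminator.getRegion().front()) {
    auto parallelInsert = llvm::dyn_cast<mlir::tensor::ParallelInsertSliceOp>(&op);
    if (!parallelInsert)
      continue;
    // Insert into the serial loop's iter_arg instead of the shared_out.
    mlir::Value inserted = rewriter.create<mlir::tensor::InsertSliceOp>(
        parallelInsert.getLoc(), parallelInsert.getSource(),
        replacementDests[destIdx++], parallelInsert.getMixedOffsets(),
        parallelInsert.getMixedSizes(), parallelInsert.getMixedStrides());
    yieldedValues.push_back(inserted);
  }
  return yieldedValues;
}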

hanhanW avatar May 29 '25 04:05 hanhanW

The other missing piece is that we seem to be missing the extract_slice at the beginning. @AaronStGeorge I think you can compare the result of scf.for tiling and the result of scf.forall tiling to get the idea.

scf.for op tiling is off in TileRootAndFuseProducerConsumerPass, but it is supported in TileAndFuse which tiles the last operation and fuses all the producers. Please coordinate with @pashu123 to get the IR, if you have questions.

hanhanW avatar May 29 '25 04:05 hanhanW