Failure in ConvertToStream, no useful debugging info
What happened?
Compilation fails during conversion to the stream dialect without emitting a useful error message:
repro2.mlir:1:1: error: Failures have been detected while processing an MLIR pass pipeline
module @module {
^
repro2.mlir:1:1: note: Pipeline failed while executing [`ConvertToStreamPass` on 'builtin.module' operation: @module]
Steps to reproduce your issue
Torch dialect minimal reproducer (note: removing %12 and returning %11 instead avoids the error):
module @module {
func.func @main(%arg0: !torch.vtensor<[1,77,4096],f32>, %arg1: !torch.vtensor<[1,2048],f32>, %arg2: !torch.vtensor<[1,77,4096],f32>, %arg3: !torch.vtensor<[1,2048],f32>, %arg4: !torch.vtensor<[1,16,128,128],f32>, %arg5: !torch.vtensor<[1,16,128,128],f32>) -> !torch.vtensor<[50],f32> {
%__auto.sd3.model_sampling.sigmas = util.global.load @__auto.sd3.model_sampling.sigmas : tensor<1000xf32>
%0 = torch_c.from_builtin_tensor %__auto.sd3.model_sampling.sigmas : tensor<1000xf32> -> !torch.vtensor<[1000],f32>
%int0 = torch.constant.int 0
%int-1 = torch.constant.int -1
%1 = torch.aten.select.int %0, %int0, %int-1 : !torch.vtensor<[1000],f32>, !torch.int, !torch.int -> !torch.vtensor<[],f32>
%int1000 = torch.constant.int 1000
%2 = torch.aten.mul.Scalar %1, %int1000 : !torch.vtensor<[],f32>, !torch.int -> !torch.vtensor<[],f32>
%int0_0 = torch.constant.int 0
%int0_1 = torch.constant.int 0
%3 = torch.aten.select.int %0, %int0_0, %int0_1 : !torch.vtensor<[1000],f32>, !torch.int, !torch.int -> !torch.vtensor<[],f32>
%int1000_2 = torch.constant.int 1000
%4 = torch.aten.mul.Scalar %3, %int1000_2 : !torch.vtensor<[],f32>, !torch.int -> !torch.vtensor<[],f32>
%int7 = torch.constant.int 7
%5 = torch.prims.convert_element_type %2, %int7 : !torch.vtensor<[],f32>, !torch.int -> !torch.vtensor<[],f64>
%int7_3 = torch.constant.int 7
%6 = torch.prims.convert_element_type %4, %int7_3 : !torch.vtensor<[],f32>, !torch.int -> !torch.vtensor<[],f64>
%int0_4 = torch.constant.int 0
%int50 = torch.constant.int 50
%none = torch.constant.none
%int0_5 = torch.constant.int 0
%cpu = torch.constant.device "cpu"
%false = torch.constant.bool false
%7 = torch.aten.arange.start %int0_4, %int50, %none, %int0_5, %cpu, %false : !torch.int, !torch.int, !torch.none, !torch.int, !torch.Device, !torch.bool -> !torch.vtensor<[50],si64>
%int1 = torch.constant.int 1
%8 = torch.aten.sub.Tensor %6, %5, %int1 : !torch.vtensor<[],f64>, !torch.vtensor<[],f64>, !torch.int -> !torch.vtensor<[],f64>
%int49 = torch.constant.int 49
%9 = torch.aten.div.Scalar %8, %int49 : !torch.vtensor<[],f64>, !torch.int -> !torch.vtensor<[],f64>
%float2.500000e01 = torch.constant.float 2.500000e+01
%10 = torch.aten.lt.Scalar %7, %float2.500000e01 : !torch.vtensor<[50],si64>, !torch.float -> !torch.vtensor<[50],i1>
%int6 = torch.constant.int 6
%11 = torch.prims.convert_element_type %7, %int6 : !torch.vtensor<[50],si64>, !torch.int -> !torch.vtensor<[50],f32>
%12 = torch.aten.mul.Tensor %9, %11 : !torch.vtensor<[],f64>, !torch.vtensor<[50],f32> -> !torch.vtensor<[50],f32>
return %12 : !torch.vtensor<[50],f32>
}
util.global private @__auto.sd3.model_sampling.sigmas = #stream.parameter.named<"model"::"sd3.model_sampling.sigmas"> : tensor<1000xf32>
}
compile command: iree-compile repro2.mlir --iree-input-type=torch --iree-vm-bytecode-module-output-format=flatbuffer-binary --iree-hal-target-backends=llvm-cpu --iree-llvmcpu-embedded-linker-path=iree-lld --mlir-print-debuginfo --mlir-print-op-on-diagnostic=false --iree-llvmcpu-target-triple=x86_64-linux-gnu --iree-llvmcpu-target-cpu-features=host --iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false
ir before ConvertToStream:
// -----// IR Dump Before ConvertToStreamPass (iree-stream-conversion) //----- // #executable_target_embedded_elf_x86_64_ = #hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "znver3", cpu_features = "+prfchw,-cldemote,+avx,+aes,+sahf,+pclmul,-xop,+crc32,+xsaves,-avx512fp16,-usermsr,-sm4,-egpr,+sse4.1,-avx512ifma,+xsave,-avx512pf,+sse4.2,-tsxldtrk,-ptwrite,-widekl,-sm3,+invpcid,+64bit,+xsavec,-avx10.1-512,-avx512vpopcntdq,+cmov,-avx512vp2intersect,-avx512cd,+movbe,-avxvnniint8,-avx512er,-ccmp,-amx-int8,-kl,-avx10.1-256,-sha512,-avxvnni,-rtm,+adx,+avx2,-hreset,-movdiri,-serialize,+vpclmulqdq,-avx512vl,-uintr,-cf,+clflushopt,-raoint,-cmpccxadd,+bmi,-amx-tile,+sse,-gfni,-avxvnniint16,-amx-fp16,-ndd,+xsaveopt,+rdrnd,-avx512f,-amx-bf16,-avx512bf16,-avx512vnni,-push2pop2,+cx8,-avx512bw,+sse3,-pku,+fsgsbase,+clzero,-mwaitx,-lwp,+lzcnt,+sha,-movdir64b,-ppx,-wbnoinvd,-enqcmd,-prefetchwt1,-avxneconvert,-tbm,-pconfig,-amx-complex,+ssse3,+cx16,+bmi2,+fma,+popcnt,-avxifma,+f16c,-avx512bitalg,+rdpru,+clwb,+mmx,+sse2,+rdseed,-avx512vbmi2,-prefetchi,+rdpid,-fma4,-avx512vbmi,+shstk,+vaes,-waitpkg,-sgx,+fxsr,-avx512dq,+sse4a", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 32 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}> #loc1 = loc("repro2.mlir":25:10) #loc2 = loc("repro2.mlir":34:11) #loc3 = loc("repro2.mlir":3:41) #loc6 = loc("repro2.mlir":7:10) #loc7 = loc("repro2.mlir":12:10) #map = affine_map<(d0) -> (d0)> #map1 = affine_map<(d0) -> ()> #device_target_llvm_cpu = #hal.device.target<"llvm-cpu", [#executable_target_embedded_elf_x86_64_]> module @module attributes {hal.device.targets = [#device_target_llvm_cpu]} { util.global private @hoisted : tensor<50xi64> loc(#loc1) flow.executable private @_initializer_0_dispatch_0 { flow.executable.export public @_initializer_0_dispatch_0_generic_50_i64 workgroups() -> (index, index, index) { %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
loc(#loc1) flow.return %x, %y, %z : index, index, index loc(#loc1) } loc(#loc1) builtin.module { func.func @_initializer_0_dispatch_0_generic_50_i64(%arg0: !flow.dispatch.tensor<writeonly:tensor<50xi64>> loc("repro2.mlir":25:10)) { %0 = tensor.empty() : tensor<50xi64> loc(#loc1) %1 = linalg.generic {indexing_maps = [#map], iterator_types = ["parallel"]} outs(%0 : tensor<50xi64>) { ^bb0(%out: i64 loc("repro2.mlir":25:10)): %2 = linalg.index 0 : index loc(#loc1) %3 = arith.index_cast %2 : index to i64 loc(#loc1) linalg.yield %3 : i64 loc(#loc1) } -> tensor<50xi64> loc(#loc1) flow.dispatch.tensor.store %1, %arg0, offsets = [0], sizes = [50], strides = [1] : tensor<50xi64> -> !flow.dispatch.tensor<writeonly:tensor<50xi64>> loc(#loc1) return loc(#loc1) } loc(#loc1) } loc(#loc1) } loc(#loc1) util.initializer { %0 = flow.dispatch @_initializer_0_dispatch_0::@_initializer_0_dispatch_0_generic_50_i64() : () -> tensor<50xi64> loc(#loc1) util.global.store %0, @hoisted : tensor<50xi64> loc(#loc1) util.return loc(#loc1) } loc(#loc1) util.global private @hoisted_0 : tensor<50xf32> loc(#loc2) flow.executable private @_initializer_1_dispatch_0 { flow.executable.export public @_initializer_1_dispatch_0_generic_50_f32xf32xi64xf32 workgroups() -> (index, index, index) {
%x, %y, %z = flow.dispatch.workgroup_count_from_slice loc(#loc2)
flow.return %x, %y, %z : index, index, index loc(#loc2)
} loc(#loc2)
builtin.module {
func.func @_initializer_1_dispatch_0_generic_50_f32xf32xi64xf32(%arg0: !flow.dispatch.tensor<readonly:tensor<1000xf32>> loc("repro2.mlir":3:41), %arg1: !flow.dispatch.tensor<readonly:tensor<50xi64>> loc("repro2.mlir":25:10), %arg2: !flow.dispatch.tensor<writeonly:tensor<50xf32>> loc("repro2.mlir":34:11)) {
%cst = arith.constant 1.000000e+03 : f32 loc(#loc4)
%cst_0 = arith.constant 4.900000e+01 : f32 loc(#loc4)
%0 = flow.dispatch.tensor.load %arg1, offsets = [0], sizes = [50], strides = [1] : !flow.dispatch.tensor<readonly:tensor<50xi64>> -> tensor<50xi64> loc(#loc2)
%1 = tensor.empty() : tensor<50xf32> loc(#loc5)
%2 = flow.dispatch.tensor.load %arg0, offsets = [999], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1000xf32>> -> tensor<f32> loc(#loc6) %3 = flow.dispatch.tensor.load %arg0, offsets = [0], sizes = [1], strides = [1] : !flow.dispatch.tensor<readonly:tensor<1000xf32>> -> tensor<f32> loc(#loc7)
%4 = linalg.generic {indexing_maps = [#map1, #map1, #map, #map], iterator_types = ["parallel"]} ins(%3, %2, %0 : tensor<f32>, tensor<f32>, tensor<50xi64>) outs(%1 : tensor<50xf32>) {
^bb0(%in: f32 loc("repro2.mlir":12:10), %in_1: f32 loc("repro2.mlir":7:10), %in_2: i64 loc("repro2.mlir":25:10), %out: f32 loc("repro2.mlir":34:11)):
%5 = arith.mulf %in_1, %cst : f32 loc(#loc8)
%6 = arith.mulf %in, %cst : f32 loc(#loc9)
%7 = arith.subf %6, %5 : f32 loc(#loc10)
%8 = arith.sitofp %in_2 : i64 to f32 loc(#loc5)
%9 = arith.divf %7, %cst_0 : f32 loc(#loc11)
%10 = arith.mulf %9, %8 : f32 loc(#loc2)
linalg.yield %10 : f32 loc(#loc2)
} -> tensor<50xf32> loc(#loc2)
flow.dispatch.tensor.store %4, %arg2, offsets = [0], sizes = [50], strides = [1] : tensor<50xf32> -> !flow.dispatch.tensor<writeonly:tensor<50xf32>> loc(#loc2)
return loc(#loc2)
} loc(#loc2)
} loc(#loc2)
} loc(#loc2)
util.initializer {
%hoisted = util.global.load @hoisted : tensor<50xi64> loc(#loc1)
%__auto.sd3.model_sampling.sigmas = util.global.load immutable @__auto.sd3.model_sampling.sigmas : tensor<1000xf32> loc(#loc3)
%0 = flow.dispatch @_initializer_1_dispatch_0::@_initializer_1_dispatch_0_generic_50_f32xf32xi64xf32(%__auto.sd3.model_sampling.sigmas, %hoisted) : (tensor<1000xf32>, tensor<50xi64>) -> tensor<50xf32> loc(#loc2)
util.global.store %0, @hoisted_0 : tensor<50xf32> loc(#loc2)
util.return loc(#loc2)
} loc(#loc2)
util.func public @main$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view, %arg4: !hal.buffer_view, %arg5: !hal.buffer_view, %arg6: !hal.fence, %arg7: !hal.fence) -> !hal.buffer_view attributes {inlining_policy = #util.inline.never, iree.abi.model = "coarse-fences", iree.abi.stub} {
%hoisted_0 = util.global.load @hoisted_0 : tensor<50xf32> loc(#loc2)
%0 = hal.tensor.barrier join(%hoisted_0 : tensor<50xf32>) => %arg7 : !hal.fence loc(#loc12)
%1 = hal.tensor.export %0 : tensor<50xf32> -> !hal.buffer_view loc(#loc12)
util.return %1 : !hal.buffer_view loc(#loc13)
} loc(#loc12)
util.func public @main(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.buffer_view, %arg4: !hal.buffer_view, %arg5: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c-1_i32 = arith.constant -1 : i32 loc(#loc12)
%c0 = arith.constant 0 : index loc(#loc12)
%device_0 = hal.devices.get %c0 : !hal.device loc(#loc12)
%0 = util.null : !hal.fence loc(#loc12)
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence loc(#loc12)
%1 = util.call @main$async(%arg0, %arg1, %arg2, %arg3, %arg4, %arg5, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view loc(#loc12)
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32 loc(#loc12)
util.return %1 : !hal.buffer_view loc(#loc12)
} loc(#loc12)
util.global private @__auto.sd3.model_sampling.sigmas = #stream.parameter.named<"model"::"sd3.model_sampling.sigmas"> : tensor<1000xf32> loc(#loc14)
} loc(#loc)
#loc = loc("repro2.mlir":1:1)
#loc4 = loc(unknown)
#loc5 = loc("repro2.mlir":33:11)
#loc8 = loc("repro2.mlir":9:10)
#loc9 = loc("repro2.mlir":14:10)
#loc10 = loc("repro2.mlir":27:10)
#loc11 = loc("repro2.mlir":29:10)
#loc12 = loc("repro2.mlir":2:3)
#loc13 = loc("repro2.mlir":35:5)
#loc14 = loc("repro2.mlir":37:3)
- Click on '....'
- Scroll down to '....'
- See error
What component(s) does this issue relate to?
No response
Version information
No response
Additional context
No response
I can take a look. Globals should be in the proper order, though, since initialization is performed in the order they are declared in the top-level module. That's probably not the issue here, but you should ensure globals are inserted at the top of the module, or at least before their first use.
Probably unrelated, but this is with your remove-jit-attr branch.
@benvanik OK — after moving the global to the top, it compiles.
Nice — I'll add a pass that runs early and verifies globals are in the correct order (declared before use). ConvertToStream requires that globals be converted prior to their uses, so I suspect that's the issue here.
@stellaraccident this is happening after using externalize_module_parameters(nn.module)
From https://github.com/iree-org/iree-turbine/blob/main/shark_turbine/aot/support/ir_utils.py#L173 I thought we insert globals at the start of the moduleOp — am I misunderstanding something?
Fixed in https://github.com/iree-org/iree-turbine/pull/5