Hang in iree-run-module on ONNX dequantizelinear test case
I'm seeing iree-run-module hang indefinitely with this call stack:
[Inline Frame] iree-run-module.exe!iree_futex_wait(void * expected_value, unsigned int) Line 148 (d:\dev\projects\iree\runtime\src\iree\base\internal\synchronization.c:148)
iree-run-module.exe!iree_notification_commit_wait(iree_notification_t * notification, unsigned int wait_token, __int64 spin_ns, __int64 deadline_ns) Line 819 (d:\dev\projects\iree\runtime\src\iree\base\internal\synchronization.c:819)
iree-run-module.exe!iree_notification_await(iree_notification_t * notification, bool(*)(void *) condition_fn, void * condition_arg, iree_timeout_t timeout) Line 875 (d:\dev\projects\iree\runtime\src\iree\base\internal\synchronization.c:875)
iree-run-module.exe!iree_hal_sync_semaphore_wait(iree_hal_semaphore_t * base_semaphore, unsigned __int64 value, iree_timeout_t timeout) Line 303 (d:\dev\projects\iree\runtime\src\iree\hal\drivers\local_sync\sync_semaphore.c:303)
iree-run-module.exe!iree_hal_semaphore_wait(iree_hal_semaphore_t * semaphore, unsigned __int64 value, iree_timeout_t timeout) Line 78 (d:\dev\projects\iree\runtime\src\iree\hal\semaphore.c:78)
iree-run-module.exe!iree_hal_semaphore_list_wait(iree_hal_semaphore_list_t semaphore_list, iree_timeout_t timeout) Line 201 (d:\dev\projects\iree\runtime\src\iree\hal\semaphore.c:201)
iree-run-module.exe!iree_hal_fence_wait(iree_hal_fence_t * fence, iree_timeout_t timeout) Line 253 (d:\dev\projects\iree\runtime\src\iree\hal\fence.c:253)
iree-run-module.exe!iree_hal_module_fence_await(iree_vm_stack_t * stack, void * module, iree_hal_module_state_t * state, iree_vm_abi_iCrD_t * args, iree_vm_abi_i_t * rets) Line 1521 (d:\dev\projects\iree\runtime\src\iree\modules\hal\module.c:1521)
iree-run-module.exe!iree_vm_shim_iCrD_i(iree_vm_stack_t * stack, unsigned int flags, iree_byte_span_t args_storage, iree_byte_span_t rets_storage, iree_status_handle_t *(*)(iree_vm_stack_t *, void *, void *, const void *, void *) target_fn, void * module, void * module_state) Line 76 (d:\dev\projects\iree\runtime\src\iree\vm\shims.c:76)
iree-run-module.exe!iree_vm_native_module_issue_call(iree_vm_native_module_t * module, iree_vm_stack_t * stack, iree_vm_stack_frame_t * callee_frame, unsigned int flags, iree_byte_span_t args_storage, iree_byte_span_t rets_storage) Line 342 (d:\dev\projects\iree\runtime\src\iree\vm\native_module.c:342)
iree-run-module.exe!iree_vm_native_module_begin_call(void * self, iree_vm_stack_t * stack, iree_vm_function_call_t call) Line 394 (d:\dev\projects\iree\runtime\src\iree\vm\native_module.c:394)
iree-run-module.exe!iree_vm_bytecode_issue_import_call(iree_vm_stack_t * stack, const iree_vm_function_call_t call, iree_string_view_t cconv_results, const iree_vm_register_list_t * dst_reg_list, iree_vm_stack_frame_t * * out_caller_frame, iree_vm_registers_t * out_caller_registers) Line 453 (d:\dev\projects\iree\runtime\src\iree\vm\bytecode\dispatch.c:453)
iree-run-module.exe!iree_vm_bytecode_call_import_variadic(iree_vm_stack_t * stack, const iree_vm_bytecode_module_state_t * module_state, unsigned int import_ordinal, const iree_vm_registers_t caller_registers, const iree_vm_register_list_t * segment_size_list, const iree_vm_register_list_t * src_reg_list, const iree_vm_register_list_t * dst_reg_list, iree_vm_stack_frame_t * * out_caller_frame, iree_vm_registers_t * out_caller_registers) Line 612 (d:\dev\projects\iree\runtime\src\iree\vm\bytecode\dispatch.c:612)
iree-run-module.exe!iree_vm_bytecode_dispatch(iree_vm_stack_t * stack, iree_vm_bytecode_module_t * module, iree_vm_stack_frame_t * current_frame, iree_vm_registers_t regs, iree_byte_span_t call_results) Line 1716 (d:\dev\projects\iree\runtime\src\iree\vm\bytecode\dispatch.c:1716)
iree-run-module.exe!iree_vm_bytecode_dispatch_begin(iree_vm_stack_t * stack, iree_vm_bytecode_module_t * module, const iree_vm_function_call_t call, iree_string_view_t cconv_arguments, iree_string_view_t cconv_results) Line 638 (d:\dev\projects\iree\runtime\src\iree\vm\bytecode\dispatch.c:638)
Original test case: https://github.com/nod-ai/SHARK-TestSuite/tree/main/iree_tests/onnx/node/generated/test_dequantizelinear
Input MLIR:
module {
func.func @test_dequantizelinear(%arg0: !torch.vtensor<[4],ui8>, %arg1: !torch.vtensor<[],f32>, %arg2: !torch.vtensor<[],ui8>) -> !torch.vtensor<[4],f32> attributes {torch.onnx_meta.ir_version = 10 : si64, torch.onnx_meta.opset_version = 21 : si64, torch.onnx_meta.producer_name = "backend-test", torch.onnx_meta.producer_version = ""} {
%none = torch.constant.none
%0 = torch.operator "onnx.DequantizeLinear"(%arg0, %arg1, %arg2) : (!torch.vtensor<[4],ui8>, !torch.vtensor<[],f32>, !torch.vtensor<[],ui8>) -> !torch.vtensor<[4],f32>
return %0 : !torch.vtensor<[4],f32>
}
}
After iree-compile --compile-to=input dequantize_linear.mlir:
module attributes {hal.device.targets = [#hal.device.target<"llvm-cpu", [#hal.executable.target<"llvm-cpu", "embedded-elf-x86_64", {cpu = "generic", cpu_features = "", data_layout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128", native_vector_size = 16 : i64, target_triple = "x86_64-unknown-unknown-eabi-elf"}>]>]} {
util.func public @test_dequantizelinear$async(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view, %arg3: !hal.fence, %arg4: !hal.fence) -> !hal.buffer_view attributes {iree.abi.model = "coarse-fences", iree.abi.stub} {
%0 = hal.tensor.import wait(%arg3) => %arg0 : !hal.buffer_view -> tensor<4xi8>
%1 = hal.tensor.import wait(%arg3) => %arg1 : !hal.buffer_view -> tensor<f32>
%2 = hal.tensor.import wait(%arg3) => %arg2 : !hal.buffer_view -> tensor<i8>
%extracted = tensor.extract %1[] : tensor<f32>
%3 = arith.extf %extracted : f32 to f64
%extracted_0 = tensor.extract %2[] : tensor<i8>
%4 = arith.extsi %extracted_0 : i8 to i64
%5 = tensor.empty() : tensor<4xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0) -> (d0)>, affine_map<(d0) -> (d0)>], iterator_types = ["parallel"]} ins(%0 : tensor<4xi8>) outs(%5 : tensor<4xf32>) {
^bb0(%in: i8, %out: f32):
%9 = arith.extui %in : i8 to i32
%10 = arith.trunci %4 : i64 to i32
%11 = arith.subi %9, %10 : i32
%12 = arith.uitofp %11 : i32 to f32
%13 = arith.truncf %3 : f64 to f32
%14 = arith.mulf %12, %13 : f32
linalg.yield %14 : f32
} -> tensor<4xf32>
%7 = hal.tensor.barrier join(%6 : tensor<4xf32>) => %arg4 : !hal.fence
%8 = hal.tensor.export %7 : tensor<4xf32> -> !hal.buffer_view
util.return %8 : !hal.buffer_view
}
util.func public @test_dequantizelinear(%arg0: !hal.buffer_view, %arg1: !hal.buffer_view, %arg2: !hal.buffer_view) -> !hal.buffer_view attributes {iree.abi.stub} {
%c-1_i32 = arith.constant -1 : i32
%c0 = arith.constant 0 : index
%device_0 = hal.devices.get %c0 : !hal.device
%0 = util.null : !hal.fence
%fence = hal.fence.create device(%device_0 : !hal.device) flags("None") : !hal.fence
%1 = util.call @test_dequantizelinear$async(%arg0, %arg1, %arg2, %0, %fence) : (!hal.buffer_view, !hal.buffer_view, !hal.buffer_view, !hal.fence, !hal.fence) -> !hal.buffer_view
%status = hal.fence.await until([%fence]) timeout_millis(%c-1_i32) : i32
util.return %1 : !hal.buffer_view
}
}
--print-ir-after-all output: https://gist.github.com/ScottTodd/15ea43eae8733806b80d7d9ed1fb4f1a
Run command (using the input .npy files from the test case folder):
iree-run-module \
--device=local-sync
--module=dequantize_linear_cpu.vmfb \
--input=@/path/to/input_0.npy \
--input=@/path/to/input_1.npy \
--input=@/path/to/input_2.npy
Running those .npy files through iree\tools\test\echo_npy.py produces
[ 0. 2. 3. 1000. -254. -1000.]
2.0
128
(first is type ui8 though, so why negative?)
try --trace_execution?
try --trace_execution?
[module.__init+00000000] <block>
[module.__init+00000001] %r0 = vm.const.ref.zero
[module.__init+00000004] %i0 = vm.const.i32 2 // 0x00000002
[module.__init+0000000B] %i1 = vm.const.i32 7 // 0x00000007
[module.__init+00000012] %i2 = vm.const.i32 1 // 0x00000001
[module.__init+00000019] %i3 = vm.const.i32 14 // 0x0000000E
[module.__init+00000020] %i4:5 = vm.const.i64 -1 // 0xFFFFFFFFFFFFFFFF
[module.__init+0000002B] %i6:7 = vm.const.i64.zero
[module.__init+0000002E] %i8 = vm.const.i32.zero
[module.__init+00000031] %r1 = vm.call @hal.devices.get(%i8(0))
[module.__init+0000003E] %r2 = vm.const.ref.rodata 1 // 0x0000019750ECD048 21b
[module.__init+00000045] %r3 = vm.const.ref.rodata 2 // 0x0000019750ECD06C 19b
[module.__init+0000004C] %i9, %i10 = vm.call @hal.device.query.i64(%r1(!hal.device/0x0000019750EAC9C0), %r2(!vm.buffer/0x0000019750ECB690), %r3(!vm.buffer/0x0000019750ECB6B8))
[module.__init+00000060] %i10 = vm.trunc.i64.i32 %i10:11(1)
[module.__init+00000065] %i10 = vm.and.i32 %i10(1), %i2(1)
[module.__init+0000006C] %i9 = vm.select.i32 %i9(1) ? %i10(1) : %i8(0)
[module.__init+00000075] %r2 = vm.call.varadic @hal.descriptor_set_layout.create(%r1(!hal.device/0x0000019750EAC9C0), %i8(0), %i8(0), %i1(7), %i2(1), %i2(1), %i1(7), %i8(0))
[module.__init+00000098] %r2 = vm.call.varadic @hal.pipeline_layout.create(%r1(!hal.device/0x0000019750EAC9C0), %i0(2), %r2(!hal.descriptor_set_layout/0x0000019750EBA180))
[module.__init+000000B2] %i0:1 = vm.select.i64 %i9(1) ? %i6:7(0) : %i4:5(-1)
[module.__init+000000BB] %i0 = vm.cmp.eq.i64 %i0:1(0), %i6:7(0)
[module.__init+000000C2] vm.global.store.i32 %i9(1), .rwdata[0]
[module.__init+000000C9] vm.global.store.ref %r2(!hal.pipeline_layout/0x0000019750EBABD0), .refs[0] : !hal.pipeline_layout
[module.__init+000000D4] vm.cond_br %i0(1), ^000000E4(), ^00000126()
[module.__init+000000E5] %r2 = vm.global.load.ref .refs[0](!hal.pipeline_layout/0x0000019750EBABD0) : !hal.pipeline_layout
[module.__init+000000F0] %r4 = vm.const.ref.rodata 0 // 0x0000019750ECDBC0 3840b
[module.__init+000000F7] %r0 = vm.call.varadic @hal.executable.create(%r1(!hal.device/0x0000019750EAC9C0), %r3(!vm.buffer/0x0000019750ECB6B8), %r4(!vm.buffer/0x0000019750ECB668), %r0(null), %r2(!hal.pipeline_layout/0x0000019750EBABD0))
[module.__init+00000118] vm.global.store.ref %r0(!hal.executable/0x0000019750ED4630), .refs[1] : !hal.executable
[module.__init+00000123] vm.return
EXEC @test_dequantizelinear
[module.test_dequantizelinear+00000000] <block>
[module.test_dequantizelinear+00000001] %i0 = vm.const.i32 3 // 0x00000003
[module.test_dequantizelinear+00000008] %i1 = vm.const.i32 48 // 0x00000030
[module.test_dequantizelinear+0000000F] %i2 = vm.const.i32 4 // 0x00000004
[module.test_dequantizelinear+00000016] %i3 = vm.const.i32 13 // 0x0000000D
[module.test_dequantizelinear+0000001D] %i4 = vm.const.i32 28 // 0x0000001C
[module.test_dequantizelinear+00000024] %i5 = vm.const.i32 150998019 // 0x09000C03
[module.test_dequantizelinear+0000002B] %i6 = vm.const.i32 86 // 0x00000056
[module.test_dequantizelinear+00000032] %i7 = vm.const.i32 553648160 // 0x21000020
[module.test_dequantizelinear+00000039] %i8 = vm.const.i32 3075 // 0x00000C03
[module.test_dequantizelinear+00000040] %i9 = vm.const.i32 16 // 0x00000010
[module.test_dequantizelinear+00000047] %i10 = vm.const.i32 1 // 0x00000001
[module.test_dequantizelinear+0000004E] %i11 = vm.const.i32 268435464 // 0x10000008
[module.test_dequantizelinear+00000055] %r3 = vm.const.ref.zero
[module.test_dequantizelinear+00000058] %i12 = vm.const.i32.zero
[module.test_dequantizelinear+0000005B] %i13 = vm.const.i32 -1 // 0xFFFFFFFF
[module.test_dequantizelinear+00000062] %i14:15 = vm.const.i64 4 // 0x0000000000000004
[module.test_dequantizelinear+0000006D] %i16:17 = vm.const.i64 1 // 0x0000000000000001
[module.test_dequantizelinear+00000078] %i18:19 = vm.const.i64 16 // 0x0000000000000010
[module.test_dequantizelinear+00000083] %i20:21 = vm.const.i64.zero
[module.test_dequantizelinear+00000086] %i22:23 = vm.const.i64 64 // 0x0000000000000040
[module.test_dequantizelinear+00000091] %i24:25 = vm.const.i64 128 // 0x0000000000000080
[module.test_dequantizelinear+0000009C] %i26:27 = vm.const.i64 -1 // 0xFFFFFFFFFFFFFFFF
[module.test_dequantizelinear+000000A7] %i28 = vm.global.load.i32 .rwdata[0](1)
[module.test_dequantizelinear+000000AE] %r4 = vm.global.load.ref .refs[0](!hal.pipeline_layout/0x0000019750EBABD0) : !hal.pipeline_layout
[module.test_dequantizelinear+000000B9] %r5 = vm.global.load.ref .refs[1](!hal.executable/0x0000019750ED4630) : !hal.executable
[module.test_dequantizelinear+000000C4] %r6 = vm.call @hal.devices.get(%i12(0))
[module.test_dequantizelinear+000000D2] %r7 = vm.call @hal.fence.create(%r6(!hal.device/0x0000019750EAC9C0), %i12(0))
[module.test_dequantizelinear+000000E2] %r8 = vm.const.ref.rodata 3 // 0x0000019750ECD08C 6b
[module.test_dequantizelinear+000000E9] vm.call.varadic @hal.buffer_view.assert(%r0(!hal.buffer_view/0x0000019750EBA590), %r8(!vm.buffer/0x0000019750ECB6E0), %i11(268435464), %i10(1), %i14(4))
[module.test_dequantizelinear+00000108] %r0 = vm.call @hal.buffer_view.buffer(%r0(!hal.buffer_view/0x0000019750EBA590))
[module.test_dequantizelinear+00000116] %r9 = vm.call @hal.device.allocator(%r6(!hal.device/0x0000019750EAC9C0))
[module.test_dequantizelinear+00000124] vm.call @hal.buffer.assert(%r0(!hal.buffer/0x0000019750EC6380), %r8(!vm.buffer/0x0000019750ECB6E0), %r9(!hal.allocator/0x0000019750ED55D0), %i14(4), %i9(16), %i8(3075))
[module.test_dequantizelinear+0000013A] vm.call.varadic @hal.buffer_view.assert(%r1(!hal.buffer_view/0x0000019750F3AA10), %r8(!vm.buffer/0x0000019750ECB6E0), %i7(553648160), %i10(1))
[module.test_dequantizelinear+00000158] %r1 = vm.call @hal.buffer_view.buffer(%r1(!hal.buffer_view/0x0000019750F3AA10))
[module.test_dequantizelinear+00000166] vm.call @hal.buffer.assert(%r1(!hal.buffer/0x0000019750EC6280), %r8(!vm.buffer/0x0000019750ECB6E0), %r9(!hal.allocator/0x0000019750ED55D0), %i14(4), %i9(16), %i8(3075))
[module.test_dequantizelinear+0000017C] vm.call.varadic @hal.buffer_view.assert(%r2(!hal.buffer_view/0x0000019750F3A550), %r8(!vm.buffer/0x0000019750ECB6E0), %i11(268435464), %i10(1))
[module.test_dequantizelinear+0000019A] %r2 = vm.call @hal.buffer_view.buffer(%r2(!hal.buffer_view/0x0000019750F3A550))
[module.test_dequantizelinear+000001A8] vm.call @hal.buffer.assert(%r2(!hal.buffer/0x0000019750EC5B80), %r8(!vm.buffer/0x0000019750ECB6E0), %r9(!hal.allocator/0x0000019750ED55D0), %i16(1), %i9(16), %i8(3075))
[module.test_dequantizelinear+000001BE] %r8 = vm.call @hal.fence.create(%r6(!hal.device/0x0000019750EAC9C0), %i12(0))
[module.test_dequantizelinear+000001CE] %r9 = vm.call @hal.device.queue.alloca(%r6(!hal.device/0x0000019750EAC9C0), %i26(4294967295), %r3(null), %r8(!hal.fence/0x0000019750F3A690), %i12(0), %i6(86), %i5(150998019), %i24(128))
[module.test_dequantizelinear+000001EA] %r10 = vm.call @hal.command_buffer.create(%r6(!hal.device/0x0000019750EAC9C0), %i10(1), %i10(1), %i12(0))
[module.test_dequantizelinear+000001FE] vm.call @hal.command_buffer.copy_buffer(%r10(!hal.command_buffer/0x0000019750F02B50), %r1(!hal.buffer/0x0000019750EC6280), %i20(0), %r9(!hal.buffer/0x0000019750F08440), %i20(0), %i14(4))
[module.test_dequantizelinear+00000214] vm.call @hal.command_buffer.copy_buffer(%r10(!hal.command_buffer/0x0000019750F02B50), %r2(!hal.buffer/0x0000019750EC5B80), %i20(0), %r9(!hal.buffer/0x0000019750F08440), %i22(64), %i16(1))
[module.test_dequantizelinear+0000022A] vm.call @hal.command_buffer.execution_barrier(%r10(!hal.command_buffer/0x0000019750F02B50), %i4(28), %i3(13), %i12(0))
[module.test_dequantizelinear+0000023C] vm.call @hal.command_buffer.finalize(%r10(!hal.command_buffer/0x0000019750F02B50))
[module.test_dequantizelinear+00000248] %r1 = vm.call @hal.fence.create(%r6(!hal.device/0x0000019750EAC9C0), %i12(0))
[module.test_dequantizelinear+00000258] vm.call.varadic @hal.device.queue.execute(%r6(!hal.device/0x0000019750EAC9C0), %i26(4294967295), %r8(!hal.fence/0x0000019750F3A690), %r1(!hal.fence/0x0000019750F3A1D0), %r10(!hal.command_buffer/0x0000019750F02B50))
[module.test_dequantizelinear+00000278] %i5 = vm.call.varadic @hal.fence.await(%i13(4294967295), %r1(!hal.fence/0x0000019750F3A1D0))
[module.test_dequantizelinear+0000028E] vm.cond_br %i5(0), ^00000348(), ^0000029E()
[module.test_dequantizelinear+0000029F] %i5 = vm.call @hal.buffer.load(%r9(!hal.buffer/0x0000019750F08440), %i22(64), %i10(1))
[module.test_dequantizelinear+000002B0] %i6 = vm.call.varadic @hal.fence.await(%i13(4294967295), %r7(!hal.fence/0x0000019750F39D90))
feels like a miscompile to me - I have no idea what's going on there (copies and then a load and such)
That was my feeling too, or a divergent / infinite loop based on the input values (wondering about the ui8 inputs).
I no longer see these hangs locally. Weird, since I want to debug an unrelated hang and thought these could be helpful with debugging :/
Starting to narrow down the poor failure mode here. Seems like pytest-timeout and pytest-retry are (sometimes?) incompatible with one another.
Still not sure why the tests are actually hanging though.