a handle passed as operand #0 and consumed by this operation points to a payload entity more than once
What happened?
When trying to compile an MLIR file exported from a mixed-precision Llama 2 model, I get the error below:
root@aiinfra-C9X299-PGF:/home/admin/iree-dist# ./bin/iree-compile --iree-hal-target-backends=cuda --iree-hal-cuda-llvm-target-arch=sm_80 --mlir-print-op-on-diagnostic=false ../SHARK-Turbine/models/Llama_2_7b_chat_hf_fp16.mlir -o llama.vmfb
failed to translate executables
../SHARK-Turbine/models/Llama_2_7b_chat_hf_fp16.mlir:1033:11: error: a handle passed as operand #0 and consumed by this operation points to a payload entity more than once
%37 = torch.aten.mul.Tensor %36, %35 : !torch.vtensor<[4096],f16>, !torch.vtensor<[1,?,4096],f16> -> !torch.vtensor<[1,?,4096],f16>
^
../SHARK-Turbine/models/Llama_2_7b_chat_hf_fp16.mlir:360:13: note: called from
%1:65 = call @initialize(%0) : (!torch.vtensor<[1,?],si64>) -> (!torch.vtensor<[1,1],si64>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>)
^
../SHARK-Turbine/models/Llama_2_7b_chat_hf_fp16.mlir:1015:11: note: repeated target op
%27 = torch.prims.convert_element_type %5, %int6_30 : !torch.vtensor<[1,?,4096],f16>, !torch.int -> !torch.vtensor<[1,?,4096],f32>
^
../SHARK-Turbine/models/Llama_2_7b_chat_hf_fp16.mlir:1033:11: error: 'builtin.module' op failed to run transform dialect passes
%37 = torch.aten.mul.Tensor %36, %35 : !torch.vtensor<[4096],f16>, !torch.vtensor<[1,?,4096],f16> -> !torch.vtensor<[1,?,4096],f16>
^
../SHARK-Turbine/models/Llama_2_7b_chat_hf_fp16.mlir:360:13: note: called from
%1:65 = call @initialize(%0) : (!torch.vtensor<[1,?],si64>) -> (!torch.vtensor<[1,1],si64>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>)
^
../SHARK-Turbine/models/Llama_2_7b_chat_hf_fp16.mlir:1033:11: error: failed to run translation of source executable to target executable for backend #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_80", features = "+ptx76", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 166912>>}>
%37 = torch.aten.mul.Tensor %36, %35 : !torch.vtensor<[4096],f16>, !torch.vtensor<[1,?,4096],f16> -> !torch.vtensor<[1,?,4096],f16>
^
../SHARK-Turbine/models/Llama_2_7b_chat_hf_fp16.mlir:360:13: note: called from
%1:65 = call @initialize(%0) : (!torch.vtensor<[1,?],si64>) -> (!torch.vtensor<[1,1],si64>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>, !torch.vtensor<[1,?,32,128],f16>)
Steps to reproduce your issue
- Download the Llama 2 model and export it as MLIR by running the Python script models/turbine_models/custom_models/stateless_llama.py (see the export sketch after these steps)
- Run:
iree-compile --iree-hal-target-backends=cuda --iree-hal-cuda-llvm-target-arch=sm_80 --mlir-print-op-on-diagnostic=false ../SHARK-Turbine/models/Llama_2_7b_chat_hf_fp16.mlir -o llama.vmfb
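For the export step, the invocation looks roughly like the following; the exact flag names are an assumption based on the script's command-line interface and the name of the exported file, so adjust as needed:
python models/turbine_models/custom_models/stateless_llama.py --hf_model_name="meta-llama/Llama-2-7b-chat-hf" --precision=f16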
What component(s) does this issue relate to?
MLIR, Compiler
Version information
63a2d14
Additional context
No response
This issue can be reproduced with the following simple IR:
module {
  func.func @main_graph(%arg0: !torch.vtensor<[1,32,6272],si8>, %arg2: !torch.vtensor<[32],f32>) -> !torch.vtensor<[1,32,6272],f32> attributes {torch.onnx_meta.ir_version = 8 : si64, torch.onnx_meta.opset_version = 17 : si64, torch.onnx_meta.opset_versions = {ai.onnx.contrib = 1 : si64, ai.onnx.ml = 4 : si64, ai.onnx.preview.training = 1 : si64, ai.onnx.training = 1 : si64, com.microsoft = 1 : si64, com.microsoft.experimental = 1 : si64, com.microsoft.nchwc = 1 : si64, org.pytorch.aten = 1 : si64}, torch.onnx_meta.producer_name = "vai_q_onnx", torch.onnx_meta.producer_version = "1.17.0+43059a7"} {
    %14 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<0> : tensor<si8>} : () -> !torch.vtensor<[],si8>
    %15 = torch.operator "onnx.Constant"() {torch.onnx.value = dense<5.000000e-01> : tensor<f32>} : () -> !torch.vtensor<[],f32>
    %1081 = torch.operator "onnx.DequantizeLinear"(%arg0, %15, %14) : (!torch.vtensor<[1,32,6272],si8>, !torch.vtensor<[],f32>, !torch.vtensor<[],si8>) -> !torch.vtensor<[1,32,6272],f32>
    %1082 = torch.operator "onnx.InstanceNormalization"(%1081, %arg2, %arg2) {torch.onnx.epsilon = 9.99999974E-6 : f32} : (!torch.vtensor<[1,32,6272],f32>, !torch.vtensor<[32],f32>, !torch.vtensor<[32],f32>) -> !torch.vtensor<[1,32,6272],f32>
    return %1082 : !torch.vtensor<[1,32,6272],f32>
  }
}
I have attached dump.log generated with '--mlir-print-ir-after-all --mlir-print-ir-before-all --mlir-disable-threading --mlir-elide-elementsattrs-if-larger=4'. The issue does not occur when the backend is CPU, and a similar issue is seen in multiple models from the e2e SHARK test suite.
Command:
iree-compile --iree-hal-target-backends=rocm --iree-input-demote-i64-to-i32 --iree-rocm-target-chip=gfx942 temp.mlir
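Putting the dump flags together with the compile command, dump.log was produced with roughly the following invocation (the stderr redirect is an assumption about how the log was captured):
iree-compile --iree-hal-target-backends=rocm --iree-input-demote-i64-to-i32 --iree-rocm-target-chip=gfx942 --mlir-print-ir-after-all --mlir-print-ir-before-all --mlir-disable-threading --mlir-elide-elementsattrs-if-larger=4 temp.mlir 2> dump.log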
Possible duplicate of https://github.com/iree-org/iree/issues/17874? That had a fix merged - does this still reproduce with https://github.com/iree-org/iree/commit/d1ccc8c5efbf662da49891e9992803a85759420e?
BTW, I wouldn't trust any reproducers with onnx.DequantizeLinear in them until the unit tests for that op pass (https://github.com/iree-org/iree/issues/16666)
I am trying with IREE compiler version 20240724.964 @ 1b60b62ed4b3f2a3c95f42b397889444c7df318c and still see the failure, but when I add the flag '--iree-codegen-llvmgpu-enable-transform-dialect-jit=false', as mentioned in https://github.com/iree-org/iree/issues/17874, it works fine.
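For reference, a sketch of the workaround invocation, assuming the flag is simply appended to the original CUDA compile command from the issue description (it should apply to the rocm command above as well, since both go through the LLVMGPU codegen path):
iree-compile --iree-hal-target-backends=cuda --iree-hal-cuda-llvm-target-arch=sm_80 --iree-codegen-llvmgpu-enable-transform-dialect-jit=false --mlir-print-op-on-diagnostic=false ../SHARK-Turbine/models/Llama_2_7b_chat_hf_fp16.mlir -o llama.vmfb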
https://github.com/iree-org/iree/commit/1b60b62ed4b3f2a3c95f42b397889444c7df318c is from July 23rd. I'd suggest filing issues on IREE only if they repro on the iree/main branch.