Odin icon indicating copy to clipboard operation
Odin copied to clipboard

LLVM SIMD code generation bug - "Both operands to ICmp instruction are not of the same type!"

Open ChuuniMage opened this issue 10 months ago • 1 comments

Context

Please provide any relevant information about your setup. This is important in case the issue is not reproducible except for under certain conditions.

  • Operating System & Odin Version: Windows 10, version dev-2024-12:597fba7c3
  • Please paste odin report output:

```
Odin:    dev-2024-12:597fba7c3
OS:      Windows 10 Professional (version: 20H2), build 19042.1706
CPU:     Intel(R) Core(TM) i7-10700KF CPU @ 3.80GHz
RAM:     32637 MiB
Backend: LLVM 18.1.8
```

Expected Behavior

The procedure `GET_ANIM_IDX_SIMD` compiles without an LLVM code generation error.

Current Behavior

The following LLVM error is generated when running `odin run .`:

```
LLVM CODE GEN FAILED FOR PROCEDURE: main.GET_ANIM_IDX_SIMD
define i32 @main.GET_ANIM_IDX_SIMD(ptr %0, i32 %1, ptr noalias nocapture nonnull %__.context_ptr) {
decls:
  %2 = alloca <4 x i32>, align 16
  %3 = alloca i32, align 4
  %4 = alloca <4 x i32>, align 16
  %INDICES = alloca <4 x i32>, align 16
  %GT = alloca <4 x i32>, align 16
  %result = alloca i32, align 4
  br label %entry

entry:                                            ; preds = %decls
  call void @llvm.memcpy.inline.p0.p0.i64(ptr %2, ptr %0, i64 16, i1 false)
  store i32 %1, ptr %3, align 4
  store <4 x i32> zeroinitializer, ptr %4, align 16
  %5 = insertelement <4 x i32> zeroinitializer, i32 %1, i32 0
  %6 = insertelement <4 x i32> %5, i32 %1, i32 1
  %7 = insertelement <4 x i32> %6, i32 %1, i32 2
  %8 = insertelement <4 x i32> %7, i32 %1, i32 3
  store <4 x i32> %8, ptr %4, align 16
  %9 = load <4 x i32>, ptr %4, align 16
  store <4 x i32> %9, ptr %INDICES, align 16
  %10 = load <4 x i32>, ptr %2, align 16
  %11 = icmp sgt <4 x i32> %10, i32 %1
  %12 = sext <4 x i1> %11 to <4 x i32>
  store <4 x i32> %12, ptr %GT, align 16
  %13 = load <4 x i32>, ptr %GT, align 16
  %14 = sub <4 x i32> zeroinitializer, %13
  store <4 x i32> %14, ptr %GT, align 16
  %15 = load <4 x i32>, ptr %GT, align 16
  %16 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %15)
  store i32 %16, ptr %result, align 4
  %17 = load i32, ptr %result, align 4
  ret i32 %17
}

Both operands to ICmp instruction are not of the same type!
  %11 = icmp sgt <4 x i32> %10, i32 %1
```

Steps to Reproduce

Run the provided code

package main;

import "core:fmt";

// Player states that are not tied to a particular character.
// Used as the index type of HBOX_DURATIONS_UNIVERSAL and as the type of the
// `p_state` field of _Player_State_1.
Universal_Player_States :: enum { _5, _2, _Transition_5_to_2, _Transition_2_to_5, _WALK_F, _WALK_B, _DASH_F, _DASH_B, _5G, _2G, _Transition_5G_to_2G, _Transition_2G_to_5G, _Blockstun_5ing, _Blockstun_2ing, _Hitstun_Stand_Highreel, _Hitstun_Stand_Midgut, _Hitstun_Crouch_Reel, _ThrowBreaking, _Dead, _Ringout, }

// Optimise this to make this a flat array of 5 elements? Perhaps 8 for cache alignment?
// Or does just the whole thing need to be cache aligned?
// Curious optimisation cases.
//
// Constant table indexed by Universal_Player_States; each entry is a slice of i32 values.
// NOTE(review): presumably these values are used the same way as the entries of
// HBOX_DURATIONS_CLASSIC, but this table is not read anywhere in the visible code - confirm.
HBOX_DURATIONS_UNIVERSAL :: [Universal_Player_States][]i32{ ._5 = {66, }, ._2 = {255,}, ._Transition_5_to_2 = {3,}, ._Transition_2_to_5 = {3, }, ._WALK_F = {32,}, ._WALK_B = {28,}, ._DASH_F = {16,}, ._DASH_B = {14,}, ._5G = {255,}, ._2G = {255,}, ._Transition_5G_to_2G = {5,}, ._Transition_2G_to_5G = {4,}, ._Blockstun_5ing = {255,}, ._Blockstun_2ing = {255,}, ._Hitstun_Stand_Highreel = {255,}, ._Hitstun_Stand_Midgut = {255,}, ._Hitstun_Crouch_Reel = {255,}, ._ThrowBreaking = {38,}, ._Dead = {35, 155}, ._Ringout = {255,}, }

// Constant table indexed by Classic_Player_State; each entry is a slice of i32
// values listed in ascending order.
// main passes the `._5S` entry as the `dur` argument of GET_ANIM_IDX_ORIGINAL,
// which compares each value against a state counter. The same four `._5S`
// values are repeated in _5S_SIMD.
HBOX_DURATIONS_CLASSIC :: [Classic_Player_State][]i32{ ._5S = {11, 14, 18, 27,}, ._2S = {11, 13, 17, 33,}, ._6S = {13, 15, 26, 32, 36,}, ._3S = {14, 16, 20, 42,}, ._8S = {7, 19, 21, 27, 42,}, ._5Throw = {9, 14, 33,}, ._5Throw_Hitting = {12, 14, 36}, ._4Throw = {9, 14, 33,}, ._4Throw_Hitting = {24, 34, 45, 52,}, }

// The available characters; this is the type of Player_Data.character.
Character :: enum { Classic, Wrassler, Striker, CounterMan, }

// States specific to the Classic character. Index type of HBOX_DURATIONS_CLASSIC
// and type of the `c_state` field of _Player_State_2.
Classic_Player_State :: enum { _5S, _2S, _6S, _3S, _8S, _5Throw, _5Throw_Hitting, _4Throw, _4Throw_Hitting, }

// States specific to the Wrassler character; type of the `c_state` field of _Player_State_3.
Wrassler_Player_State :: enum { _5S, _2S, _6S, _3S, _4S, _5Throw, _5Throw_Hitting, _4Throw, _4Throw_Hitting, _6Throw, _6Throw_Hitting, }

// States specific to the Striker character; type of the `c_state` field of _Player_State_4.
Striker_Player_State :: enum { _5S, _2S, _6S, _6Sstr_S, _3S, _1S, _5Throw, _5Throw_Hitting, _6Throw, _6Throw_Hitting, }

// States specific to the CounterMan character; type of the `c_state` field of _Player_State_5.
CounterMan_Player_State :: enum { _5S, _2S, _6S, _8S, _3S, _1S, _5Throw, _5Throw_Hitting, _6Throw, _6Throw_Hitting, _4Throw, _4Throw_Hitting, }

// Two-valued tag stored in the 1-bit `which` field of every _Player_State_N bit_field.
Which_Player_State :: enum { _Universal, _CharSpecific, }

// Problem: Need to use an enum for which character they are using anyway
// so which should just be 1 bit 61 thingo anyway

// Each _Player_State_N below is a bit_field backed by a u64: a 1-bit `which`
// tag followed by a 63-bit state value whose enum type differs per variant.

// Variant carrying a universal state in `p_state`.
_Player_State_1 :: bit_field u64 { which: Which_Player_State | 1, p_state: Universal_Player_States | 63, }

// Variant carrying a Classic state in `c_state`.
_Player_State_2 :: bit_field u64 { which: Which_Player_State | 1, c_state: Classic_Player_State | 63, }

// Variant carrying a Wrassler state in `c_state`.
_Player_State_3 :: bit_field u64 { which: Which_Player_State | 1, c_state: Wrassler_Player_State | 63, }

// Variant carrying a Striker state in `c_state`.
_Player_State_4 :: bit_field u64 { which: Which_Player_State | 1, c_state: Striker_Player_State | 63, }

// Variant carrying a CounterMan state in `c_state`.
_Player_State_5 :: bit_field u64 { which: Which_Player_State | 1, c_state: CounterMan_Player_State | 63, }

// Raw union overlaying all five bit_field variants on the same storage, each
// pulled in with `using`.
// NOTE(review): four of the variants declare a field named `c_state` with
// different enum types, and main assigns `honk.c_state = ._5S` - confirm which
// variant that access resolves to.
Player_State :: struct #raw_union { using _: _Player_State_1, using _: _Player_State_2, using _: _Player_State_3, using _: _Player_State_4, using _: _Player_State_5, }

// bit_field backed by a u64 with a single 32-bit i32 field.
Cursed_1 :: bit_field u64 { uh_oh: i32 | 32, }

// bit_field backed by a u64 with a single 8-bit u8 field.
// Not referenced by any other declaration in the visible code.
Cursed_2 :: bit_field u64 { uh_oh: u8 | 8, }

// Raw union that pulls in only Cursed_1 via `using`.
// Not referenced by any procedure in the visible code.
Cursed :: struct #raw_union { using _:Cursed_1 }

// Groups a state counter, the selected Character and the current Player_State.
// Not instantiated anywhere in the visible code.
Player_Data :: struct { state_counter: i32, character: Character, current_state: Player_State, }

// Scalar lookup: returns the index of the first entry of `dur` that is
// greater than or equal to `state_counter`.
//
// Inputs:
// - dur:           values to compare against, scanned front to back
// - state_counter: value being looked up
//
// Returns: the index (as i32) of the first entry with state_counter <= entry,
// or 0 when `dur` is empty or every entry is smaller than `state_counter`.
GET_ANIM_IDX_ORIGINAL :: proc (dur: []i32, state_counter: i32) -> i32 {
	for threshold, position in dur {
		if state_counter <= threshold {
			return i32(position)
		}
	}
	// No entry qualified: same value the result starts out with.
	return 0
}

// The four values of HBOX_DURATIONS_CLASSIC[._5S] as a 4-lane i32 SIMD
// constant; main passes it to GET_ANIM_IDX_SIMD.
_5S_SIMD :: #simd[4]i32{11, 14, 18, 27,}

import "core:simd"

// SIMD counterpart of GET_ANIM_IDX_ORIGINAL and the procedure named in the
// reported "LLVM CODE GEN FAILED" error. It is the reproduction case for this
// issue, so the code is intentionally left exactly as reported.
//
// Inputs:
// - dur:           four i32 values, one per lane
// - state_counter: scalar value compared against the lanes
//
// Returns: the sum of the negated comparison lanes, transmuted to i32.
GET_ANIM_IDX_SIMD :: proc (dur:#simd[4]i32, state_counter:i32) -> i32 {

	// state_counter copied into all four lanes.
	// NOTE(review): INDICES is never read after this line.
	INDICES : #simd[4]i32 = {state_counter, state_counter, state_counter, state_counter}

	// NOTE(review): the second operand is the scalar `state_counter`, not the
	// vector INDICES. This matches the failing instruction in the reported IR
	// (`icmp sgt <4 x i32> %10, i32 %1`); presumably INDICES was meant to be
	// passed here - confirm.
	// NOTE(review): GET_ANIM_IDX_ORIGINAL selects on `state_counter <= el`,
	// whereas this compares `dur > state_counter`; verify the comparison
	// direction gives the values in expected_5S_index once the call compiles.
	GT := simd.lanes_gt(dur, state_counter)
	// In the reported IR a true lane is sign-extended to all ones, so negating
	// turns it into 1 and the reduction below counts the true lanes.
	GT = simd.neg(GT)
	result := simd.reduce_add_ordered(GT)
	return transmute(i32)result
}

// import "shared:prof"

// Expected lookup result for every state counter from 0 through 27 against the
// `._5S` values {11, 14, 18, 27}: counters 0-11 give 0, 12-14 give 1,
// 15-18 give 2, and 19-27 give 3. main asserts both GET_ANIM_IDX_ORIGINAL and
// GET_ANIM_IDX_SIMD against this table.
expected_5S_index := [28]i32 { 0 = 0, 1 = 0, 2 = 0, 3 = 0, 4 = 0, 5 = 0, 6 = 0, 7 = 0, 8 = 0, 9 = 0, 10 = 0, 11 = 0, 12 = 1, 13 = 1, 14 = 1, 15 = 2, 16 = 2, 17 = 2, 18 = 2, 19 = 3, 20 = 3, 21 = 3, 22 = 3, 23 = 3, 24 = 3, 25 = 3, 26 = 3, 27 = 3, }

// Entry point of the reproduction program.
// 1. Writes three fields of a Player_State value.
// 2. Checks GET_ANIM_IDX_ORIGINAL and GET_ANIM_IDX_SIMD against
//    expected_5S_index for counters 0 through 27.
// 3. Runs each lookup in a 1,000,000 x 28 loop (the profiling calls around
//    them are commented out).
// 4. Prints the `which` field of the Player_State value.
main :: proc() {
	fmt.printf("Hello, world! Your Odin project is set up.\n")

	// The fields live in a raw union, so the order of these writes is kept.
	state: Player_State
	state.which = ._Universal
	state.p_state = ._5
	state.c_state = ._5S

	for counter in i32(0)..=27 {
		got := GET_ANIM_IDX_ORIGINAL(HBOX_DURATIONS_CLASSIC[._5S], counter)
		assert(got == expected_5S_index[counter])
	}
	fmt.printf("Original test passed!")

	for counter in i32(0)..=27 {
		got := GET_ANIM_IDX_SIMD(_5S_SIMD, counter)
		assert(got == expected_5S_index[counter])
	}
	fmt.printf("SIMD test passed!")

	latest: i32
	// prof.TIME_TYPE = .micros

	fmt.printf("Original algorithm: ")
	for _ in 0..<1_000_000 {
		for counter in i32(0)..=27 {
			// prof.profile_start(.micros)
			latest = GET_ANIM_IDX_ORIGINAL(HBOX_DURATIONS_CLASSIC[._5S], counter)
			// prof.profile_end()
			assert(latest != -1)
		}
	}
	// prof.average_profiling_data()

	fmt.printf("SIMD algorithm: ")
	for _ in 0..<1_000_000 {
		for counter in i32(0)..=27 {
			// prof.profile_start(.micros)
			latest = GET_ANIM_IDX_SIMD(_5S_SIMD, counter)
			// prof.profile_end()
			assert(latest != -1)
		}
	}
	// prof.average_profiling_data()

	fmt.printf("cursed %v \n", state.which)
}

ChuuniMage avatar Dec 20 '24 06:12 ChuuniMage