goat
goat copied to clipboard
Can't generate assembly on main (0.1.4), but it works on 0.1.3
failed to parse source file ./c/encode_neon_arm64.c: /opt/homebrew/Cellar/llvm/20.1.8/lib/clang/20/include/arm_bf16.h:14:16: unexpected identifier, expected external declaration
I'm building on macOS; here's the example code. It works fine on the latest release (0.1.3) but not on main (0.1.4).
#include <arm_neon.h> // Required for NEON intrinsics
/*
 * encode_neon - copy the raw bit patterns of float32 values into a byte buffer.
 *
 * input_floats: source array; *len floats are read.
 * output_bytes: destination buffer; 4 * (*len) bytes are written.
 * len:          pointer to the number of floats to encode.
 *
 * Complete groups of four floats are moved with one 128-bit NEON load and
 * store. The remaining 0-3 floats are written byte by byte, least
 * significant byte first, which matches the layout the vector store produces
 * on little-endian targets.
 */
void encode_neon(float* input_floats, unsigned char* output_bytes, long *len) {
    const long count = *len;
    long i = 0;

    /* Only enter the vector path while a full group of four floats remains,
     * so neither the load nor the 16-byte store can run past the buffers. */
    for (; i + 3 < count; i += 4) {
        /* Load 4 float32 values into a NEON register. */
        float32x4_t float_vec = vld1q_f32(input_floats + i);
        /* Reinterpret the bit pattern of the floats as 32-bit integers. */
        uint32x4_t int_vec = vreinterpretq_u32_f32(float_vec);
        /* Store the raw byte representation of the four floats. */
        vst1q_u8(output_bytes + (i * 4), vreinterpretq_u8_u32(int_vec));
    }

    /* Scalar tail for the last 0-3 elements. */
    for (; i < count; i++) {
        /* Type-pun through a union; casting float* to uint32_t* would break
         * the strict-aliasing rule. */
        union { float f; uint32_t u; } pun;
        pun.f = input_floats[i];
        const uint32_t bits = pun.u;

        unsigned char *dst = output_bytes + (i * 4);
        dst[0] = bits & 0xFF;
        dst[1] = (bits >> 8) & 0xFF;
        dst[2] = (bits >> 16) & 0xFF;
        dst[3] = (bits >> 24) & 0xFF;
    }
}
/*
 * decode_neon - rebuild float32 values from their raw byte representation.
 *
 * input_bytes:   source buffer; 4 * (*num_elements) bytes are read.
 * output_floats: destination array; *num_elements floats are written.
 * num_elements:  pointer to the number of floats to decode.
 *
 * Elements are handled in groups of four (128-bit vectors), then at most one
 * pair (64-bit vector), then at most one single element. No step reads or
 * writes beyond the 4 * (*num_elements) input bytes or the *num_elements
 * output floats.
 */
void decode_neon(unsigned char* input_bytes, float* output_floats, long *num_elements) {
    const long count = *num_elements;
    long i = 0;

    /* Groups of 4 floats (16 bytes). */
    for (; i + 3 < count; i += 4) {
        /* Load 16 bytes into a NEON register. */
        uint8x16_t byte_vec = vld1q_u8(input_bytes + (i * 4));
        /* Reinterpret the bytes as 32-bit integers, then as float32 values. */
        uint32x4_t int_vec = vreinterpretq_u32_u8(byte_vec);
        float32x4_t float_vec = vreinterpretq_f32_u32(int_vec);
        /* Store the 4 float32 values into the output array. */
        vst1q_f32(output_floats + i, float_vec);
    }

    /* One remaining pair of floats (8 bytes). */
    if (i + 1 < count) {
        uint8x8_t byte_vec = vld1_u8(input_bytes + (i * 4));
        uint32x2_t int_vec = vreinterpret_u32_u8(byte_vec);
        float32x2_t float_vec = vreinterpret_f32_u32(int_vec);
        vst1_f32(output_floats + i, float_vec);
        i += 2;
    }

    /* Final single element. Only 4 input bytes remain, so load them one lane
     * at a time; a full vld1_u8 would read 8 bytes and overrun the buffer. */
    if (i < count) {
        const unsigned char *src = input_bytes + (i * 4);
        uint8x8_t byte_vec = vdup_n_u8(0);
        byte_vec = vld1_lane_u8(src + 0, byte_vec, 0);
        byte_vec = vld1_lane_u8(src + 1, byte_vec, 1);
        byte_vec = vld1_lane_u8(src + 2, byte_vec, 2);
        byte_vec = vld1_lane_u8(src + 3, byte_vec, 3);
        /* Lane 0 of the float view holds the four bytes just loaded. */
        vst1_lane_f32(output_floats + i, vreinterpret_f32_u8(byte_vec), 0);
    }
}