crabml
crabml copied to clipboard
Unsoundness issue in fn vec_fma_f16_f16_neon
Hello, thank you for your contribution in this project, I an testing our static analysis tool in github's Rust project and I notice the following code: https://github.com/crabml/crabml/blob/db55e53884a4740e0cd68e208c0ec4b3892bc97b/crabml-core/src/cpu/buf/buf_f16.rs#L143
fn vec_fma_f16_f16_neon(a: &[f16], b: f16, c: &mut [f16], a_offset: usize, m: usize) {
use crate::cpu::archutil::aarch64 as myaarch64;
unsafe {
let m_rounded = m - m % 16;
let bv = myaarch64::vdupq_n_f16(b.to_bits());
for mi in (0..m_rounded).step_by(16) {
let av0 = myaarch64::vld1q_f16(a.as_ptr().add(a_offset + mi));
let av1 = myaarch64::vld1q_f16(a.as_ptr().add(a_offset + mi + 8));
let cv0 = myaarch64::vld1q_f16(c.as_ptr().add(mi));
let cv1 = myaarch64::vld1q_f16(c.as_ptr().add(mi + 8));
let cv0 = myaarch64::vfmaq_f16(cv0, av0, bv);
let cv1 = myaarch64::vfmaq_f16(cv1, av1, bv);
myaarch64::vst1q_f16(c.as_mut_ptr().add(mi), cv0);
myaarch64::vst1q_f16(c.as_mut_ptr().add(mi + 8), cv1);
}
for mi in m_rounded..m {
c[mi] += a[a_offset + mi] * b;
}
}
}
The issue is in vec_fma_f16_f16_neon where NEON vector operations access 8 elements at a time without bounds checking. When processing in chunks of 16 elements, it accesses c[mi] through c[mi+7] and c[mi+8] through c[mi+15] but there's no validation that these ranges are within bounds. A caller can pass a valid input array but an output array that's too small, leading to undefined behavior. Although it is a private function, I notice a possible way to call this function from a pub function vec_fma_f16_f16.
pub fn vec_fma_f16_f16 -> fn vec_fma_f16_f16_neon
// 函数: vec_fma_f16_f16
pub fn vec_fma_f16_f16(v: &[f16], b: f16, c: &mut [f16], v_offset: usize, m: usize) {
#[cfg(target_arch = "aarch64")] { vec_fma_f16_f16_neon(v, b, c, v_offset, m) }
#[cfg(not(any(target_arch = "aarch64")))]
{ vec_fma_f16_f16_fallback(v, b, c, v_offset, m) }
}
// 函数: vec_fma_f16_f16_neon
#[cfg(target_arch = "aarch64")]
fn vec_fma_f16_f16_neon(a: &[f16], b: f16, c: &mut [f16], a_offset: usize, m: usize) {
use crate::cpu::archutil::aarch64 as myaarch64;
unsafe {
let m_rounded = m - m % 16;
let bv = myaarch64::vdupq_n_f16(b.to_bits());
for mi in (0..m_rounded).step_by(16) {
let av0 = myaarch64::vld1q_f16(a.as_ptr().add(a_offset + mi));
let av1 = myaarch64::vld1q_f16(a.as_ptr().add(a_offset + mi + 8));
let cv0 = myaarch64::vld1q_f16(c.as_ptr().add(mi));
let cv1 = myaarch64::vld1q_f16(c.as_ptr().add(mi + 8));
let cv0 = myaarch64::vfmaq_f16(cv0, av0, bv);
let cv1 = myaarch64::vfmaq_f16(cv1, av1, bv);
myaarch64::vst1q_f16(c.as_mut_ptr().add(mi), cv0);
myaarch64::vst1q_f16(c.as_mut_ptr().add(mi + 8), cv1);
}
for mi in m_rounded..m {
c[mi] += a[a_offset + mi] * b;
}
}
}
PoC
fn main() {
// Create a small output array that's too small for the operation
let v = vec![f16::from_f32(1.0); 16]; // Input array with 16 elements
let mut c = vec![f16::from_f32(0.0); 8]; // Output array with only 8 elements
let b = f16::from_f32(2.0);
// Set parameters that will cause out-of-bounds access
let v_offset = 0;
let m = 16; // Will try to access 16 elements in c, but c only has 8!
// This will cause undefined behavior in the NEON implementation
// when it tries to access c[8] through c[15] which are out of bounds
vec_fma_f16_f16(&v, b, &mut c, v_offset, m);
}
another samilar issue is https://github.com/crabml/crabml/blob/db55e53884a4740e0cd68e208c0ec4b3892bc97b/crabml-core/src/cpu/buf/buf_q8_0.rs#L155 The issue is in vec_dot_q8_0_q8_0_neon where it processes blocks in pairs without checking if abs has enough elements:
for i in (0..blocks_rounded).step_by(2) {
let ab0 = abs.get_unchecked(i);
let ab1 = abs.get_unchecked(i + 1); // This can be out of bounds if abs.len() < bbs.len()
// ...
}
The issue is in vec_dot_q8_0_q8_0_neon where it processes blocks in pairs without checking that abs has enough elements. The NEON implementation calculates blocks_rounded based only on bbs.len() and then tries to access elements in abs using get_unchecked(), causing undefined behavior when abs.len() < bbs.len().
Also, a valid path to this private fn is : pub fn vec_dot -> fn vec_dot_q8_0_q8_0 -> fn vec_dot_q8_0_q8_0_neon
Poc
fn main() { // Create a QuantBufQ8_0 with 1 block let block_size = std::mem::size_of::<BlockQ8_0>(); let one_block_bytes = vec![0u8; block_size]; let one_block_vector = QuantBufQ8_0::from_bytes(&one_block_bytes);
// Create a QuantBufQ8_0 with 2 blocks
let two_blocks_bytes = vec![0u8; 2 * block_size];
let two_blocks_vector = QuantBufQ8_0::from_bytes(&two_blocks_bytes);
// Call vec_dot with carefully crafted parameters that will create slices of different lengths
// a_offset = 0, b_offset = 16, len = 48
let result = one_block_vector.vec_dot(0, &two_blocks_vector, 16, 48);
// This creates:
// abs = &one_block_vector.blocks[0..1] (length 1)
// bbs = &two_blocks_vector.blocks[0..2] (length 2)
// When passed to vec_dot_q8_0_q8_0_neon, it will try to access abs[1] which is out of bounds
}
thank you for the issue! would you mind demonstrate this fix in a pull request?
Ok, but I am busy working on my thesis recently and may not be able to create a pr fix soon, and in the meantime, if someone else is willing to take on the job, I have no objection. :)