burn icon indicating copy to clipboard operation
burn copied to clipboard

WGPU error: Shader '' parsing error: Incompatible operands: LogicalAnd(vec4<bool>, _)

Open antimora opened this issue 3 months ago • 13 comments

I am trying to enable the wgpu onnx-tests; however, I am getting this strange error. All tests pass for ndarray and tch.


// Backend selection: `Backend` is aliased to one concrete burn backend
// depending on which cargo feature is enabled.
// NOTE(review): the wgpu and tch predicates below are not mutually exclusive,
// so enabling both `backend-wgpu` and `backend-tch` defines `Backend` twice;
// with no backend feature enabled, `Backend` is not defined at all.

// wgpu backend, selected whenever `backend-wgpu` is enabled.
#[cfg(feature = "backend-wgpu")]
pub type Backend = burn::backend::Wgpu;

// ndarray backend (f32), selected only when `backend-ndarray` is enabled and
// neither `backend-wgpu` nor `backend-tch` is.
#[cfg(all(
    feature = "backend-ndarray",
    not(feature = "backend-wgpu"),
    not(feature = "backend-tch")
))]
pub type Backend = burn::backend::NdArray<f32>;

// LibTorch backend (f32), selected whenever `backend-tch` is enabled.
#[cfg(feature = "backend-tch")]
pub type Backend = burn::backend::LibTorch<f32>;

// Import the shared macro
use crate::include_models;
// Makes the `and` module available here; the tests below use `and::Model`.
// NOTE(review): presumably this pulls in the model code generated from the
// `and` ONNX file — confirm against the `include_models!` definition.
include_models!(and);

#[cfg(test)]
mod tests {
    use super::*;
    use burn::tensor::{Bool, Tensor, TensorData};

    use crate::backend::Backend;

    /// Feeds the `and` model the four combinations of boolean operands and
    /// checks the output against the logical-AND truth table.
    #[test]
    fn and() {
        let device = Default::default();
        let model: and::Model<Backend> = and::Model::new(&device);

        // Operands laid out as 1x1x1x4 boolean tensors covering
        // (F,F), (F,T), (T,F) and (T,T).
        let lhs_data = TensorData::from([[[[false, false, true, true]]]]);
        let rhs_data = TensorData::from([[[[false, true, false, true]]]]);
        let lhs = Tensor::<Backend, 4, Bool>::from_bool(lhs_data, &device);
        let rhs = Tensor::<Backend, 4, Bool>::from_bool(rhs_data, &device);

        // Only the (T,T) position is expected to be true.
        let expected = TensorData::from([[[[false, false, false, true]]]]);

        // The trailing `true` is forwarded unchanged from the original test
        // (presumably the strict-comparison flag — confirm against
        // `TensorData::assert_eq`).
        model.forward(lhs, rhs).to_data().assert_eq(&expected, true);
    }
}

Image

The errors:

     Running tests/test_mod.rs (/Users/dilshod/Projects/burn/target/debug/deps/test_mod-9ce59c654818100c)

running 242 tests
test abs::tests::abs ... ok
test argmin::tests::argmin_1d ... ok

thread 'and::tests::and' panicked at /Users/dilshod/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/wgpu-25.0.2/src/backend/wgpu_core.rs:1051:30:
wgpu error: Validation Error

Caused by:
  In Device::create_shader_module

Shader '' parsing error: Incompatible operands: LogicalAnd(vec4<bool>, _)


      Incompatible operands: LogicalAnd(vec4<bool>, _)



thread 'and::tests::and' panicked at /Users/dilshod/Projects/burn/crates/burn-fusion/src/stream/execution/ordering.rs:67:38:
index out of bounds: the len is 0 but the index is 0
stack backtrace:
   0:        0x10660c504 - <std::sys::backtrace::BacktraceLock::print::DisplayBacktrace as core::fmt::Display>::fmt::h373e57e2286956dc
   1:        0x106629b8c - core::fmt::write::h2c4a0b98b09e3b30
   2:        0x1066097e4 - std::io::Write::write_fmt::h619de9749845ad1b
   3:        0x10660c3b8 - std::sys::backtrace::BacktraceLock::print::h3eb1535b8d3666ca
   4:        0x10660dbf4 - std::panicking::default_hook::{{closure}}::hf623c44b740b115f
   5:        0x10660da08 - std::panicking::default_hook::h8875fb31ec87dfad
   6:        0x105972300 - test::test_main_with_exit_callback::{{closure}}::h9fc2c419030d8caf
   7:        0x10660e6b8 - std::panicking::rust_panic_with_hook::hdd8ceeeb04975c2b
   8:        0x10660e2b0 - std::panicking::begin_panic_handler::{{closure}}::hdf417b72ab8ffff8
   9:        0x10660c9b0 - std::sys::backtrace::__rust_end_short_backtrace::h507d79c50996742e
  10:        0x10660df8c - __rustc[5224e6b81cd82a8f]::rust_begin_unwind
  11:        0x1066ac40c - core::panicking::panic_fmt::h3505bfbec5a0b799
  12:        0x1066ac58c - core::panicking::panic_bounds_check::he97faa96f3a18922
  13:        0x104ec10e8 - <usize as core::slice::index::SliceIndex<[T]>>::index::h08a01d4dda84def2
  14:        0x104efeaf4 - <alloc::vec::Vec<T,A> as core::ops::index::Index<I>>::index::h8133231da83a569a
  15:        0x1047eb0c0 - burn_fusion::stream::execution::ordering::OrderedExecution<R>::execute_operations::h03e4085842520b45
  16:        0x10487753c - burn_fusion::stream::queue::execution::QueueExecution<R>::execute_strategy::hafe9fbeedeb0a88d
  17:        0x104877828 - burn_fusion::stream::queue::execution::QueueExecution<R>::run::h53d03db85150cb56
  18:        0x104e522dc - burn_fusion::stream::queue::execution::<impl burn_fusion::stream::queue::base::OperationQueue<R>>::execute_block_optimization::h3e645412ef6ba306
  19:        0x104e523f4 - burn_fusion::stream::queue::execution::<impl burn_fusion::stream::queue::base::OperationQueue<R>>::execute::hae34ad08a946a6a0
  20:        0x105436088 - <burn_fusion::stream::multi::Segment<R> as burn_fusion::stream::execution::processor::StreamSegment<<R as burn_fusion::backend::FusionRuntime>::Optimization>>::execute::ha41c4290388252df
  21:        0x105328478 - burn_fusion::stream::execution::processor::Processor<O>::explore::h4db7ac7fd82d56d4
  22:        0x105328650 - burn_fusion::stream::execution::processor::Processor<O>::process::hed24618643f619cd
  23:        0x10542e3dc - burn_fusion::stream::multi::MultiStream<R>::enqueue_operation::h45c2d082060551ff
  24:        0x10542f850 - burn_fusion::stream::multi::MultiStream<R>::register::hd9421df4a66a5641
  25:        0x104b888bc - burn_fusion::server::FusionServer<R>::register::h1231600d2c3674b5
  26:        0x104d11728 - <burn_fusion::client::mutex::MutexFusionClient<R> as burn_fusion::client::base::FusionClient<R>>::register::hf0793f7b23591407
  27:        0x105287c84 - <burn_fusion::tensor::FusionTensor<R> as core::ops::drop::Drop>::drop::h1406b9c98e3487ba
  28:        0x10524ca2c - core::ptr::drop_in_place<burn_fusion::tensor::FusionTensor<burn_cubecl::fusion::FusionCubeRuntime<cubecl_wgpu::runtime::WgpuRuntime,u32>>>::h5db23c89770f4f66
  29:        0x1053d3d68 - burn_fusion::ops::boolean::<impl burn_tensor::tensor::ops::bool_tensor::BoolTensorOps<burn_fusion::backend::Fusion<B>> for burn_fusion::backend::Fusion<B>>::bool_and::h6ddce8d278aefd95
  30:        0x104d5c838 - burn_tensor::tensor::api::bool::<impl burn_tensor::tensor::api::base::Tensor<B,_,burn_tensor::tensor::api::kind::Bool>>::bool_and::h95a11262423f67a5
  31:        0x104e24b90 - test_mod::and::and::Model<B>::forward::h63c4cc2e490584db
  32:        0x105175664 - test_mod::and::tests::and::h9a16a7f7a4633156
  33:        0x104f56920 - test_mod::and::tests::and::{{closure}}::hdd84aa58ae1c80f7
  34:        0x10523cde8 - core::ops::function::FnOnce::call_once::hd53d4d969a716269
  35:        0x10597673c - test::__rust_begin_short_backtrace::ha9b990352ecdf496
  36:        0x1059759fc - test::run_test::{{closure}}::h2eb38d5b94f96a1f
  37:        0x105945c9c - std::sys::backtrace::__rust_begin_short_backtrace::h0ed48394d7c0c8df
  38:        0x105948d60 - core::ops::function::FnOnce::call_once{{vtable.shim}}::hf9d9c328caaab3b1
  39:        0x106610ca8 - std::sys::pal::unix::thread::Thread::new::thread_start::h447d747a543e4adc
  40:        0x1915ebc0c - __pthread_cond_wait

thread 'and::tests::and' panicked at library/core/src/panicking.rs:233:5:
panic in a destructor during cleanup
thread caused non-unwinding panic. aborting.
error: test failed, to rerun pass `--test test_mod`

Caused by:
  process didn't exit successfully: `/Users/dilshod/Projects/burn/target/debug/deps/test_mod-9ce59c654818100c` (signal: 6, SIGABRT: process abort signal)
     Running tests/test_record_type.rs (/Users/dilshod/Projects/burn/target/debug/deps/test_record_type-bdfb707a99fec346)

running 10 tests

antimora avatar Aug 19 '25 20:08 antimora

CC @wingertge @nathanielsimard

antimora avatar Aug 19 '25 20:08 antimora

After this PR merge: https://github.com/tracel-ai/burn/pull/3584

you can run:

[onnx-tests]% cargo test --features backend-wgpu

antimora avatar Aug 19 '25 21:08 antimora

I can't seem to reproduce this. The shader looks correct and seems to run, but then it crashes with a STATUS_STACK_BUFFER_OVERRUN.

wingertge avatar Aug 20 '25 14:08 wingertge

I got this error on Mac. Here are the details:

[mnist]% uname -a Darwin Mac.attlocal.net 24.6.0 Darwin Kernel Version 24.6.0: Mon Jul 14 11:30:55 PDT 2025; root:xnu-11417.140.69~1/RELEASE_ARM64_T6031 arm64

It's on Apple M3 Max

antimora avatar Aug 20 '25 14:08 antimora

The STATUS_STACK_BUFFER_OVERRUN still seems to be triggered by the backend, because it's not happening with ndarray. So there might be a second bug in there and the wgpu compile error is hiding the secondary issue, or it's the other way around and that's why I can't reproduce the wgpu issue.

wingertge avatar Aug 20 '25 14:08 wingertge

@wingertge I got the logs

CUBECL_DEBUG_LOG=/Users/dilshod/Projects/burn/crates/burn-import/out/debug.log cargo test --features backend-wgpu

debug.log

antimora avatar Aug 20 '25 16:08 antimora

That's odd; it kind of looks like a bug in the WGSL compiler that causes type inference to fail. The message `Shader '' parsing error: Incompatible operands: LogicalAnd(vec4<bool>, _)` makes it seem like it can't infer the type of the right-hand side, but both operands are explicitly defined as `vec4<bool>`, and obviously logical AND is supported for two boolean vectors.

let l_6 = vec4<bool>(l_5);
let l_7 = vec4<bool>(l_4);
let l_8 = l_7 && l_6;

So I'm not sure why it might fail specifically on mac, or if the buffer overrun is actually hiding the error on my end.

wingertge avatar Aug 20 '25 17:08 wingertge

@louisfd, since you have a Mac, can you check whether this is failing for you too?

antimora avatar Aug 20 '25 22:08 antimora

I'm not sure if vec4 && vec4 is valid WGSL anyway; I remember seeing a bug in wgpu about incorrectly accepting it (though I can't find it anymore). Logical and is fine though.

https://compute.fornwall.net/#source=fn%20compute()%20-%3E%20f32%20%7B%0A%20%20let%20l_6%20%3D%20vec4%3Cbool%3E(true)%3B%0A%20%20let%20l_7%20%3D%20vec4%3Cbool%3E(true)%3B%0A%20%20let%20l_8%20%3D%20l_7%20%26%26%20l_6%3B%0A%20%20return%200.0%3B%0A%7D

ArthurBrussee avatar Aug 21 '25 19:08 ArthurBrussee

I'm not sure if vec4 && vec4 is valid WGSL anyway; I remember seeing a bug in wgpu about incorrectly accepting it (though I can't find it anymore). Logical and is fine though.

https://compute.fornwall.net/#source=fn%20compute()%20-%3E%20f32%20%7B%0A%20%20let%20l_6%20%3D%20vec4%3Cbool%3E(true)%3B%0A%20%20let%20l_7%20%3D%20vec4%3Cbool%3E(true)%3B%0A%20%20let%20l_8%20%3D%20l_7%20%26%26%20l_6%3B%0A%20%20return%200.0%3B%0A%7D

If you have a Mac, would you mind running the following?

cd crates/burn-import/onnx-tests
cargo test --features test-wgpu

antimora avatar Aug 21 '25 19:08 antimora

Running on a mac at main gives:

running 242 tests
test add::tests::add_shape_with_scalar_and_shape ... ok
test argmin::tests::argmin_1d ... ok
test argmax::tests::argmax_1d ... ok
test argmin::tests::argmin ... FAILED
test add::tests::add_scalar_to_int_tensor_and_int_tensor_to_int_tensor ... FAILED
test argmax::tests::argmax ... FAILED

thread 'and::tests::and' panicked at /Users/arthurkb/.cargo/registry/src/index.crates.io-1949cf8c6b5b557f/wgpu-25.0.2/src/backend/wgpu_core.rs:1051:30:
wgpu error: Validation Error

Caused by:
  In Device::create_shader_module

Shader '' parsing error: Incompatible operands: LogicalAnd(vec4<bool>, _)


      Incompatible operands: LogicalAnd(vec4<bool>, _)



thread 'and::tests::and' panicked at /Users/arthurkb/Documents/Projects/burn/crates/burn-fusion/src/stream/execution/ordering.rs:67:38:
index out of bounds: the len is 0 but the index is 0

I have also seen STATUS_STACK_BUFFER_OVERRUN from wgpu on windows when there are wgsl compile errors.

ArthurBrussee avatar Aug 21 '25 20:08 ArthurBrussee

CCing @nathanielsimard and @laggui: I am also getting this error when evaluating Yolo11x, while it passes with the torch backend.

antimora avatar Aug 21 '25 23:08 antimora

CCing @nathanielsimard and @laggui: I am also getting this error when evaluating Yolo11x, while it passes with the torch backend.

There shouldn't be any shader error when using torch 😅 maybe the wrong backend is being configured?

laggui avatar Aug 22 '25 11:08 laggui