wgpu-py Overallocation of workgroup buffer inside compute shader causes memory corruption

Describe the bug

If you create a compute shader with workgroup buffers that need more memory than the allowed limit then writing to one buffer will cause corruption in other buffers.

To Reproduce

Code attached at bottom.

Observed behaviour

If I write to shared buffer and then write to another shared buffer, the second shared buffor has results overlapping with the previous buffer in read time.

Screenshots

NA

Your environment

██ system:

         platform:  Windows-10-10.0.26100-SP0

python_implementation: CPython python: 3.10.16

██ versions:

wgpu: 0.22.2 cffi: 1.17.1 numpy: 2.2.6

██ wgpu_native_info:

expected_version: 24.0.3.1 lib_version: 24.0.3.1 lib_path: .\resources\wgpu_native-release.dll

██ object_counts:

                  count  resource_mem

        Adapter:      1
      BindGroup:      0
BindGroupLayout:      0
         Buffer:      0             0
  CanvasContext:      0
  CommandBuffer:      0
 CommandEncoder:      0

ComputePassEncoder: 0 ComputePipeline: 0 Device: 1 PipelineLayout: 0 QuerySet: 0 Queue: 1 RenderBundle: 0 RenderBundleEncoder: 0 RenderPassEncoder: 0 RenderPipeline: 0 Sampler: 0 ShaderModule: 0 Texture: 0 TextureView: 0

          total:      3             0

██ wgpu_native_counts:

              count  mem  hub   a  k  r  el_size

    Adapter:      1    8  hub:  1  1  0        8
  BindGroup:      0    0  hub:  0  0  1       16

BindGroupLayout: 0 0 hub: 0 0 1 16 Buffer: 0 0 hub: 0 0 7 16 CanvasContext: 0 0 0 0 0 8 CommandBuffer: 0 0 hub: 0 0 1 8 ComputePipeline: 0 0 hub: 0 0 1 16 Device: 1 8 hub: 1 1 0 8 PipelineCache: 0 0 hub: 0 0 0 16 PipelineLayout: 0 0 hub: 0 0 1 16 QuerySet: 0 0 hub: 0 0 0 16 Queue: 1 8 hub: 1 1 0 8 RenderBundle: 0 0 hub: 0 0 0 16 RenderPipeline: 0 0 hub: 0 0 0 16 Sampler: 0 0 hub: 0 0 0 16 ShaderModule: 2 32 hub: 2 2 1 16 Texture: 0 0 hub: 0 0 0 16 TextureView: 0 0 hub: 0 0 0 16

      total:      5   56

* The a, k, r are allocated, kept, and released, respectively.
* Reported memory does not include buffer/texture data.

Steps to reproduce

const RANDOM_FLOAT_ARRAY_LENGTH:u32 = 256;
const OBJECTIVE_FUNCTION_OUTPUT_LENGTH:u32 = 63;
const OBJECTIVE_FUNCTION_INPUT_LENGTH:u32 = 195;
const SVD_EPSILON:f32 = 1e-10;
const WORKGROUP_SIZE_X:u32 = 8;
const WORKGROUP_SIZE_Y :u32 = 8;

struct State{
    random_vector_norm:f32
}
const WORKGROUP_LENGTH = WORKGROUP_SIZE_X * WORKGROUP_SIZE_Y;
const OBJECTIVE_FUNCTION_JACOBIAN_SIZE = OBJECTIVE_FUNCTION_OUTPUT_LENGTH * OBJECTIVE_FUNCTION_INPUT_LENGTH;

@group(0) @binding(1)
var<storage, read> jacobian: array<f32>;


@group(0) @binding(2)
var<storage, read_write> U: array<f32>;


var<workgroup> workgroup_shared_obj_out_len_buf: array<f32, OBJECTIVE_FUNCTION_OUTPUT_LENGTH>;
var<workgroup> workgroup_shared_obj_inp_len_buf: array<f32, OBJECTIVE_FUNCTION_INPUT_LENGTH>;
var<workgroup> jacobian_workgroup_copy: array<
    f32, 
    OBJECTIVE_FUNCTION_INPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH
>;
var<workgroup> decomposed_jacobian: array<
    f32, 
    OBJECTIVE_FUNCTION_INPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH
>;
var<workgroup> one_dim_svd_b: array<
    f32, 
    OBJECTIVE_FUNCTION_OUTPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH
>;
var<workgroup> current_u_buffer: array<f32, OBJECTIVE_FUNCTION_OUTPUT_LENGTH>;
var<workgroup> current_v_buffer: array<f32, OBJECTIVE_FUNCTION_INPUT_LENGTH>;
var<workgroup> state: State;

fn row_major_access_jac(r: u32, c:u32)->u32{
    return r * OBJECTIVE_FUNCTION_INPUT_LENGTH + c;
}

fn col_major_access_jac(r: u32, c:u32)->u32{
    return c * OBJECTIVE_FUNCTION_OUTPUT_LENGTH + r;
}

fn sweep_calculator(length: u32, workgroup_size: u32) -> u32 {
    return (length + workgroup_size - 1u) / workgroup_size; // ceiling division
}


fn copy_jacobian_into_local_buffers(
    local_invocation_index: u32,
){
    let total_elements = OBJECTIVE_FUNCTION_JACOBIAN_SIZE;
    let workgroup_size = WORKGROUP_LENGTH; // e.g., 64
    var index = local_invocation_index;
    while (index < total_elements) {
        jacobian_workgroup_copy[index] = jacobian[index];
        decomposed_jacobian[index] = jacobian[index];
        index += workgroup_size;
    }
    workgroupBarrier();
}


fn fill_u_buffer_and_shared_output_buffer_with_random_normal_vector(
    local_invocation_id : vec3<u32>,
    global_invocation_id : vec3<u32>,
    local_invocation_index: u32,
){
    let sweeps = sweep_calculator(OBJECTIVE_FUNCTION_OUTPUT_LENGTH, WORKGROUP_LENGTH);
    for(var iter = 0u ; iter < sweeps ; iter++){
        let index = WORKGROUP_LENGTH * iter + local_invocation_index;
        if(index < OBJECTIVE_FUNCTION_OUTPUT_LENGTH){
            workgroup_shared_obj_out_len_buf[index] = 9.9;
            current_u_buffer[index] = 9.9;
        }
    }
    workgroupBarrier();

}

fn fill_one_dim_svd_b_buffer(
    local_invocation_id : vec3<u32>,
    local_invocation_index: u32,
){
    let sweeps = sweep_calculator(OBJECTIVE_FUNCTION_OUTPUT_LENGTH* OBJECTIVE_FUNCTION_OUTPUT_LENGTH, WORKGROUP_LENGTH);
    for(var iter = 0u ; iter < sweeps ; iter++){
        let index = iter * WORKGROUP_LENGTH + local_invocation_index;
        if(index >= OBJECTIVE_FUNCTION_OUTPUT_LENGTH * OBJECTIVE_FUNCTION_OUTPUT_LENGTH){continue;}
        let row = index / OBJECTIVE_FUNCTION_OUTPUT_LENGTH; 
        let col = index % OBJECTIVE_FUNCTION_OUTPUT_LENGTH; 
        var b_val = 0.0;
        for(var z_index:u32 = 0u ; z_index < OBJECTIVE_FUNCTION_INPUT_LENGTH ; z_index++ ){
            let left_matrix_index = row_major_access_jac(row, z_index);
            let right_matrix_index = row_major_access_jac(col, z_index);
            b_val += decomposed_jacobian[left_matrix_index] * decomposed_jacobian[right_matrix_index]; 
        }
        one_dim_svd_b[index] = b_val;
    }
    workgroupBarrier();
}

fn populate_u_with_next_singular_vector(
    local_invocation_id : vec3<u32>,
    global_invocation_id : vec3<u32>,
    local_invocation_index: u32,
){
    fill_u_buffer_and_shared_output_buffer_with_random_normal_vector(local_invocation_id, global_invocation_id, local_invocation_index);
    workgroupBarrier();
    fill_one_dim_svd_b_buffer(local_invocation_id, local_invocation_index);
    workgroupBarrier();
    let sweeps = sweep_calculator(OBJECTIVE_FUNCTION_OUTPUT_LENGTH*OBJECTIVE_FUNCTION_OUTPUT_LENGTH, WORKGROUP_LENGTH);
    for(var iter = 0u ; iter < sweeps ; iter++){
        let index = WORKGROUP_LENGTH * iter + local_invocation_index;
        if(index < OBJECTIVE_FUNCTION_OUTPUT_LENGTH*OBJECTIVE_FUNCTION_OUTPUT_LENGTH){
            U[index] = one_dim_svd_b[index];
        }
    }
    workgroupBarrier();
}

@compute @workgroup_size(WORKGROUP_SIZE_X, WORKGROUP_SIZE_Y)
fn main(
    @builtin(local_invocation_id) local_invocation_id : vec3<u32>,
    @builtin(global_invocation_id) global_invocation_id : vec3<u32>,
    @builtin(local_invocation_index) local_invocation_index: u32,
) {
    copy_jacobian_into_local_buffers(local_invocation_index);
    populate_u_with_next_singular_vector(local_invocation_id, global_invocation_id, local_invocation_index);
}

In the above code, if you comment out the write to workgroup_shared_obj_out_len_buf or remove redundant workgroup shared buffers, the code works and one_dim_svd_b and U will have the intended values. Otherwise one_dim_svd_b will get corrupted and U will have incorrect values. Ideally the compiler should error on compile time if configures the shader to use more shared memory than available.

Jul 05 '25 17:07 professorcode1

is this with Dx12 or Vulkan? I know that Dx12 has some issues with compute shader sin the current and next version of wgpu.

Perhaps you can give the wgpu25 branch a try as that includes upstream bugfixes. If you can also get this problem in wgpu-native it's not a problem with the bindings and perhaps validation along the way.

Jul 05 '25 18:07 Vipitis

Thanks for reporting this. Do you perhaps also have the Python code to run the shader with, to make it a truely reproducable example?

Jul 07 '25 08:07 almarklein