
Threadgroup atomics require all-atomic operation

Open maleadt opened this issue 2 years ago • 3 comments

MWE:

using Metal

function local_kernel(a, expected::AbstractArray{T}, desired::T) where T
    i = thread_position_in_grid_1d()
    b = MtlThreadGroupArray(T, 16)

    #b[i] = a[i]
    Metal.atomic_store_explicit(pointer(b, i), Metal.atomic_load_explicit(pointer(a, i)))

    while Metal.atomic_compare_exchange_weak_explicit(pointer(b, i), expected[i], desired) != expected[i]
        # keep on trying
    end

    #a[i] = b[i]
    Metal.atomic_store_explicit(pointer(a, i), Metal.atomic_load_explicit(pointer(b, i)))

    return
end

function main(; T=Int32, n=16)
    a = Metal.zeros(T, n)
    expected = copy(a)
    desired = T(42)
    @metal threads=n local_kernel(a, expected, desired)
    Array(a)
end

Note how the loads and stores that initialize the threadgroup memory and copy it back to global memory need to be atomic for this example to work, even though every thread acts on its own dedicated memory address. Demoting those operations to regular array accesses results in the final array containing all zeros.
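For reference, this is roughly what that demoted variant looks like (the name local_kernel_demoted is just an illustrative label; everything else reuses the API from the MWE above). Only the compare-exchange stays atomic, and per the observation above this version leaves the array all zeros:

function local_kernel_demoted(a, expected::AbstractArray{T}, desired::T) where T
    i = thread_position_in_grid_1d()
    b = MtlThreadGroupArray(T, 16)

    b[i] = a[i]   # plain (non-atomic) initialization of threadgroup memory

    while Metal.atomic_compare_exchange_weak_explicit(pointer(b, i), expected[i], desired) != expected[i]
        # keep on trying
    end

    a[i] = b[i]   # plain (non-atomic) copy back to global memory

    return
end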

This smells like an upstream bug, especially because the problematic mix of atomic and non-atomic accesses is impossible to replicate in Metal C, where atomic_int is used as the element type and thus all operations on it are atomic:

#include <metal_stdlib>
using namespace metal;

kernel void local_kernel(device atomic_int* a [[ buffer(0) ]],
                         device int* expected [[ buffer(1) ]],
                         device int* desired [[ buffer(2) ]],
                         uint i [[ thread_position_in_grid ]]) {
    threadgroup atomic_int b[16];
    atomic_store_explicit(&b[i], atomic_load_explicit(&a[i], memory_order_relaxed), memory_order_relaxed);

    int expectedValue = expected[i];
    while (!atomic_compare_exchange_weak_explicit(&b[i], &expectedValue, desired[i], memory_order_relaxed, memory_order_relaxed)) {
        // keep on trying
    }
    atomic_store_explicit(&a[i], atomic_load_explicit(&b[i], memory_order_relaxed), memory_order_relaxed);
}

The corresponding host code:

#import <Metal/Metal.h>

int main() {
    id<MTLDevice> device = MTLCreateSystemDefaultDevice();

    NSError *error = nil;
    id<MTLLibrary> library = [device newLibraryWithFile:@"atomic_xchg.metallib" error:&error];
    id<MTLFunction> function = [library newFunctionWithName:@"local_kernel"];

    id<MTLCommandQueue> commandQueue = [device newCommandQueue];
    id<MTLCommandBuffer> commandBuffer = [commandQueue commandBuffer];

    id<MTLComputePipelineState> pipelineState = [device newComputePipelineStateWithFunction:function error:&error];

    int n = 16;
    id<MTLBuffer> a = [device newBufferWithLength:n*sizeof(int) options:MTLResourceOptionCPUCacheModeDefault];
    int expectedValues[n];
    for (int i = 0; i < n; i++) {
        expectedValues[i] = 0;   // a is zero-initialized, so the expected values are 0
    }
    id<MTLBuffer> expected = [device newBufferWithBytes:expectedValues length:n*sizeof(int) options:MTLResourceOptionCPUCacheModeDefault];
    int desiredValues[n];
    for (int i = 0; i < n; i++) {
        desiredValues[i] = 42;
    }
    id<MTLBuffer> desired = [device newBufferWithBytes:&desiredValues length:n*sizeof(int) options:MTLResourceOptionCPUCacheModeDefault];

    id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];
    [encoder setComputePipelineState:pipelineState];
    [encoder setBuffer:a offset:0 atIndex:0];
    [encoder setBuffer:expected offset:0 atIndex:1];
    [encoder setBuffer:desired offset:0 atIndex:2];

    MTLSize threadsPerGroup = MTLSizeMake(n, 1, 1);
    MTLSize numThreadgroups = MTLSizeMake(1, 1, 1);
    [encoder dispatchThreadgroups:numThreadgroups threadsPerThreadgroup:threadsPerGroup];
    [encoder endEncoding];

    [commandBuffer commit];
    [commandBuffer waitUntilCompleted];

    int *result = a.contents;
    for (int i = 0; i < n; i++) {
        NSLog(@"%d\n", result[i]);
    }

    return 0;
}

maleadt (Jun 26 '23, 10:06)

Hmm, this seems to apply to other atomics as well:

using Metal

function local_kernel(f, a, val::T) where T
    i = thread_position_in_grid_1d()
    b = MtlThreadGroupArray(T, 128)
    #b[i] = a[i]
    val = Metal.atomic_load_explicit(pointer(a, i))
    Metal.atomic_store_explicit(pointer(b, i), val)
    f(pointer(b, i), val)
    #a[i] = b[i]
    val = Metal.atomic_load_explicit(pointer(b, i))
    Metal.atomic_store_explicit(pointer(a, i), val)
    return
end

function main(; T=Int32, n=16)
    a = ones(T, n)
    b = MtlArray(a)
    val = one(T)
    @metal threads=n local_kernel(Metal.atomic_fetch_add_explicit, b, val)
    @show .+(a, val)
    @show Array(b)
    return
end

maleadt (Jun 26 '23, 10:06)

> Note how the loads and stores that initialize the threadgroup memory and copy it back to global memory need to be atomic for this example to work, even though every thread acts on its own dedicated memory address. Demoting those operations to regular array accesses results in the final array containing all zeros.

I think this is a general truth (and is why we don't have atomics for arrays yet).

If you mix atomic operations with non-atomic operations, you will get issues.

But I would have expected that the loads and stores to thread-local (threadgroup) memory, i.e. b[i], could get away with not being atomic, and that only the loads and stores to global memory would need to be.
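Concretely, the variant being described here (adapting the compare-exchange MWE from above; the name local_kernel_mixed is just illustrative and the sketch is untested) would look something like:

function local_kernel_mixed(a, expected::AbstractArray{T}, desired::T) where T
    i = thread_position_in_grid_1d()
    b = MtlThreadGroupArray(T, 16)

    b[i] = Metal.atomic_load_explicit(pointer(a, i))   # atomic global load, plain threadgroup store

    while Metal.atomic_compare_exchange_weak_explicit(pointer(b, i), expected[i], desired) != expected[i]
        # keep on trying
    end

    Metal.atomic_store_explicit(pointer(a, i), b[i])   # plain threadgroup load, atomic global store

    return
end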

vchuravy (Jun 26 '23, 12:06)

> If you mix atomic operations with non-atomic operations, you will get issues.

Why is that? Every thread is accessing its own memory locations, so why would mixing atomics with regular loads and stores not work? Note that removing atomics altogether works fine here.
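For comparison, a rough sketch of what "removing atomics altogether" looks like for the fetch-add example (local_kernel_plain is just an illustrative name):

function local_kernel_plain(a, val::T) where T
    i = thread_position_in_grid_1d()
    b = MtlThreadGroupArray(T, 128)
    b[i] = a[i]     # plain load from global memory, plain store to threadgroup memory
    b[i] += val     # plain in-place add instead of atomic_fetch_add_explicit
    a[i] = b[i]     # plain copy back to global memory
    return
end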

maleadt (Jun 26 '23, 12:06)