OpenBLAS icon indicating copy to clipboard operation
OpenBLAS copied to clipboard

Incorrect matrix-matrix multiplication results on Mac

Open nvb opened this issue 4 months ago • 1 comments

Extracting from https://github.com/rust-ndarray/ndarray/issues/1521.

The following reproduction fails on Mac M4 (version 0.3.30) but passes on aarch64 Linux (version 0.3.21):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>

// Include BLAS headers - OpenBLAS or Accelerate
#ifdef __APPLE__
    #include <Accelerate/Accelerate.h>
#else
    #include <cblas.h>
#endif

int main() {
    // Test case from ndarray test_mat_mul
    // A is 45x33, B is 33x33 identity, C should be 45x33 (same as A)
    
    const int m = 45, n = 33, k = 33;
    const float alpha = 1.0f, beta = 0.0f;
    
    // Allocate matrices
    float *A = (float*)malloc(m * k * sizeof(float));
    float *B = (float*)malloc(k * n * sizeof(float));  
    float *C = (float*)malloc(m * n * sizeof(float));
    
    // Initialize A with linspace(0, 1484, 45*33) like ndarray test
    for (int i = 0; i < m * k; i++) {
        A[i] = (float)i;
    }
    
    // Initialize B as identity matrix 
    memset(B, 0, k * n * sizeof(float));
    for (int i = 0; i < k; i++) {
        B[i * n + i] = 1.0f;  // B[i][i] = 1
    }
    
    // Initialize C to zero
    memset(C, 0, m * n * sizeof(float));
    
    printf("Before SGEMM:\n");
    printf("A[0..2,0..2]:\n");
    for (int i = 0; i < 3; i++) {
        for (int j = 0; j < 3; j++) {
            printf("%.2f ", A[i * k + j]);
        }
        printf("\n");
    }
    
    printf("B[0..2,0..2]:\n");
    for (int i = 0; i < 3; i++) {
        for (int j = 0; j < 3; j++) {
            printf("%.2f ", B[i * n + j]);
        }
        printf("\n");
    }
    
    // Call OpenBLAS SGEMM
    // C = alpha * A * B + beta * C
    // Row-major layout: CblasRowMajor, no transpose
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                m, n, k,
                alpha,
                A, k,    // lda = k (leading dimension of A)
                B, n,    // ldb = n (leading dimension of B)
                beta,
                C, n);   // ldc = n (leading dimension of C)
    
    printf("\nAfter SGEMM:\n");
    printf("C[0..2,0..2] (should equal A[0..2,0..2]):\n");
    for (int i = 0; i < 3; i++) {
        for (int j = 0; j < 3; j++) {
            printf("%.2f ", C[i * n + j]);
        }
        printf("\n");
    }
    
    // Check if result is correct (C should equal A since A * I = A)
    int correct = 1;
    for (int i = 0; i < m * n; i++) {
        if (fabs(C[i] - A[i]) > 1e-5) {
            correct = 0;
            break;
        }
    }
    
    printf("\nResult: %s\n", correct ? "CORRECT" : "INCORRECT");
    
    // Print some diagnostics
    printf("\nDiagnostics:\n");
    printf("All C values zero? %s\n", C[0] == 0.0f && C[1] == 0.0f ? "YES" : "NO");
    
    free(A);
    free(B);
    free(C);
    
    return correct ? 0 : 1;
}

Compile with gcc -I/opt/homebrew/opt/openblas/include -L/opt/homebrew/opt/openblas/lib -lopenblas debug_openblas.c -o debug_openblas -Wno-deprecated

On Mac with OpenBlas, this returns:

After SGEMM:
C[0..2,0..2] (should equal A[0..2,0..2]):
0.00 0.00 0.00
0.00 0.00 0.00
0.00 0.00 0.00

On Linux with OpenBlas or on Mac with Accelerate, this returns:

After SGEMM:
C[0..2,0..2] (should equal A[0..2,0..2]):
0.00 1.00 2.00
33.00 34.00 35.00
66.00 67.00 68.00

nvb avatar Aug 26 '25 19:08 nvb

most likely a duplicate of #5414 (problems with the SME-based fast path for small-matrix SGEMM that was added in 0.3.30)

martin-frbg avatar Aug 26 '25 20:08 martin-frbg