
Reordering matrix rows leads to output differences


Hello,

while using OpenBLAS as a backend for PyTorch code, we found minor output differences when reordering the input samples within a batch. In our code, we used a two-layer neural network (two linear layers with a ReLU in between) to compute:

$$f(x) = W_1 * ReLU(W_0 * x + b_0) + b_1$$

where $W_0$ and $W_1$ are the first and second layers' weight matrices, $b_0$ and $b_1$ are the bias vectors, and $ReLU(x) = max(x, 0)$. In our case, we used the setup `x.shape = (batch_size, 784)`, `W_0.shape = (784, 128)`, `W_1.shape = (128, 10)`. The behavior only occurs for a `batch_size > 8`.

When calculating the output for an input batch $x$, we receive one output vector $y_i$ for each input row $x_i$. We found that when we reorder the input rows and repeat the calculation, a given input row does not always produce the same output vector as before. In other words, the position of a sample within the input batch influences its output. Although these differences are negligible in absolute terms, there should be no differences to begin with.
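We assume the differences are related to the non-associativity of floating-point addition: once the batch is large enough for the blocked/vectorized GEMM kernels to be used, reordering the rows presumably changes the order in which each row's dot products are accumulated. A minimal, standalone illustration of that underlying effect (independent of OpenBLAS and of the reproducer below):

```c
// non_associative.c -- standalone illustration, unrelated to OpenBLAS itself:
// summing the same floats in a different order can change the result.
#include <stdio.h>

int main(void) {
    float a = 1e8f, b = -1e8f, c = 1e-3f;
    float left  = (a + b) + c;   // (1e8 - 1e8) + 0.001 = 0.001
    float right = a + (b + c);   // b + c rounds back to -1e8, so the sum is 0
    printf("left=%.9g right=%.9g equal=%d\n", left, right, left == right);
    return 0;
}
```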

To reproduce the behavior, compile the program below against the latest OpenBLAS commit and run it:

```
gcc -o matrix_multiplication matrix_multiplication.c OpenBLAS/libopenblas.a -I.
./matrix_multiplication
```

After running the program, we would expect all output hash values to be identical (since all outputs should contain the same values, just in a different order). However, the printed hashes differ. The expected behavior can be restored by using only integers (`#define INT_ONLY`) or by setting `batch_size` to a value <= 8.

```c
// matrix_multiplication.c

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <OpenBLAS/cblas.h>
#include <stdint.h>
#include <string.h>

// #define INT_ONLY

// Function to initialize weights and biases
void init_weights_and_biases(float *weights, float *biases, int rows, int cols, int seed) {
    srand(seed);

    // Initialize weights
    for (int i = 0; i < rows * cols; i++) {
        #ifdef INT_ONLY
            weights[i] = ((rand() % 11) - 5); // Random integers between -5 and 5
        #else
            weights[i] = (float)rand() / RAND_MAX; // Random floats between 0 and 1
        #endif
    }

    // Initialize biases
    for (int i = 0; i < rows; i++) {
        #ifdef INT_ONLY
            biases[i] = 0;  // Bias is 0 in INT_ONLY mode
        #else
            biases[i] = (float)rand() / RAND_MAX * 0.01;  // Small random bias values
        #endif
    }
}

// Function to apply ReLU activation
void relu(float *matrix, int size) {
    for (int i = 0; i < size; i++) {
        if (matrix[i] < 0) {
            matrix[i] = 0;
        }
    }
}

// Comparison function for sorting floats
int compare_floats(const void *a, const void *b) {
    float float_a = *(const float *)a;
    float float_b = *(const float *)b;
    return (float_a > float_b) - (float_a < float_b);
}

// Improved unordered hash function
uint32_t unordered_hash(float *data, int size) {
    if (size == 0) return 0; // Handle empty input

    // Sort the input to ensure the order doesn't affect the hash
    qsort(data, size, sizeof(float), compare_floats);

    uint32_t hash = 2166136261u; // FNV-1a hash initialization
    uint32_t prime = 16777619; // FNV prime

    for (int i = 0; i < size; i++) {
        uint32_t bits;
        memcpy(&bits, &data[i], sizeof(float)); // Interpret float as bits
        
        // XOR the hash with the byte-represented float
        hash ^= bits;

        // Multiply by the prime
        hash *= prime;

        // Mix in some shifting
        hash ^= (hash >> 15);
        hash ^= (hash << 13);
        hash ^= (hash >> 10);
    }

    return hash;
}

uint32_t simple_hash(float *data, int size) {
    uint32_t hash = 2166136261u; // FNV-1a hash initialization
    uint32_t prime = 16777619; // FNV prime

    for (int i = 0; i < size; i++) {
        uint32_t bits;
        memcpy(&bits, &data[i], sizeof(float)); // Interpret float as bits

        // XOR the hash with the byte-represented float
        hash ^= bits;

        // Multiply by the prime
        hash *= prime;

        // Mix in some shifting
        hash ^= (hash >> 15);
        hash ^= (hash << 13);
        hash ^= (hash >> 10);
    }

    return hash;
}

// Modified function to evaluate model with biases and print hash values
uint32_t eval(float *input, float *layer0_weights, float *layer0_bias, float *layer1_weights, float *layer1_bias, int in_dim, int out_dim, int batch_size) {
    // Print the hash of the input
    uint32_t input_hash = simple_hash(input, batch_size * in_dim);

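    // Hashes of the weights and biases, computed here for reference/debugging; they are not printed below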
    uint32_t layer0_weights_hash = simple_hash(layer0_weights, in_dim * 128);
    uint32_t layer0_bias_hash = simple_hash(layer0_bias, 128);
    uint32_t layer1_weights_hash = simple_hash(layer1_weights, 128 * out_dim);
    uint32_t layer1_bias_hash = simple_hash(layer1_bias, out_dim);

    printf("Input: %u\n", input_hash);

    // Layer 0 output
    float *hidden = (float *)malloc(batch_size * 128 * sizeof(float));

    // Perform layer0: input * layer0_weights + layer0_bias
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, batch_size, 128, in_dim, 1.0, input, in_dim, layer0_weights, 128, 0.0, hidden, 128);

    // Add bias for layer 0
    for (int i = 0; i < batch_size; i++) {
        for (int j = 0; j < 128; j++) {
            hidden[i * 128 + j] += layer0_bias[j];
        }
    }

    // Apply ReLU activation
    relu(hidden, batch_size * 128);

    // Layer 1 output (final output)
    float *output = (float *)malloc(batch_size * out_dim * sizeof(float));

    // Perform layer1: hidden * layer1_weights + layer1_bias
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, batch_size, out_dim, 128, 1.0, hidden, 128, layer1_weights, out_dim, 0.0, output, out_dim);

    // Add bias for layer 1
    for (int i = 0; i < batch_size; i++) {
        for (int j = 0; j < out_dim; j++) {
            output[i * out_dim + j] += layer1_bias[j];
        }
    }

    // Compute hash of the final output
    uint32_t hash = unordered_hash(output, batch_size * out_dim);

    // Clean up
    free(hidden);
    free(output);

    return hash;
}

// Fisher-Yates shuffle function for shuffling a 2D matrix (flattened into 1D)
void fisher_yates_shuffle(float *matrix, int rows, int cols) {
    for (int i = rows - 1; i > 0; i--) {
        // Pick a random index from 0 to i
        int j = rand() % (i + 1);

        // Swap row i with row j
        for (int k = 0; k < cols; k++) {
            float temp = matrix[i * cols + k];
            matrix[i * cols + k] = matrix[j * cols + k];
            matrix[j * cols + k] = temp;
        }
    }
}

// Function to shuffle a copy of the input matrix and evaluate the model with biases
void eval_with_shuffled_inputs(float *input, float *layer0_weights, float *layer0_bias, float *layer1_weights, float *layer1_bias, int in_dim, int out_dim, int batch_size, int num_shuffles) {
    // Allocate memory for a copy of the input matrix
    float *input_copy = (float *)malloc(batch_size * in_dim * sizeof(float));

    // Loop to shuffle and evaluate the model multiple times
    for (int i = 0; i < num_shuffles; i++) {
        // Copy the original input matrix
        memcpy(input_copy, input, batch_size * in_dim * sizeof(float));

        // Shuffle the input copy using Fisher-Yates algorithm
        fisher_yates_shuffle(input_copy, batch_size, in_dim);

        // Evaluate the shuffled input with biases
        uint32_t shuffled_hash = eval(input_copy, layer0_weights, layer0_bias, layer1_weights, layer1_bias, in_dim, out_dim, batch_size);
        printf("Output (%d): %u\n\n", i + 1, shuffled_hash);
    }

    // Clean up
    free(input_copy);
}

int main() {
    int in_dim = 784;
    int out_dim = 10;
    int batch_size = 1024;
    int seed = 42;

    // Initialize random input
    float *input = (float *)malloc(batch_size * in_dim * sizeof(float));
    srand(seed);
    for (int i = 0; i < batch_size * in_dim; i++) {
        #ifdef INT_ONLY
            input[i] = rand() % 10;
        #else
            input[i] = (float)rand() / RAND_MAX;
        #endif
    }

    // Initialize weights and biases for both layers
    float *layer0_weights = (float *)malloc(in_dim * 128 * sizeof(float));
    float *layer0_bias = (float *)malloc(128 * sizeof(float));
    float *layer1_weights = (float *)malloc(128 * out_dim * sizeof(float));
    float *layer1_bias = (float *)malloc(out_dim * sizeof(float));

    init_weights_and_biases(layer0_weights, layer0_bias, 128, in_dim, seed);
    init_weights_and_biases(layer1_weights, layer1_bias, out_dim, 128, seed);

    // Evaluate the model with the original input
    uint32_t original_hash = eval(input, layer0_weights, layer0_bias, layer1_weights, layer1_bias, in_dim, out_dim, batch_size);
    printf("Output (original): %u\n\n", original_hash);

    // Evaluate the model with shuffled inputs
    eval_with_shuffled_inputs(input, layer0_weights, layer0_bias, layer1_weights, layer1_bias, in_dim, out_dim, batch_size, 10);

    // Clean up
    free(input);
    free(layer0_weights);
    free(layer0_bias);
    free(layer1_weights);
    free(layer1_bias);

    return 0;
}
```
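
As a further illustration of the per-row effect, one can also compare a single row computed inside the full batch against the same row computed as a batch of one. This is only a sketch (the helper `compare_row` is not part of the reproducer above; it assumes `input` and `weights` are laid out and filled the same way as there):

```c
// compare_row.c -- sketch of a per-row check, assuming the same data layout
// as in matrix_multiplication.c: input is (batch_size x in_dim), weights is
// (in_dim x 128), both row-major.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <OpenBLAS/cblas.h>

void compare_row(const float *input, const float *weights,
                 int batch_size, int in_dim, int row) {
    float *batched = malloc((size_t)batch_size * 128 * sizeof(float));
    float *single  = malloc(128 * sizeof(float));

    // Full batch: hidden = input * weights
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                batch_size, 128, in_dim,
                1.0f, input, in_dim, weights, 128, 0.0f, batched, 128);

    // The same row as a batch of one
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                1, 128, in_dim,
                1.0f, input + (size_t)row * in_dim, in_dim,
                weights, 128, 0.0f, single, 128);

    // Any non-zero difference means the row's result depends on the batch
    for (int j = 0; j < 128; j++) {
        float diff = fabsf(batched[(size_t)row * 128 + j] - single[j]);
        if (diff != 0.0f)
            printf("col %3d: batched=%.9g single=%.9g diff=%.3g\n",
                   j, batched[(size_t)row * 128 + j], single[j], diff);
    }

    free(batched);
    free(single);
}
```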

j-moeller · Oct 18 '24