gemmini icon indicating copy to clipboard operation
gemmini copied to clipboard

Softmax implementation issue

Open LordScarface opened this issue 1 year ago • 2 comments

Hello, I just started with Gemmini and Chipyard, and now I am facing some issues with the Softmax, GELU and LayerNorm activation functions.

I got Gemmini running and I get the correct results for matrix multiplications and RELU activation, but Softmax causes a crash in the simulator and also on the FPGA implementation.

I built a baremetal app, following the gemmini-rocc-tests repo, that just does a matrix multiplication followed by a Softmax. This is the code:

#include <stdint.h>
#include <stddef.h>
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#ifndef BAREMETAL
#include <sys/mman.h>
#endif
#include "include/gemmini_testutils.h"

#define CHECK_RESULT 1

#define NO_BIAS 1
#define FULL_BIAS_WIDTH 1
#define BERT_SCALE 0.05

#if FULL_BIAS_WIDTH
typedef acc_t ACC_T;
#else
typedef elem_t ACC_T;
#endif

#define MAT_DIM_I 2
#define MAT_DIM_K 2
#define MAT_DIM_J 2

/*
 * Print a MAT_DIM_I x MAT_DIM_J matrix to stdout, one row per line, with a
 * blank line after the last row.
 *
 * Each element is followed by a single space. Elements are formatted with
 * "%f" when ELEM_T_IS_FLOAT is defined and with "%ld" otherwise.
 */
void full_printMatrix(elem_t m[MAT_DIM_I][MAT_DIM_J]) {
  for (size_t i = 0; i < MAT_DIM_I; ++i) {
    for (size_t j = 0; j < MAT_DIM_J; ++j) {
#ifdef ELEM_T_IS_FLOAT
      printf("%f ", (double)m[i][j]);
#else
      /*
       * "%ld" requires an argument of type long. If elem_t is a narrower
       * integer type, default argument promotion would pass an int instead,
       * which is a format/argument mismatch (undefined behaviour), so
       * convert explicitly.
       */
      printf("%ld ", (long)m[i][j]);
#endif
    }

    printf("\n");
  }
  printf("\n");
}

/*
 * Compare two MAT_DIM_I x MAT_DIM_J matrices element by element.
 *
 * Returns 1 when every pair of corresponding elements compares equal,
 * and 0 as soon as the first differing pair is found.
 */
int full_is_equal(elem_t x[MAT_DIM_I][MAT_DIM_J], elem_t y[MAT_DIM_I][MAT_DIM_J]) {
  size_t row = 0;

  while (row < MAT_DIM_I) {
    size_t col = 0;

    while (col < MAT_DIM_J) {
      if (x[row][col] != y[row][col]) {
        return 0;
      }
      ++col;
    }
    ++row;
  }

  return 1;
}

/*
 * Minimal reproduction: run one 2x2 matmul with the SOFTMAX activation on
 * Gemmini (WS dataflow) and print the result.
 *
 * The CPU reference runs, the plain (NO_ACTIVATION) Gemmini run, the cycle
 * counting and the extra matrix dumps are all commented out, so only
 * Softmax_GEMM is written and printed by the active code.
 *
 * NOTE(review): as reported later in this thread, the SOFTMAX call triggered
 * the load_controller assertion until the hardware was built with
 * has_normalizations=true.
 */
int main() {
#ifndef BAREMETAL
    // Non-baremetal builds only: lock current and future pages in memory,
    // and abort the test if that fails.
    if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) {
      perror("mlockall failed");
      exit(1);
    }
#endif

    gemmini_flush(0);

    // Inputs. full_B is the 2x2 identity matrix, so A @ B is expected to
    // equal full_A before the activation is applied.
    static elem_t full_A[MAT_DIM_I][MAT_DIM_K]  row_align(1) = {{-7 ,5}, {-120, 7}};
    static elem_t full_B[MAT_DIM_K][MAT_DIM_J] row_align(1) = {{1,0}, {0,1}};

    // Output buffers for the plain matmul runs; only referenced from the
    // commented-out code below.
    static elem_t A_at_B_CPU[MAT_DIM_I][MAT_DIM_J] row_align(1);
    static elem_t A_at_B_GEMM[MAT_DIM_I][MAT_DIM_J] row_align(1);

    static ACC_T full_D[MAT_DIM_I][MAT_DIM_J] row_align_acc(1) = {{0,0}, {0,0}}; // the bias

    // Output buffers for the softmax runs. Softmax_CPU is only referenced
    // from the commented-out code below.
    static elem_t Softmax_CPU[MAT_DIM_I][MAT_DIM_J];
    static elem_t Softmax_GEMM[MAT_DIM_I][MAT_DIM_J];
/*

    printf("Starting slow CPU matmul\n");
    unsigned long cpu_start = read_cycles();

    tiled_matmul_auto(MAT_DIM_I, MAT_DIM_J, MAT_DIM_K,
            (elem_t*)full_A, (elem_t*)full_B, NO_BIAS ? NULL : &full_D[0][0], (elem_t*)A_at_B_CPU,
            MAT_DIM_K, MAT_DIM_J, MAT_DIM_J, MAT_DIM_J,
            MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
            NO_ACTIVATION, ACC_SCALE_IDENTITY, BERT_SCALE, false,
            false, false,
            false, !FULL_BIAS_WIDTH,
            0,
            CPU);

    tiled_matmul_auto(MAT_DIM_I, MAT_DIM_J, MAT_DIM_K,
            (elem_t*)full_A, (elem_t*)full_B, NO_BIAS ? NULL : &full_D[0][0], (elem_t*)Softmax_CPU,
            MAT_DIM_K, MAT_DIM_J, MAT_DIM_J, MAT_DIM_J,
            MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
            SOFTMAX, ACC_SCALE_IDENTITY, BERT_SCALE, false,
            false, false,
            false, !FULL_BIAS_WIDTH,
            0,
            CPU);

    unsigned long cpu_end = read_cycles();
    printf("Cycles taken: %u\n", cpu_end-cpu_start);

    printf("Starting gemmini matmul\n");
    unsigned long start = read_cycles();

    tiled_matmul_auto(MAT_DIM_I, MAT_DIM_J, MAT_DIM_K,
            (elem_t*)full_A, (elem_t*)full_B, NO_BIAS ? NULL : &full_D[0][0], (elem_t*)A_at_B_GEMM,
            MAT_DIM_K, MAT_DIM_J, MAT_DIM_J, MAT_DIM_J,
            MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
            NO_ACTIVATION, ACC_SCALE_IDENTITY, BERT_SCALE, false,
            false, false,
            false, !FULL_BIAS_WIDTH,
            0,
            WS);
*/
    // The only active accelerator call: matmul with the SOFTMAX activation
    // using the WS dataflow, writing into Softmax_GEMM. NO_BIAS is defined
    // as 1 in this file, so the bias pointer passed here is NULL and full_D
    // is not used.
    tiled_matmul_auto(MAT_DIM_I, MAT_DIM_J, MAT_DIM_K,
            (elem_t*)full_A, (elem_t*)full_B, NO_BIAS ? NULL : &full_D[0][0], (elem_t*)Softmax_GEMM,
            MAT_DIM_K, MAT_DIM_J, MAT_DIM_J, MAT_DIM_J,
            MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
            SOFTMAX, ACC_SCALE_IDENTITY, BERT_SCALE, false,
            false, false,
            false, !FULL_BIAS_WIDTH,
            0,
            WS);

    //unsigned long end = read_cycles();
    //printf("Cycles taken: %u\n", end-start);
/*
    printf("A:\n");
    full_printMatrix(full_A);

    printf("B:\n");
    full_printMatrix(full_B);

    printf("A @ B (CPU):\n");
    full_printMatrix(A_at_B_CPU);

    printf("A @ B (Gemmini):\n");
    full_printMatrix(A_at_B_GEMM);

    printf("Softmax (CPU):\n");
    full_printMatrix(Softmax_CPU);
*/
    // Dump the accelerator's softmax output.
    printf("Softmax (Gemmini):\n");
    full_printMatrix(Softmax_GEMM);
    printf("\n");

  exit(0);
}

I then compiled verilator with the ./scripts/build-verilator.sh command and ran the test with ./scripts/run-verilator.sh $(which ./software/gemmini-rocc-tests/build/transformers/softmax_test-baremetal).

I also modified the xcustom.h to print all commands that are executed on Gemmini.

Here is the output:

This emulator compiled with JTAG Remote Bitbang client. To enable, use +jtag_rbb_enable=1.
Listening on port 45603
[UART] UART0 is here (stdin/stdout).
asdd  
Testing float print: 1.0: %f -- 123.456789: %f -- -12.765: %f -- -67894.0654561: %f -- 1.012345487: %f
GEMMINI 0RR CUSTOM_3, 0x3, 7, rs1=00000000, rs2=00000000 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00010004, rs2=00000000 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00000002, rs2=00000002 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00040101, rs2=00000002 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00040109, rs2=00000002 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00040111, rs2=00000008 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00010003, rs2=0000001b 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00050003, rs2=0000001b 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 9, rs1=00020002, rs2=00010001 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 10, rs1=80002740, rs2=8000273c 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 11, rs1=00000000, rs2=80002748 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 12, rs1=00000002, rs2=00000002 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 13, rs1=00000002, rs2=00000002 
   -> DONE 
GEMMINI 0RR CUSTOM_3, 0x3, 8, rs1=00000400, rs2=00000000 
[94000] %Error: chipyard.TestHarness.CustomGemminiSoCConfig.top.v:304744: Assertion failed in TOP.TestHarness.chiptop.system.tile_prci_domain.tile_reset_domain.tile.gemmini.load_controller
%Error: /home/lukas/Documents/Gemmini/chipyard/sims/verilator/generated-src/chipyard.TestHarness.CustomGemminiSoCConfig/chipyard.TestHarness.CustomGemminiSoCConfig.top.v:304744: Verilog $stop
Aborting...

So the last command fails, which is the following from gemmini.h :

// weight-stationary matmul loop
//
// Issues six ROCC_INSTRUCTION_RS1_RS2 commands on XCUSTOM_ACC, in order:
//   1. k_LOOP_WS_CONFIG_BOUNDS:     rs1 packs pad_K (bits 32+), pad_J (bits 16+)
//                                   and pad_I; rs2 packs K, J and I the same way.
//   2. k_LOOP_WS_CONFIG_ADDRS_AB:   rs1 = A, rs2 = B.
//   3. k_LOOP_WS_CONFIG_ADDRS_DC:   rs1 = D, rs2 = C.
//   4. k_LOOP_WS_CONFIG_STRIDES_AB: rs1 = A_stride, rs2 = B_stride.
//   5. k_LOOP_WS_CONFIG_STRIDES_DC: rs1 = D_stride, rs2 = C_stride.
//   6. k_LOOP_WS:                   rs1 packs act (bits 8+), low_D (bit 2),
//                                   full_C (bit 1) and ex_accumulate (bit 0);
//                                   rs2 packs B_transpose (bit 1) and
//                                   A_transpose (bit 0).
//
// NOTE(review): the "-> This fails :" text on the last command is the issue
// reporter's annotation marking the failing instruction; it is not part of
// the macro in gemmini.h.
#define gemmini_loop_ws(I, J, K, pad_I, pad_J, pad_K, A, B, D, C, A_stride, B_stride, D_stride, C_stride, A_transpose, B_transpose, full_C, low_D, ex_accumulate, act) \
  { \
    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(pad_K) << 32) | ((uint64_t)(pad_J) << 16) | (uint64_t)(pad_I), ((uint64_t)(K) << 32) | ((uint64_t)(J) << 16) | (uint64_t)(I), k_LOOP_WS_CONFIG_BOUNDS) \
    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, A, B, k_LOOP_WS_CONFIG_ADDRS_AB) \
    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, D, C, k_LOOP_WS_CONFIG_ADDRS_DC) \
    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, A_stride, B_stride, k_LOOP_WS_CONFIG_STRIDES_AB) \
    ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, D_stride, C_stride, k_LOOP_WS_CONFIG_STRIDES_DC) \
    -> This fails : ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(act) << 8) | ((low_D) << 2) | ((full_C) << 1) | (ex_accumulate), ((B_transpose) << 1) | (A_transpose), k_LOOP_WS) \
  }

The corresponding assertion that fails is at line 89 of DMACommandTracker.

If I just change the activation to be RELU it works as expected.

Here are the Versions I am using:

  • gemmini-rocc-tests @ ae0cd82
  • gemmini @ v0.7.0
  • chipyard @ 1.8.1

I hope someone can help me with this, Best Regards, Lukas

LordScarface avatar Jun 13 '23 14:06 LordScarface

Okay so I was missing the has_normalizations=true flag, it is working now. But I was wondering if it would be feasible to also add a floating point implementation of Softmax and GELU?

LordScarface avatar Jun 27 '23 10:06 LordScarface

I was wondering if it would be feasible to also add a floating point implementation of Softmax and GELU?

That would be a great feature to have for sure, but it's not planned for the near future, just due to a lack of manpower. Hopefully, we get someone who wants to start working on that, or an outside contributor makes a PR to add that feature.

For now, Gemmini's transformer support is targeted towards I-BERT, rather than floating-point BERT implementations

hngenc avatar Nov 30 '23 22:11 hngenc