gemmini
gemmini copied to clipboard
Softmax implementation issue
Hello, I just started with Gemmini and Chipyard, and now I am facing some issues with the Softmax, GELU and LayerNorm activation functions.
I got Gemmini running and I get the correct results for matrix multiplications and RELU activation, but Softmax causes a crash in the simulator and also on the FPGA implementation.
I built a baremetal app according to the gemmini-rocc-tests repo that just does a matrix multiplication and then a Softmax, this is the code:
#include <stdint.h>
#include <stddef.h>
#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#ifndef BAREMETAL
#include <sys/mman.h>
#endif
#include "include/gemmini_testutils.h"
// Test configuration switches.
// CHECK_RESULT is not referenced anywhere in the visible part of this program.
#define CHECK_RESULT 1
// When non-zero, NULL is passed to tiled_matmul_auto in place of the bias matrix pointer.
#define NO_BIAS 1
// Selects the element type of the bias matrix (see ACC_T below); its negation
// is also forwarded as one of the boolean arguments of tiled_matmul_auto.
#define FULL_BIAS_WIDTH 1
// Passed to tiled_matmul_auto directly after ACC_SCALE_IDENTITY.
// NOTE(review): presumably the scale factor used by the integer BERT activations - confirm against gemmini.h.
#define BERT_SCALE 0.05
// Bias element type: acc_t when FULL_BIAS_WIDTH is non-zero, elem_t otherwise.
#if FULL_BIAS_WIDTH
typedef acc_t ACC_T;
#else
typedef elem_t ACC_T;
#endif
// Matrix dimensions: A is MAT_DIM_I x MAT_DIM_K, B is MAT_DIM_K x MAT_DIM_J,
// and the result/bias matrices are MAT_DIM_I x MAT_DIM_J.
#define MAT_DIM_I 2
#define MAT_DIM_K 2
#define MAT_DIM_J 2
/*
 * Print a MAT_DIM_I x MAT_DIM_J matrix to stdout.
 *
 * Each element is written followed by a single space, each row ends with a
 * newline, and one extra blank line is emitted after the last row.
 *
 * m: the matrix to print.
 *
 * When ELEM_T_IS_FLOAT is defined, elements are converted to double and
 * printed with "%f"; otherwise they are printed as signed decimal integers.
 */
void full_printMatrix(elem_t m[MAT_DIM_I][MAT_DIM_J]) {
    for (size_t i = 0; i < MAT_DIM_I; ++i) {
        for (size_t j = 0; j < MAT_DIM_J; ++j) {
#ifdef ELEM_T_IS_FLOAT
            printf("%f ", (double)m[i][j]);
#else
            /*
             * "%ld" requires an argument of type long. An integer elem_t that
             * is narrower than long is only promoted to int by the variadic
             * call, which would be a format/argument mismatch (undefined
             * behaviour), so convert explicitly.
             */
            printf("%ld ", (long)m[i][j]);
#endif
        }
        printf("\n");
    }
    printf("\n");
}
/*
 * Element-wise comparison of two MAT_DIM_I x MAT_DIM_J matrices.
 *
 * Returns 1 when every pair of corresponding elements compares equal,
 * and 0 as soon as the first differing pair is found.
 */
int full_is_equal(elem_t x[MAT_DIM_I][MAT_DIM_J], elem_t y[MAT_DIM_I][MAT_DIM_J]) {
    size_t row = 0;
    while (row < MAT_DIM_I) {
        size_t col = 0;
        while (col < MAT_DIM_J) {
            /* Stop at the first mismatch; nothing after it can change the answer. */
            if (!(x[row][col] == y[row][col])) {
                return 0;
            }
            ++col;
        }
        ++row;
    }
    return 1;
}
/*
 * Test entry point.
 *
 * Runs a single tiled_matmul_auto call on 2x2 inputs with the SOFTMAX
 * activation argument and WS as the final argument, prints the resulting
 * matrix and exits with status 0.
 *
 * The CPU reference computation, the plain (NO_ACTIVATION) matmul, the cycle
 * measurements and the remaining printouts are all commented out, so the
 * output is only printed and never compared against a reference.
 */
int main() {
#ifndef BAREMETAL
// Hosted (non-BAREMETAL) builds only: lock all current and future pages into
// memory; on failure report the error and exit with status 1.
if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0) {
perror("mlockall failed");
exit(1);
}
#endif
// Issued before any other accelerator call.
// NOTE(review): presumably resets/flushes accelerator state - confirm against gemmini.h.
gemmini_flush(0);
// Input operands. full_B is the 2x2 identity matrix, so A @ B equals A.
static elem_t full_A[MAT_DIM_I][MAT_DIM_K] row_align(1) = {{-7 ,5}, {-120, 7}};
static elem_t full_B[MAT_DIM_K][MAT_DIM_J] row_align(1) = {{1,0}, {0,1}};
// Only referenced by the commented-out code below.
static elem_t A_at_B_CPU[MAT_DIM_I][MAT_DIM_J] row_align(1);
static elem_t A_at_B_GEMM[MAT_DIM_I][MAT_DIM_J] row_align(1);
// All-zero bias; with NO_BIAS set to 1, NULL is passed instead of this matrix.
static ACC_T full_D[MAT_DIM_I][MAT_DIM_J] row_align_acc(1) = {{0,0}, {0,0}}; // the bias
// Softmax_CPU is only referenced by the commented-out code below;
// Softmax_GEMM receives the output of the active tiled_matmul_auto call.
static elem_t Softmax_CPU[MAT_DIM_I][MAT_DIM_J];
static elem_t Softmax_GEMM[MAT_DIM_I][MAT_DIM_J];
// Disabled: CPU reference runs (NO_ACTIVATION and SOFTMAX) with cycle
// counting, followed by the NO_ACTIVATION matmul with WS as the final argument.
/*
printf("Starting slow CPU matmul\n");
unsigned long cpu_start = read_cycles();
tiled_matmul_auto(MAT_DIM_I, MAT_DIM_J, MAT_DIM_K,
(elem_t*)full_A, (elem_t*)full_B, NO_BIAS ? NULL : &full_D[0][0], (elem_t*)A_at_B_CPU,
MAT_DIM_K, MAT_DIM_J, MAT_DIM_J, MAT_DIM_J,
MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
NO_ACTIVATION, ACC_SCALE_IDENTITY, BERT_SCALE, false,
false, false,
false, !FULL_BIAS_WIDTH,
0,
CPU);
tiled_matmul_auto(MAT_DIM_I, MAT_DIM_J, MAT_DIM_K,
(elem_t*)full_A, (elem_t*)full_B, NO_BIAS ? NULL : &full_D[0][0], (elem_t*)Softmax_CPU,
MAT_DIM_K, MAT_DIM_J, MAT_DIM_J, MAT_DIM_J,
MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
SOFTMAX, ACC_SCALE_IDENTITY, BERT_SCALE, false,
false, false,
false, !FULL_BIAS_WIDTH,
0,
CPU);
unsigned long cpu_end = read_cycles();
printf("Cycles taken: %u\n", cpu_end-cpu_start);
printf("Starting gemmini matmul\n");
unsigned long start = read_cycles();
tiled_matmul_auto(MAT_DIM_I, MAT_DIM_J, MAT_DIM_K,
(elem_t*)full_A, (elem_t*)full_B, NO_BIAS ? NULL : &full_D[0][0], (elem_t*)A_at_B_GEMM,
MAT_DIM_K, MAT_DIM_J, MAT_DIM_J, MAT_DIM_J,
MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
NO_ACTIVATION, ACC_SCALE_IDENTITY, BERT_SCALE, false,
false, false,
false, !FULL_BIAS_WIDTH,
0,
WS);
*/
// The only active computation: A @ B with the SOFTMAX activation argument,
// written to Softmax_GEMM. The bias pointer is NULL because NO_BIAS is 1.
// NOTE(review): WS presumably selects weight-stationary execution on the
// accelerator - confirm against gemmini.h.
// NOTE(review): per the discussion accompanying this listing, this call
// tripped a load_controller assertion until the hardware configuration set
// has_normalizations=true.
tiled_matmul_auto(MAT_DIM_I, MAT_DIM_J, MAT_DIM_K,
(elem_t*)full_A, (elem_t*)full_B, NO_BIAS ? NULL : &full_D[0][0], (elem_t*)Softmax_GEMM,
MAT_DIM_K, MAT_DIM_J, MAT_DIM_J, MAT_DIM_J,
MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY, MVIN_SCALE_IDENTITY,
SOFTMAX, ACC_SCALE_IDENTITY, BERT_SCALE, false,
false, false,
false, !FULL_BIAS_WIDTH,
0,
WS);
//unsigned long end = read_cycles();
//printf("Cycles taken: %u\n", end-start);
// Disabled: printouts of the inputs and of the CPU / NO_ACTIVATION results.
/*
printf("A:\n");
full_printMatrix(full_A);
printf("B:\n");
full_printMatrix(full_B);
printf("A @ B (CPU):\n");
full_printMatrix(A_at_B_CPU);
printf("A @ B (Gemmini):\n");
full_printMatrix(A_at_B_GEMM);
printf("Softmax (CPU):\n");
full_printMatrix(Softmax_CPU);
*/
// Print the result of the active call.
printf("Softmax (Gemmini):\n");
full_printMatrix(Softmax_GEMM);
printf("\n");
// Terminate explicitly with a success status.
exit(0);
}
I then compiled Verilator with the ./scripts/build-verilator.sh command and ran the test with ./scripts/run-verilator.sh $(which ./software/gemmini-rocc-tests/build/transformers/softmax_test-baremetal).
I also modified xcustom.h to print all commands that are executed on Gemmini.
Here is the output:
This emulator compiled with JTAG Remote Bitbang client. To enable, use +jtag_rbb_enable=1.
Listening on port 45603
[UART] UART0 is here (stdin/stdout).
asdd
Testing float print: 1.0: %f -- 123.456789: %f -- -12.765: %f -- -67894.0654561: %f -- 1.012345487: %f
GEMMINI 0RR CUSTOM_3, 0x3, 7, rs1=00000000, rs2=00000000
-> DONE
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00010004, rs2=00000000
-> DONE
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00000002, rs2=00000002
-> DONE
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00040101, rs2=00000002
-> DONE
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00040109, rs2=00000002
-> DONE
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00040111, rs2=00000008
-> DONE
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00010003, rs2=0000001b
-> DONE
GEMMINI 0RR CUSTOM_3, 0x3, 0, rs1=00050003, rs2=0000001b
-> DONE
GEMMINI 0RR CUSTOM_3, 0x3, 9, rs1=00020002, rs2=00010001
-> DONE
GEMMINI 0RR CUSTOM_3, 0x3, 10, rs1=80002740, rs2=8000273c
-> DONE
GEMMINI 0RR CUSTOM_3, 0x3, 11, rs1=00000000, rs2=80002748
-> DONE
GEMMINI 0RR CUSTOM_3, 0x3, 12, rs1=00000002, rs2=00000002
-> DONE
GEMMINI 0RR CUSTOM_3, 0x3, 13, rs1=00000002, rs2=00000002
-> DONE
GEMMINI 0RR CUSTOM_3, 0x3, 8, rs1=00000400, rs2=00000000
[94000] %Error: chipyard.TestHarness.CustomGemminiSoCConfig.top.v:304744: Assertion failed in TOP.TestHarness.chiptop.system.tile_prci_domain.tile_reset_domain.tile.gemmini.load_controller
%Error: /home/lukas/Documents/Gemmini/chipyard/sims/verilator/generated-src/chipyard.TestHarness.CustomGemminiSoCConfig/chipyard.TestHarness.CustomGemminiSoCConfig.top.v:304744: Verilog $stop
Aborting...
So the last command fails, which is the following from gemmini.h:
// weight-stationary matmul loop
#define gemmini_loop_ws(I, J, K, pad_I, pad_J, pad_K, A, B, D, C, A_stride, B_stride, D_stride, C_stride, A_transpose, B_transpose, full_C, low_D, ex_accumulate, act) \
{ \
ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(pad_K) << 32) | ((uint64_t)(pad_J) << 16) | (uint64_t)(pad_I), ((uint64_t)(K) << 32) | ((uint64_t)(J) << 16) | (uint64_t)(I), k_LOOP_WS_CONFIG_BOUNDS) \
ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, A, B, k_LOOP_WS_CONFIG_ADDRS_AB) \
ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, D, C, k_LOOP_WS_CONFIG_ADDRS_DC) \
ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, A_stride, B_stride, k_LOOP_WS_CONFIG_STRIDES_AB) \
ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, D_stride, C_stride, k_LOOP_WS_CONFIG_STRIDES_DC) \
-> This fails : ROCC_INSTRUCTION_RS1_RS2(XCUSTOM_ACC, ((uint64_t)(act) << 8) | ((low_D) << 2) | ((full_C) << 1) | (ex_accumulate), ((B_transpose) << 1) | (A_transpose), k_LOOP_WS) \
}
The corresponding assert that fails is in DMACommandTracker Line 89
If I just change the activation to be RELU it works as expected.
Here are the Versions I am using:
- gemmini-rocc-tests @ ae0cd82
- gemmini @ v0.7.0
- chipyard @ 1.8.1
I hope someone can help me with this, Best Regards, Lukas
Okay, so I was missing the has_normalizations=true flag; it is working now. But I was wondering if it would be feasible to also add a floating point implementation of Softmax and GELU?
I was wondering if it would be feasible to also add a floating point implementation of Softmax and GELU?
That would be a great feature to have for sure, but it's not planned for the near future, just due to a lack of manpower. Hopefully, we get someone who wants to start working on that, or an outside contributor makes a PR to add that feature.
For now, Gemmini's transformer support is targeted towards I-BERT, rather than floating-point BERT implementations