FPU support for ESP32-S3 — follow-up to #2016
As #2016 has been closed and I'm not allowed to reopen it, I'm posting this follow-up here.
Just as an FYI: math:Moniker(..) now calls Monikerf, but my NIF implementation is still ~10% faster (~27% before the bugfix). Maybe this is because I placed the function at the beginning of the function table.
Do you want to share the source code of your NIF and your benchmark?
This is my test code:
% ..
%% handle_call({circle, N}, ...): micro-benchmark for the float math NIFs.
%% Sums sqrt(sin(A)^2 + cos(A)^2) for N angles A in (0, pi] (each term is
%% ~1.0, so S should come out close to N) and measures the elapsed time
%% with a 1 MHz GP timer, so Tics is in microseconds.
%% NOTE(review): assumes #state{} has timer and cycle_time fields — the
%% record definition is outside this snippet; confirm against the module.
handle_call({circle, N}, _From, #state{timer = TimerIn, cycle_time = CycleTime} = State) ->
%% Lazily create, enable and start the hardware timer on first use
%% (timer = undef); on later calls reuse the timer kept in server state.
MakeTimer = fun(undef) ->
%% 1 MHz resolution => one tic per microsecond.
TimerResolution = 1 * 1000 * 1000,
{ok, Timer} = esp_gptimer:new(TimerResolution),
ok = esp_gptimer:enable(Timer),
ok = esp_gptimer:start(Timer),
io:format("Timer=~p~n", [Timer]),
Timer
;
(Timer) ->
Timer
end,
%% Named fun: counts P down from N to 0, accumulating sqrt(sin^2 + cos^2)
%% via the esp:*f NIF variants (the math:* equivalents are kept commented
%% out for comparison runs).
CheckLoop = fun CheckLoop(Sum, 0) ->
Sum
;
CheckLoop(Sum, P) ->
Arg = (math:pi() / N) * P,
Sin = esp:sinf(Arg),
Cos = esp:cosf(Arg),
% Sin = math:sin(Arg),
% Cos = math:cos(Arg),
Sqr = Sin * Sin + Cos * Cos,
CheckLoop(Sum + esp:sqrtf(Sqr), P - 1)
% CheckLoop(Sum + math:sqrt(Sqr), P - 1)
end,
Timer = MakeTimer(TimerIn),
%% Sample the timer immediately before and after the loop; the difference
%% is the benchmark duration in tics (microseconds at 1 MHz).
{ok, StartTics} = esp_gptimer:get(Timer),
S = CheckLoop(0, N),
{ok, EndTics} = esp_gptimer:get(Timer),
Tics = EndTics - StartTics,
%% Reply with {Sum, ElapsedTics}, keep the timer for reuse and re-arm the
%% gen_server timeout with CycleTime.
{reply, {S, Tics}, State#state{timer = Timer}, CycleTime}
;
% ..
And these are my NIFs:
/*
 * Shared dispatcher for the one-argument single-precision math NIFs
 * (sinf, cosf, ...): converts argv[0] to a float, applies f, maps any FP
 * exception to an error term, and boxes the result on the process heap.
 * NOTE(review): avm_float_t may be double — f takes/returns float, so the
 * value is narrowed on the call; confirm this precision loss is intended.
 */
static term nif_etlib_math(Context *ctx, term argv[], float (*f)(float))
{
    avm_float_t input = term_conv_to_float(argv[0]);
    maybe_clear_exceptions();

    avm_float_t result = f(input);
    term exception = get_exception(result);
    if (exception != OK_ATOM) {
        return exception;
    }

    /* Reserve room for one boxed float before building the result term. */
    if (UNLIKELY(memory_ensure_free_opt(ctx, FLOAT_SIZE, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) {
        return OUT_OF_MEMORY_ATOM;
    }
    return term_from_float(result, &ctx->heap);
}
/* esp:sinf/1 — single-precision sine, delegated to the shared math NIF. */
static inline term nif_etlib_sinf(Context *ctx, int argc, term argv[])
{
    (void) argc; /* arity is fixed at 1 by the NIF table entry */
    return nif_etlib_math(ctx, argv, sinf);
}
/* esp:asinf/1 — single-precision arcsine, delegated to the shared math NIF. */
static inline term nif_etlib_asinf(Context *ctx, int argc, term argv[])
{
    (void) argc; /* arity is fixed at 1 by the NIF table entry */
    return nif_etlib_math(ctx, argv, asinf);
}
/* esp:cosf/1 — single-precision cosine, delegated to the shared math NIF. */
static inline term nif_etlib_cosf(Context *ctx, int argc, term argv[])
{
    (void) argc; /* arity is fixed at 1 by the NIF table entry */
    return nif_etlib_math(ctx, argv, cosf);
}
/* esp:acosf/1 — single-precision arccosine, delegated to the shared math NIF. */
static inline term nif_etlib_acosf(Context *ctx, int argc, term argv[])
{
    (void) argc; /* arity is fixed at 1 by the NIF table entry */
    return nif_etlib_math(ctx, argv, acosf);
}
/* esp:sqrtf/1 — single-precision square root, delegated to the shared math NIF. */
static inline term nif_etlib_sqrtf(Context *ctx, int argc, term argv[])
{
    (void) argc; /* arity is fixed at 1 by the NIF table entry */
    return nif_etlib_math(ctx, argv, sqrtf);
}
// ..
/* Resolves a NIF by its fully qualified "module:function/arity" name.
 * Lookup is a linear strcmp scan over this table (see
 * avm_etlib_utl_resolve_nif), so entries placed earlier resolve faster —
 * the table order is deliberate and performance-relevant.
 * Returns NULL when the name is not found. */
static const Nif* etlib_get_nif(const char *nifname) {
/* NULL nifname terminates the table for the scan loop. */
static Name_to_Nifs name_to_nifs[] = {
{"etlib:init/0", { .base.type = NIFFunctionType, .nif_ptr = nif_etlib_init}},
{"esp:sinf/1", { .base.type = NIFFunctionType, .nif_ptr = nif_etlib_sinf}},
{"esp:asinf/1", { .base.type = NIFFunctionType, .nif_ptr = nif_etlib_asinf}},
{"esp:cosf/1", { .base.type = NIFFunctionType, .nif_ptr = nif_etlib_cosf}},
{"esp:acosf/1", { .base.type = NIFFunctionType, .nif_ptr = nif_etlib_acosf}},
{"esp:sqrtf/1", { .base.type = NIFFunctionType, .nif_ptr = nif_etlib_sqrtf}},
// ..
{NULL}
};
return avm_etlib_utl_resolve_nif(name_to_nifs, nifname);
}
where
/*
 * Scans a NULL-name-terminated Name_to_Nifs table for nifname and returns
 * the matching Nif descriptor, or NULL when no entry matches.
 * Linear search: cost grows with the entry's position in the table.
 */
Nif const *avm_etlib_utl_resolve_nif(Name_to_Nifs * const name_to_nifs, const char *nifname)
{
    /* "entry" instead of "try": avoids the C++ keyword for mixed builds. */
    for (Name_to_Nifs *entry = name_to_nifs; entry->nifname != NULL; entry++) {
        if (strcmp(entry->nifname, nifname) == 0) {
            TRACE("Resolved platform nif %s ...\n", nifname);
            return &entry->nif;
        }
    }
    return NULL;
}
and
/* One entry of the NIF lookup table: maps a fully qualified name
 * ("module:function/arity") to its Nif descriptor. Tables built from this
 * type are terminated by an entry whose nifname is NULL. */
typedef struct {
const char* const nifname;
const Nif nif;
} Name_to_Nifs;
New finding:
Implementing my test code as a NIF:
/*
 * Native port of the Erlang benchmark loop: sums sqrtf(sin^2 + cos^2)
 * over `divider` angles in [0, pi) and returns the sum as a boxed float.
 * Fixes vs. the first version:
 *  - reserves heap space (memory_ensure_free_opt) before term_from_float,
 *    matching nif_etlib_math — without it the boxed float could be built
 *    on a heap with no free words;
 *  - unsigned loop index, so there is no signed/unsigned comparison;
 *  - a negative argument no longer wraps to a huge uint32_t (which made
 *    the loop spin ~2^32 times); it now yields an empty range / sum 0.0;
 *  - locals renamed so they no longer shadow libm's sin/cos.
 */
static inline term nif_etlib_circle_test(Context *ctx, int argc, term argv[])
{
    uint32_t divider = 1; /* default when called with no argument */
    if (argc > 0) {
        term divider_term = argv[0];
        VALIDATE_VALUE(divider_term, term_is_integer);
        int32_t requested = term_to_int32(divider_term);
        divider = requested > 0 ? (uint32_t) requested : 0;
    }

    float sum = 0.0f;
    /* volatile keeps the optimizer from folding the whole benchmark loop
     * into a constant. */
    volatile float sin_v, cos_v, sqr_v;
    for (uint32_t i = 0; i < divider; i++) {
        /* divider > 0 here, so the division is well-defined. */
        float angle = (float) ((M_PI / divider) * i);
        sin_v = sinf(angle);
        cos_v = cosf(angle);
        sqr_v = sin_v * sin_v + cos_v * cos_v;
        sum += sqrtf(sqr_v);
    }

    /* Reserve room for one boxed float, as the other math NIFs do. */
    if (UNLIKELY(memory_ensure_free_opt(ctx, FLOAT_SIZE, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) {
        return OUT_OF_MEMORY_ATOM;
    }
    return term_from_float(sum, &ctx->heap);
}
Calling it from Erlang has shown that it executes dramatically faster than the Erlang version (5548 µs–5572 µs vs. 437465 µs–459812 µs, both called with N = 1000, where µs means microseconds).
This makes me think there is a large calling overhead — not only for self-implemented NIFs and BIFs, but for all function calls that are resolved by a string lookup — which might be worth optimizing.