AtomVM icon indicating copy to clipboard operation
AtomVM copied to clipboard

FPU support for ESP32-S3 followup for #2016

Open elsbiet opened this issue 1 month ago • 3 comments

Opening this as a follow-up because #2016 has been closed and I'm not allowed to reopen it.

Just as info: math:Moniker(..) now calls Monikerf, but my NIF implementation is still ~10% faster (~27% before the bugfix). Maybe this is because I placed the function at the beginning of the function table.

elsbiet avatar Dec 08 '25 08:12 elsbiet

Do you want to share source code of your nif and your benchmark?

pguyot avatar Dec 08 '25 19:12 pguyot

this is my test code:

% ..

handle_call({circle, N}, _From, #state{timer = TimerIn, cycle_time = CycleTime} = State) ->
    MakeTimer = fun(undef) ->
                        TimerResolution = 1 * 1000 * 1000,
                        {ok, Timer} = esp_gptimer:new(TimerResolution),
                        ok = esp_gptimer:enable(Timer),
                        ok = esp_gptimer:start(Timer),

                        io:format("Timer=~p~n", [Timer]),
                        Timer
                        ;
                   (Timer) ->
                        Timer
                end,
    CheckLoop = fun CheckLoop(Sum, 0) ->
                        Sum
                        ;
                    CheckLoop(Sum, P) ->
                        Arg = (math:pi() / N) * P,
                        Sin = esp:sinf(Arg),
                        Cos = esp:cosf(Arg),
%                       Sin = math:sin(Arg),
%                       Cos = math:cos(Arg),
                        Sqr = Sin * Sin + Cos * Cos,
                        CheckLoop(Sum + esp:sqrtf(Sqr), P - 1)
%                       CheckLoop(Sum + math:sqrt(Sqr), P - 1)
                end,

    Timer = MakeTimer(TimerIn),
    {ok, StartTics} = esp_gptimer:get(Timer),
    S = CheckLoop(0, N),
    {ok, EndTics} = esp_gptimer:get(Timer),
    Tics = EndTics - StartTics,

    {reply, {S, Tics}, State#state{timer = Timer}, CycleTime}
    ;

% ..

and these are my nifs:

static term nif_etlib_math(Context *ctx, term argv[], float (*f)(float)) {
    avm_float_t x = term_conv_to_float(argv[0]);
    maybe_clear_exceptions();
    avm_float_t y = f(x);
    term exception = get_exception(y);
    if (exception != OK_ATOM) {
        return exception;
    }
    if (UNLIKELY(memory_ensure_free_opt(ctx, FLOAT_SIZE, MEMORY_CAN_SHRINK) != MEMORY_GC_OK)) {
        return OUT_OF_MEMORY_ATOM;
    }
    return term_from_float(y, &ctx->heap);
}

static inline term nif_etlib_sinf(Context *ctx, int argc, term argv[]) {
    return nif_etlib_math(ctx, argv, sinf);
}

static inline term nif_etlib_asinf(Context *ctx, int argc, term argv[]) {
    return nif_etlib_math(ctx, argv, asinf);
}

static inline term nif_etlib_cosf(Context *ctx, int argc, term argv[]) {
    return nif_etlib_math(ctx, argv, cosf);
}

static inline term nif_etlib_acosf(Context *ctx, int argc, term argv[]) {
    return nif_etlib_math(ctx, argv, acosf);
}

static inline term nif_etlib_sqrtf(Context *ctx, int argc, term argv[]) {
    return nif_etlib_math(ctx, argv, sqrtf);
}

// ..

static const Nif* etlib_get_nif(const char *nifname) {
    static Name_to_Nifs name_to_nifs[] = {
        {"etlib:init/0",                { .base.type = NIFFunctionType, .nif_ptr = nif_etlib_init}},

        {"esp:sinf/1",                  { .base.type = NIFFunctionType, .nif_ptr = nif_etlib_sinf}},
        {"esp:asinf/1",                 { .base.type = NIFFunctionType, .nif_ptr = nif_etlib_asinf}},
        {"esp:cosf/1",                  { .base.type = NIFFunctionType, .nif_ptr = nif_etlib_cosf}},
        {"esp:acosf/1",                 { .base.type = NIFFunctionType, .nif_ptr = nif_etlib_acosf}},
        {"esp:sqrtf/1",                 { .base.type = NIFFunctionType, .nif_ptr = nif_etlib_sqrtf}},

// ..

       {NULL}
    };
    return avm_etlib_utl_resolve_nif(name_to_nifs, nifname);
}

where

Nif const *avm_etlib_utl_resolve_nif(Name_to_Nifs * const name_to_nifs, const char *nifname) {
    Nif const *nif = NULL;
    for (Name_to_Nifs *try = name_to_nifs; try->nifname != NULL; try++) {
        if (strcmp(try->nifname, nifname) == 0) {
            TRACE("Resolved platform nif %s ...\n", nifname);
            nif = &try->nif;
            break;
        }
    }
    return nif;
}

and

typedef struct {
    const char*         const   nifname;
    const Nif                   nif;
} Name_to_Nifs; 

elsbiet avatar Dec 09 '25 07:12 elsbiet

new finding:

implementing my test code as a nif

static inline term nif_etlib_circle_test(Context *ctx, int argc, term argv[]) {
    uint32_t divider = 1;
    if (argc > 0) {
        term divider_term = argv[0];
        VALIDATE_VALUE(divider_term, term_is_integer);
        divider = (uint32_t) term_to_int32(divider_term);
    }
    float sum = 0.0;
    volatile float sin, cos, sqr;
    for (int i = 0; i < divider; i++) {
        float angle = (M_PI / divider) * i;
        sin = sinf(angle);
        cos = cosf(angle);
        sqr = sin * sin + cos * cos;
        sum += sqrtf(sqr);
    }
    return term_from_float(sum, &ctx->heap);
}

Calling it from Erlang shows that it executes dramatically faster than the Erlang version: 5548–5572 µs versus 437465–459812 µs, both called with N=1000.

This makes me think there is a large calling overhead (not only for self-implemented NIFs and BIFs, but for all function calls that are resolved by a string lookup), which might be optimized.

elsbiet avatar Dec 10 '25 09:12 elsbiet