heed OS Error 22: invalid argument when opening transactions on threads

Bug description

When running the following example using heed, some of the `unwrap()` calls panic with OS error 22 (invalid argument):

use heed::EnvOpenOptions;

/// Reproducer: spawns `NBR_THREADS` threads; each one opens its own
/// environment in a fresh temporary directory (created under the current
/// directory) and creates `NBR_DB` named databases in it.
///
/// Every fallible call is unwrapped on purpose, so any error surfaces as a
/// panic that is propagated when the thread is joined.
fn main() {
    const NBR_THREADS: usize = 11;
    const NBR_DB: u32 = 100;

    // Start every worker before joining any of them.
    let workers: Vec<_> = (0..NBR_THREADS)
        .map(|_| {
            std::thread::spawn(|| {
                let tmp = tempfile::tempdir_in(".").unwrap();

                let mut opts = EnvOpenOptions::new();
                opts.max_dbs(NBR_DB);
                let env = opts.open(tmp.path()).unwrap();

                (0..NBR_DB).for_each(|n| {
                    env.create_poly_database(Some(&format!("db{n}"))).unwrap();
                });
            })
        })
        .collect();

    // A panic inside a worker is re-raised here by the `unwrap`.
    workers.into_iter().for_each(|w| w.join().unwrap());
    println!("ok!");
}

(see the associated repository for more information)

Raw lmdb reproducer

The issue can be further minimized in C, directly using the master branch (not master3) of lmdb instead of heed, with the following:

#include <stdlib.h>
#include <stdio.h>
#include <errno.h>
#include <pthread.h>
#include <unistd.h>
#include "../lmdb.h"
#include "../midl.h"

#define NBR_THREADS 20
#define NBR_DB 2

/*
 * Thread entry point of the reproducer.
 *
 * param: heap-allocated, NUL-terminated path of the environment directory.
 *        This function takes ownership of it and frees it before returning.
 *
 * Creates and opens one LMDB environment in that directory, then begins and
 * commits NBR_DB top-level write transactions, keeping each one open for one
 * second. Failures are reported on stdout/stderr.
 *
 * Returns NULL in every case.
 */
void* run(void* param) {
    char* dir_name = (char*) param;
    /* Start from NULL so the shared exit path can tell whether an
     * environment handle was ever created. */
    MDB_env* env = NULL;
    /* Return code of the most recent mdb_txn_begin() call. */
    int parent_txn_res = 0;

    printf("Starting %s\n", dir_name);

    if (mdb_env_create(&env) != 0) {
        printf("ERROR creating env\n");
        goto exit;
    }
    if (mdb_env_set_maxdbs(env, NBR_DB) != 0) {
        printf("ERROR setting maxdbs\n");
        goto exit;
    }
    if (mdb_env_open(env, dir_name, MDB_NOTLS, 0600) != 0) {
        printf("ERROR opening env\n");
        goto exit;
    }

    for (int i = 0; i < NBR_DB; ++i) {
        /* The name is only formatted, never passed to LMDB: this minimized
         * reproducer does not open any named database. A stack buffer
         * avoids an unchecked malloc and a second cleanup path. */
        char db_name[100];
        snprintf(db_name, sizeof db_name, "db_%i", i);

        MDB_txn* txn;

        parent_txn_res = mdb_txn_begin(env, NULL, 0, &txn);
        if (parent_txn_res != 0) {
            /* Capture errno immediately: the printf calls below are allowed
             * to overwrite it before it gets reported. */
            int saved_errno = errno;

            printf("ERROR opening nested txn\n");
            printf("[%s]ERROR opening parent_txn, %d\n", dir_name, parent_txn_res);
            fprintf(stderr, "errno code: %d ", saved_errno);
            errno = saved_errno;
            perror("Cause");
            goto exit;
        }

        /* NOTE(review): the sleep presumably keeps the transactions of the
         * different threads alive at the same time -- confirm. */
        sleep(1);

        int commit_res = mdb_txn_commit(txn);
        if (commit_res != 0) {
            printf("[%s]ERROR committing txn, %d\n", dir_name, commit_res);
            goto exit;
        }
    }
    printf("ok env\n");
exit:
    free(dir_name);
    if (env != NULL) {
        mdb_env_close(env);
    }

    return NULL;
}

/*
 * Driver of the reproducer: starts up to NBR_THREADS threads running run(),
 * each with its own "tmp_env_<i>" directory name, then waits for all of the
 * threads that were actually started.
 *
 * NOTE(review): the tmp_env_<i> directories are not created here; they
 * apparently have to exist before the program is run -- confirm.
 *
 * Returns 0 when every thread could be started, 1 otherwise.
 */
int main(int argc, char** argv) {
    (void) argc;
    (void) argv;

    pthread_t threads[NBR_THREADS];
    /* Number of threads successfully created; only these may be joined. */
    int started = 0;

    for (int i = 0; i < NBR_THREADS; ++i) {
        /* Ownership of dir_name passes to the thread, which frees it. */
        char* dir_name = malloc(100);
        if (dir_name == NULL) {
            fprintf(stderr, "ERROR allocating directory name for thread %d\n", i);
            break;
        }
        snprintf(dir_name, 100, "tmp_env_%i", i);

        int rc = pthread_create(&threads[started], NULL, run, dir_name);
        if (rc != 0) {
            fprintf(stderr, "ERROR creating thread %d: %d\n", i, rc);
            /* The thread never started, so nobody else will free this. */
            free(dir_name);
            break;
        }
        ++started;
    }

    for (int i = 0; i < started; ++i) {
        pthread_join(threads[i], NULL);
    }

    if (started != NBR_THREADS) {
        return 1;
    }
    printf("ok!\n");
    return 0;
}

(see the associated repository for more information)

Likely related to https://github.com/meilisearch/meilisearch/issues/3017

Nov 10 '22 11:11 dureuill

This issue seems directly related to lmdb and not heed. @hyc do you think you could look into it? It reproduces 100% of the time on macOS.

Nov 10 '22 11:11 irevoire

After creating the tmp_env directories, worked fine here on Linux. Will try Mac shortly.

Nov 10 '22 12:11 hyc

Works fine here

hyc@Howards-MacBook-Pro xyz % ./prog
Starting tmp_env_0
Starting tmp_env_5
Starting tmp_env_6
Starting tmp_env_2
Starting tmp_env_7
Starting tmp_env_3
Starting tmp_env_4
Starting tmp_env_1
Starting tmp_env_8
Starting tmp_env_9
ok env
ok env
ok env
ok env
ok env
ok env
ok env
ok env
ok env
ok env
ok!
hyc@Howards-MacBook-Pro xyz % uname -a
Darwin Howards-MacBook-Pro.local 21.1.0 Darwin Kernel Version 21.1.0: Wed Oct 13 17:33:24 PDT 2021; root:xnu-8019.41.5~1/RELEASE_ARM64_T8101 arm64
hyc@Howards-MacBook-Pro xyz % sw_vers
ProductName:	macOS
ProductVersion:	12.0.1
BuildVersion:	21A559

Nov 10 '22 12:11 hyc

Ah, sorry, I used 10 threads in the linked example, but you need at least 11 to see the issue (at least on Mac M1).

I updated the example in the repository and in the issue description above.

Nov 10 '22 12:11 dureuill

It's failing in LOCK_MUTEX, which defaults to using semop() on MacOS. The manpage says

     [EINVAL]           No semaphore set corresponds to semid, or the process would exceed the system-
                        defined limit for the number of per-process SEM_UNDO structures.

So this appears to be an OS limitation. No idea if/how that's tunable, I leave that up to you.

Nov 10 '22 17:11 hyc

Instead of using the default SysV Semaphores, you can compile mdb.c with -DMDB_USE_POSIX_SEM and this problem goes away. Unfortunately POSIX semaphores on MacOS aren't robust, killing a process that holds a semaphore will leave it locked.

Nov 10 '22 17:11 hyc

And thus, if I understood correctly, limiting our number of threads to 10 would work?

Nov 10 '22 17:11 irevoire

I suppose so. Or just stop creating so many environments. Why are you using one environment per thread?

Nov 10 '22 17:11 hyc

Meilisearch is not specifically creating one environment per thread, but when we run the tests, it produces this behavior because the tests are run in parallel. So the easy fix for that is to reduce the number of tests run at the same time in the CI.

The real issue is that on Arch Linux, we had another issue returning an OS error 22: we can't create any Meilisearch index on this OS. We will see whether the above minimal reproducible example is the one that triggers this bug or not. We thought it was the same bug.

Nov 10 '22 18:11 Kerollmops

> the tests are run in parallel

Or run tests in separate processes instead of separate threads.

Nov 10 '22 18:11 hyc

Thank you for your insight, hyc; your pinpointing of which resource is limited was really helpful.

Some points about this issue:

- It appears unrelated to the Arch Linux error 22 (meilisearch/meilisearch#3017), which appears to be an lmdb-rs build issue (it uses the system lmdb instead of the vendored lmdb, see this comment for more information).
- `ipcs -S` (shell command) displays the various IPC-related limits in macOS. The problematic one is `semume: 10` (max # of undo entries per process).
- It is changeable: https://stackoverflow.com/questions/64158180/how-to-increase-the-maximum-semaphore-limit-on-macos-bsd but I didn't find a way to query it programmatically at this point. For tests, and since there is no way to trigger this from a production meilisearch, I guess it should be OK to increase the limit before running the tests in CI.
- I suggest we document this in heed as an OS-specific behavior, as heed users are not expected to read the code of lmdb to find out that it uses OS resources that exist in a surprisingly limited number per process (the default value on macOS is 10). I will open a PR on heed later, suggesting an initial wording.

Nov 14 '22 12:11 dureuill