xxHash icon indicating copy to clipboard operation
xxHash copied to clipboard

Bug: XXH_INLINE_ALL does not always force inline? How to force inline?

Open simonhf opened this issue 7 months ago • 4 comments

Here's a simple program:

$ cat mystery.c
#include <stdio.h>
#include <assert.h>
#include <sys/time.h>

#include "xxHash/xxhash.h"

// export USE_FOO2=0; gcc -DUSE_FOO2=$USE_FOO2 -DXXH_INLINE_ALL -O1 -S -o mystery$USE_FOO2.s mystery.c && gcc -c -o mystery$USE_FOO2.o mystery$USE_FOO2.s && gcc -o mystery$USE_FOO2.exe mystery$USE_FOO2.o && ./mystery$USE_FOO2.exe; cat mystery$USE_FOO2.s | egrep -i "(^[a-z].*:|call)"
// export USE_FOO2=1; gcc -DUSE_FOO2=$USE_FOO2 -DXXH_INLINE_ALL -O1 -S -o mystery$USE_FOO2.s mystery.c && gcc -c -o mystery$USE_FOO2.o mystery$USE_FOO2.s && gcc -o mystery$USE_FOO2.exe mystery$USE_FOO2.o && ./mystery$USE_FOO2.exe; cat mystery$USE_FOO2.s | egrep -i "(^[a-z].*:|call)"

// Return the current time of day in seconds (microsecond resolution) as
// reported by gettimeofday().
//
// The call is made outside of assert(): assert() discards its argument when
// NDEBUG is defined, which would otherwise skip gettimeofday() entirely and
// leave `tv` unset.
double get_time_in_seconds(void) {
    struct timeval tv = {0};
    int rc = gettimeofday(&tv, NULL);
    assert(rc >= 0);
    (void)rc; // avoids an unused-variable warning when NDEBUG removes the assert
    return (double)tv.tv_sec + 1.e-6 * (double)tv.tv_usec;
}

// Number of hashes computed by each benchmark loop (used as the loop bound).
uint64_t loop = 500000000;

// Benchmark: hash the loop counter `i` (sizeof(i) bytes) with XXH64 and seed
// 123456, `loop` times, XOR-ing every hash into one accumulator. Then print
// the hash count, the accumulated XOR, the elapsed seconds and the
// hashes-per-second rate.
//
// The noinline attribute asks the compiler not to inline this function into
// its caller.
void __attribute__ ((noinline)) foo1(void) {
    XXH64_hash_t hash_xor = 0;
    double t1 = get_time_in_seconds();
    for(uint64_t i = 0; i < loop ; i ++) {
        XXH64_hash_t hash = XXH64(&i, sizeof(i), 123456 /* seed */);
        hash_xor = hash_xor ^ hash;
    }
    double t2 = get_time_in_seconds();
    // %lu/%lx only match 64-bit values where unsigned long is 64 bits wide;
    // cast to unsigned long long and use %llu/%llx so the format always
    // matches its arguments.
    printf("- %llu 64bit hashes created xor hash 0x%llx in %f seconds or %.0f per second\n", (unsigned long long)loop, (unsigned long long)hash_xor, t2 - t1, loop / (t2 - t1));
}

#if USE_FOO2
// Second benchmark function, compiled only when USE_FOO2 is non-zero. It
// performs the same work as foo1: hash the loop counter `i` with XXH64 and
// seed 123456, `loop` times, XOR the hashes together and print the totals.
// It is kept as a separate function on purpose; main() never calls it.
void __attribute__ ((noinline)) foo2(void) {
    XXH64_hash_t hash_xor = 0;
    double t1 = get_time_in_seconds();
    for(uint64_t i = 0; i < loop ; i ++) {
        XXH64_hash_t hash = XXH64(&i, sizeof(i), 123456 /* seed */);
        hash_xor = hash_xor ^ hash;
    }
    double t2 = get_time_in_seconds();
    // %lu/%lx only match 64-bit values where unsigned long is 64 bits wide;
    // cast to unsigned long long and use %llu/%llx so the format always
    // matches its arguments.
    printf("- %llu 64bit hashes created xor hash 0x%llx in %f seconds or %.0f per second\n", (unsigned long long)loop, (unsigned long long)hash_xor, t2 - t1, loop / (t2 - t1));
}
#endif

// Entry point: runs the foo1 benchmark only. The foo2() call stays commented
// out on purpose: the reproduction only needs foo2 to exist in the
// translation unit (when USE_FOO2 is non-zero), not to be executed.
//
// Declared as `int main(void)` (the standard hosted signature) so the process
// exit status is well defined.
int main(void) {
    foo1();
    //foo2();
    return 0;
}

Compiling and running it twice: the 1st time (USE_FOO2=0) XXH64() is fully inlined, but the 2nd time (USE_FOO2=1) it is not fully inlined — a call to XXH64_finalize remains:

$ gcc --version
gcc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0

$ git clone https://github.com/Cyan4973/xxHash.git

$ export USE_FOO2=0; gcc -DUSE_FOO2=$USE_FOO2 -DXXH_INLINE_ALL -O1 -S -o mystery$USE_FOO2.s mystery.c && gcc -c -o mystery$USE_FOO2.o mystery$USE_FOO2.s && gcc -o mystery$USE_FOO2.exe mystery$USE_FOO2.o && ./mystery$USE_FOO2.exe; cat mystery$USE_FOO2.s | egrep -i "(^[a-z].*:|call)"
- 500000000 64bit hashes created xor hash 0xde5d248747c84e34 in 1.452864 seconds or 344147851 per second
get_time_in_seconds:
	call	gettimeofday@PLT
	call	__assert_fail@PLT
	call	__stack_chk_fail@PLT
foo1:
	call	get_time_in_seconds
	call	get_time_in_seconds
	call	__printf_chk@PLT
main:
	call	foo1
loop:

$ export USE_FOO2=1; gcc -DUSE_FOO2=$USE_FOO2 -DXXH_INLINE_ALL -O1 -S -o mystery$USE_FOO2.s mystery.c && gcc -c -o mystery$USE_FOO2.o mystery$USE_FOO2.s && gcc -o mystery$USE_FOO2.exe mystery$USE_FOO2.o && ./mystery$USE_FOO2.exe; cat mystery$USE_FOO2.s | egrep -i "(^[a-z].*:|call)"
- 500000000 64bit hashes created xor hash 0xde5d248747c84e34 in 2.790860 seconds or 179156250 per second
XXH64_finalize:
get_time_in_seconds:
	call	gettimeofday@PLT
	call	__assert_fail@PLT
	call	__stack_chk_fail@PLT
foo1:
	call	get_time_in_seconds
	call	XXH64_finalize
	call	get_time_in_seconds
	call	__printf_chk@PLT
	call	__stack_chk_fail@PLT
foo2:
	call	get_time_in_seconds
	call	XXH64_finalize
	call	get_time_in_seconds
	call	__printf_chk@PLT
	call	__stack_chk_fail@PLT
main:
	call	foo1
loop:

$ wc --bytes mystery*
16280 mystery0.exe
 3088 mystery0.o
 3508 mystery0.s
16344 mystery1.exe
 4064 mystery1.o
 6976 mystery1.s
 3314 mystery.c

The xxHash README says "By default, xxHash uses __attribute__((always_inline)) and __forceinline to improve performance at the cost of code size." But does it really? Or is this a compiler bug? Or should I be compiling the above example differently somehow?

P.S. This also goes against my understanding of how the compiler decides whether to inline a function when force inline is not specified. I always thought the compiler decides based on the code size of the function being inlined. But in this example, the code size of that function stays the same; we just add another function which is never actually called. So... ?!

simonhf avatar Dec 02 '23 01:12 simonhf