sysbench
sysbench copied to clipboard
Sysbench CPU is absolutely broken.
In
/*
 * Runs one CPU benchmark event: tests every c in [3, max_prime) for primality
 * by trial division. max_prime is defined outside this snippet.
 *
 * Always returns 0. The prime count n is incremented but never read, stored
 * or returned, so the loops have no effect observable outside the function.
 */
int cpu_execute_event()
{
unsigned long long c; /* candidate number being tested */
unsigned long long l; /* trial divisor */
double t; /* sqrt(c): upper bound for trial divisors */
unsigned long long n=0; /* number of primes found; written but never read */
/* So far we're using very simple test prime number tests in 64bit */
for(c=3; c < max_prime; c++)
{
t = sqrt((double)c);
/* Try divisors 2..t; leave the loop early at the first exact divisor. */
for(l = 2; l <= t; l++)
if (c % l == 0)
break;
/* l > t means the divisor loop ran to completion, i.e. no divisor was found. */
if (l > t )
n++;
}
return 0;
}
`n` is never used, so the entire computation can be, and is, optimized away: https://godbolt.org/z/8qbGhe9ba . For some reason, outside of Compiler Explorer, GCC doesn't optimize it away:
;-----------------------------------------------------------------------
; cpu_execute_event -- disassembly of the GCC build (Intel/NASM syntax).
; Register roles as used in this listing:
;   rdi  = max_prime (loaded as 32 bits, zero-extended)
;   rsi  = c, the candidate under test (starts at 3)
;   rcx  = l, the trial divisor
;   xmm0 = scratch for (double)c / (double)l
;   xmm1 = 0.0, merge source for every int->double conversion
;   xmm2 = t = sqrt((double)c); preloaded from ?_12157 for the first pass
;          (value not visible here -- presumably sqrt(3.0), TODO confirm)
;   xmm3 = constant from ?_12158 (value not visible here; candidates whose
;          double value is below it skip the divisor loop -- TODO confirm)
; Out:    eax = 0
; Note:   no instruction here accumulates the prime count n; both exits of
;         the divisor loop (divisor found / l > t) reach ?_01480, so the
;         counter was eliminated while the loops themselves were kept.
;-----------------------------------------------------------------------
ALIGN 16
cpu_execute_event:
mov edi, dword [rel max_prime] ; 0001D250 _ 8B. 3D, 00296C0A(rel)
; max_prime <= 3 (unsigned): the outer loop would not run, return at once.
cmp edi, 3 ; 0001D256 _ 83. FF, 03
jbe ?_01481 ; 0001D259 _ 76, 75
vxorps xmm1, xmm1, xmm1 ; 0001D25B _ C5 F0: 57. C9
mov esi, 3 ; 0001D25F _ BE, 00000003
vmovsd xmm2, qword [rel ?_12157] ; 0001D264 _ C5 FB: 10. 15, 0006BB04(rel)
vmovsd xmm3, qword [rel ?_12158] ; 0001D26C _ C5 FB: 10. 1D, 0006BB04(rel)
vcvtsi2sd xmm0, xmm1, rsi ; 0001D274 _ C4 E1 F3: 2A. C6
; Filling space: 7H
; Filler type: Multi-byte NOP
; db 0FH, 1FH, 80H, 00H, 00H, 00H, 00H
ALIGN 8
; Outer-loop head: xmm0 = (double)c. Skip to the next c when (double)c is
; below the xmm3 threshold, or when c is even (bit 0 clear; this covers the
; l = 2 trial division).
?_01477:vcomisd xmm0, xmm3 ; 0001D280 _ C5 F9: 2F. C3
jc ?_01480 ; 0001D284 _ 72, 2E
test sil, 01H ; 0001D286 _ 40: F6. C6, 01
jz ?_01480 ; 0001D28A _ 74, 28
; l = 2 here; ?_01479 increments it before the first division, so the
; divisions below start at l = 3.
mov ecx, 2 ; 0001D28C _ B9, 00000002
jmp ?_01479 ; 0001D291 _ EB, 12
; Filling space: 5H
; Filler type: Multi-byte NOP
; db 0FH, 1FH, 44H, 00H, 00H
ALIGN 8
; Inner-loop body: rdx = c % l (unsigned 64-bit divide, rdx cleared first).
; A zero remainder means a divisor was found -> go to the next c.
?_01478:mov rax, rsi ; 0001D298 _ 48: 89. F0
xor edx, edx ; 0001D29B _ 31. D2
div rcx ; 0001D29D _ 48: F7. F1
test rdx, rdx ; 0001D2A0 _ 48: 85. D2
jz ?_01480 ; 0001D2A3 _ 74, 0F
; Inner-loop step and test: l++, then keep dividing while t >= (double)l.
; The conversion treats l as unsigned (EVEX-encoded vcvtusi2sd).
?_01479:inc rcx ; 0001D2A5 _ 48: FF. C1
vcvtusi2sd xmm0, xmm1, rcx ; 0001D2A8 _ 62 F1 F7 08: 7B. C1
vcomisd xmm2, xmm0 ; 0001D2AE _ C5 F9: 2F. D0
jnc ?_01478 ; 0001D2B2 _ 73, E4
; Next candidate: c++, stop when c == max_prime, otherwise recompute
; (double)c and t = sqrt((double)c) and loop.
?_01480:inc rsi ; 0001D2B4 _ 48: FF. C6
cmp rsi, rdi ; 0001D2B7 _ 48: 39. FE
jz ?_01481 ; 0001D2BA _ 74, 14
vcvtsi2sd xmm0, xmm1, rsi ; 0001D2BC _ C4 E1 F3: 2A. C6
vsqrtsd xmm2, xmm0, xmm0 ; 0001D2C1 _ C5 FB: 51. D0
jmp ?_01477 ; 0001D2C5 _ EB, B9
; Filling space: 9H
; Filler type: Multi-byte NOP
; db 66H, 0FH, 1FH, 84H, 00H, 00H, 00H, 00H
; db 00H
ALIGN 16
; Common exit: return 0.
?_01481:xor eax, eax ; 0001D2D0 _ 31. C0
ret ; 0001D2D2 _ C3
The results are as follows. Clang:
Prime numbers limit: 10000
Initializing worker threads...
Threads started!
CPU speed:
events per second: 11638783.77
Throughput:
events/s (eps): 11638783.7653
time elapsed: 10.0001s
total number of events: 116388756
Latency (ms):
min: 0.00
avg: 0.00
max: 0.13
95th percentile: 0.00
sum: 2072.88
Threads fairness:
events (avg/stddev): 116388756.0000/0.00
execution time (avg/stddev): 2.0729/0.00
GCC:
Prime numbers limit: 10000
Initializing worker threads...
Threads started!
CPU speed:
events per second: 3999.47
Throughput:
events/s (eps): 3999.4704
time elapsed: 10.0003s
total number of events: 39996
Latency (ms):
min: 0.25
avg: 0.25
max: 1.06
95th percentile: 0.26
sum: 9996.91
Threads fairness:
events (avg/stddev): 39996.0000/0.00
execution time (avg/stddev): 9.9969/0.00
To fix this:
- Add google benchmark dependency
- Convert the file from C to C++
- Add
benchmark::DoNotOptimize(std::move(n));
I have no clue how to use autogen, so I'm not going to do that.
FYI, this part of a larger pull request implements the fix by storing the result in a TLS variable. The problem was noticed because the MSVC compiler also optimized the function away.