exanic-software
exanic-software copied to clipboard
eliminate high-latency IDIV CPU instruction
Simplifies disassembly as follows. The IDIV is 56 uops
exanic_cycles_to_ns:
imul rax, rsi, 1000000000
mov ecx, DWORD PTR 104[rdi]
xor edx, edx
div rcx
ret
exanic_cycles_to_ns_orig:
mov ecx, DWORD PTR 104[rdi]
mov rax, rsi
xor edx, edx
div rcx
mov rsi, rax
imul rax, rdx, 1000000000
xor edx, edx
imul rsi, rsi, 1000000000
div rcx
add rax, rsi
ret
return cycles * NANOS_PER_SEC / exanic->tick_hz;
I don't think this will work, cycles * NANOS_PER_SEC
will overflow. I agree that there might be some optimisations possible though... the traditional way used in the Linux kernel, etc., would be to store a multiply factor and shift count rather than tick_hz which must be divided by.
You're right. Can we hard-code tick_hz as 161132800? Are there much variations in it?
exanic_cycles_to_timespec:
movabs rdi, -3081281060773882715
mov rax, rsi
mov rcx, rdx
mul rdi
shr rdx, 27
mov QWORD PTR [rcx], rdx
imul rdx, rdx, 161132800
sub rsi, rdx
imul rdx, rsi, 1000000000
mov rax, rdx
mul rdi
shr rdx, 27
mov QWORD PTR 8[rcx], rdx
ret
It does vary, e.g.:
ExaNIC X2/X4/X10/X40/FDK 1.x: 161Mhz
ExaNIC V5P/FDK 2.x/future cards: 322Mhz
ExaNIC GM: 250Mhz
ExaNIC HPT: 4Ghz
There may be a way to optimise the arithmetic a bit, noting that both a divisor and remainder can be calculated in one div instruction, and signed idiv is not really necessary. However as noted before I think the most efficient way would probably be to pre-calculate a multiply/shift value for the conversion... see clocks_calc_mult_shift
in kernel/time/clocksource.c
in the Linux kernel.
#include <utility>
#include <cstdint>
#include <cmath>
using u32 = uint32_t;
using u64 = uint64_t;
inline constexpr std::pair<u32, u32> clocks_calc_mult_shift(u32 from, u32 to, u32 maxsec)
{
u64 tmp = 0;
u32 sft = 0, sftacc = 32;
/*
* Calculate the shift factor which is limiting the conversion
* range:
*/
tmp = ((u64)maxsec * from) >> 32;
while (tmp) {
tmp >>=1;
sftacc--;
}
/*
* Find the conversion shift/mult pair which has the best
* accuracy and fits the maxsec conversion range:
*/
for (sft = 32; sft > 0; sft--) {
tmp = (u64) to << sft;
tmp += from / 2;
tmp /= from;
if ((tmp >> sftacc) == 0)
break;
}
return { tmp, sft };
}
inline constexpr u64 NANOS_PER_SEC = 1000000000ull;
inline constexpr u64 EXA_X40_CLK_HZ = 161132800ull;
inline constexpr u64 NOW_EPOCH = 1546303354ull;
inline constexpr u64 NOW_FRACT = 723406389ull;
inline constexpr u64 NOW_NANOS = NOW_EPOCH * NANOS_PER_SEC + NOW_FRACT;
inline constexpr u64 NOW_EXA_CLK = NOW_EPOCH * EXA_X40_CLK_HZ + (NOW_FRACT * EXA_X40_CLK_HZ / NANOS_PER_SEC);
u64 precision()
{
constexpr auto mult_shift = clocks_calc_mult_shift(EXA_X40_CLK_HZ, NANOS_PER_SEC, NOW_EPOCH);
return 1e9 * (double)mult_shift.first / std::pow(2, mult_shift.second);
}
u64 true_precision()
{
return 1e9 * ((double)NANOS_PER_SEC / (double)EXA_X40_CLK_HZ);
}
u64 precision_one()
{
constexpr auto mult_shift = clocks_calc_mult_shift(EXA_X40_CLK_HZ, NANOS_PER_SEC, 1u);
return 1e9 * (double)mult_shift.first / std::pow(2, mult_shift.second);
}
u64 to_nanos(u64 cycles, u64 hz)
{
// select larger mult and shift to deal with only one second intervals
constexpr auto mult_shift = clocks_calc_mult_shift(EXA_X40_CLK_HZ, NANOS_PER_SEC, 1);
const u64 seconds = cycles / hz;
const u64 fract = cycles % hz;
return seconds * NANOS_PER_SEC + ((fract * mult_shift.first) >> mult_shift.second);
}
int64_t precision_ns()
{
return NOW_NANOS - to_nanos(NOW_EXA_CLK, EXA_X40_CLK_HZ);
}
precision():
movabs rax, 6250000000
ret
true_precision():
movabs rax, 6206061087
ret
precision_one():
movabs rax, 6206061087
ret
to_nanos(unsigned long, unsigned long):
mov rax, rdi
xor edx, edx
mov edi, 3331853676
div rsi
imul rdx, rdi
imul rax, rax, 1000000000
shr rdx, 29
add rax, rdx
ret
precision_ns():
mov eax, 7
ret
If we just try to do it in one mul + shr for entire cycles counter we'd effectively multiply it by 6.25 instead of true ratio of ~6.206061087. Which would obviously cause huge loss of precision. The mult selected would be 50 and shift would be only 3. This is because we need to set maxsec to pretty much Unix Epoch.
We can convert seconds separately (see to_nanos) and then fraction via mult and shift (which would be selected as 29) but it'd also cause div instruction and pretty much take us to square one. Actually would be super cool to have something like seconds since boot in the cycles register instead of full Epoch.
As far as I can tell the best method could be to have all the different TICK_HZ versions as separate functions where the compiler could figure better integer math for us and do a switch. Or we'd need some more sophisticated way to have mult, shr, etc i.e. pretty much do that GCC does once it knows the constant divisor. Makes sense?
If the divisors are fixed, or at least computed at the start of the process, we could calculate the constants for the Granlund-Montgomery division algorithm, which is just a couple cycles.