nethermind
nethermind copied to clipboard
Optimize Avx512 Keccak
Changes
Performance improvements:
- Code size: 1919 → 994 bytes (48% reduction)
- Stack usage: 1104 → 32 bytes (1072 bytes saved, 10 fewer XMM saves)
- Hot loop: eliminated ~40 constant vector loads per iteration
Optimizations:
- Hoisted constants - moved rotation/permutation vectors to static readonly fields, eliminating redundant vector creations and memory loads in the hot loop
- Improved instruction scheduling - restructured Keccak round function to overlap independent operations:
- Theta: use 3-way TernaryLogic XOR to avoid materializing intermediate results
- Rho+Pi: pipeline permutes immediately after rotates complete
- Pi: rewrite using matrix transpose (unpack/shuffle) instead of 25× PermuteVar8x64x2 calls
- Cache optimization - increased KeccakCache entry size from 96 to 128 bytes (aligned to 2 cache lines), added SSE prefetch hints
- Benchmarking - added `BenchmarkHash` method for AVX-512 vs scalar comparison
Assembly impact:
Before: hot loop with 40+ memory loads for constant vectors, poor instruction interleaving
vmovups zmm18, zmmword ptr [reloc @RWD64]
vmovaps zmm19, zmm18
vpermi2q zmm19, zmm6, zmm7
vmovups zmm20, zmmword ptr [reloc @RWD128] ; repeated 40+ times
After: no constant loads, better scheduling
vpternlogq zmm0, zmm24, zmm25, -106 ; direct use of hoisted constants
vprolvq zmm0, zmm0, zmm19
vpermq zmm1, zmm16, zmm1 ; overlapped execution
Before
G_M000_IG04: ;; offset=0x013F
vmovaps zmm18, zmm6
vpternlogq zmm18, zmm7, zmm8, -106
vpternlogq zmm18, zmm10, zmm9, -106
vpermq zmm19, zmm2, zmm18
vpermq zmm18, zmm0, zmm18
vprolq zmm18, zmm18, 1
vpxorq zmm18, zmm18, zmm19
vpxorq zmm6, zmm6, zmm18
vpxorq zmm7, zmm7, zmm18
vpxorq zmm8, zmm8, zmm18
vpxorq zmm9, zmm9, zmm18
vpxorq zmm10, zmm10, zmm18
vprolvq zmm6, zmm6, zmm3
vprolvq zmm7, zmm7, zmm4
vprolvq zmm8, zmm8, zmm5
vprolvq zmm9, zmm9, zmm16
vprolvq zmm10, zmm10, zmm17
vmovups zmm18, zmmword ptr [reloc @RWD64]
vmovaps zmm19, zmm18
vpermi2q zmm19, zmm6, zmm7
vmovups zmm20, zmmword ptr [reloc @RWD128]
vpermt2q zmm19, zmm20, zmm8
vmovups zmm20, zmmword ptr [reloc @RWD192]
vpermt2q zmm19, zmm20, zmm9
vmovups zmm20, zmmword ptr [reloc @RWD256]
vpermt2q zmm19, zmm20, zmm10
vmovups zmm20, zmmword ptr [reloc @RWD320]
vmovaps zmm21, zmm6
vpermt2q zmm21, zmm20, zmm7
vmovups zmm20, zmmword ptr [reloc @RWD384]
vpermt2q zmm21, zmm20, zmm8
vmovups zmm20, zmmword ptr [reloc @RWD448]
vpermt2q zmm21, zmm20, zmm9
vmovups zmm20, zmmword ptr [reloc @RWD512]
vpermt2q zmm21, zmm20, zmm10
vmovups zmm20, zmmword ptr [reloc @RWD576]
vmovaps zmm22, zmm6
vpermt2q zmm22, zmm20, zmm7
vmovups zmm20, zmmword ptr [reloc @RWD640]
vpermt2q zmm22, zmm20, zmm8
vmovups zmm20, zmmword ptr [reloc @RWD704]
vpermt2q zmm22, zmm20, zmm9
vmovups zmm20, zmmword ptr [reloc @RWD768]
vpermt2q zmm22, zmm20, zmm10
vmovups zmm20, zmmword ptr [reloc @RWD832]
vmovaps zmm23, zmm6
vpermt2q zmm23, zmm20, zmm7
vmovups zmm20, zmmword ptr [reloc @RWD896]
vpermt2q zmm23, zmm20, zmm8
vmovups zmm20, zmmword ptr [reloc @RWD960]
vpermt2q zmm23, zmm20, zmm9
vmovups zmm20, zmmword ptr [reloc @RWD1024]
vpermt2q zmm23, zmm20, zmm10
vmovups zmm20, zmmword ptr [reloc @RWD1088]
vpermt2q zmm6, zmm20, zmm7
vmovups zmm20, zmmword ptr [reloc @RWD1152]
vpermt2q zmm6, zmm20, zmm8
vmovups zmm20, zmmword ptr [reloc @RWD1216]
vpermt2q zmm6, zmm20, zmm9
vmovups zmm20, zmmword ptr [reloc @RWD1280]
vpermt2q zmm10, zmm20, zmm6
After
G_M000_IG04: ;; offset=0x00DA
vmovaps zmm24, zmm0
vpternlogq zmm24, zmm1, zmm2, -106
vpternlogq zmm24, zmm4, zmm3, -106
vpermq zmm25, zmm16, zmm24
vpermq zmm24, zmm5, zmm24
vprolq zmm25, zmm25, 1
vpternlogq zmm0, zmm24, zmm25, -106
vpternlogq zmm1, zmm24, zmm25, -106
vpternlogq zmm2, zmm24, zmm25, -106
vpternlogq zmm3, zmm24, zmm25, -106
vpternlogq zmm4, zmm24, zmm25, -106
vprolvq zmm0, zmm0, zmm19
vprolvq zmm1, zmm1, zmm20
vpermq zmm1, zmm16, zmm1
vprolvq zmm2, zmm2, zmm21
vpermq zmm2, zmm17, zmm2
vprolvq zmm3, zmm3, zmm22
vpermq zmm3, zmm18, zmm3
vprolvq zmm4, zmm4, zmm23
vpunpcklqdq zmm24, zmm0, zmm1
vpunpcklqdq zmm25, zmm2, zmm3
vpunpckhqdq zmm0, zmm0, zmm1
vpunpckhqdq zmm1, zmm2, zmm3
vpermq zmm4, zmm5, zmm4
vpunpcklqdq zmm2, zmm4, zmm4
vpunpckhqdq zmm3, zmm4, zmm4
vshufi64x2 zmm4, zmm24, zmm25, 68
vshufi64x2 zmm0, zmm0, zmm1, 68
vshufi64x2 zmm1, zmm4, zmm2, -120
vshufi64x2 zmm26, zmm0, zmm3, -35
vshufi64x2 zmm0, zmm0, zmm3, -120
vshufi64x2 zmm3, zmm24, zmm25, -18
vshufi64x2 zmm3, zmm3, zmm2, -88
vpermq zmm24, zmm16, zmm1
vpermq zmm25, zmm17, zmm1
vpermq zmm27, zmm16, zmm26
vpermq zmm28, zmm17, zmm26
vpternlogq zmm1, zmm24, zmm25, -46
vpermq zmm24, zmm16, zmm0
vpermq zmm25, zmm17, zmm0
vpermq zmm29, zmm16, zmm3
vpermq zmm30, zmm17, zmm3
vpternlogq zmm26, zmm27, zmm28, -46
vpternlogq zmm0, zmm24, zmm25, -46
vshufi64x2 zmm4, zmm4, zmm2, -35
vpermq zmm2, zmm16, zmm4
vpermq zmm24, zmm17, zmm4
vpternlogq zmm3, zmm29, zmm30, -46
vpternlogq zmm4, zmm2, zmm24, -46
vmovq xmm2, qword ptr [rax]
vpxord zmm1, zmm2, zmm1
add rax, 8
vmovaps zmm2, zmm1
- 1072 bytes less stack used (1104 → 32); 10 fewer 16-byte XMM register saves (xmm6–xmm15) in the prologue
Before
; Method Nethermind.Core.Crypto.KeccakHash:KeccakF1600Avx512F(System.Span`1[ulong]) (FullOpts)
G_M000_IG01: ;; offset=0x0000
push rbx
sub rsp, 1104
vmovaps xmmword ptr [rsp+0x440], xmm6
vmovaps xmmword ptr [rsp+0x430], xmm7
vmovaps xmmword ptr [rsp+0x420], xmm8
vmovaps xmmword ptr [rsp+0x410], xmm9
vmovaps xmmword ptr [rsp+0x400], xmm10
vmovaps xmmword ptr [rsp+0x3F0], xmm11
vmovaps xmmword ptr [rsp+0x3E0], xmm12
vmovaps xmmword ptr [rsp+0x3D0], xmm13
vmovaps xmmword ptr [rsp+0x3C0], xmm14
vmovaps xmmword ptr [rsp+0x3B0], xmm15
G_M000_IG02: ;; offset=0x0062
After
; Method Nethermind.Core.Crypto.KeccakHash:KeccakF1600Avx512F(System.Span`1[ulong]) (FullOpts)
G_M000_IG01: ;; offset=0x0000
push rbx
sub rsp, 32
G_M000_IG02: ;; offset=0x0005
- Reduced code size and constant loads
Before
G_M000_IG11: ;; offset=0x0779
call CORINFO_HELP_RNGCHKFAIL
int3
RWD00 dq FFFFFFFFFFFFFFFFh, FFFFFFFFFFFFFFFFh, FFFFFFFFFFFFFFFFh, FFFFFFFFFFFFFFFFh, FFFFFFFFFFFFFFFFh, 0000000000000000h, 0000000000000000h, 0000000000000000h
RWD64 dq 0000000000000001h, 0000000000000002h, 0000000000000003h, 0000000000000004h, 0000000000000000h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD128 dq 0000000000000002h, 0000000000000003h, 0000000000000004h, 0000000000000000h, 0000000000000001h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD192 dq 0000000000000004h, 0000000000000000h, 0000000000000001h, 0000000000000002h, 0000000000000003h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD256 dq 0000000000000000h, 0000000000000001h, 000000000000003Eh, 000000000000001Ch, 000000000000001Bh, 0000000000000000h, 0000000000000000h, 0000000000000000h
RWD320 dq 0000000000000024h, 000000000000002Ch, 0000000000000006h, 0000000000000037h, 0000000000000014h, 0000000000000000h, 0000000000000000h, 0000000000000000h
RWD384 dq 0000000000000003h, 000000000000000Ah, 000000000000002Bh, 0000000000000019h, 0000000000000027h, 0000000000000000h, 0000000000000000h, 0000000000000000h
RWD448 dq 0000000000000029h, 000000000000002Dh, 000000000000000Fh, 0000000000000015h, 0000000000000008h, 0000000000000000h, 0000000000000000h, 0000000000000000h
RWD512 dq 0000000000000012h, 0000000000000002h, 000000000000003Dh, 0000000000000038h, 000000000000000Eh, 0000000000000000h, 0000000000000000h, 0000000000000000h
RWD576 dq 0000000000000000h, 0000000000000009h, 0000000000000002h, 0000000000000003h, 0000000000000004h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD640 dq 0000000000000000h, 0000000000000001h, 000000000000000Ah, 0000000000000003h, 0000000000000004h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD704 dq 0000000000000000h, 0000000000000001h, 0000000000000002h, 000000000000000Bh, 0000000000000004h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD768 dq 0000000000000000h, 0000000000000001h, 0000000000000002h, 0000000000000003h, 000000000000000Ch, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD832 dq 0000000000000003h, 000000000000000Ch, 0000000000000002h, 0000000000000003h, 0000000000000004h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD896 dq 0000000000000000h, 0000000000000001h, 0000000000000008h, 0000000000000003h, 0000000000000004h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD960 dq 0000000000000000h, 0000000000000001h, 0000000000000002h, 0000000000000009h, 0000000000000004h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD1024 dq 0000000000000000h, 0000000000000001h, 0000000000000002h, 0000000000000003h, 000000000000000Ah, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD1088 dq 0000000000000001h, 000000000000000Ah, 0000000000000002h, 0000000000000003h, 0000000000000004h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD1152 dq 0000000000000000h, 0000000000000001h, 000000000000000Bh, 0000000000000003h, 0000000000000004h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD1216 dq 0000000000000000h, 0000000000000001h, 0000000000000002h, 000000000000000Ch, 0000000000000004h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD1280 dq 0000000000000000h, 0000000000000001h, 0000000000000002h, 0000000000000003h, 0000000000000008h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD1344 dq 0000000000000004h, 0000000000000008h, 0000000000000002h, 0000000000000003h, 0000000000000004h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD1408 dq 0000000000000000h, 0000000000000001h, 0000000000000009h, 0000000000000003h, 0000000000000004h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD1472 dq 0000000000000000h, 0000000000000001h, 0000000000000002h, 000000000000000Ah, 0000000000000004h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD1536 dq 0000000000000000h, 0000000000000001h, 0000000000000002h, 0000000000000003h, 000000000000000Bh, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD1600 dq 0000000000000002h, 000000000000000Bh, 0000000000000002h, 0000000000000003h, 0000000000000004h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD1664 dq 0000000000000000h, 0000000000000001h, 000000000000000Ch, 0000000000000003h, 0000000000000004h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD1728 dq 0000000000000000h, 0000000000000001h, 0000000000000002h, 0000000000000008h, 0000000000000004h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD1792 dq 0000000000000000h, 0000000000000001h, 0000000000000002h, 0000000000000003h, 0000000000000009h, 0000000000000005h, 0000000000000006h, 0000000000000007h
; Total bytes of code: 1919
After
G_M000_IG09: ;; offset=0x03CE
mov rcx, 0x7FFEEC264120
call CORINFO_HELP_GET_GCSTATIC_BASE
jmp G_M000_IG03
RWD00 dq 0000000000000004h, 0000000000000000h, 0000000000000001h, 0000000000000002h, 0000000000000003h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD64 dq 0000000000000001h, 0000000000000002h, 0000000000000003h, 0000000000000004h, 0000000000000000h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD128 dq 0000000000000002h, 0000000000000003h, 0000000000000004h, 0000000000000000h, 0000000000000001h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD192 dq 0000000000000003h, 0000000000000004h, 0000000000000000h, 0000000000000001h, 0000000000000002h, 0000000000000005h, 0000000000000006h, 0000000000000007h
RWD256 dq 0000000000000000h, 0000000000000001h, 000000000000003Eh, 000000000000001Ch, 000000000000001Bh, 0000000000000000h, 0000000000000000h, 0000000000000000h
RWD320 dq 0000000000000024h, 000000000000002Ch, 0000000000000006h, 0000000000000037h, 0000000000000014h, 0000000000000000h, 0000000000000000h, 0000000000000000h
RWD384 dq 0000000000000003h, 000000000000000Ah, 000000000000002Bh, 0000000000000019h, 0000000000000027h, 0000000000000000h, 0000000000000000h, 0000000000000000h
RWD448 dq 0000000000000029h, 000000000000002Dh, 000000000000000Fh, 0000000000000015h, 0000000000000008h, 0000000000000000h, 0000000000000000h, 0000000000000000h
RWD512 dq 0000000000000012h, 0000000000000002h, 000000000000003Dh, 0000000000000038h, 000000000000000Eh, 0000000000000000h, 0000000000000000h, 0000000000000000h
; Total bytes of code: 994
- List the changes
Types of changes
What types of changes does your code introduce?
- [x] Optimization
Testing
Requires testing
- [x] No
@copilot summarize the changes in this PR
@benaadams I've opened a new pull request, #9929, to work on those changes. Once the pull request is ready, I'll request review from you.