bc-csharp
bc-csharp copied to clipboard
Optimize AesEngine_X86
This produces very tight asm, for example for the 256-bit encode path:
; Method Encode256:ProcessRounds(byref):this
;
; JIT disassembly (Intel syntax) of the proposed round function. It transforms
; one 16-byte block in place: the block is loaded from [rdx], XORed with the
; first 16-byte key element, run through 13 vaesenc steps and one vaesenclast
; step, and written back to [rdx].
;
; Register roles as used below:
;   rcx  = this on entry (then reused for the element count)
;   rdx  = byref to the 16-byte block (read at the start, written at the end)
;   rax  = object reference loaded from this+10H; its 16-byte elements start
;          at +10H (presumably the round-key array -- confirm against the C#)
;   xmm0 = working state
; NOTE(review): the calling convention is not stated in the listing; the
; rcx/rdx argument registers suggest Windows x64 -- confirm.
G_M000_IG01: ;; offset=0000H
sub rsp, 40
vzeroupper
G_M000_IG02: ;; offset=0007H
; Load the object reference at this+10H and the dword at +08H of that object,
; which is used as its element count.
mov rax, gword ptr [rcx+10H]
mov ecx, dword ptr [rax+08H]
; Single range check for the whole method: the count must be greater than 14
; (unsigned) so that elements 0..14 (offsets 10H..F0H) are all valid.
cmp ecx, 14
jbe G_M000_IG04
; State = block XOR element 0.
vmovupd xmm0, xmmword ptr [rdx]
vpxor xmm0, xmm0, xmmword ptr [rax+10H]
; 13 middle rounds with elements 1..13, with no per-round checks.
vaesenc xmm0, xmm0, xmmword ptr [rax+20H]
vaesenc xmm0, xmm0, xmmword ptr [rax+30H]
vaesenc xmm0, xmm0, xmmword ptr [rax+40H]
vaesenc xmm0, xmm0, xmmword ptr [rax+50H]
vaesenc xmm0, xmm0, xmmword ptr [rax+60H]
vaesenc xmm0, xmm0, xmmword ptr [rax+70H]
vaesenc xmm0, xmm0, xmmword ptr [rax+80H]
vaesenc xmm0, xmm0, xmmword ptr [rax+90H]
vaesenc xmm0, xmm0, xmmword ptr [rax+A0H]
vaesenc xmm0, xmm0, xmmword ptr [rax+B0H]
vaesenc xmm0, xmm0, xmmword ptr [rax+C0H]
vaesenc xmm0, xmm0, xmmword ptr [rax+D0H]
vaesenc xmm0, xmm0, xmmword ptr [rax+E0H]
; Final round with element 14, then store the result over the input block.
vaesenclast xmm0, xmm0, xmmword ptr [rax+F0H]
vmovupd xmmword ptr [rdx], xmm0
G_M000_IG03: ;; offset=0090H
add rsp, 32+8
ret
; Cold path: taken only when the range check above fails.
G_M000_IG04: ;; offset=0095H
call CORINFO_HELP_RNGCHKFAIL
int3
; Total bytes of code: 155
versus the current implementation:
; Assembly listing for method AesEngine_X86:ProcessBlock(<unnamed>,int,<unnamed>,int):int:this
; 2 inlinees with PGO data; 22 single block inlinees; 7 inlinees without PGO data
;
; Entry section of the current implementation: prologue, argument range
; checks, load of the 16-byte input block, and a 6-way switch dispatch.
; Each line is prefixed with the encoded machine-code bytes.
;
; Register roles as used below:
;   rcx  = this
;   rdx  = first object argument; the dword at +08H is used as its length and
;          data is addressed from +10H (presumably the input byte array)
;   r8d  = first int argument, used as an offset into rdx's data
;   r9   = second object argument (its length at +08H is kept in r11d)
;   eax  = last int argument, loaded from the stack
;   xmm0 = the 16-byte block being processed
; NOTE(review): the register assignment suggests the Windows x64 calling
; convention -- confirm.
G_M000_IG01: ;; offset=0000H
56 push rsi
4883EC20 sub rsp, 32
C5F877 vzeroupper
; Stack-passed argument: 50H = 8 (push rsi) + 32 (sub rsp) + 8 (return
; address) + 32, i.e. the slot just past four 8-byte slots above the return
; address.
8B442450 mov eax, dword ptr [rsp+50H]
G_M000_IG02: ;; offset=000CH
; Check 1 (signed): if (length of rdx - 16) < r8d, go to the throw path IG13.
448B5208 mov r10d, dword ptr [rdx+08H]
458D5AF0 lea r11d, [r10-10H]
453BD8 cmp r11d, r8d
0F8CB4050000 jl G_M000_IG13
; Check 2 (signed): if (length of r9 - 16) < eax, go to the throw path IG14.
; r11d keeps r9's length for the store check at G_M000_IG11.
458B5908 mov r11d, dword ptr [r9+08H]
418D73F0 lea esi, [r11-10H]
3BF0 cmp esi, eax
0F8CE7050000 jl G_M000_IG14
; Check 3 (unsigned, 64-bit): zero-extend length and offset, and require
; length >= offset + 16; otherwise go to IG16.
458BD2 mov r10d, r10d
418BF0 mov esi, r8d
4883C610 add rsi, 16
4C3BD6 cmp r10, rsi
0F8251060000 jb G_M000_IG16
; rdx = address of the data at the given offset (data starts at +10H);
; load the 16-byte block into xmm0.
458BC0 mov r8d, r8d
4A8D540210 lea rdx, bword ptr [rdx+r8+10H]
C5F91002 vmovupd xmm0, xmmword ptr [rdx]
; Switch selector: the dword at this+08H. Values above 5 (unsigned) go to IG15.
8B5108 mov edx, dword ptr [rcx+08H]
83FA05 cmp edx, 5
0F87FE050000 ja G_M000_IG15
; Jump-table dispatch: read the 32-bit entry RWD00[selector], add the address
; of G_M000_IG02, and jump there.
8BD2 mov edx, edx
4C8D053F060000 lea r8, [reloc @RWD00]
458B0490 mov r8d, dword ptr [r8+4*rdx]
4C8D15A0FFFFFF lea r10, G_M000_IG02
4D03C2 add r8, r10
41FFE0 jmp r8
; Switch case 0 (first RWD00 entry): 10-round vaesdec sequence.
; xmm0 is XORed with element 0, then 9 vaesdec steps use elements 1..9 and
; vaesdeclast uses element 10 (offsets 10H..B0H, 16 bytes apart).
; Every element access is preceded by its own check of the index against the
; count in edx; any failure branches to G_M000_IG17.
; NOTE(review): 10 rounds matches AES-128 decryption -- confirm against the C#.
G_M000_IG03: ;; offset=0072H
; rcx = object reference at this+0 (presumably the round-key array);
; edx = its element count.
488B09 mov rcx, gword ptr [rcx]
8B5108 mov edx, dword ptr [rcx+08H]
; Index 0 check, then the initial XOR.
85D2 test edx, edx
0F8417060000 je G_M000_IG17
C5F9EF4110 vpxor xmm0, xmm0, xmmword ptr [rcx+10H]
83FA01 cmp edx, 1
0F8608060000 jbe G_M000_IG17
C4E279DE4120 vaesdec xmm0, xmm0, xmmword ptr [rcx+20H]
83FA02 cmp edx, 2
0F86F9050000 jbe G_M000_IG17
C4E279DE4130 vaesdec xmm0, xmm0, xmmword ptr [rcx+30H]
83FA03 cmp edx, 3
0F86EA050000 jbe G_M000_IG17
C4E279DE4140 vaesdec xmm0, xmm0, xmmword ptr [rcx+40H]
83FA04 cmp edx, 4
0F86DB050000 jbe G_M000_IG17
C4E279DE4150 vaesdec xmm0, xmm0, xmmword ptr [rcx+50H]
83FA05 cmp edx, 5
0F86CC050000 jbe G_M000_IG17
C4E279DE4160 vaesdec xmm0, xmm0, xmmword ptr [rcx+60H]
83FA06 cmp edx, 6
0F86BD050000 jbe G_M000_IG17
C4E279DE4170 vaesdec xmm0, xmm0, xmmword ptr [rcx+70H]
83FA07 cmp edx, 7
0F86AE050000 jbe G_M000_IG17
C4E279DE8180000000 vaesdec xmm0, xmm0, xmmword ptr [rcx+80H]
83FA08 cmp edx, 8
0F869C050000 jbe G_M000_IG17
C4E279DE8190000000 vaesdec xmm0, xmm0, xmmword ptr [rcx+90H]
83FA09 cmp edx, 9
0F868A050000 jbe G_M000_IG17
C4E279DE81A0000000 vaesdec xmm0, xmm0, xmmword ptr [rcx+A0H]
83FA0A cmp edx, 10
0F8678050000 jbe G_M000_IG17
; Final round, then jump to the common store/return code.
C4E279DF81B0000000 vaesdeclast xmm0, xmm0, xmmword ptr [rcx+B0H]
E97A040000 jmp G_M000_IG11
; Switch case 1 (second RWD00 entry): 12-round vaesdec sequence.
; xmm0 is XORed with element 0, then 11 vaesdec steps use elements 1..11 and
; vaesdeclast uses element 12 (offsets 10H..D0H, 16 bytes apart).
; Every element access is preceded by its own check of the index against the
; count in edx; any failure branches to G_M000_IG17.
; NOTE(review): 12 rounds matches AES-192 decryption -- confirm against the C#.
G_M000_IG04: ;; offset=012CH
; rcx = object reference at this+0 (presumably the round-key array);
; edx = its element count.
488B09 mov rcx, gword ptr [rcx]
8B5108 mov edx, dword ptr [rcx+08H]
; Index 0 check, then the initial XOR.
85D2 test edx, edx
0F845C050000 je G_M000_IG17
C5F9EF4110 vpxor xmm0, xmm0, xmmword ptr [rcx+10H]
83FA01 cmp edx, 1
0F864D050000 jbe G_M000_IG17
C4E279DE4120 vaesdec xmm0, xmm0, xmmword ptr [rcx+20H]
83FA02 cmp edx, 2
0F863E050000 jbe G_M000_IG17
C4E279DE4130 vaesdec xmm0, xmm0, xmmword ptr [rcx+30H]
83FA03 cmp edx, 3
0F862F050000 jbe G_M000_IG17
C4E279DE4140 vaesdec xmm0, xmm0, xmmword ptr [rcx+40H]
83FA04 cmp edx, 4
0F8620050000 jbe G_M000_IG17
C4E279DE4150 vaesdec xmm0, xmm0, xmmword ptr [rcx+50H]
83FA05 cmp edx, 5
0F8611050000 jbe G_M000_IG17
C4E279DE4160 vaesdec xmm0, xmm0, xmmword ptr [rcx+60H]
83FA06 cmp edx, 6
0F8602050000 jbe G_M000_IG17
C4E279DE4170 vaesdec xmm0, xmm0, xmmword ptr [rcx+70H]
83FA07 cmp edx, 7
0F86F3040000 jbe G_M000_IG17
C4E279DE8180000000 vaesdec xmm0, xmm0, xmmword ptr [rcx+80H]
83FA08 cmp edx, 8
0F86E1040000 jbe G_M000_IG17
C4E279DE8190000000 vaesdec xmm0, xmm0, xmmword ptr [rcx+90H]
83FA09 cmp edx, 9
0F86CF040000 jbe G_M000_IG17
C4E279DE81A0000000 vaesdec xmm0, xmm0, xmmword ptr [rcx+A0H]
83FA0A cmp edx, 10
0F86BD040000 jbe G_M000_IG17
C4E279DE81B0000000 vaesdec xmm0, xmm0, xmmword ptr [rcx+B0H]
83FA0B cmp edx, 11
0F86AB040000 jbe G_M000_IG17
C4E279DE81C0000000 vaesdec xmm0, xmm0, xmmword ptr [rcx+C0H]
83FA0C cmp edx, 12
0F8699040000 jbe G_M000_IG17
; Final round, then jump to the common store/return code.
C4E279DF81D0000000 vaesdeclast xmm0, xmm0, xmmword ptr [rcx+D0H]
E99B030000 jmp G_M000_IG11
; Switch case 2 (third RWD00 entry): 14-round vaesdec sequence.
; xmm0 is XORed with element 0, then 13 vaesdec steps use elements 1..13 and
; vaesdeclast uses element 14 (offsets 10H..F0H, 16 bytes apart).
; Every element access is preceded by its own check of the index against the
; count in edx; any failure branches to G_M000_IG17.
; NOTE(review): 14 rounds matches AES-256 decryption -- confirm against the C#.
G_M000_IG05: ;; offset=020AH
; rcx = object reference at this+0 (presumably the round-key array);
; edx = its element count.
488B09 mov rcx, gword ptr [rcx]
8B5108 mov edx, dword ptr [rcx+08H]
; Index 0 check, then the initial XOR.
85D2 test edx, edx
0F847D040000 je G_M000_IG17
C5F9EF4110 vpxor xmm0, xmm0, xmmword ptr [rcx+10H]
83FA01 cmp edx, 1
0F866E040000 jbe G_M000_IG17
C4E279DE4120 vaesdec xmm0, xmm0, xmmword ptr [rcx+20H]
83FA02 cmp edx, 2
0F865F040000 jbe G_M000_IG17
C4E279DE4130 vaesdec xmm0, xmm0, xmmword ptr [rcx+30H]
83FA03 cmp edx, 3
0F8650040000 jbe G_M000_IG17
C4E279DE4140 vaesdec xmm0, xmm0, xmmword ptr [rcx+40H]
83FA04 cmp edx, 4
0F8641040000 jbe G_M000_IG17
C4E279DE4150 vaesdec xmm0, xmm0, xmmword ptr [rcx+50H]
83FA05 cmp edx, 5
0F8632040000 jbe G_M000_IG17
C4E279DE4160 vaesdec xmm0, xmm0, xmmword ptr [rcx+60H]
83FA06 cmp edx, 6
0F8623040000 jbe G_M000_IG17
C4E279DE4170 vaesdec xmm0, xmm0, xmmword ptr [rcx+70H]
83FA07 cmp edx, 7
0F8614040000 jbe G_M000_IG17
C4E279DE8180000000 vaesdec xmm0, xmm0, xmmword ptr [rcx+80H]
83FA08 cmp edx, 8
0F8602040000 jbe G_M000_IG17
C4E279DE8190000000 vaesdec xmm0, xmm0, xmmword ptr [rcx+90H]
83FA09 cmp edx, 9
0F86F0030000 jbe G_M000_IG17
C4E279DE81A0000000 vaesdec xmm0, xmm0, xmmword ptr [rcx+A0H]
83FA0A cmp edx, 10
0F86DE030000 jbe G_M000_IG17
C4E279DE81B0000000 vaesdec xmm0, xmm0, xmmword ptr [rcx+B0H]
83FA0B cmp edx, 11
0F86CC030000 jbe G_M000_IG17
C4E279DE81C0000000 vaesdec xmm0, xmm0, xmmword ptr [rcx+C0H]
83FA0C cmp edx, 12
0F86BA030000 jbe G_M000_IG17
C4E279DE81D0000000 vaesdec xmm0, xmm0, xmmword ptr [rcx+D0H]
83FA0D cmp edx, 13
0F86A8030000 jbe G_M000_IG17
C4E279DE81E0000000 vaesdec xmm0, xmm0, xmmword ptr [rcx+E0H]
83FA0E cmp edx, 14
; G_M000_IG06 is an instruction-group boundary placed between the compare and
; its branch; no jump in this listing targets it, so execution simply falls
; through with the flags from the cmp above.
G_M000_IG06: ;; offset=02F8H
0F8696030000 jbe G_M000_IG17
; Final round, then jump to the common store/return code.
C4E279DF81F0000000 vaesdeclast xmm0, xmm0, xmmword ptr [rcx+F0H]
E998020000 jmp G_M000_IG11
; Switch case 3 (fourth RWD00 entry): 10-round vaesenc sequence.
; xmm0 is XORed with element 0, then 9 vaesenc steps use elements 1..9 and
; vaesenclast uses element 10 (offsets 10H..B0H, 16 bytes apart).
; Every element access is preceded by its own check of the index against the
; count in edx; any failure branches to G_M000_IG17.
; NOTE(review): 10 rounds matches AES-128 encryption -- confirm against the C#.
G_M000_IG07: ;; offset=030CH
; rcx = object reference at this+0 (presumably the round-key array);
; edx = its element count.
488B09 mov rcx, gword ptr [rcx]
8B5108 mov edx, dword ptr [rcx+08H]
; Index 0 check, then the initial XOR.
85D2 test edx, edx
0F847A030000 je G_M000_IG17
C5F9EF4110 vpxor xmm0, xmm0, xmmword ptr [rcx+10H]
83FA01 cmp edx, 1
0F866B030000 jbe G_M000_IG17
C4E279DC4120 vaesenc xmm0, xmm0, xmmword ptr [rcx+20H]
83FA02 cmp edx, 2
0F865C030000 jbe G_M000_IG17
C4E279DC4130 vaesenc xmm0, xmm0, xmmword ptr [rcx+30H]
83FA03 cmp edx, 3
0F864D030000 jbe G_M000_IG17
C4E279DC4140 vaesenc xmm0, xmm0, xmmword ptr [rcx+40H]
83FA04 cmp edx, 4
0F863E030000 jbe G_M000_IG17
C4E279DC4150 vaesenc xmm0, xmm0, xmmword ptr [rcx+50H]
83FA05 cmp edx, 5
0F862F030000 jbe G_M000_IG17
C4E279DC4160 vaesenc xmm0, xmm0, xmmword ptr [rcx+60H]
83FA06 cmp edx, 6
0F8620030000 jbe G_M000_IG17
C4E279DC4170 vaesenc xmm0, xmm0, xmmword ptr [rcx+70H]
83FA07 cmp edx, 7
0F8611030000 jbe G_M000_IG17
C4E279DC8180000000 vaesenc xmm0, xmm0, xmmword ptr [rcx+80H]
83FA08 cmp edx, 8
0F86FF020000 jbe G_M000_IG17
C4E279DC8190000000 vaesenc xmm0, xmm0, xmmword ptr [rcx+90H]
83FA09 cmp edx, 9
0F86ED020000 jbe G_M000_IG17
C4E279DC81A0000000 vaesenc xmm0, xmm0, xmmword ptr [rcx+A0H]
83FA0A cmp edx, 10
0F86DB020000 jbe G_M000_IG17
; Final round, then jump to the common store/return code.
C4E279DD81B0000000 vaesenclast xmm0, xmm0, xmmword ptr [rcx+B0H]
E9DD010000 jmp G_M000_IG11
; Switch case 4 (fifth RWD00 entry): 12-round vaesenc sequence.
; xmm0 is XORed with element 0, then 11 vaesenc steps use elements 1..11 and
; vaesenclast uses element 12 (offsets 10H..D0H, 16 bytes apart).
; Every element access is preceded by its own check of the index against the
; count in edx; any failure branches to G_M000_IG17.
; NOTE(review): 12 rounds matches AES-192 encryption -- confirm against the C#.
G_M000_IG08: ;; offset=03C6H
; rcx = object reference at this+0 (presumably the round-key array);
; edx = its element count.
488B09 mov rcx, gword ptr [rcx]
8B5108 mov edx, dword ptr [rcx+08H]
; Index 0 check, then the initial XOR.
85D2 test edx, edx
0F84BF020000 je G_M000_IG17
C5F9EF4110 vpxor xmm0, xmm0, xmmword ptr [rcx+10H]
83FA01 cmp edx, 1
0F86B0020000 jbe G_M000_IG17
C4E279DC4120 vaesenc xmm0, xmm0, xmmword ptr [rcx+20H]
83FA02 cmp edx, 2
0F86A1020000 jbe G_M000_IG17
C4E279DC4130 vaesenc xmm0, xmm0, xmmword ptr [rcx+30H]
83FA03 cmp edx, 3
0F8692020000 jbe G_M000_IG17
C4E279DC4140 vaesenc xmm0, xmm0, xmmword ptr [rcx+40H]
83FA04 cmp edx, 4
0F8683020000 jbe G_M000_IG17
C4E279DC4150 vaesenc xmm0, xmm0, xmmword ptr [rcx+50H]
83FA05 cmp edx, 5
0F8674020000 jbe G_M000_IG17
C4E279DC4160 vaesenc xmm0, xmm0, xmmword ptr [rcx+60H]
83FA06 cmp edx, 6
0F8665020000 jbe G_M000_IG17
C4E279DC4170 vaesenc xmm0, xmm0, xmmword ptr [rcx+70H]
83FA07 cmp edx, 7
0F8656020000 jbe G_M000_IG17
C4E279DC8180000000 vaesenc xmm0, xmm0, xmmword ptr [rcx+80H]
83FA08 cmp edx, 8
0F8644020000 jbe G_M000_IG17
C4E279DC8190000000 vaesenc xmm0, xmm0, xmmword ptr [rcx+90H]
83FA09 cmp edx, 9
0F8632020000 jbe G_M000_IG17
C4E279DC81A0000000 vaesenc xmm0, xmm0, xmmword ptr [rcx+A0H]
83FA0A cmp edx, 10
0F8620020000 jbe G_M000_IG17
C4E279DC81B0000000 vaesenc xmm0, xmm0, xmmword ptr [rcx+B0H]
83FA0B cmp edx, 11
0F860E020000 jbe G_M000_IG17
C4E279DC81C0000000 vaesenc xmm0, xmm0, xmmword ptr [rcx+C0H]
83FA0C cmp edx, 12
0F86FC010000 jbe G_M000_IG17
; Final round, then jump to the common store/return code.
C4E279DD81D0000000 vaesenclast xmm0, xmm0, xmmword ptr [rcx+D0H]
E9FE000000 jmp G_M000_IG11
; Switch case 5 (sixth RWD00 entry): 14-round vaesenc sequence.
; xmm0 is XORed with element 0, then 13 vaesenc steps use elements 1..13 and
; vaesenclast uses element 14 (offsets 10H..F0H, 16 bytes apart).
; Every element access is preceded by its own check of the index against the
; count in edx; any failure branches to G_M000_IG17.
; This does the same arithmetic as the proposed Encode256:ProcessRounds, which
; needs only one range check.
; NOTE(review): 14 rounds matches AES-256 encryption -- confirm against the C#.
G_M000_IG09: ;; offset=04A4H
; rcx = object reference at this+0 (presumably the round-key array);
; edx = its element count.
488B09 mov rcx, gword ptr [rcx]
8B5108 mov edx, dword ptr [rcx+08H]
; Index 0 check, then the initial XOR.
85D2 test edx, edx
0F84E0010000 je G_M000_IG17
C5F9EF4110 vpxor xmm0, xmm0, xmmword ptr [rcx+10H]
83FA01 cmp edx, 1
0F86D1010000 jbe G_M000_IG17
C4E279DC4120 vaesenc xmm0, xmm0, xmmword ptr [rcx+20H]
83FA02 cmp edx, 2
0F86C2010000 jbe G_M000_IG17
C4E279DC4130 vaesenc xmm0, xmm0, xmmword ptr [rcx+30H]
83FA03 cmp edx, 3
0F86B3010000 jbe G_M000_IG17
C4E279DC4140 vaesenc xmm0, xmm0, xmmword ptr [rcx+40H]
83FA04 cmp edx, 4
0F86A4010000 jbe G_M000_IG17
C4E279DC4150 vaesenc xmm0, xmm0, xmmword ptr [rcx+50H]
83FA05 cmp edx, 5
0F8695010000 jbe G_M000_IG17
C4E279DC4160 vaesenc xmm0, xmm0, xmmword ptr [rcx+60H]
83FA06 cmp edx, 6
0F8686010000 jbe G_M000_IG17
C4E279DC4170 vaesenc xmm0, xmm0, xmmword ptr [rcx+70H]
83FA07 cmp edx, 7
0F8677010000 jbe G_M000_IG17
C4E279DC8180000000 vaesenc xmm0, xmm0, xmmword ptr [rcx+80H]
83FA08 cmp edx, 8
0F8665010000 jbe G_M000_IG17
C4E279DC8190000000 vaesenc xmm0, xmm0, xmmword ptr [rcx+90H]
83FA09 cmp edx, 9
0F8653010000 jbe G_M000_IG17
C4E279DC81A0000000 vaesenc xmm0, xmm0, xmmword ptr [rcx+A0H]
83FA0A cmp edx, 10
0F8641010000 jbe G_M000_IG17
C4E279DC81B0000000 vaesenc xmm0, xmm0, xmmword ptr [rcx+B0H]
83FA0B cmp edx, 11
0F862F010000 jbe G_M000_IG17
C4E279DC81C0000000 vaesenc xmm0, xmm0, xmmword ptr [rcx+C0H]
83FA0C cmp edx, 12
0F861D010000 jbe G_M000_IG17
C4E279DC81D0000000 vaesenc xmm0, xmm0, xmmword ptr [rcx+D0H]
83FA0D cmp edx, 13
0F860B010000 jbe G_M000_IG17
C4E279DC81E0000000 vaesenc xmm0, xmm0, xmmword ptr [rcx+E0H]
83FA0E cmp edx, 14
; G_M000_IG10 is an instruction-group boundary placed between the compare and
; its branch; no jump in this listing targets it, so execution simply falls
; through with the flags from the cmp above.
G_M000_IG10: ;; offset=0592H
0F86F9000000 jbe G_M000_IG17
; Final round; this case falls through into G_M000_IG11 instead of jumping.
C4E279DD81F0000000 vaesenclast xmm0, xmm0, xmmword ptr [rcx+F0H]
; Common tail for all six switch cases: write the 16-byte result in xmm0 to
; the destination and return 16.
; On entry: r11d = destination length (loaded from [r9+08H] in G_M000_IG02),
; eax = destination offset, r9 = destination object.
G_M000_IG11: ;; offset=05A1H
; Unsigned 64-bit check that length >= offset + 16; otherwise go to IG16.
418BCB mov ecx, r11d
8BD0 mov edx, eax
4883C210 add rdx, 16
483BCA cmp rcx, rdx
0F82D7000000 jb G_M000_IG16
; rax = address of the destination data at the offset (data starts at +10H).
8BC0 mov eax, eax
498D440110 lea rax, bword ptr [r9+rax+10H]
C5F91100 vmovupd xmmword ptr [rax], xmm0
; Return value: 16.
B810000000 mov eax, 16
; Epilogue: undo the 32-byte stack adjustment and restore rsi.
G_M000_IG12: ;; offset=05C3H
4883C420 add rsp, 32
5E pop rsi
C3 ret
; Cold throw paths, reached only from failed checks earlier in the method.
; None of them returns to the main code.
;
; G_M000_IG13: target of the first check in G_M000_IG02 (rdx length - 16 < r8d).
; Allocates an object with CORINFO_HELP_NEWSFAST, runs System.Exception's
; parameterless constructor on it, obtains a string via CORINFO_HELP_STRCNS
; (selector 1), stores that reference at object+10H through
; CORINFO_HELP_ASSIGN_REF, and throws the object.
; NOTE(review): the 64-bit constants are presumably a type handle and a module
; handle, and +10H presumably the exception's message field -- confirm.
G_M000_IG13: ;; offset=05C9H
48B9482063FEFE7F0000 mov rcx, 0x7FFEFE632048
E8F8CFAE5F call CORINFO_HELP_NEWSFAST
488BF0 mov rsi, rax
488BCE mov rcx, rsi
FF15AC380900 call [System.Exception:.ctor():this]
B901000000 mov ecx, 1
48BAF0BC6DFEFE7F0000 mov rdx, 0x7FFEFE6DBCF0
E828FFA65F call CORINFO_HELP_STRCNS
488D4E10 lea rcx, bword ptr [rsi+10H]
488BD0 mov rdx, rax
E89CDCE2FF call CORINFO_HELP_ASSIGN_REF
488BCE mov rcx, rsi
E87466A45F call CORINFO_HELP_THROW
; G_M000_IG14: target of the second check in G_M000_IG02 (r9 length - 16 < eax).
; Same sequence as G_M000_IG13, but with string selector 47.
G_M000_IG14: ;; offset=060CH
48B9482063FEFE7F0000 mov rcx, 0x7FFEFE632048
E8B5CFAE5F call CORINFO_HELP_NEWSFAST
488BF0 mov rsi, rax
488BCE mov rcx, rsi
FF1569380900 call [System.Exception:.ctor():this]
B92F000000 mov ecx, 47
48BAF0BC6DFEFE7F0000 mov rdx, 0x7FFEFE6DBCF0
E8E5FEA65F call CORINFO_HELP_STRCNS
488D4E10 lea rcx, bword ptr [rsi+10H]
488BD0 mov rdx, rax
E859DCE2FF call CORINFO_HELP_ASSIGN_REF
488BCE mov rcx, rsi
E83166A45F call CORINFO_HELP_THROW
; G_M000_IG15: target of the switch-selector check (selector > 5). Allocates
; an object, obtains a string (selector 0xFFE), passes it to
; System.InvalidOperationException's (String) constructor, and throws.
G_M000_IG15: ;; offset=064FH
48B980BC6DFEFE7F0000 mov rcx, 0x7FFEFE6DBC80
E872CFAE5F call CORINFO_HELP_NEWSFAST
488BF0 mov rsi, rax
B9FE0F0000 mov ecx, 0xFFE
48BAF0BC6DFEFE7F0000 mov rdx, 0x7FFEFE6DBCF0
E8ABFEA65F call CORINFO_HELP_STRCNS
488BD0 mov rdx, rax
488BCE mov rcx, rsi
FF1597D11100 call [System.InvalidOperationException:.ctor(System.String):this]
488BCE mov rcx, rsi
E8F765A45F call CORINFO_HELP_THROW
; G_M000_IG16: target of the unsigned "offset + 16 <= length" checks in
; G_M000_IG02 and G_M000_IG11.
G_M000_IG16: ;; offset=0689H
FF15490C1100 call [System.ThrowHelper:ThrowArgumentOutOfRangeException()]
CC int3
; G_M000_IG17: shared target of every per-element index check in the six
; switch cases.
G_M000_IG17: ;; offset=0690H
E82B67C15F call CORINFO_HELP_RNGCHKFAIL
CC int3
; Jump table for the 6-way switch in G_M000_IG02. Each entry is a 32-bit
; offset that the dispatch code adds to the address of G_M000_IG02 (method
; offset 000CH); for example 66h + 0CH = 72H, the offset of G_M000_IG03.
; Entries 0-2 select the vaesdec sequences and entries 3-5 the vaesenc ones.
RWD00 dd 00000066h ; case G_M000_IG03
dd 00000120h ; case G_M000_IG04
dd 000001FEh ; case G_M000_IG05
dd 00000300h ; case G_M000_IG07
dd 000003BAh ; case G_M000_IG08
dd 00000498h ; case G_M000_IG09
; Total bytes of code 1686
Thanks, I've incorporated a couple of the improvements from this PR (the bounds check elimination and non-ref locals).
The Unsafe load/store code definitely improves performance further; we might look at that later (we are currently being excessively cautious about Unsafe).
Using virtual dispatch doesn't show much difference for me, but it's certainly true that in some future API it would be better to just have different implementation classes for encrypt/decrypt per key size.