garnet icon indicating copy to clipboard operation
garnet copied to clipboard

Coalesce some writes in RespWriteUtils

Open PaulusParssinen opened this issue 1 year ago • 1 comment

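Help the JIT coalesce some word- and dword-sized stores in RespWriteUtils. For example, in WriteNull the separate byte stores of '-' and '1' plus the call to WriteNewline become a single dword store of 0xA0D312D ("-1\r\n"), shrinking the method from 77 to 44 bytes of code.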

Sample diffs

Garnet.common.RespWriteUtils:WriteNull
 ; Assembly listing for method Garnet.common.RespWriteUtils:WriteNull(byref,ulong):ubyte (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Windows
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
+; 1 inlinees with PGO data; 3 single block inlinees; 0 inlinees without PGO data
 ; Final local variable assignments
 ;
-;  V00 arg0         [V00,T00] (  9,  6   )   byref  ->  rcx         single-def
+;  V00 arg0         [V00,T00] (  7,  5   )   byref  ->  rcx         single-def
 ;  V01 arg1         [V01,T01] (  3,  3   )    long  ->  rdx         single-def
-;  V02 loc0         [V02,T02] (  9,  4.50)    long  ->   r8        
-;  V03 OutArgs      [V03    ] (  1,  1   )  struct (32) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;  V04 cse0         [V04,T03] (  3,  2.50)    long  ->   r8         "CSE #01: aggressive"
+;  V02 loc0         [V02,T04] (  3,  1.50)    long  ->   r8         single-def
+;# V03 OutArgs      [V03    ] (  1,  1   )  struct ( 0) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
+;* V04 tmp1         [V04    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "NewObj constructor temp" <System.ReadOnlySpan`1[ubyte]>
+;* V05 tmp2         [V05    ] (  0,  0   )  struct (16) zero-ref    "Inlining Arg" <System.ReadOnlySpan`1[ubyte]>
+;  V06 tmp3         [V06,T03] (  2,  2   )    long  ->  rax         "impAppendStmt"
+;* V07 tmp4         [V07    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inlining Arg" <System.ReadOnlySpan`1[ubyte]>
+;* V08 tmp5         [V08    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V04._reference (fldOffset=0x0)" P-INDEP
+;* V09 tmp6         [V09    ] (  0,  0   )     int  ->  zero-ref    single-def "field V04._length (fldOffset=0x8)" P-INDEP
+;* V10 tmp7         [V10    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V05._reference (fldOffset=0x0)" P-INDEP
+;* V11 tmp8         [V11    ] (  0,  0   )     int  ->  zero-ref    single-def "field V05._length (fldOffset=0x8)" P-INDEP
+;* V12 tmp9         [V12    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V07._reference (fldOffset=0x0)" P-INDEP
+;* V13 tmp10        [V13    ] (  0,  0   )     int  ->  zero-ref    "field V07._length (fldOffset=0x8)" P-INDEP
+;  V14 cse0         [V14,T02] (  3,  2.50)    long  ->   r8         "CSE #01: aggressive"
 ;
-; Lcl frame size = 40
+; Lcl frame size = 0
 
 G_M4163_IG01:  ;; offset=0x0000
-       sub      rsp, 40
-						;; size=4 bbWeight=1 PerfScore 0.25
-G_M4163_IG02:  ;; offset=0x0004
+						;; size=0 bbWeight=1 PerfScore 0.00
+G_M4163_IG02:  ;; offset=0x0000
        mov      r8, qword ptr [rcx]
        sub      rdx, r8
        cmp      edx, 5
        jge      SHORT G_M4163_IG05
 						;; size=11 bbWeight=1 PerfScore 3.50
-G_M4163_IG03:  ;; offset=0x000F
+G_M4163_IG03:  ;; offset=0x000B
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
-G_M4163_IG04:  ;; offset=0x0011
-       add      rsp, 40
+G_M4163_IG04:  ;; offset=0x000D
        ret      
-						;; size=5 bbWeight=0.50 PerfScore 0.62
-G_M4163_IG05:  ;; offset=0x0016
+						;; size=1 bbWeight=0.50 PerfScore 0.50
+G_M4163_IG05:  ;; offset=0x000E
        lea      rax, [r8+0x01]
        mov      qword ptr [rcx], rax
        mov      byte  ptr [r8], 36
-       mov      r8, qword ptr [rcx]
-       lea      rax, [r8+0x01]
-       mov      qword ptr [rcx], rax
-       mov      byte  ptr [r8], 45
-       mov      r8, qword ptr [rcx]
-       lea      rax, [r8+0x01]
-       mov      qword ptr [rcx], rax
-       mov      byte  ptr [r8], 49
-       call     [Garnet.common.RespWriteUtils:WriteNewline(byref)]
+       mov      rax, qword ptr [rcx]
+       mov      dword ptr [rax], 0xA0D312D
+       add      qword ptr [rcx], 4
        mov      eax, 1
-						;; size=50 bbWeight=0.50 PerfScore 7.38
-G_M4163_IG06:  ;; offset=0x0048
-       add      rsp, 40
+						;; size=29 bbWeight=0.50 PerfScore 4.38
+G_M4163_IG06:  ;; offset=0x002B
        ret      
-						;; size=5 bbWeight=0.50 PerfScore 0.62
+						;; size=1 bbWeight=0.50 PerfScore 0.50
 
-; Total bytes of code 77, prolog size 4, PerfScore 12.50, instruction count 23, allocated bytes for code 77 (MethodHash=bc12efbc) for method Garnet.common.RespWriteUtils:WriteNull(byref,ulong):ubyte (FullOpts)
+; Total bytes of code 44, prolog size 0, PerfScore 9.00, instruction count 14, allocated bytes for code 44 (MethodHash=bc12efbc) for method Garnet.common.RespWriteUtils:WriteNull(byref,ulong):ubyte (FullOpts)
 ; ============================================================
Garnet.common.RespWriteUtils:WriteIntegerAsBulkString
 ; Assembly listing for method Garnet.common.RespWriteUtils:WriteIntegerAsBulkString(int,byref,ulong):ubyte (FullOpts)
 ; Emitting BLENDED_CODE for X64 with AVX - Windows
 ; FullOpts code
 ; optimized code
 ; rsp based frame
 ; partially interruptible
 ; No PGO data
+; 2 inlinees with PGO data; 8 single block inlinees; 0 inlinees without PGO data
 ; Final local variable assignments
 ;
-;  V00 arg0         [V00,T02] (  5,  4.50)     int  ->  rsi         single-def
-;  V01 arg1         [V01,T00] ( 14,  8.50)   byref  ->  rbx         single-def
-;  V02 arg2         [V02,T03] (  3,  3   )    long  ->  rdi         single-def
-;  V03 loc0         [V03,T04] (  5,  4   )     int  ->  rbp         single-def
-;  V04 loc1         [V04,T05] (  4,  3.50)   ubyte  ->  r14         single-def
-;  V05 loc2         [V05,T06] (  3,  2.50)     int  ->  rdx         single-def
-;  V06 loc3         [V06,T01] ( 15,  7.50)    long  ->  r10        
+;  V00 arg0         [V00,T01] (  5,  4.50)     int  ->  rsi         single-def
+;  V01 arg1         [V01,T00] ( 12,  7.50)   byref  ->  rbx         single-def
+;  V02 arg2         [V02,T02] (  3,  3   )    long  ->  rdi         single-def
+;  V03 loc0         [V03,T03] (  4,  3.50)     int  ->  rbp         single-def
+;  V04 loc1         [V04,T04] (  3,  3   )   ubyte  ->  r14         single-def
+;  V05 loc2         [V05,T05] (  3,  2.50)     int  ->  rdx         single-def
+;  V06 loc3         [V06,T10] (  3,  1.50)    long  ->  rcx         single-def
 ;  V07 OutArgs      [V07    ] (  1,  1   )  struct (32) [rsp+0x00]  do-not-enreg[XS] addr-exposed "OutgoingArgSpace"
-;  V08 cse0         [V08,T07] (  3,  2.50)    long  ->  r10         "CSE #01: aggressive"
+;* V08 tmp1         [V08    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "NewObj constructor temp" <System.ReadOnlySpan`1[ubyte]>
+;* V09 tmp2         [V09    ] (  0,  0   )  struct (16) zero-ref    "Inlining Arg" <System.ReadOnlySpan`1[ubyte]>
+;  V10 tmp3         [V10,T08] (  2,  2   )    long  ->  rcx         "impAppendStmt"
+;* V11 tmp4         [V11    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inlining Arg" <System.ReadOnlySpan`1[ubyte]>
+;* V12 tmp5         [V12    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "NewObj constructor temp" <System.ReadOnlySpan`1[ubyte]>
+;* V13 tmp6         [V13    ] (  0,  0   )  struct (16) zero-ref    "Inlining Arg" <System.ReadOnlySpan`1[ubyte]>
+;  V14 tmp7         [V14,T09] (  2,  2   )    long  ->  rax         "impAppendStmt"
+;* V15 tmp8         [V15    ] (  0,  0   )  struct (16) zero-ref    ld-addr-op "Inlining Arg" <System.ReadOnlySpan`1[ubyte]>
+;* V16 tmp9         [V16    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V08._reference (fldOffset=0x0)" P-INDEP
+;* V17 tmp10        [V17    ] (  0,  0   )     int  ->  zero-ref    single-def "field V08._length (fldOffset=0x8)" P-INDEP
+;* V18 tmp11        [V18    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V09._reference (fldOffset=0x0)" P-INDEP
+;* V19 tmp12        [V19    ] (  0,  0   )     int  ->  zero-ref    single-def "field V09._length (fldOffset=0x8)" P-INDEP
+;* V20 tmp13        [V20    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V11._reference (fldOffset=0x0)" P-INDEP
+;* V21 tmp14        [V21    ] (  0,  0   )     int  ->  zero-ref    "field V11._length (fldOffset=0x8)" P-INDEP
+;* V22 tmp15        [V22    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V12._reference (fldOffset=0x0)" P-INDEP
+;* V23 tmp16        [V23    ] (  0,  0   )     int  ->  zero-ref    single-def "field V12._length (fldOffset=0x8)" P-INDEP
+;* V24 tmp17        [V24    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V13._reference (fldOffset=0x0)" P-INDEP
+;* V25 tmp18        [V25    ] (  0,  0   )     int  ->  zero-ref    single-def "field V13._length (fldOffset=0x8)" P-INDEP
+;* V26 tmp19        [V26    ] (  0,  0   )   byref  ->  zero-ref    single-def "field V15._reference (fldOffset=0x0)" P-INDEP
+;* V27 tmp20        [V27    ] (  0,  0   )     int  ->  zero-ref    "field V15._length (fldOffset=0x8)" P-INDEP
+;  V28 cse0         [V28,T06] (  3,  2.50)    long  ->  rcx         "CSE #01: aggressive"
+;  V29 cse1         [V29,T07] (  3,  2.50)     int  ->  r15         "CSE #02: aggressive"
 ;
-; Lcl frame size = 32
+; Lcl frame size = 40
 
 G_M4598_IG01:  ;; offset=0x0000
+       push     r15
        push     r14
        push     rdi
        push     rsi
        push     rbp
        push     rbx
-       sub      rsp, 32
+       sub      rsp, 40
        mov      esi, ecx
        mov      rbx, rdx
        mov      rdi, r8
-						;; size=18 bbWeight=1 PerfScore 6.00
-G_M4598_IG02:  ;; offset=0x0012
+						;; size=20 bbWeight=1 PerfScore 7.00
+G_M4598_IG02:  ;; offset=0x0014
        movsxd   rcx, esi
        call     [Garnet.common.NumUtils:NumDigitsInLong(long):int]
        mov      ebp, eax
        mov      r14d, esi
        shr      r14d, 31
-       lea      ecx, [r14+rbp]
+       lea      r15d, [r14+rbp]
+       mov      ecx, r15d
        call     [Garnet.common.NumUtils:NumDigits(int):int]
        mov      edx, eax
-       lea      eax, [rdx+r14]
-       lea      eax, [rax+rbp+0x05]
-       mov      r10, qword ptr [rbx]
-       sub      rdi, r10
+       add      r14d, edx
+       lea      eax, [r14+rbp+0x05]
+       mov      rcx, qword ptr [rbx]
+       sub      rdi, rcx
        cmp      eax, edi
        jle      SHORT G_M4598_IG05
-						;; size=48 bbWeight=1 PerfScore 13.00
-G_M4598_IG03:  ;; offset=0x0042
+						;; size=51 bbWeight=1 PerfScore 13.00
+G_M4598_IG03:  ;; offset=0x0047
        xor      eax, eax
 						;; size=2 bbWeight=0.50 PerfScore 0.12
-G_M4598_IG04:  ;; offset=0x0044
-       add      rsp, 32
+G_M4598_IG04:  ;; offset=0x0049
+       add      rsp, 40
        pop      rbx
        pop      rbp
        pop      rsi
        pop      rdi
        pop      r14
+       pop      r15
        ret      
-						;; size=11 bbWeight=0.50 PerfScore 1.88
-G_M4598_IG05:  ;; offset=0x004F
-       lea      rcx, [r10+0x01]
-       mov      qword ptr [rbx], rcx
-       mov      byte  ptr [r10], 36
-       lea      ecx, [r14+rbp]
+						;; size=13 bbWeight=0.50 PerfScore 2.12
+G_M4598_IG05:  ;; offset=0x0056
+       lea      r8, [rcx+0x01]
+       mov      qword ptr [rbx], r8
+       mov      byte  ptr [rcx], 36
+       mov      ecx, r15d
        mov      r8, rbx
        call     [Garnet.common.NumUtils:IntToBytes(int,int,byref)]
-       mov      r10, qword ptr [rbx]
-       lea      rcx, [r10+0x01]
-       mov      qword ptr [rbx], rcx
-       mov      byte  ptr [r10], 13
-       mov      r10, qword ptr [rbx]
-       lea      rcx, [r10+0x01]
-       mov      qword ptr [rbx], rcx
-       mov      byte  ptr [r10], 10
+       mov      rcx, qword ptr [rbx]
+       mov      word  ptr [rcx], 0xA0D
+       add      qword ptr [rbx], 2
        mov      ecx, esi
        mov      edx, ebp
        mov      r8, rbx
        call     [Garnet.common.NumUtils:IntToBytes(int,int,byref)]
-       mov      r10, qword ptr [rbx]
-       lea      rax, [r10+0x01]
-       mov      qword ptr [rbx], rax
-       mov      byte  ptr [r10], 13
-       mov      r10, qword ptr [rbx]
-       lea      rax, [r10+0x01]
-       mov      qword ptr [rbx], rax
-       mov      byte  ptr [r10], 10
+       mov      rax, qword ptr [rbx]
+       mov      word  ptr [rax], 0xA0D
+       add      qword ptr [rbx], 2
        mov      eax, 1
-						;; size=98 bbWeight=0.50 PerfScore 14.12
-G_M4598_IG06:  ;; offset=0x00B1
-       add      rsp, 32
+						;; size=64 bbWeight=0.50 PerfScore 11.00
+G_M4598_IG06:  ;; offset=0x0096
+       add      rsp, 40
        pop      rbx
        pop      rbp
        pop      rsi
        pop      rdi
        pop      r14
+       pop      r15
        ret      
-						;; size=11 bbWeight=0.50 PerfScore 1.88
+						;; size=13 bbWeight=0.50 PerfScore 2.12
 
-; Total bytes of code 188, prolog size 10, PerfScore 37.00, instruction count 65, allocated bytes for code 188 (MethodHash=cb87ee09) for method Garnet.common.RespWriteUtils:WriteIntegerAsBulkString(int,byref,ulong):ubyte (FullOpts)
+; Total bytes of code 163, prolog size 12, PerfScore 35.38, instruction count 59, allocated bytes for code 163 (MethodHash=cb87ee09) for method Garnet.common.RespWriteUtils:WriteIntegerAsBulkString(int,byref,ulong):ubyte (FullOpts)
+; ============================================================

PaulusParssinen avatar Mar 29 '24 19:03 PaulusParssinen

Very cool!!

badrishc avatar Mar 29 '24 21:03 badrishc