Using llvm-mca for statically measuring the performance of machine code

Open pramodk opened this issue 5 years ago • 0 comments

Haven't used this "in practice" yet, but I'm trying it for the first time to get an idea:

llvm-mca : LLVM Machine Code Analyzer

See docs
Understand pitfalls / what to expect : stackoverflow answer
See RFC

Sample mod file

NEURON {
    SUFFIX hh
    NONSPECIFIC_CURRENT il
    RANGE minf, mtau, gl, el
}

STATE {
    m
}

ASSIGNED {
    v (mV)
    minf
    mtau (ms)
}

BREAKPOINT {
    SOLVE states METHOD cnexp
    il = gl*(v - el)
}

DERIVATIVE states {
     m =  (minf-m)/mtau
     : m' =  (minf-m)/mtau       : you can uncomment this line if you need an ODE solution involving exp
}

Patch to llvm branch

 → git diff
diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
index c8143ac..5f55cb1 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
@@ -534,10 +534,10 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) {
         std::vector<std::string> double_variables{"v"};

         /// access node index and corresponding voltage
-        loop_index_statements.push_back(
-            visitor::create_statement("node_id = node_index[{}]"_format(INDUCTION_VAR)));
-        loop_body_statements.push_back(
-            visitor::create_statement("v = {}[node_id]"_format(VOLTAGE_VAR)));
+        //loop_index_statements.push_back(
+        //    visitor::create_statement("node_id = node_index[{}]"_format(INDUCTION_VAR)));
+        //loop_body_statements.push_back(
+        //    visitor::create_statement("v = {}[node_id]"_format(VOLTAGE_VAR)));

         /// read ion variables
         ion_read_statements(BlockType::State,

Analysing serial code

./bin/nmodl hh.mod llvm --ir --vector-width 1 > tmp.ll

# extract the LLVM IR and compile it to assembly
sed -n '/ModuleID/,$p' tmp.ll > hh.1.ll
llc -o - -O3  -march=x86-64 -mcpu=skylake-avx512 hh.1.ll > hh.1.s

# mark loop body for analysis
sed -i '' 's/^LBB0_2/# LLVM-MCA-BEGIN Body\'$'\nLBB0_2/g' hh.1.s
sed -i '' 's/^LBB0_3/# LLVM-MCA-END\'$'\nLBB0_3/g' hh.1.s


# loop body 

$ → cat hh.1.s
....
# LLVM-MCA-BEGIN Body
LBB0_2:                                 ## %for.body
                                        ## =>This Inner Loop Header: Depth=1
	movq	%rsp, %rax
	addq	$-16, %rax
	movq	%rax, %rsp
	movq	%rsp, %rax
	addq	$-16, %rax
	movq	%rax, %rsp
	movq	-16(%rbp), %rax
	movslq	-4(%rbp), %rcx
	movq	(%rax), %rdx
	movq	8(%rax), %rsi
	movq	16(%rax), %rax
	vmovsd	(%rdx,%rcx,8), %xmm0            ## xmm0 = mem[0],zero
	vsubsd	(%rax,%rcx,8), %xmm0, %xmm0
	vdivsd	(%rsi,%rcx,8), %xmm0, %xmm0
	vmovsd	%xmm0, (%rax,%rcx,8)
	incl	-4(%rbp)
	movq	-16(%rbp), %rax
	movl	-4(%rbp), %ecx
	cmpl	92(%rax), %ecx
	jl	LBB0_2
# LLVM-MCA-END
...

## Run MCA

→llvm-mca -march=x86-64 -mcpu=skylake-avx512 hh.1.s
hh.1.s:2:2: warning: .build_version macos used while targeting darwin19.5.0
        .build_version macos, 10, 15
        ^

[0] Code Region - Body

Iterations:        100
Instructions:      2000
Total Cycles:      638
Total uOps:        2600

Dispatch Width:    6
uOps Per Cycle:    4.08
IPC:               3.13
Block RThroughput: 6.0


Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      1     0.25                        movq	%rsp, %rax
 1      1     0.25                        addq	$-16, %rax
 1      1     0.25                        movq	%rax, %rsp
 1      1     0.25                        movq	%rsp, %rax
 1      1     0.25                        addq	$-16, %rax
 1      1     0.25                        movq	%rax, %rsp
 1      5     0.50    *                   movq	-16(%rbp), %rax
 1      5     0.50    *                   movslq	-4(%rbp), %rcx
 1      5     0.50    *                   movq	(%rax), %rdx
 1      5     0.50    *                   movq	8(%rax), %rsi
 1      5     0.50    *                   movq	16(%rax), %rax
 1      5     0.50    *                   vmovsd	(%rdx,%rcx,8), %xmm0
 2      9     0.50    *                   vsubsd	(%rax,%rcx,8), %xmm0, %xmm0
 2      19    4.00    *                   vdivsd	(%rsi,%rcx,8), %xmm0, %xmm0
 2      1     1.00           *            vmovsd	%xmm0, (%rax,%rcx,8)
 3      7     1.00    *      *            incl	-4(%rbp)
 1      5     0.50    *                   movq	-16(%rbp), %rax
 1      5     0.50    *                   movl	-4(%rbp), %ecx
 2      6     0.50    *                   cmpl	92(%rax), %ecx
 1      1     0.50                        jl	LBB0_2


Resources:
[0]   - SKXDivider
[1]   - SKXFPDivider
[2]   - SKXPort0
[3]   - SKXPort1
[4]   - SKXPort2
[5]   - SKXPort3
[6]   - SKXPort4
[7]   - SKXPort5
[8]   - SKXPort6
[9]   - SKXPort7


Resource pressure per iteration:
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
 -     4.00   2.75   2.75   6.01   6.02   2.00   2.75   2.75   1.97

Resource pressure by instruction:
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
 -      -     0.12   0.36    -      -      -     0.26   0.26    -     movq	%rsp, %rax
 -      -     0.22   0.25    -      -      -     0.37   0.16    -     addq	$-16, %rax
 -      -     0.01   0.15    -      -      -     0.28   0.56    -     movq	%rax, %rsp
 -      -     0.12   0.27    -      -      -     0.36   0.25    -     movq	%rsp, %rax
 -      -     0.24   0.01    -      -      -     0.48   0.27    -     addq	$-16, %rax
 -      -     0.01   0.24    -      -      -     0.39   0.36    -     movq	%rax, %rsp
 -      -      -      -     0.74   0.26    -      -      -      -     movq	-16(%rbp), %rax
 -      -      -      -     0.57   0.43    -      -      -      -     movslq	-4(%rbp), %rcx
 -      -      -      -     0.06   0.94    -      -      -      -     movq	(%rax), %rdx
 -      -      -      -     0.95   0.05    -      -      -      -     movq	8(%rax), %rsi
 -      -      -      -     0.53   0.47    -      -      -      -     movq	16(%rax), %rax
 -      -      -      -     0.01   0.99    -      -      -      -     vmovsd	(%rdx,%rcx,8), %xmm0
 -      -     0.25   0.75   0.68   0.32    -      -      -      -     vsubsd	(%rax,%rcx,8), %xmm0, %xmm0
 -     4.00   1.00    -     0.25   0.75    -      -      -      -     vdivsd	(%rsi,%rcx,8), %xmm0, %xmm0
 -      -      -      -     0.03    -     1.00    -      -     0.97   vmovsd	%xmm0, (%rax,%rcx,8)
 -      -     0.03   0.35   0.01   0.99   1.00   0.24   0.38   1.00   incl	-4(%rbp)
 -      -      -      -     0.73   0.27    -      -      -      -     movq	-16(%rbp), %rax
 -      -      -      -     0.99   0.01    -      -      -      -     movl	-4(%rbp), %ecx
 -      -     0.12   0.37   0.46   0.54    -     0.37   0.14    -     cmpl	92(%rax), %ecx
 -      -     0.63    -      -      -      -      -     0.37    -     jl	LBB0_2
 -

Analysing vector / SIMD code


./bin/nmodl hh.mod llvm --ir --vector-width 8 > tmp.ll
 sed -n '/ModuleID/,$p' tmp.ll > hh.8.ll

 llc -o - -O3  -march=x86-64 -mcpu=skylake-avx512 hh.8.ll > hh.8.s
 
sed -i '' 's/^LBB0_2/# LLVM-MCA-BEGIN Body\'$'\nLBB0_2/g' hh.8.s
sed -i '' 's/^LBB0_3/# LLVM-MCA-END\'$'\nLBB0_3/g' hh.8.s

# generated code

→ cat hh.8.s
....
# LLVM-MCA-BEGIN Body
LBB0_2:                                 ## %for.body
                                        ## =>This Inner Loop Header: Depth=1
	movq	%rsp, %rax
	addq	$-16, %rax
	movq	%rax, %rsp
	movq	%rsp, %rax
	addq	$-16, %rax
	movq	%rax, %rsp
	movq	24(%rbx), %rax
	movslq	20(%rbx), %rcx
	movq	(%rax), %rdx
	movq	8(%rax), %rsi
	shlq	$6, %rcx
	movq	16(%rax), %rax
	vmovapd	(%rdx,%rcx), %zmm1
	vsubpd	(%rax,%rcx), %zmm1, %zmm1
	vdivpd	(%rsi,%rcx), %zmm1, %zmm1
	vmovapd	%zmm1, (%rax,%rcx)
	addl	$8, 20(%rbx)
	vpaddd	32(%rbx), %ymm0, %ymm1
	vmovdqa	%ymm1, 32(%rbx)
	movq	24(%rbx), %rax
	movl	20(%rbx), %ecx
	cmpl	92(%rax), %ecx
	jl	LBB0_2
# LLVM-MCA-END
LBB0_3:                                 ## %for.exit
...


# Analysing code

→ llvm-mca -march=x86-64 -mcpu=skylake-avx512 hh.8.s


[0] Code Region - Body

Iterations:        100
Instructions:      2300
Total Cycles:      1641
Total uOps:        3400

Dispatch Width:    6
uOps Per Cycle:    2.07
IPC:               1.40
Block RThroughput: 16.0


Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)

[1]    [2]    [3]    [4]    [5]    [6]    Instructions:
 1      1     0.25                        movq	%rsp, %rax
 1      1     0.25                        addq	$-16, %rax
 1      1     0.25                        movq	%rax, %rsp
 1      1     0.25                        movq	%rsp, %rax
 1      1     0.25                        addq	$-16, %rax
 1      1     0.25                        movq	%rax, %rsp
 1      5     0.50    *                   movq	24(%rbx), %rax
 1      5     0.50    *                   movslq	20(%rbx), %rcx
 1      5     0.50    *                   movq	(%rax), %rdx
 1      5     0.50    *                   movq	8(%rax), %rsi
 1      1     0.50                        shlq	$6, %rcx
 1      5     0.50    *                   movq	16(%rax), %rax
 2      8     0.50    *                   vmovapd	(%rdx,%rcx), %zmm1
 2      11    0.50    *                   vsubpd	(%rax,%rcx), %zmm1, %zmm1
 4      30    16.00   *                   vdivpd	(%rsi,%rcx), %zmm1, %zmm1
 2      1     1.00           *            vmovapd	%zmm1, (%rax,%rcx)
 3      7     1.00    *      *            addl	$8, 20(%rbx)
 2      8     0.50    *                   vpaddd	32(%rbx), %ymm0, %ymm1
 2      1     1.00           *            vmovdqa	%ymm1, 32(%rbx)
 1      5     0.50    *                   movq	24(%rbx), %rax
 1      5     0.50    *                   movl	20(%rbx), %ecx
 2      6     0.50    *                   cmpl	92(%rax), %ecx
 1      1     0.50                        jl	LBB0_2


Resources:
[0]   - SKXDivider
[1]   - SKXFPDivider
[2]   - SKXPort0
[3]   - SKXPort1
[4]   - SKXPort2
[5]   - SKXPort3
[6]   - SKXPort4
[7]   - SKXPort5
[8]   - SKXPort6
[9]   - SKXPort7


Resource pressure per iteration:
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
 -     16.00  4.75   3.74   6.54   6.54   3.00   3.76   3.75   2.92

Resource pressure by instruction:
[0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
 -      -     0.24   0.02    -      -      -     0.24   0.50    -     movq	%rsp, %rax
 -      -     0.01   0.26    -      -      -     0.25   0.48    -     addq	$-16, %rax
 -      -     0.01   0.51    -      -      -     0.01   0.47    -     movq	%rax, %rsp
 -      -     0.03   0.24    -      -      -     0.25   0.48    -     movq	%rsp, %rax
 -      -     0.23   0.27    -      -      -     0.46   0.04    -     addq	$-16, %rax
 -      -     0.24   0.50    -      -      -      -     0.26    -     movq	%rax, %rsp
 -      -      -      -     0.49   0.51    -      -      -      -     movq	24(%rbx), %rax
 -      -      -      -     0.50   0.50    -      -      -      -     movslq	20(%rbx), %rcx
 -      -      -      -     0.52   0.48    -      -      -      -     movq	(%rax), %rdx
 -      -      -      -     0.48   0.52    -      -      -      -     movq	8(%rax), %rsi
 -      -     0.51    -      -      -      -      -     0.49    -     shlq	$6, %rcx
 -      -      -      -     0.53   0.47    -      -      -      -     movq	16(%rax), %rax
 -      -      -     0.02   0.48   0.52    -     0.98    -      -     vmovapd	(%rdx,%rcx), %zmm1
 -      -     0.72    -     0.51   0.49    -     0.28    -      -     vsubpd	(%rax,%rcx), %zmm1, %zmm1
 -     16.00  2.00    -     0.48   0.52    -     1.00    -      -     vdivpd	(%rsi,%rcx), %zmm1, %zmm1
 -      -      -      -     0.02    -     1.00    -      -     0.98   vmovapd	%zmm1, (%rax,%rcx)
 -      -      -     0.49   0.51   0.52   1.00   0.23   0.28   0.97   addl	$8, 20(%rbx)
 -      -     0.03   0.93   0.49   0.51    -     0.04    -      -     vpaddd	32(%rbx), %ymm0, %ymm1
 -      -      -      -     0.02   0.01   1.00    -      -     0.97   vmovdqa	%ymm1, 32(%rbx)
 -      -      -      -     0.51   0.49    -      -      -      -     movq	24(%rbx), %rax
 -      -      -      -     0.48   0.52    -      -      -      -     movl	20(%rbx), %ecx
 -      -      -     0.50   0.52   0.48    -     0.02   0.48    -     cmpl	92(%rax), %ecx
 -      -     0.73    -      -      -      -      -     0.27    -     jl	LBB0_2

cc: @georgemitenkov @iomaganaris @castigli

Mar 09 '21 07:03 pramodk