nmodl
nmodl copied to clipboard
Using llvm-mca for statically measuring the performance of machine code
I haven't used this "in practice" yet; this is a first attempt, to get an idea of what it can tell us:
llvm-mca : LLVM Machine Code Analyzer
- See docs
- Understand pitfalls / what to expect : stackoverflow answer
- See RFC
Sample mod file
NEURON {
SUFFIX hh
NONSPECIFIC_CURRENT il
RANGE minf, mtau, gl, el
}
STATE {
m
}
ASSIGNED {
v (mV)
minf
mtau (ms)
}
BREAKPOINT {
SOLVE states METHOD cnexp
il = gl*(v - el)
}
DERIVATIVE states {
m = (minf-m)/mtau
: m' = (minf-m)/mtau : you can uncomment this line if you need an ODE solution involving exp
}
Patch to llvm branch
→ git diff
diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
index c8143ac..5f55cb1 100644
--- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
+++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp
@@ -534,10 +534,10 @@ void CodegenLLVMHelperVisitor::visit_nrn_state_block(ast::NrnStateBlock& node) {
std::vector<std::string> double_variables{"v"};
/// access node index and corresponding voltage
- loop_index_statements.push_back(
- visitor::create_statement("node_id = node_index[{}]"_format(INDUCTION_VAR)));
- loop_body_statements.push_back(
- visitor::create_statement("v = {}[node_id]"_format(VOLTAGE_VAR)));
+ //loop_index_statements.push_back(
+ // visitor::create_statement("node_id = node_index[{}]"_format(INDUCTION_VAR)));
+ //loop_body_statements.push_back(
+ // visitor::create_statement("v = {}[node_id]"_format(VOLTAGE_VAR)));
/// read ion variables
ion_read_statements(BlockType::State,
Analysing serial code
./bin/nmodl hh.mod llvm --ir --vector-width 1 > tmp.ll
# extract LLVM IR and compile it to assembly
sed -n '/ModuleID/,$p' tmp.ll > hh.1.ll
llc -o - -O3 -march=x86-64 -mcpu=skylake-avx512 hh.1.ll > hh.1.s
# mark loop body for analysis
sed -i '' 's/^LBB0_2/# LLVM-MCA-BEGIN Body\'$'\nLBB0_2/g' hh.1.s
sed -i '' 's/^LBB0_3/# LLVM-MCA-END\'$'\nLBB0_3/g' hh.1.s
# loop body
$ → cat hh.1.s
....
# LLVM-MCA-BEGIN Body
LBB0_2: ## %for.body
## =>This Inner Loop Header: Depth=1
movq %rsp, %rax
addq $-16, %rax
movq %rax, %rsp
movq %rsp, %rax
addq $-16, %rax
movq %rax, %rsp
movq -16(%rbp), %rax
movslq -4(%rbp), %rcx
movq (%rax), %rdx
movq 8(%rax), %rsi
movq 16(%rax), %rax
vmovsd (%rdx,%rcx,8), %xmm0 ## xmm0 = mem[0],zero
vsubsd (%rax,%rcx,8), %xmm0, %xmm0
vdivsd (%rsi,%rcx,8), %xmm0, %xmm0
vmovsd %xmm0, (%rax,%rcx,8)
incl -4(%rbp)
movq -16(%rbp), %rax
movl -4(%rbp), %ecx
cmpl 92(%rax), %ecx
jl LBB0_2
# LLVM-MCA-END
...
## Run MCA
→llvm-mca -march=x86-64 -mcpu=skylake-avx512 hh.1.s
hh.1.s:2:2: warning: .build_version macos used while targeting darwin19.5.0
.build_version macos, 10, 15
^
[0] Code Region - Body
Iterations: 100
Instructions: 2000
Total Cycles: 638
Total uOps: 2600
Dispatch Width: 6
uOps Per Cycle: 4.08
IPC: 3.13
Block RThroughput: 6.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 1 0.25 movq %rsp, %rax
1 1 0.25 addq $-16, %rax
1 1 0.25 movq %rax, %rsp
1 1 0.25 movq %rsp, %rax
1 1 0.25 addq $-16, %rax
1 1 0.25 movq %rax, %rsp
1 5 0.50 * movq -16(%rbp), %rax
1 5 0.50 * movslq -4(%rbp), %rcx
1 5 0.50 * movq (%rax), %rdx
1 5 0.50 * movq 8(%rax), %rsi
1 5 0.50 * movq 16(%rax), %rax
1 5 0.50 * vmovsd (%rdx,%rcx,8), %xmm0
2 9 0.50 * vsubsd (%rax,%rcx,8), %xmm0, %xmm0
2 19 4.00 * vdivsd (%rsi,%rcx,8), %xmm0, %xmm0
2 1 1.00 * vmovsd %xmm0, (%rax,%rcx,8)
3 7 1.00 * * incl -4(%rbp)
1 5 0.50 * movq -16(%rbp), %rax
1 5 0.50 * movl -4(%rbp), %ecx
2 6 0.50 * cmpl 92(%rax), %ecx
1 1 0.50 jl LBB0_2
Resources:
[0] - SKXDivider
[1] - SKXFPDivider
[2] - SKXPort0
[3] - SKXPort1
[4] - SKXPort2
[5] - SKXPort3
[6] - SKXPort4
[7] - SKXPort5
[8] - SKXPort6
[9] - SKXPort7
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
- 4.00 2.75 2.75 6.01 6.02 2.00 2.75 2.75 1.97
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
- - 0.12 0.36 - - - 0.26 0.26 - movq %rsp, %rax
- - 0.22 0.25 - - - 0.37 0.16 - addq $-16, %rax
- - 0.01 0.15 - - - 0.28 0.56 - movq %rax, %rsp
- - 0.12 0.27 - - - 0.36 0.25 - movq %rsp, %rax
- - 0.24 0.01 - - - 0.48 0.27 - addq $-16, %rax
- - 0.01 0.24 - - - 0.39 0.36 - movq %rax, %rsp
- - - - 0.74 0.26 - - - - movq -16(%rbp), %rax
- - - - 0.57 0.43 - - - - movslq -4(%rbp), %rcx
- - - - 0.06 0.94 - - - - movq (%rax), %rdx
- - - - 0.95 0.05 - - - - movq 8(%rax), %rsi
- - - - 0.53 0.47 - - - - movq 16(%rax), %rax
- - - - 0.01 0.99 - - - - vmovsd (%rdx,%rcx,8), %xmm0
- - 0.25 0.75 0.68 0.32 - - - - vsubsd (%rax,%rcx,8), %xmm0, %xmm0
- 4.00 1.00 - 0.25 0.75 - - - - vdivsd (%rsi,%rcx,8), %xmm0, %xmm0
- - - - 0.03 - 1.00 - - 0.97 vmovsd %xmm0, (%rax,%rcx,8)
- - 0.03 0.35 0.01 0.99 1.00 0.24 0.38 1.00 incl -4(%rbp)
- - - - 0.73 0.27 - - - - movq -16(%rbp), %rax
- - - - 0.99 0.01 - - - - movl -4(%rbp), %ecx
- - 0.12 0.37 0.46 0.54 - 0.37 0.14 - cmpl 92(%rax), %ecx
- - 0.63 - - - - - 0.37 - jl LBB0_2
-
Analysing vector / SIMD code
./bin/nmodl hh.mod llvm --ir --vector-width 8 > tmp.ll
sed -n '/ModuleID/,$p' tmp.ll > hh.8.ll
llc -o - -O3 -march=x86-64 -mcpu=skylake-avx512 hh.8.ll > hh.8.s
sed -i '' 's/^LBB0_2/# LLVM-MCA-BEGIN Body\'$'\nLBB0_2/g' hh.8.s
sed -i '' 's/^LBB0_3/# LLVM-MCA-END\'$'\nLBB0_3/g' hh.8.s
# generated code
→ cat hh.8.s
....
# LLVM-MCA-BEGIN Body
LBB0_2: ## %for.body
## =>This Inner Loop Header: Depth=1
movq %rsp, %rax
addq $-16, %rax
movq %rax, %rsp
movq %rsp, %rax
addq $-16, %rax
movq %rax, %rsp
movq 24(%rbx), %rax
movslq 20(%rbx), %rcx
movq (%rax), %rdx
movq 8(%rax), %rsi
shlq $6, %rcx
movq 16(%rax), %rax
vmovapd (%rdx,%rcx), %zmm1
vsubpd (%rax,%rcx), %zmm1, %zmm1
vdivpd (%rsi,%rcx), %zmm1, %zmm1
vmovapd %zmm1, (%rax,%rcx)
addl $8, 20(%rbx)
vpaddd 32(%rbx), %ymm0, %ymm1
vmovdqa %ymm1, 32(%rbx)
movq 24(%rbx), %rax
movl 20(%rbx), %ecx
cmpl 92(%rax), %ecx
jl LBB0_2
# LLVM-MCA-END
LBB0_3: ## %for.exit
...
# Analysing code
→ llvm-mca -march=x86-64 -mcpu=skylake-avx512 hh.8.s
[0] Code Region - Body
Iterations: 100
Instructions: 2300
Total Cycles: 1641
Total uOps: 3400
Dispatch Width: 6
uOps Per Cycle: 2.07
IPC: 1.40
Block RThroughput: 16.0
Instruction Info:
[1]: #uOps
[2]: Latency
[3]: RThroughput
[4]: MayLoad
[5]: MayStore
[6]: HasSideEffects (U)
[1] [2] [3] [4] [5] [6] Instructions:
1 1 0.25 movq %rsp, %rax
1 1 0.25 addq $-16, %rax
1 1 0.25 movq %rax, %rsp
1 1 0.25 movq %rsp, %rax
1 1 0.25 addq $-16, %rax
1 1 0.25 movq %rax, %rsp
1 5 0.50 * movq 24(%rbx), %rax
1 5 0.50 * movslq 20(%rbx), %rcx
1 5 0.50 * movq (%rax), %rdx
1 5 0.50 * movq 8(%rax), %rsi
1 1 0.50 shlq $6, %rcx
1 5 0.50 * movq 16(%rax), %rax
2 8 0.50 * vmovapd (%rdx,%rcx), %zmm1
2 11 0.50 * vsubpd (%rax,%rcx), %zmm1, %zmm1
4 30 16.00 * vdivpd (%rsi,%rcx), %zmm1, %zmm1
2 1 1.00 * vmovapd %zmm1, (%rax,%rcx)
3 7 1.00 * * addl $8, 20(%rbx)
2 8 0.50 * vpaddd 32(%rbx), %ymm0, %ymm1
2 1 1.00 * vmovdqa %ymm1, 32(%rbx)
1 5 0.50 * movq 24(%rbx), %rax
1 5 0.50 * movl 20(%rbx), %ecx
2 6 0.50 * cmpl 92(%rax), %ecx
1 1 0.50 jl LBB0_2
Resources:
[0] - SKXDivider
[1] - SKXFPDivider
[2] - SKXPort0
[3] - SKXPort1
[4] - SKXPort2
[5] - SKXPort3
[6] - SKXPort4
[7] - SKXPort5
[8] - SKXPort6
[9] - SKXPort7
Resource pressure per iteration:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
- 16.00 4.75 3.74 6.54 6.54 3.00 3.76 3.75 2.92
Resource pressure by instruction:
[0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
- - 0.24 0.02 - - - 0.24 0.50 - movq %rsp, %rax
- - 0.01 0.26 - - - 0.25 0.48 - addq $-16, %rax
- - 0.01 0.51 - - - 0.01 0.47 - movq %rax, %rsp
- - 0.03 0.24 - - - 0.25 0.48 - movq %rsp, %rax
- - 0.23 0.27 - - - 0.46 0.04 - addq $-16, %rax
- - 0.24 0.50 - - - - 0.26 - movq %rax, %rsp
- - - - 0.49 0.51 - - - - movq 24(%rbx), %rax
- - - - 0.50 0.50 - - - - movslq 20(%rbx), %rcx
- - - - 0.52 0.48 - - - - movq (%rax), %rdx
- - - - 0.48 0.52 - - - - movq 8(%rax), %rsi
- - 0.51 - - - - - 0.49 - shlq $6, %rcx
- - - - 0.53 0.47 - - - - movq 16(%rax), %rax
- - - 0.02 0.48 0.52 - 0.98 - - vmovapd (%rdx,%rcx), %zmm1
- - 0.72 - 0.51 0.49 - 0.28 - - vsubpd (%rax,%rcx), %zmm1, %zmm1
- 16.00 2.00 - 0.48 0.52 - 1.00 - - vdivpd (%rsi,%rcx), %zmm1, %zmm1
- - - - 0.02 - 1.00 - - 0.98 vmovapd %zmm1, (%rax,%rcx)
- - - 0.49 0.51 0.52 1.00 0.23 0.28 0.97 addl $8, 20(%rbx)
- - 0.03 0.93 0.49 0.51 - 0.04 - - vpaddd 32(%rbx), %ymm0, %ymm1
- - - - 0.02 0.01 1.00 - - 0.97 vmovdqa %ymm1, 32(%rbx)
- - - - 0.51 0.49 - - - - movq 24(%rbx), %rax
- - - - 0.48 0.52 - - - - movl 20(%rbx), %ecx
- - - 0.50 0.52 0.48 - 0.02 0.48 - cmpl 92(%rax), %ecx
- - 0.73 - - - - - 0.27 - jl LBB0_2
cc: @georgemitenkov @iomaganaris @castigli