ResearchNote
ResearchNote copied to clipboard
几类循环模式的指令集角度观察(2)
往期 #31 几类循环模式的指令集角度观察
simple sgemm
这个例子来自论文XLOOP 提供的benchmark
/*
 * Scalar integer matrix-multiply kernel over flat arrays.
 *
 * For every pair (m, n) with 0 <= m, n < size it stores
 *     C[m + n*size] = sum over k in [0, size) of A[m + k*size] * B[n + k*size]
 *
 * C    : output array, written at indices m + n*size
 * A, B : input arrays, read at indices m + k*size and n + k*size
 * size : loop bound for all three loops; nothing is read or written
 *        when size <= 0
 */
void sgemm_scalar_int( int C[], int A[], int B[], int size )
{
    int m = 0;
    while ( m < size ) {
        const int *a_base = A + m;
        int n = 0;
        while ( n < size ) {
            const int *b_base = B + n;
            int acc = 0;
            /* Innermost reduction: both operands advance by `size` elements per step. */
            for ( int k = 0; k < size; k++ ) {
                const int offset = k * size;
                acc += a_base[offset] * b_base[offset];
            }
            /* Single store per (m, n), after the reduction has finished. */
            C[m + n * size] = acc;
            n++;
        }
        m++;
    }
}
000000008000110a <sgemm_scalar_int>:
8000110a: 06d05363 blez a3,80001170 <sgemm_scalar_int+0x66>
8000110e: 1141 addi sp,sp,-16
80001110: e422 sd s0,8(sp)
80001112: 82aa mv t0,a0
80001114: 8336 mv t1,a3
80001116: 8432 mv s0,a2
80001118: 8fae mv t6,a1
8000111a: 00269513 slli a0,a3,0x2
8000111e: 4381 li t2,0
80001120: 8f22 mv t5,s0
80001122: 8e96 mv t4,t0
80001124: 4e01 li t3,0
80001126: 867a mv a2,t5
80001128: 86fe mv a3,t6
8000112a: 4781 li a5,0
8000112c: 4581 li a1,0
** 最内层循环的汇编代码 ------------------------------------
8000112e: 4298 lw a4,0(a3)
80001130: 00062803 lw a6,0(a2)
80001134: 88be mv a7,a5
80001136: 2785 addiw a5,a5,1
80001138: 0307073b mulw a4,a4,a6
8000113c: 96aa add a3,a3,a0
8000113e: 962a add a2,a2,a0
80001140: 9db9 addw a1,a1,a4
80001142: fef316e3 bne t1,a5,8000112e <sgemm_scalar_int+0x24>
** 最内层循环的汇编代码--end----------------------------------
80001146: 00bea023 sw a1,0(t4)
8000114a: 001e079b addiw a5,t3,1
8000114e: 9eaa add t4,t4,a0
80001150: 0f11 addi t5,t5,4
80001152: 011e0463 beq t3,a7,8000115a <sgemm_scalar_int+0x50>
80001156: 8e3e mv t3,a5
80001158: b7f9 j 80001126 <sgemm_scalar_int+0x1c>
8000115a: 0013879b addiw a5,t2,1
8000115e: 0f91 addi t6,t6,4
80001160: 0291 addi t0,t0,4
80001162: 01c38463 beq t2,t3,8000116a <sgemm_scalar_int+0x60>
80001166: 83be mv t2,a5
80001168: bf65 j 80001120 <sgemm_scalar_int+0x16>
8000116a: 6422 ld s0,8(sp)
8000116c: 0141 addi sp,sp,16
8000116e: 8082 ret
80001170: 8082 ret
dynamic programming (这个例子来自 xloop 的dynprog)
/*
 * Dynamic-programming kernel over a flat length x length table.
 *
 * For each i in [0, length-1) and each j in (i, length):
 *     s = sum over k in (i, j) of c[i*length + k] + c[k*length + j]
 *     c[i*length + j] = s + W[i*length + j]
 * After each i iteration c[length - 1] is added to a running total, and the
 * total is stored to *out at the end.
 *
 * length : table dimension used for indexing
 * c      : table, read and updated in place
 * W      : table of values added to each computed cell (read only here)
 * out    : receives the accumulated total
 *
 * NOTE(review): s and out_l are int while c and W hold long long, so the
 * sums are narrowed to int before being stored back -- confirm this is intended.
 */
__attribute__((noinline))
void dynprog_scalar_longlong( int length, long long *c, long long *W, long long *out )
{
int out_l = 0;
int i, j, k;
for (i = 0; i < length - 1; i++)
{
for (j = i + 1; j < length; j++)
{
int s = 0;
// fence instruction emitted just before the innermost loop
asm volatile ("fence");
// innermost loop: reduction over k strictly between i and j
for (k = i + 1; k < j; k++)
s += c[i * length + k] + c[k * length + j];
// fence with a "memory" clobber emitted just after the innermost loop
asm volatile ("fence" ::: "memory");
c[i * length + j] = s + W[i * length + j];
}
// accumulate c[length - 1] once per i iteration
out_l += c[length - 1];
}
*out = out_l;
}
000000008000115e <dynprog_scalar_longlong>:
8000115e: 7159 addi sp,sp,-112
80001160: f4a2 sd s0,104(sp)
80001162: f0a6 sd s1,96(sp)
80001164: ecca sd s2,88(sp)
80001166: e8ce sd s3,80(sp)
80001168: e4d2 sd s4,72(sp)
8000116a: e0d6 sd s5,64(sp)
8000116c: fc5a sd s6,56(sp)
8000116e: f85e sd s7,48(sp)
80001170: f462 sd s8,40(sp)
80001172: f066 sd s9,32(sp)
80001174: ec6a sd s10,24(sp)
80001176: e86e sd s11,16(sp)
80001178: 4785 li a5,1
8000117a: e436 sd a3,8(sp)
8000117c: 0ea7de63 bge a5,a0,80001278 <dynprog_scalar_longlong+0x11a>
80001180: 00351813 slli a6,a0,0x3
80001184: ff880d93 addi s11,a6,-8
80001188: 00050d1b sext.w s10,a0
8000118c: 9f89 subw a5,a5,a0
8000118e: 8f2a mv t5,a0
80001190: 8fae mv t6,a1
80001192: 83b2 mv t2,a2
80001194: 9dae add s11,s11,a1
80001196: e03e sd a5,0(sp)
80001198: 8c6a mv s8,s10
8000119a: 4b01 li s6,0
8000119c: 4a85 li s5,1
8000119e: 4a01 li s4,0
800011a0: 4b81 li s7,0
800011a2: 4c81 li s9,0
800011a4: 01058993 addi s3,a1,16
800011a8: 000a889b sext.w a7,s5
800011ac: 82c6 mv t0,a7
800011ae: 09e8d663 bge a7,t5,8000123a <dynprog_scalar_longlong+0xdc>
800011b2: 015a0333 add t1,s4,s5
800011b6: 030e slli t1,t1,0x3
800011b8: 414c0eb3 sub t4,s8,s4
800011bc: 006f8e33 add t3,t6,t1
800011c0: 0e8e slli t4,t4,0x3
800011c2: 8972 mv s2,t3
800011c4: 9efe add t4,t4,t6
800011c6: ffeb049b addiw s1,s6,-2
800011ca: 017a0433 add s0,s4,s7
800011ce: 0ff0000f fence
800011d2: 0512d763 bge t0,a7,80001220 <dynprog_scalar_longlong+0xc2>
800011d6: 0114853b addw a0,s1,a7
800011da: 1502 slli a0,a0,0x20
800011dc: 9101 srli a0,a0,0x20
800011de: 9522 add a0,a0,s0
800011e0: 050e slli a0,a0,0x3
800011e2: 01d306b3 add a3,t1,t4
800011e6: 954e add a0,a0,s3
800011e8: 87ca mv a5,s2
800011ea: 4601 li a2,0
## 最内层循环 ------------------------------------------
800011ec: 6398 ld a4,0(a5)
800011ee: 628c ld a1,0(a3)
800011f0: 07a1 addi a5,a5,8
800011f2: 96c2 add a3,a3,a6
800011f4: 9f2d addw a4,a4,a1
800011f6: 9e39 addw a2,a2,a4
800011f8: fef51ae3 bne a0,a5,800011ec <dynprog_scalar_longlong+0x8e>
## 最内层循环 end----------------------------------------
800011fc: 0ff0000f fence
80001200: 006387b3 add a5,t2,t1
80001204: 639c ld a5,0(a5)
80001206: 2885 addiw a7,a7,1
80001208: 0321 addi t1,t1,8
8000120a: 963e add a2,a2,a5
8000120c: 00ce3023 sd a2,0(t3)
80001210: 031f0563 beq t5,a7,8000123a <dynprog_scalar_longlong+0xdc>
80001214: 006f8e33 add t3,t6,t1
80001218: 0ff0000f fence
8000121c: fb12cde3 blt t0,a7,800011d6 <dynprog_scalar_longlong+0x78>
80001220: 4601 li a2,0
80001222: 0ff0000f fence
80001226: 006387b3 add a5,t2,t1
8000122a: 639c ld a5,0(a5)
8000122c: 2885 addiw a7,a7,1
8000122e: 0321 addi t1,t1,8
80001230: 963e add a2,a2,a5
80001232: 00ce3023 sd a2,0(t3)
80001236: fd1f1fe3 bne t5,a7,80001214 <dynprog_scalar_longlong+0xb6>
8000123a: 000db783 ld a5,0(s11)
8000123e: 3b7d addiw s6,s6,-1
80001240: 0b85 addi s7,s7,1
80001242: 01978cbb addw s9,a5,s9
80001246: 6782 ld a5,0(sp)
80001248: 01aa0a3b addw s4,s4,s10
8000124c: 0a85 addi s5,s5,1
8000124e: 018d0c3b addw s8,s10,s8
80001252: f4fb1be3 bne s6,a5,800011a8 <dynprog_scalar_longlong+0x4a>
80001256: 67a2 ld a5,8(sp)
80001258: 7426 ld s0,104(sp)
8000125a: 7486 ld s1,96(sp)
8000125c: 0197b023 sd s9,0(a5)
80001260: 6966 ld s2,88(sp)
80001262: 69c6 ld s3,80(sp)
80001264: 6a26 ld s4,72(sp)
80001266: 6a86 ld s5,64(sp)
80001268: 7b62 ld s6,56(sp)
8000126a: 7bc2 ld s7,48(sp)
8000126c: 7c22 ld s8,40(sp)
8000126e: 7c82 ld s9,32(sp)
80001270: 6d62 ld s10,24(sp)
80001272: 6dc2 ld s11,16(sp)
80001274: 6165 addi sp,sp,112
80001276: 8082 ret
80001278: 4c81 li s9,0
8000127a: bff1 j 80001256 <dynprog_scalar_longlong+0xf8>
warshall computing 例子来自xloop的benchmark
/*
 * Min-plus path relaxation over a flat n x n table.
 *
 * Copies path_in into path, then for every k, i, j in [0, n) replaces
 * path[i*n + j] with path[i*n + k] + path[k*n + j] unless the current
 * value is strictly smaller.
 *
 * n       : table dimension
 * path    : output table of n*n floats, updated in place
 * path_in : input table of n*n floats; copied with memcpy, so it must not
 *           overlap path
 *
 * The i loop body is braced so that both fences and the j loop execute once
 * per (k, i) pair. Without the braces only the first fence belongs to the
 * i loop, and the j loop then runs with i == n, indexing past the end of path.
 */
__attribute__((noinline))
void warshall_scalar( int n, float *path, float *path_in )
{
    int i, j, k;

    // initially copy the input path to path
    memcpy( path, path_in, sizeof(float) * n * n );

    for (k = 0; k < n; k++) {
        for (i = 0; i < n; i++) {
            // fence instruction emitted just before the innermost loop
            asm volatile ("fence");
            // the innermost loop contains a conditional select
            for (j = 0; j < n; j++) {
                float via_k = path[i*n+k] + path[k*n+j];
                path[i*n+j] = path[i*n+j] < via_k ? path[i*n+j] : via_k;
            }
            // fence with a "memory" clobber emitted just after the innermost loop
            asm volatile ("fence" ::: "memory");
        }
    }
}
80001098: 0ff0000f fence
8000109c: 86be mv a3,a5
8000109e: 2785 addiw a5,a5,1
800010a0: fef41ce3 bne s0,a5,80001098 <warshall_scalar+0x50>
800010a4: 879a mv a5,t1
800010a6: 00b78733 add a4,a5,a1
800010aa: 9742 add a4,a4,a6
800010ac: 00072787 flw fa5,0(a4)
800010b0: 00062687 flw fa3,0(a2)
800010b4: 0007a707 flw fa4,0(a5)
800010b8: 00d7f7d3 fadd.s fa5,fa5,fa3
800010bc: a0f71753 flt.s a4,fa4,fa5
800010c0: e319 bnez a4,800010c6 <warshall_scalar+0x7e>
800010c2: 20f78753 fmv.s fa4,fa5
800010c6: 0791 addi a5,a5,4
800010c8: fee7ae27 fsw fa4,-4(a5)
800010cc: fcf51de3 bne a0,a5,800010a6 <warshall_scalar+0x5e>
800010d0: 0ff0000f fence