ResearchNote
ResearchNote copied to clipboard
几类循环模式的指令集角度观察
概述
在做一个加速器设计项目时需要从指令集角度去观察循环可加速代码,下面记录以备忘
单层循环,全并行
for(int i =0; i<MAXELEMS; i++){
c[i] = a[i] * b[i];
}
800017e2: 601c ld a5,0(s0)
800017e4: 6098 ld a4,0(s1)
800017e6: 09a1 addi s3,s3,8
800017e8: 0421 addi s0,s0,8
800017ea: 02e787b3 mul a5,a5,a4
800017ee: 04a1 addi s1,s1,8
800017f0: fef9bc23 sd a5,-8(s3)
800017f4: ff2417e3 bne s0,s2,800017e2 <main+0xb0>
单层循环,树并行
for(int i =0; i< 4096; i++){
sum += a[i];
}
800017b2: 4681 li a3,0
800017b4: 87a2 mv a5,s0
800017b6: 6398 ld a4,0(a5)
800017b8: 07a1 addi a5,a5,8
800017ba: 96ba add a3,a3,a4
800017bc: ff279de3 bne a5,s2,800017b6 <main+0x84>
两层循环 全并行
for(int round=0; round<4; round++){
for(int i =0; i<MAXELEMS; i++){
c[i] = a[i] * b[i];
}
}
//外层循环
80001854: 4511 li a0,4
80001856: 6642 ld a2,16(sp)
80001858: 67a2 ld a5,8(sp)
8000185a: 86a6 mv a3,s1
//内层循环
8000185c: 6398 ld a4,0(a5)
8000185e: 628c ld a1,0(a3)
80001860: 0621 addi a2,a2,8
80001862: 07a1 addi a5,a5,8
80001864: 02b70733 mul a4,a4,a1
80001868: 06a1 addi a3,a3,8
8000186a: fee63c23 sd a4,-8(a2) # ff8 <_tbss_end+0xfb4>
8000186e: fe8797e3 bne a5,s0,8000185c <main+0x12a>
//内层循环 END
80001872: 357d addiw a0,a0,-1
80001874: f16d bnez a0,80001856 <main+0x124>
变例2:
for(int round=0; round<4; round++){
for(int i =0; i<MAXELEMS; i++){
a[i] = a[i] * b[i];
}
}
80001860: 4491 li s1,4
80001862: 86d6 mv a3,s5
80001864: 87ca mv a5,s2
80001866: 6398 ld a4,0(a5)
80001868: 6290 ld a2,0(a3)
8000186a: 07a1 addi a5,a5,8
8000186c: 06a1 addi a3,a3,8
8000186e: 02c70733 mul a4,a4,a2
80001872: fee7bc23 sd a4,-8(a5)
80001876: fe8798e3 bne a5,s0,80001866 <main+0x134>
8000187a: 34fd addiw s1,s1,-1
8000187c: f0fd bnez s1,80001862 <main+0x130>
一层循环,循环内有if/else 语句
for(int i =0; i<MAXELEMS; i++){
if (i < 100){
c[i] = a[i] * b[i];
}
else{
c[i] = a[i] + b[i];
}
}
800018da: 6605 lui a2,0x1
800018dc: 06300693 li a3,99
800018e0: 167d addi a2,a2,-1
800018e2: a811 j 800018f6 <main+0x1c4> //if i<100 跳转乘法运算
//加法运算
800018e4: 97ba add a5,a5,a4 //if i>100 加法计算
800018e6: 00f9b023 sd a5,0(s3)
800018ea: 02c48163 beq s1,a2,8000190c <main+0x1da> //循环出口
//乘法运算
800018ee: 2485 addiw s1,s1,1
800018f0: 0921 addi s2,s2,8
800018f2: 0a21 addi s4,s4,8
800018f4: 09a1 addi s3,s3,8
800018f6: 00093703 ld a4,0(s2) # fffffffffffe8000<__global_pointer$+0xffffffff7ffe5bd8>
800018fa: 000a3783 ld a5,0(s4)
800018fe: fe96e3e3 bltu a3,s1,800018e4 //if i >100 跳转去加法操作
80001902: 02e787b3 mul a5,a5,a4 //乘法
80001906: 00f9b023 sd a5,0(s3)
8000190a: b7d5 j 800018ee <main+0x1bc>
一个复杂点的例子: sha3加密算法核心函数
static void keccakf(uint64_t st[25], int rounds)
{
int i, j, round_num;
uint64_t t, bc[5];
//printf("Starting\n");
//printState(st);
for (round_num = 0; round_num < rounds; round_num++)
{
// Theta
for (i = 0; i < 5; i++)
bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20];
for (i = 0; i < 5; i++)
{
t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
for (j = 0; j < 25; j += 5)
st[j + i] ^= t;
}
//printf("After Theta:\n");
//printState(st);
// Rho Pi
t = st[1];
for (i = 0; i < 24; i++)
{
j = keccakf_piln[i];
bc[0] = st[j];
st[j] = ROTL64(t, keccakf_rotc[i]);
t = bc[0];
}
//printf("After RhoPi:\n");
//printState(st);
// Chi
for (j = 0; j < 25; j += 5)
{
for (i = 0; i < 5; i++)
bc[i] = st[j + i];
for (i = 0; i < 5; i++)
st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5];
}
//printf("After Chi:\n");
//printState(st);
// Iota
st[0] ^= keccakf_rndc[round_num];
//printf("After Round %d:\n",round_num);
//printState(st);
}
}
0000000080001048 <keccakf.constprop.0>:
80001048: 6118 ld a4,0(a0)
8000104a: 711d addi sp,sp,-96
8000104c: eca2 sd s0,88(sp)
8000104e: e8a6 sd s1,80(sp)
80001050: e4ca sd s2,72(sp)
80001052: e0ce sd s3,64(sp)
80001054: fc52 sd s4,56(sp)
80001056: f856 sd s5,48(sp)
80001058: 00001f17 auipc t5,0x1
8000105c: cd8f0f13 addi t5,t5,-808 # 80001d30 <keccakf_rndc+0x8>
80001060: 00001417 auipc s0,0x1
80001064: d8840413 addi s0,s0,-632 # 80001de8 <keccakf_piln>
80001068: 4f85 li t6,1
8000106a: 0c850393 addi t2,a0,200
8000106e: 00001897 auipc a7,0x1
80001072: dda88893 addi a7,a7,-550 # 80001e48 <keccakf_rotc>
80001076: 00850293 addi t0,a0,8
8000107a: 02850e93 addi t4,a0,40
8000107e: 4615 li a2,5
80001080: 4595 li a1,5
80001082: 03010813 addi a6,sp,48
80001086: 4e65 li t3,25
80001088: 8696 mv a3,t0
8000108a: 832a mv t1,a0
8000108c: 0024 addi s1,sp,8
8000108e: 893a mv s2,a4
80001090: 729c ld a5,32(a3)
80001092: 0486ba83 ld s5,72(a3)
80001096: 0706ba03 ld s4,112(a3)
8000109a: 0986b983 ld s3,152(a3)
8000109e: 0157c7b3 xor a5,a5,s5
800010a2: 0147c7b3 xor a5,a5,s4
800010a6: 0137c7b3 xor a5,a5,s3
800010aa: 0127c7b3 xor a5,a5,s2
800010ae: e09c sd a5,0(s1)
800010b0: 04a1 addi s1,s1,8
800010b2: 00de8663 beq t4,a3,800010be <keccakf.constprop.0+0x76>
800010b6: 0006b903 ld s2,0(a3)
800010ba: 06a1 addi a3,a3,8
800010bc: bfd1 j 80001090 <keccakf.constprop.0+0x48>
800010be: 849e mv s1,t2
800010c0: 4981 li s3,0
800010c2: 00198a1b addiw s4,s3,1
800010c6: 02ca66bb remw a3,s4,a2
800010ca: 0049891b addiw s2,s3,4
800010ce: 000a099b sext.w s3,s4
800010d2: 03010a13 addi s4,sp,48
800010d6: f3848793 addi a5,s1,-200
800010da: 02c9693b remw s2,s2,a2
800010de: 068e slli a3,a3,0x3
800010e0: 96d2 add a3,a3,s4
800010e2: fd86ba03 ld s4,-40(a3)
800010e6: 001a1693 slli a3,s4,0x1
800010ea: 03fa5a13 srli s4,s4,0x3f
800010ee: 0146e6b3 or a3,a3,s4
800010f2: 03010a13 addi s4,sp,48
800010f6: 090e slli s2,s2,0x3
800010f8: 9952 add s2,s2,s4
800010fa: fd893903 ld s2,-40(s2)
800010fe: 0126c6b3 xor a3,a3,s2
80001102: 8f35 xor a4,a4,a3
80001104: e398 sd a4,0(a5)
80001106: 02878793 addi a5,a5,40
8000110a: 00f48a63 beq s1,a5,8000111e <keccakf.constprop.0+0xd6>
8000110e: 6398 ld a4,0(a5)
80001110: 02878793 addi a5,a5,40
80001114: 8f35 xor a4,a4,a3
80001116: fce7bc23 sd a4,-40(a5)
8000111a: fef49ae3 bne s1,a5,8000110e <keccakf.constprop.0+0xc6>
8000111e: 04a1 addi s1,s1,8
80001120: 00b98563 beq s3,a1,8000112a <keccakf.constprop.0+0xe2>
80001124: f384b703 ld a4,-200(s1)
80001128: bf69 j 800010c2 <keccakf.constprop.0+0x7a>
8000112a: 651c ld a5,8(a0)
8000112c: 00001497 auipc s1,0x1
80001130: cc048493 addi s1,s1,-832 # 80001dec <keccakf_piln+0x4>
80001134: 00001917 auipc s2,0x1
80001138: d1890913 addi s2,s2,-744 # 80001e4c <keccakf_rotc+0x4>
8000113c: 4685 li a3,1
8000113e: 4729 li a4,10
80001140: a031 j 8000114c <keccakf.constprop.0+0x104>
80001142: 4098 lw a4,0(s1)
80001144: 00092683 lw a3,0(s2)
80001148: 0491 addi s1,s1,4
8000114a: 0911 addi s2,s2,4
8000114c: 40d009bb negw s3,a3
80001150: 070e slli a4,a4,0x3
80001152: 00d796b3 sll a3,a5,a3
80001156: 0137d7b3 srl a5,a5,s3
8000115a: 972a add a4,a4,a0
8000115c: 8edd or a3,a3,a5
8000115e: 631c ld a5,0(a4)
80001160: e314 sd a3,0(a4)
80001162: fe9890e3 bne a7,s1,80001142 <keccakf.constprop.0+0xfa>
80001166: e43e sd a5,8(sp)
80001168: 4481 li s1,0
8000116a: 003c addi a5,sp,8
8000116c: 871a mv a4,t1
8000116e: 6314 ld a3,0(a4)
80001170: 07a1 addi a5,a5,8
80001172: 0721 addi a4,a4,8
80001174: fed7bc23 sd a3,-8(a5)
80001178: fef81be3 bne a6,a5,8000116e <keccakf.constprop.0+0x126>
8000117c: 891a mv s2,t1
8000117e: 4781 li a5,0
80001180: 00178a1b addiw s4,a5,1
80001184: 02ca66bb remw a3,s4,a2
80001188: 0027871b addiw a4,a5,2
8000118c: 000a079b sext.w a5,s4
80001190: 03010a13 addi s4,sp,48
80001194: 00093983 ld s3,0(s2)
80001198: 0921 addi s2,s2,8
8000119a: 02c7673b remw a4,a4,a2
8000119e: 068e slli a3,a3,0x3
800011a0: 96d2 add a3,a3,s4
800011a2: fd86b683 ld a3,-40(a3)
800011a6: fff6c693 not a3,a3
800011aa: 070e slli a4,a4,0x3
800011ac: 9752 add a4,a4,s4
800011ae: fd873703 ld a4,-40(a4)
800011b2: 8f75 and a4,a4,a3
800011b4: 00e9c733 xor a4,s3,a4
800011b8: fee93c23 sd a4,-8(s2)
800011bc: fcb792e3 bne a5,a1,80001180 <keccakf.constprop.0+0x138>
800011c0: 2495 addiw s1,s1,5
800011c2: 02830313 addi t1,t1,40
800011c6: fbc492e3 bne s1,t3,8000116a <keccakf.constprop.0+0x122>
800011ca: 6118 ld a4,0(a0)
800011cc: 00efc733 xor a4,t6,a4
800011d0: e118 sd a4,0(a0)
800011d2: 01e40663 beq s0,t5,800011de <keccakf.constprop.0+0x196>
800011d6: 000f3f83 ld t6,0(t5)
800011da: 0f21 addi t5,t5,8
800011dc: b575 j 80001088 <keccakf.constprop.0+0x40>
800011de: 6466 ld s0,88(sp)
800011e0: 64c6 ld s1,80(sp)
800011e2: 6926 ld s2,72(sp)
800011e4: 6986 ld s3,64(sp)
800011e6: 7a62 ld s4,56(sp)
800011e8: 7ac2 ld s5,48(sp)
800011ea: 6125 addi sp,sp,96
800011ec: 8082 ret
单层循环内有多个连续计算
for (int i =0 ; i < MAXELEMS ;i++){
c[i] = a[i]*b[i]+a[i]*(b[i] - a[i])+b[i]*(a[i]+b[i])-a[i]*b[i]*b[i]+b[i];
}
80001798: 00083703 ld a4,0(a6)
8000179c: 6190 ld a2,0(a1)
8000179e: 08a1 addi a7,a7,8
800017a0: 05a1 addi a1,a1,8
800017a2: 40c707b3 sub a5,a4,a2
800017a6: 02e606b3 mul a3,a2,a4
800017aa: 00e60533 add a0,a2,a4
800017ae: 0821 addi a6,a6,8
800017b0: 02c787b3 mul a5,a5,a2
800017b4: 02e50633 mul a2,a0,a4
800017b8: 97b6 add a5,a5,a3
800017ba: 02d706b3 mul a3,a4,a3
800017be: 97b2 add a5,a5,a2
800017c0: 8f95 sub a5,a5,a3
800017c2: 97ba add a5,a5,a4
800017c4: fef8bc23 sd a5,-8(a7)
800017c8: fc6598e3 bne a1,t1,80001798 <main+0x62>
一层循环,向量找最大值
int max = 0;
for(int i =0; i<MAXELEMS; i++){
if( max < a[i] ){
max = a[i];
}
}
80001962: 4581 li a1,0
80001964: 609c ld a5,0(s1)
80001966: 04a1 addi s1,s1,8
80001968: 00f5d363 bge a1,a5,8000196e <main+0x23c>
8000196c: 85be mv a1,a5
8000196e: fe849be3 bne s1,s0,80001964 <main+0x232>
```
一层循环 a[i] = a[i-3] +2
for(int i =3; i< MAXELEMS; i++){
a[i] = a[i-3] + 2;
}
8000198c: 6782 ld a5,0(sp)
8000198e: 66a1 lui a3,0x8
80001990: 16a1 addi a3,a3,-24
80001992: 96be add a3,a3,a5
80001994: 6398 ld a4,0(a5)
80001996: 07a1 addi a5,a5,8
80001998: 0709 addi a4,a4,2
8000199a: eb98 sd a4,16(a5)
8000199c: fef69ce3 bne a3,a5,80001994 <main+0x262>
XLOOP 几个例子

带有循环指示变量:
for(int i = 0; i < 8; ++i) {
a[i] = b[i]+c[i];
tmp += 3*i+1;
//j++;
printf("a: %d, tmp: %d j: %d \n", a[4], tmp, j );
}
8000174e: 890a mv s2,sp
80001750: 4405 li s0,1
80001752: 00000b17 auipc s6,0x0
80001756: 07eb0b13 addi s6,s6,126 # 800017d0 <main+0x9e>
8000175a: 4ae5 li s5,25
//循环
8000175c: 000a2783 lw a5,0(s4)
80001760: 0009a703 lw a4,0(s3)
80001764: 9ca1 addw s1,s1,s0
80001766: 4681 li a3,0
80001768: 9fb9 addw a5,a5,a4
8000176a: 00f92023 sw a5,0(s2)
8000176e: 45c2 lw a1,16(sp)
80001770: 8626 mv a2,s1
80001772: 855a mv a0,s6
80001774: 240d addiw s0,s0,3
80001776: d43ff0ef jal ra,800014b8 <printf>
8000177a: 0a11 addi s4,s4,4
8000177c: 0991 addi s3,s3,4
8000177e: 0911 addi s2,s2,4
80001780: fd541ee3 bne s0,s5,8000175c <main+0x2a>
80001784: 05c4861b addiw a2,s1,92
80001788: 0ff0000f fence
for(int i = 0; i < 128; ++i) {
A[i] += 2;
}
80001048: 7101 addi sp,sp,-512
8000104a: 878a mv a5,sp
8000104c: 0414 addi a3,sp,512
8000104e: 4398 lw a4,0(a5)
80001050: 0791 addi a5,a5,4
80001052: 2709 addiw a4,a4,2
80001054: fee7ae23 sw a4,-4(a5)
80001058: fed79be3 bne a5,a3,8000104e <demo+0x6>
for(int i = 0; i < 128; ++i ){
B[i] = a + A[i];
a = C[i] + c;
}
10194: 4394 lw a3,0(a5)
10196: 9f35 addw a4,a4,a3
10198: c198 sw a4,0(a1)
1019a: 4218 lw a4,0(a2)
1019c: 02e5073b mulw a4,a0,a4
101a0: 0791 addi a5,a5,4
101a2: 0591 addi a1,a1,4
101a4: 0611 addi a2,a2,4
101a6: ff0797e3 bne a5,a6,10194 <main+0x12>
for(int i=0; i<128; ++i){
A[i] += i;
}
409c lw a5,0(s1)
9fa1 addw a5,a5,s0
2405 addiw s0,s0,1
c09c sw a5,0(s1)
0491 addi s1,s1,4
ff2417e3 bne s0,s2,800010a4 <demo+0x5c>
for(int i=0; i<128; i++)
{
A[i] = a + 128;
a = B[i] * 2;
}
80001072: 4288 lw a0,0(a3)
80001074: 0807879b addiw a5,a5,128
80001078: c31c sw a5,0(a4)
8000107a: 0711 addi a4,a4,4
8000107c: 0015179b slliw a5,a0,0x1
80001080: 0691 addi a3,a3,4
80001082: feb718e3 bne a4,a1,80001072 <demo+0x2a>