ResearchNote icon indicating copy to clipboard operation
ResearchNote copied to clipboard

几类循环模式的指令集角度观察

Open meton-robean opened this issue 5 years ago • 14 comments

概述

在做一个加速器设计项目时需要从指令集角度去观察循环可加速代码,下面记录以备忘

meton-robean avatar Jan 01 '20 09:01 meton-robean

单层循环,全并行

  for(int i =0; i<MAXELEMS; i++){
        c[i] = a[i] * b[i];
    }
    800017e2:	601c                	ld	a5,0(s0)
    800017e4:	6098                	ld	a4,0(s1)
    800017e6:	09a1                	addi	s3,s3,8
    800017e8:	0421                	addi	s0,s0,8
    800017ea:	02e787b3          	mul	a5,a5,a4
    800017ee:	04a1                	addi	s1,s1,8
    800017f0:	fef9bc23          	sd	a5,-8(s3)
    800017f4:	ff2417e3          	bne	s0,s2,800017e2 <main+0xb0>

meton-robean avatar Jan 01 '20 09:01 meton-robean

单层循环,树并行

    for(int i =0; i< 4096; i++){
        sum += a[i];
    }
    800017b2:	4681                	li	a3,0
    800017b4:	87a2                	mv	a5,s0
    800017b6:	6398                	ld	a4,0(a5)
    800017b8:	07a1                	addi	a5,a5,8
    800017ba:	96ba                	add	a3,a3,a4
    800017bc:	ff279de3          	bne	a5,s2,800017b6 <main+0x84>

meton-robean avatar Jan 01 '20 09:01 meton-robean

两层循环 全并行

    for(int round=0; round<4; round++){
        for(int i =0; i<MAXELEMS; i++){
            c[i] = a[i] * b[i];
        }        
    }
  //外层循环
    80001854:	4511                	li	a0,4
    80001856:	6642                	ld	a2,16(sp)
    80001858:	67a2                	ld	a5,8(sp)
    8000185a:	86a6                	mv	a3,s1

   //内层循环
    8000185c:	6398                	ld	a4,0(a5)
    8000185e:	628c                	ld	a1,0(a3)
    80001860:	0621                	addi	a2,a2,8
    80001862:	07a1                	addi	a5,a5,8
    80001864:	02b70733          	mul	a4,a4,a1
    80001868:	06a1                	addi	a3,a3,8
    8000186a:	fee63c23          	sd	a4,-8(a2) # ff8 <_tbss_end+0xfb4>
    8000186e:	fe8797e3          	bne	a5,s0,8000185c <main+0x12a>
   //内层循环 END

    80001872:	357d                	addiw	a0,a0,-1
    80001874:	f16d                	bnez	a0,80001856 <main+0x124>

变例2:

    for(int round=0; round<4; round++){
        for(int i =0; i<MAXELEMS; i++){
            a[i] = a[i] * b[i];
        }        
    }
    80001860:	4491                	li	s1,4
    80001862:	86d6                	mv	a3,s5
    80001864:	87ca                	mv	a5,s2

    80001866:	6398                	ld	a4,0(a5)
    80001868:	6290                	ld	a2,0(a3)
    8000186a:	07a1                	addi	a5,a5,8
    8000186c:	06a1                	addi	a3,a3,8
    8000186e:	02c70733          	mul	a4,a4,a2
    80001872:	fee7bc23          	sd	a4,-8(a5) 
    80001876:	fe8798e3          	bne	a5,s0,80001866 <main+0x134>

    8000187a:	34fd                	addiw	s1,s1,-1
    8000187c:	f0fd                	bnez	s1,80001862 <main+0x130>

meton-robean avatar Jan 01 '20 09:01 meton-robean

一层循环,循环内有if/else 语句

    for(int i =0; i<MAXELEMS; i++){
        if (i < 100){
            c[i] = a[i] * b[i];
        }
        else{
            c[i] = a[i] + b[i];
        }
        
    }  
    800018da:	6605                	lui	a2,0x1
    800018dc:	06300693          	li	a3,99
    800018e0:	167d                	addi	a2,a2,-1
    800018e2:	a811                	j	800018f6 <main+0x1c4>           //if i<100 跳转乘法运算
    //加法运算
    800018e4:	97ba                	add	a5,a5,a4                                     //if i>100 加法计算
    800018e6:	00f9b023          	sd	a5,0(s3)
    800018ea:	02c48163          	beq	s1,a2,8000190c <main+0x1da> //循环出口

    //乘法运算
    800018ee:	2485                	addiw	s1,s1,1
    800018f0:	0921                	addi	s2,s2,8
    800018f2:	0a21                	addi	s4,s4,8
    800018f4:	09a1                	addi	s3,s3,8
    800018f6:	00093703          	ld	a4,0(s2) # fffffffffffe8000<__global_pointer$+0xffffffff7ffe5bd8>
    800018fa:	000a3783          	ld	a5,0(s4)
    800018fe:	fe96e3e3          	bltu	a3,s1,800018e4       //if i >100 跳转去加法操作
    80001902:	02e787b3          	mul	a5,a5,a4                   //乘法
    80001906:	00f9b023          	sd	a5,0(s3)
    8000190a:	b7d5                	j	800018ee <main+0x1bc>

meton-robean avatar Jan 02 '20 03:01 meton-robean

一个复杂点的例子: sha3加密算法核心函数

static void keccakf(uint64_t st[25], int rounds)
{
  int i, j, round_num;
  uint64_t t, bc[5];

  //printf("Starting\n");
  //printState(st);
  for (round_num = 0; round_num < rounds; round_num++) 
  {
    // Theta
    for (i = 0; i < 5; i++) 
      bc[i] = st[i] ^ st[i + 5] ^ st[i + 10] ^ st[i + 15] ^ st[i + 20];
    
    for (i = 0; i < 5; i++) 
    {
      t = bc[(i + 4) % 5] ^ ROTL64(bc[(i + 1) % 5], 1);
      for (j = 0; j < 25; j += 5)
             st[j + i] ^= t;
    }

    //printf("After Theta:\n");
    //printState(st);
    // Rho Pi
    t = st[1];
    for (i = 0; i < 24; i++) 
    {
      j = keccakf_piln[i];
      bc[0] = st[j];
      st[j] = ROTL64(t, keccakf_rotc[i]);
      t = bc[0];
    }
    //printf("After RhoPi:\n");
    //printState(st);

    //  Chi
    for (j = 0; j < 25; j += 5) 
    {
      for (i = 0; i < 5; i++)
           bc[i] = st[j + i];
      for (i = 0; i < 5; i++)
           st[j + i] ^= (~bc[(i + 1) % 5]) & bc[(i + 2) % 5];
    }

    //printf("After Chi:\n");
    //printState(st);
    //  Iota
    st[0] ^= keccakf_rndc[round_num];
    //printf("After Round %d:\n",round_num);
    //printState(st);
  }
}
0000000080001048 <keccakf.constprop.0>:
    80001048:	6118                	ld	a4,0(a0)
    8000104a:	711d                	addi	sp,sp,-96
    8000104c:	eca2                	sd	s0,88(sp)
    8000104e:	e8a6                	sd	s1,80(sp)
    80001050:	e4ca                	sd	s2,72(sp)
    80001052:	e0ce                	sd	s3,64(sp)
    80001054:	fc52                	sd	s4,56(sp)
    80001056:	f856                	sd	s5,48(sp)
    80001058:	00001f17          	auipc	t5,0x1
    8000105c:	cd8f0f13          	addi	t5,t5,-808 # 80001d30 <keccakf_rndc+0x8>
    80001060:	00001417          	auipc	s0,0x1
    80001064:	d8840413          	addi	s0,s0,-632 # 80001de8 <keccakf_piln>
    80001068:	4f85                	li	t6,1
    8000106a:	0c850393          	addi	t2,a0,200
    8000106e:	00001897          	auipc	a7,0x1
    80001072:	dda88893          	addi	a7,a7,-550 # 80001e48 <keccakf_rotc>
    80001076:	00850293          	addi	t0,a0,8
    8000107a:	02850e93          	addi	t4,a0,40
    8000107e:	4615                	li	a2,5
    80001080:	4595                	li	a1,5
    80001082:	03010813          	addi	a6,sp,48
    80001086:	4e65                	li	t3,25
    80001088:	8696                	mv	a3,t0
    8000108a:	832a                	mv	t1,a0
    8000108c:	0024                	addi	s1,sp,8
    8000108e:	893a                	mv	s2,a4
    80001090:	729c                	ld	a5,32(a3)
    80001092:	0486ba83          	ld	s5,72(a3)
    80001096:	0706ba03          	ld	s4,112(a3)
    8000109a:	0986b983          	ld	s3,152(a3)
    8000109e:	0157c7b3          	xor	a5,a5,s5
    800010a2:	0147c7b3          	xor	a5,a5,s4
    800010a6:	0137c7b3          	xor	a5,a5,s3
    800010aa:	0127c7b3          	xor	a5,a5,s2
    800010ae:	e09c                	sd	a5,0(s1)
    800010b0:	04a1                	addi	s1,s1,8
    800010b2:	00de8663          	beq	t4,a3,800010be <keccakf.constprop.0+0x76>
    800010b6:	0006b903          	ld	s2,0(a3)
    800010ba:	06a1                	addi	a3,a3,8
    800010bc:	bfd1                	j	80001090 <keccakf.constprop.0+0x48>
    800010be:	849e                	mv	s1,t2
    800010c0:	4981                	li	s3,0
    800010c2:	00198a1b          	addiw	s4,s3,1
    800010c6:	02ca66bb          	remw	a3,s4,a2
    800010ca:	0049891b          	addiw	s2,s3,4
    800010ce:	000a099b          	sext.w	s3,s4
    800010d2:	03010a13          	addi	s4,sp,48
    800010d6:	f3848793          	addi	a5,s1,-200
    800010da:	02c9693b          	remw	s2,s2,a2
    800010de:	068e                	slli	a3,a3,0x3
    800010e0:	96d2                	add	a3,a3,s4
    800010e2:	fd86ba03          	ld	s4,-40(a3)
    800010e6:	001a1693          	slli	a3,s4,0x1
    800010ea:	03fa5a13          	srli	s4,s4,0x3f
    800010ee:	0146e6b3          	or	a3,a3,s4
    800010f2:	03010a13          	addi	s4,sp,48
    800010f6:	090e                	slli	s2,s2,0x3
    800010f8:	9952                	add	s2,s2,s4
    800010fa:	fd893903          	ld	s2,-40(s2)
    800010fe:	0126c6b3          	xor	a3,a3,s2
    80001102:	8f35                	xor	a4,a4,a3
    80001104:	e398                	sd	a4,0(a5)
    80001106:	02878793          	addi	a5,a5,40
    8000110a:	00f48a63          	beq	s1,a5,8000111e <keccakf.constprop.0+0xd6>
    8000110e:	6398                	ld	a4,0(a5)
    80001110:	02878793          	addi	a5,a5,40
    80001114:	8f35                	xor	a4,a4,a3
    80001116:	fce7bc23          	sd	a4,-40(a5)
    8000111a:	fef49ae3          	bne	s1,a5,8000110e <keccakf.constprop.0+0xc6>
    8000111e:	04a1                	addi	s1,s1,8
    80001120:	00b98563          	beq	s3,a1,8000112a <keccakf.constprop.0+0xe2>
    80001124:	f384b703          	ld	a4,-200(s1)
    80001128:	bf69                	j	800010c2 <keccakf.constprop.0+0x7a>
    8000112a:	651c                	ld	a5,8(a0)
    8000112c:	00001497          	auipc	s1,0x1
    80001130:	cc048493          	addi	s1,s1,-832 # 80001dec <keccakf_piln+0x4>
    80001134:	00001917          	auipc	s2,0x1
    80001138:	d1890913          	addi	s2,s2,-744 # 80001e4c <keccakf_rotc+0x4>
    8000113c:	4685                	li	a3,1  
    8000113e:	4729                	li	a4,10
    80001140:	a031                	j	8000114c <keccakf.constprop.0+0x104>
    80001142:	4098                	lw	a4,0(s1)
    80001144:	00092683          	lw	a3,0(s2)
    80001148:	0491                	addi	s1,s1,4
    8000114a:	0911                	addi	s2,s2,4
    8000114c:	40d009bb          	negw	s3,a3
    80001150:	070e                	slli	a4,a4,0x3
    80001152:	00d796b3          	sll	a3,a5,a3
    80001156:	0137d7b3          	srl	a5,a5,s3
    8000115a:	972a                	add	a4,a4,a0
    8000115c:	8edd                	or	a3,a3,a5
    8000115e:	631c                	ld	a5,0(a4)
    80001160:	e314                	sd	a3,0(a4)
    80001162:	fe9890e3          	bne	a7,s1,80001142 <keccakf.constprop.0+0xfa>
    80001166:	e43e                	sd	a5,8(sp)
    80001168:	4481                	li	s1,0
    8000116a:	003c                	addi	a5,sp,8
    8000116c:	871a                	mv	a4,t1
    8000116e:	6314                	ld	a3,0(a4)
    80001170:	07a1                	addi	a5,a5,8
    80001172:	0721                	addi	a4,a4,8
    80001174:	fed7bc23          	sd	a3,-8(a5)
    80001178:	fef81be3          	bne	a6,a5,8000116e <keccakf.constprop.0+0x126>
    8000117c:	891a                	mv	s2,t1
    8000117e:	4781                	li	a5,0
    80001180:	00178a1b          	addiw	s4,a5,1
    80001184:	02ca66bb          	remw	a3,s4,a2
    80001188:	0027871b          	addiw	a4,a5,2
    8000118c:	000a079b          	sext.w	a5,s4
    80001190:	03010a13          	addi	s4,sp,48
    80001194:	00093983          	ld	s3,0(s2)
    80001198:	0921                	addi	s2,s2,8
    8000119a:	02c7673b          	remw	a4,a4,a2
    8000119e:	068e                	slli	a3,a3,0x3
    800011a0:	96d2                	add	a3,a3,s4
    800011a2:	fd86b683          	ld	a3,-40(a3)
    800011a6:	fff6c693          	not	a3,a3
    800011aa:	070e                	slli	a4,a4,0x3
    800011ac:	9752                	add	a4,a4,s4
    800011ae:	fd873703          	ld	a4,-40(a4)
    800011b2:	8f75                	and	a4,a4,a3
    800011b4:	00e9c733          	xor	a4,s3,a4
    800011b8:	fee93c23          	sd	a4,-8(s2)
    800011bc:	fcb792e3          	bne	a5,a1,80001180 <keccakf.constprop.0+0x138>
    800011c0:	2495                	addiw	s1,s1,5
    800011c2:	02830313          	addi	t1,t1,40
    800011c6:	fbc492e3          	bne	s1,t3,8000116a <keccakf.constprop.0+0x122>
    800011ca:	6118                	ld	a4,0(a0)
    800011cc:	00efc733          	xor	a4,t6,a4
    800011d0:	e118                	sd	a4,0(a0)
    800011d2:	01e40663          	beq	s0,t5,800011de <keccakf.constprop.0+0x196>
    800011d6:	000f3f83          	ld	t6,0(t5)
    800011da:	0f21                	addi	t5,t5,8
    800011dc:	b575                	j	80001088 <keccakf.constprop.0+0x40>
    800011de:	6466                	ld	s0,88(sp)
    800011e0:	64c6                	ld	s1,80(sp)
    800011e2:	6926                	ld	s2,72(sp)
    800011e4:	6986                	ld	s3,64(sp)
    800011e6:	7a62                	ld	s4,56(sp)
    800011e8:	7ac2                	ld	s5,48(sp)
    800011ea:	6125                	addi	sp,sp,96
    800011ec:	8082                	ret

meton-robean avatar Jan 02 '20 03:01 meton-robean

单层循环内有多个连续计算

    for (int i =0 ; i < MAXELEMS ;i++){

   	c[i] = a[i]*b[i]+a[i]*(b[i] - a[i])+b[i]*(a[i]+b[i])-a[i]*b[i]*b[i]+b[i];
        

    }

    80001798:	00083703          	ld	a4,0(a6)
    8000179c:	6190                	ld	a2,0(a1)
    8000179e:	08a1                	addi	a7,a7,8
    800017a0:	05a1                	addi	a1,a1,8

    800017a2:	40c707b3          	sub	a5,a4,a2
    800017a6:	02e606b3          	mul	a3,a2,a4
    800017aa:	00e60533          	add	a0,a2,a4
    800017ae:	0821                	addi	a6,a6,8
    800017b0:	02c787b3          	mul	a5,a5,a2
    800017b4:	02e50633          	mul	a2,a0,a4
    800017b8:	97b6                	add	a5,a5,a3
    800017ba:	02d706b3          	mul	a3,a4,a3
    800017be:	97b2                	add	a5,a5,a2
    800017c0:	8f95                	sub	a5,a5,a3
    800017c2:	97ba                	add	a5,a5,a4

    800017c4:	fef8bc23          	sd	a5,-8(a7)
    800017c8:	fc6598e3          	bne	a1,t1,80001798 <main+0x62>

meton-robean avatar Jan 02 '20 14:01 meton-robean

一层循环,向量找最大值

    int max = 0;
    for(int i =0; i<MAXELEMS; i++){
        if( max < a[i] ){
            max = a[i];
        }
    }
    80001962:	4581                	li	a1,0
    80001964:	609c                	ld	a5,0(s1)
    80001966:	04a1                	addi	s1,s1,8
    80001968:	00f5d363          	bge	a1,a5,8000196e <main+0x23c>
    8000196c:	85be                	mv	a1,a5
    8000196e:	fe849be3          	bne	s1,s0,80001964 <main+0x232>
```

meton-robean avatar Jan 06 '20 03:01 meton-robean

一层循环 a[i] = a[i-3] +2

    for(int i =3; i< MAXELEMS; i++){
        a[i] = a[i-3] + 2;
    }
    8000198c:	6782                	ld	a5,0(sp)
    8000198e:	66a1                	lui	a3,0x8
    80001990:	16a1                	addi	a3,a3,-24
    80001992:	96be                	add	a3,a3,a5

    80001994:	6398                	ld	a4,0(a5)
    80001996:	07a1                	addi	a5,a5,8
    80001998:	0709                	addi	a4,a4,2
    8000199a:	eb98                	sd	a4,16(a5)
    8000199c:	fef69ce3          	bne	a3,a5,80001994 <main+0x262>

meton-robean avatar Jan 06 '20 06:01 meton-robean

XLOOP 几个例子

Selection_055

meton-robean avatar Jan 08 '20 02:01 meton-robean

带有循环指示变量:

    for(int i = 0; i < 8; ++i) {
        a[i] = b[i]+c[i];
        tmp += 3*i+1;
        //j++;

        printf("a: %d, tmp: %d j: %d \n", a[4], tmp, j );
    }
    8000174e:	890a                	mv	s2,sp
    80001750:	4405                	li	s0,1
    80001752:	00000b17          	auipc	s6,0x0
    80001756:	07eb0b13          	addi	s6,s6,126 # 800017d0 <main+0x9e>
    8000175a:	4ae5                	li	s5,25
    
//循环
    8000175c:	000a2783          	lw	a5,0(s4)
    80001760:	0009a703          	lw	a4,0(s3)
    80001764:	9ca1                	addw	s1,s1,s0
    80001766:	4681                	li	a3,0
    80001768:	9fb9                	addw	a5,a5,a4
    8000176a:	00f92023          	sw	a5,0(s2)
    8000176e:	45c2                	lw	a1,16(sp)
    80001770:	8626                	mv	a2,s1
    80001772:	855a                	mv	a0,s6
    80001774:	240d                	addiw	s0,s0,3
    80001776:	d43ff0ef          	jal	ra,800014b8 <printf>
    8000177a:	0a11                	addi	s4,s4,4
    8000177c:	0991                	addi	s3,s3,4
    8000177e:	0911                	addi	s2,s2,4
    80001780:	fd541ee3          	bne	s0,s5,8000175c <main+0x2a>

    80001784:	05c4861b          	addiw	a2,s1,92
    80001788:	0ff0000f          	fence

meton-robean avatar Jun 02 '20 14:06 meton-robean

  for(int i = 0; i < 128; ++i) {
    A[i] += 2;
  }
    80001048:	7101                	addi	sp,sp,-512
    8000104a:	878a                	mv	a5,sp
    8000104c:	0414                	addi	a3,sp,512

    8000104e:	4398                	lw	a4,0(a5)
    80001050:	0791                	addi	a5,a5,4
    80001052:	2709                	addiw	a4,a4,2
    80001054:	fee7ae23          	sw	a4,-4(a5)
    80001058:	fed79be3          	bne	a5,a3,8000104e <demo+0x6>

meton-robean avatar Jul 03 '20 09:07 meton-robean

for(int i = 0; i < 128; ++i ){
     B[i] = a + A[i];
     a = C[i] + c; 
}
   10194:	4394                	lw	a3,0(a5)
   10196:	9f35                	        addw	a4,a4,a3
   10198:	c198                	sw	a4,0(a1)
   1019a:	4218                	lw	a4,0(a2)
   1019c:	02e5073b          	mulw	a4,a0,a4
   101a0:	0791                	addi	a5,a5,4
   101a2:	0591                	addi	a1,a1,4
   101a4:	0611                	addi	a2,a2,4
   101a6:	ff0797e3          	bne	a5,a6,10194 <main+0x12>

meton-robean avatar Jul 03 '20 10:07 meton-robean

  for(int i=0; i<128; ++i){
     A[i] += i;
  }

409c                	lw	a5,0(s1)
9fa1                	        addw	a5,a5,s0
2405                	addiw	s0,s0,1
c09c                	        sw	a5,0(s1)
0491                	addi	s1,s1,4
 ff2417e3          	bne	s0,s2,800010a4 <demo+0x5c>

meton-robean avatar Jul 03 '20 10:07 meton-robean

  for(int i=0; i<128; i++)
  {
      A[i] = a + 128;
      a = B[i] * 2;
  }
    80001072:	4288                	lw	a0,0(a3)
    80001074:	0807879b          	addiw	a5,a5,128
    80001078:	c31c                	sw	a5,0(a4)
    8000107a:	0711                	addi	a4,a4,4
    8000107c:	0015179b          	slliw	a5,a0,0x1
    80001080:	0691                	addi	a3,a3,4
    80001082:	feb718e3          	bne	a4,a1,80001072 <demo+0x2a>

meton-robean avatar Jul 03 '20 10:07 meton-robean