ResearchNote icon indicating copy to clipboard operation
ResearchNote copied to clipboard

几类循环模式的指令集角度观察(2)

Open meton-robean opened this issue 5 years ago • 2 comments

往期 #31 几类循环模式的指令集角度观察

simple sgemm

这个例子来自论文XLOOP 提供的benchmark

// Integer matrix-multiply kernel over flat int arrays indexed with a stride
// of `size`.
//
// For every pair (mm, nn) with 0 <= mm, nn < size it stores
//   C[mm + nn*size] = sum over i in [0, size) of A[mm + i*size] * B[nn + i*size]
// A and B are only read, C is only written. When size <= 0 no element of C
// is touched.
void sgemm_scalar_int( int C[], int A[], int B[], int size )
{
  int outer = 0;
  while ( outer < size ) {
    int inner = 0;
    while ( inner < size ) {
      // Dot product of the two strided vectors selected by outer/inner.
      int dot = 0;
      for ( int step = 0; step < size; step++ ) {
        const int offset = step * size;
        dot += A[outer + offset] * B[inner + offset];
      }
      C[outer + inner * size] = dot;
      inner++;
    }
    outer++;
  }
}

000000008000110a <sgemm_scalar_int>:
    8000110a:	06d05363          	blez	a3,80001170 <sgemm_scalar_int+0x66>
    8000110e:	1141                	addi	sp,sp,-16
    80001110:	e422                	sd	s0,8(sp)
    80001112:	82aa                	mv	t0,a0
    80001114:	8336                	mv	t1,a3
    80001116:	8432                	mv	s0,a2
    80001118:	8fae                	mv	t6,a1
    8000111a:	00269513          	slli	a0,a3,0x2
    8000111e:	4381                	li	t2,0
    80001120:	8f22                	mv	t5,s0
    80001122:	8e96                	mv	t4,t0
    80001124:	4e01                	li	t3,0
    80001126:	867a                	mv	a2,t5
    80001128:	86fe                	mv	a3,t6
    8000112a:	4781                	li	a5,0
    8000112c:	4581                	li	a1,0

    ## 最内层循环的汇编代码 ------------------------------------
    8000112e:	4298                lw	a4,0(a3)
    80001130:	00062803          	lw	a6,0(a2)
    80001134:	88be                mv	a7,a5
    80001136:	2785                addiw	a5,a5,1
    80001138:	0307073b          	mulw	a4,a4,a6
    8000113c:	96aa                add	a3,a3,a0
    8000113e:	962a                add	a2,a2,a0
    80001140:	9db9                addw	a1,a1,a4
    80001142:	fef316e3          	bne	t1,a5,8000112e <sgemm_scalar_int+0x24>

    ## 最内层循环的汇编代码 end ----------------------------------

    80001146:	00bea023          	sw	a1,0(t4)
    8000114a:	001e079b          	addiw	a5,t3,1
    8000114e:	9eaa                	add	t4,t4,a0
    80001150:	0f11                	addi	t5,t5,4
    80001152:	011e0463          	beq	t3,a7,8000115a <sgemm_scalar_int+0x50>
    80001156:	8e3e                	mv	t3,a5
    80001158:	b7f9                	j	80001126 <sgemm_scalar_int+0x1c>
    8000115a:	0013879b          	addiw	a5,t2,1


    8000115e:	0f91                	addi	t6,t6,4
    80001160:	0291                	addi	t0,t0,4
    80001162:	01c38463          	beq	t2,t3,8000116a <sgemm_scalar_int+0x60>
    80001166:	83be                	mv	t2,a5
    80001168:	bf65                	j	80001120 <sgemm_scalar_int+0x16>
    8000116a:	6422                	ld	s0,8(sp)


    8000116c:	0141                	addi	sp,sp,16
    8000116e:	8082                	ret
    80001170:	8082                	ret

meton-robean avatar Feb 25 '20 12:02 meton-robean

dynamic programming (这个例子来自 xloop 的 dynprog)

 // Dynamic-programming kernel over two flat long long arrays, c and W, that
 // are indexed as length x length row-major tables (row * length + column).
 //
 // For each i in [0, length-1) and each j in (i, length):
 //   s            = sum over k with i < k < j of (c[i*length+k] + c[k*length+j])
 //   c[i*length+j] = s + W[i*length+j]
 // After each row i, c[length - 1] is added to a running total, and that
 // total is stored to *out at the end. c is updated in place; W is only read.
 //
 // NOTE(review): s and out_l are int while c, W and *out are long long, so
 // every accumulation is narrowed to int (the listing below uses addw for
 // these adds). Confirm whether that truncation is intended.
 //
 // noinline asks the compiler to keep this as a separate function.
 __attribute__((noinline))
  void dynprog_scalar_longlong( int length, long long *c, long long *W, long long *out )
  {

    // Running total written to *out once all rows are done.
    int out_l = 0;
    int i, j, k;
    for (i = 0; i < length - 1; i++)
    {
      
      for (j = i + 1; j < length; j++)
      {

        // Partial sum for cell (i, j); restarted for every j.
        int s = 0;

        // The two fence statements bracket the innermost k loop; presumably
        // they serve as markers for locating that loop in the disassembly.
        asm volatile ("fence");
        // k runs strictly between i and j, so the loop is empty when j == i + 1.
        for (k = i + 1; k < j; k++)
          s += c[i * length + k] + c[k * length + j];
        // Only this second fence carries a "memory" clobber.
        asm volatile ("fence" ::: "memory");

        c[i * length + j] = s + W[i * length + j];
        

      }
      
      // The index length - 1 does not depend on i: the same element of c is
      // read on every outer iteration.
      out_l += c[length - 1];
    }

    *out = out_l;
  }
000000008000115e <dynprog_scalar_longlong>:
    8000115e:	7159                	addi	sp,sp,-112
    80001160:	f4a2                	sd	s0,104(sp)
    80001162:	f0a6                	sd	s1,96(sp)
    80001164:	ecca                	sd	s2,88(sp)
    80001166:	e8ce                	sd	s3,80(sp)
    80001168:	e4d2                	sd	s4,72(sp)
    8000116a:	e0d6                	sd	s5,64(sp)
    8000116c:	fc5a                	sd	s6,56(sp)
    8000116e:	f85e                	sd	s7,48(sp)
    80001170:	f462                	sd	s8,40(sp)
    80001172:	f066                	sd	s9,32(sp)
    80001174:	ec6a                	sd	s10,24(sp)
    80001176:	e86e                	sd	s11,16(sp)
    80001178:	4785                	li	a5,1
    8000117a:	e436                	sd	a3,8(sp)
    8000117c:	0ea7de63          	bge	a5,a0,80001278 <dynprog_scalar_longlong+0x11a>
    80001180:	00351813          	slli	a6,a0,0x3
    80001184:	ff880d93          	addi	s11,a6,-8
    80001188:	00050d1b          	sext.w	s10,a0
    8000118c:	9f89                	subw	a5,a5,a0
    8000118e:	8f2a                	mv	t5,a0
    80001190:	8fae                	mv	t6,a1
    80001192:	83b2                	mv	t2,a2
    80001194:	9dae                	add	s11,s11,a1
    80001196:	e03e                	sd	a5,0(sp)
    80001198:	8c6a                	mv	s8,s10
    8000119a:	4b01                	li	s6,0
    8000119c:	4a85                	li	s5,1
    8000119e:	4a01                	li	s4,0
    800011a0:	4b81                	li	s7,0
    800011a2:	4c81                	li	s9,0
    800011a4:	01058993          	addi	s3,a1,16
    800011a8:	000a889b          	sext.w	a7,s5
    800011ac:	82c6                	mv	t0,a7
    800011ae:	09e8d663          	bge	a7,t5,8000123a <dynprog_scalar_longlong+0xdc>
    800011b2:	015a0333          	add	t1,s4,s5
    800011b6:	030e                	slli	t1,t1,0x3
    800011b8:	414c0eb3          	sub	t4,s8,s4
    800011bc:	006f8e33          	add	t3,t6,t1
    800011c0:	0e8e                	slli	t4,t4,0x3
    800011c2:	8972                	mv	s2,t3
    800011c4:	9efe                	add	t4,t4,t6
    800011c6:	ffeb049b          	addiw	s1,s6,-2
    800011ca:	017a0433          	add	s0,s4,s7
    800011ce:	0ff0000f          	fence

    800011d2:	0512d763          	bge	t0,a7,80001220 <dynprog_scalar_longlong+0xc2>
    800011d6:	0114853b          	addw	a0,s1,a7
    800011da:	1502                	slli	a0,a0,0x20
    800011dc:	9101                	srli	a0,a0,0x20
    800011de:	9522                	add	a0,a0,s0
    800011e0:	050e                	slli	a0,a0,0x3
    800011e2:	01d306b3          	add	a3,t1,t4
    800011e6:	954e                	add	a0,a0,s3
    800011e8:	87ca                	mv	a5,s2
    800011ea:	4601                	li	a2,0
    
    ## 最内层循环 ------------------------------------------
    800011ec:	6398                	ld	a4,0(a5)
    800011ee:	628c                	ld	a1,0(a3)
    800011f0:	07a1                	addi	a5,a5,8
    800011f2:	96c2                	add	a3,a3,a6
    800011f4:	9f2d                	addw	a4,a4,a1
    800011f6:	9e39                	addw	a2,a2,a4
    800011f8:	fef51ae3          	bne	a0,a5,800011ec <dynprog_scalar_longlong+0x8e>
    ## 最内层循环 end----------------------------------------
    800011fc:	0ff0000f          	fence


    80001200:	006387b3          	add	a5,t2,t1
    80001204:	639c                	ld	a5,0(a5)
    80001206:	2885                	addiw	a7,a7,1
    80001208:	0321                	addi	t1,t1,8
    8000120a:	963e                	add	a2,a2,a5
    8000120c:	00ce3023          	sd	a2,0(t3)
    80001210:	031f0563          	beq	t5,a7,8000123a <dynprog_scalar_longlong+0xdc>
    80001214:	006f8e33          	add	t3,t6,t1
    80001218:	0ff0000f          	fence
    8000121c:	fb12cde3          	blt	t0,a7,800011d6 <dynprog_scalar_longlong+0x78>
    80001220:	4601                	li	a2,0
    80001222:	0ff0000f          	fence
    80001226:	006387b3          	add	a5,t2,t1
    8000122a:	639c                	ld	a5,0(a5)
    8000122c:	2885                	addiw	a7,a7,1
    8000122e:	0321                	addi	t1,t1,8
    80001230:	963e                	add	a2,a2,a5
    80001232:	00ce3023          	sd	a2,0(t3)
    80001236:	fd1f1fe3          	bne	t5,a7,80001214 <dynprog_scalar_longlong+0xb6>
    8000123a:	000db783          	ld	a5,0(s11)
    8000123e:	3b7d                	addiw	s6,s6,-1
    80001240:	0b85                	addi	s7,s7,1
    80001242:	01978cbb          	addw	s9,a5,s9
    80001246:	6782                	ld	a5,0(sp)
    80001248:	01aa0a3b          	addw	s4,s4,s10
    8000124c:	0a85                	addi	s5,s5,1
    8000124e:	018d0c3b          	addw	s8,s10,s8
    80001252:	f4fb1be3          	bne	s6,a5,800011a8 <dynprog_scalar_longlong+0x4a>
    80001256:	67a2                	ld	a5,8(sp)
    80001258:	7426                	ld	s0,104(sp)
    8000125a:	7486                	ld	s1,96(sp)
    8000125c:	0197b023          	sd	s9,0(a5)
    80001260:	6966                	ld	s2,88(sp)
    80001262:	69c6                	ld	s3,80(sp)
    80001264:	6a26                	ld	s4,72(sp)
    80001266:	6a86                	ld	s5,64(sp)
    80001268:	7b62                	ld	s6,56(sp)
    8000126a:	7bc2                	ld	s7,48(sp)
    8000126c:	7c22                	ld	s8,40(sp)
    8000126e:	7c82                	ld	s9,32(sp)
    80001270:	6d62                	ld	s10,24(sp)
    80001272:	6dc2                	ld	s11,16(sp)
    80001274:	6165                	addi	sp,sp,112
    80001276:	8082                	ret
    80001278:	4c81                	li	s9,0
    8000127a:	bff1                	j	80001256 <dynprog_scalar_longlong+0xf8>

meton-robean avatar Mar 10 '20 14:03 meton-robean

warshall computing 例子来自xloop的benchmark

  // Floyd-Warshall style relaxation over an n x n float matrix stored
  // row-major in a flat array.
  //
  // path_in is first copied into path (n * n floats). Then, for every
  // k, i, j in [0, n):
  //   path[i*n+j] = min(path[i*n+j], path[i*n+k] + path[k*n+j])
  // where the existing value is kept only when it is strictly smaller than
  // the sum. path is updated in place; path_in is only read by the copy.
  //
  // The i loop body is braced so that both fence statements and the j loop
  // execute once per i. Without the braces the i loop contained only the
  // first fence, and the j loop then ran with i == n, accessing
  // path[n*n + j] past the end of the matrix. The disassembly that follows
  // this function in the note shows a fence-only loop, i.e. it appears to
  // have been generated from that unbraced version.
  //
  // noinline asks the compiler to keep this as a separate function.
  __attribute__((noinline))
  void warshall_scalar( int n, float *path, float *path_in )
  {
    int i, j, k;

    // initially copy the input path to path
    memcpy( path, path_in, sizeof(float) * n * n );

    for (k = 0; k < n; k++)
    {
      for (i = 0; i < n; i++)
      {
        // The two fences bracket the innermost j loop; the innermost loop
        // contains a conditional (the ternary select below).
        asm volatile ("fence");
        for (j = 0; j < n; j++)
          path[i*n+j] = path[i*n+j] < path[i*n+k] + path[k*n+j] ?
                        path[i*n+j] : path[i*n+k] + path[k*n+j];
        asm volatile ("fence" ::: "memory");
      }
    }
  }

    80001098:	0ff0000f          	fence
    8000109c:	86be                	mv	a3,a5
    8000109e:	2785                	addiw	a5,a5,1
    800010a0:	fef41ce3          	bne	s0,a5,80001098 <warshall_scalar+0x50>
    800010a4:	879a                	mv	a5,t1


    800010a6:	00b78733          	add	a4,a5,a1
    800010aa:	9742                	add	a4,a4,a6
    800010ac:	00072787          	flw	fa5,0(a4)
    800010b0:	00062687          	flw	fa3,0(a2)
    800010b4:	0007a707          	flw	fa4,0(a5)
    800010b8:	00d7f7d3          	fadd.s	fa5,fa5,fa3
    800010bc:	a0f71753          	flt.s	a4,fa4,fa5
    800010c0:	e319                	bnez	a4,800010c6 <warshall_scalar+0x7e>

    800010c2:	20f78753          	fmv.s	fa4,fa5
    800010c6:	0791                	addi	a5,a5,4


    800010c8:	fee7ae27          	fsw	fa4,-4(a5)
    800010cc:	fcf51de3          	bne	a0,a5,800010a6 <warshall_scalar+0x5e>
    800010d0:	0ff0000f          	fence

meton-robean avatar Mar 13 '20 10:03 meton-robean