MNN icon indicating copy to clipboard operation
MNN copied to clipboard

opt(RVV): Optimize core math and stride functions with intrinsics

Open ihb2032 opened this issue 1 month ago • 0 comments

Summary

Optimize the following functions using RISC-V Vector (RVV) intrinsics: `MNNAxByClampBroadcastUnit`, `MNNScaleAndAddBias`, `MNNCopyC4WithStride`, and `MNNAddC4WithStride`.

Environment

  • Platform: Banana PI BPI-F3
  • OS: EulixOS 3.0

Benchmark

<details>
<summary>Click to expand full test logs</summary>

[root@EulixOS ~]# ./test_scale_and_add_bias
planeNumber=4, biasNumber=4
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 0.24x
Test planeNumber=4, biasNumber=4: PASSED
planeNumber=1, biasNumber=1
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 0.00x
Test planeNumber=1, biasNumber=1: PASSED
planeNumber=8, biasNumber=3
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 0.75x
Test planeNumber=8, biasNumber=3: PASSED
planeNumber=4, biasNumber=8
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 0.45x
Test planeNumber=4, biasNumber=8: PASSED
planeNumber=0, biasNumber=4
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 0.80x
Test planeNumber=0, biasNumber=4: PASSED
planeNumber=4, biasNumber=0
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 0.00x
Test planeNumber=4, biasNumber=0: PASSED
planeNumber=65536, biasNumber=4
Scalar time: 0.0137 sec
RVV time   : 0.0062 sec
Speedup    : 2.22x
Test planeNumber=65536, biasNumber=4: PASSED

planeNumber=1048576, biasNumber=16
Scalar time: 0.8715 sec
RVV time   : 0.4023 sec
Speedup    : 2.17x
Test planeNumber=1048576, biasNumber=16: PASSED
planeNumber=16, biasNumber=1048576
Scalar time: 0.9608 sec
RVV time   : 1.0270 sec
Speedup    : 0.94x
Test planeNumber=16, biasNumber=1048576: PASSED
planeNumber=1, biasNumber=1048576
Scalar time: 0.1521 sec
RVV time   : 1.0211 sec
Speedup    : 0.15x
Test planeNumber=1, biasNumber=1048576: PASSED
planeNumber=1048576, biasNumber=1
Scalar time: 0.0544 sec
RVV time   : 0.0242 sec
Speedup    : 2.25x
Test planeNumber=1048576, biasNumber=1: PASSED
planeNumber=0, biasNumber=0
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 0.44x
Test planeNumber=0, biasNumber=0: PASSED
planeNumber=256, biasNumber=256
Scalar time: 0.0034 sec
RVV time   : 0.0016 sec
Speedup    : 2.12x
Test planeNumber=256, biasNumber=256: PASSED
planeNumber=997, biasNumber=997
Scalar time: 0.0524 sec
RVV time   : 0.0236 sec
Speedup    : 2.21x
Test planeNumber=997, biasNumber=997: PASSED

All tests PASSED 
[root@EulixOS ~]# ./test_add_c4_with_stride
srcStride=4, dstStride=4, count=4
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 0.05x
Test srcStride=4, dstStride=4, count=4: PASSED
srcStride=4, dstStride=4, count=1
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 0.00x
Test srcStride=4, dstStride=4, count=1: PASSED
srcStride=8, dstStride=8, count=3
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 1.00x
Test srcStride=8, dstStride=8, count=3: PASSED
srcStride=4, dstStride=8, count=5
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 1.00x
Test srcStride=4, dstStride=8, count=5: PASSED
srcStride=8, dstStride=4, count=7
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 0.44x
Test srcStride=8, dstStride=4, count=7: PASSED
srcStride=4, dstStride=4, count=0
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 0.00x
Test srcStride=4, dstStride=4, count=0: PASSED
srcStride=4, dstStride=4, count=65536
Scalar time: 0.0071 sec
RVV time   : 0.0016 sec
Speedup    : 4.39x
Test srcStride=4, dstStride=4, count=65536: PASSED
srcStride=4, dstStride=4, count=1024
Scalar time: 0.0001 sec
RVV time   : 0.0000 sec
Speedup    : 4.39x
Test srcStride=4, dstStride=4, count=1024: PASSED
srcStride=4, dstStride=4, count=2048
Scalar time: 0.0002 sec
RVV time   : 0.0000 sec
Speedup    : 4.86x
Test srcStride=4, dstStride=4, count=2048: PASSED
srcStride=8, dstStride=8, count=8192
Scalar time: 0.0009 sec
RVV time   : 0.0003 sec
Speedup    : 3.27x
Test srcStride=8, dstStride=8, count=8192: PASSED

srcStride=16, dstStride=16, count=1000000
Scalar time: 0.1086 sec
RVV time   : 0.0780 sec
Speedup    : 1.39x
Test srcStride=16, dstStride=16, count=1000000: PASSED

All tests PASSED 
[root@EulixOS ~]# ./test_ax
width=4, cStride=16, aStride=16, height=4
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 0.63x
Test Result: PASSED
width=7, cStride=32, aStride=32, height=4
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 1.95x
Test Result: PASSED
width=16, cStride=64, aStride=64, height=1
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 6.25x
Test Result: PASSED
width=4, cStride=20, aStride=16, height=8
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 1.35x
Test Result: PASSED
width=10, cStride=40, aStride=40, height=0
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 0.00x
Test Result: PASSED
width=0, cStride=40, aStride=40, height=10
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 1.25x
Test Result: PASSED
width=128, cStride=512, aStride=512, height=128
Scalar time: 0.0052 sec
RVV time   : 0.0005 sec
Speedup    : 11.40x
Test Result: PASSED
width=1024, cStride=4096, aStride=4096, height=512
Scalar time: 0.1691 sec
RVV time   : 0.0137 sec
Speedup    : 12.34x
Test Result: PASSED

All tests PASSED 
[root@EulixOS ~]# ./test_copy_c4_with_stride
srcStride=4, dstStride=4, count=4
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 0.06x
Test srcStride=4, dstStride=4, count=4: PASSED
srcStride=4, dstStride=4, count=1
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 0.00x
Test srcStride=4, dstStride=4, count=1: PASSED
srcStride=8, dstStride=8, count=3
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 1.00x
Test srcStride=8, dstStride=8, count=3: PASSED
srcStride=4, dstStride=8, count=5
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 0.00x
Test srcStride=4, dstStride=8, count=5: PASSED
srcStride=8, dstStride=4, count=7
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 1.00x
Test srcStride=8, dstStride=4, count=7: PASSED
srcStride=4, dstStride=4, count=0
Scalar time: 0.0000 sec
RVV time   : 0.0000 sec
Speedup    : 0.00x
Test srcStride=4, dstStride=4, count=0: PASSED
srcStride=4, dstStride=4, count=65536
Scalar time: 0.0060 sec
RVV time   : 0.0013 sec
Speedup    : 4.61x
Test srcStride=4, dstStride=4, count=65536: PASSED
srcStride=4, dstStride=4, count=1024
Scalar time: 0.0001 sec
RVV time   : 0.0000 sec
Speedup    : 5.76x
Test srcStride=4, dstStride=4, count=1024: PASSED
srcStride=4, dstStride=4, count=2048
Scalar time: 0.0002 sec
RVV time   : 0.0000 sec
Speedup    : 5.41x
Test srcStride=4, dstStride=4, count=2048: PASSED
srcStride=8, dstStride=8, count=8192
Scalar time: 0.0007 sec
RVV time   : 0.0003 sec
Speedup    : 2.84x
Test srcStride=8, dstStride=8, count=8192: PASSED
srcStride=16, dstStride=16, count=1000000
Scalar time: 0.0921 sec
RVV time   : 0.0550 sec
Speedup    : 1.67x
Test srcStride=16, dstStride=16, count=1000000: PASSED

All tests PASSED 

</details>

ihb2032 avatar Nov 26 '25 08:11 ihb2032