MNN
MNN copied to clipboard
opt(RVV): Optimize conv and Strassen functions with RVV intrinsics
Summary
Optimize `MNNConvRunForLineDepthwise`, `MNNDeconvRunForUnitDepthWise`, and `MNNStrassenMergeCFunction` using RISC-V Vector (RVV) intrinsics.
Environment
- Platform: Banana PI BPI-F3
- OS: EulixOS 3.0
Benchmark
<details>
<summary>Click to expand full test logs</summary>
[root@EulixOS ~]# ./test_deconv_run_for_unit_depth_wise
fw=4, fh=4
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.31x
Test fw=4, fh=4: PASSED
fw=1, fh=1
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.00x
Test fw=1, fh=1: PASSED
fw=3, fh=3
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.24x
Test fw=3, fh=3: PASSED
fw=5, fh=5
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.32x
Test fw=5, fh=5: PASSED
fw=7, fh=7
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.42x
Test fw=7, fh=7: PASSED
fw=8, fh=8
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.66x
Test fw=8, fh=8: PASSED
fw=16, fh=1
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.44x
Test fw=16, fh=1: PASSED
fw=1, fh=16
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.11x
Test fw=1, fh=16: PASSED
fw=0, fh=4
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.00x
Test fw=0, fh=4: PASSED
fw=4, fh=0
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.00x
Test fw=4, fh=0: PASSED
fw=32, fh=32
Scalar time: 0.0001 sec
RVV time : 0.0000 sec
Speedup : 2.02x
Test fw=32, fh=32: PASSED
fw=128, fh=64
Scalar time: 0.0006 sec
RVV time : 0.0005 sec
Speedup : 1.20x
Test fw=128, fh=64: PASSED
fw=1024, fh=4
Scalar time: 0.0003 sec
RVV time : 0.0001 sec
Speedup : 3.18x
Test fw=1024, fh=4: PASSED
fw=256, fh=256
Scalar time: 0.0050 sec
RVV time : 0.0018 sec
Speedup : 2.81x
Test fw=256, fh=256: PASSED
fw=997, fh=1
Scalar time: 0.0001 sec
RVV time : 0.0000 sec
Speedup : 3.36x
Test fw=997, fh=1: PASSED
fw=1024, fh=1024
Scalar time: 0.0801 sec
RVV time : 0.0279 sec
Speedup : 2.87x
Test fw=1024, fh=1024: PASSED
fw=2048, fh=2048
Scalar time: 0.3204 sec
RVV time : 0.1315 sec
Speedup : 2.44x
Test fw=2048, fh=2048: PASSED
fw=4096, fh=4096
Scalar time: 1.2814 sec
RVV time : 0.4732 sec
Speedup : 2.71x
Test fw=4096, fh=4096: PASSED
All tests PASSED
[root@EulixOS ~]# ./test_conv_run_for_line_depthwise
W=64, H=32, KW=3, KH=3
Scalar time: 0.0031 sec
RVV time : 0.0002 sec
Speedup : 13.61x
Test W=64 H=32 K=3x3: PASSED
W=16, H=16, KW=1, KH=1
Scalar time: 0.0001 sec
RVV time : 0.0000 sec
Speedup : 4.38x
Test W=16 H=16 K=1x1: PASSED
W=10, H=10, KW=5, KH=5
Scalar time: 0.0003 sec
RVV time : 0.0002 sec
Speedup : 2.06x
Test W=10 H=10 K=5x5: PASSED
W=128, H=64, KW=3, KH=3
Scalar time: 0.0126 sec
RVV time : 0.0009 sec
Speedup : 14.19x
Test W=128 H=64 K=3x3: PASSED
W=7, H=7, KW=3, KH=3
Scalar time: 0.0001 sec
RVV time : 0.0000 sec
Speedup : 1.62x
Test W=7 H=7 K=3x3: PASSED
W=1, H=1, KW=3, KH=3
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.38x
Test W=1 H=1 K=3x3: PASSED
W=32, H=32, KW=1, KH=5
Scalar time: 0.0011 sec
RVV time : 0.0001 sec
Speedup : 7.74x
Test W=32 H=32 K=1x5: PASSED
W=32, H=32, KW=5, KH=1
Scalar time: 0.0010 sec
RVV time : 0.0001 sec
Speedup : 7.67x
Test W=32 H=32 K=5x1: PASSED
W=256, H=128, KW=3, KH=3
Scalar time: 0.0504 sec
RVV time : 0.0036 sec
Speedup : 14.08x
Test W=256 H=128 K=3x3: PASSED
W=1920, H=1, KW=3, KH=3
Scalar time: 0.0029 sec
RVV time : 0.0002 sec
Speedup : 12.49x
Test W=1920 H=1 K=3x3: PASSED
W=100, H=100, KW=3, KH=3
Scalar time: 0.0150 sec
RVV time : 0.0013 sec
Speedup : 11.22x
Test W=100 H=100 K=3x3: PASSED
W=1024, H=1024, KW=4, KH=4
Scalar time: 2.4305 sec
RVV time : 0.2001 sec
Speedup : 12.14x
Test W=1024 H=1024 K=4x4: PASSED
W=2048, H=2048, KW=4, KH=4
Scalar time: 9.7223 sec
RVV time : 0.8005 sec
Speedup : 12.15x
Test W=2048 H=2048 K=4x4: PASSED
W=1024, H=1024, KW=10, KH=10
Scalar time: 12.6255 sec
RVV time : 1.5203 sec
Speedup : 8.30x
Test W=1024 H=1024 K=10x10: PASSED
W=2048, H=2048, KW=10, KH=10
Scalar time: 50.5326 sec
RVV time : 6.0553 sec
Speedup : 8.35x
Test W=2048 H=2048 K=10x10: PASSED
W=1024, H=1024, KW=7, KH=7
Scalar time: 6.4679 sec
RVV time : 0.5984 sec
Speedup : 10.81x
Test W=1024 H=1024 K=7x7: PASSED
W=1024, H=1024, KW=1, KH=7
Scalar time: 1.4948 sec
RVV time : 0.2010 sec
Speedup : 7.44x
Test W=1024 H=1024 K=1x7: PASSED
W=1024, H=1024, KW=7, KH=1
Scalar time: 1.2625 sec
RVV time : 0.0920 sec
Speedup : 13.72x
Test W=1024 H=1024 K=7x1: PASSED
All tests PASSED
[root@EulixOS ~]# ./test_strassen_merge_c_function
cStride=16, eSub=4, hSub=4
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.41x
Test cStride=16, eSub=4, hSub=4: PASSED
cStride=4, eSub=1, hSub=1
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 2.00x
Test cStride=4, eSub=1, hSub=1: PASSED
cStride=12, eSub=3, hSub=3
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 2.33x
Test cStride=12, eSub=3, hSub=3: PASSED
cStride=16, eSub=4, hSub=5
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 3.50x
Test cStride=16, eSub=4, hSub=5: PASSED
cStride=28, eSub=7, hSub=4
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 7.88x
Test cStride=28, eSub=7, hSub=4: PASSED
cStride=4, eSub=0, hSub=0
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.00x
Test cStride=4, eSub=0, hSub=0: PASSED
cStride=4096, eSub=1024, hSub=1024
Scalar time: 0.5247 sec
RVV time : 0.0410 sec
Speedup : 12.81x
Test cStride=4096, eSub=1024, hSub=1024: PASSED
cStride=8192, eSub=2048, hSub=2048
Scalar time: 2.1006 sec
RVV time : 0.1630 sec
Speedup : 12.89x
Test cStride=8192, eSub=2048, hSub=2048: PASSED
All tests PASSED
</details>