MNN
opt(RVV): Optimize Softmax and ReluWithSlopeChannel with intrinsics
Summary
Optimize `MNNSoftmax` and `MNNReluWithSlopeChannel` using RISC-V Vector (RVV) intrinsics.
Environment
- Platform: Banana Pi BPI-F3
- OS: EulixOS 3.0
Benchmark
<details>
<summary>Click to expand full test logs</summary>
[root@EulixOS ~]# ./test_relu_with_slope_channel
sizeQuad=4, depthQuad=4
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.11x
Test sizeQuad=4, depthQuad=4: PASSED
sizeQuad=1, depthQuad=1
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.62x
Test sizeQuad=1, depthQuad=1: PASSED
sizeQuad=8, depthQuad=3
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 1.75x
Test sizeQuad=8, depthQuad=3: PASSED
sizeQuad=3, depthQuad=8
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.72x
Test sizeQuad=3, depthQuad=8: PASSED
sizeQuad=0, depthQuad=4
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.00x
Test sizeQuad=0, depthQuad=4: PASSED
sizeQuad=4, depthQuad=0
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.00x
Test sizeQuad=4, depthQuad=0: PASSED
sizeQuad=65536, depthQuad=4
Scalar time: 0.0382 sec
RVV time : 0.0063 sec
Speedup : 6.07x
Test sizeQuad=65536, depthQuad=4: PASSED
sizeQuad=1048576, depthQuad=8
Scalar time: 1.2220 sec
RVV time : 0.2041 sec
Speedup : 5.99x
Test sizeQuad=1048576, depthQuad=8: PASSED
sizeQuad=16384, depthQuad=64
Scalar time: 0.1534 sec
RVV time : 0.0247 sec
Speedup : 6.20x
Test sizeQuad=16384, depthQuad=64: PASSED
sizeQuad=262144, depthQuad=16
Scalar time: 0.6115 sec
RVV time : 0.1029 sec
Speedup : 5.95x
Test sizeQuad=262144, depthQuad=16: PASSED
sizeQuad=1, depthQuad=1024
Scalar time: 0.0002 sec
RVV time : 0.0008 sec
Speedup : 0.22x
Test sizeQuad=1, depthQuad=1024: PASSED
sizeQuad=4194304, depthQuad=2
Scalar time: 1.2222 sec
RVV time : 0.2074 sec
Speedup : 5.89x
Test sizeQuad=4194304, depthQuad=2: PASSED
sizeQuad=32768, depthQuad=128
Scalar time: 0.6101 sec
RVV time : 0.1047 sec
Speedup : 5.83x
Test sizeQuad=32768, depthQuad=128: PASSED
sizeQuad=524288, depthQuad=32
Scalar time: 2.4432 sec
RVV time : 0.4132 sec
Speedup : 5.91x
Test sizeQuad=524288, depthQuad=32: PASSED
sizeQuad=0, depthQuad=1024
Scalar time: 0.0000 sec
RVV time : 0.0001 sec
Speedup : 0.74x
Test sizeQuad=0, depthQuad=1024: PASSED
sizeQuad=1048576, depthQuad=0
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.00x
Test sizeQuad=1048576, depthQuad=0: PASSED
sizeQuad=65536, depthQuad=256
Scalar time: 2.4446 sec
RVV time : 0.4175 sec
Speedup : 5.85x
Test sizeQuad=65536, depthQuad=256: PASSED
sizeQuad=16777216, depthQuad=4
Scalar time: 9.7753 sec
RVV time : 1.6606 sec
Speedup : 5.89x
Test sizeQuad=16777216, depthQuad=4: PASSED
All tests PASSED
[root@EulixOS ~]# ./test_softmax
size=1
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.09x
Test size=1: PASSED
size=3
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 1.00x
Test size=3: PASSED
size=4
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 0.50x
Test size=4: PASSED
size=100
Scalar time: 0.0000 sec
RVV time : 0.0000 sec
Speedup : 5.15x
Test size=100: PASSED
size=1024
Scalar time: 0.0002 sec
RVV time : 0.0000 sec
Speedup : 6.32x
Test size=1024: PASSED
size=65536
Scalar time: 0.0110 sec
RVV time : 0.0015 sec
Speedup : 7.10x
Test size=65536: PASSED
size=1000000
Scalar time: 0.1635 sec
RVV time : 0.0233 sec
Speedup : 7.01x
Test size=1000000: PASSED
size=10000000
Scalar time: 1.6323 sec
RVV time : 0.2320 sec
Speedup : 7.04x
Test size=10000000: PASSED
All tests PASSED
</details>