MNN
opt(RVV): Optimize transpose functions with intrinsics
## Summary
Optimize `MNNTranspose16Bit` and `MNNTranspose32Bit` using RISC-V Vector (RVV) intrinsics.
## Environment
- Platform: Banana Pi BPI-F3
- OS: EulixOS 3.0
## Benchmark
<details>
<summary>Click to expand full test logs</summary>
[root@EulixOS ~]# ./test_transpose_16bit
w=4, h=4, strideSrc=4, strideDst=4
Scalar time: 0.000001 sec
RVV time : 0.000018 sec
Speedup : 0.05x
Test w=4, h=4: PASSED
w=7, h=7, strideSrc=7, strideDst=7
Scalar time: 0.000001 sec
RVV time : 0.000002 sec
Speedup : 0.62x
Test w=7, h=7: PASSED
w=16, h=16, strideSrc=16, strideDst=16
Scalar time: 0.000007 sec
RVV time : 0.000003 sec
Speedup : 2.23x
Test w=16, h=16: PASSED
w=32, h=8, strideSrc=8, strideDst=32
Scalar time: 0.000007 sec
RVV time : 0.000002 sec
Speedup : 3.75x
Test w=32, h=8: PASSED
w=8, h=32, strideSrc=32, strideDst=8
Scalar time: 0.000007 sec
RVV time : 0.000006 sec
Speedup : 1.20x
Test w=8, h=32: PASSED
w=63, h=65, strideSrc=65, strideDst=63
Scalar time: 0.000098 sec
RVV time : 0.000013 sec
Speedup : 7.47x
Test w=63, h=65: PASSED
w=128, h=128, strideSrc=128, strideDst=128
Scalar time: 0.000416 sec
RVV time : 0.000053 sec
Speedup : 7.82x
Test w=128, h=128: PASSED
w=256, h=256, strideSrc=256, strideDst=256
Scalar time: 0.002206 sec
RVV time : 0.001256 sec
Speedup : 1.76x
Test w=256, h=256: PASSED
w=512, h=512, strideSrc=512, strideDst=512
Scalar time: 0.018305 sec
RVV time : 0.006567 sec
Speedup : 2.79x
Test w=512, h=512: PASSED
w=1024, h=1024, strideSrc=1024, strideDst=1024
Scalar time: 0.048515 sec
RVV time : 0.036964 sec
Speedup : 1.31x
Test w=1024, h=1024: PASSED
w=1920, h=1080, strideSrc=1080, strideDst=1920
Scalar time: 0.123896 sec
RVV time : 0.070017 sec
Speedup : 1.77x
Test w=1920, h=1080: PASSED
All tests PASSED
[root@EulixOS ~]# ./test_transpose_32bit
w=4, h=4, strideSrc=4, strideDst=4
Scalar time: 0.000002 sec
RVV time : 0.000018 sec
Speedup : 0.12x
Test w=4, h=4: PASSED
w=7, h=7, strideSrc=7, strideDst=7
Scalar time: 0.000001 sec
RVV time : 0.000002 sec
Speedup : 0.44x
Test w=7, h=7: PASSED
w=16, h=16, strideSrc=16, strideDst=16
Scalar time: 0.000006 sec
RVV time : 0.000003 sec
Speedup : 1.92x
Test w=16, h=16: PASSED
w=32, h=8, strideSrc=8, strideDst=32
Scalar time: 0.000007 sec
RVV time : 0.000001 sec
Speedup : 7.25x
Test w=32, h=8: PASSED
w=8, h=32, strideSrc=32, strideDst=8
Scalar time: 0.000007 sec
RVV time : 0.000004 sec
Speedup : 1.71x
Test w=8, h=32: PASSED
w=63, h=65, strideSrc=65, strideDst=63
Scalar time: 0.000101 sec
RVV time : 0.000012 sec
Speedup : 8.46x
Test w=63, h=65: PASSED
w=128, h=128, strideSrc=128, strideDst=128
Scalar time: 0.000568 sec
RVV time : 0.000224 sec
Speedup : 2.54x
Test w=128, h=128: PASSED
w=256, h=256, strideSrc=256, strideDst=256
Scalar time: 0.002424 sec
RVV time : 0.001196 sec
Speedup : 2.03x
Test w=256, h=256: PASSED
w=512, h=512, strideSrc=512, strideDst=512
Scalar time: 0.012749 sec
RVV time : 0.009645 sec
Speedup : 1.32x
Test w=512, h=512: PASSED
w=1024, h=1024, strideSrc=1024, strideDst=1024
Scalar time: 0.257813 sec
RVV time : 0.222458 sec
Speedup : 1.16x
Test w=1024, h=1024: PASSED
w=1920, h=1080, strideSrc=1080, strideDst=1920
Scalar time: 0.133701 sec
RVV time : 0.084568 sec
Speedup : 1.58x
Test w=1920, h=1080: PASSED
All tests PASSED
</details>