aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorYuuta Liang <yuuta@yuuta.moe>2023-08-10 14:43:10 +0800
committerYuuta Liang <yuuta@yuuta.moe>2023-08-10 15:04:34 +0800
commitee24732cde941a9ff2f36bc19d13730e87430ed1 (patch)
treee31b3aed6f7d7b5e24bba8dcc811d6d954c27ccd
parentc2814bb8cc32362822b5a546c8de21b3a0e89032 (diff)
downloadlibvpx-riscv64_android_optimization.tar
libvpx-riscv64_android_optimization.tar.gz
libvpx-riscv64_android_optimization.tar.bz2
libvpx-riscv64_android_optimization.zip
RISC-V: optimize vp8_copy_mem with RVVriscv64_android_optimization
Test environment: 8c 1804Mhz i5-1140G7 RVV Impl: % CROSS=riscv64-unknown-linux-gnu- configure --target=riscv64-linux-gcc \ --enable-debug --enable-gprof && make -j % time qemu-riscv64 -cpu rv64,v=true,zba=true,vlen=128 -L /path/to/sysroot/ \ ./vpxenc --codec=vp8 -w 352 -h 288 -o akiyol.vpx ./akiyo_cif.yuv Pass 1/1 frame 300/300 314977B 8399b/f 251981b/s 92226 ms (3.25 fps) user 1m30.108s % gprof -abp ./vpxenc ./gmon.out | grep vp8_copy_mem 1.36 53.09 1.04 1025863 0.00 0.00 vp8_copy_mem16x16_rvv 0.72 59.01 0.55 1641368 0.00 0.00 vp8_copy_mem8x8_rvv 0.05 65.95 0.04 764377 0.00 0.00 vp8_copy_mem8x4_rvv C Impl: % CROSS=riscv64-unknown-linux-gnu- configure --target=generic-gnu --enable-debug \ --enable-gprof && make -j % time qemu-riscv64 -cpu rv64,v=true,zba=true,vlen=128 -L /path/to/sysroot/ \ ./vpxenc --codec=vp8 -w 352 -h 288 -o akiyol.vpx ./akiyo_cif.yuv Pass 1/1 frame 300/300 314977B 8399b/f 251981b/s 98417 ms (3.05 fps) user 1m36.146s % gprof -abp ./vpxenc ./gmon.out | grep vp8_copy_mem 0.38 63.96 0.31 vp8_copy_mem8x4_c 0.04 70.61 0.03 204336 0.00 0.00 vp8_copy_mem16x16_c Signed-off-by: Yuuta Liang <yuuta@yuuta.moe>
-rw-r--r--vp8/common/riscv/copymem_rvv.c54
-rw-r--r--vp8/common/rtcd_defs.pl6
-rw-r--r--vp8/vp8_common.mk1
3 files changed, 58 insertions, 3 deletions
diff --git a/vp8/common/riscv/copymem_rvv.c b/vp8/common/riscv/copymem_rvv.c
new file mode 100644
index 000000000..1999a4fad
--- /dev/null
+++ b/vp8/common/riscv/copymem_rvv.c
@@ -0,0 +1,54 @@
+#include <riscv_vector.h>
+#include "./vpx_config.h"
+#include "./vp8_rtcd.h"
+
+void vp8_copy_mem16x16_rvv(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride) {
+ vuint64m2_t v;
+ size_t vl;
+ size_t n = 16;
+ while (n) {
+ /* Assume e64 is supported. */
+ vl = __riscv_vsetvl_e64m2(n);
+
+ v = __riscv_vlse64_v_u64m2((uint64_t *) src, src_stride, vl);
+ __riscv_vsse64_v_u64m2((uint64_t *) dst, dst_stride, v, vl);
+
+ v = __riscv_vlse64_v_u64m2((uint64_t *) (src + 8), src_stride, vl);
+ __riscv_vsse64_v_u64m2((uint64_t *) (dst + 8), dst_stride, v, vl);
+
+ n -= vl;
+ src += src_stride * vl;
+ dst += dst_stride * vl;
+ }
+}
+
+void vp8_copy_mem8x8_rvv(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride) {
+ size_t n = 8;
+ size_t vl;
+ vuint64m2_t v;
+
+ while (n) {
+ /* Assume e64 is supported. */
+ vl = __riscv_vsetvl_e64m2(n);
+
+ v = __riscv_vlse64_v_u64m2((uint64_t *) src, src_stride, vl);
+ __riscv_vsse64_v_u64m2((uint64_t *) dst, dst_stride, v, vl);
+
+ n -= vl;
+ src += src_stride * vl;
+ dst += dst_stride * vl;
+ }
+}
+
+void vp8_copy_mem8x4_rvv(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride) {
+ size_t vl;
+ vuint64m2_t v;
+
+ /* VL must be 4 because VLEN must be >= 128. Assume e64 is supported. */
+ vl = __riscv_vsetvl_e64m2(4);
+ v = __riscv_vlse64_v_u64m2((uint64_t *) src, src_stride, vl);
+ __riscv_vsse64_v_u64m2((uint64_t *) dst, dst_stride, v, vl);
+}
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 3831a3c35..4fad0d7f6 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -114,13 +114,13 @@ specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa mmi lsx/;
# RECON
#
add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride";
-specialize qw/vp8_copy_mem16x16 sse2 neon dspr2 msa mmi/;
+specialize qw/vp8_copy_mem16x16 sse2 neon dspr2 msa mmi rvv/;
add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride";
-specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa mmi/;
+specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa mmi rvv/;
add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride";
-specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa mmi/;
+specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa mmi rvv/;
#
# Postproc
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 54bb5eadb..c24415052 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -148,5 +148,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c
# common (rvv intrinsics)
VP8_COMMON_SRCS-$(HAVE_RVV) += common/riscv/sixtap_predict_rvv.c
+VP8_COMMON_SRCS-$(HAVE_RVV) += common/riscv/copymem_rvv.c
$(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl))