From ee24732cde941a9ff2f36bc19d13730e87430ed1 Mon Sep 17 00:00:00 2001 From: Yuuta Liang Date: Thu, 10 Aug 2023 14:43:10 +0800 Subject: RISC-V: optimize vp8_copy_mem with RVV Test environment: 8c 1804Mhz i5-1140G7 RVV Impl: % CROSS=riscv64-unknown-linux-gnu- configure --target=riscv64-linux-gcc \ --enable-debug --enable-gprof && make -j % time qemu-riscv64 -cpu rv64,v=true,zba=true,vlen=128 -L /path/to/sysroot/ \ ./vpxenc --codec=vp8 -w 352 -h 288 -o akiyol.vpx ./akiyo_cif.yuv Pass 1/1 frame 300/300 314977B 8399b/f 251981b/s 92226 ms (3.25 fps) user 1m30.108s % gprof -abp ./vpxenc ./gmon.out | grep vp8_copy_mem 1.36 53.09 1.04 1025863 0.00 0.00 vp8_copy_mem16x16_rvv 0.72 59.01 0.55 1641368 0.00 0.00 vp8_copy_mem8x8_rvv 0.05 65.95 0.04 764377 0.00 0.00 vp8_copy_mem8x4_rvv C Impl: % CROSS=riscv64-unknown-linux-gnu- configure --target=generic-gnu --enable-debug \ --enable-gprof && make -j % time qemu-riscv64 -cpu rv64,v=true,zba=true,vlen=128 -L /path/to/sysroot/ \ ./vpxenc --codec=vp8 -w 352 -h 288 -o akiyol.vpx ./akiyo_cif.yuv Pass 1/1 frame 300/300 314977B 8399b/f 251981b/s 98417 ms (3.05 fps) user 1m36.146s % gprof -abp ./vpxenc ./gmon.out | grep vp8_copy_mem 0.38 63.96 0.31 vp8_copy_mem8x4_c 0.04 70.61 0.03 204336 0.00 0.00 vp8_copy_mem16x16_c Signed-off-by: Yuuta Liang --- vp8/common/riscv/copymem_rvv.c | 54 ++++++++++++++++++++++++++++++++++++++++++ vp8/common/rtcd_defs.pl | 6 ++--- vp8/vp8_common.mk | 1 + 3 files changed, 58 insertions(+), 3 deletions(-) create mode 100644 vp8/common/riscv/copymem_rvv.c diff --git a/vp8/common/riscv/copymem_rvv.c b/vp8/common/riscv/copymem_rvv.c new file mode 100644 index 000000000..1999a4fad --- /dev/null +++ b/vp8/common/riscv/copymem_rvv.c @@ -0,0 +1,54 @@ +#include +#include "./vpx_config.h" +#include "./vp8_rtcd.h" + +void vp8_copy_mem16x16_rvv(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride) { + vuint64m2_t v; + size_t vl; + size_t n = 16; + while (n) { + /* Assume e64 is supported. */ + vl = __riscv_vsetvl_e64m2(n); + + v = __riscv_vlse64_v_u64m2((uint64_t *) src, src_stride, vl); + __riscv_vsse64_v_u64m2((uint64_t *) dst, dst_stride, v, vl); + + v = __riscv_vlse64_v_u64m2((uint64_t *) (src + 8), src_stride, vl); + __riscv_vsse64_v_u64m2((uint64_t *) (dst + 8), dst_stride, v, vl); + + n -= vl; + src += src_stride * vl; + dst += dst_stride * vl; + } +} + +void vp8_copy_mem8x8_rvv(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride) { + size_t n = 8; + size_t vl; + vuint64m2_t v; + + while (n) { + /* Assume e64 is supported. */ + vl = __riscv_vsetvl_e64m2(n); + + v = __riscv_vlse64_v_u64m2((uint64_t *) src, src_stride, vl); + __riscv_vsse64_v_u64m2((uint64_t *) dst, dst_stride, v, vl); + + n -= vl; + src += src_stride * vl; + dst += dst_stride * vl; + } +} + +void vp8_copy_mem8x4_rvv(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride) { + size_t vl; + vuint64m2_t v; + + /* VL must be 4 because VLEN must be >= 128. Assume e64 is supported. */ + vl = __riscv_vsetvl_e64m2(4); + v = __riscv_vlse64_v_u64m2((uint64_t *) src, src_stride, vl); + __riscv_vsse64_v_u64m2((uint64_t *) dst, dst_stride, v, vl); +} diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 3831a3c35..4fad0d7f6 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -114,13 +114,13 @@ specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa mmi lsx/; # RECON # add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride"; -specialize qw/vp8_copy_mem16x16 sse2 neon dspr2 msa mmi/; +specialize qw/vp8_copy_mem16x16 sse2 neon dspr2 msa mmi rvv/; add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride"; -specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa mmi/; +specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa mmi rvv/; add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride"; -specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa mmi/; +specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa mmi rvv/; # # Postproc diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 54bb5eadb..c24415052 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -148,5 +148,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c # common (rvv intrinsics) VP8_COMMON_SRCS-$(HAVE_RVV) += common/riscv/sixtap_predict_rvv.c +VP8_COMMON_SRCS-$(HAVE_RVV) += common/riscv/copymem_rvv.c $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl)) -- cgit v1.2.3