diff options
author | Yuuta Liang <yuuta@yuuta.moe> | 2023-08-10 14:43:10 +0800 |
---|---|---|
committer | Yuuta Liang <yuuta@yuuta.moe> | 2023-08-10 15:04:34 +0800 |
commit | ee24732cde941a9ff2f36bc19d13730e87430ed1 (patch) | |
tree | e31b3aed6f7d7b5e24bba8dcc811d6d954c27ccd | |
parent | c2814bb8cc32362822b5a546c8de21b3a0e89032 (diff) | |
download | libvpx-riscv64_android_optimization.tar libvpx-riscv64_android_optimization.tar.gz libvpx-riscv64_android_optimization.tar.bz2 libvpx-riscv64_android_optimization.zip |
RISC-V: optimize vp8_copy_mem with RVVriscv64_android_optimization
Test environment: 8c 1804Mhz i5-1140G7
RVV Impl:
% CROSS=riscv64-unknown-linux-gnu- configure --target=riscv64-linux-gcc \
--enable-debug --enable-gprof && make -j
% time qemu-riscv64 -cpu rv64,v=true,zba=true,vlen=128 -L /path/to/sysroot/ \
./vpxenc --codec=vp8 -w 352 -h 288 -o akiyol.vpx ./akiyo_cif.yuv
Pass 1/1 frame 300/300 314977B 8399b/f 251981b/s 92226 ms (3.25 fps)
user 1m30.108s
% gprof -abp ./vpxenc ./gmon.out | grep vp8_copy_mem
1.36 53.09 1.04 1025863 0.00 0.00 vp8_copy_mem16x16_rvv
0.72 59.01 0.55 1641368 0.00 0.00 vp8_copy_mem8x8_rvv
0.05 65.95 0.04 764377 0.00 0.00 vp8_copy_mem8x4_rvv
C Impl:
% CROSS=riscv64-unknown-linux-gnu- configure --target=generic-gnu --enable-debug \
--enable-gprof && make -j
% time qemu-riscv64 -cpu rv64,v=true,zba=true,vlen=128 -L /path/to/sysroot/ \
./vpxenc --codec=vp8 -w 352 -h 288 -o akiyol.vpx ./akiyo_cif.yuv
Pass 1/1 frame 300/300 314977B 8399b/f 251981b/s 98417 ms (3.05 fps)
user 1m36.146s
% gprof -abp ./vpxenc ./gmon.out | grep vp8_copy_mem
0.38 63.96 0.31 vp8_copy_mem8x4_c
0.04 70.61 0.03 204336 0.00 0.00 vp8_copy_mem16x16_c
Signed-off-by: Yuuta Liang <yuuta@yuuta.moe>
-rw-r--r-- | vp8/common/riscv/copymem_rvv.c | 54 | ||||
-rw-r--r-- | vp8/common/rtcd_defs.pl | 6 | ||||
-rw-r--r-- | vp8/vp8_common.mk | 1 |
3 files changed, 58 insertions, 3 deletions
diff --git a/vp8/common/riscv/copymem_rvv.c b/vp8/common/riscv/copymem_rvv.c new file mode 100644 index 000000000..1999a4fad --- /dev/null +++ b/vp8/common/riscv/copymem_rvv.c @@ -0,0 +1,54 @@ +#include <riscv_vector.h> +#include "./vpx_config.h" +#include "./vp8_rtcd.h" + +void vp8_copy_mem16x16_rvv(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride) { + vuint64m2_t v; + size_t vl; + size_t n = 16; + while (n) { + /* Assume e64 is supported. */ + vl = __riscv_vsetvl_e64m2(n); + + v = __riscv_vlse64_v_u64m2((uint64_t *) src, src_stride, vl); + __riscv_vsse64_v_u64m2((uint64_t *) dst, dst_stride, v, vl); + + v = __riscv_vlse64_v_u64m2((uint64_t *) (src + 8), src_stride, vl); + __riscv_vsse64_v_u64m2((uint64_t *) (dst + 8), dst_stride, v, vl); + + n -= vl; + src += src_stride * vl; + dst += dst_stride * vl; + } +} + +void vp8_copy_mem8x8_rvv(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride) { + size_t n = 8; + size_t vl; + vuint64m2_t v; + + while (n) { + /* Assume e64 is supported. */ + vl = __riscv_vsetvl_e64m2(n); + + v = __riscv_vlse64_v_u64m2((uint64_t *) src, src_stride, vl); + __riscv_vsse64_v_u64m2((uint64_t *) dst, dst_stride, v, vl); + + n -= vl; + src += src_stride * vl; + dst += dst_stride * vl; + } +} + +void vp8_copy_mem8x4_rvv(unsigned char *src, int src_stride, + unsigned char *dst, int dst_stride) { + size_t vl; + vuint64m2_t v; + + /* VL must be 4 because VLEN must be >= 128. Assume e64 is supported. */ + vl = __riscv_vsetvl_e64m2(4); + v = __riscv_vlse64_v_u64m2((uint64_t *) src, src_stride, vl); + __riscv_vsse64_v_u64m2((uint64_t *) dst, dst_stride, v, vl); +} diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 3831a3c35..4fad0d7f6 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -114,13 +114,13 @@ specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa mmi lsx/; # RECON # add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride"; -specialize qw/vp8_copy_mem16x16 sse2 neon dspr2 msa mmi/; +specialize qw/vp8_copy_mem16x16 sse2 neon dspr2 msa mmi rvv/; add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride"; -specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa mmi/; +specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa mmi rvv/; add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride"; -specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa mmi/; +specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa mmi rvv/; # # Postproc diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 54bb5eadb..c24415052 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -148,5 +148,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sixtappredict_neon.c # common (rvv intrinsics) VP8_COMMON_SRCS-$(HAVE_RVV) += common/riscv/sixtap_predict_rvv.c +VP8_COMMON_SRCS-$(HAVE_RVV) += common/riscv/copymem_rvv.c $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl)) |