summaryrefslogtreecommitdiff
path: root/vp9
diff options
context:
space:
mode:
authorjackychen <jackychen@google.com>2015-12-03 15:21:36 -0800
committerjackychen <jackychen@google.com>2015-12-08 17:23:36 -0800
commit303f144eefc739c44a3e18a24f8cf61d2fe103a7 (patch)
treefc85c17678fb48882e37b8d04b54941e263ef126 /vp9
parent16a4fab9e2c6d80c08183bf0cc27a3e0b9bd2346 (diff)
downloadlibvpx-303f144eefc739c44a3e18a24f8cf61d2fe103a7.tar
libvpx-303f144eefc739c44a3e18a24f8cf61d2fe103a7.tar.gz
libvpx-303f144eefc739c44a3e18a24f8cf61d2fe103a7.tar.bz2
libvpx-303f144eefc739c44a3e18a24f8cf61d2fe103a7.zip
Add vp9_avg_4x4_neon and the unit test.
Change-Id: I3ef9a9648841374ed3cc865a02053c14ad821a20
Diffstat (limited to 'vp9')
-rw-r--r--vp9/common/vp9_rtcd_defs.pl2
-rw-r--r--vp9/encoder/arm/neon/vp9_avg_neon.c12
2 files changed, 13 insertions, 1 deletions
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 8fe6503aa..d166bbf38 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -198,7 +198,7 @@ add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
specialize qw/vp9_avg_8x8 sse2 neon msa/;
add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
-specialize qw/vp9_avg_4x4 sse2 msa/;
+specialize qw/vp9_avg_4x4 sse2 neon msa/;
add_proto qw/void vp9_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
specialize qw/vp9_minmax_8x8 sse2/;
diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c
index 5996bd426..78467cebd 100644
--- a/vp9/encoder/arm/neon/vp9_avg_neon.c
+++ b/vp9/encoder/arm/neon/vp9_avg_neon.c
@@ -24,6 +24,18 @@ static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
return vget_lane_u32(c, 0);
}
+unsigned int vp9_avg_4x4_neon(const uint8_t *s, int p) {
+ uint16x8_t v_sum;
+ uint32x2_t v_s0 = vdup_n_u32(0);
+ uint32x2_t v_s1 = vdup_n_u32(0);
+ v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0);
+ v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1);
+ v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0);
+ v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1);
+ v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1));
+ return (horizontal_add_u16x8(v_sum) + 8) >> 4;
+}
+
unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
uint8x8_t v_s0 = vld1_u8(s);
const uint8x8_t v_s1 = vld1_u8(s + p);