diff options
 vp8/common/arm/neon/bilinearpredict_neon.c | 129 +++++++++++++++++++++++++++-
 vp8/common/rtcd_defs.pl                    |   4 +-
 2 files changed, 130 insertions(+), 3 deletions(-)
/*
 * NEON bilinear sub-pixel prediction for VP8 (4x4 block).
 * Reconstructed from a web-rendered diff: diff scaffolding stripped,
 * code tokens preserved byte-for-byte; comments added.
 */
#include <arm_neon.h>
#include <string.h>

#include "./vpx_config.h"

/* Two-tap bilinear filter taps indexed by sub-pel offset 0..7.
 * Each pair sums to 128, so filtered sums are renormalized with a
 * rounding narrow shift by 7 (vqrshrn_n_u16(..., 7)) below. */
static const uint8_t bifilter4_coeff[8][2] = { { 128, 0 }, { 112, 16 },
                                               { 96, 32 },  { 80, 48 },
                                               { 64, 64 },  { 48, 80 },
                                               { 32, 96 },  { 16, 112 } };

/* Load 8 bytes and shift them up by 32 bits so the loaded row sits in
 * the high half of the vector; a following vext_u8(..., 4) can then pair
 * it with another row's low half into one 8-byte register. */
static INLINE uint8x8_t load_and_shift(const unsigned char *a) {
  return vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(vld1_u8(a)), 32));
}

/* Store a 4x4 block held as two 8-byte vectors (two 4-pixel rows per
 * vector, one row per 32-bit lane) to dst with stride dst_stride. */
static INLINE void store4x4(unsigned char *dst, int dst_stride,
                            const uint8x8_t a0, const uint8x8_t a1) {
  if (!((uintptr_t)dst & 0x3) && !(dst_stride & 0x3)) {
    /* Destination pointer and stride are 4-byte aligned: store each
     * 32-bit lane (one row) directly. */
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 0);
    dst += dst_stride;
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a0), 1);
    dst += dst_stride;
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 0);
    dst += dst_stride;
    vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(a1), 1);
  } else {
    // Store to the aligned local buffer and memcpy instead of vget_lane_u8
    // which is really really slow.
    uint32_t output_buffer[4];
    vst1_lane_u32(output_buffer, vreinterpret_u32_u8(a0), 0);
    vst1_lane_u32(output_buffer + 1, vreinterpret_u32_u8(a0), 1);
    vst1_lane_u32(output_buffer + 2, vreinterpret_u32_u8(a1), 0);
    vst1_lane_u32(output_buffer + 3, vreinterpret_u32_u8(a1), 1);

    memcpy(dst, output_buffer, 4);
    dst += dst_stride;
    memcpy(dst, output_buffer + 1, 4);
    dst += dst_stride;
    memcpy(dst, output_buffer + 2, 4);
    dst += dst_stride;
    memcpy(dst, output_buffer + 3, 4);
  }
}

/* 4x4 bilinear predictor: a horizontal first pass selected by xoffset and
 * a vertical second pass selected by yoffset; an offset of 0 skips that
 * pass entirely. Intermediate rows e0/e1 hold the four output rows (two
 * per vector) and e2 carries the extra fifth row needed by the vertical
 * filter. */
void vp8_bilinear_predict4x4_neon(unsigned char *src_ptr,
                                  int src_pixels_per_line, int xoffset,
                                  int yoffset, unsigned char *dst_ptr,
                                  int dst_pitch) {
  uint8x8_t e0, e1, e2;

  if (xoffset == 0) {  // skip_1stpass_filter
    uint8x8_t a0, a1, a2, a3, a4;

    /* Pack five unfiltered source rows into the e0/e1/e2 layout:
     * load_and_shift puts rows 0 and 2 in the high half so vext can
     * combine them with rows 1 and 3. */
    a0 = load_and_shift(src_ptr);
    src_ptr += src_pixels_per_line;
    a1 = vld1_u8(src_ptr);
    src_ptr += src_pixels_per_line;
    a2 = load_and_shift(src_ptr);
    src_ptr += src_pixels_per_line;
    a3 = vld1_u8(src_ptr);
    src_ptr += src_pixels_per_line;
    a4 = vld1_u8(src_ptr);

    e0 = vext_u8(a0, a1, 4);
    e1 = vext_u8(a2, a3, 4);
    e2 = a4;
  } else {
    uint8x8_t a0, a1, a2, a3, a4, b4;
    uint8x16_t a01, a23;
    uint8x16_t b01, b23;
    uint32x2x2_t c0, c1, c2, c3;
    uint16x8_t d0, d1, d2;
    const uint8x8_t filter0 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
    const uint8x8_t filter1 = vdup_n_u8(bifilter4_coeff[xoffset][1]);

    /* Load five 8-pixel source rows (5 horizontal pixels are needed to
     * produce 4 filtered outputs per row). */
    a0 = vld1_u8(src_ptr);
    src_ptr += src_pixels_per_line;
    a1 = vld1_u8(src_ptr);
    src_ptr += src_pixels_per_line;
    a2 = vld1_u8(src_ptr);
    src_ptr += src_pixels_per_line;
    a3 = vld1_u8(src_ptr);
    src_ptr += src_pixels_per_line;
    a4 = vld1_u8(src_ptr);

    a01 = vcombine_u8(a0, a1);
    a23 = vcombine_u8(a2, a3);

    /* b* are the rows shifted right by one pixel: the "x+1" taps. */
    b01 = vreinterpretq_u8_u64(vshrq_n_u64(vreinterpretq_u64_u8(a01), 8));
    b23 = vreinterpretq_u8_u64(vshrq_n_u64(vreinterpretq_u64_u8(a23), 8));
    b4 = vreinterpret_u8_u64(vshr_n_u64(vreinterpret_u64_u8(a4), 8));

    /* Interleave 32-bit lanes so .val[0] holds the low 4 pixels of two
     * consecutive rows packed into one 8-byte vector. */
    c0 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a01)),
                  vreinterpret_u32_u8(vget_high_u8(a01)));
    c1 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(a23)),
                  vreinterpret_u32_u8(vget_high_u8(a23)));
    c2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b01)),
                  vreinterpret_u32_u8(vget_high_u8(b01)));
    c3 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(b23)),
                  vreinterpret_u32_u8(vget_high_u8(b23)));

    /* out = x * filter0 + (x+1) * filter1, widened to 16 bits. */
    d0 = vmull_u8(vreinterpret_u8_u32(c0.val[0]), filter0);
    d1 = vmull_u8(vreinterpret_u8_u32(c1.val[0]), filter0);
    d2 = vmull_u8(a4, filter0);

    d0 = vmlal_u8(d0, vreinterpret_u8_u32(c2.val[0]), filter1);
    d1 = vmlal_u8(d1, vreinterpret_u8_u32(c3.val[0]), filter1);
    d2 = vmlal_u8(d2, b4, filter1);

    /* Rounding narrow shift by 7 renormalizes (taps sum to 128). */
    e0 = vqrshrn_n_u16(d0, 7);
    e1 = vqrshrn_n_u16(d1, 7);
    e2 = vqrshrn_n_u16(d2, 7);
  }

  // secondpass_filter
  if (yoffset == 0) {  // skip_2ndpass_filter
    store4x4(dst_ptr, dst_pitch, e0, e1);
  } else {
    uint8x8_t f0, f1;
    const uint8x8_t filter0 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
    const uint8x8_t filter1 = vdup_n_u8(bifilter4_coeff[yoffset][1]);

    uint16x8_t b0 = vmull_u8(e0, filter0);
    uint16x8_t b1 = vmull_u8(e1, filter0);

    /* a0/a1 are the rows shifted down by one: rows 1..2 and 3..4,
     * i.e. the "y+1" taps for the vertical filter. */
    const uint8x8_t a0 = vext_u8(e0, e1, 4);
    const uint8x8_t a1 = vext_u8(e1, e2, 4);

    b0 = vmlal_u8(b0, a0, filter1);
    b1 = vmlal_u8(b1, a1, filter1);

    f0 = vqrshrn_n_u16(b0, 7);
    f1 = vqrshrn_n_u16(b1, 7);

    store4x4(dst_ptr, dst_pitch, f0, f1);
  }
}

/* NOTE(review): the diff this was reconstructed from continues with the
 * unchanged context of vp8_bilinear_predict8x4_neon (signature truncated
 * mid-parameter-list here, so it is omitted), and also edits
 * vp8/common/rtcd_defs.pl to add `neon` to the vp8_bilinear_predict4x4
 * specialization list, removing the TODO that requested this
 * implementation. */
-# https://bugs.chromium.org/p/webm/issues/detail?id=1273
 add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch";
-specialize qw/vp8_bilinear_predict4x4 mmx msa/;
+specialize qw/vp8_bilinear_predict4x4 mmx neon msa/;

 #
 # Encoder functions below this point.
 #