summaryrefslogtreecommitdiff
path: root/vp8/common/arm/neon/bilinearpredict_neon.c
diff options
context:
space:
mode:
authorJohann <johannkoenig@google.com>2016-05-06 17:20:15 -0700
committerJohann <johannkoenig@google.com>2016-05-06 17:20:15 -0700
commitce11055d57687930c5ee5f6796d2d76787de5f5c (patch)
tree304b005882e754d79b1eb9b0db32ff991e7ed51b /vp8/common/arm/neon/bilinearpredict_neon.c
parentd3a62ac85007329dc2c9fe0918962a4e13716700 (diff)
downloadlibvpx-ce11055d57687930c5ee5f6796d2d76787de5f5c.tar
libvpx-ce11055d57687930c5ee5f6796d2d76787de5f5c.tar.gz
libvpx-ce11055d57687930c5ee5f6796d2d76787de5f5c.tar.bz2
libvpx-ce11055d57687930c5ee5f6796d2d76787de5f5c.zip
Remove sixtap/bilinear 4x4 neon implementations
These implementations rely on casting the pointers to load the data. Clang implemented optimizations which automatically add alignment hints to such loads. The 4x4 filters do not guarantee the necessary alignment so the resulting assembly is broken. https://llvm.org/bugs/show_bug.cgi?id=24421 BUG=webm:817 BUG=webm:892 Change-Id: I608885299f1f86ff83653b65e0e40d0ae87fb3fe
Diffstat (limited to 'vp8/common/arm/neon/bilinearpredict_neon.c')
-rw-r--r--vp8/common/arm/neon/bilinearpredict_neon.c108
1 files changed, 0 insertions, 108 deletions
diff --git a/vp8/common/arm/neon/bilinearpredict_neon.c b/vp8/common/arm/neon/bilinearpredict_neon.c
index 9824a3193..bb6ea76ba 100644
--- a/vp8/common/arm/neon/bilinearpredict_neon.c
+++ b/vp8/common/arm/neon/bilinearpredict_neon.c
@@ -21,114 +21,6 @@ static const uint8_t bifilter4_coeff[8][2] = {
{ 16, 112}
};
-void vp8_bilinear_predict4x4_neon(
- unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8;
- uint8x8_t d26u8, d27u8, d28u8, d29u8, d30u8;
- uint8x16_t q1u8, q2u8;
- uint16x8_t q1u16, q2u16;
- uint16x8_t q7u16, q8u16, q9u16;
- uint64x2_t q4u64, q5u64;
- uint64x1_t d12u64;
- uint32x2x2_t d0u32x2, d1u32x2, d2u32x2, d3u32x2;
-
- if (xoffset == 0) { // skip_1stpass_filter
- uint32x2_t d28u32 = vdup_n_u32(0);
- uint32x2_t d29u32 = vdup_n_u32(0);
- uint32x2_t d30u32 = vdup_n_u32(0);
-
- d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 0);
- src_ptr += src_pixels_per_line;
- d28u32 = vld1_lane_u32((const uint32_t *)src_ptr, d28u32, 1);
- src_ptr += src_pixels_per_line;
- d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 0);
- src_ptr += src_pixels_per_line;
- d29u32 = vld1_lane_u32((const uint32_t *)src_ptr, d29u32, 1);
- src_ptr += src_pixels_per_line;
- d30u32 = vld1_lane_u32((const uint32_t *)src_ptr, d30u32, 0);
- d28u8 = vreinterpret_u8_u32(d28u32);
- d29u8 = vreinterpret_u8_u32(d29u32);
- d30u8 = vreinterpret_u8_u32(d30u32);
- } else {
- d2u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
- d3u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
- d4u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
- d5u8 = vld1_u8(src_ptr); src_ptr += src_pixels_per_line;
- d6u8 = vld1_u8(src_ptr);
-
- q1u8 = vcombine_u8(d2u8, d3u8);
- q2u8 = vcombine_u8(d4u8, d5u8);
-
- d0u8 = vdup_n_u8(bifilter4_coeff[xoffset][0]);
- d1u8 = vdup_n_u8(bifilter4_coeff[xoffset][1]);
-
- q4u64 = vshrq_n_u64(vreinterpretq_u64_u8(q1u8), 8);
- q5u64 = vshrq_n_u64(vreinterpretq_u64_u8(q2u8), 8);
- d12u64 = vshr_n_u64(vreinterpret_u64_u8(d6u8), 8);
-
- d0u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q1u8)),
- vreinterpret_u32_u8(vget_high_u8(q1u8)));
- d1u32x2 = vzip_u32(vreinterpret_u32_u8(vget_low_u8(q2u8)),
- vreinterpret_u32_u8(vget_high_u8(q2u8)));
- d2u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q4u64)),
- vreinterpret_u32_u64(vget_high_u64(q4u64)));
- d3u32x2 = vzip_u32(vreinterpret_u32_u64(vget_low_u64(q5u64)),
- vreinterpret_u32_u64(vget_high_u64(q5u64)));
-
- q7u16 = vmull_u8(vreinterpret_u8_u32(d0u32x2.val[0]), d0u8);
- q8u16 = vmull_u8(vreinterpret_u8_u32(d1u32x2.val[0]), d0u8);
- q9u16 = vmull_u8(d6u8, d0u8);
-
- q7u16 = vmlal_u8(q7u16, vreinterpret_u8_u32(d2u32x2.val[0]), d1u8);
- q8u16 = vmlal_u8(q8u16, vreinterpret_u8_u32(d3u32x2.val[0]), d1u8);
- q9u16 = vmlal_u8(q9u16, vreinterpret_u8_u64(d12u64), d1u8);
-
- d28u8 = vqrshrn_n_u16(q7u16, 7);
- d29u8 = vqrshrn_n_u16(q8u16, 7);
- d30u8 = vqrshrn_n_u16(q9u16, 7);
- }
-
- // secondpass_filter
- if (yoffset == 0) { // skip_2ndpass_filter
- vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 0);
- dst_ptr += dst_pitch;
- vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d28u8), 1);
- dst_ptr += dst_pitch;
- vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 0);
- dst_ptr += dst_pitch;
- vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d29u8), 1);
- } else {
- d0u8 = vdup_n_u8(bifilter4_coeff[yoffset][0]);
- d1u8 = vdup_n_u8(bifilter4_coeff[yoffset][1]);
-
- q1u16 = vmull_u8(d28u8, d0u8);
- q2u16 = vmull_u8(d29u8, d0u8);
-
- d26u8 = vext_u8(d28u8, d29u8, 4);
- d27u8 = vext_u8(d29u8, d30u8, 4);
-
- q1u16 = vmlal_u8(q1u16, d26u8, d1u8);
- q2u16 = vmlal_u8(q2u16, d27u8, d1u8);
-
- d2u8 = vqrshrn_n_u16(q1u16, 7);
- d3u8 = vqrshrn_n_u16(q2u16, 7);
-
- vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 0);
- dst_ptr += dst_pitch;
- vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d2u8), 1);
- dst_ptr += dst_pitch;
- vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 0);
- dst_ptr += dst_pitch;
- vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d3u8), 1);
- }
- return;
-}
-
void vp8_bilinear_predict8x4_neon(
unsigned char *src_ptr,
int src_pixels_per_line,