Merge "Clean vpx_idct32x32_1024_add_neon()"

author: James Zern <jzern@google.com> 2017-03-17 05:24:57 +0000
committer: Gerrit Code Review <noreply-gerritcodereview@google.com> 2017-03-17 05:24:58 +0000
commit: 68efc64b72f892b7bedd5329706dfc2cc21c56bc (patch)
tree: 167332d4959b0ad61d95acde2fd6f0cd425a85fd
parent: 02975a604c679b57ad57e4785abf6f7f0ebd6052 (diff)
parent: e54231d613c70c38f857a968fef9d91a2e4bb10a (diff)
download: libvpx-68efc64b72f892b7bedd5329706dfc2cc21c56bc.tar
libvpx-68efc64b72f892b7bedd5329706dfc2cc21c56bc.tar.gz
libvpx-68efc64b72f892b7bedd5329706dfc2cc21c56bc.tar.bz2
libvpx-68efc64b72f892b7bedd5329706dfc2cc21c56bc.zip
1 files changed, 29 insertions, 73 deletions
diff --git a/vpx_dsp/arm/idct32x32_add_neon.c b/vpx_dsp/arm/idct32x32_add_neon.c
index aa6f24205..97b91aa9a 100644
--- a/vpx_dsp/arm/idct32x32_add_neon.c
+++ b/vpx_dsp/arm/idct32x32_add_neon.c
@@ -38,84 +38,40 @@ static INLINE void store_in_output(int16_t *const out, const int first,
   vst1q_s16(out + second * 32, q1);
 }
 
-static INLINE void store_combine_center_results(uint8_t *p1, uint8_t *p2,
-                                                const int stride, int16x8_t q0,
-                                                int16x8_t q1, int16x8_t q2,
-                                                int16x8_t q3) {
-  int16x4_t d[4];
+static INLINE void store_combine_results(uint8_t *p1, uint8_t *p2,
+                                         const int stride, int16x8_t q0,
+                                         int16x8_t q1, int16x8_t q2,
+                                         int16x8_t q3) {
+  uint8x8_t d[4];
 
-  d[0] = vld1_s16((int16_t *)p1);
+  d[0] = vld1_u8(p1);
   p1 += stride;
-  d[1] = vld1_s16((int16_t *)p1);
-  d[3] = vld1_s16((int16_t *)p2);
+  d[1] = vld1_u8(p1);
+  d[3] = vld1_u8(p2);
   p2 -= stride;
-  d[2] = vld1_s16((int16_t *)p2);
+  d[2] = vld1_u8(p2);
 
   q0 = vrshrq_n_s16(q0, 6);
   q1 = vrshrq_n_s16(q1, 6);
   q2 = vrshrq_n_s16(q2, 6);
   q3 = vrshrq_n_s16(q3, 6);
 
-  q0 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q0), vreinterpret_u8_s16(d[0])));
-  q1 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q1), vreinterpret_u8_s16(d[1])));
-  q2 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q2), vreinterpret_u8_s16(d[2])));
-  q3 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q3), vreinterpret_u8_s16(d[3])));
-
-  d[0] = vreinterpret_s16_u8(vqmovun_s16(q0));
-  d[1] = vreinterpret_s16_u8(vqmovun_s16(q1));
-  d[2] = vreinterpret_s16_u8(vqmovun_s16(q2));
-  d[3] = vreinterpret_s16_u8(vqmovun_s16(q3));
-
-  vst1_s16((int16_t *)p1, d[1]);
-  p1 -= stride;
-  vst1_s16((int16_t *)p1, d[0]);
-  vst1_s16((int16_t *)p2, d[2]);
-  p2 += stride;
-  vst1_s16((int16_t *)p2, d[3]);
-}
+  q0 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q0), d[0]));
+  q1 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q1), d[1]));
+  q2 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2), d[2]));
+  q3 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q3), d[3]));
 
-static INLINE void store_combine_extreme_results(uint8_t *p1, uint8_t *p2,
-                                                 const int stride, int16x8_t q0,
-                                                 int16x8_t q1, int16x8_t q2,
-                                                 int16x8_t q3) {
-  int16x4_t d[4];
-
-  d[0] = vld1_s16((int16_t *)p1);
-  p1 += stride;
-  d[1] = vld1_s16((int16_t *)p1);
-  d[3] = vld1_s16((int16_t *)p2);
-  p2 -= stride;
-  d[2] = vld1_s16((int16_t *)p2);
-
-  q0 = vrshrq_n_s16(q0, 6);
-  q1 = vrshrq_n_s16(q1, 6);
-  q2 = vrshrq_n_s16(q2, 6);
-  q3 = vrshrq_n_s16(q3, 6);
+  d[0] = vqmovun_s16(q0);
+  d[1] = vqmovun_s16(q1);
+  d[2] = vqmovun_s16(q2);
+  d[3] = vqmovun_s16(q3);
 
-  q0 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q0), vreinterpret_u8_s16(d[0])));
-  q1 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q1), vreinterpret_u8_s16(d[1])));
-  q2 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q2), vreinterpret_u8_s16(d[2])));
-  q3 = vreinterpretq_s16_u16(
-      vaddw_u8(vreinterpretq_u16_s16(q3), vreinterpret_u8_s16(d[3])));
-
-  d[0] = vreinterpret_s16_u8(vqmovun_s16(q0));
-  d[1] = vreinterpret_s16_u8(vqmovun_s16(q1));
-  d[2] = vreinterpret_s16_u8(vqmovun_s16(q2));
-  d[3] = vreinterpret_s16_u8(vqmovun_s16(q3));
-
-  vst1_s16((int16_t *)p1, d[1]);
+  vst1_u8(p1, d[1]);
   p1 -= stride;
-  vst1_s16((int16_t *)p1, d[0]);
-  vst1_s16((int16_t *)p2, d[2]);
+  vst1_u8(p1, d[0]);
+  vst1_u8(p2, d[2]);
   p2 += stride;
-  vst1_s16((int16_t *)p2, d[3]);
+  vst1_u8(p2, d[3]);
 }
 
 static INLINE void do_butterfly(const int16x8_t qIn0, const int16x8_t qIn1,
@@ -334,7 +290,7 @@ static INLINE void idct32_bands_end_2nd_pass(const int16_t *const out,
   uint8_t *dest3 = dest + 15 * stride;
   const int str2 = stride << 1;
 
-  store_combine_center_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
+  store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
   dest2 += str2;
   dest3 -= str2;
 
@@ -343,7 +299,7 @@ static INLINE void idct32_bands_end_2nd_pass(const int16_t *const out,
   q[5] = final_add(q[3], q[0]);
   q[6] = final_sub(q[3], q[0]);
   q[7] = final_sub(q[2], q[1]);
-  store_combine_extreme_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
+  store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
   dest0 += str2;
   dest1 -= str2;
 
@@ -358,7 +314,7 @@ static INLINE void idct32_bands_end_2nd_pass(const int16_t *const out,
   q[9] = final_add(q[5], q[0]);
   q[6] = final_sub(q[5], q[0]);
   q[7] = final_sub(q[4], q[1]);
-  store_combine_center_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
+  store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
   dest2 += str2;
   dest3 -= str2;
 
@@ -367,7 +323,7 @@ static INLINE void idct32_bands_end_2nd_pass(const int16_t *const out,
   q[5] = final_add(q[3], q[0]);
   q[6] = final_sub(q[3], q[0]);
   q[7] = final_sub(q[2], q[1]);
-  store_combine_extreme_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
+  store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
   dest0 += str2;
   dest1 -= str2;
 
@@ -382,7 +338,7 @@ static INLINE void idct32_bands_end_2nd_pass(const int16_t *const out,
   q[9] = final_add(q[5], q[0]);
   q[6] = final_sub(q[5], q[0]);
   q[7] = final_sub(q[4], q[1]);
-  store_combine_center_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
+  store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
   dest2 += str2;
   dest3 -= str2;
 
@@ -391,7 +347,7 @@ static INLINE void idct32_bands_end_2nd_pass(const int16_t *const out,
   q[5] = final_add(q[3], q[0]);
   q[6] = final_sub(q[3], q[0]);
   q[7] = final_sub(q[2], q[1]);
-  store_combine_extreme_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
+  store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
   dest0 += str2;
   dest1 -= str2;
 
@@ -406,14 +362,14 @@ static INLINE void idct32_bands_end_2nd_pass(const int16_t *const out,
   q[9] = final_add(q[5], q[0]);
   q[6] = final_sub(q[5], q[0]);
   q[7] = final_sub(q[4], q[1]);
-  store_combine_center_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
+  store_combine_results(dest2, dest3, stride, q[6], q[7], q[8], q[9]);
 
   load_from_output(out, 24, 25, &q[0], &q[1]);
   q[4] = final_add(q[2], q[1]);
   q[5] = final_add(q[3], q[0]);
   q[6] = final_sub(q[3], q[0]);
   q[7] = final_sub(q[2], q[1]);
-  store_combine_extreme_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
+  store_combine_results(dest0, dest1, stride, q[4], q[5], q[6], q[7]);
 }
 
 void vpx_idct32x32_1024_add_neon(const tran_low_t *input, uint8_t *dest,
author	James Zern <jzern@google.com>	2017-03-17 05:24:57 +0000
committer	Gerrit Code Review <noreply-gerritcodereview@google.com>	2017-03-17 05:24:58 +0000
commit	68efc64b72f892b7bedd5329706dfc2cc21c56bc (patch)
tree	167332d4959b0ad61d95acde2fd6f0cd425a85fd
parent	02975a604c679b57ad57e4785abf6f7f0ebd6052 (diff)
parent	e54231d613c70c38f857a968fef9d91a2e4bb10a (diff)
download	libvpx-68efc64b72f892b7bedd5329706dfc2cc21c56bc.tar libvpx-68efc64b72f892b7bedd5329706dfc2cc21c56bc.tar.gz libvpx-68efc64b72f892b7bedd5329706dfc2cc21c56bc.tar.bz2 libvpx-68efc64b72f892b7bedd5329706dfc2cc21c56bc.zip