Use shifted value for sinpi8sqrt2

The value 35468 changes sign when stored in int16_t:

  implicit conversion from 'int' to 'int16_t' (aka 'short') changes value from 35468 to -30068

This negation requires adding back the original value to compensate. Shifting the value keeps the value positive and saves a post-vqdmulh shift.

This technique is used in webp and idct_dequant_full_2x_neon.

BUG=b/28027557
Change-Id: I0c5ce09bea170fe08061856c2af6f841a557e0c3
author: Johann <johannkoenig@google.com> 2016-08-02 15:59:35 -0700
committer: Johann <johannkoenig@google.com> 2016-09-23 17:04:18 -0700
commit: ab0e7a237a6e9796c15a8858caac04dea3593d62 (patch)
tree: 359cb7482dca32b87306efb67101c48c616dfcb8 /vp8/common/arm/neon
parent: ada850786c7e0a5d6024b7716db9896124fd483e (diff)
download: libvpx-ab0e7a237a6e9796c15a8858caac04dea3593d62.tar
libvpx-ab0e7a237a6e9796c15a8858caac04dea3593d62.tar.gz
libvpx-ab0e7a237a6e9796c15a8858caac04dea3593d62.tar.bz2
libvpx-ab0e7a237a6e9796c15a8858caac04dea3593d62.zip
2 files changed, 10 insertions, 10 deletions
diff --git a/vp8/common/arm/neon/dequant_idct_neon.c b/vp8/common/arm/neon/dequant_idct_neon.c
index ff5981eaa..753051c77 100644
--- a/vp8/common/arm/neon/dequant_idct_neon.c
+++ b/vp8/common/arm/neon/dequant_idct_neon.c
@@ -11,7 +11,11 @@
 #include <arm_neon.h>
 
 static const int16_t cospi8sqrt2minus1 = 20091;
-static const int16_t sinpi8sqrt2 = 35468;
+// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of
+// the way it is used in vqdmulh, where the result is doubled, it can be divided
+// by 2 beforehand. This saves compensating for the negative value as well as
+// shifting the result.
+static const int16_t sinpi8sqrt2 = 35468 >> 1;
 
 void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst,
                                int stride) {
@@ -60,10 +64,8 @@ void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst,
   q3 = vqdmulhq_n_s16(q2, sinpi8sqrt2);
   q4 = vqdmulhq_n_s16(q2, cospi8sqrt2minus1);
 
-  q3 = vshrq_n_s16(q3, 1);
   q4 = vshrq_n_s16(q4, 1);
 
-  q3 = vqaddq_s16(q3, q2);
   q4 = vqaddq_s16(q4, q2);
 
   d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
@@ -90,10 +92,8 @@ void vp8_dequant_idct_add_neon(int16_t *input, int16_t *dq, unsigned char *dst,
   d12 = vqadd_s16(d2tmp2.val[0], d2tmp3.val[0]);
   d13 = vqsub_s16(d2tmp2.val[0], d2tmp3.val[0]);
 
-  q3 = vshrq_n_s16(q3, 1);
   q4 = vshrq_n_s16(q4, 1);
 
-  q3 = vqaddq_s16(q3, q2);
   q4 = vqaddq_s16(q4, q2);
 
   d10 = vqsub_s16(vget_low_s16(q3), vget_high_s16(q4));
diff --git a/vp8/common/arm/neon/shortidct4x4llm_neon.c b/vp8/common/arm/neon/shortidct4x4llm_neon.c
index a36c0c1ca..1adb1c317 100644
--- a/vp8/common/arm/neon/shortidct4x4llm_neon.c
+++ b/vp8/common/arm/neon/shortidct4x4llm_neon.c
@@ -11,7 +11,11 @@
 #include <arm_neon.h>
 
 static const int16_t cospi8sqrt2minus1 = 20091;
-static const int16_t sinpi8sqrt2 = 35468;
+// 35468 exceeds INT16_MAX and gets converted to a negative number. Because of
+// the way it is used in vqdmulh, where the result is doubled, it can be divided
+// by 2 beforehand. This saves compensating for the negative value as well as
+// shifting the result.
+static const int16_t sinpi8sqrt2 = 35468 >> 1;
 
 void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr,
                                int pred_stride, unsigned char *dst_ptr,
@@ -40,10 +44,8 @@ void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr,
   d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
   d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1
 
-  q3s16 = vshrq_n_s16(q3s16, 1);
   q4s16 = vshrq_n_s16(q4s16, 1);
 
-  q3s16 = vqaddq_s16(q3s16, q2s16);
   q4s16 = vqaddq_s16(q4s16, q2s16);
 
   d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
@@ -71,10 +73,8 @@ void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr,
   d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // a1
   d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16));  // b1
 
-  q3s16 = vshrq_n_s16(q3s16, 1);
   q4s16 = vshrq_n_s16(q4s16, 1);
 
-  q3s16 = vqaddq_s16(q3s16, q2s16);
   q4s16 = vqaddq_s16(q4s16, q2s16);
 
   d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16));  // c1
author	Johann <johannkoenig@google.com>	2016-08-02 15:59:35 -0700
committer	Johann <johannkoenig@google.com>	2016-09-23 17:04:18 -0700
commit	ab0e7a237a6e9796c15a8858caac04dea3593d62 (patch)
tree	359cb7482dca32b87306efb67101c48c616dfcb8 /vp8/common/arm/neon
parent	ada850786c7e0a5d6024b7716db9896124fd483e (diff)
download	libvpx-ab0e7a237a6e9796c15a8858caac04dea3593d62.tar libvpx-ab0e7a237a6e9796c15a8858caac04dea3593d62.tar.gz libvpx-ab0e7a237a6e9796c15a8858caac04dea3593d62.tar.bz2 libvpx-ab0e7a237a6e9796c15a8858caac04dea3593d62.zip