summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJingning Han <jingning@google.com>2014-05-30 15:52:53 -0700
committerGerrit Code Review <gerrit@gerrit.golo.chromium.org>2014-05-30 15:52:53 -0700
commitba6bed372bf17af7df6c310860e9a2539ba2e939 (patch)
treee693bfff6236902e00501f3700acc83bd5c08543
parent6f8a77d5d51c5359b6b5606df8aa611e26e3dea4 (diff)
parent2c1cdf69b6ebece5052c7a1b20e017925836bd96 (diff)
downloadlibvpx-ba6bed372bf17af7df6c310860e9a2539ba2e939.tar
libvpx-ba6bed372bf17af7df6c310860e9a2539ba2e939.tar.gz
libvpx-ba6bed372bf17af7df6c310860e9a2539ba2e939.tar.bz2
libvpx-ba6bed372bf17af7df6c310860e9a2539ba2e939.zip
Merge "Fix a potential overflow issue in inverse 16x16 full 2D-DCT"
-rw-r--r--vp9/common/x86/vp9_idct_intrin_ssse3.c110
1 files changed, 82 insertions, 28 deletions
diff --git a/vp9/common/x86/vp9_idct_intrin_ssse3.c b/vp9/common/x86/vp9_idct_intrin_ssse3.c
index e5d3cb5f4..0930e7805 100644
--- a/vp9/common/x86/vp9_idct_intrin_ssse3.c
+++ b/vp9/common/x86/vp9_idct_intrin_ssse3.c
@@ -16,7 +16,7 @@
#include <tmmintrin.h> // SSSE3
#include "vp9/common/x86/vp9_idct_intrin_sse2.h"
-static void idct16_8col(__m128i *in) {
+static void idct16_8col(__m128i *in, int round) {
const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
@@ -36,6 +36,8 @@ static void idct16_8col(__m128i *in) {
const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i k__cospi_p16_p16_x2 = pair_set_epi16(23170, 23170);
+ const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
+ const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
__m128i v[16], u[16], s[16], t[16];
@@ -266,28 +268,80 @@ static void idct16_8col(__m128i *in) {
t[15] = _mm_add_epi16(s[12], s[15]);
// stage 6
- s[0] = _mm_add_epi16(t[0], t[7]);
- s[1] = _mm_add_epi16(t[1], t[6]);
- s[2] = _mm_add_epi16(t[2], t[5]);
- s[3] = _mm_add_epi16(t[3], t[4]);
- s[4] = _mm_sub_epi16(t[3], t[4]);
- s[5] = _mm_sub_epi16(t[2], t[5]);
- s[6] = _mm_sub_epi16(t[1], t[6]);
- s[7] = _mm_sub_epi16(t[0], t[7]);
- s[8] = t[8];
- s[9] = t[9];
-
- u[0] = _mm_sub_epi16(t[13], t[10]);
- u[1] = _mm_add_epi16(t[13], t[10]);
- u[2] = _mm_sub_epi16(t[12], t[11]);
- u[3] = _mm_add_epi16(t[12], t[11]);
-
- s[10] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2);
- s[13] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2);
- s[11] = _mm_mulhrs_epi16(u[2], k__cospi_p16_p16_x2);
- s[12] = _mm_mulhrs_epi16(u[3], k__cospi_p16_p16_x2);
- s[14] = t[14];
- s[15] = t[15];
+ if (round == 1) {
+ s[0] = _mm_add_epi16(t[0], t[7]);
+ s[1] = _mm_add_epi16(t[1], t[6]);
+ s[2] = _mm_add_epi16(t[2], t[5]);
+ s[3] = _mm_add_epi16(t[3], t[4]);
+ s[4] = _mm_sub_epi16(t[3], t[4]);
+ s[5] = _mm_sub_epi16(t[2], t[5]);
+ s[6] = _mm_sub_epi16(t[1], t[6]);
+ s[7] = _mm_sub_epi16(t[0], t[7]);
+ s[8] = t[8];
+ s[9] = t[9];
+
+ u[0] = _mm_unpacklo_epi16(t[10], t[13]);
+ u[1] = _mm_unpackhi_epi16(t[10], t[13]);
+ u[2] = _mm_unpacklo_epi16(t[11], t[12]);
+ u[3] = _mm_unpackhi_epi16(t[11], t[12]);
+
+ v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
+ v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
+ v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
+ v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
+ v[4] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
+ v[5] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
+ v[6] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
+ v[7] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
+
+ u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
+ u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
+ u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
+ u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
+ u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
+ u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
+ u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
+ u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
+
+ u[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
+ u[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
+ u[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
+ u[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
+ u[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
+ u[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
+ u[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
+ u[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
+
+ s[10] = _mm_packs_epi32(u[0], u[1]);
+ s[13] = _mm_packs_epi32(u[2], u[3]);
+ s[11] = _mm_packs_epi32(u[4], u[5]);
+ s[12] = _mm_packs_epi32(u[6], u[7]);
+ s[14] = t[14];
+ s[15] = t[15];
+ } else {
+ s[0] = _mm_add_epi16(t[0], t[7]);
+ s[1] = _mm_add_epi16(t[1], t[6]);
+ s[2] = _mm_add_epi16(t[2], t[5]);
+ s[3] = _mm_add_epi16(t[3], t[4]);
+ s[4] = _mm_sub_epi16(t[3], t[4]);
+ s[5] = _mm_sub_epi16(t[2], t[5]);
+ s[6] = _mm_sub_epi16(t[1], t[6]);
+ s[7] = _mm_sub_epi16(t[0], t[7]);
+ s[8] = t[8];
+ s[9] = t[9];
+
+ u[0] = _mm_sub_epi16(t[13], t[10]);
+ u[1] = _mm_add_epi16(t[13], t[10]);
+ u[2] = _mm_sub_epi16(t[12], t[11]);
+ u[3] = _mm_add_epi16(t[12], t[11]);
+
+ s[10] = _mm_mulhrs_epi16(u[0], k__cospi_p16_p16_x2);
+ s[13] = _mm_mulhrs_epi16(u[1], k__cospi_p16_p16_x2);
+ s[11] = _mm_mulhrs_epi16(u[2], k__cospi_p16_p16_x2);
+ s[12] = _mm_mulhrs_epi16(u[3], k__cospi_p16_p16_x2);
+ s[14] = t[14];
+ s[15] = t[15];
+ }
// stage 7
in[0] = _mm_add_epi16(s[0], s[15]);
@@ -308,10 +362,10 @@ static void idct16_8col(__m128i *in) {
in[15] = _mm_sub_epi16(s[0], s[15]);
}
-static void idct16_sse2(__m128i *in0, __m128i *in1) {
+static void idct16_sse2(__m128i *in0, __m128i *in1, int round) {
array_transpose_16x16(in0, in1);
- idct16_8col(in0);
- idct16_8col(in1);
+ idct16_8col(in0, round);
+ idct16_8col(in1, round);
}
void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest,
@@ -322,8 +376,8 @@ void vp9_idct16x16_256_add_ssse3(const int16_t *input, uint8_t *dest,
input += 8;
load_buffer_8x16(input, in1);
- idct16_sse2(in0, in1);
- idct16_sse2(in0, in1);
+ idct16_sse2(in0, in1, 0);
+ idct16_sse2(in0, in1, 1);
write_buffer_8x16(dest, in0, stride);
dest += 8;