summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJingning Han <jingning@google.com>2013-11-13 14:48:38 -0800
committerJingning Han <jingning@google.com>2013-11-13 15:15:59 -0800
commitfabc7836956c8c48adf6c570957c7fddb7ec0253 (patch)
tree445f33c39b3c2b4cb1f207e2e066856490734de6
parentb3c75a2d6c04213e729c8e4cc45ec7f45aa1c38f (diff)
downloadlibvpx-fabc7836956c8c48adf6c570957c7fddb7ec0253.tar
libvpx-fabc7836956c8c48adf6c570957c7fddb7ec0253.tar.gz
libvpx-fabc7836956c8c48adf6c570957c7fddb7ec0253.tar.bz2
libvpx-fabc7836956c8c48adf6c570957c7fddb7ec0253.zip
Fix an overflow issue in SSE2 forward ADST
The step that sums three input samples could cause the intermediate result to go beyond the 16-bit limit when operating as the second 1-D transform. This commit fixes the issue by subtracting the third sample in 32-bit precision after the multiplication instead of in 16-bit precision before it. Change-Id: Iaf512449ac2d25ddd8a806d760afab362c62a516
-rw-r--r--vp9/encoder/x86/vp9_dct_sse2.c5
1 files changed, 3 insertions, 2 deletions
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index dc115018e..fefca660d 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -206,12 +206,12 @@ void fadst4_1d_sse2(__m128i *in) {
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[8], v[8];
__m128i in7 = _mm_add_epi16(in[0], in[1]);
- in7 = _mm_sub_epi16(in7, in[3]);
u[0] = _mm_unpacklo_epi16(in[0], in[1]);
u[1] = _mm_unpacklo_epi16(in[2], in[3]);
u[2] = _mm_unpacklo_epi16(in7, kZero);
u[3] = _mm_unpacklo_epi16(in[2], kZero);
+ u[4] = _mm_unpacklo_epi16(in[3], kZero);
v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2
v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5
@@ -219,9 +219,10 @@ void fadst4_1d_sse2(__m128i *in) {
v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3
v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6
v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4
+ v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
u[0] = _mm_add_epi32(v[0], v[1]);
- u[1] = v[2];
+ u[1] = _mm_sub_epi32(v[2], v[6]);
u[2] = _mm_add_epi32(v[3], v[4]);
u[3] = _mm_sub_epi32(u[2], u[0]);
u[4] = _mm_slli_epi32(v[5], 2);