diff options
author | Jingning Han <jingning@google.com> | 2013-11-13 14:48:38 -0800 |
---|---|---|
committer | Jingning Han <jingning@google.com> | 2013-11-13 15:15:59 -0800 |
commit | fabc7836956c8c48adf6c570957c7fddb7ec0253 (patch) | |
tree | 445f33c39b3c2b4cb1f207e2e066856490734de6 | |
parent | b3c75a2d6c04213e729c8e4cc45ec7f45aa1c38f (diff) | |
download | libvpx-fabc7836956c8c48adf6c570957c7fddb7ec0253.tar libvpx-fabc7836956c8c48adf6c570957c7fddb7ec0253.tar.gz libvpx-fabc7836956c8c48adf6c570957c7fddb7ec0253.tar.bz2 libvpx-fabc7836956c8c48adf6c570957c7fddb7ec0253.zip |
Fix an overflow issue in SSE2 forward ADST
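The step that sums three input samples could cause the
intermediate result to exceed the 16-bit limit when operating as the
second 1-D transform. This commit fixes the issue by dropping the 16-bit subtraction of in[3] and instead subtracting its product with the sinpi_3_9 constant in 32-bit precision after the multiply.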
Change-Id: Iaf512449ac2d25ddd8a806d760afab362c62a516
-rw-r--r-- | vp9/encoder/x86/vp9_dct_sse2.c | 5 |
1 files changed, 3 insertions, 2 deletions
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c index dc115018e..fefca660d 100644 --- a/vp9/encoder/x86/vp9_dct_sse2.c +++ b/vp9/encoder/x86/vp9_dct_sse2.c @@ -206,12 +206,12 @@ void fadst4_1d_sse2(__m128i *in) { const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); __m128i u[8], v[8]; __m128i in7 = _mm_add_epi16(in[0], in[1]); - in7 = _mm_sub_epi16(in7, in[3]); u[0] = _mm_unpacklo_epi16(in[0], in[1]); u[1] = _mm_unpacklo_epi16(in[2], in[3]); u[2] = _mm_unpacklo_epi16(in7, kZero); u[3] = _mm_unpacklo_epi16(in[2], kZero); + u[4] = _mm_unpacklo_epi16(in[3], kZero); v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 @@ -219,9 +219,10 @@ void fadst4_1d_sse2(__m128i *in) { v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 + v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03); u[0] = _mm_add_epi32(v[0], v[1]); - u[1] = v[2]; + u[1] = _mm_sub_epi32(v[2], v[6]); u[2] = _mm_add_epi32(v[3], v[4]); u[3] = _mm_sub_epi32(u[2], u[0]); u[4] = _mm_slli_epi32(v[5], 2); |