summaryrefslogtreecommitdiff
path: root/vp9/encoder/x86
diff options
context:
space:
mode:
authorJingning Han <jingning@google.com>2013-11-13 14:48:38 -0800
committerJingning Han <jingning@google.com>2013-11-13 15:15:59 -0800
commitfabc7836956c8c48adf6c570957c7fddb7ec0253 (patch)
tree445f33c39b3c2b4cb1f207e2e066856490734de6 /vp9/encoder/x86
parentb3c75a2d6c04213e729c8e4cc45ec7f45aa1c38f (diff)
downloadlibvpx-fabc7836956c8c48adf6c570957c7fddb7ec0253.tar
libvpx-fabc7836956c8c48adf6c570957c7fddb7ec0253.tar.gz
libvpx-fabc7836956c8c48adf6c570957c7fddb7ec0253.tar.bz2
libvpx-fabc7836956c8c48adf6c570957c7fddb7ec0253.zip
Fix an overflow issue in SSE2 forward ADST
The step that sums three input samples could potentially cause the intermediate result go beyond 16 bit limit, when operating as the second 1-D transform. This commit fixes the issue. Change-Id: Iaf512449ac2d25ddd8a806d760afab362c62a516
Diffstat (limited to 'vp9/encoder/x86')
-rw-r--r--vp9/encoder/x86/vp9_dct_sse2.c5
1 files changed, 3 insertions, 2 deletions
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index dc115018e..fefca660d 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -206,12 +206,12 @@ void fadst4_1d_sse2(__m128i *in) {
const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
__m128i u[8], v[8];
__m128i in7 = _mm_add_epi16(in[0], in[1]);
- in7 = _mm_sub_epi16(in7, in[3]);
u[0] = _mm_unpacklo_epi16(in[0], in[1]);
u[1] = _mm_unpacklo_epi16(in[2], in[3]);
u[2] = _mm_unpacklo_epi16(in7, kZero);
u[3] = _mm_unpacklo_epi16(in[2], kZero);
+ u[4] = _mm_unpacklo_epi16(in[3], kZero);
v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2
v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5
@@ -219,9 +219,10 @@ void fadst4_1d_sse2(__m128i *in) {
v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3
v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6
v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4
+ v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
u[0] = _mm_add_epi32(v[0], v[1]);
- u[1] = v[2];
+ u[1] = _mm_sub_epi32(v[2], v[6]);
u[2] = _mm_add_epi32(v[3], v[4]);
u[3] = _mm_sub_epi32(u[2], u[0]);
u[4] = _mm_slli_epi32(v[5], 2);