diff options
author | Yunqing Wang <yunqingwang@google.com> | 2014-05-27 10:52:16 -0700 |
---|---|---|
committer | Gerrit Code Review <gerrit@gerrit.golo.chromium.org> | 2014-05-27 10:52:16 -0700 |
commit | a591ac9e5a2daa93ad7ae6c93db5ffb34ea818ec (patch) | |
tree | acc2e674aaa53e0351498d8790a02734fd6f3666 /vp9/common | |
parent | bf503e52369d63cd7f6f4b73a8f7f76380c84e19 (diff) | |
parent | 773596050f59eb46bb08b2dab986023e03de63da (diff) | |
download | libvpx-a591ac9e5a2daa93ad7ae6c93db5ffb34ea818ec.tar libvpx-a591ac9e5a2daa93ad7ae6c93db5ffb34ea818ec.tar.gz libvpx-a591ac9e5a2daa93ad7ae6c93db5ffb34ea818ec.tar.bz2 libvpx-a591ac9e5a2daa93ad7ae6c93db5ffb34ea818ec.zip |
Merge "Fix decoder mismatch in sub-pixel AVX2 intrinsic filters"
Diffstat (limited to 'vp9/common')
-rw-r--r-- | vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c | 32 |
1 files changed, 16 insertions, 16 deletions
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c index b84db970e..d109e136a 100644 --- a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c +++ b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c @@ -111,21 +111,21 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, // filter the source buffer srcRegFilt32b1_1= _mm256_shuffle_epi8(srcReg32b1, filt1Reg); - srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); + srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); // add and saturate the results together srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2); // filter the source buffer - srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt4Reg); + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b1, filt2Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b1, filt3Reg); // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together @@ -146,21 +146,21 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, // filter the source buffer srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg); - srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg); + srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters); - srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, secondFilters); + srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters); // add and saturate the results together srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2); // filter the source buffer - srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt4Reg); + srcRegFilt32b3= _mm256_shuffle_epi8(srcReg32b2, filt2Reg); srcRegFilt32b2= _mm256_shuffle_epi8(srcReg32b2, filt3Reg); // multiply 2 adjacent elements with the filter and add the result - srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, forthFilters); + srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together @@ -208,26 +208,26 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt2Reg)); + _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(secondFilters)); + _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); // filter the source buffer srcRegFilt3= _mm_shuffle_epi8(srcReg1, - _mm256_castsi256_si128(filt4Reg)); + _mm256_castsi256_si128(filt2Reg)); srcRegFilt2= _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, - _mm256_castsi256_si128(forthFilters)); + _mm256_castsi256_si128(secondFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); @@ -247,26 +247,26 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr, srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt2Reg)); + _mm256_castsi256_si128(filt4Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, - _mm256_castsi256_si128(secondFilters)); + _mm256_castsi256_si128(forthFilters)); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); // filter the source buffer srcRegFilt3 = _mm_shuffle_epi8(srcReg2, - _mm256_castsi256_si128(filt4Reg)); + _mm256_castsi256_si128(filt2Reg)); srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg)); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, - _mm256_castsi256_si128(forthFilters)); + _mm256_castsi256_si128(secondFilters)); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters)); |