diff options
author | Johann <johannkoenig@google.com> | 2017-02-15 17:17:45 -0800 |
---|---|---|
committer | Johann Koenig <johannkoenig@google.com> | 2017-02-16 15:02:48 +0000 |
commit | 44600442dca48a5586105b01403ac03fad54d05b (patch) | |
tree | 839959ff4937f9527f1e641159890bf512669bdc | |
parent | 60a10116d13c4394bed943e0213631735d03bb74 (diff) | |
download | libvpx-44600442dca48a5586105b01403ac03fad54d05b.tar libvpx-44600442dca48a5586105b01403ac03fad54d05b.tar.gz libvpx-44600442dca48a5586105b01403ac03fad54d05b.tar.bz2 libvpx-44600442dca48a5586105b01403ac03fad54d05b.zip |
bitdepth conversion: really use num elements
The previous implementation confused bits, bytes, and elements. It was using
'32' as the multiplier, but that value was adopted by mistake because a 32x32
transform embedded the stride.
Change-Id: Ieeb867a332416b9a40580b5e7c9b20088e9e691a
-rw-r--r-- | vp9/encoder/x86/vp9_dct_sse2.asm | 2 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_error_sse2.asm | 4 | ||||
-rw-r--r-- | vpx_dsp/x86/avg_ssse3_x86_64.asm | 16 | ||||
-rw-r--r-- | vpx_dsp/x86/bitdepth_conversion_sse2.asm | 16 | ||||
-rw-r--r-- | vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm | 32 | ||||
-rw-r--r-- | vpx_dsp/x86/inv_wht_sse2.asm | 2 |
6 files changed, 36 insertions, 36 deletions
diff --git a/vp9/encoder/x86/vp9_dct_sse2.asm b/vp9/encoder/x86/vp9_dct_sse2.asm index e24cabba0..8152dce86 100644 --- a/vp9/encoder/x86/vp9_dct_sse2.asm +++ b/vp9/encoder/x86/vp9_dct_sse2.asm @@ -64,6 +64,6 @@ cglobal fwht4x4, 3, 4, 8, input, output, stride psllw m1, 2 STORE_TRAN_LOW 0, outputq, 0, 2, 3 - STORE_TRAN_LOW 1, outputq, 1, 2, 3 + STORE_TRAN_LOW 1, outputq, 8, 2, 3 RET diff --git a/vp9/encoder/x86/vp9_error_sse2.asm b/vp9/encoder/x86/vp9_error_sse2.asm index dcedf913d..5186d3087 100644 --- a/vp9/encoder/x86/vp9_error_sse2.asm +++ b/vp9/encoder/x86/vp9_error_sse2.asm @@ -91,8 +91,8 @@ cglobal block_error_fp, 3, 3, 6, uqc, dqc, size .loop: LOAD_TRAN_LOW 2, uqcq, 0 LOAD_TRAN_LOW 0, dqcq, 0 - LOAD_TRAN_LOW 3, uqcq, 1 - LOAD_TRAN_LOW 1, dqcq, 1 + LOAD_TRAN_LOW 3, uqcq, 8 + LOAD_TRAN_LOW 1, dqcq, 8 INCREMENT_ELEMENTS_TRAN_LOW uqcq, 16 INCREMENT_ELEMENTS_TRAN_LOW dqcq, 16 sub sizeq, 16 diff --git a/vpx_dsp/x86/avg_ssse3_x86_64.asm b/vpx_dsp/x86/avg_ssse3_x86_64.asm index 4b486c15b..22e0a086c 100644 --- a/vpx_dsp/x86/avg_ssse3_x86_64.asm +++ b/vpx_dsp/x86/avg_ssse3_x86_64.asm @@ -117,14 +117,14 @@ cglobal hadamard_8x8, 3, 5, 11, input, stride, output TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10 HMD8_1D - STORE_TRAN_LOW 0, outputq, 0, 8, 9 - STORE_TRAN_LOW 1, outputq, 1, 8, 9 - STORE_TRAN_LOW 2, outputq, 2, 8, 9 - STORE_TRAN_LOW 3, outputq, 3, 8, 9 - STORE_TRAN_LOW 4, outputq, 4, 8, 9 - STORE_TRAN_LOW 5, outputq, 5, 8, 9 - STORE_TRAN_LOW 6, outputq, 6, 8, 9 - STORE_TRAN_LOW 7, outputq, 7, 8, 9 + STORE_TRAN_LOW 0, outputq, 0, 8, 9 + STORE_TRAN_LOW 1, outputq, 8, 8, 9 + STORE_TRAN_LOW 2, outputq, 16, 8, 9 + STORE_TRAN_LOW 3, outputq, 24, 8, 9 + STORE_TRAN_LOW 4, outputq, 32, 8, 9 + STORE_TRAN_LOW 5, outputq, 40, 8, 9 + STORE_TRAN_LOW 6, outputq, 48, 8, 9 + STORE_TRAN_LOW 7, outputq, 56, 8, 9 RET %endif diff --git a/vpx_dsp/x86/bitdepth_conversion_sse2.asm b/vpx_dsp/x86/bitdepth_conversion_sse2.asm index b2df5207a..2bcbc0ac1 100644 --- 
a/vpx_dsp/x86/bitdepth_conversion_sse2.asm +++ b/vpx_dsp/x86/bitdepth_conversion_sse2.asm @@ -32,21 +32,21 @@ %endmacro ; Load %2 + %3 into m%1. -; %3 is the offset in elements, not bits. +; %3 is the offset in elements, not bytes. ; If tran_low_t is 16 bits (low bit depth configuration) then load the value ; directly. If tran_low_t is 32 bits (high bit depth configuration) then pack ; the values down to 16 bits. %macro LOAD_TRAN_LOW 3 %if CONFIG_VP9_HIGHBITDEPTH - mova m%1, [%2 + %3 * 32] - packssdw m%1, [%2 + %3 * 32 + 16] + mova m%1, [%2 + %3 * 4] + packssdw m%1, [%2 + %3 * 4 + 16] %else - mova m%1, [%2 + %3 * 16] + mova m%1, [%2 + %3 * 2] %endif %endmacro ; Store m%1 to %2 + %3. -; %3 is the offset in elements, not bits. +; %3 is the offset in elements, not bytes. ; If tran_low_t is 16 bits (low bit depth configuration) then store the value ; directly. If tran_low_t is 32 bits (high bit depth configuration) then sign ; extend the values first. @@ -58,9 +58,9 @@ pcmpgtw m%4, m%1 punpcklwd m%5, m%4 punpckhwd m%1, m%4 - mova [%2 + %3 * 32 + 0], m%5 - mova [%2 + %3 * 32 + 16], m%1 + mova [%2 + %3 * 4 + 0], m%5 + mova [%2 + %3 * 4 + 16], m%1 %else - mova [%2 + %3 * 16], m%1 + mova [%2 + %3 * 2], m%1 %endif %endmacro diff --git a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm index 77bfe86cf..a3e0b7fc5 100644 --- a/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm +++ b/vpx_dsp/x86/inv_txfm_ssse3_x86_64.asm @@ -984,14 +984,14 @@ idct32x32_135: mov r7, 2 idct32x32_135_transpose: - LOAD_TRAN_LOW 0, r3, 0 - LOAD_TRAN_LOW 1, r3, 4 - LOAD_TRAN_LOW 2, r3, 8 - LOAD_TRAN_LOW 3, r3, 12 - LOAD_TRAN_LOW 4, r3, 16 - LOAD_TRAN_LOW 5, r3, 20 - LOAD_TRAN_LOW 6, r3, 24 - LOAD_TRAN_LOW 7, r3, 28 + LOAD_TRAN_LOW 0, r3, 0 + LOAD_TRAN_LOW 1, r3, 32 + LOAD_TRAN_LOW 2, r3, 64 + LOAD_TRAN_LOW 3, r3, 96 + LOAD_TRAN_LOW 4, r3, 128 + LOAD_TRAN_LOW 5, r3, 160 + LOAD_TRAN_LOW 6, r3, 192 + LOAD_TRAN_LOW 7, r3, 224 TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10 @@ -1422,14 +1422,14 
@@ idct32x32_1024: mov r7, 4 idct32x32_1024_transpose: - LOAD_TRAN_LOW 0, r3, 0 - LOAD_TRAN_LOW 1, r3, 4 - LOAD_TRAN_LOW 2, r3, 8 - LOAD_TRAN_LOW 3, r3, 12 - LOAD_TRAN_LOW 4, r3, 16 - LOAD_TRAN_LOW 5, r3, 20 - LOAD_TRAN_LOW 6, r3, 24 - LOAD_TRAN_LOW 7, r3, 28 + LOAD_TRAN_LOW 0, r3, 0 + LOAD_TRAN_LOW 1, r3, 32 + LOAD_TRAN_LOW 2, r3, 64 + LOAD_TRAN_LOW 3, r3, 96 + LOAD_TRAN_LOW 4, r3, 128 + LOAD_TRAN_LOW 5, r3, 160 + LOAD_TRAN_LOW 6, r3, 192 + LOAD_TRAN_LOW 7, r3, 224 TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9, 10 diff --git a/vpx_dsp/x86/inv_wht_sse2.asm b/vpx_dsp/x86/inv_wht_sse2.asm index a9c52dc3d..bcf1a6ef9 100644 --- a/vpx_dsp/x86/inv_wht_sse2.asm +++ b/vpx_dsp/x86/inv_wht_sse2.asm @@ -84,7 +84,7 @@ SECTION .text INIT_XMM sse2 cglobal iwht4x4_16_add, 3, 3, 7, input, output, stride LOAD_TRAN_LOW 0, inputq, 0 - LOAD_TRAN_LOW 1, inputq, 1 + LOAD_TRAN_LOW 1, inputq, 8 psraw m0, 2 psraw m1, 2 |