Diffstat (limited to 'vp9')
-rw-r--r--  vp9/common/vp9_rtcd_defs.pl                |   3
-rw-r--r--  vp9/encoder/vp9_rdopt.c                    |  19
-rw-r--r--  vp9/encoder/x86/vp9_highbd_error_avx.asm   | 261
-rw-r--r--  vp9/encoder/x86/vp9_highbd_error_sse2.asm  |  98
-rw-r--r--  vp9/vp9cx.mk                               |   4
5 files changed, 1 insertion(+), 384 deletions(-)
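For orientation before the diff itself: every kernel removed here computes the same quantity, which the surviving vp9_block_error path now covers for bd == 8. Below is a minimal C sketch of that computation; the function name is illustrative, but the semantics follow the removed vp9_highbd_block_error_8bit_c wrapper, which simply forwarded to vp9_block_error_c.

#include <stddef.h>
#include <stdint.h>

typedef int32_t tran_low_t; /* 32-bit coefficients in the high-bitdepth build */

/* Returns the sum of squared differences between the original and the
 * dequantized coefficients; the sum of squared originals ("ssz") is
 * written through the out-parameter. */
static int64_t block_error_sketch(const tran_low_t *coeff,
                                  const tran_low_t *dqcoeff,
                                  intptr_t block_size, int64_t *ssz) {
  int64_t error = 0, sqcoeff = 0;
  intptr_t i;
  for (i = 0; i < block_size; ++i) {
    const int64_t diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
    sqcoeff += (int64_t)coeff[i] * coeff[i];
  }
  *ssz = sqcoeff;
  return error;
}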
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 57af79d5b..77bebc7b9 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -130,9 +130,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
   specialize qw/vp9_highbd_block_error sse2/;
 
-  add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-  specialize qw/vp9_highbd_block_error_8bit sse2 avx/;
-
   add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
   specialize qw/vp9_block_error_fp sse2/;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 0500e6025..1b82b29d4 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -284,22 +284,12 @@ int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
   return error;
 }
 
-int64_t vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
-                                      const tran_low_t *dqcoeff,
-                                      intptr_t block_size, int64_t *ssz) {
-  // Note that the C versions of these 2 functions (vp9_block_error and
-  // vp9_highbd_block_error_8bit are the same, but the optimized assembly
-  // routines are not compatible in the non high bitdepth configuration, so
-  // they still cannot share the same name.
-  return vp9_block_error_c(coeff, dqcoeff, block_size, ssz);
-}
-
 static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff,
                                                const tran_low_t *dqcoeff,
                                                intptr_t block_size,
                                                int64_t *ssz, int bd) {
   if (bd == 8) {
-    return vp9_highbd_block_error_8bit(coeff, dqcoeff, block_size, ssz);
+    return vp9_block_error(coeff, dqcoeff, block_size, ssz);
   } else {
     return vp9_highbd_block_error(coeff, dqcoeff, block_size, ssz, bd);
   }
@@ -1130,16 +1120,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
           ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
                                so->neighbors, cpi->sf.use_fast_coef_costing);
           tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0;
-#if CONFIG_VP9_HIGHBITDEPTH
-          distortion +=
-              vp9_highbd_block_error_8bit(
-                  coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &unused) >>
-              2;
-#else
           distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
                                         16, &unused) >>
                         2;
-#endif
           if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
             goto next;
           vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), dst,
diff --git a/vp9/encoder/x86/vp9_highbd_error_avx.asm b/vp9/encoder/x86/vp9_highbd_error_avx.asm
deleted file mode 100644
index e476323e1..000000000
--- a/vp9/encoder/x86/vp9_highbd_error_avx.asm
+++ /dev/null
@@ -1,261 +0,0 @@
-;
-; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-ALIGN 16
-
-;
-; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
-;                                     intptr_t block_size, int64_t *ssz)
-;
-
-INIT_XMM avx
-cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
-  vzeroupper
-
-  ; If only one iteration is required, then handle this as a special case.
-  ; It is the most frequent case, so we can have a significant gain here
-  ; by not setting up a loop and accumulators.
-  cmp sizeq, 16
-  jne .generic
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Common case of size == 16
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-  ; Load input vectors
-  mova xm0, [dqcq]
-  packssdw xm0, [dqcq+16]
-  mova xm2, [uqcq]
-  packssdw xm2, [uqcq+16]
-
-  mova xm1, [dqcq+32]
-  packssdw xm1, [dqcq+48]
-  mova xm3, [uqcq+32]
-  packssdw xm3, [uqcq+48]
-
-  ; Compute the errors.
-  psubw xm0, xm2
-  psubw xm1, xm3
-
-  ; Individual errors are max 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
-  pmaddwd xm2, xm2
-  pmaddwd xm3, xm3
-
-  pmaddwd xm0, xm0
-  pmaddwd xm1, xm1
-
-  ; Squares are always positive, so we can use unsigned arithmetic after
-  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
-  ; fit in 32bits
-  paddd xm2, xm3
-  paddd xm0, xm1
-
-  ; Accumulate horizontally in 64 bits, there is no chance of overflow here
-  pxor xm5, xm5
-
-  pblendw xm3, xm5, xm2, 0x33 ; Zero extended low of a pair of 32 bits
-  psrlq xm2, 32               ; Zero extended high of a pair of 32 bits
-
-  pblendw xm1, xm5, xm0, 0x33 ; Zero extended low of a pair of 32 bits
-  psrlq xm0, 32               ; Zero extended high of a pair of 32 bits
-
-  paddq xm2, xm3
-  paddq xm0, xm1
-
-  psrldq xm3, xm2, 8
-  psrldq xm1, xm0, 8
-
-  paddq xm2, xm3
-  paddq xm0, xm1
-
-  ; Store the return value
-%if ARCH_X86_64
-  movq rax, xm0
-  movq [sszq], xm2
-%else
-  movd eax, xm0
-  pextrd edx, xm0, 1
-  movq [sszd], xm2
-%endif
-  RET
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Generic case of size != 16, speculative low precision
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ALIGN 16
-.generic:
-  pxor xm4, xm4 ; sse accumulator
-  pxor xm5, xm5 ; overflow detection register for xm4
-  pxor xm6, xm6 ; ssz accumulator
-  pxor xm7, xm7 ; overflow detection register for xm6
-  lea uqcq, [uqcq+sizeq*4]
-  lea dqcq, [dqcq+sizeq*4]
-  neg sizeq
-
-  ; Push the negative size as the high precision code might need it
-  push sizeq
-
-.loop:
-  ; Load input vectors
-  mova xm0, [dqcq+sizeq*4]
-  packssdw xm0, [dqcq+sizeq*4+16]
-  mova xm2, [uqcq+sizeq*4]
-  packssdw xm2, [uqcq+sizeq*4+16]
-
-  mova xm1, [dqcq+sizeq*4+32]
-  packssdw xm1, [dqcq+sizeq*4+48]
-  mova xm3, [uqcq+sizeq*4+32]
-  packssdw xm3, [uqcq+sizeq*4+48]
-
-  add sizeq, 16
-
-  ; Compute the squared errors.
-  ; Individual errors are max 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
-  psubw xm0, xm2
-  pmaddwd xm2, xm2
-  pmaddwd xm0, xm0
-
-  psubw xm1, xm3
-  pmaddwd xm3, xm3
-  pmaddwd xm1, xm1
-
-  ; Squares are always positive, so we can use unsigned arithmetic after
-  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
-  ; fit in 32bits
-  paddd xm2, xm3
-  paddd xm0, xm1
-
-  ; We accumulate using 32 bit arithmetic, but detect potential overflow
-  ; by checking if the MSB of the accumulators have ever been a set bit.
-  ; If yes, we redo the whole compute at the end on higher precision, but
-  ; this happens extremely rarely, so we still achieve a net gain.
-  paddd xm4, xm0
-  paddd xm6, xm2
-  por xm5, xm4 ; OR in the accumulator for overflow detection
-  por xm7, xm6 ; OR in the accumulator for overflow detection
-
-  jnz .loop
-
-  ; Add pairs horizontally (still only on 32 bits)
-  phaddd xm4, xm4
-  por xm5, xm4 ; OR in the accumulator for overflow detection
-  phaddd xm6, xm6
-  por xm7, xm6 ; OR in the accumulator for overflow detection
-
-  ; Check for possibility of overflow by testing if bit 32 of each dword lane
-  ; have ever been set. If they were not, then there was no overflow and the
-  ; final sum will fit in 32 bits. If overflow happened, then
-  ; we redo the whole computation on higher precision.
-  por xm7, xm5
-  pmovmskb r4, xm7
-  test r4, 0x8888
-  jnz .highprec
-
-  phaddd xm4, xm4
-  phaddd xm6, xm6
-  pmovzxdq xm4, xm4
-  pmovzxdq xm6, xm6
-
-  ; Restore stack
-  pop sizeq
-
-  ; Store the return value
-%if ARCH_X86_64
-  movq rax, xm4
-  movq [sszq], xm6
-%else
-  movd eax, xm4
-  pextrd edx, xm4, 1
-  movq [sszd], xm6
-%endif
-  RET
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Generic case of size != 16, high precision case
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-.highprec:
-  pxor xm4, xm4 ; sse accumulator
-  pxor xm5, xm5 ; dedicated zero register
-  pxor xm6, xm6 ; ssz accumulator
-  pop sizeq
-
-.loophp:
-  mova xm0, [dqcq+sizeq*4]
-  packssdw xm0, [dqcq+sizeq*4+16]
-  mova xm2, [uqcq+sizeq*4]
-  packssdw xm2, [uqcq+sizeq*4+16]
-
-  mova xm1, [dqcq+sizeq*4+32]
-  packssdw xm1, [dqcq+sizeq*4+48]
-  mova xm3, [uqcq+sizeq*4+32]
-  packssdw xm3, [uqcq+sizeq*4+48]
-
-  add sizeq, 16
-
-  ; individual errors are max. 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
-
-  psubw xm0, xm2
-  pmaddwd xm2, xm2
-  pmaddwd xm0, xm0
-
-  psubw xm1, xm3
-  pmaddwd xm3, xm3
-  pmaddwd xm1, xm1
-
-  ; accumulate in 64bit
-  punpckldq xm7, xm0, xm5
-  punpckhdq xm0, xm5
-  paddq xm4, xm7
-
-  punpckldq xm7, xm2, xm5
-  punpckhdq xm2, xm5
-  paddq xm6, xm7
-
-  punpckldq xm7, xm1, xm5
-  punpckhdq xm1, xm5
-  paddq xm4, xm7
-
-  punpckldq xm7, xm3, xm5
-  punpckhdq xm3, xm5
-  paddq xm6, xm7
-
-  paddq xm4, xm0
-  paddq xm4, xm1
-  paddq xm6, xm2
-  paddq xm6, xm3
-
-  jnz .loophp
-
-  ; Accumulate horizontally
-  movhlps xm5, xm4
-  movhlps xm7, xm6
-  paddq xm4, xm5
-  paddq xm6, xm7
-
-  ; Store the return value
-%if ARCH_X86_64
-  movq rax, xm4
-  movq [sszq], xm6
-%else
-  movd eax, xm4
-  pextrd edx, xm4, 1
-  movq [sszd], xm6
-%endif
-  RET
-
-END
diff --git a/vp9/encoder/x86/vp9_highbd_error_sse2.asm b/vp9/encoder/x86/vp9_highbd_error_sse2.asm
deleted file mode 100644
index f3b8f0194..000000000
--- a/vp9/encoder/x86/vp9_highbd_error_sse2.asm
+++ /dev/null
@@ -1,98 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-ALIGN 16
-
-;
-; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
-;                                     intptr_t block_size, int64_t *ssz)
-;
-
-INIT_XMM sse2
-cglobal highbd_block_error_8bit, 3, 3, 8, uqc, dqc, size, ssz
-  pxor m4, m4 ; sse accumulator
-  pxor m6, m6 ; ssz accumulator
-  pxor m5, m5 ; dedicated zero register
-  lea uqcq, [uqcq+sizeq*4]
-  lea dqcq, [dqcq+sizeq*4]
-  neg sizeq
-
-  ALIGN 16
-
-.loop:
-  mova m0, [dqcq+sizeq*4]
-  packssdw m0, [dqcq+sizeq*4+mmsize]
-  mova m2, [uqcq+sizeq*4]
-  packssdw m2, [uqcq+sizeq*4+mmsize]
-
-  mova m1, [dqcq+sizeq*4+mmsize*2]
-  packssdw m1, [dqcq+sizeq*4+mmsize*3]
-  mova m3, [uqcq+sizeq*4+mmsize*2]
-  packssdw m3, [uqcq+sizeq*4+mmsize*3]
-
-  add sizeq, mmsize
-
-  ; individual errors are max. 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
-
-  psubw m0, m2
-  pmaddwd m2, m2
-  pmaddwd m0, m0
-
-  psubw m1, m3
-  pmaddwd m3, m3
-  pmaddwd m1, m1
-
-  ; accumulate in 64bit
-  punpckldq m7, m0, m5
-  punpckhdq m0, m5
-  paddq m4, m7
-
-  punpckldq m7, m2, m5
-  punpckhdq m2, m5
-  paddq m6, m7
-
-  punpckldq m7, m1, m5
-  punpckhdq m1, m5
-  paddq m4, m7
-
-  punpckldq m7, m3, m5
-  punpckhdq m3, m5
-  paddq m6, m7
-
-  paddq m4, m0
-  paddq m4, m1
-  paddq m6, m2
-  paddq m6, m3
-
-  jnz .loop
-
-  ; accumulate horizontally and store in return value
-  movhlps m5, m4
-  movhlps m7, m6
-  paddq m4, m5
-  paddq m6, m7
-
-%if ARCH_X86_64
-  movq rax, m4
-  movq [sszq], m6
-%else
-  mov eax, sszm
-  pshufd m5, m4, 0x1
-  movq [eax], m6
-  movd eax, m4
-  movd edx, m5
-%endif
-  RET
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index a54e99e2c..e73535543 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -108,10 +108,6 @@ endif
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
-VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
-endif
 ifeq ($(ARCH_X86_64),yes)
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
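A note on the deleted AVX path's central trick, since it disappears with this commit: the generic loop accumulates in 32 bits and ORs every partial sum into a sticky register, redoing the whole sum in 64 bits only if bit 31 was ever set. A scalar C sketch of the same idea (names illustrative, not part of the libvpx API; assumes each addend fits in 31 bits, as the asm comments argue for the paired squares):

#include <stddef.h>
#include <stdint.h>

/* Accumulate squared differences in 32 bits, tracking in "sticky" whether
 * the MSB of the running sum was ever set. Since each addend is below 2^31,
 * the sum cannot wrap without the MSB being observed first, so a clear
 * sticky MSB proves the cheap result is exact. Returns 1 if *out is
 * trustworthy, 0 if the caller must redo the sum in 64 bits. */
static int sum_squares_lowprec(const int16_t *diff, size_t n, uint32_t *out) {
  uint32_t acc = 0, sticky = 0;
  size_t i;
  for (i = 0; i < n; ++i) {
    acc += (uint32_t)((int32_t)diff[i] * diff[i]); /* square fits in 30 bits */
    sticky |= acc; /* remembers whether bit 31 was ever set */
  }
  *out = acc;
  return (sticky & 0x80000000u) == 0;
}

In the deleted asm, pmovmskb plus "test r4, 0x8888" performs this per-lane MSB check on the OR-ed accumulators, and .highprec is the 64-bit fallback loop.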