Diffstat (limited to 'vp9')
-rw-r--r--  vp9/common/vp9_rtcd_defs.pl                  3
-rw-r--r--  vp9/encoder/vp9_rdopt.c                     19
-rw-r--r--  vp9/encoder/x86/vp9_highbd_error_avx.asm   261
-rw-r--r--  vp9/encoder/x86/vp9_highbd_error_sse2.asm   98
-rw-r--r--  vp9/vp9cx.mk                                 4
5 files changed, 1 insertion, 384 deletions
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 57af79d5b..77bebc7b9 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -130,9 +130,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
specialize qw/vp9_highbd_block_error sse2/;
- add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
- specialize qw/vp9_highbd_block_error_8bit sse2 avx/;
-
add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
specialize qw/vp9_block_error_fp sse2/;
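
Note: the hunk above drops the add_proto/specialize pair for vp9_highbd_block_error_8bit, so libvpx's run-time CPU detection (RTCD) generator no longer emits dispatch glue for that symbol; that is why the vp9_rdopt.c callers below switch to vp9_block_error. As a rough, hypothetical sketch of what one add_proto/specialize pair expands to (the real generated vp9_rtcd.h differs in detail), each specialized function becomes a pointer that setup code retargets from the C fallback to the best available SIMD version:

    #include <stdint.h>

    /* Hypothetical sketch of RTCD dispatch glue; illustrative only. */
    typedef int32_t tran_low_t;  /* int32_t in high-bitdepth builds */
    #define HAS_SSE2 0x04        /* illustrative CPU-feature flag value */

    int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
                                     const tran_low_t *dqcoeff,
                                     intptr_t block_size, int64_t *ssz, int bd);
    int64_t vp9_highbd_block_error_sse2(const tran_low_t *coeff,
                                        const tran_low_t *dqcoeff,
                                        intptr_t block_size, int64_t *ssz, int bd);

    /* The public name resolves through a pointer filled in at start-up. */
    int64_t (*vp9_highbd_block_error)(const tran_low_t *coeff,
                                      const tran_low_t *dqcoeff,
                                      intptr_t block_size, int64_t *ssz, int bd);

    static void setup_rtcd_internal(int flags) {
      vp9_highbd_block_error = vp9_highbd_block_error_c;  /* portable default */
      if (flags & HAS_SSE2)
        vp9_highbd_block_error = vp9_highbd_block_error_sse2;
    }
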
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 0500e6025..1b82b29d4 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -284,22 +284,12 @@ int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
return error;
}
-int64_t vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
- const tran_low_t *dqcoeff,
- intptr_t block_size, int64_t *ssz) {
- // Note that the C versions of these 2 functions (vp9_block_error and
- // vp9_highbd_block_error_8bit) are the same, but the optimized assembly
- // routines are not compatible in the non high bitdepth configuration, so
- // they still cannot share the same name.
- return vp9_block_error_c(coeff, dqcoeff, block_size, ssz);
-}
-
static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff,
const tran_low_t *dqcoeff,
intptr_t block_size,
int64_t *ssz, int bd) {
if (bd == 8) {
- return vp9_highbd_block_error_8bit(coeff, dqcoeff, block_size, ssz);
+ return vp9_block_error(coeff, dqcoeff, block_size, ssz);
} else {
return vp9_highbd_block_error(coeff, dqcoeff, block_size, ssz, bd);
}
@@ -1130,16 +1120,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
so->neighbors, cpi->sf.use_fast_coef_costing);
tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0;
-#if CONFIG_VP9_HIGHBITDEPTH
- distortion +=
- vp9_highbd_block_error_8bit(
- coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &unused) >>
- 2;
-#else
distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
16, &unused) >>
2;
-#endif
if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
goto next;
vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), dst,
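
Note: with vp9_highbd_block_error_8bit_c removed, both vp9_rdopt.c hunks fall back to vp9_block_error, which, as the deleted comment itself notes, computes the identical result in C; only the optimized routines differed, and the remaining vp9_block_error specializations evidently handle the 32-bit tran_low_t layout in high-bitdepth builds, making the _8bit variant redundant. For reference, a minimal C model of the shared contract (trimmed from vp9_block_error_c; treat it as illustrative rather than the exact upstream source):

    #include <stdint.h>

    typedef int32_t tran_low_t;  /* int32_t in high-bitdepth builds */

    /* Returns the sum of squared differences between original and
     * dequantized coefficients; writes the sum of squared originals
     * to ssz as a side output. */
    int64_t block_error_model(const tran_low_t *coeff,
                              const tran_low_t *dqcoeff,
                              intptr_t block_size, int64_t *ssz) {
      int64_t error = 0, sqcoeff = 0;
      for (intptr_t i = 0; i < block_size; i++) {
        const int diff = coeff[i] - dqcoeff[i];
        error += (int64_t)diff * diff;
        sqcoeff += (int64_t)coeff[i] * coeff[i];
      }
      *ssz = sqcoeff;
      return error;
    }

The ssz side output is what lets the caller price the skip-this-block alternative without a second pass over the coefficients.
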
diff --git a/vp9/encoder/x86/vp9_highbd_error_avx.asm b/vp9/encoder/x86/vp9_highbd_error_avx.asm
deleted file mode 100644
index e476323e1..000000000
--- a/vp9/encoder/x86/vp9_highbd_error_avx.asm
+++ /dev/null
@@ -1,261 +0,0 @@
-;
-; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-ALIGN 16
-
-;
-; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
-; intptr_t block_size, int64_t *ssz)
-;
-
-INIT_XMM avx
-cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
- vzeroupper
-
- ; If only one iteration is required, then handle this as a special case.
- ; It is the most frequent case, so we can have a significant gain here
- ; by not setting up a loop and accumulators.
- cmp sizeq, 16
- jne .generic
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;; Common case of size == 16
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
- ; Load input vectors
- mova xm0, [dqcq]
- packssdw xm0, [dqcq+16]
- mova xm2, [uqcq]
- packssdw xm2, [uqcq+16]
-
- mova xm1, [dqcq+32]
- packssdw xm1, [dqcq+48]
- mova xm3, [uqcq+32]
- packssdw xm3, [uqcq+48]
-
- ; Compute the errors.
- psubw xm0, xm2
- psubw xm1, xm3
-
- ; Individual errors are max 15bit+sign, so squares are 30bit, and
- ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
- pmaddwd xm2, xm2
- pmaddwd xm3, xm3
-
- pmaddwd xm0, xm0
- pmaddwd xm1, xm1
-
- ; Squares are always positive, so we can use unsigned arithmetic after
- ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
- ; fit in 32bits
- paddd xm2, xm3
- paddd xm0, xm1
-
- ; Accumulate horizontally in 64 bits, there is no chance of overflow here
- pxor xm5, xm5
-
- pblendw xm3, xm5, xm2, 0x33 ; Zero extended low of a pair of 32 bits
- psrlq xm2, 32 ; Zero extended high of a pair of 32 bits
-
- pblendw xm1, xm5, xm0, 0x33 ; Zero extended low of a pair of 32 bits
- psrlq xm0, 32 ; Zero extended high of a pair of 32 bits
-
- paddq xm2, xm3
- paddq xm0, xm1
-
- psrldq xm3, xm2, 8
- psrldq xm1, xm0, 8
-
- paddq xm2, xm3
- paddq xm0, xm1
-
- ; Store the return value
-%if ARCH_X86_64
- movq rax, xm0
- movq [sszq], xm2
-%else
- movd eax, xm0
- pextrd edx, xm0, 1
- movq [sszd], xm2
-%endif
- RET
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;; Generic case of size != 16, speculative low precision
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ALIGN 16
-.generic:
- pxor xm4, xm4 ; sse accumulator
- pxor xm5, xm5 ; overflow detection register for xm4
- pxor xm6, xm6 ; ssz accumulator
- pxor xm7, xm7 ; overflow detection register for xm6
- lea uqcq, [uqcq+sizeq*4]
- lea dqcq, [dqcq+sizeq*4]
- neg sizeq
-
- ; Push the negative size as the high precision code might need it
- push sizeq
-
-.loop:
- ; Load input vectors
- mova xm0, [dqcq+sizeq*4]
- packssdw xm0, [dqcq+sizeq*4+16]
- mova xm2, [uqcq+sizeq*4]
- packssdw xm2, [uqcq+sizeq*4+16]
-
- mova xm1, [dqcq+sizeq*4+32]
- packssdw xm1, [dqcq+sizeq*4+48]
- mova xm3, [uqcq+sizeq*4+32]
- packssdw xm3, [uqcq+sizeq*4+48]
-
- add sizeq, 16
-
- ; Compute the squared errors.
- ; Individual errors are max 15bit+sign, so squares are 30bit, and
- ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
- psubw xm0, xm2
- pmaddwd xm2, xm2
- pmaddwd xm0, xm0
-
- psubw xm1, xm3
- pmaddwd xm3, xm3
- pmaddwd xm1, xm1
-
- ; Squares are always positive, so we can use unsigned arithmetic after
- ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
- ; fit in 32bits
- paddd xm2, xm3
- paddd xm0, xm1
-
- ; We accumulate using 32 bit arithmetic, but detect potential overflow
- ; by checking if the MSB of the accumulators have ever been a set bit.
- ; If yes, we redo the whole compute at the end on higher precision, but
- ; this happens extremely rarely, so we still achieve a net gain.
- paddd xm4, xm0
- paddd xm6, xm2
- por xm5, xm4 ; OR in the accumulator for overflow detection
- por xm7, xm6 ; OR in the accumulator for overflow detection
-
- jnz .loop
-
- ; Add pairs horizontally (still only on 32 bits)
- phaddd xm4, xm4
- por xm5, xm4 ; OR in the accumulator for overflow detection
- phaddd xm6, xm6
- por xm7, xm6 ; OR in the accumulator for overflow detection
-
- ; Check for possibility of overflow by testing if bit 32 of each dword lane
- ; have ever been set. If they were not, then there was no overflow and the
- ; final sum will fit in 32 bits. If overflow happened, then
- ; we redo the whole computation on higher precision.
- por xm7, xm5
- pmovmskb r4, xm7
- test r4, 0x8888
- jnz .highprec
-
- phaddd xm4, xm4
- phaddd xm6, xm6
- pmovzxdq xm4, xm4
- pmovzxdq xm6, xm6
-
- ; Restore stack
- pop sizeq
-
- ; Store the return value
-%if ARCH_X86_64
- movq rax, xm4
- movq [sszq], xm6
-%else
- movd eax, xm4
- pextrd edx, xm4, 1
- movq [sszd], xm6
-%endif
- RET
-
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
- ;; Generic case of size != 16, high precision case
- ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-.highprec:
- pxor xm4, xm4 ; sse accumulator
- pxor xm5, xm5 ; dedicated zero register
- pxor xm6, xm6 ; ssz accumulator
- pop sizeq
-
-.loophp:
- mova xm0, [dqcq+sizeq*4]
- packssdw xm0, [dqcq+sizeq*4+16]
- mova xm2, [uqcq+sizeq*4]
- packssdw xm2, [uqcq+sizeq*4+16]
-
- mova xm1, [dqcq+sizeq*4+32]
- packssdw xm1, [dqcq+sizeq*4+48]
- mova xm3, [uqcq+sizeq*4+32]
- packssdw xm3, [uqcq+sizeq*4+48]
-
- add sizeq, 16
-
- ; individual errors are max. 15bit+sign, so squares are 30bit, and
- ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
-
- psubw xm0, xm2
- pmaddwd xm2, xm2
- pmaddwd xm0, xm0
-
- psubw xm1, xm3
- pmaddwd xm3, xm3
- pmaddwd xm1, xm1
-
- ; accumulate in 64bit
- punpckldq xm7, xm0, xm5
- punpckhdq xm0, xm5
- paddq xm4, xm7
-
- punpckldq xm7, xm2, xm5
- punpckhdq xm2, xm5
- paddq xm6, xm7
-
- punpckldq xm7, xm1, xm5
- punpckhdq xm1, xm5
- paddq xm4, xm7
-
- punpckldq xm7, xm3, xm5
- punpckhdq xm3, xm5
- paddq xm6, xm7
-
- paddq xm4, xm0
- paddq xm4, xm1
- paddq xm6, xm2
- paddq xm6, xm3
-
- jnz .loophp
-
- ; Accumulate horizontally
- movhlps xm5, xm4
- movhlps xm7, xm6
- paddq xm4, xm5
- paddq xm6, xm7
-
- ; Store the return value
-%if ARCH_X86_64
- movq rax, xm4
- movq [sszq], xm6
-%else
- movd eax, xm4
- pextrd edx, xm4, 1
- movq [sszd], xm6
-%endif
- RET
-
-END
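
Note: the most interesting part of the deleted AVX file is its speculative low-precision loop. It accumulates in 32 bits, ORs every intermediate accumulator value into a shadow register, and checks once at the end whether any dword sign bit was ever set (pmovmskb yields one mask bit per byte, so 0x8888 selects bytes 3, 7, 11 and 15, the dword MSBs). If the test is clean, no 32-bit addition can have wrapped and the cheap result stands; otherwise everything is redone in the 64-bit .highprec loop. (The jnz after the por works because SSE/AVX instructions leave EFLAGS untouched; the flags still come from the earlier add sizeq, 16.) A scalar model of the trick (hypothetical illustration, not the SIMD code):

    #include <stdint.h>

    /* Hypothetical scalar model of the AVX routine's overflow bet.  Each
     * square of a 15-bit+sign value is at most 30 bits, far below 2^31;
     * if the accumulator's MSB is never set, no intermediate sum reached
     * 2^31 and therefore no 32-bit addition wrapped. */
    static int sum_squares_lowprec(const int16_t *diff, int n, uint32_t *sum) {
      uint32_t acc = 0, seen = 0;
      for (int i = 0; i < n; i++) {
        const int d = diff[i];
        acc += (uint32_t)(d * d);  /* squares are non-negative */
        seen |= acc;               /* latch every MSB sighting (the por) */
      }
      *sum = acc;
      return (seen & 0x80000000u) == 0;  /* 1: result is trustworthy,
                                            0: redo the sum in 64 bits */
    }

As the deleted comments note, the fallback triggers extremely rarely, so the cheap pass is a net win even counting the occasional full recompute.
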
diff --git a/vp9/encoder/x86/vp9_highbd_error_sse2.asm b/vp9/encoder/x86/vp9_highbd_error_sse2.asm
deleted file mode 100644
index f3b8f0194..000000000
--- a/vp9/encoder/x86/vp9_highbd_error_sse2.asm
+++ /dev/null
@@ -1,98 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-ALIGN 16
-
-;
-; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
-; intptr_t block_size, int64_t *ssz)
-;
-
-INIT_XMM sse2
-cglobal highbd_block_error_8bit, 3, 3, 8, uqc, dqc, size, ssz
- pxor m4, m4 ; sse accumulator
- pxor m6, m6 ; ssz accumulator
- pxor m5, m5 ; dedicated zero register
- lea uqcq, [uqcq+sizeq*4]
- lea dqcq, [dqcq+sizeq*4]
- neg sizeq
-
- ALIGN 16
-
-.loop:
- mova m0, [dqcq+sizeq*4]
- packssdw m0, [dqcq+sizeq*4+mmsize]
- mova m2, [uqcq+sizeq*4]
- packssdw m2, [uqcq+sizeq*4+mmsize]
-
- mova m1, [dqcq+sizeq*4+mmsize*2]
- packssdw m1, [dqcq+sizeq*4+mmsize*3]
- mova m3, [uqcq+sizeq*4+mmsize*2]
- packssdw m3, [uqcq+sizeq*4+mmsize*3]
-
- add sizeq, mmsize
-
- ; individual errors are max. 15bit+sign, so squares are 30bit, and
- ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
-
- psubw m0, m2
- pmaddwd m2, m2
- pmaddwd m0, m0
-
- psubw m1, m3
- pmaddwd m3, m3
- pmaddwd m1, m1
-
- ; accumulate in 64bit
- punpckldq m7, m0, m5
- punpckhdq m0, m5
- paddq m4, m7
-
- punpckldq m7, m2, m5
- punpckhdq m2, m5
- paddq m6, m7
-
- punpckldq m7, m1, m5
- punpckhdq m1, m5
- paddq m4, m7
-
- punpckldq m7, m3, m5
- punpckhdq m3, m5
- paddq m6, m7
-
- paddq m4, m0
- paddq m4, m1
- paddq m6, m2
- paddq m6, m3
-
- jnz .loop
-
- ; accumulate horizontally and store in return value
- movhlps m5, m4
- movhlps m7, m6
- paddq m4, m5
- paddq m6, m7
-
-%if ARCH_X86_64
- movq rax, m4
- movq [sszq], m6
-%else
- mov eax, sszm
- pshufd m5, m4, 0x1
- movq [eax], m6
- movd eax, m4
- movd edx, m5
-%endif
- RET
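
Note: the deleted SSE2 version never speculates; it widens to 64 bits on every iteration. pmaddwd squares and pairwise-sums eight 16-bit differences into four 32-bit lanes, then punpckldq/punpckhdq against a dedicated zero register zero-extend those lanes into qwords for paddq. Zero-extension is safe because the pmaddwd outputs here are sums of squares and therefore non-negative. An intrinsics model of the inner step (a hypothetical helper, not the shipped asm):

    #include <emmintrin.h>

    /* Hypothetical intrinsics model of the deleted SSE2 inner step:
     * square 16-bit differences and fold the 32-bit partial sums into
     * the two 64-bit lanes of acc. */
    static __m128i accumulate_sq_epi64(__m128i acc, __m128i diff16) {
      const __m128i zero = _mm_setzero_si128();           /* pxor m5, m5 */
      const __m128i sq = _mm_madd_epi16(diff16, diff16);  /* pmaddwd */
      const __m128i lo = _mm_unpacklo_epi32(sq, zero);    /* punpckldq */
      const __m128i hi = _mm_unpackhi_epi32(sq, zero);    /* punpckhdq */
      return _mm_add_epi64(_mm_add_epi64(acc, lo), hi);   /* paddq x2 */
    }
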
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index a54e99e2c..e73535543 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -108,10 +108,6 @@ endif
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
-VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
-endif
ifeq ($(ARCH_X86_64),yes)
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm