Diffstat (limited to 'vp9')
-rw-r--r--  vp9/common/vp9_rtcd_defs.pl                |   3
-rw-r--r--  vp9/encoder/vp9_rdopt.c                    |  19
-rw-r--r--  vp9/encoder/x86/vp9_highbd_error_avx.asm   | 261
-rw-r--r--  vp9/encoder/x86/vp9_highbd_error_sse2.asm  |  98
-rw-r--r--  vp9/vp9cx.mk                               |   4
5 files changed, 1 insertion(+), 384 deletions(-)
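For orientation before the diff itself: every kernel removed here computes the same quantity, which the surviving vp9_block_error path now covers for bd == 8. Below is a minimal C sketch of that computation; the function name is illustrative, but the semantics follow the removed vp9_highbd_block_error_8bit_c wrapper, which simply forwarded to vp9_block_error_c.

#include <stddef.h>
#include <stdint.h>

typedef int32_t tran_low_t; /* 32-bit coefficients in the high-bitdepth build */

/* Returns the sum of squared differences between the original and the
 * dequantized coefficients; the sum of squared originals ("ssz") is
 * written through the out-parameter. */
static int64_t block_error_sketch(const tran_low_t *coeff,
                                  const tran_low_t *dqcoeff,
                                  intptr_t block_size, int64_t *ssz) {
  int64_t error = 0, sqcoeff = 0;
  intptr_t i;
  for (i = 0; i < block_size; ++i) {
    const int64_t diff = coeff[i] - dqcoeff[i];
    error += diff * diff;
    sqcoeff += (int64_t)coeff[i] * coeff[i];
  }
  *ssz = sqcoeff;
  return error;
}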
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 57af79d5b..77bebc7b9 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -130,9 +130,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/int64_t vp9_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
   specialize qw/vp9_highbd_block_error sse2/;
 
-  add_proto qw/int64_t vp9_highbd_block_error_8bit/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
-  specialize qw/vp9_highbd_block_error_8bit sse2 avx/;
-
   add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
   specialize qw/vp9_block_error_fp sse2/;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 0500e6025..1b82b29d4 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -284,22 +284,12 @@ int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
   return error;
 }
 
-int64_t vp9_highbd_block_error_8bit_c(const tran_low_t *coeff,
-                                      const tran_low_t *dqcoeff,
-                                      intptr_t block_size, int64_t *ssz) {
-  // Note that the C versions of these 2 functions (vp9_block_error and
-  // vp9_highbd_block_error_8bit are the same, but the optimized assembly
-  // routines are not compatible in the non high bitdepth configuration, so
-  // they still cannot share the same name.
-  return vp9_block_error_c(coeff, dqcoeff, block_size, ssz);
-}
-
 static int64_t vp9_highbd_block_error_dispatch(const tran_low_t *coeff,
                                                const tran_low_t *dqcoeff,
                                                intptr_t block_size,
                                                int64_t *ssz, int bd) {
   if (bd == 8) {
-    return vp9_highbd_block_error_8bit(coeff, dqcoeff, block_size, ssz);
+    return vp9_block_error(coeff, dqcoeff, block_size, ssz);
   } else {
     return vp9_highbd_block_error(coeff, dqcoeff, block_size, ssz, bd);
   }
@@ -1130,16 +1120,9 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int row,
           ratey += cost_coeffs(x, 0, block, TX_4X4, coeff_ctx, so->scan,
                                so->neighbors, cpi->sf.use_fast_coef_costing);
           tempa[idx] = templ[idy] = (x->plane[0].eobs[block] > 0) ? 1 : 0;
-#if CONFIG_VP9_HIGHBITDEPTH
-          distortion +=
-              vp9_highbd_block_error_8bit(
-                  coeff, BLOCK_OFFSET(pd->dqcoeff, block), 16, &unused) >>
-              2;
-#else
           distortion += vp9_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
                                         16, &unused) >>
                         2;
-#endif
           if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
             goto next;
           vp9_iht4x4_add(tx_type, BLOCK_OFFSET(pd->dqcoeff, block), dst,
diff --git a/vp9/encoder/x86/vp9_highbd_error_avx.asm b/vp9/encoder/x86/vp9_highbd_error_avx.asm
deleted file mode 100644
index e476323e1..000000000
--- a/vp9/encoder/x86/vp9_highbd_error_avx.asm
+++ /dev/null
@@ -1,261 +0,0 @@
-;
-; Copyright (c) 2015 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-ALIGN 16
-
-;
-; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
-;                                     intptr_t block_size, int64_t *ssz)
-;
-
-INIT_XMM avx
-cglobal highbd_block_error_8bit, 4, 5, 8, uqc, dqc, size, ssz
-  vzeroupper
-
-  ; If only one iteration is required, then handle this as a special case.
-  ; It is the most frequent case, so we can have a significant gain here
-  ; by not setting up a loop and accumulators.
-  cmp sizeq, 16
-  jne .generic
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Common case of size == 16
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-  ; Load input vectors
-  mova xm0, [dqcq]
-  packssdw xm0, [dqcq+16]
-  mova xm2, [uqcq]
-  packssdw xm2, [uqcq+16]
-
-  mova xm1, [dqcq+32]
-  packssdw xm1, [dqcq+48]
-  mova xm3, [uqcq+32]
-  packssdw xm3, [uqcq+48]
-
-  ; Compute the errors.
-  psubw xm0, xm2
-  psubw xm1, xm3
-
-  ; Individual errors are max 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
-  pmaddwd xm2, xm2
-  pmaddwd xm3, xm3
-
-  pmaddwd xm0, xm0
-  pmaddwd xm1, xm1
-
-  ; Squares are always positive, so we can use unsigned arithmetic after
-  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
-  ; fit in 32bits
-  paddd xm2, xm3
-  paddd xm0, xm1
-
-  ; Accumulate horizontally in 64 bits, there is no chance of overflow here
-  pxor xm5, xm5
-
-  pblendw xm3, xm5, xm2, 0x33 ; Zero extended low of a pair of 32 bits
-  psrlq xm2, 32               ; Zero extended high of a pair of 32 bits
-
-  pblendw xm1, xm5, xm0, 0x33 ; Zero extended low of a pair of 32 bits
-  psrlq xm0, 32               ; Zero extended high of a pair of 32 bits
-
-  paddq xm2, xm3
-  paddq xm0, xm1
-
-  psrldq xm3, xm2, 8
-  psrldq xm1, xm0, 8
-
-  paddq xm2, xm3
-  paddq xm0, xm1
-
-  ; Store the return value
-%if ARCH_X86_64
-  movq rax, xm0
-  movq [sszq], xm2
-%else
-  movd eax, xm0
-  pextrd edx, xm0, 1
-  movq [sszd], xm2
-%endif
-  RET
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Generic case of size != 16, speculative low precision
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ALIGN 16
-.generic:
-  pxor xm4, xm4 ; sse accumulator
-  pxor xm5, xm5 ; overflow detection register for xm4
-  pxor xm6, xm6 ; ssz accumulator
-  pxor xm7, xm7 ; overflow detection register for xm6
-  lea uqcq, [uqcq+sizeq*4]
-  lea dqcq, [dqcq+sizeq*4]
-  neg sizeq
-
-  ; Push the negative size as the high precision code might need it
-  push sizeq
-
-.loop:
-  ; Load input vectors
-  mova xm0, [dqcq+sizeq*4]
-  packssdw xm0, [dqcq+sizeq*4+16]
-  mova xm2, [uqcq+sizeq*4]
-  packssdw xm2, [uqcq+sizeq*4+16]
-
-  mova xm1, [dqcq+sizeq*4+32]
-  packssdw xm1, [dqcq+sizeq*4+48]
-  mova xm3, [uqcq+sizeq*4+32]
-  packssdw xm3, [uqcq+sizeq*4+48]
-
-  add sizeq, 16
-
-  ; Compute the squared errors.
-  ; Individual errors are max 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit).
-  psubw xm0, xm2
-  pmaddwd xm2, xm2
-  pmaddwd xm0, xm0
-
-  psubw xm1, xm3
-  pmaddwd xm3, xm3
-  pmaddwd xm1, xm1
-
-  ; Squares are always positive, so we can use unsigned arithmetic after
-  ; squaring. As mentioned earlier 2 sums fit in 31 bits, so 4 sums will
-  ; fit in 32bits
-  paddd xm2, xm3
-  paddd xm0, xm1
-
-  ; We accumulate using 32 bit arithmetic, but detect potential overflow
-  ; by checking if the MSB of the accumulators have ever been a set bit.
-  ; If yes, we redo the whole compute at the end on higher precision, but
-  ; this happens extremely rarely, so we still achieve a net gain.
-  paddd xm4, xm0
-  paddd xm6, xm2
-  por xm5, xm4 ; OR in the accumulator for overflow detection
-  por xm7, xm6 ; OR in the accumulator for overflow detection
-
-  jnz .loop
-
-  ; Add pairs horizontally (still only on 32 bits)
-  phaddd xm4, xm4
-  por xm5, xm4 ; OR in the accumulator for overflow detection
-  phaddd xm6, xm6
-  por xm7, xm6 ; OR in the accumulator for overflow detection
-
-  ; Check for possibility of overflow by testing if bit 32 of each dword lane
-  ; have ever been set. If they were not, then there was no overflow and the
-  ; final sum will fit in 32 bits. If overflow happened, then
-  ; we redo the whole computation on higher precision.
-  por xm7, xm5
-  pmovmskb r4, xm7
-  test r4, 0x8888
-  jnz .highprec
-
-  phaddd xm4, xm4
-  phaddd xm6, xm6
-  pmovzxdq xm4, xm4
-  pmovzxdq xm6, xm6
-
-  ; Restore stack
-  pop sizeq
-
-  ; Store the return value
-%if ARCH_X86_64
-  movq rax, xm4
-  movq [sszq], xm6
-%else
-  movd eax, xm4
-  pextrd edx, xm4, 1
-  movq [sszd], xm6
-%endif
-  RET
-
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-  ;; Generic case of size != 16, high precision case
-  ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-.highprec:
-  pxor xm4, xm4 ; sse accumulator
-  pxor xm5, xm5 ; dedicated zero register
-  pxor xm6, xm6 ; ssz accumulator
-  pop sizeq
-
-.loophp:
-  mova xm0, [dqcq+sizeq*4]
-  packssdw xm0, [dqcq+sizeq*4+16]
-  mova xm2, [uqcq+sizeq*4]
-  packssdw xm2, [uqcq+sizeq*4+16]
-
-  mova xm1, [dqcq+sizeq*4+32]
-  packssdw xm1, [dqcq+sizeq*4+48]
-  mova xm3, [uqcq+sizeq*4+32]
-  packssdw xm3, [uqcq+sizeq*4+48]
-
-  add sizeq, 16
-
-  ; individual errors are max. 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
-
-  psubw xm0, xm2
-  pmaddwd xm2, xm2
-  pmaddwd xm0, xm0
-
-  psubw xm1, xm3
-  pmaddwd xm3, xm3
-  pmaddwd xm1, xm1
-
-  ; accumulate in 64bit
-  punpckldq xm7, xm0, xm5
-  punpckhdq xm0, xm5
-  paddq xm4, xm7
-
-  punpckldq xm7, xm2, xm5
-  punpckhdq xm2, xm5
-  paddq xm6, xm7
-
-  punpckldq xm7, xm1, xm5
-  punpckhdq xm1, xm5
-  paddq xm4, xm7
-
-  punpckldq xm7, xm3, xm5
-  punpckhdq xm3, xm5
-  paddq xm6, xm7
-
-  paddq xm4, xm0
-  paddq xm4, xm1
-  paddq xm6, xm2
-  paddq xm6, xm3
-
-  jnz .loophp
-
-  ; Accumulate horizontally
-  movhlps xm5, xm4
-  movhlps xm7, xm6
-  paddq xm4, xm5
-  paddq xm6, xm7
-
-  ; Store the return value
-%if ARCH_X86_64
-  movq rax, xm4
-  movq [sszq], xm6
-%else
-  movd eax, xm4
-  pextrd edx, xm4, 1
-  movq [sszd], xm6
-%endif
-  RET
-
-END
diff --git a/vp9/encoder/x86/vp9_highbd_error_sse2.asm b/vp9/encoder/x86/vp9_highbd_error_sse2.asm
deleted file mode 100644
index f3b8f0194..000000000
--- a/vp9/encoder/x86/vp9_highbd_error_sse2.asm
+++ /dev/null
@@ -1,98 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%define private_prefix vp9
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION .text
-ALIGN 16
-
-;
-; int64_t vp9_highbd_block_error_8bit(int32_t *coeff, int32_t *dqcoeff,
-;                                     intptr_t block_size, int64_t *ssz)
-;
-
-INIT_XMM sse2
-cglobal highbd_block_error_8bit, 3, 3, 8, uqc, dqc, size, ssz
-  pxor m4, m4 ; sse accumulator
-  pxor m6, m6 ; ssz accumulator
-  pxor m5, m5 ; dedicated zero register
-  lea uqcq, [uqcq+sizeq*4]
-  lea dqcq, [dqcq+sizeq*4]
-  neg sizeq
-
-  ALIGN 16
-
-.loop:
-  mova m0, [dqcq+sizeq*4]
-  packssdw m0, [dqcq+sizeq*4+mmsize]
-  mova m2, [uqcq+sizeq*4]
-  packssdw m2, [uqcq+sizeq*4+mmsize]
-
-  mova m1, [dqcq+sizeq*4+mmsize*2]
-  packssdw m1, [dqcq+sizeq*4+mmsize*3]
-  mova m3, [uqcq+sizeq*4+mmsize*2]
-  packssdw m3, [uqcq+sizeq*4+mmsize*3]
-
-  add sizeq, mmsize
-
-  ; individual errors are max. 15bit+sign, so squares are 30bit, and
-  ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
-
-  psubw m0, m2
-  pmaddwd m2, m2
-  pmaddwd m0, m0
-
-  psubw m1, m3
-  pmaddwd m3, m3
-  pmaddwd m1, m1
-
-  ; accumulate in 64bit
-  punpckldq m7, m0, m5
-  punpckhdq m0, m5
-  paddq m4, m7
-
-  punpckldq m7, m2, m5
-  punpckhdq m2, m5
-  paddq m6, m7
-
-  punpckldq m7, m1, m5
-  punpckhdq m1, m5
-  paddq m4, m7
-
-  punpckldq m7, m3, m5
-  punpckhdq m3, m5
-  paddq m6, m7
-
-  paddq m4, m0
-  paddq m4, m1
-  paddq m6, m2
-  paddq m6, m3
-
-  jnz .loop
-
-  ; accumulate horizontally and store in return value
-  movhlps m5, m4
-  movhlps m7, m6
-  paddq m4, m5
-  paddq m6, m7
-
-%if ARCH_X86_64
-  movq rax, m4
-  movq [sszq], m6
-%else
-  mov eax, sszm
-  pshufd m5, m4, 0x1
-  movq [eax], m6
-  movd eax, m4
-  movd edx, m5
-%endif
-  RET
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index a54e99e2c..e73535543 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -108,10 +108,6 @@ endif
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.asm
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
-ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
-VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_error_sse2.asm
-VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_highbd_error_avx.asm
-endif
 ifeq ($(ARCH_X86_64),yes)
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_quantize_ssse3_x86_64.asm
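A note on the deleted AVX path's central trick, since it disappears with this commit: the generic loop accumulates in 32 bits and ORs every partial sum into a sticky register, redoing the whole sum in 64 bits only if bit 31 was ever set. A scalar C sketch of the same idea (names illustrative, not part of the libvpx API; assumes each addend fits in 31 bits, as the asm comments argue for the paired squares):

#include <stddef.h>
#include <stdint.h>

/* Accumulate squared differences in 32 bits, tracking in "sticky" whether
 * the MSB of the running sum was ever set. Since each addend is below 2^31,
 * the sum cannot wrap without the MSB being observed first, so a clear
 * sticky MSB proves the cheap result is exact. Returns 1 if *out is
 * trustworthy, 0 if the caller must redo the sum in 64 bits. */
static int sum_squares_lowprec(const int16_t *diff, size_t n, uint32_t *out) {
  uint32_t acc = 0, sticky = 0;
  size_t i;
  for (i = 0; i < n; ++i) {
    acc += (uint32_t)((int32_t)diff[i] * diff[i]); /* square fits in 30 bits */
    sticky |= acc; /* remembers whether bit 31 was ever set */
  }
  *out = acc;
  return (sticky & 0x80000000u) == 0;
}

In the deleted asm, pmovmskb plus "test r4, 0x8888" performs this per-lane MSB check on the OR-ed accumulators, and .highprec is the 64-bit fallback loop.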