summaryrefslogtreecommitdiff
path: root/vp8/encoder
diff options
context:
space:
mode:
author Jingning Han <jingning@google.com> 2015-07-06 16:52:24 -0700
committer Jingning Han <jingning@google.com> 2015-07-07 09:57:44 -0700
commit 0ede9f52b796b6d8e02046b24f68a3db8b9f5920 (patch)
tree a5d4280fa9a825f47c64bd65b2cefa5823db7c1f /vp8/encoder
parent 9cb3a13426d21a9bcfbd4d71e045fa0199ab1bbd (diff)
downloadlibvpx-0ede9f52b796b6d8e02046b24f68a3db8b9f5920.tar
libvpx-0ede9f52b796b6d8e02046b24f68a3db8b9f5920.tar.gz
libvpx-0ede9f52b796b6d8e02046b24f68a3db8b9f5920.tar.bz2
libvpx-0ede9f52b796b6d8e02046b24f68a3db8b9f5920.zip
Unify subtract function used in VP8/9
This commit replaces the vp8_ prefixed subtract functions with the common vpx_subtract_block function. It removes the redundant SIMD optimization code and unit tests. Change-Id: I42e086c32c93c6125e452dcaa6ed04337fe028d9
Diffstat (limited to 'vp8/encoder')
-rw-r--r-- vp8/encoder/arm/neon/subtract_neon.c 154
-rw-r--r-- vp8/encoder/encodemb.c 84
-rw-r--r-- vp8/encoder/encodemb.h 7
-rw-r--r-- vp8/encoder/x86/subtract_mmx.asm 223
-rw-r--r-- vp8/encoder/x86/subtract_sse2.asm 245
-rw-r--r-- vp8/encoder/x86/vp8_enc_stubs_mmx.c 11
-rw-r--r-- vp8/encoder/x86/vp8_enc_stubs_sse2.c 11
7 files changed, 24 insertions, 711 deletions
diff --git a/vp8/encoder/arm/neon/subtract_neon.c b/vp8/encoder/arm/neon/subtract_neon.c
deleted file mode 100644
index d3ab7b165..000000000
--- a/vp8/encoder/arm/neon/subtract_neon.c
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include "vp8/encoder/block.h"
-
-void vp8_subtract_b_neon(
- BLOCK *be,
- BLOCKD *bd,
- int pitch) {
- unsigned char *src_ptr, *predictor;
- int src_stride;
- int16_t *src_diff;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- uint16x8_t q10u16, q11u16, q12u16, q13u16;
-
- src_ptr = *be->base_src + be->src;
- src_stride = be->src_stride;
- predictor = bd->predictor;
-
- d0u8 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d2u8 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d4u8 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d6u8 = vld1_u8(src_ptr);
-
- d1u8 = vld1_u8(predictor);
- predictor += pitch;
- d3u8 = vld1_u8(predictor);
- predictor += pitch;
- d5u8 = vld1_u8(predictor);
- predictor += pitch;
- d7u8 = vld1_u8(predictor);
-
- q10u16 = vsubl_u8(d0u8, d1u8);
- q11u16 = vsubl_u8(d2u8, d3u8);
- q12u16 = vsubl_u8(d4u8, d5u8);
- q13u16 = vsubl_u8(d6u8, d7u8);
-
- src_diff = be->src_diff;
- vst1_u16((uint16_t *)src_diff, vget_low_u16(q10u16));
- src_diff += pitch;
- vst1_u16((uint16_t *)src_diff, vget_low_u16(q11u16));
- src_diff += pitch;
- vst1_u16((uint16_t *)src_diff, vget_low_u16(q12u16));
- src_diff += pitch;
- vst1_u16((uint16_t *)src_diff, vget_low_u16(q13u16));
- return;
-}
-
-void vp8_subtract_mby_neon(
- int16_t *diff,
- unsigned char *src,
- int src_stride,
- unsigned char *pred,
- int pred_stride) {
- int i;
- uint8x16_t q0u8, q1u8, q2u8, q3u8;
- uint16x8_t q8u16, q9u16, q10u16, q11u16;
-
- for (i = 0; i < 8; i++) { // subtract_mby_loop
- q0u8 = vld1q_u8(src);
- src += src_stride;
- q2u8 = vld1q_u8(src);
- src += src_stride;
- q1u8 = vld1q_u8(pred);
- pred += pred_stride;
- q3u8 = vld1q_u8(pred);
- pred += pred_stride;
-
- q8u16 = vsubl_u8(vget_low_u8(q0u8), vget_low_u8(q1u8));
- q9u16 = vsubl_u8(vget_high_u8(q0u8), vget_high_u8(q1u8));
- q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q3u8));
- q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q3u8));
-
- vst1q_u16((uint16_t *)diff, q8u16);
- diff += 8;
- vst1q_u16((uint16_t *)diff, q9u16);
- diff += 8;
- vst1q_u16((uint16_t *)diff, q10u16);
- diff += 8;
- vst1q_u16((uint16_t *)diff, q11u16);
- diff += 8;
- }
- return;
-}
-
-void vp8_subtract_mbuv_neon(
- int16_t *diff,
- unsigned char *usrc,
- unsigned char *vsrc,
- int src_stride,
- unsigned char *upred,
- unsigned char *vpred,
- int pred_stride) {
- int i, j;
- unsigned char *src_ptr, *pred_ptr;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
- uint16x8_t q8u16, q9u16, q10u16, q11u16;
-
- diff += 256;
- for (i = 0; i < 2; i++) {
- if (i == 0) {
- src_ptr = usrc;
- pred_ptr = upred;
- } else if (i == 1) {
- src_ptr = vsrc;
- pred_ptr = vpred;
- }
-
- for (j = 0; j < 2; j++) {
- d0u8 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d1u8 = vld1_u8(pred_ptr);
- pred_ptr += pred_stride;
- d2u8 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d3u8 = vld1_u8(pred_ptr);
- pred_ptr += pred_stride;
- d4u8 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d5u8 = vld1_u8(pred_ptr);
- pred_ptr += pred_stride;
- d6u8 = vld1_u8(src_ptr);
- src_ptr += src_stride;
- d7u8 = vld1_u8(pred_ptr);
- pred_ptr += pred_stride;
-
- q8u16 = vsubl_u8(d0u8, d1u8);
- q9u16 = vsubl_u8(d2u8, d3u8);
- q10u16 = vsubl_u8(d4u8, d5u8);
- q11u16 = vsubl_u8(d6u8, d7u8);
-
- vst1q_u16((uint16_t *)diff, q8u16);
- diff += 8;
- vst1q_u16((uint16_t *)diff, q9u16);
- diff += 8;
- vst1q_u16((uint16_t *)diff, q10u16);
- diff += 8;
- vst1q_u16((uint16_t *)diff, q11u16);
- diff += 8;
- }
- }
- return;
-}
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index 820b1376b..cf180c12d 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -8,6 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include "./vpx_dsp_rtcd.h"
#include "vpx_config.h"
#include "vp8_rtcd.h"
@@ -19,80 +20,29 @@
#include "vpx_mem/vpx_mem.h"
#include "rdopt.h"
-// TODO(jingning,johannkoenig): use vpx_subtract_block to replace
-// codec specified vp9_subtract_ functions.
-void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch)
-{
- unsigned char *src_ptr = (*(be->base_src) + be->src);
- short *diff_ptr = be->src_diff;
- unsigned char *pred_ptr = bd->predictor;
- int src_stride = be->src_stride;
-
- int r, c;
+void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch) {
+ unsigned char *src_ptr = (*(be->base_src) + be->src);
+ short *diff_ptr = be->src_diff;
+ unsigned char *pred_ptr = bd->predictor;
+ int src_stride = be->src_stride;
- for (r = 0; r < 4; r++)
- {
- for (c = 0; c < 4; c++)
- {
- diff_ptr[c] = src_ptr[c] - pred_ptr[c];
- }
-
- diff_ptr += pitch;
- pred_ptr += pitch;
- src_ptr += src_stride;
- }
+ vpx_subtract_block(4, 4, diff_ptr, pitch, src_ptr, src_stride,
+ pred_ptr, pitch);
}
-void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc,
+void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
int src_stride, unsigned char *upred,
- unsigned char *vpred, int pred_stride)
-{
- short *udiff = diff + 256;
- short *vdiff = diff + 320;
-
- int r, c;
+ unsigned char *vpred, int pred_stride) {
+ short *udiff = diff + 256;
+ short *vdiff = diff + 320;
- for (r = 0; r < 8; r++)
- {
- for (c = 0; c < 8; c++)
- {
- udiff[c] = usrc[c] - upred[c];
- }
-
- udiff += 8;
- upred += pred_stride;
- usrc += src_stride;
- }
-
- for (r = 0; r < 8; r++)
- {
- for (c = 0; c < 8; c++)
- {
- vdiff[c] = vsrc[c] - vpred[c];
- }
-
- vdiff += 8;
- vpred += pred_stride;
- vsrc += src_stride;
- }
+ vpx_subtract_block(8, 8, udiff, 8, usrc, src_stride, upred, pred_stride);
+ vpx_subtract_block(8, 8, vdiff, 8, vsrc, src_stride, vpred, pred_stride);
}
-void vp8_subtract_mby_c(short *diff, unsigned char *src, int src_stride,
- unsigned char *pred, int pred_stride)
-{
- int r, c;
-
- for (r = 0; r < 16; r++)
- {
- for (c = 0; c < 16; c++)
- {
- diff[c] = src[c] - pred[c];
- }
-
- diff += 16;
- pred += pred_stride;
- src += src_stride;
- }
+void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride,
+ unsigned char *pred, int pred_stride) {
+ vpx_subtract_block(16, 16, diff, 16, src, src_stride, pred, pred_stride);
}
static void vp8_subtract_mb(MACROBLOCK *x)
diff --git a/vp8/encoder/encodemb.h b/vp8/encoder/encodemb.h
index 0b3ec875e..10b3d8651 100644
--- a/vp8/encoder/encodemb.h
+++ b/vp8/encoder/encodemb.h
@@ -19,6 +19,13 @@ extern "C" {
#endif
void vp8_encode_inter16x16(MACROBLOCK *x);
+void vp8_subtract_b(BLOCK *be, BLOCKD *bd, int pitch);
+void vp8_subtract_mbuv(short *diff, unsigned char *usrc, unsigned char *vsrc,
+ int src_stride, unsigned char *upred,
+ unsigned char *vpred, int pred_stride);
+void vp8_subtract_mby(short *diff, unsigned char *src, int src_stride,
+ unsigned char *pred, int pred_stride);
+
void vp8_build_dcblock(MACROBLOCK *b);
void vp8_transform_mb(MACROBLOCK *mb);
void vp8_transform_mbuv(MACROBLOCK *x);
diff --git a/vp8/encoder/x86/subtract_mmx.asm b/vp8/encoder/x86/subtract_mmx.asm
deleted file mode 100644
index 794dd22f7..000000000
--- a/vp8/encoder/x86/subtract_mmx.asm
+++ /dev/null
@@ -1,223 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
-; short *diff, unsigned char *Predictor,
-; int pitch);
-global sym(vp8_subtract_b_mmx_impl) PRIVATE
-sym(vp8_subtract_b_mmx_impl):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
-
- mov rdi, arg(2) ;diff
- mov rax, arg(3) ;Predictor
- mov rsi, arg(0) ;z
- movsxd rdx, dword ptr arg(1);src_stride;
- movsxd rcx, dword ptr arg(4);pitch
- pxor mm7, mm7
-
- movd mm0, [rsi]
- movd mm1, [rax]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq [rdi], mm0
-
-
- movd mm0, [rsi+rdx]
- movd mm1, [rax+rcx]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq [rdi+rcx*2],mm0
-
-
- movd mm0, [rsi+rdx*2]
- movd mm1, [rax+rcx*2]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq [rdi+rcx*4], mm0
-
- lea rsi, [rsi+rdx*2]
- lea rcx, [rcx+rcx*2]
-
-
-
- movd mm0, [rsi+rdx]
- movd mm1, [rax+rcx]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq [rdi+rcx*2], mm0
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride,
-;unsigned char *pred, int pred_stride)
-global sym(vp8_subtract_mby_mmx) PRIVATE
-sym(vp8_subtract_mby_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- mov rdi, arg(0) ;diff
- mov rsi, arg(1) ;src
- movsxd rdx, dword ptr arg(2);src_stride
- mov rax, arg(3) ;pred
- push rbx
- movsxd rbx, dword ptr arg(4);pred_stride
-
- pxor mm0, mm0
- mov rcx, 16
-
-
-.submby_loop:
- movq mm1, [rsi]
- movq mm3, [rax]
-
- movq mm2, mm1
- movq mm4, mm3
-
- punpcklbw mm1, mm0
- punpcklbw mm3, mm0
-
- punpckhbw mm2, mm0
- punpckhbw mm4, mm0
-
- psubw mm1, mm3
- psubw mm2, mm4
-
- movq [rdi], mm1
- movq [rdi+8], mm2
-
- movq mm1, [rsi+8]
- movq mm3, [rax+8]
-
- movq mm2, mm1
- movq mm4, mm3
-
- punpcklbw mm1, mm0
- punpcklbw mm3, mm0
-
- punpckhbw mm2, mm0
- punpckhbw mm4, mm0
-
- psubw mm1, mm3
- psubw mm2, mm4
-
- movq [rdi+16], mm1
- movq [rdi+24], mm2
- add rdi, 32
- lea rax, [rax+rbx]
- lea rsi, [rsi+rdx]
- dec rcx
- jnz .submby_loop
-
- pop rbx
- pop rdi
- pop rsi
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc,
-; int src_stride, unsigned char *upred,
-; unsigned char *vpred, int pred_stride)
-
-global sym(vp8_subtract_mbuv_mmx) PRIVATE
-sym(vp8_subtract_mbuv_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- push rsi
- push rdi
- ; end prolog
-
- mov rdi, arg(0) ;diff
- mov rsi, arg(1) ;usrc
- movsxd rdx, dword ptr arg(3);src_stride;
- mov rax, arg(4) ;upred
- add rdi, 256*2 ;diff = diff + 256 (shorts)
- mov rcx, 8
- push rbx
- movsxd rbx, dword ptr arg(6);pred_stride
-
- pxor mm7, mm7
-
-.submbu_loop:
- movq mm0, [rsi]
- movq mm1, [rax]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi], mm0
- movq [rdi+8], mm3
- add rdi, 16
- add rsi, rdx
- add rax, rbx
-
- dec rcx
- jnz .submbu_loop
-
- mov rsi, arg(2) ;vsrc
- mov rax, arg(5) ;vpred
- mov rcx, 8
-
-.submbv_loop:
- movq mm0, [rsi]
- movq mm1, [rax]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi], mm0
- movq [rdi+8], mm3
- add rdi, 16
- add rsi, rdx
- add rax, rbx
-
- dec rcx
- jnz .submbv_loop
-
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm
deleted file mode 100644
index a5d17f5be..000000000
--- a/vp8/encoder/x86/subtract_sse2.asm
+++ /dev/null
@@ -1,245 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
-; short *diff, unsigned char *Predictor,
-; int pitch);
-global sym(vp8_subtract_b_sse2_impl) PRIVATE
-sym(vp8_subtract_b_sse2_impl):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdi, arg(2) ;diff
- mov rax, arg(3) ;Predictor
- mov rsi, arg(0) ;z
- movsxd rdx, dword ptr arg(1);src_stride;
- movsxd rcx, dword ptr arg(4);pitch
- pxor mm7, mm7
-
- movd mm0, [rsi]
- movd mm1, [rax]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq MMWORD PTR [rdi], mm0
-
- movd mm0, [rsi+rdx]
- movd mm1, [rax+rcx]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq MMWORD PTR [rdi+rcx*2], mm0
-
- movd mm0, [rsi+rdx*2]
- movd mm1, [rax+rcx*2]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq MMWORD PTR [rdi+rcx*4], mm0
-
- lea rsi, [rsi+rdx*2]
- lea rcx, [rcx+rcx*2]
-
- movd mm0, [rsi+rdx]
- movd mm1, [rax+rcx]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq MMWORD PTR [rdi+rcx*2], mm0
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride,
-;unsigned char *pred, int pred_stride)
-global sym(vp8_subtract_mby_sse2) PRIVATE
-sym(vp8_subtract_mby_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdi, arg(0) ;diff
- mov rsi, arg(1) ;src
- movsxd rdx, dword ptr arg(2);src_stride
- mov rax, arg(3) ;pred
- movdqa xmm4, [GLOBAL(t80)]
- push rbx
- mov rcx, 8 ; do two lines at one time
- movsxd rbx, dword ptr arg(4);pred_stride
-
-.submby_loop:
- movdqa xmm0, [rsi] ; src
- movdqa xmm1, [rax] ; pred
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1
-
- pxor xmm1, xmm4 ;convert to signed values
- pxor xmm2, xmm4
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm1 ; put sign back to subtraction
-
- movdqa xmm3, [rsi + rdx]
- movdqa xmm5, [rax + rbx]
-
- lea rsi, [rsi+rdx*2]
- lea rax, [rax+rbx*2]
-
- movdqa [rdi], xmm0
- movdqa [rdi +16], xmm2
-
- movdqa xmm1, xmm3
- psubb xmm3, xmm5
-
- pxor xmm5, xmm4 ;convert to signed values
- pxor xmm1, xmm4
- pcmpgtb xmm5, xmm1 ; obtain sign information
-
- movdqa xmm1, xmm3
- punpcklbw xmm3, xmm5 ; put sign back to subtraction
- punpckhbw xmm1, xmm5 ; put sign back to subtraction
-
- movdqa [rdi +32], xmm3
- movdqa [rdi +48], xmm1
-
- add rdi, 64
- dec rcx
- jnz .submby_loop
-
- pop rbx
- pop rdi
- pop rsi
- ; begin epilog
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc,
-; int src_stride, unsigned char *upred,
-; unsigned char *vpred, int pred_stride)
-global sym(vp8_subtract_mbuv_sse2) PRIVATE
-sym(vp8_subtract_mbuv_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movdqa xmm4, [GLOBAL(t80)]
- mov rdi, arg(0) ;diff
- mov rsi, arg(1) ;usrc
- movsxd rdx, dword ptr arg(3);src_stride;
- mov rax, arg(4) ;upred
- add rdi, 256*2 ;diff = diff + 256 (shorts)
- mov rcx, 4
- push rbx
- movsxd rbx, dword ptr arg(6);pred_stride
-
- ;u
-.submbu_loop:
- movq xmm0, [rsi] ; src
- movq xmm2, [rsi+rdx] ; src -- next line
- movq xmm1, [rax] ; pred
- movq xmm3, [rax+rbx] ; pred -- next line
- lea rsi, [rsi + rdx*2]
- lea rax, [rax + rbx*2]
-
- punpcklqdq xmm0, xmm2
- punpcklqdq xmm1, xmm3
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, xmm4 ;convert to signed values
- pxor xmm2, xmm4
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa [rdi], xmm0 ; store difference
- movdqa [rdi +16], xmm2 ; store difference
- add rdi, 32
- sub rcx, 1
- jnz .submbu_loop
-
- mov rsi, arg(2) ;vsrc
- mov rax, arg(5) ;vpred
- mov rcx, 4
-
- ;v
-.submbv_loop:
- movq xmm0, [rsi] ; src
- movq xmm2, [rsi+rdx] ; src -- next line
- movq xmm1, [rax] ; pred
- movq xmm3, [rax+rbx] ; pred -- next line
- lea rsi, [rsi + rdx*2]
- lea rax, [rax + rbx*2]
-
- punpcklqdq xmm0, xmm2
- punpcklqdq xmm1, xmm3
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, xmm4 ;convert to signed values
- pxor xmm2, xmm4
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa [rdi], xmm0 ; store difference
- movdqa [rdi +16], xmm2 ; store difference
- add rdi, 32
- sub rcx, 1
- jnz .submbv_loop
-
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-t80:
- times 16 db 0x80
diff --git a/vp8/encoder/x86/vp8_enc_stubs_mmx.c b/vp8/encoder/x86/vp8_enc_stubs_mmx.c
index cf3d8ca4a..7bf5155c9 100644
--- a/vp8/encoder/x86/vp8_enc_stubs_mmx.c
+++ b/vp8/encoder/x86/vp8_enc_stubs_mmx.c
@@ -65,14 +65,3 @@ int vp8_mbuverror_mmx(MACROBLOCK *mb)
return vp8_mbuverror_mmx_impl(s_ptr, d_ptr);
}
-void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
- short *diff, unsigned char *predictor,
- int pitch);
-void vp8_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch)
-{
- unsigned char *z = *(be->base_src) + be->src;
- unsigned int src_stride = be->src_stride;
- short *diff = &be->src_diff[0];
- unsigned char *predictor = &bd->predictor[0];
- vp8_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
-}
diff --git a/vp8/encoder/x86/vp8_enc_stubs_sse2.c b/vp8/encoder/x86/vp8_enc_stubs_sse2.c
index 3dfbee368..be9aaf3c9 100644
--- a/vp8/encoder/x86/vp8_enc_stubs_sse2.c
+++ b/vp8/encoder/x86/vp8_enc_stubs_sse2.c
@@ -30,14 +30,3 @@ int vp8_mbuverror_xmm(MACROBLOCK *mb)
return vp8_mbuverror_xmm_impl(s_ptr, d_ptr);
}
-void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
- short *diff, unsigned char *predictor,
- int pitch);
-void vp8_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch)
-{
- unsigned char *z = *(be->base_src) + be->src;
- unsigned int src_stride = be->src_stride;
- short *diff = &be->src_diff[0];
- unsigned char *predictor = &bd->predictor[0];
- vp8_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
-}