From 25c588b1e49deb70a06549a8c843c9a3bc19ea1a Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Fri, 21 Jun 2013 09:35:37 -0700 Subject: Add subtract_block SSE2 version and unit test. 3% faster overall (3min35.0 to 3min28.5). Change-Id: I5ff8a5c2c91586b6632ca5009ad1ea51ce94af5e --- test/test.mk | 1 + test/vp9_subtract_test.cc | 97 ++++++ vp9/common/vp9_rtcd_defs.sh | 3 + vp9/encoder/vp9_encodemb.c | 8 +- vp9/encoder/vp9_encodemb.h | 4 - vp9/encoder/x86/vp9_subtract_mmx.asm | 432 --------------------------- vp9/encoder/x86/vp9_subtract_sse2.asm | 464 ++++++++--------------------- vp9/encoder/x86/vp9_x86_csystemdependent.c | 55 ---- vp9/vp9cx.mk | 2 - 9 files changed, 223 insertions(+), 843 deletions(-) create mode 100644 test/vp9_subtract_test.cc delete mode 100644 vp9/encoder/x86/vp9_subtract_mmx.asm delete mode 100644 vp9/encoder/x86/vp9_x86_csystemdependent.c diff --git a/test/test.mk b/test/test.mk index 806901d27..ea6dbd865 100644 --- a/test/test.mk +++ b/test/test.mk @@ -66,6 +66,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += set_roi.cc LIBVPX_TEST_SRCS-yes += sixtap_predict_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += subtract_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += variance_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_DECODER) += vp8_decrypt_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc new file mode 100644 index 000000000..1a62b5731 --- /dev/null +++ b/test/vp9_subtract_test.cc @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "third_party/googletest/src/include/gtest/gtest.h" +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +extern "C" { +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_blockd.h" +} + +typedef void (*subtract_fn_t)(int rows, int cols, + int16_t *diff_ptr, ptrdiff_t diff_stride, + const uint8_t *src_ptr, ptrdiff_t src_stride, + const uint8_t *pred_ptr, ptrdiff_t pred_stride); + +namespace vp9 { + +class VP9SubtractBlockTest : public ::testing::TestWithParam { + public: + virtual void TearDown() { + libvpx_test::ClearSystemState(); + } +}; + +using libvpx_test::ACMRandom; + +TEST_P(VP9SubtractBlockTest, SimpleSubtract) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + + // FIXME(rbultje) split in its own file + for (BLOCK_SIZE_TYPE bsize = BLOCK_SIZE_AB4X4; bsize < BLOCK_SIZE_TYPES; + bsize = static_cast(static_cast(bsize) + 1)) { + const int block_width = 4 << b_width_log2(bsize); + const int block_height = 4 << b_height_log2(bsize); + int16_t *diff = new int16_t[block_width * block_height * 2]; + uint8_t *pred = new uint8_t[block_width * block_height * 2]; + uint8_t *src = new uint8_t[block_width * block_height * 2]; + + for (int n = 0; n < 100; n++) { + for (int r = 0; r < block_height; ++r) { + for (int c = 0; c < block_width * 2; ++c) { + src[r * block_width * 2 + c] = rnd.Rand8(); + pred[r * block_width * 2 + c] = rnd.Rand8(); + } + } + + GetParam()(block_height, block_width, diff, block_width, + src, block_width, pred, block_width); + + for (int r = 0; r < block_height; ++r) { + for (int c = 0; c < block_width; ++c) { + EXPECT_EQ(diff[r * block_width + c], + (src[r * block_width + c] - + pred[r * block_width + c])) << "r = " << r + << ", c = " << c + << ", bs = " << bsize; + } + } + + GetParam()(block_height, block_width, diff, block_width * 2, + src, block_width * 2, pred, block_width * 2); + + for (int r = 0; r < block_height; ++r) { + for (int c = 0; c < block_width; ++c) { + EXPECT_EQ(diff[r * block_width * 2 + c], + (src[r * block_width * 2 + c] - + pred[r * block_width * 2 + c])) << "r = " << r + << ", c = " << c + << ", bs = " << bsize; + } + } + } + delete[] diff; + delete[] pred; + delete[] src; + } +} + +INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest, + ::testing::Values(vp9_subtract_block_c)); + +#if HAVE_SSE2 +INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest, + ::testing::Values(vp9_subtract_block_sse2)); +#endif + +} // namespace vp9 diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 8e24eed0f..8e568f83c 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -533,6 +533,9 @@ prototype int vp9_block_error "int16_t *coeff, int16_t *dqcoeff, int block_size" specialize vp9_block_error mmx sse2 vp9_block_error_sse2=vp9_block_error_xmm +prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride" +specialize vp9_subtract_block sse2 + # # Structured Similarity (SSIM) # diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 4f45496df..2f133ccbc 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -22,10 +22,10 @@ DECLARE_ALIGNED(16, extern const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]); -void vp9_subtract_block(int rows, int cols, - int16_t *diff_ptr, int diff_stride, - const uint8_t *src_ptr, int src_stride, - const uint8_t *pred_ptr, int pred_stride) { +void vp9_subtract_block_c(int rows, int cols, + int16_t *diff_ptr, ptrdiff_t diff_stride, + const uint8_t *src_ptr, ptrdiff_t src_stride, + const uint8_t *pred_ptr, ptrdiff_t pred_stride) { int r, c; for (r = 0; r < rows; r++) { diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h index 579690346..3042c9f7f 100644 --- a/vp9/encoder/vp9_encodemb.h +++ b/vp9/encoder/vp9_encodemb.h @@ -42,10 +42,6 @@ void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); void vp9_xform_quant_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); void vp9_xform_quant_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); -void vp9_subtract_block(int rows, int cols, - int16_t *diff_ptr, int diff_stride, - const uint8_t *src_ptr, int src_stride, - const uint8_t *pred_ptr, int pred_stride); void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); void vp9_subtract_sb(MACROBLOCK *xd, BLOCK_SIZE_TYPE bsize); diff --git a/vp9/encoder/x86/vp9_subtract_mmx.asm b/vp9/encoder/x86/vp9_subtract_mmx.asm deleted file mode 100644 index e9eda4fed..000000000 --- a/vp9/encoder/x86/vp9_subtract_mmx.asm +++ /dev/null @@ -1,432 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_subtract_b_mmx_impl(unsigned char *z, int src_stride, -; short *diff, unsigned char *Predictor, -; int pitch); -global sym(vp9_subtract_b_mmx_impl) PRIVATE -sym(vp9_subtract_b_mmx_impl): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - - mov rdi, arg(2) ;diff - mov rax, arg(3) ;Predictor - mov rsi, arg(0) ;z - movsxd rdx, dword ptr arg(1);src_stride; - movsxd rcx, dword ptr arg(4);pitch - pxor mm7, mm7 - - movd mm0, [rsi] - movd mm1, [rax] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq [rdi], mm0 - - - movd mm0, [rsi+rdx] - movd mm1, [rax+rcx] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq [rdi+rcx*2],mm0 - - - movd mm0, [rsi+rdx*2] - movd mm1, [rax+rcx*2] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq [rdi+rcx*4], mm0 - - lea rsi, [rsi+rdx*2] - lea rcx, [rcx+rcx*2] - - - - movd mm0, [rsi+rdx] - movd mm1, [rax+rcx] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq [rdi+rcx*2], mm0 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride) -global sym(vp9_subtract_mby_mmx) PRIVATE -sym(vp9_subtract_mby_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - - mov rsi, arg(1) ;src - mov rdi, arg(0) ;diff - - mov rax, arg(2) ;pred - movsxd rdx, dword ptr arg(3) ;stride - - mov rcx, 16 - pxor mm0, mm0 - -.submby_loop: - - movq mm1, [rsi] - movq mm3, [rax] - - movq mm2, mm1 - movq mm4, mm3 - - punpcklbw mm1, mm0 - punpcklbw mm3, mm0 - - punpckhbw mm2, mm0 - punpckhbw mm4, mm0 - - psubw mm1, mm3 - psubw mm2, mm4 - - movq [rdi], mm1 - movq [rdi+8], mm2 - - - movq mm1, [rsi+8] - movq mm3, [rax+8] - - movq mm2, mm1 - movq mm4, mm3 - - punpcklbw mm1, mm0 - punpcklbw mm3, mm0 - - punpckhbw mm2, mm0 - punpckhbw mm4, mm0 - - psubw mm1, mm3 - psubw mm2, mm4 - - movq [rdi+16], mm1 - movq [rdi+24], mm2 - - - add rdi, 32 - add rax, 16 - - lea rsi, [rsi+rdx] - - sub rcx, 1 - jnz .submby_loop - - pop rdi - pop rsi - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) -global sym(vp9_subtract_mbuv_mmx) PRIVATE -sym(vp9_subtract_mbuv_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - ;short *udiff = diff + 256; - ;short *vdiff = diff + 320; - ;unsigned char *upred = pred + 256; - ;unsigned char *vpred = pred + 320; - - ;unsigned char *z = usrc; - ;unsigned short *diff = udiff; - ;unsigned char *Predictor= upred; - - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, arg(1) ;z = usrc - add rdi, 256*2 ;diff = diff + 256 (shorts) - add rax, 256 ;Predictor = pred + 256 - movsxd rdx, dword ptr arg(4) ;stride; - pxor mm7, mm7 - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - ;unsigned char *z = vsrc; - ;unsigned short *diff = vdiff; - ;unsigned char *Predictor= vpred; - - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, arg(2) ;z = usrc - add rdi, 320*2 ;diff = diff + 320 (shorts) - add rax, 320 ;Predictor = pred + 320 - movsxd rdx, dword ptr arg(4) ;stride; - pxor mm7, mm7 - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/vp9/encoder/x86/vp9_subtract_sse2.asm b/vp9/encoder/x86/vp9_subtract_sse2.asm index 739d9487e..e428a1397 100644 --- a/vp9/encoder/x86/vp9_subtract_sse2.asm +++ b/vp9/encoder/x86/vp9_subtract_sse2.asm @@ -8,349 +8,121 @@ ; be found in the AUTHORS file in the root of the source tree. ; - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_subtract_b_sse2_impl(unsigned char *z, int src_stride, -; short *diff, unsigned char *Predictor, -; int pitch); -global sym(vp9_subtract_b_sse2_impl) PRIVATE -sym(vp9_subtract_b_sse2_impl): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdi, arg(2) ;diff - mov rax, arg(3) ;Predictor - mov rsi, arg(0) ;z - movsxd rdx, dword ptr arg(1);src_stride; - movsxd rcx, dword ptr arg(4);pitch - pxor mm7, mm7 - - movd mm0, [rsi] - movd mm1, [rax] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq MMWORD PTR [rdi], mm0 - - movd mm0, [rsi+rdx] - movd mm1, [rax+rcx] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq MMWORD PTR [rdi+rcx*2], mm0 - - movd mm0, [rsi+rdx*2] - movd mm1, [rax+rcx*2] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq MMWORD PTR [rdi+rcx*4], mm0 - - lea rsi, [rsi+rdx*2] - lea rcx, [rcx+rcx*2] - - movd mm0, [rsi+rdx] - movd mm1, [rax+rcx] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq MMWORD PTR [rdi+rcx*2], mm0 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride) -global sym(vp9_subtract_mby_sse2) PRIVATE -sym(vp9_subtract_mby_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(1) ;src - mov rdi, arg(0) ;diff - - mov rax, arg(2) ;pred - movsxd rdx, dword ptr arg(3) ;stride - - mov rcx, 8 ; do two lines at one time - -.submby_loop: - movdqa xmm0, XMMWORD PTR [rsi] ; src - movdqa xmm1, XMMWORD PTR [rax] ; pred - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 - - movdqa xmm4, XMMWORD PTR [rsi + rdx] - movdqa xmm5, XMMWORD PTR [rax + 16] - - movdqa xmm6, xmm4 - psubb xmm4, xmm5 - - pxor xmm5, [GLOBAL(t80)] ;convert to signed values - pxor xmm6, [GLOBAL(t80)] - pcmpgtb xmm5, xmm6 ; obtain sign information - - movdqa xmm6, xmm4 - movdqa xmm7, xmm5 - punpcklbw xmm4, xmm5 ; put sign back to subtraction - punpckhbw xmm6, xmm7 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi +32], xmm4 - movdqa XMMWORD PTR [rdi +48], xmm6 - - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] - - sub rcx, 1 - jnz .submby_loop - - pop rdi - pop rsi - ; begin epilog - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) -global sym(vp9_subtract_mbuv_sse2) PRIVATE -sym(vp9_subtract_mbuv_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, arg(1) ;z = usrc - add rdi, 256*2 ;diff = diff + 256 (shorts) - add rax, 256 ;Predictor = pred + 256 - movsxd rdx, dword ptr arg(4) ;stride; - lea rcx, [rdx + rdx*2] - - ;u - ;line 0 1 - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 - - ;line 2 3 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+16] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 32], xmm0 - movdqa XMMWORD PTR [rdi + 48], xmm2 - - ;line 4 5 - lea rsi, [rsi + rdx*4] - - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax + 32] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 64], xmm0 - movdqa XMMWORD PTR [rdi + 80], xmm2 - - ;line 6 7 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 96], xmm0 - movdqa XMMWORD PTR [rdi + 112], xmm2 - - ;v - mov rsi, arg(2) ;z = vsrc - add rdi, 64*2 ;diff = diff + 320 (shorts) - add rax, 64 ;Predictor = pred + 320 - - ;line 0 1 - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 - - ;line 2 3 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+16] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 32], xmm0 - movdqa XMMWORD PTR [rdi + 48], xmm2 - - ;line 4 5 - lea rsi, [rsi + rdx*4] - - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax + 32] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 64], xmm0 - movdqa XMMWORD PTR [rdi + 80], xmm2 - - ;line 6 7 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 96], xmm0 - movdqa XMMWORD PTR [rdi + 112], xmm2 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -t80: - times 16 db 0x80 +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; void vp9_subtract_block(int rows, int cols, +; int16_t *diff, ptrdiff_t diff_stride, +; const uint8_t *src, ptrdiff_t src_stride, +; const uint8_t *pred, ptrdiff_t pred_stride) + +INIT_XMM sse2 +cglobal subtract_block, 7, 7, 8, \ + rows, cols, diff, diff_stride, src, src_stride, \ + pred, pred_stride +%define pred_str colsq + pxor m7, m7 ; dedicated zero register + cmp colsd, 4 + je .case_4 + cmp colsd, 8 + je .case_8 + cmp colsd, 16 + je .case_16 + cmp colsd, 32 + je .case_32 + +%macro loop16 6 + mova m0, [srcq+%1] + mova m4, [srcq+%2] + mova m1, [predq+%3] + mova m5, [predq+%4] + punpckhbw m2, m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m0, m7 + punpcklbw m1, m7 + psubw m2, m3 + psubw m0, m1 + punpckhbw m1, m4, m7 + punpckhbw m3, m5, m7 + punpcklbw m4, m7 + punpcklbw m5, m7 + psubw m1, m3 + psubw m4, m5 + mova [diffq+mmsize*0+%5], m0 + mova [diffq+mmsize*1+%5], m2 + mova [diffq+mmsize*0+%6], m4 + mova [diffq+mmsize*1+%6], m1 +%endmacro + + mov pred_str, pred_stridemp +.loop_64: + loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize + loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + dec rowsd + jg .loop_64 + RET + +.case_32: + mov pred_str, pred_stridemp +.loop_32: + loop16 0, mmsize, 0, mmsize, 0, 2*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + dec rowsd + jg .loop_32 + RET + +.case_16: + mov pred_str, pred_stridemp +.loop_16: + loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2 + lea diffq, [diffq+diff_strideq*4] + lea predq, [predq+pred_str*2] + lea srcq, [srcq+src_strideq*2] + sub rowsd, 2 + jg .loop_16 + RET + +%macro loop_h 0 + movh m0, [srcq] + movh m2, [srcq+src_strideq] + movh m1, [predq] + movh m3, [predq+pred_str] + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + psubw m0, m1 + psubw m2, m3 + mova [diffq], m0 + mova [diffq+diff_strideq*2], m2 +%endmacro + +.case_8: + mov pred_str, pred_stridemp +.loop_8: + loop_h + lea diffq, [diffq+diff_strideq*4] + lea srcq, [srcq+src_strideq*2] + lea predq, [predq+pred_str*2] + sub rowsd, 2 + jg .loop_8 + RET + +INIT_MMX +.case_4: + mov pred_str, pred_stridemp +.loop_4: + loop_h + lea diffq, [diffq+diff_strideq*4] + lea srcq, [srcq+src_strideq*2] + lea predq, [predq+pred_str*2] + sub rowsd, 2 + jg .loop_4 + emms + RET diff --git a/vp9/encoder/x86/vp9_x86_csystemdependent.c b/vp9/encoder/x86/vp9_x86_csystemdependent.c deleted file mode 100644 index 6016e14eb..000000000 --- a/vp9/encoder/x86/vp9_x86_csystemdependent.c +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "./vpx_config.h" -#include "vpx_ports/x86.h" -#include "vp9/encoder/vp9_variance.h" -#include "vp9/encoder/vp9_onyx_int.h" -#include "vp9/encoder/x86/vp9_dct_mmx.h" - -// TODO(jimbankoski) Consider rewriting the c to take the same values rather -// than going through these pointer conversions -#if 0 && HAVE_MMX -void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) { - vp9_short_fdct4x4_mmx(input, output, pitch); - vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch); -} - -void vp9_subtract_b_mmx_impl(unsigned char *z, int src_stride, - short *diff, unsigned char *predictor, - int pitch); -void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) { - unsigned char *z = *(be->base_src) + be->src; - unsigned int src_stride = be->src_stride; - short *diff = &be->src_diff[0]; - unsigned char *predictor = *(bd->base_dst) + bd->dst; - // TODO(jingning): The prototype function in c has been changed. Need to - // modify the mmx and sse versions. - vp9_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch); -} - -#endif - -#if 0 && HAVE_SSE2 -void vp9_subtract_b_sse2_impl(unsigned char *z, int src_stride, - short *diff, unsigned char *predictor, - int pitch); -void vp9_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) { - unsigned char *z = *(be->base_src) + be->src; - unsigned int src_stride = be->src_stride; - short *diff = &be->src_diff[0]; - unsigned char *predictor = *(bd->base_dst) + bd->dst; - // TODO(jingning): The prototype function in c has been changed. Need to - // modify the mmx and sse versions. - vp9_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch); -} - -#endif diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 9ce91544f..b9afe96b2 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -73,13 +73,11 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_mcomp_x86.h -VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_x86_csystemdependent.c VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.h -VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_subtract_mmx.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm -- cgit v1.2.3