author     Johann <johannkoenig@google.com>  2015-04-17 16:11:38 -0400
committer  Johann <johannkoenig@google.com>  2015-05-06 16:58:20 -0700
commit     d5d92898001064c74548a7fa04b0f624de4afb40 (patch)
tree       f10d02eae337f0cb5561404ac9109203776ecc32 /vp9/encoder
parent     c77b1f5acd09852aff1ba09d7f371728a60634d7 (diff)
download   libvpx-d5d92898001064c74548a7fa04b0f624de4afb40.tar
           libvpx-d5d92898001064c74548a7fa04b0f624de4afb40.tar.gz
           libvpx-d5d92898001064c74548a7fa04b0f624de4afb40.tar.bz2
           libvpx-d5d92898001064c74548a7fa04b0f624de4afb40.zip
Move shared SAD code to vpx_dsp
Create a new component, vpx_dsp, for code that can be shared
between codecs. Move the SAD code into the component.
This reduces the size of vpxenc/dec by 36k on x86_64 builds.
Change-Id: I73f837ddaecac6b350bf757af0cfe19c4ab9327a
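The mechanics of the split are visible in the diff below: callers include the new vpx_dsp RTCD header and run its one-time dispatch setup next to the existing per-codec setup. A minimal sketch of the consuming side, using only the calls that appear in this change:

    #include "./vp9_rtcd.h"       // per-codec dispatch, unchanged
    #include "./vpx_dsp_rtcd.h"   // run-time CPU dispatch for the shared DSP component

    static void init_once(void) {
      vp9_rtcd();      // codec-specific function pointers
      vpx_dsp_rtcd();  // shared SAD (and future vpx_dsp) function pointers
    }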
Diffstat (limited to 'vp9/encoder')
-rw-r--r--  vp9/encoder/arm/neon/vp9_sad4d_neon.c      226
-rw-r--r--  vp9/encoder/arm/neon/vp9_sad_neon.c        130
-rw-r--r--  vp9/encoder/vp9_encoder.c                  500
-rw-r--r--  vp9/encoder/vp9_mbgraph.c                   13
-rw-r--r--  vp9/encoder/vp9_sad.c                      276
-rw-r--r--  vp9/encoder/x86/vp9_highbd_sad4d_sse2.asm  287
-rw-r--r--  vp9/encoder/x86/vp9_highbd_sad_sse2.asm    363
-rw-r--r--  vp9/encoder/x86/vp9_sad4d_intrin_avx2.c    167
-rw-r--r--  vp9/encoder/x86/vp9_sad4d_sse2.asm         231
-rw-r--r--  vp9/encoder/x86/vp9_sad_intrin_avx2.c      180
-rw-r--r--  vp9/encoder/x86/vp9_sad_sse2.asm           267
-rw-r--r--  vp9/encoder/x86/vp9_sad_sse3.asm           378
-rw-r--r--  vp9/encoder/x86/vp9_sad_sse4.asm           359
-rw-r--r--  vp9/encoder/x86/vp9_sad_ssse3.asm          370
14 files changed, 259 insertions, 3488 deletions
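Every kernel touched below computes the same quantity: the sum of absolute differences between a source block and a reference block. For orientation, this is the portable C fallback being relocated (it appears verbatim in the vp9_sad.c deletion further down):

    static INLINE unsigned int sad(const uint8_t *a, int a_stride,
                                   const uint8_t *b, int b_stride,
                                   int width, int height) {
      int y, x;
      unsigned int sad = 0;
      for (y = 0; y < height; y++) {
        for (x = 0; x < width; x++)
          sad += abs(a[x] - b[x]);   // per-pixel absolute difference
        a += a_stride;
        b += b_stride;
      }
      return sad;
    }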
diff --git a/vp9/encoder/arm/neon/vp9_sad4d_neon.c b/vp9/encoder/arm/neon/vp9_sad4d_neon.c
deleted file mode 100644
index cec1689f1..000000000
--- a/vp9/encoder/arm/neon/vp9_sad4d_neon.c
+++ /dev/null

/*
 *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vp9_rtcd.h"
#include "./vpx_config.h"

#include "vpx/vpx_integer.h"

static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}

// Calculate the absolute difference of 64 bytes from vec_src_00, vec_src_16,
// vec_src_32, vec_src_48 and ref. Accumulate partial sums in vec_sum_ref_lo
// and vec_sum_ref_hi.
static void sad_neon_64(const uint8x16_t vec_src_00, const uint8x16_t vec_src_16,
                        const uint8x16_t vec_src_32, const uint8x16_t vec_src_48,
                        const uint8_t *ref,
                        uint16x8_t *vec_sum_ref_lo, uint16x8_t *vec_sum_ref_hi) {
  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
  const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
  const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);

  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), vget_low_u8(vec_ref_00));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), vget_high_u8(vec_ref_00));
  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), vget_low_u8(vec_ref_16));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), vget_high_u8(vec_ref_16));
  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_32), vget_low_u8(vec_ref_32));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_32), vget_high_u8(vec_ref_32));
  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_48), vget_low_u8(vec_ref_48));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_48), vget_high_u8(vec_ref_48));
}

// Calculate the absolute difference of 32 bytes from vec_src_00, vec_src_16,
// and ref. Accumulate partial sums in vec_sum_ref_lo and vec_sum_ref_hi.
static void sad_neon_32(const uint8x16_t vec_src_00, const uint8x16_t vec_src_16,
                        const uint8_t *ref,
                        uint16x8_t *vec_sum_ref_lo, uint16x8_t *vec_sum_ref_hi) {
  const uint8x16_t vec_ref_00 = vld1q_u8(ref);
  const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);

  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_00), vget_low_u8(vec_ref_00));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_00), vget_high_u8(vec_ref_00));
  *vec_sum_ref_lo = vabal_u8(*vec_sum_ref_lo, vget_low_u8(vec_src_16), vget_low_u8(vec_ref_16));
  *vec_sum_ref_hi = vabal_u8(*vec_sum_ref_hi, vget_high_u8(vec_src_16), vget_high_u8(vec_ref_16));
}

void vp9_sad64x64x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t* const ref[4], int ref_stride,
                          unsigned int *res) {
  int i;
  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
  const uint8_t *ref0, *ref1, *ref2, *ref3;
  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];

  for (i = 0; i < 64; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);

    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref0,
                &vec_sum_ref0_lo, &vec_sum_ref0_hi);
    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref1,
                &vec_sum_ref1_lo, &vec_sum_ref1_hi);
    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref2,
                &vec_sum_ref2_lo, &vec_sum_ref2_hi);
    sad_neon_64(vec_src_00, vec_src_16, vec_src_32, vec_src_48, ref3,
                &vec_sum_ref3_lo, &vec_sum_ref3_hi);

    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }

  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
}

void vp9_sad32x32x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t* const ref[4], int ref_stride,
                          unsigned int *res) {
  int i;
  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
  const uint8_t *ref0, *ref1, *ref2, *ref3;
  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];

  for (i = 0; i < 32; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);

    sad_neon_32(vec_src_00, vec_src_16, ref0, &vec_sum_ref0_lo, &vec_sum_ref0_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref1, &vec_sum_ref1_lo, &vec_sum_ref1_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref2, &vec_sum_ref2_lo, &vec_sum_ref2_hi);
    sad_neon_32(vec_src_00, vec_src_16, ref3, &vec_sum_ref3_lo, &vec_sum_ref3_hi);

    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }

  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
}

void vp9_sad16x16x4d_neon(const uint8_t *src, int src_stride,
                          const uint8_t* const ref[4], int ref_stride,
                          unsigned int *res) {
  int i;
  uint16x8_t vec_sum_ref0_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref0_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref1_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref2_hi = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_lo = vdupq_n_u16(0);
  uint16x8_t vec_sum_ref3_hi = vdupq_n_u16(0);
  const uint8_t *ref0, *ref1, *ref2, *ref3;
  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];

  for (i = 0; i < 16; ++i) {
    const uint8x16_t vec_src = vld1q_u8(src);
    const uint8x16_t vec_ref0 = vld1q_u8(ref0);
    const uint8x16_t vec_ref1 = vld1q_u8(ref1);
    const uint8x16_t vec_ref2 = vld1q_u8(ref2);
    const uint8x16_t vec_ref3 = vld1q_u8(ref3);

    vec_sum_ref0_lo = vabal_u8(vec_sum_ref0_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref0));
    vec_sum_ref0_hi = vabal_u8(vec_sum_ref0_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref0));
    vec_sum_ref1_lo = vabal_u8(vec_sum_ref1_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref1));
    vec_sum_ref1_hi = vabal_u8(vec_sum_ref1_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref1));
    vec_sum_ref2_lo = vabal_u8(vec_sum_ref2_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref2));
    vec_sum_ref2_hi = vabal_u8(vec_sum_ref2_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref2));
    vec_sum_ref3_lo = vabal_u8(vec_sum_ref3_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref3));
    vec_sum_ref3_hi = vabal_u8(vec_sum_ref3_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref3));

    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }

  res[0] = horizontal_long_add_16x8(vec_sum_ref0_lo, vec_sum_ref0_hi);
  res[1] = horizontal_long_add_16x8(vec_sum_ref1_lo, vec_sum_ref1_hi);
  res[2] = horizontal_long_add_16x8(vec_sum_ref2_lo, vec_sum_ref2_hi);
  res[3] = horizontal_long_add_16x8(vec_sum_ref3_lo, vec_sum_ref3_hi);
}
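The x4d variants score one source block against four candidate reference blocks in a single pass, so the source loads are amortized across all four candidates. A hypothetical caller (the candidate pointers are illustrative; the signature is the one deleted above):

    /* Score four motion-vector candidates in one pass (cand0..cand3 assumed). */
    const uint8_t *const refs[4] = { cand0, cand1, cand2, cand3 };
    unsigned int sads[4];
    vp9_sad16x16x4d_neon(src, src_stride, refs, ref_stride, sads);
    /* sads[i] now holds the 16x16 SAD of src against cand<i>. */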
diff --git a/vp9/encoder/arm/neon/vp9_sad_neon.c b/vp9/encoder/arm/neon/vp9_sad_neon.c
deleted file mode 100644
index c4cd85680..000000000
--- a/vp9/encoder/arm/neon/vp9_sad_neon.c
+++ /dev/null

/*
 *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <arm_neon.h>
#include "./vp9_rtcd.h"
#include "./vpx_config.h"

#include "vpx/vpx_integer.h"

static INLINE unsigned int horizontal_long_add_16x8(const uint16x8_t vec_lo,
                                                    const uint16x8_t vec_hi) {
  const uint32x4_t vec_l_lo = vaddl_u16(vget_low_u16(vec_lo), vget_high_u16(vec_lo));
  const uint32x4_t vec_l_hi = vaddl_u16(vget_low_u16(vec_hi), vget_high_u16(vec_hi));
  const uint32x4_t a = vaddq_u32(vec_l_lo, vec_l_hi);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}

static INLINE unsigned int horizontal_add_16x8(const uint16x8_t vec_16x8) {
  const uint32x4_t a = vpaddlq_u16(vec_16x8);
  const uint64x2_t b = vpaddlq_u32(a);
  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
                                vreinterpret_u32_u64(vget_high_u64(b)));
  return vget_lane_u32(c, 0);
}

unsigned int vp9_sad64x64_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);
  for (i = 0; i < 64; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_src_32 = vld1q_u8(src + 32);
    const uint8x16_t vec_src_48 = vld1q_u8(src + 48);
    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
    const uint8x16_t vec_ref_32 = vld1q_u8(ref + 32);
    const uint8x16_t vec_ref_48 = vld1q_u8(ref + 48);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), vget_low_u8(vec_ref_00));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), vget_high_u8(vec_ref_00));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), vget_low_u8(vec_ref_16));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), vget_high_u8(vec_ref_16));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_32), vget_low_u8(vec_ref_32));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_32), vget_high_u8(vec_ref_32));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_48), vget_low_u8(vec_ref_48));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_48), vget_high_u8(vec_ref_48));
  }
  return horizontal_long_add_16x8(vec_accum_lo, vec_accum_hi);
}

unsigned int vp9_sad32x32_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);

  for (i = 0; i < 32; ++i) {
    const uint8x16_t vec_src_00 = vld1q_u8(src);
    const uint8x16_t vec_src_16 = vld1q_u8(src + 16);
    const uint8x16_t vec_ref_00 = vld1q_u8(ref);
    const uint8x16_t vec_ref_16 = vld1q_u8(ref + 16);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_00), vget_low_u8(vec_ref_00));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_00), vget_high_u8(vec_ref_00));
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src_16), vget_low_u8(vec_ref_16));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src_16), vget_high_u8(vec_ref_16));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}

unsigned int vp9_sad16x16_neon(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum_lo = vdupq_n_u16(0);
  uint16x8_t vec_accum_hi = vdupq_n_u16(0);

  for (i = 0; i < 16; ++i) {
    const uint8x16_t vec_src = vld1q_u8(src);
    const uint8x16_t vec_ref = vld1q_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum_lo = vabal_u8(vec_accum_lo, vget_low_u8(vec_src), vget_low_u8(vec_ref));
    vec_accum_hi = vabal_u8(vec_accum_hi, vget_high_u8(vec_src), vget_high_u8(vec_ref));
  }
  return horizontal_add_16x8(vaddq_u16(vec_accum_lo, vec_accum_hi));
}

unsigned int vp9_sad8x8_neon(const uint8_t *src, int src_stride,
                             const uint8_t *ref, int ref_stride) {
  int i;
  uint16x8_t vec_accum = vdupq_n_u16(0);

  for (i = 0; i < 8; ++i) {
    const uint8x8_t vec_src = vld1_u8(src);
    const uint8x8_t vec_ref = vld1_u8(ref);
    src += src_stride;
    ref += ref_stride;
    vec_accum = vabal_u8(vec_accum, vec_src, vec_ref);
  }
  return horizontal_add_16x8(vec_accum);
}
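The workhorse in all of these kernels is vabal_u8, which computes per-lane absolute differences and widens them into 16-bit accumulators. A scalar model of one step, for illustration:

    /* Scalar model of one vabal_u8 step on an 8-lane accumulator. */
    void vabal_u8_model(uint16_t acc[8], const uint8_t a[8], const uint8_t b[8]) {
      int lane;
      for (lane = 0; lane < 8; ++lane)
        acc[lane] += (uint16_t)abs((int)a[lane] - (int)b[lane]);
    }

The per-lane 16-bit sums never overflow for these block sizes, but the final cross-lane total can reach 64 * 64 * 255 (about one million) for 64x64, which is why the reduction widens through vaddl_u16/vpaddlq_u32 before summing to a scalar.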
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index a6e4c9c27..a1018adb8 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -14,6 +14,7 @@
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
 #include "vpx/internal/vpx_psnr.h"
 #include "vpx_ports/vpx_timer.h"
@@ -318,6 +319,7 @@ void vp9_initialize_enc(void) {
   if (!init_done) {
     vp9_rtcd();
+    vpx_dsp_rtcd();
     vpx_scale_rtcd();
     vp9_init_intra_predictors();
     vp9_init_me_luts();
@@ -929,61 +931,61 @@ static void fnname##_bits12(const uint8_t *src_ptr, \
     sad_array[i] >>= 4; \
 }

-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad32x16)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad32x16_avg)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad32x16x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad16x32)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad16x32_avg)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad16x32x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad64x32)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad64x32_avg)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad64x32x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad32x64)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad32x64_avg)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad32x64x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad32x32)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad32x32_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad32x32x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad32x32x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad32x32x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad64x64)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad64x64_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad64x64x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad64x64x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad64x64x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad16x16)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad16x16_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad16x16x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad16x16x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad16x16x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad16x8)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad16x8_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad16x8x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad16x8x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad16x8x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad8x16)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad8x16_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad8x16x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad8x16x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad8x16x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad8x8)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad8x8_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad8x8x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad8x8x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad8x8x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad8x4)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad8x4_avg)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad8x4x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad8x4x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad4x8)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad4x8_avg)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad4x8x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad4x8x4d)
-MAKE_BFP_SAD_WRAPPER(vp9_highbd_sad4x4)
-MAKE_BFP_SADAVG_WRAPPER(vp9_highbd_sad4x4_avg)
-MAKE_BFP_SAD3_WRAPPER(vp9_highbd_sad4x4x3)
-MAKE_BFP_SAD8_WRAPPER(vp9_highbd_sad4x4x8)
-MAKE_BFP_SAD4D_WRAPPER(vp9_highbd_sad4x4x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x32)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x32x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x32)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x32_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x32x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x64)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x64x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x32)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x32_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad32x32x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad32x32x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x32x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x64)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x64_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad64x64x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad64x64x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x64x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x16_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x16x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x16x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x16x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x8)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad16x8_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad16x8x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad16x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad16x8x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x16)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x16_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x16x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x16x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x16x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x8)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x8_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad8x8x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x8x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad8x4)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad8x4_avg)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad8x4x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad8x4x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x8)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x8_avg)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x8x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x8x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad4x4)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad4x4_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad4x4x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x4x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d)

 static void highbd_set_var_fns(VP9_COMP *const cpi) {
   VP9_COMMON *const cm = &cpi->common;
@@ -991,398 +993,398 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
   switch (cm->bit_depth) {
     case VPX_BITS_8:
       HIGHBD_BFP(BLOCK_32X16,
-                 vp9_highbd_sad32x16_bits8, vp9_highbd_sad32x16_avg_bits8,
+                 vpx_highbd_sad32x16_bits8, vpx_highbd_sad32x16_avg_bits8,
                  vp9_highbd_variance32x16, vp9_highbd_sub_pixel_variance32x16,
                  vp9_highbd_sub_pixel_avg_variance32x16, NULL, NULL,
-                 vp9_highbd_sad32x16x4d_bits8)
+                 vpx_highbd_sad32x16x4d_bits8)
       HIGHBD_BFP(BLOCK_16X32,
-                 vp9_highbd_sad16x32_bits8, vp9_highbd_sad16x32_avg_bits8,
+                 vpx_highbd_sad16x32_bits8, vpx_highbd_sad16x32_avg_bits8,
                  vp9_highbd_variance16x32, vp9_highbd_sub_pixel_variance16x32,
                  vp9_highbd_sub_pixel_avg_variance16x32, NULL, NULL,
-                 vp9_highbd_sad16x32x4d_bits8)
+                 vpx_highbd_sad16x32x4d_bits8)
       HIGHBD_BFP(BLOCK_64X32,
-                 vp9_highbd_sad64x32_bits8, vp9_highbd_sad64x32_avg_bits8,
+                 vpx_highbd_sad64x32_bits8, vpx_highbd_sad64x32_avg_bits8,
                  vp9_highbd_variance64x32, vp9_highbd_sub_pixel_variance64x32,
                  vp9_highbd_sub_pixel_avg_variance64x32, NULL, NULL,
-                 vp9_highbd_sad64x32x4d_bits8)
+                 vpx_highbd_sad64x32x4d_bits8)
       HIGHBD_BFP(BLOCK_32X64,
-                 vp9_highbd_sad32x64_bits8, vp9_highbd_sad32x64_avg_bits8,
+                 vpx_highbd_sad32x64_bits8, vpx_highbd_sad32x64_avg_bits8,
                  vp9_highbd_variance32x64, vp9_highbd_sub_pixel_variance32x64,
                  vp9_highbd_sub_pixel_avg_variance32x64, NULL, NULL,
-                 vp9_highbd_sad32x64x4d_bits8)
+                 vpx_highbd_sad32x64x4d_bits8)
       HIGHBD_BFP(BLOCK_32X32,
-                 vp9_highbd_sad32x32_bits8, vp9_highbd_sad32x32_avg_bits8,
+                 vpx_highbd_sad32x32_bits8, vpx_highbd_sad32x32_avg_bits8,
                  vp9_highbd_variance32x32, vp9_highbd_sub_pixel_variance32x32,
                  vp9_highbd_sub_pixel_avg_variance32x32,
-                 vp9_highbd_sad32x32x3_bits8, vp9_highbd_sad32x32x8_bits8,
-                 vp9_highbd_sad32x32x4d_bits8)
+                 vpx_highbd_sad32x32x3_bits8, vpx_highbd_sad32x32x8_bits8,
+                 vpx_highbd_sad32x32x4d_bits8)
       HIGHBD_BFP(BLOCK_64X64,
-                 vp9_highbd_sad64x64_bits8, vp9_highbd_sad64x64_avg_bits8,
+                 vpx_highbd_sad64x64_bits8, vpx_highbd_sad64x64_avg_bits8,
                  vp9_highbd_variance64x64, vp9_highbd_sub_pixel_variance64x64,
                  vp9_highbd_sub_pixel_avg_variance64x64,
-                 vp9_highbd_sad64x64x3_bits8, vp9_highbd_sad64x64x8_bits8,
-                 vp9_highbd_sad64x64x4d_bits8)
+                 vpx_highbd_sad64x64x3_bits8, vpx_highbd_sad64x64x8_bits8,
+                 vpx_highbd_sad64x64x4d_bits8)
       HIGHBD_BFP(BLOCK_16X16,
-                 vp9_highbd_sad16x16_bits8, vp9_highbd_sad16x16_avg_bits8,
+                 vpx_highbd_sad16x16_bits8, vpx_highbd_sad16x16_avg_bits8,
                  vp9_highbd_variance16x16, vp9_highbd_sub_pixel_variance16x16,
                  vp9_highbd_sub_pixel_avg_variance16x16,
-                 vp9_highbd_sad16x16x3_bits8, vp9_highbd_sad16x16x8_bits8,
-                 vp9_highbd_sad16x16x4d_bits8)
+                 vpx_highbd_sad16x16x3_bits8, vpx_highbd_sad16x16x8_bits8,
+                 vpx_highbd_sad16x16x4d_bits8)
       HIGHBD_BFP(BLOCK_16X8,
-                 vp9_highbd_sad16x8_bits8, vp9_highbd_sad16x8_avg_bits8,
+                 vpx_highbd_sad16x8_bits8, vpx_highbd_sad16x8_avg_bits8,
                  vp9_highbd_variance16x8, vp9_highbd_sub_pixel_variance16x8,
                  vp9_highbd_sub_pixel_avg_variance16x8,
-                 vp9_highbd_sad16x8x3_bits8, vp9_highbd_sad16x8x8_bits8,
-                 vp9_highbd_sad16x8x4d_bits8)
+                 vpx_highbd_sad16x8x3_bits8, vpx_highbd_sad16x8x8_bits8,
+                 vpx_highbd_sad16x8x4d_bits8)
       HIGHBD_BFP(BLOCK_8X16,
-                 vp9_highbd_sad8x16_bits8, vp9_highbd_sad8x16_avg_bits8,
+                 vpx_highbd_sad8x16_bits8, vpx_highbd_sad8x16_avg_bits8,
                  vp9_highbd_variance8x16, vp9_highbd_sub_pixel_variance8x16,
                  vp9_highbd_sub_pixel_avg_variance8x16,
-                 vp9_highbd_sad8x16x3_bits8, vp9_highbd_sad8x16x8_bits8,
-                 vp9_highbd_sad8x16x4d_bits8)
+                 vpx_highbd_sad8x16x3_bits8, vpx_highbd_sad8x16x8_bits8,
+                 vpx_highbd_sad8x16x4d_bits8)
       HIGHBD_BFP(BLOCK_8X8,
-                 vp9_highbd_sad8x8_bits8, vp9_highbd_sad8x8_avg_bits8,
+                 vpx_highbd_sad8x8_bits8, vpx_highbd_sad8x8_avg_bits8,
                  vp9_highbd_variance8x8, vp9_highbd_sub_pixel_variance8x8,
                  vp9_highbd_sub_pixel_avg_variance8x8,
-                 vp9_highbd_sad8x8x3_bits8, vp9_highbd_sad8x8x8_bits8,
-                 vp9_highbd_sad8x8x4d_bits8)
+                 vpx_highbd_sad8x8x3_bits8, vpx_highbd_sad8x8x8_bits8,
+                 vpx_highbd_sad8x8x4d_bits8)
       HIGHBD_BFP(BLOCK_8X4,
-                 vp9_highbd_sad8x4_bits8, vp9_highbd_sad8x4_avg_bits8,
+                 vpx_highbd_sad8x4_bits8, vpx_highbd_sad8x4_avg_bits8,
                  vp9_highbd_variance8x4, vp9_highbd_sub_pixel_variance8x4,
                  vp9_highbd_sub_pixel_avg_variance8x4, NULL,
-                 vp9_highbd_sad8x4x8_bits8, vp9_highbd_sad8x4x4d_bits8)
+                 vpx_highbd_sad8x4x8_bits8, vpx_highbd_sad8x4x4d_bits8)
       HIGHBD_BFP(BLOCK_4X8,
-                 vp9_highbd_sad4x8_bits8, vp9_highbd_sad4x8_avg_bits8,
+                 vpx_highbd_sad4x8_bits8, vpx_highbd_sad4x8_avg_bits8,
                  vp9_highbd_variance4x8, vp9_highbd_sub_pixel_variance4x8,
                  vp9_highbd_sub_pixel_avg_variance4x8, NULL,
-                 vp9_highbd_sad4x8x8_bits8, vp9_highbd_sad4x8x4d_bits8)
+                 vpx_highbd_sad4x8x8_bits8, vpx_highbd_sad4x8x4d_bits8)
       HIGHBD_BFP(BLOCK_4X4,
-                 vp9_highbd_sad4x4_bits8, vp9_highbd_sad4x4_avg_bits8,
+                 vpx_highbd_sad4x4_bits8, vpx_highbd_sad4x4_avg_bits8,
                  vp9_highbd_variance4x4, vp9_highbd_sub_pixel_variance4x4,
                  vp9_highbd_sub_pixel_avg_variance4x4,
-                 vp9_highbd_sad4x4x3_bits8, vp9_highbd_sad4x4x8_bits8,
-                 vp9_highbd_sad4x4x4d_bits8)
+                 vpx_highbd_sad4x4x3_bits8, vpx_highbd_sad4x4x8_bits8,
+                 vpx_highbd_sad4x4x4d_bits8)
       break;

     case VPX_BITS_10:
       HIGHBD_BFP(BLOCK_32X16,
-                 vp9_highbd_sad32x16_bits10, vp9_highbd_sad32x16_avg_bits10,
+                 vpx_highbd_sad32x16_bits10, vpx_highbd_sad32x16_avg_bits10,
                  vp9_highbd_10_variance32x16, vp9_highbd_10_sub_pixel_variance32x16,
                  vp9_highbd_10_sub_pixel_avg_variance32x16, NULL, NULL,
-                 vp9_highbd_sad32x16x4d_bits10)
+                 vpx_highbd_sad32x16x4d_bits10)
       HIGHBD_BFP(BLOCK_16X32,
-                 vp9_highbd_sad16x32_bits10, vp9_highbd_sad16x32_avg_bits10,
+                 vpx_highbd_sad16x32_bits10, vpx_highbd_sad16x32_avg_bits10,
                  vp9_highbd_10_variance16x32, vp9_highbd_10_sub_pixel_variance16x32,
                  vp9_highbd_10_sub_pixel_avg_variance16x32, NULL, NULL,
-                 vp9_highbd_sad16x32x4d_bits10)
+                 vpx_highbd_sad16x32x4d_bits10)
       HIGHBD_BFP(BLOCK_64X32,
-                 vp9_highbd_sad64x32_bits10, vp9_highbd_sad64x32_avg_bits10,
+                 vpx_highbd_sad64x32_bits10, vpx_highbd_sad64x32_avg_bits10,
                  vp9_highbd_10_variance64x32, vp9_highbd_10_sub_pixel_variance64x32,
                  vp9_highbd_10_sub_pixel_avg_variance64x32, NULL, NULL,
-                 vp9_highbd_sad64x32x4d_bits10)
+                 vpx_highbd_sad64x32x4d_bits10)
       HIGHBD_BFP(BLOCK_32X64,
-                 vp9_highbd_sad32x64_bits10, vp9_highbd_sad32x64_avg_bits10,
+                 vpx_highbd_sad32x64_bits10, vpx_highbd_sad32x64_avg_bits10,
                  vp9_highbd_10_variance32x64, vp9_highbd_10_sub_pixel_variance32x64,
                  vp9_highbd_10_sub_pixel_avg_variance32x64, NULL, NULL,
-                 vp9_highbd_sad32x64x4d_bits10)
+                 vpx_highbd_sad32x64x4d_bits10)
       HIGHBD_BFP(BLOCK_32X32,
-                 vp9_highbd_sad32x32_bits10, vp9_highbd_sad32x32_avg_bits10,
+                 vpx_highbd_sad32x32_bits10, vpx_highbd_sad32x32_avg_bits10,
                  vp9_highbd_10_variance32x32, vp9_highbd_10_sub_pixel_variance32x32,
                  vp9_highbd_10_sub_pixel_avg_variance32x32,
-                 vp9_highbd_sad32x32x3_bits10, vp9_highbd_sad32x32x8_bits10,
-                 vp9_highbd_sad32x32x4d_bits10)
+                 vpx_highbd_sad32x32x3_bits10, vpx_highbd_sad32x32x8_bits10,
+                 vpx_highbd_sad32x32x4d_bits10)
       HIGHBD_BFP(BLOCK_64X64,
-                 vp9_highbd_sad64x64_bits10, vp9_highbd_sad64x64_avg_bits10,
+                 vpx_highbd_sad64x64_bits10, vpx_highbd_sad64x64_avg_bits10,
                  vp9_highbd_10_variance64x64, vp9_highbd_10_sub_pixel_variance64x64,
                  vp9_highbd_10_sub_pixel_avg_variance64x64,
-                 vp9_highbd_sad64x64x3_bits10, vp9_highbd_sad64x64x8_bits10,
-                 vp9_highbd_sad64x64x4d_bits10)
+                 vpx_highbd_sad64x64x3_bits10, vpx_highbd_sad64x64x8_bits10,
+                 vpx_highbd_sad64x64x4d_bits10)
       HIGHBD_BFP(BLOCK_16X16,
-                 vp9_highbd_sad16x16_bits10, vp9_highbd_sad16x16_avg_bits10,
+                 vpx_highbd_sad16x16_bits10, vpx_highbd_sad16x16_avg_bits10,
                  vp9_highbd_10_variance16x16, vp9_highbd_10_sub_pixel_variance16x16,
                  vp9_highbd_10_sub_pixel_avg_variance16x16,
-                 vp9_highbd_sad16x16x3_bits10, vp9_highbd_sad16x16x8_bits10,
-                 vp9_highbd_sad16x16x4d_bits10)
+                 vpx_highbd_sad16x16x3_bits10, vpx_highbd_sad16x16x8_bits10,
+                 vpx_highbd_sad16x16x4d_bits10)
       HIGHBD_BFP(BLOCK_16X8,
-                 vp9_highbd_sad16x8_bits10, vp9_highbd_sad16x8_avg_bits10,
+                 vpx_highbd_sad16x8_bits10, vpx_highbd_sad16x8_avg_bits10,
                  vp9_highbd_10_variance16x8, vp9_highbd_10_sub_pixel_variance16x8,
                  vp9_highbd_10_sub_pixel_avg_variance16x8,
-                 vp9_highbd_sad16x8x3_bits10, vp9_highbd_sad16x8x8_bits10,
-                 vp9_highbd_sad16x8x4d_bits10)
+                 vpx_highbd_sad16x8x3_bits10, vpx_highbd_sad16x8x8_bits10,
+                 vpx_highbd_sad16x8x4d_bits10)
       HIGHBD_BFP(BLOCK_8X16,
-                 vp9_highbd_sad8x16_bits10, vp9_highbd_sad8x16_avg_bits10,
+                 vpx_highbd_sad8x16_bits10, vpx_highbd_sad8x16_avg_bits10,
                  vp9_highbd_10_variance8x16, vp9_highbd_10_sub_pixel_variance8x16,
                  vp9_highbd_10_sub_pixel_avg_variance8x16,
-                 vp9_highbd_sad8x16x3_bits10, vp9_highbd_sad8x16x8_bits10,
-                 vp9_highbd_sad8x16x4d_bits10)
+                 vpx_highbd_sad8x16x3_bits10, vpx_highbd_sad8x16x8_bits10,
+                 vpx_highbd_sad8x16x4d_bits10)
       HIGHBD_BFP(BLOCK_8X8,
-                 vp9_highbd_sad8x8_bits10, vp9_highbd_sad8x8_avg_bits10,
+                 vpx_highbd_sad8x8_bits10, vpx_highbd_sad8x8_avg_bits10,
                  vp9_highbd_10_variance8x8, vp9_highbd_10_sub_pixel_variance8x8,
                  vp9_highbd_10_sub_pixel_avg_variance8x8,
-                 vp9_highbd_sad8x8x3_bits10, vp9_highbd_sad8x8x8_bits10,
-                 vp9_highbd_sad8x8x4d_bits10)
+                 vpx_highbd_sad8x8x3_bits10, vpx_highbd_sad8x8x8_bits10,
+                 vpx_highbd_sad8x8x4d_bits10)
       HIGHBD_BFP(BLOCK_8X4,
-                 vp9_highbd_sad8x4_bits10, vp9_highbd_sad8x4_avg_bits10,
+                 vpx_highbd_sad8x4_bits10, vpx_highbd_sad8x4_avg_bits10,
                  vp9_highbd_10_variance8x4, vp9_highbd_10_sub_pixel_variance8x4,
                  vp9_highbd_10_sub_pixel_avg_variance8x4, NULL,
-                 vp9_highbd_sad8x4x8_bits10, vp9_highbd_sad8x4x4d_bits10)
+                 vpx_highbd_sad8x4x8_bits10, vpx_highbd_sad8x4x4d_bits10)
       HIGHBD_BFP(BLOCK_4X8,
-                 vp9_highbd_sad4x8_bits10, vp9_highbd_sad4x8_avg_bits10,
+                 vpx_highbd_sad4x8_bits10, vpx_highbd_sad4x8_avg_bits10,
                  vp9_highbd_10_variance4x8, vp9_highbd_10_sub_pixel_variance4x8,
                  vp9_highbd_10_sub_pixel_avg_variance4x8, NULL,
-                 vp9_highbd_sad4x8x8_bits10, vp9_highbd_sad4x8x4d_bits10)
+                 vpx_highbd_sad4x8x8_bits10, vpx_highbd_sad4x8x4d_bits10)
       HIGHBD_BFP(BLOCK_4X4,
-                 vp9_highbd_sad4x4_bits10, vp9_highbd_sad4x4_avg_bits10,
+                 vpx_highbd_sad4x4_bits10, vpx_highbd_sad4x4_avg_bits10,
                  vp9_highbd_10_variance4x4, vp9_highbd_10_sub_pixel_variance4x4,
                  vp9_highbd_10_sub_pixel_avg_variance4x4,
-                 vp9_highbd_sad4x4x3_bits10, vp9_highbd_sad4x4x8_bits10,
-                 vp9_highbd_sad4x4x4d_bits10)
+                 vpx_highbd_sad4x4x3_bits10, vpx_highbd_sad4x4x8_bits10,
+                 vpx_highbd_sad4x4x4d_bits10)
       break;

     case VPX_BITS_12:
       HIGHBD_BFP(BLOCK_32X16,
-                 vp9_highbd_sad32x16_bits12, vp9_highbd_sad32x16_avg_bits12,
+                 vpx_highbd_sad32x16_bits12, vpx_highbd_sad32x16_avg_bits12,
                  vp9_highbd_12_variance32x16, vp9_highbd_12_sub_pixel_variance32x16,
                  vp9_highbd_12_sub_pixel_avg_variance32x16, NULL, NULL,
-                 vp9_highbd_sad32x16x4d_bits12)
+                 vpx_highbd_sad32x16x4d_bits12)
       HIGHBD_BFP(BLOCK_16X32,
-                 vp9_highbd_sad16x32_bits12, vp9_highbd_sad16x32_avg_bits12,
+                 vpx_highbd_sad16x32_bits12, vpx_highbd_sad16x32_avg_bits12,
                  vp9_highbd_12_variance16x32, vp9_highbd_12_sub_pixel_variance16x32,
                  vp9_highbd_12_sub_pixel_avg_variance16x32, NULL, NULL,
-                 vp9_highbd_sad16x32x4d_bits12)
+                 vpx_highbd_sad16x32x4d_bits12)
       HIGHBD_BFP(BLOCK_64X32,
-                 vp9_highbd_sad64x32_bits12, vp9_highbd_sad64x32_avg_bits12,
+                 vpx_highbd_sad64x32_bits12, vpx_highbd_sad64x32_avg_bits12,
                  vp9_highbd_12_variance64x32, vp9_highbd_12_sub_pixel_variance64x32,
                  vp9_highbd_12_sub_pixel_avg_variance64x32, NULL, NULL,
-                 vp9_highbd_sad64x32x4d_bits12)
+                 vpx_highbd_sad64x32x4d_bits12)
       HIGHBD_BFP(BLOCK_32X64,
-                 vp9_highbd_sad32x64_bits12, vp9_highbd_sad32x64_avg_bits12,
+                 vpx_highbd_sad32x64_bits12, vpx_highbd_sad32x64_avg_bits12,
                  vp9_highbd_12_variance32x64, vp9_highbd_12_sub_pixel_variance32x64,
                  vp9_highbd_12_sub_pixel_avg_variance32x64, NULL, NULL,
-                 vp9_highbd_sad32x64x4d_bits12)
+                 vpx_highbd_sad32x64x4d_bits12)
       HIGHBD_BFP(BLOCK_32X32,
-                 vp9_highbd_sad32x32_bits12, vp9_highbd_sad32x32_avg_bits12,
+                 vpx_highbd_sad32x32_bits12, vpx_highbd_sad32x32_avg_bits12,
                  vp9_highbd_12_variance32x32, vp9_highbd_12_sub_pixel_variance32x32,
                  vp9_highbd_12_sub_pixel_avg_variance32x32,
-                 vp9_highbd_sad32x32x3_bits12, vp9_highbd_sad32x32x8_bits12,
-                 vp9_highbd_sad32x32x4d_bits12)
+                 vpx_highbd_sad32x32x3_bits12, vpx_highbd_sad32x32x8_bits12,
+                 vpx_highbd_sad32x32x4d_bits12)
       HIGHBD_BFP(BLOCK_64X64,
-                 vp9_highbd_sad64x64_bits12, vp9_highbd_sad64x64_avg_bits12,
+                 vpx_highbd_sad64x64_bits12, vpx_highbd_sad64x64_avg_bits12,
                  vp9_highbd_12_variance64x64, vp9_highbd_12_sub_pixel_variance64x64,
                  vp9_highbd_12_sub_pixel_avg_variance64x64,
-                 vp9_highbd_sad64x64x3_bits12, vp9_highbd_sad64x64x8_bits12,
-                 vp9_highbd_sad64x64x4d_bits12)
+                 vpx_highbd_sad64x64x3_bits12, vpx_highbd_sad64x64x8_bits12,
+                 vpx_highbd_sad64x64x4d_bits12)
       HIGHBD_BFP(BLOCK_16X16,
-                 vp9_highbd_sad16x16_bits12, vp9_highbd_sad16x16_avg_bits12,
+                 vpx_highbd_sad16x16_bits12, vpx_highbd_sad16x16_avg_bits12,
                  vp9_highbd_12_variance16x16, vp9_highbd_12_sub_pixel_variance16x16,
                  vp9_highbd_12_sub_pixel_avg_variance16x16,
-                 vp9_highbd_sad16x16x3_bits12, vp9_highbd_sad16x16x8_bits12,
-                 vp9_highbd_sad16x16x4d_bits12)
+                 vpx_highbd_sad16x16x3_bits12, vpx_highbd_sad16x16x8_bits12,
+                 vpx_highbd_sad16x16x4d_bits12)
       HIGHBD_BFP(BLOCK_16X8,
-                 vp9_highbd_sad16x8_bits12, vp9_highbd_sad16x8_avg_bits12,
+                 vpx_highbd_sad16x8_bits12, vpx_highbd_sad16x8_avg_bits12,
                  vp9_highbd_12_variance16x8, vp9_highbd_12_sub_pixel_variance16x8,
                  vp9_highbd_12_sub_pixel_avg_variance16x8,
-                 vp9_highbd_sad16x8x3_bits12, vp9_highbd_sad16x8x8_bits12,
-                 vp9_highbd_sad16x8x4d_bits12)
+                 vpx_highbd_sad16x8x3_bits12, vpx_highbd_sad16x8x8_bits12,
+                 vpx_highbd_sad16x8x4d_bits12)
       HIGHBD_BFP(BLOCK_8X16,
-                 vp9_highbd_sad8x16_bits12, vp9_highbd_sad8x16_avg_bits12,
+                 vpx_highbd_sad8x16_bits12, vpx_highbd_sad8x16_avg_bits12,
                  vp9_highbd_12_variance8x16, vp9_highbd_12_sub_pixel_variance8x16,
                  vp9_highbd_12_sub_pixel_avg_variance8x16,
-                 vp9_highbd_sad8x16x3_bits12, vp9_highbd_sad8x16x8_bits12,
-                 vp9_highbd_sad8x16x4d_bits12)
+                 vpx_highbd_sad8x16x3_bits12, vpx_highbd_sad8x16x8_bits12,
+                 vpx_highbd_sad8x16x4d_bits12)
       HIGHBD_BFP(BLOCK_8X8,
-                 vp9_highbd_sad8x8_bits12, vp9_highbd_sad8x8_avg_bits12,
+                 vpx_highbd_sad8x8_bits12, vpx_highbd_sad8x8_avg_bits12,
                  vp9_highbd_12_variance8x8, vp9_highbd_12_sub_pixel_variance8x8,
                  vp9_highbd_12_sub_pixel_avg_variance8x8,
-                 vp9_highbd_sad8x8x3_bits12, vp9_highbd_sad8x8x8_bits12,
-                 vp9_highbd_sad8x8x4d_bits12)
+                 vpx_highbd_sad8x8x3_bits12, vpx_highbd_sad8x8x8_bits12,
+                 vpx_highbd_sad8x8x4d_bits12)
       HIGHBD_BFP(BLOCK_8X4,
-                 vp9_highbd_sad8x4_bits12, vp9_highbd_sad8x4_avg_bits12,
+                 vpx_highbd_sad8x4_bits12, vpx_highbd_sad8x4_avg_bits12,
                  vp9_highbd_12_variance8x4, vp9_highbd_12_sub_pixel_variance8x4,
                  vp9_highbd_12_sub_pixel_avg_variance8x4, NULL,
-                 vp9_highbd_sad8x4x8_bits12, vp9_highbd_sad8x4x4d_bits12)
+                 vpx_highbd_sad8x4x8_bits12, vpx_highbd_sad8x4x4d_bits12)
       HIGHBD_BFP(BLOCK_4X8,
-                 vp9_highbd_sad4x8_bits12, vp9_highbd_sad4x8_avg_bits12,
+                 vpx_highbd_sad4x8_bits12, vpx_highbd_sad4x8_avg_bits12,
                  vp9_highbd_12_variance4x8, vp9_highbd_12_sub_pixel_variance4x8,
                  vp9_highbd_12_sub_pixel_avg_variance4x8, NULL,
-                 vp9_highbd_sad4x8x8_bits12, vp9_highbd_sad4x8x4d_bits12)
+                 vpx_highbd_sad4x8x8_bits12, vpx_highbd_sad4x8x4d_bits12)
       HIGHBD_BFP(BLOCK_4X4,
-                 vp9_highbd_sad4x4_bits12, vp9_highbd_sad4x4_avg_bits12,
+                 vpx_highbd_sad4x4_bits12, vpx_highbd_sad4x4_avg_bits12,
                  vp9_highbd_12_variance4x4, vp9_highbd_12_sub_pixel_variance4x4,
                  vp9_highbd_12_sub_pixel_avg_variance4x4,
-                 vp9_highbd_sad4x4x3_bits12, vp9_highbd_sad4x4x8_bits12,
-                 vp9_highbd_sad4x4x4d_bits12)
+                 vpx_highbd_sad4x4x3_bits12, vpx_highbd_sad4x4x8_bits12,
+                 vpx_highbd_sad4x4x4d_bits12)
       break;

     default:
@@ -1799,64 +1801,64 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
     cpi->fn_ptr[BT].sdx8f = SDX8F; \
     cpi->fn_ptr[BT].sdx4df = SDX4DF;

-  BFP(BLOCK_32X16, vp9_sad32x16, vp9_sad32x16_avg,
+  BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
       vp9_variance32x16, vp9_sub_pixel_variance32x16,
-      vp9_sub_pixel_avg_variance32x16, NULL, NULL, vp9_sad32x16x4d)
+      vp9_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)

-  BFP(BLOCK_16X32, vp9_sad16x32, vp9_sad16x32_avg,
+  BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg,
      vp9_variance16x32, vp9_sub_pixel_variance16x32,
-      vp9_sub_pixel_avg_variance16x32, NULL, NULL, vp9_sad16x32x4d)
+      vp9_sub_pixel_avg_variance16x32, NULL, NULL, vpx_sad16x32x4d)

-  BFP(BLOCK_64X32, vp9_sad64x32, vp9_sad64x32_avg,
+  BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg,
      vp9_variance64x32, vp9_sub_pixel_variance64x32,
-      vp9_sub_pixel_avg_variance64x32, NULL, NULL, vp9_sad64x32x4d)
+      vp9_sub_pixel_avg_variance64x32, NULL, NULL, vpx_sad64x32x4d)

-  BFP(BLOCK_32X64, vp9_sad32x64, vp9_sad32x64_avg,
+  BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg,
      vp9_variance32x64, vp9_sub_pixel_variance32x64,
-      vp9_sub_pixel_avg_variance32x64, NULL, NULL, vp9_sad32x64x4d)
+      vp9_sub_pixel_avg_variance32x64, NULL, NULL, vpx_sad32x64x4d)

-  BFP(BLOCK_32X32, vp9_sad32x32, vp9_sad32x32_avg,
+  BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg,
      vp9_variance32x32, vp9_sub_pixel_variance32x32,
-      vp9_sub_pixel_avg_variance32x32, vp9_sad32x32x3, vp9_sad32x32x8,
-      vp9_sad32x32x4d)
+      vp9_sub_pixel_avg_variance32x32, vpx_sad32x32x3, vpx_sad32x32x8,
+      vpx_sad32x32x4d)

-  BFP(BLOCK_64X64, vp9_sad64x64, vp9_sad64x64_avg,
+  BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg,
      vp9_variance64x64, vp9_sub_pixel_variance64x64,
-      vp9_sub_pixel_avg_variance64x64, vp9_sad64x64x3, vp9_sad64x64x8,
-      vp9_sad64x64x4d)
+      vp9_sub_pixel_avg_variance64x64, vpx_sad64x64x3, vpx_sad64x64x8,
+      vpx_sad64x64x4d)

-  BFP(BLOCK_16X16, vp9_sad16x16, vp9_sad16x16_avg,
+  BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg,
      vp9_variance16x16, vp9_sub_pixel_variance16x16,
-      vp9_sub_pixel_avg_variance16x16, vp9_sad16x16x3, vp9_sad16x16x8,
-      vp9_sad16x16x4d)
+      vp9_sub_pixel_avg_variance16x16, vpx_sad16x16x3, vpx_sad16x16x8,
+      vpx_sad16x16x4d)

-  BFP(BLOCK_16X8, vp9_sad16x8, vp9_sad16x8_avg,
+  BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg,
      vp9_variance16x8, vp9_sub_pixel_variance16x8,
      vp9_sub_pixel_avg_variance16x8,
-      vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
+      vpx_sad16x8x3, vpx_sad16x8x8, vpx_sad16x8x4d)

-  BFP(BLOCK_8X16, vp9_sad8x16, vp9_sad8x16_avg,
+  BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg,
      vp9_variance8x16, vp9_sub_pixel_variance8x16,
      vp9_sub_pixel_avg_variance8x16,
-      vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
+      vpx_sad8x16x3, vpx_sad8x16x8, vpx_sad8x16x4d)

-  BFP(BLOCK_8X8, vp9_sad8x8, vp9_sad8x8_avg,
+  BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg,
      vp9_variance8x8, vp9_sub_pixel_variance8x8,
      vp9_sub_pixel_avg_variance8x8,
-      vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
+      vpx_sad8x8x3, vpx_sad8x8x8, vpx_sad8x8x4d)

-  BFP(BLOCK_8X4, vp9_sad8x4, vp9_sad8x4_avg,
+  BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg,
      vp9_variance8x4, vp9_sub_pixel_variance8x4,
-      vp9_sub_pixel_avg_variance8x4, NULL, vp9_sad8x4x8, vp9_sad8x4x4d)
+      vp9_sub_pixel_avg_variance8x4, NULL, vpx_sad8x4x8, vpx_sad8x4x4d)

-  BFP(BLOCK_4X8, vp9_sad4x8, vp9_sad4x8_avg,
+  BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg,
      vp9_variance4x8, vp9_sub_pixel_variance4x8,
-      vp9_sub_pixel_avg_variance4x8, NULL, vp9_sad4x8x8, vp9_sad4x8x4d)
+      vp9_sub_pixel_avg_variance4x8, NULL, vpx_sad4x8x8, vpx_sad4x8x4d)

-  BFP(BLOCK_4X4, vp9_sad4x4, vp9_sad4x4_avg,
+  BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg,
      vp9_variance4x4, vp9_sub_pixel_variance4x4,
      vp9_sub_pixel_avg_variance4x4,
-      vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
+      vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)

 #if CONFIG_VP9_HIGHBITDEPTH
   highbd_set_var_fns(cpi);
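Only the tail of the MAKE_BFP_* wrapper macros survives in this excerpt, but the visible `sad_array[i] >>= 4` hints at their purpose: scaling high-bit-depth SADs back toward an 8-bit range so thresholds elsewhere in the encoder stay comparable. A sketch of what one generated wrapper plausibly looks like, under that assumption (the function body is inferred, not taken from the source):

    /* Assumed shape of MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad16x16) for 12-bit
     * input: the raw SAD, scaled down by 4 bits. */
    static unsigned int vpx_highbd_sad16x16_bits12(const uint8_t *src_ptr, int src_stride,
                                                   const uint8_t *ref_ptr, int ref_stride) {
      return vpx_highbd_sad16x16(src_ptr, src_stride, ref_ptr, ref_stride) >> 4;
    }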
diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 06c3885c1..d5eeb9cc5 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c
@@ -10,6 +10,9 @@

 #include <limits.h>

+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
 #include "vpx_mem/vpx_mem.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_mcomp.h"
@@ -74,8 +77,8 @@ static unsigned int do_16x16_motion_iteration(VP9_COMP *cpi,
   x->mv_row_min = tmp_row_min;
   x->mv_row_max = tmp_row_max;

-  return vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
-                      xd->plane[0].dst.buf, xd->plane[0].dst.stride);
+  return vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+                      xd->plane[0].dst.buf, xd->plane[0].dst.stride);
 }

 static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
@@ -87,7 +90,7 @@ static int do_16x16_motion_search(VP9_COMP *cpi, const MV *ref_mv,
   // Try zero MV first
   // FIXME should really use something like near/nearest MV and/or MV prediction
-  err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+  err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
                      xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
   dst_mv->as_int = 0;
@@ -123,7 +126,7 @@ static int do_16x16_zerozero_search(VP9_COMP *cpi, int_mv *dst_mv) {
   // Try zero MV first
   // FIXME should really use something like near/nearest MV and/or MV prediction
-  err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+  err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
                      xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
   dst_mv->as_int = 0;
@@ -146,7 +149,7 @@ static int find_best_16x16_intra(VP9_COMP *cpi, PREDICTION_MODE *pbest_mode) {
                             x->plane[0].src.buf, x->plane[0].src.stride,
                             xd->plane[0].dst.buf, xd->plane[0].dst.stride,
                             0, 0, 0);
-    err = vp9_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
+    err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
                        xd->plane[0].dst.buf, xd->plane[0].dst.stride);

     // find best
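The BFP table in vp9_encoder.c is what makes this rename mostly invisible elsewhere: search code calls SAD through cpi->fn_ptr rather than naming kernels, as the mbgraph file does here for the one direct call it makes. A hedged sketch of such an indirect call; only the sdx8f/sdx4df field names appear in this excerpt, so the sdf field name is an assumption:

    /* Indirect SAD call through the function-pointer table ('sdf' assumed). */
    unsigned int cost = cpi->fn_ptr[BLOCK_16X16].sdf(
        x->plane[0].src.buf, x->plane[0].src.stride,
        xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);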
diff --git a/vp9/encoder/vp9_sad.c b/vp9/encoder/vp9_sad.c
deleted file mode 100644
index 73134f2f2..000000000
--- a/vp9/encoder/vp9_sad.c
+++ /dev/null

/*
 *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 *
 *  Use of this source code is governed by a BSD-style license
 *  that can be found in the LICENSE file in the root of the source
 *  tree. An additional intellectual property rights grant can be found
 *  in the file PATENTS.  All contributing project authors may
 *  be found in the AUTHORS file in the root of the source tree.
 */

#include <stdlib.h>

#include "./vp9_rtcd.h"
#include "./vpx_config.h"

#include "vpx/vpx_integer.h"
#if CONFIG_VP9_HIGHBITDEPTH
#include "vp9/common/vp9_common.h"
#endif
#include "vp9/encoder/vp9_variance.h"

static INLINE unsigned int sad(const uint8_t *a, int a_stride,
                               const uint8_t *b, int b_stride,
                               int width, int height) {
  int y, x;
  unsigned int sad = 0;

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++)
      sad += abs(a[x] - b[x]);

    a += a_stride;
    b += b_stride;
  }
  return sad;
}

#define sadMxN(m, n) \
unsigned int vp9_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
                                  const uint8_t *ref, int ref_stride) { \
  return sad(src, src_stride, ref, ref_stride, m, n); \
} \
unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
                                      const uint8_t *ref, int ref_stride, \
                                      const uint8_t *second_pred) { \
  uint8_t comp_pred[m * n]; \
  vp9_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
  return sad(src, src_stride, comp_pred, m, m, n); \
}

#define sadMxNxK(m, n, k) \
void vp9_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
                                const uint8_t *ref, int ref_stride, \
                                unsigned int *sads) { \
  int i; \
  for (i = 0; i < k; ++i) \
    sads[i] = vp9_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride); \
}

#define sadMxNx4D(m, n) \
void vp9_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
                             const uint8_t *const refs[], int ref_stride, \
                             unsigned int *sads) { \
  int i; \
  for (i = 0; i < 4; ++i) \
    sads[i] = vp9_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride); \
}

// 64x64
sadMxN(64, 64)
sadMxNxK(64, 64, 3)
sadMxNxK(64, 64, 8)
sadMxNx4D(64, 64)

// 64x32
sadMxN(64, 32)
sadMxNx4D(64, 32)

// 32x64
sadMxN(32, 64)
sadMxNx4D(32, 64)

// 32x32
sadMxN(32, 32)
sadMxNxK(32, 32, 3)
sadMxNxK(32, 32, 8)
sadMxNx4D(32, 32)

// 32x16
sadMxN(32, 16)
sadMxNx4D(32, 16)

// 16x32
sadMxN(16, 32)
sadMxNx4D(16, 32)

// 16x16
sadMxN(16, 16)
sadMxNxK(16, 16, 3)
sadMxNxK(16, 16, 8)
sadMxNx4D(16, 16)

// 16x8
sadMxN(16, 8)
sadMxNxK(16, 8, 3)
sadMxNxK(16, 8, 8)
sadMxNx4D(16, 8)

// 8x16
sadMxN(8, 16)
sadMxNxK(8, 16, 3)
sadMxNxK(8, 16, 8)
sadMxNx4D(8, 16)

// 8x8
sadMxN(8, 8)
sadMxNxK(8, 8, 3)
sadMxNxK(8, 8, 8)
sadMxNx4D(8, 8)

// 8x4
sadMxN(8, 4)
sadMxNxK(8, 4, 8)
sadMxNx4D(8, 4)

// 4x8
sadMxN(4, 8)
sadMxNxK(4, 8, 8)
sadMxNx4D(4, 8)

// 4x4
sadMxN(4, 4)
sadMxNxK(4, 4, 3)
sadMxNxK(4, 4, 8)
sadMxNx4D(4, 4)

#if CONFIG_VP9_HIGHBITDEPTH
static INLINE unsigned int highbd_sad(const uint8_t *a8, int a_stride,
                                      const uint8_t *b8, int b_stride,
                                      int width, int height) {
  int y, x;
  unsigned int sad = 0;
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++)
      sad += abs(a[x] - b[x]);

    a += a_stride;
    b += b_stride;
  }
  return sad;
}

static INLINE unsigned int highbd_sadb(const uint8_t *a8, int a_stride,
                                       const uint16_t *b, int b_stride,
                                       int width, int height) {
  int y, x;
  unsigned int sad = 0;
  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++)
      sad += abs(a[x] - b[x]);

    a += a_stride;
    b += b_stride;
  }
  return sad;
}

#define highbd_sadMxN(m, n) \
unsigned int vp9_highbd_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
                                         const uint8_t *ref, int ref_stride) { \
  return highbd_sad(src, src_stride, ref, ref_stride, m, n); \
} \
unsigned int vp9_highbd_sad##m##x##n##_avg_c(const uint8_t *src, int src_stride, \
                                             const uint8_t *ref, int ref_stride, \
                                             const uint8_t *second_pred) { \
  uint16_t comp_pred[m * n]; \
  vp9_highbd_comp_avg_pred(comp_pred, second_pred, m, n, ref, ref_stride); \
  return highbd_sadb(src, src_stride, comp_pred, m, m, n); \
}

#define highbd_sadMxNxK(m, n, k) \
void vp9_highbd_sad##m##x##n##x##k##_c(const uint8_t *src, int src_stride, \
                                       const uint8_t *ref, int ref_stride, \
                                       unsigned int *sads) { \
  int i; \
  for (i = 0; i < k; ++i) { \
    sads[i] = vp9_highbd_sad##m##x##n##_c(src, src_stride, &ref[i], ref_stride); \
  } \
}

#define highbd_sadMxNx4D(m, n) \
void vp9_highbd_sad##m##x##n##x4d_c(const uint8_t *src, int src_stride, \
                                    const uint8_t *const refs[], \
                                    int ref_stride, unsigned int *sads) { \
  int i; \
  for (i = 0; i < 4; ++i) { \
    sads[i] = vp9_highbd_sad##m##x##n##_c(src, src_stride, refs[i], ref_stride); \
  } \
}

// 64x64
highbd_sadMxN(64, 64)
highbd_sadMxNxK(64, 64, 3)
highbd_sadMxNxK(64, 64, 8)
highbd_sadMxNx4D(64, 64)

// 64x32
highbd_sadMxN(64, 32)
highbd_sadMxNx4D(64, 32)

// 32x64
highbd_sadMxN(32, 64)
highbd_sadMxNx4D(32, 64)

// 32x32
highbd_sadMxN(32, 32)
highbd_sadMxNxK(32, 32, 3)
highbd_sadMxNxK(32, 32, 8)
highbd_sadMxNx4D(32, 32)

// 32x16
highbd_sadMxN(32, 16)
highbd_sadMxNx4D(32, 16)

// 16x32
highbd_sadMxN(16, 32)
highbd_sadMxNx4D(16, 32)

// 16x16
highbd_sadMxN(16, 16)
highbd_sadMxNxK(16, 16, 3)
highbd_sadMxNxK(16, 16, 8)
highbd_sadMxNx4D(16, 16)

// 16x8
highbd_sadMxN(16, 8)
highbd_sadMxNxK(16, 8, 3)
highbd_sadMxNxK(16, 8, 8)
highbd_sadMxNx4D(16, 8)

// 8x16
highbd_sadMxN(8, 16)
highbd_sadMxNxK(8, 16, 3)
highbd_sadMxNxK(8, 16, 8)
highbd_sadMxNx4D(8, 16)

// 8x8
highbd_sadMxN(8, 8)
highbd_sadMxNxK(8, 8, 3)
highbd_sadMxNxK(8, 8, 8)
highbd_sadMxNx4D(8, 8)

// 8x4
highbd_sadMxN(8, 4)
highbd_sadMxNxK(8, 4, 8)
highbd_sadMxNx4D(8, 4)

// 4x8
highbd_sadMxN(4, 8)
highbd_sadMxNxK(4, 8, 8)
highbd_sadMxNx4D(4, 8)

// 4x4
highbd_sadMxN(4, 4)
highbd_sadMxNxK(4, 4, 3)
highbd_sadMxNxK(4, 4, 8)
highbd_sadMxNx4D(4, 4)

#endif  // CONFIG_VP9_HIGHBITDEPTH
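Every sized kernel in the deleted file is generated from the three macros above. Expanding one mechanically, sadMxN(16, 16) yields exactly this pair (a derivation, not code from the tree):

    unsigned int vp9_sad16x16_c(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride) {
      return sad(src, src_stride, ref, ref_stride, 16, 16);
    }
    unsigned int vp9_sad16x16_avg_c(const uint8_t *src, int src_stride,
                                    const uint8_t *ref, int ref_stride,
                                    const uint8_t *second_pred) {
      uint8_t comp_pred[16 * 16];                                 /* averaged prediction */
      vp9_comp_avg_pred(comp_pred, second_pred, 16, 16, ref, ref_stride);
      return sad(src, src_stride, comp_pred, 16, 16, 16);         /* stride of comp_pred is its width */
    }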
diff --git a/vp9/encoder/x86/vp9_highbd_sad4d_sse2.asm b/vp9/encoder/x86/vp9_highbd_sad4d_sse2.asm
deleted file mode 100644
index f79a59f02..000000000
--- a/vp9/encoder/x86/vp9_highbd_sad4d_sse2.asm
+++ /dev/null

;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

; HIGH_PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_4x2x4 5-6 0
  movh m0, [srcq+%2*2]
%if %1 == 1
  movu m4, [ref1q+%3*2]
  movu m5, [ref2q+%3*2]
  movu m6, [ref3q+%3*2]
  movu m7, [ref4q+%3*2]
  movhps m0, [srcq+%4*2]
  movhps m4, [ref1q+%5*2]
  movhps m5, [ref2q+%5*2]
  movhps m6, [ref3q+%5*2]
  movhps m7, [ref4q+%5*2]
  mova m3, m0
  mova m2, m0
  psubusw m3, m4
  psubusw m2, m5
  psubusw m4, m0
  psubusw m5, m0
  por m4, m3
  por m5, m2
  pmaddwd m4, m1
  pmaddwd m5, m1
  mova m3, m0
  mova m2, m0
  psubusw m3, m6
  psubusw m2, m7
  psubusw m6, m0
  psubusw m7, m0
  por m6, m3
  por m7, m2
  pmaddwd m6, m1
  pmaddwd m7, m1
%else
  movu m2, [ref1q+%3*2]
  movhps m0, [srcq+%4*2]
  movhps m2, [ref1q+%5*2]
  mova m3, m0
  psubusw m3, m2
  psubusw m2, m0
  por m2, m3
  pmaddwd m2, m1
  paddd m4, m2

  movu m2, [ref2q+%3*2]
  mova m3, m0
  movhps m2, [ref2q+%5*2]
  psubusw m3, m2
  psubusw m2, m0
  por m2, m3
  pmaddwd m2, m1
  paddd m5, m2

  movu m2, [ref3q+%3*2]
  mova m3, m0
  movhps m2, [ref3q+%5*2]
  psubusw m3, m2
  psubusw m2, m0
  por m2, m3
  pmaddwd m2, m1
  paddd m6, m2

  movu m2, [ref4q+%3*2]
  mova m3, m0
  movhps m2, [ref4q+%5*2]
  psubusw m3, m2
  psubusw m2, m0
  por m2, m3
  pmaddwd m2, m1
  paddd m7, m2
%endif
%if %6 == 1
  lea srcq, [srcq+src_strideq*4]
  lea ref1q, [ref1q+ref_strideq*4]
  lea ref2q, [ref2q+ref_strideq*4]
  lea ref3q, [ref3q+ref_strideq*4]
  lea ref4q, [ref4q+ref_strideq*4]
%endif
%endmacro

; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_8x2x4 5-6 0
  ; 1st 8 px
  mova m0, [srcq+%2*2]
%if %1 == 1
  movu m4, [ref1q+%3*2]
  movu m5, [ref2q+%3*2]
  movu m6, [ref3q+%3*2]
  movu m7, [ref4q+%3*2]
  mova m3, m0
  mova m2, m0
  psubusw m3, m4
  psubusw m2, m5
  psubusw m4, m0
  psubusw m5, m0
  por m4, m3
  por m5, m2
  pmaddwd m4, m1
  pmaddwd m5, m1
  mova m3, m0
  mova m2, m0
  psubusw m3, m6
  psubusw m2, m7
  psubusw m6, m0
  psubusw m7, m0
  por m6, m3
  por m7, m2
  pmaddwd m6, m1
  pmaddwd m7, m1
%else
  mova m3, m0
  movu m2, [ref1q+%3*2]
  psubusw m3, m2
  psubusw m2, m0
  por m2, m3
  mova m3, m0
  pmaddwd m2, m1
  paddd m4, m2
  movu m2, [ref2q+%3*2]
  psubusw m3, m2
  psubusw m2, m0
  por m2, m3
  mova m3, m0
  pmaddwd m2, m1
  paddd m5, m2
  movu m2, [ref3q+%3*2]
  psubusw m3, m2
  psubusw m2, m0
  por m2, m3
  mova m3, m0
  pmaddwd m2, m1
  paddd m6, m2
  movu m2, [ref4q+%3*2]
  psubusw m3, m2
  psubusw m2, m0
  por m2, m3
  pmaddwd m2, m1
  paddd m7, m2
%endif

  ; 2nd 8 px
  mova m0, [srcq+(%4)*2]
  mova m3, m0
  movu m2, [ref1q+(%5)*2]
  psubusw m3, m2
  psubusw m2, m0
  por m2, m3
  mova m3, m0
  pmaddwd m2, m1
  paddd m4, m2
  movu m2, [ref2q+(%5)*2]
  psubusw m3, m2
  psubusw m2, m0
  por m2, m3
  mova m3, m0
  pmaddwd m2, m1
  paddd m5, m2
  movu m2, [ref3q+(%5)*2]
  psubusw m3, m2
  psubusw m2, m0
  por m2, m3
  mova m3, m0
  pmaddwd m2, m1
  paddd m6, m2
  movu m2, [ref4q+(%5)*2]
  psubusw m3, m2
  psubusw m2, m0
%if %6 == 1
  lea srcq, [srcq+src_strideq*4]
  lea ref1q, [ref1q+ref_strideq*4]
  lea ref2q, [ref2q+ref_strideq*4]
  lea ref3q, [ref3q+ref_strideq*4]
  lea ref4q, [ref4q+ref_strideq*4]
%endif
  por m2, m3
  pmaddwd m2, m1
  paddd m7, m2
%endmacro

; HIGH_PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_16x2x4 5-6 0
  HIGH_PROCESS_8x2x4 %1, %2, %3, (%2 + 8), (%3 + 8)
  HIGH_PROCESS_8x2x4 0, %4, %5, (%4 + 8), (%5 + 8), %6
%endmacro

; HIGH_PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_32x2x4 5-6 0
  HIGH_PROCESS_16x2x4 %1, %2, %3, (%2 + 16), (%3 + 16)
  HIGH_PROCESS_16x2x4 0, %4, %5, (%4 + 16), (%5 + 16), %6
%endmacro

; HIGH_PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end
%macro HIGH_PROCESS_64x2x4 5-6 0
  HIGH_PROCESS_32x2x4 %1, %2, %3, (%2 + 32), (%3 + 32)
  HIGH_PROCESS_32x2x4 0, %4, %5, (%4 + 32), (%5 + 32), %6
%endmacro

; void vp9_highbd_sadNxNx4d_sse2(uint8_t *src, int src_stride,
;                                uint8_t *ref[4], int ref_stride,
;                                unsigned int res[4]);
; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8
%macro HIGH_SADNXN4D 2
%if UNIX64
cglobal highbd_sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \
                            res, ref2, ref3, ref4
%else
cglobal highbd_sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \
                            ref2, ref3, ref4
%endif

; set m1
  push srcq
  mov srcd, 0x00010001
  movd m1, srcd
  pshufd m1, m1, 0x0
  pop srcq

  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
  mov ref2q, [ref1q+gprsize*1]
  mov ref3q, [ref1q+gprsize*2]
  mov ref4q, [ref1q+gprsize*3]
  mov ref1q, [ref1q+gprsize*0]

; convert byte pointers to short pointers
  shl srcq, 1
  shl ref2q, 1
  shl ref3q, 1
  shl ref4q, 1
  shl ref1q, 1

  HIGH_PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1
%rep (%2-4)/2
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1
%endrep
  HIGH_PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0
  ; N.B. HIGH_PROCESS outputs dwords (32 bits)
  ; so in high bit depth even the smallest width (4) needs 128bits i.e. XMM
  movhlps m0, m4
  movhlps m1, m5
  movhlps m2, m6
  movhlps m3, m7
  paddd m4, m0
  paddd m5, m1
  paddd m6, m2
  paddd m7, m3
  punpckldq m4, m5
  punpckldq m6, m7
  movhlps m0, m4
  movhlps m1, m6
  paddd m4, m0
  paddd m6, m1
  punpcklqdq m4, m6
  movifnidn r4, r4mp
  movu [r4], m4
  RET
%endmacro


INIT_XMM sse2
HIGH_SADNXN4D 64, 64
HIGH_SADNXN4D 64, 32
HIGH_SADNXN4D 32, 64
HIGH_SADNXN4D 32, 32
HIGH_SADNXN4D 32, 16
HIGH_SADNXN4D 16, 32
HIGH_SADNXN4D 16, 16
HIGH_SADNXN4D 16,  8
HIGH_SADNXN4D  8, 16
HIGH_SADNXN4D  8,  8
HIGH_SADNXN4D  8,  4
HIGH_SADNXN4D  4,  8
HIGH_SADNXN4D  4,  4
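Both SSE2 files lean on the same trick for the per-lane absolute difference of unsigned 16-bit samples: saturating-subtract in both directions and OR the results together, since one direction always saturates to zero. The same idiom written with intrinsics, for readers more comfortable in C:

    #include <emmintrin.h>

    /* |a - b| per unsigned 16-bit lane, mirroring the psubusw/psubusw/por
     * sequence used throughout the assembly above. */
    static __m128i abs_diff_u16(__m128i a, __m128i b) {
      return _mm_or_si128(_mm_subs_epu16(a, b), _mm_subs_epu16(b, a));
    }

The pmaddwd against the 0x00010001 constant that follows it in the 4d kernels then sums adjacent word lanes into 32-bit dwords, keeping the running totals overflow-safe.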
diff --git a/vp9/encoder/x86/vp9_highbd_sad_sse2.asm b/vp9/encoder/x86/vp9_highbd_sad_sse2.asm
deleted file mode 100644
index c895ac0ee..000000000
--- a/vp9/encoder/x86/vp9_highbd_sad_sse2.asm
+++ /dev/null

;
;  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

%macro HIGH_SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2, 4, %3, 7, src, src_stride, ref, ref_stride, \
                         src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal highbd_sad%1x%2_avg, 5, 1 + %3, 7, src, src_stride, ref, ref_stride, \
                             second_pred, n_rows
%else ; %3 == 7
cglobal highbd_sad%1x%2_avg, 5, ARCH_X86_64 + %3, 7, src, src_stride, \
                             ref, ref_stride, second_pred, \
                             src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
  movsxdifnidn src_strideq, src_strided
  movsxdifnidn ref_strideq, ref_strided
%if %3 == 7
  lea src_stride3q, [src_strideq*3]
  lea ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
; convert src, ref & second_pred to short ptrs (from byte ptrs)
  shl srcq, 1
  shl refq, 1
%if %4 == 1
  shl second_predq, 1
%endif
%endmacro

; unsigned int vp9_highbd_sad64x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                               uint8_t *ref, int ref_stride);
%macro HIGH_SAD64XN 1-2 0
  HIGH_SAD_FN 64, %1, 5, %2
  mov n_rowsd, %1
  pxor m0, m0
  pxor m6, m6

.loop:
  ; first half of each row
  movu m1, [refq]
  movu m2, [refq+16]
  movu m3, [refq+32]
  movu m4, [refq+48]
%if %2 == 1
  pavgw m1, [second_predq+mmsize*0]
  pavgw m2, [second_predq+mmsize*1]
  pavgw m3, [second_predq+mmsize*2]
  pavgw m4, [second_predq+mmsize*3]
  lea second_predq, [second_predq+mmsize*4]
%endif
  mova m5, [srcq]
  psubusw m5, m1
  psubusw m1, [srcq]
  por m1, m5
  mova m5, [srcq+16]
  psubusw m5, m2
  psubusw m2, [srcq+16]
  por m2, m5
  mova m5, [srcq+32]
  psubusw m5, m3
  psubusw m3, [srcq+32]
  por m3, m5
  mova m5, [srcq+48]
  psubusw m5, m4
  psubusw m4, [srcq+48]
  por m4, m5
  paddw m1, m2
  paddw m3, m4
  movhlps m2, m1
  movhlps m4, m3
  paddw m1, m2
  paddw m3, m4
  punpcklwd m1, m6
  punpcklwd m3, m6
  paddd m0, m1
  paddd m0, m3
  ; second half of each row
  movu m1, [refq+64]
  movu m2, [refq+80]
  movu m3, [refq+96]
  movu m4, [refq+112]
%if %2 == 1
  pavgw m1, [second_predq+mmsize*0]
  pavgw m2, [second_predq+mmsize*1]
  pavgw m3, [second_predq+mmsize*2]
  pavgw m4, [second_predq+mmsize*3]
  lea second_predq, [second_predq+mmsize*4]
%endif
  mova m5, [srcq+64]
  psubusw m5, m1
  psubusw m1, [srcq+64]
  por m1, m5
  mova m5, [srcq+80]
  psubusw m5, m2
  psubusw m2, [srcq+80]
  por m2, m5
  mova m5, [srcq+96]
  psubusw m5, m3
  psubusw m3, [srcq+96]
  por m3, m5
  mova m5, [srcq+112]
  psubusw m5, m4
  psubusw m4, [srcq+112]
  por m4, m5
  paddw m1, m2
  paddw m3, m4
  movhlps m2, m1
  movhlps m4, m3
  paddw m1, m2
  paddw m3, m4
  punpcklwd m1, m6
  punpcklwd m3, m6
  lea refq, [refq+ref_strideq*2]
  paddd m0, m1
  lea srcq, [srcq+src_strideq*2]
  paddd m0, m3

  dec n_rowsd
  jg .loop

  movhlps m1, m0
  paddd m0, m1
  punpckldq m0, m6
  movhlps m1, m0
  paddd m0, m1
  movd eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD64XN 64 ; highbd_sad64x64_sse2
HIGH_SAD64XN 32 ; highbd_sad64x32_sse2
HIGH_SAD64XN 64, 1 ; highbd_sad64x64_avg_sse2
HIGH_SAD64XN 32, 1 ; highbd_sad64x32_avg_sse2


; unsigned int vp9_highbd_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                               uint8_t *ref, int ref_stride);
%macro HIGH_SAD32XN 1-2 0
  HIGH_SAD_FN 32, %1, 5, %2
  mov n_rowsd, %1
  pxor m0, m0
  pxor m6, m6

.loop:
  movu m1, [refq]
  movu m2, [refq+16]
  movu m3, [refq+32]
  movu m4, [refq+48]
%if %2 == 1
  pavgw m1, [second_predq+mmsize*0]
  pavgw m2, [second_predq+mmsize*1]
  pavgw m3, [second_predq+mmsize*2]
  pavgw m4, [second_predq+mmsize*3]
  lea second_predq, [second_predq+mmsize*4]
%endif
  mova m5, [srcq]
  psubusw m5, m1
  psubusw m1, [srcq]
  por m1, m5
  mova m5, [srcq+16]
  psubusw m5, m2
  psubusw m2, [srcq+16]
  por m2, m5
  mova m5, [srcq+32]
  psubusw m5, m3
  psubusw m3, [srcq+32]
  por m3, m5
  mova m5, [srcq+48]
  psubusw m5, m4
  psubusw m4, [srcq+48]
  por m4, m5
  paddw m1, m2
  paddw m3, m4
  movhlps m2, m1
  movhlps m4, m3
  paddw m1, m2
  paddw m3, m4
  punpcklwd m1, m6
  punpcklwd m3, m6
  lea refq, [refq+ref_strideq*2]
  paddd m0, m1
  lea srcq, [srcq+src_strideq*2]
  paddd m0, m3
  dec n_rowsd
  jg .loop

  movhlps m1, m0
  paddd m0, m1
  punpckldq m0, m6
  movhlps m1, m0
  paddd m0, m1
  movd eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD32XN 64 ; highbd_sad32x64_sse2
HIGH_SAD32XN 32 ; highbd_sad32x32_sse2
HIGH_SAD32XN 16 ; highbd_sad32x16_sse2
HIGH_SAD32XN 64, 1 ; highbd_sad32x64_avg_sse2
HIGH_SAD32XN 32, 1 ; highbd_sad32x32_avg_sse2
HIGH_SAD32XN 16, 1 ; highbd_sad32x16_avg_sse2

; unsigned int vp9_highbd_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
;                                              uint8_t *ref, int ref_stride);
%macro HIGH_SAD16XN 1-2 0
  HIGH_SAD_FN 16, %1, 5, %2
  mov n_rowsd, %1/2
  pxor m0, m0
  pxor m6, m6

.loop:
  movu m1, [refq]
  movu m2, [refq+16]
  movu m3, [refq+ref_strideq*2]
  movu m4, [refq+ref_strideq*2+16]
%if %2 == 1
  pavgw m1, [second_predq+mmsize*0]
  pavgw m2, [second_predq+16]
  pavgw m3, [second_predq+mmsize*2]
  pavgw m4, [second_predq+mmsize*2+16]
  lea second_predq, [second_predq+mmsize*4]
%endif
  mova m5, [srcq]
  psubusw m5, m1
  psubusw m1, [srcq]
  por m1, m5
  mova m5, [srcq+16]
  psubusw m5, m2
  psubusw m2, [srcq+16]
  por m2, m5
  mova m5, [srcq+src_strideq*2]
  psubusw m5, m3
  psubusw m3, [srcq+src_strideq*2]
  por m3, m5
  mova m5, [srcq+src_strideq*2+16]
  psubusw m5, m4
  psubusw m4, [srcq+src_strideq*2+16]
  por m4, m5
  paddw m1, m2
  paddw m3, m4
  movhlps m2, m1
  movhlps m4, m3
  paddw m1, m2
  paddw m3, m4
  punpcklwd m1, m6
  punpcklwd m3, m6
  lea refq, [refq+ref_strideq*4]
  paddd m0, m1
  lea srcq, [srcq+src_strideq*4]
  paddd m0, m3
  dec n_rowsd
  jg .loop

  movhlps m1, m0
  paddd m0, m1
  punpckldq m0, m6
  movhlps m1, m0
  paddd m0, m1
  movd eax, m0
  RET
%endmacro

INIT_XMM sse2
HIGH_SAD16XN 32 ; highbd_sad16x32_sse2
HIGH_SAD16XN 16 ; highbd_sad16x16_sse2
HIGH_SAD16XN 8 ; highbd_sad16x8_sse2
HIGH_SAD16XN 32, 1 ; highbd_sad16x32_avg_sse2
HIGH_SAD16XN 16, 1 ; highbd_sad16x16_avg_sse2
HIGH_SAD16XN 8, 1 ; highbd_sad16x8_avg_sse2


; unsigned int vp9_highbd_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                            uint8_t *ref, int ref_stride);
%macro HIGH_SAD8XN 1-2 0
  HIGH_SAD_FN 8, %1, 7, %2
  mov n_rowsd, %1/4
  pxor m0, m0
  pxor m6, m6

.loop:
  movu m1, [refq]
  movu m2, [refq+ref_strideq*2]
  movu m3, [refq+ref_strideq*4]
  movu m4, [refq+ref_stride3q*2]
%if %2 == 1
  pavgw m1, [second_predq+mmsize*0]
  pavgw m2, [second_predq+mmsize*1]
  pavgw m3, [second_predq+mmsize*2]
  pavgw m4, [second_predq+mmsize*3]
  lea second_predq, [second_predq+mmsize*4]
%endif
  mova m5, [srcq]
  psubusw m5, m1
  psubusw m1, [srcq]
  por m1, m5
  mova m5, [srcq+src_strideq*2]
  psubusw m5, m2
  psubusw m2, [srcq+src_strideq*2]
  por m2, m5
  mova m5, [srcq+src_strideq*4]
  psubusw m5, m3
  psubusw m3, [srcq+src_strideq*4]
  por m3, m5
  mova m5, [srcq+src_stride3q*2]
- psubusw m5, m4 - psubusw m4, [srcq+src_stride3q*2] - por m4, m5 - paddw m1, m2 - paddw m3, m4 - movhlps m2, m1 - movhlps m4, m3 - paddw m1, m2 - paddw m3, m4 - punpcklwd m1, m6 - punpcklwd m3, m6 - lea refq, [refq+ref_strideq*8] - paddd m0, m1 - lea srcq, [srcq+src_strideq*8] - paddd m0, m3 - dec n_rowsd - jg .loop - - movhlps m1, m0 - paddd m0, m1 - punpckldq m0, m6 - movhlps m1, m0 - paddd m0, m1 - movd eax, m0 - RET -%endmacro - -INIT_XMM sse2 -HIGH_SAD8XN 16 ; highbd_sad8x16_sse2 -HIGH_SAD8XN 8 ; highbd_sad8x8_sse2 -HIGH_SAD8XN 4 ; highbd_sad8x4_sse2 -HIGH_SAD8XN 16, 1 ; highbd_sad8x16_avg_sse2 -HIGH_SAD8XN 8, 1 ; highbd_sad8x8_avg_sse2 -HIGH_SAD8XN 4, 1 ; highbd_sad8x4_avg_sse2 diff --git a/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c b/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c deleted file mode 100644 index 1feed6256..000000000 --- a/vp9/encoder/x86/vp9_sad4d_intrin_avx2.c +++ /dev/null @@ -1,167 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include <immintrin.h> // AVX2 -#include "vpx/vpx_integer.h" - -void vp9_sad32x32x4d_avx2(uint8_t *src, - int src_stride, - uint8_t *ref[4], - int ref_stride, - unsigned int res[4]) { - __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; - __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; - __m256i sum_mlow, sum_mhigh; - int i; - uint8_t *ref0, *ref1, *ref2, *ref3; - - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - sum_ref0 = _mm256_set1_epi16(0); - sum_ref1 = _mm256_set1_epi16(0); - sum_ref2 = _mm256_set1_epi16(0); - sum_ref3 = _mm256_set1_epi16(0); - for (i = 0; i < 32 ; i++) { - // load src and all refs - src_reg = _mm256_loadu_si256((__m256i *)(src)); - ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); - ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); - ref2_reg = _mm256_loadu_si256((__m256i *) (ref2)); - ref3_reg = _mm256_loadu_si256((__m256i *) (ref3)); - // sum of the absolute differences between every ref-i to src - ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); - ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); - ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); - ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); - // sum every ref-i - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); - - src+= src_stride; - ref0+= ref_stride; - ref1+= ref_stride; - ref2+= ref_stride; - ref3+= ref_stride; - } - { - __m128i sum; - // in sum_ref-i the result is saved in the first 4 bytes - // the other 4 bytes are zeroed. 
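In other words, after the main loop each 64-bit lane of a `sum_ref` register carries one 32-bit partial SAD: `_mm256_sad_epu8` sums the absolute differences of each 8-byte group into the low bits of the corresponding 64-bit lane, so accumulating with 32-bit adds is safe here (the totals stay far below 2^32). A scalar model of one such lane, with illustrative names:

#include <stdint.h>

/* One 64-bit lane of _mm256_sad_epu8: the SAD of one 8-byte group. */
static uint64_t sad_u8x8(const uint8_t a[8], const uint8_t b[8]) {
  uint64_t s = 0;
  for (int i = 0; i < 8; ++i)
    s += a[i] > b[i] ? (uint64_t)(a[i] - b[i]) : (uint64_t)(b[i] - a[i]);
  return s;
}

The shuffle sequence that follows only has to gather those per-lane partials into `res[0..3]`.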
- // sum_ref1 and sum_ref3 are shifted left by 4 bytes - sum_ref1 = _mm256_slli_si256(sum_ref1, 4); - sum_ref3 = _mm256_slli_si256(sum_ref3, 4); - - // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 - sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); - sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); - - // merge every 64 bit from each sum_ref-i - sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); - sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); - - // add the low 64 bit to the high 64 bit - sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); - - // add the low 128 bit to the high 128 bit - sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), - _mm256_extractf128_si256(sum_mlow, 1)); - - _mm_storeu_si128((__m128i *)(res), sum); - } -} - -void vp9_sad64x64x4d_avx2(uint8_t *src, - int src_stride, - uint8_t *ref[4], - int ref_stride, - unsigned int res[4]) { - __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg; - __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg; - __m256i ref3_reg, ref3next_reg; - __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3; - __m256i sum_mlow, sum_mhigh; - int i; - uint8_t *ref0, *ref1, *ref2, *ref3; - - ref0 = ref[0]; - ref1 = ref[1]; - ref2 = ref[2]; - ref3 = ref[3]; - sum_ref0 = _mm256_set1_epi16(0); - sum_ref1 = _mm256_set1_epi16(0); - sum_ref2 = _mm256_set1_epi16(0); - sum_ref3 = _mm256_set1_epi16(0); - for (i = 0; i < 64 ; i++) { - // load 64 bytes from src and all refs - src_reg = _mm256_loadu_si256((__m256i *)(src)); - srcnext_reg = _mm256_loadu_si256((__m256i *)(src + 32)); - ref0_reg = _mm256_loadu_si256((__m256i *) (ref0)); - ref0next_reg = _mm256_loadu_si256((__m256i *) (ref0 + 32)); - ref1_reg = _mm256_loadu_si256((__m256i *) (ref1)); - ref1next_reg = _mm256_loadu_si256((__m256i *) (ref1 + 32)); - ref2_reg = _mm256_loadu_si256((__m256i *) (ref2)); - ref2next_reg = _mm256_loadu_si256((__m256i *) (ref2 + 32)); - ref3_reg = _mm256_loadu_si256((__m256i *) (ref3)); - ref3next_reg = _mm256_loadu_si256((__m256i *) (ref3 + 32)); - // sum of the absolute differences between every ref-i to src - ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg); - ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg); - ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg); - ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg); - ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg); - ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg); - ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg); - ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg); - - // sum every ref-i - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg); - sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg); - sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg); - sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg); - sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg); - src+= src_stride; - ref0+= ref_stride; - ref1+= ref_stride; - ref2+= ref_stride; - ref3+= ref_stride; - } - { - __m128i sum; - - // in sum_ref-i the result is saved in the first 4 bytes - // the other 4 bytes are zeroed. 
- // sum_ref1 and sum_ref3 are shifted left by 4 bytes - sum_ref1 = _mm256_slli_si256(sum_ref1, 4); - sum_ref3 = _mm256_slli_si256(sum_ref3, 4); - - // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3 - sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1); - sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3); - - // merge every 64 bit from each sum_ref-i - sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2); - sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2); - - // add the low 64 bit to the high 64 bit - sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh); - - // add the low 128 bit to the high 128 bit - sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow), - _mm256_extractf128_si256(sum_mlow, 1)); - - _mm_storeu_si128((__m128i *)(res), sum); - } -} diff --git a/vp9/encoder/x86/vp9_sad4d_sse2.asm b/vp9/encoder/x86/vp9_sad4d_sse2.asm deleted file mode 100644 index b4936281f..000000000 --- a/vp9/encoder/x86/vp9_sad4d_sse2.asm +++ /dev/null @@ -1,231 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -; PROCESS_4x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_4x2x4 5-6 0 - movd m0, [srcq +%2] -%if %1 == 1 - movd m6, [ref1q+%3] - movd m4, [ref2q+%3] - movd m7, [ref3q+%3] - movd m5, [ref4q+%3] - punpckldq m0, [srcq +%4] - punpckldq m6, [ref1q+%5] - punpckldq m4, [ref2q+%5] - punpckldq m7, [ref3q+%5] - punpckldq m5, [ref4q+%5] - psadbw m6, m0 - psadbw m4, m0 - psadbw m7, m0 - psadbw m5, m0 - punpckldq m6, m4 - punpckldq m7, m5 -%else - movd m1, [ref1q+%3] - movd m2, [ref2q+%3] - movd m3, [ref3q+%3] - movd m4, [ref4q+%3] - punpckldq m0, [srcq +%4] - punpckldq m1, [ref1q+%5] - punpckldq m2, [ref2q+%5] - punpckldq m3, [ref3q+%5] - punpckldq m4, [ref4q+%5] - psadbw m1, m0 - psadbw m2, m0 - psadbw m3, m0 - psadbw m4, m0 - punpckldq m1, m2 - punpckldq m3, m4 - paddd m6, m1 - paddd m7, m3 -%endif -%if %6 == 1 - lea srcq, [srcq +src_strideq*2] - lea ref1q, [ref1q+ref_strideq*2] - lea ref2q, [ref2q+ref_strideq*2] - lea ref3q, [ref3q+ref_strideq*2] - lea ref4q, [ref4q+ref_strideq*2] -%endif -%endmacro - -; PROCESS_8x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_8x2x4 5-6 0 - movh m0, [srcq +%2] -%if %1 == 1 - movh m4, [ref1q+%3] - movh m5, [ref2q+%3] - movh m6, [ref3q+%3] - movh m7, [ref4q+%3] - movhps m0, [srcq +%4] - movhps m4, [ref1q+%5] - movhps m5, [ref2q+%5] - movhps m6, [ref3q+%5] - movhps m7, [ref4q+%5] - psadbw m4, m0 - psadbw m5, m0 - psadbw m6, m0 - psadbw m7, m0 -%else - movh m1, [ref1q+%3] - movh m2, [ref2q+%3] - movh m3, [ref3q+%3] - movhps m0, [srcq +%4] - movhps m1, [ref1q+%5] - movhps m2, [ref2q+%5] - movhps m3, [ref3q+%5] - psadbw m1, m0 - psadbw m2, m0 - psadbw m3, m0 - paddd m4, m1 - movh m1, [ref4q+%3] - movhps m1, [ref4q+%5] - paddd m5, m2 - paddd m6, m3 - psadbw m1, m0 - paddd m7, m1 -%endif -%if %6 == 1 - lea srcq, [srcq +src_strideq*2] - lea ref1q, [ref1q+ref_strideq*2] - lea ref2q, [ref2q+ref_strideq*2] - lea ref3q, [ref3q+ref_strideq*2] - lea ref4q, [ref4q+ref_strideq*2] -%endif -%endmacro - -; PROCESS_16x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_16x2x4 5-6 0 - ; 1st 16 px - mova m0, 
[srcq +%2] -%if %1 == 1 - movu m4, [ref1q+%3] - movu m5, [ref2q+%3] - movu m6, [ref3q+%3] - movu m7, [ref4q+%3] - psadbw m4, m0 - psadbw m5, m0 - psadbw m6, m0 - psadbw m7, m0 -%else - movu m1, [ref1q+%3] - movu m2, [ref2q+%3] - movu m3, [ref3q+%3] - psadbw m1, m0 - psadbw m2, m0 - psadbw m3, m0 - paddd m4, m1 - movu m1, [ref4q+%3] - paddd m5, m2 - paddd m6, m3 - psadbw m1, m0 - paddd m7, m1 -%endif - - ; 2nd 16 px - mova m0, [srcq +%4] - movu m1, [ref1q+%5] - movu m2, [ref2q+%5] - movu m3, [ref3q+%5] - psadbw m1, m0 - psadbw m2, m0 - psadbw m3, m0 - paddd m4, m1 - movu m1, [ref4q+%5] - paddd m5, m2 - paddd m6, m3 -%if %6 == 1 - lea srcq, [srcq +src_strideq*2] - lea ref1q, [ref1q+ref_strideq*2] - lea ref2q, [ref2q+ref_strideq*2] - lea ref3q, [ref3q+ref_strideq*2] - lea ref4q, [ref4q+ref_strideq*2] -%endif - psadbw m1, m0 - paddd m7, m1 -%endmacro - -; PROCESS_32x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_32x2x4 5-6 0 - PROCESS_16x2x4 %1, %2, %3, %2 + 16, %3 + 16 - PROCESS_16x2x4 0, %4, %5, %4 + 16, %5 + 16, %6 -%endmacro - -; PROCESS_64x2x4 first, off_{first,second}_{src,ref}, advance_at_end -%macro PROCESS_64x2x4 5-6 0 - PROCESS_32x2x4 %1, %2, %3, %2 + 32, %3 + 32 - PROCESS_32x2x4 0, %4, %5, %4 + 32, %5 + 32, %6 -%endmacro - -; void vp9_sadNxNx4d_sse2(uint8_t *src, int src_stride, -; uint8_t *ref[4], int ref_stride, -; unsigned int res[4]); -; where NxN = 64x64, 32x32, 16x16, 16x8, 8x16 or 8x8 -%macro SADNXN4D 2 -%if UNIX64 -cglobal sad%1x%2x4d, 5, 8, 8, src, src_stride, ref1, ref_stride, \ - res, ref2, ref3, ref4 -%else -cglobal sad%1x%2x4d, 4, 7, 8, src, src_stride, ref1, ref_stride, \ - ref2, ref3, ref4 -%endif - movsxdifnidn src_strideq, src_strided - movsxdifnidn ref_strideq, ref_strided - mov ref2q, [ref1q+gprsize*1] - mov ref3q, [ref1q+gprsize*2] - mov ref4q, [ref1q+gprsize*3] - mov ref1q, [ref1q+gprsize*0] - - PROCESS_%1x2x4 1, 0, 0, src_strideq, ref_strideq, 1 -%rep (%2-4)/2 - PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 1 -%endrep - PROCESS_%1x2x4 0, 0, 0, src_strideq, ref_strideq, 0 - -%if mmsize == 16 - pslldq m5, 4 - pslldq m7, 4 - por m4, m5 - por m6, m7 - mova m5, m4 - mova m7, m6 - punpcklqdq m4, m6 - punpckhqdq m5, m7 - movifnidn r4, r4mp - paddd m4, m5 - movu [r4], m4 - RET -%else - movifnidn r4, r4mp - movq [r4+0], m6 - movq [r4+8], m7 - RET -%endif -%endmacro - -INIT_XMM sse2 -SADNXN4D 64, 64 -SADNXN4D 64, 32 -SADNXN4D 32, 64 -SADNXN4D 32, 32 -SADNXN4D 32, 16 -SADNXN4D 16, 32 -SADNXN4D 16, 16 -SADNXN4D 16, 8 -SADNXN4D 8, 16 -SADNXN4D 8, 8 -SADNXN4D 8, 4 - -INIT_MMX sse -SADNXN4D 4, 8 -SADNXN4D 4, 4 diff --git a/vp9/encoder/x86/vp9_sad_intrin_avx2.c b/vp9/encoder/x86/vp9_sad_intrin_avx2.c deleted file mode 100644 index 113193070..000000000 --- a/vp9/encoder/x86/vp9_sad_intrin_avx2.c +++ /dev/null @@ -1,180 +0,0 @@ -/* - * Copyright (c) 2012 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
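Everything the NxNx4d kernels above compute reduces to a few lines of C; the SIMD versions simply evaluate the four candidate references in parallel with `psadbw`. A reference sketch with illustrative names, not the library API:

#include <stdint.h>

static unsigned sad_c(const uint8_t *src, int src_stride,
                      const uint8_t *ref, int ref_stride, int w, int h) {
  unsigned sad = 0;
  for (int y = 0; y < h; ++y, src += src_stride, ref += ref_stride)
    for (int x = 0; x < w; ++x)
      sad += src[x] > ref[x] ? src[x] - ref[x] : ref[x] - src[x];
  return sad;
}

/* x4d flavor: one source block scored against four candidate references. */
static void sad_x4d_c(const uint8_t *src, int src_stride,
                      uint8_t *const ref[4], int ref_stride,
                      int w, int h, unsigned res[4]) {
  for (int i = 0; i < 4; ++i)
    res[i] = sad_c(src, src_stride, ref[i], ref_stride, w, h);
}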
- */ -#include <immintrin.h> -#include "vpx_ports/mem.h" - -#define FSAD64_H(h) \ -unsigned int vp9_sad64x##h##_avx2(const uint8_t *src_ptr, \ - int src_stride, \ - const uint8_t *ref_ptr, \ - int ref_stride) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - for (i = 0 ; i < h ; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ - sad1_reg = _mm256_sad_epu8(ref1_reg, \ - _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8(ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ - sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr+= ref_stride; \ - src_ptr+= src_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ -} - -#define FSAD32_H(h) \ -unsigned int vp9_sad32x##h##_avx2(const uint8_t *src_ptr, \ - int src_stride, \ - const uint8_t *ref_ptr, \ - int ref_stride) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - int ref2_stride = ref_stride << 1; \ - int src2_stride = src_stride << 1; \ - int max = h >> 1; \ - for (i = 0 ; i < max ; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ - sad1_reg = _mm256_sad_epu8(ref1_reg, \ - _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8(ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ - sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr+= ref2_stride; \ - src_ptr+= src2_stride; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ -} - -#define FSAD64 \ -FSAD64_H(64); \ -FSAD64_H(32); - -#define FSAD32 \ -FSAD32_H(64); \ -FSAD32_H(32); \ -FSAD32_H(16); - -FSAD64; -FSAD32; - -#undef FSAD64 -#undef FSAD32 -#undef FSAD64_H -#undef FSAD32_H - -#define FSADAVG64_H(h) \ -unsigned int vp9_sad64x##h##_avg_avx2(const uint8_t *src_ptr, \ - int src_stride, \ - const uint8_t *ref_ptr, \ - int ref_stride, \ - const uint8_t *second_pred) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - for (i = 0 ; i < h ; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32)); \ - ref1_reg = _mm256_avg_epu8(ref1_reg, \ - _mm256_loadu_si256((__m256i const *)second_pred)); \ - ref2_reg = _mm256_avg_epu8(ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(second_pred +32))); \ - sad1_reg = _mm256_sad_epu8(ref1_reg, \ - _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8(ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + 32))); \ - sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, 
sad2_reg)); \ - ref_ptr+= ref_stride; \ - src_ptr+= src_stride; \ - second_pred+= 64; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ -} - -#define FSADAVG32_H(h) \ -unsigned int vp9_sad32x##h##_avg_avx2(const uint8_t *src_ptr, \ - int src_stride, \ - const uint8_t *ref_ptr, \ - int ref_stride, \ - const uint8_t *second_pred) { \ - int i, res; \ - __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg; \ - __m256i sum_sad = _mm256_setzero_si256(); \ - __m256i sum_sad_h; \ - __m128i sum_sad128; \ - int ref2_stride = ref_stride << 1; \ - int src2_stride = src_stride << 1; \ - int max = h >> 1; \ - for (i = 0 ; i < max ; i++) { \ - ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr); \ - ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride)); \ - ref1_reg = _mm256_avg_epu8(ref1_reg, \ - _mm256_loadu_si256((__m256i const *)second_pred)); \ - ref2_reg = _mm256_avg_epu8(ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(second_pred +32))); \ - sad1_reg = _mm256_sad_epu8(ref1_reg, \ - _mm256_loadu_si256((__m256i const *)src_ptr)); \ - sad2_reg = _mm256_sad_epu8(ref2_reg, \ - _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride))); \ - sum_sad = _mm256_add_epi32(sum_sad, \ - _mm256_add_epi32(sad1_reg, sad2_reg)); \ - ref_ptr+= ref2_stride; \ - src_ptr+= src2_stride; \ - second_pred+= 64; \ - } \ - sum_sad_h = _mm256_srli_si256(sum_sad, 8); \ - sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h); \ - sum_sad128 = _mm256_extracti128_si256(sum_sad, 1); \ - sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128); \ - res = _mm_cvtsi128_si32(sum_sad128); \ - return res; \ -} - -#define FSADAVG64 \ -FSADAVG64_H(64); \ -FSADAVG64_H(32); - -#define FSADAVG32 \ -FSADAVG32_H(64); \ -FSADAVG32_H(32); \ -FSADAVG32_H(16); - -FSADAVG64; -FSADAVG32; - -#undef FSADAVG64 -#undef FSADAVG32 -#undef FSADAVG64_H -#undef FSADAVG32_H diff --git a/vp9/encoder/x86/vp9_sad_sse2.asm b/vp9/encoder/x86/vp9_sad_sse2.asm deleted file mode 100644 index c4c5c54f0..000000000 --- a/vp9/encoder/x86/vp9_sad_sse2.asm +++ /dev/null @@ -1,267 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
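The `_avg` kernels in this diff (the AVX2 macros above and the SSE2 loops below) fold a compound prediction into the comparison: the reference is first rounded-averaged against `second_pred` — which is what `pavgb`/`_mm256_avg_epu8` compute — and the SAD is taken against that average. The fixed per-row pointer advance shows `second_pred` is a contiguous W x H block. A scalar model under those assumptions, with illustrative names:

#include <stdint.h>

static unsigned sad_avg_c(const uint8_t *src, int src_stride,
                          const uint8_t *ref, int ref_stride,
                          const uint8_t *second_pred, int w, int h) {
  unsigned sad = 0;
  for (int y = 0; y < h; ++y) {
    for (int x = 0; x < w; ++x) {
      const int p = (ref[x] + second_pred[x] + 1) >> 1;  /* pavgb: rounded avg */
      sad += src[x] > p ? src[x] - p : p - src[x];
    }
    src += src_stride;
    ref += ref_stride;
    second_pred += w;  /* contiguous prediction block */
  }
  return sad;
}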
-; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -%macro SAD_FN 4 -%if %4 == 0 -%if %3 == 5 -cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows -%else ; %3 == 7 -cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \ - src_stride3, ref_stride3, n_rows -%endif ; %3 == 5/7 -%else ; avg -%if %3 == 5 -cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \ - second_pred, n_rows -%else ; %3 == 7 -cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \ - ref, ref_stride, \ - second_pred, \ - src_stride3, ref_stride3 -%if ARCH_X86_64 -%define n_rowsd r7d -%else ; x86-32 -%define n_rowsd dword r0m -%endif ; x86-32/64 -%endif ; %3 == 5/7 -%endif ; avg/sad - movsxdifnidn src_strideq, src_strided - movsxdifnidn ref_strideq, ref_strided -%if %3 == 7 - lea src_stride3q, [src_strideq*3] - lea ref_stride3q, [ref_strideq*3] -%endif ; %3 == 7 -%endmacro - -; unsigned int vp9_sad64x64_sse2(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -%macro SAD64XN 1-2 0 - SAD_FN 64, %1, 5, %2 - mov n_rowsd, %1 - pxor m0, m0 -.loop: - movu m1, [refq] - movu m2, [refq+16] - movu m3, [refq+32] - movu m4, [refq+48] -%if %2 == 1 - pavgb m1, [second_predq+mmsize*0] - pavgb m2, [second_predq+mmsize*1] - pavgb m3, [second_predq+mmsize*2] - pavgb m4, [second_predq+mmsize*3] - lea second_predq, [second_predq+mmsize*4] -%endif - psadbw m1, [srcq] - psadbw m2, [srcq+16] - psadbw m3, [srcq+32] - psadbw m4, [srcq+48] - paddd m1, m2 - paddd m3, m4 - add refq, ref_strideq - paddd m0, m1 - add srcq, src_strideq - paddd m0, m3 - dec n_rowsd - jg .loop - - movhlps m1, m0 - paddd m0, m1 - movd eax, m0 - RET -%endmacro - -INIT_XMM sse2 -SAD64XN 64 ; sad64x64_sse2 -SAD64XN 32 ; sad64x32_sse2 -SAD64XN 64, 1 ; sad64x64_avg_sse2 -SAD64XN 32, 1 ; sad64x32_avg_sse2 - -; unsigned int vp9_sad32x32_sse2(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -%macro SAD32XN 1-2 0 - SAD_FN 32, %1, 5, %2 - mov n_rowsd, %1/2 - pxor m0, m0 -.loop: - movu m1, [refq] - movu m2, [refq+16] - movu m3, [refq+ref_strideq] - movu m4, [refq+ref_strideq+16] -%if %2 == 1 - pavgb m1, [second_predq+mmsize*0] - pavgb m2, [second_predq+mmsize*1] - pavgb m3, [second_predq+mmsize*2] - pavgb m4, [second_predq+mmsize*3] - lea second_predq, [second_predq+mmsize*4] -%endif - psadbw m1, [srcq] - psadbw m2, [srcq+16] - psadbw m3, [srcq+src_strideq] - psadbw m4, [srcq+src_strideq+16] - paddd m1, m2 - paddd m3, m4 - lea refq, [refq+ref_strideq*2] - paddd m0, m1 - lea srcq, [srcq+src_strideq*2] - paddd m0, m3 - dec n_rowsd - jg .loop - - movhlps m1, m0 - paddd m0, m1 - movd eax, m0 - RET -%endmacro - -INIT_XMM sse2 -SAD32XN 64 ; sad32x64_sse2 -SAD32XN 32 ; sad32x32_sse2 -SAD32XN 16 ; sad32x16_sse2 -SAD32XN 64, 1 ; sad32x64_avg_sse2 -SAD32XN 32, 1 ; sad32x32_avg_sse2 -SAD32XN 16, 1 ; sad32x16_avg_sse2 - -; unsigned int vp9_sad16x{8,16}_sse2(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -%macro SAD16XN 1-2 0 - SAD_FN 16, %1, 7, %2 - mov n_rowsd, %1/4 - pxor m0, m0 - -.loop: - movu m1, [refq] - movu m2, [refq+ref_strideq] - movu m3, [refq+ref_strideq*2] - movu m4, [refq+ref_stride3q] -%if %2 == 1 - pavgb m1, [second_predq+mmsize*0] - pavgb m2, [second_predq+mmsize*1] - pavgb m3, [second_predq+mmsize*2] - pavgb m4, [second_predq+mmsize*3] - lea second_predq, [second_predq+mmsize*4] -%endif - psadbw m1, [srcq] - psadbw m2, [srcq+src_strideq] - psadbw m3, [srcq+src_strideq*2] - psadbw m4, [srcq+src_stride3q] - paddd m1, m2 - paddd m3, m4 - lea refq, [refq+ref_strideq*4] - 
paddd m0, m1 - lea srcq, [srcq+src_strideq*4] - paddd m0, m3 - dec n_rowsd - jg .loop - - movhlps m1, m0 - paddd m0, m1 - movd eax, m0 - RET -%endmacro - -INIT_XMM sse2 -SAD16XN 32 ; sad16x32_sse2 -SAD16XN 16 ; sad16x16_sse2 -SAD16XN 8 ; sad16x8_sse2 -SAD16XN 32, 1 ; sad16x32_avg_sse2 -SAD16XN 16, 1 ; sad16x16_avg_sse2 -SAD16XN 8, 1 ; sad16x8_avg_sse2 - -; unsigned int vp9_sad8x{8,16}_sse2(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -%macro SAD8XN 1-2 0 - SAD_FN 8, %1, 7, %2 - mov n_rowsd, %1/4 - pxor m0, m0 - -.loop: - movh m1, [refq] - movhps m1, [refq+ref_strideq] - movh m2, [refq+ref_strideq*2] - movhps m2, [refq+ref_stride3q] -%if %2 == 1 - pavgb m1, [second_predq+mmsize*0] - pavgb m2, [second_predq+mmsize*1] - lea second_predq, [second_predq+mmsize*2] -%endif - movh m3, [srcq] - movhps m3, [srcq+src_strideq] - movh m4, [srcq+src_strideq*2] - movhps m4, [srcq+src_stride3q] - psadbw m1, m3 - psadbw m2, m4 - lea refq, [refq+ref_strideq*4] - paddd m0, m1 - lea srcq, [srcq+src_strideq*4] - paddd m0, m2 - dec n_rowsd - jg .loop - - movhlps m1, m0 - paddd m0, m1 - movd eax, m0 - RET -%endmacro - -INIT_XMM sse2 -SAD8XN 16 ; sad8x16_sse2 -SAD8XN 8 ; sad8x8_sse2 -SAD8XN 4 ; sad8x4_sse2 -SAD8XN 16, 1 ; sad8x16_avg_sse2 -SAD8XN 8, 1 ; sad8x8_avg_sse2 -SAD8XN 4, 1 ; sad8x4_avg_sse2 - -; unsigned int vp9_sad4x{4, 8}_sse(uint8_t *src, int src_stride, -; uint8_t *ref, int ref_stride); -%macro SAD4XN 1-2 0 - SAD_FN 4, %1, 7, %2 - mov n_rowsd, %1/4 - pxor m0, m0 - -.loop: - movd m1, [refq] - movd m2, [refq+ref_strideq] - movd m3, [refq+ref_strideq*2] - movd m4, [refq+ref_stride3q] - punpckldq m1, m2 - punpckldq m3, m4 -%if %2 == 1 - pavgb m1, [second_predq+mmsize*0] - pavgb m3, [second_predq+mmsize*1] - lea second_predq, [second_predq+mmsize*2] -%endif - movd m2, [srcq] - movd m5, [srcq+src_strideq] - movd m4, [srcq+src_strideq*2] - movd m6, [srcq+src_stride3q] - punpckldq m2, m5 - punpckldq m4, m6 - psadbw m1, m2 - psadbw m3, m4 - lea refq, [refq+ref_strideq*4] - paddd m0, m1 - lea srcq, [srcq+src_strideq*4] - paddd m0, m3 - dec n_rowsd - jg .loop - - movd eax, m0 - RET -%endmacro - -INIT_MMX sse -SAD4XN 8 ; sad4x8_sse -SAD4XN 4 ; sad4x4_sse -SAD4XN 8, 1 ; sad4x8_avg_sse -SAD4XN 4, 1 ; sad4x4_avg_sse diff --git a/vp9/encoder/x86/vp9_sad_sse3.asm b/vp9/encoder/x86/vp9_sad_sse3.asm deleted file mode 100644 index 2b90a5d54..000000000 --- a/vp9/encoder/x86/vp9_sad_sse3.asm +++ /dev/null @@ -1,378 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. 
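The sse3 file deleted below implements the `x3` flavor used by step searches: a single call scores three horizontally adjacent candidates — `ref`, `ref + 1`, `ref + 2` — which is why its inner macros issue unaligned `lddqu` loads at offsets +1 and +2. A reference sketch with illustrative names:

#include <stdint.h>

static void sad_x3_c(const uint8_t *src, int src_stride,
                     const uint8_t *ref, int ref_stride,
                     int w, int h, unsigned res[3]) {
  for (int i = 0; i < 3; ++i) {          /* candidates: ref, ref+1, ref+2 */
    const uint8_t *s = src, *r = ref + i;
    unsigned sad = 0;
    for (int y = 0; y < h; ++y, s += src_stride, r += ref_stride)
      for (int x = 0; x < w; ++x)
        sad += s[x] > r[x] ? s[x] - r[x] : r[x] - s[x];
    res[i] = sad;
  }
}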
-; - -%include "vpx_ports/x86_abi_support.asm" - -%macro STACK_FRAME_CREATE_X3 0 -%if ABI_IS_32BIT - %define src_ptr rsi - %define src_stride rax - %define ref_ptr rdi - %define ref_stride rdx - %define end_ptr rcx - %define ret_var rbx - %define result_ptr arg(4) - %define max_err arg(4) - %define height dword ptr arg(4) - push rbp - mov rbp, rsp - push rsi - push rdi - push rbx - - mov rsi, arg(0) ; src_ptr - mov rdi, arg(2) ; ref_ptr - - movsxd rax, dword ptr arg(1) ; src_stride - movsxd rdx, dword ptr arg(3) ; ref_stride -%else - %if LIBVPX_YASM_WIN64 - SAVE_XMM 7, u - %define src_ptr rcx - %define src_stride rdx - %define ref_ptr r8 - %define ref_stride r9 - %define end_ptr r10 - %define ret_var r11 - %define result_ptr [rsp+xmm_stack_space+8+4*8] - %define max_err [rsp+xmm_stack_space+8+4*8] - %define height dword ptr [rsp+xmm_stack_space+8+4*8] - %else - %define src_ptr rdi - %define src_stride rsi - %define ref_ptr rdx - %define ref_stride rcx - %define end_ptr r9 - %define ret_var r10 - %define result_ptr r8 - %define max_err r8 - %define height r8 - %endif -%endif - -%endmacro - -%macro STACK_FRAME_DESTROY_X3 0 - %define src_ptr - %define src_stride - %define ref_ptr - %define ref_stride - %define end_ptr - %define ret_var - %define result_ptr - %define max_err - %define height - -%if ABI_IS_32BIT - pop rbx - pop rdi - pop rsi - pop rbp -%else - %if LIBVPX_YASM_WIN64 - RESTORE_XMM - %endif -%endif - ret -%endmacro - -%macro PROCESS_16X2X3 5 -%if %1==0 - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm5, XMMWORD PTR [%3] - lddqu xmm6, XMMWORD PTR [%3+1] - lddqu xmm7, XMMWORD PTR [%3+2] - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [%2] - lddqu xmm1, XMMWORD PTR [%3] - lddqu xmm2, XMMWORD PTR [%3+1] - lddqu xmm3, XMMWORD PTR [%3+2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [%2+%4] - lddqu xmm1, XMMWORD PTR [%3+%5] - lddqu xmm2, XMMWORD PTR [%3+%5+1] - lddqu xmm3, XMMWORD PTR [%3+%5+2] - -%if %1==0 || %1==1 - lea %2, [%2+%4*2] - lea %3, [%3+%5*2] -%endif - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_8X2X3 5 -%if %1==0 - movq mm0, QWORD PTR [%2] - movq mm5, QWORD PTR [%3] - movq mm6, QWORD PTR [%3+1] - movq mm7, QWORD PTR [%3+2] - - psadbw mm5, mm0 - psadbw mm6, mm0 - psadbw mm7, mm0 -%else - movq mm0, QWORD PTR [%2] - movq mm1, QWORD PTR [%3] - movq mm2, QWORD PTR [%3+1] - movq mm3, QWORD PTR [%3+2] - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm5, mm1 - paddw mm6, mm2 - paddw mm7, mm3 -%endif - movq mm0, QWORD PTR [%2+%4] - movq mm1, QWORD PTR [%3+%5] - movq mm2, QWORD PTR [%3+%5+1] - movq mm3, QWORD PTR [%3+%5+2] - -%if %1==0 || %1==1 - lea %2, [%2+%4*2] - lea %3, [%3+%5*2] -%endif - - psadbw mm1, mm0 - psadbw mm2, mm0 - psadbw mm3, mm0 - - paddw mm5, mm1 - paddw mm6, mm2 - paddw mm7, mm3 -%endmacro - -;void int vp9_sad16x16x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp9_sad16x16x3_sse3) PRIVATE -sym(vp9_sad16x16x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, 
ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+8], xmm0 - - STACK_FRAME_DESTROY_X3 - -;void int vp9_sad16x8x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp9_sad16x8x3_sse3) PRIVATE -sym(vp9_sad16x8x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rcx], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rcx+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rcx+8], xmm0 - - STACK_FRAME_DESTROY_X3 - -;void int vp9_sad8x16x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp9_sad8x16x3_sse3) PRIVATE -sym(vp9_sad8x16x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - punpckldq mm5, mm6 - - movq [rcx], mm5 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 - -;void int vp9_sad8x8x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp9_sad8x8x3_sse3) PRIVATE -sym(vp9_sad8x8x3_sse3): - - STACK_FRAME_CREATE_X3 - - PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride - PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride - - mov rcx, result_ptr - - punpckldq mm5, mm6 - - movq [rcx], mm5 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 - -;void int vp9_sad4x4x3_sse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp9_sad4x4x3_sse3) PRIVATE -sym(vp9_sad4x4x3_sse3): - - STACK_FRAME_CREATE_X3 - - movd mm0, DWORD PTR [src_ptr] - movd mm1, DWORD PTR [ref_ptr] - - movd mm2, DWORD PTR [src_ptr+src_stride] - movd mm3, DWORD PTR [ref_ptr+ref_stride] - - punpcklbw mm0, mm2 - punpcklbw mm1, mm3 - - movd mm4, DWORD PTR [ref_ptr+1] - movd mm5, DWORD PTR [ref_ptr+2] - - movd mm2, DWORD PTR [ref_ptr+ref_stride+1] - movd mm3, DWORD PTR [ref_ptr+ref_stride+2] - - psadbw mm1, mm0 - - punpcklbw mm4, mm2 - punpcklbw mm5, mm3 - - psadbw mm4, mm0 - psadbw mm5, mm0 - - lea src_ptr, [src_ptr+src_stride*2] - lea ref_ptr, [ref_ptr+ref_stride*2] - - movd mm0, DWORD PTR [src_ptr] - movd mm2, DWORD PTR [ref_ptr] - - movd mm3, DWORD PTR 
[src_ptr+src_stride] - movd mm6, DWORD PTR [ref_ptr+ref_stride] - - punpcklbw mm0, mm3 - punpcklbw mm2, mm6 - - movd mm3, DWORD PTR [ref_ptr+1] - movd mm7, DWORD PTR [ref_ptr+2] - - psadbw mm2, mm0 - - paddw mm1, mm2 - - movd mm2, DWORD PTR [ref_ptr+ref_stride+1] - movd mm6, DWORD PTR [ref_ptr+ref_stride+2] - - punpcklbw mm3, mm2 - punpcklbw mm7, mm6 - - psadbw mm3, mm0 - psadbw mm7, mm0 - - paddw mm3, mm4 - paddw mm7, mm5 - - mov rcx, result_ptr - - punpckldq mm1, mm3 - - movq [rcx], mm1 - movd [rcx+8], mm7 - - STACK_FRAME_DESTROY_X3 diff --git a/vp9/encoder/x86/vp9_sad_sse4.asm b/vp9/encoder/x86/vp9_sad_sse4.asm deleted file mode 100644 index faf1768a9..000000000 --- a/vp9/encoder/x86/vp9_sad_sse4.asm +++ /dev/null @@ -1,359 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%macro PROCESS_16X2X8 1 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - movq xmm2, MMWORD PTR [rdi+16] - punpcklqdq xmm1, xmm3 - punpcklqdq xmm3, xmm2 - - movdqa xmm2, xmm1 - mpsadbw xmm1, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm1, xmm2 - paddw xmm1, xmm3 - paddw xmm1, xmm4 -%else - movdqa xmm0, XMMWORD PTR [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - movq xmm2, MMWORD PTR [rdi+16] - punpcklqdq xmm5, xmm3 - punpcklqdq xmm3, xmm2 - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm5, xmm2 - paddw xmm5, xmm3 - paddw xmm5, xmm4 - - paddw xmm1, xmm5 -%endif - movdqa xmm0, XMMWORD PTR [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - movq xmm2, MMWORD PTR [rdi+ rdx+16] - punpcklqdq xmm5, xmm3 - punpcklqdq xmm3, xmm2 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - - psrldq xmm0, 8 - movdqa xmm4, xmm3 - mpsadbw xmm3, xmm0, 0x0 - mpsadbw xmm4, xmm0, 0x5 - - paddw xmm5, xmm2 - paddw xmm5, xmm3 - paddw xmm5, xmm4 - - paddw xmm1, xmm5 -%endmacro - -%macro PROCESS_8X2X8 1 -%if %1 - movq xmm0, MMWORD PTR [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm1, xmm3 - - movdqa xmm2, xmm1 - mpsadbw xmm1, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm1, xmm2 -%else - movq xmm0, MMWORD PTR [rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm5, xmm3 - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm5, xmm2 - - paddw xmm1, xmm5 -%endif - movq xmm0, MMWORD PTR [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - punpcklqdq xmm5, xmm3 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - movdqa xmm2, xmm5 - mpsadbw xmm5, xmm0, 0x0 - mpsadbw xmm2, xmm0, 0x5 - paddw xmm5, xmm2 - - paddw xmm1, xmm5 -%endmacro - -%macro PROCESS_4X2X8 1 -%if %1 - movd xmm0, [rsi] - movq xmm1, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm1, xmm3 - - mpsadbw xmm1, xmm0, 0x0 -%else - movd xmm0, 
[rsi] - movq xmm5, MMWORD PTR [rdi] - movq xmm3, MMWORD PTR [rdi+8] - punpcklqdq xmm5, xmm3 - - mpsadbw xmm5, xmm0, 0x0 - - paddw xmm1, xmm5 -%endif - movd xmm0, [rsi + rax] - movq xmm5, MMWORD PTR [rdi+ rdx] - movq xmm3, MMWORD PTR [rdi+ rdx+8] - punpcklqdq xmm5, xmm3 - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - mpsadbw xmm5, xmm0, 0x0 - - paddw xmm1, xmm5 -%endmacro - -%macro WRITE_AS_INTS 0 - mov rdi, arg(4) ;Results - pxor xmm0, xmm0 - movdqa xmm2, xmm1 - punpcklwd xmm1, xmm0 - punpckhwd xmm2, xmm0 - - movdqa [rdi], xmm1 - movdqa [rdi + 16], xmm2 -%endmacro - -;void vp9_sad16x16x8_sse4( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array); -global sym(vp9_sad16x16x8_sse4) PRIVATE -sym(vp9_sad16x16x8_sse4): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_sad16x8x8_sse4( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -global sym(vp9_sad16x8x8_sse4) PRIVATE -sym(vp9_sad16x8x8_sse4): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_16X2X8 1 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - PROCESS_16X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_sad8x8x8_sse4( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -global sym(vp9_sad8x8x8_sse4) PRIVATE -sym(vp9_sad8x8x8_sse4): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_sad8x16x8_sse4( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -global sym(vp9_sad8x16x8_sse4) PRIVATE -sym(vp9_sad8x16x8_sse4): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_8X2X8 1 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - PROCESS_8X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_sad4x4x8_c( -; const unsigned char *src_ptr, -; int src_stride, -; const unsigned char *ref_ptr, -; int ref_stride, -; unsigned short *sad_array -;); -global sym(vp9_sad4x4x8_sse4) PRIVATE -sym(vp9_sad4x4x8_sse4): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; 
end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - PROCESS_4X2X8 1 - PROCESS_4X2X8 0 - - WRITE_AS_INTS - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - - - - diff --git a/vp9/encoder/x86/vp9_sad_ssse3.asm b/vp9/encoder/x86/vp9_sad_ssse3.asm deleted file mode 100644 index 0cb35424e..000000000 --- a/vp9/encoder/x86/vp9_sad_ssse3.asm +++ /dev/null @@ -1,370 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%macro PROCESS_16X2X3 1 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - lddqu xmm5, XMMWORD PTR [rdi] - lddqu xmm6, XMMWORD PTR [rdi+1] - lddqu xmm7, XMMWORD PTR [rdi+2] - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [rsi] - lddqu xmm1, XMMWORD PTR [rdi] - lddqu xmm2, XMMWORD PTR [rdi+1] - lddqu xmm3, XMMWORD PTR [rdi+2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [rsi+rax] - lddqu xmm1, XMMWORD PTR [rdi+rdx] - lddqu xmm2, XMMWORD PTR [rdi+rdx+1] - lddqu xmm3, XMMWORD PTR [rdi+rdx+2] - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_16X2X3_OFFSET 2 -%if %1 - movdqa xmm0, XMMWORD PTR [rsi] - movdqa xmm4, XMMWORD PTR [rdi] - movdqa xmm7, XMMWORD PTR [rdi+16] - - movdqa xmm5, xmm7 - palignr xmm5, xmm4, %2 - - movdqa xmm6, xmm7 - palignr xmm6, xmm4, (%2+1) - - palignr xmm7, xmm4, (%2+2) - - psadbw xmm5, xmm0 - psadbw xmm6, xmm0 - psadbw xmm7, xmm0 -%else - movdqa xmm0, XMMWORD PTR [rsi] - movdqa xmm4, XMMWORD PTR [rdi] - movdqa xmm3, XMMWORD PTR [rdi+16] - - movdqa xmm1, xmm3 - palignr xmm1, xmm4, %2 - - movdqa xmm2, xmm3 - palignr xmm2, xmm4, (%2+1) - - palignr xmm3, xmm4, (%2+2) - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endif - movdqa xmm0, XMMWORD PTR [rsi+rax] - movdqa xmm4, XMMWORD PTR [rdi+rdx] - movdqa xmm3, XMMWORD PTR [rdi+rdx+16] - - movdqa xmm1, xmm3 - palignr xmm1, xmm4, %2 - - movdqa xmm2, xmm3 - palignr xmm2, xmm4, (%2+1) - - palignr xmm3, xmm4, (%2+2) - - lea rsi, [rsi+rax*2] - lea rdi, [rdi+rdx*2] - - psadbw xmm1, xmm0 - psadbw xmm2, xmm0 - psadbw xmm3, xmm0 - - paddw xmm5, xmm1 - paddw xmm6, xmm2 - paddw xmm7, xmm3 -%endmacro - -%macro PROCESS_16X16X3_OFFSET 2 -%2_aligned_by_%1: - - sub rdi, %1 - - PROCESS_16X2X3_OFFSET 1, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - - jmp %2_store_off - -%endmacro - -%macro PROCESS_16X8X3_OFFSET 2 -%2_aligned_by_%1: - - sub rdi, %1 - - PROCESS_16X2X3_OFFSET 1, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - PROCESS_16X2X3_OFFSET 0, %1 - - jmp %2_store_off - -%endmacro - -;void int vp9_sad16x16x3_ssse3( -; unsigned char *src_ptr, -; int src_stride, -; 
unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global sym(vp9_sad16x16x3_ssse3) PRIVATE -sym(vp9_sad16x16x3_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - push rcx - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rdx, 0xf - and rdx, rdi - - jmp .vp9_sad16x16x3_ssse3_skiptable -.vp9_sad16x16x3_ssse3_jumptable: - dd .vp9_sad16x16x3_ssse3_aligned_by_0 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_1 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_2 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_3 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_4 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_5 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_6 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_7 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_8 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_9 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_10 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_11 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_12 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_13 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_14 - .vp9_sad16x16x3_ssse3_do_jump - dd .vp9_sad16x16x3_ssse3_aligned_by_15 - .vp9_sad16x16x3_ssse3_do_jump -.vp9_sad16x16x3_ssse3_skiptable: - - call .vp9_sad16x16x3_ssse3_do_jump -.vp9_sad16x16x3_ssse3_do_jump: - pop rcx ; get the address of do_jump - mov rax, .vp9_sad16x16x3_ssse3_jumptable - .vp9_sad16x16x3_ssse3_do_jump - add rax, rcx ; get the absolute address of vp9_sad16x16x3_ssse3_jumptable - - movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable - add rcx, rax - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - jmp rcx - - PROCESS_16X16X3_OFFSET 0, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 1, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 2, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 3, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 4, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 5, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 6, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 7, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 8, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 9, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 10, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 11, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 12, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 13, .vp9_sad16x16x3_ssse3 - PROCESS_16X16X3_OFFSET 14, .vp9_sad16x16x3_ssse3 - -.vp9_sad16x16x3_ssse3_aligned_by_15: - PROCESS_16X2X3 1 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - -.vp9_sad16x16x3_ssse3_store_off: - mov rdi, arg(4) ;Results - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rdi], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rdi+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rdi+8], xmm0 - - ; begin epilog - pop rcx - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void int vp9_sad16x8x3_ssse3( -; unsigned char *src_ptr, -; int src_stride, -; unsigned char *ref_ptr, -; int ref_stride, -; int *results) -global 
sym(vp9_sad16x8x3_ssse3) PRIVATE -sym(vp9_sad16x8x3_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - SAVE_XMM 7 - push rsi - push rdi - push rcx - ; end prolog - - mov rsi, arg(0) ;src_ptr - mov rdi, arg(2) ;ref_ptr - - mov rdx, 0xf - and rdx, rdi - - jmp .vp9_sad16x8x3_ssse3_skiptable -.vp9_sad16x8x3_ssse3_jumptable: - dd .vp9_sad16x8x3_ssse3_aligned_by_0 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_1 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_2 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_3 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_4 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_5 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_6 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_7 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_8 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_9 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_10 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_11 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_12 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_13 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_14 - .vp9_sad16x8x3_ssse3_do_jump - dd .vp9_sad16x8x3_ssse3_aligned_by_15 - .vp9_sad16x8x3_ssse3_do_jump -.vp9_sad16x8x3_ssse3_skiptable: - - call .vp9_sad16x8x3_ssse3_do_jump -.vp9_sad16x8x3_ssse3_do_jump: - pop rcx ; get the address of do_jump - mov rax, .vp9_sad16x8x3_ssse3_jumptable - .vp9_sad16x8x3_ssse3_do_jump - add rax, rcx ; get the absolute address of vp9_sad16x8x3_ssse3_jumptable - - movsxd rax, dword [rax + 4*rdx] ; get the 32 bit offset from the jumptable - add rcx, rax - - movsxd rax, dword ptr arg(1) ;src_stride - movsxd rdx, dword ptr arg(3) ;ref_stride - - jmp rcx - - PROCESS_16X8X3_OFFSET 0, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 1, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 2, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 3, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 4, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 5, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 6, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 7, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 8, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 9, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 10, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 11, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 12, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 13, .vp9_sad16x8x3_ssse3 - PROCESS_16X8X3_OFFSET 14, .vp9_sad16x8x3_ssse3 - -.vp9_sad16x8x3_ssse3_aligned_by_15: - - PROCESS_16X2X3 1 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - PROCESS_16X2X3 0 - -.vp9_sad16x8x3_ssse3_store_off: - mov rdi, arg(4) ;Results - - movq xmm0, xmm5 - psrldq xmm5, 8 - - paddw xmm0, xmm5 - movd [rdi], xmm0 -;- - movq xmm0, xmm6 - psrldq xmm6, 8 - - paddw xmm0, xmm6 - movd [rdi+4], xmm0 -;- - movq xmm0, xmm7 - psrldq xmm7, 8 - - paddw xmm0, xmm7 - movd [rdi+8], xmm0 - - ; begin epilog - pop rcx - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret |
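Two dispatch tricks in the last files above are worth noting: the sse4 kernels use `mpsadbw` to produce eight neighboring SADs per instruction (the `x8` flavor), and the ssse3 kernels avoid unaligned loads by branching, at run time, to one of sixteen loop bodies specialized with `palignr` for the value of `ref_ptr & 15`; the `call`/`pop rcx` dance merely recovers an absolute address for the 32-bit-offset jump table so the code stays position-independent. A C sketch of that dispatch pattern — the function-pointer framing here is illustrative; the asm uses a label table:

#include <stddef.h>
#include <stdint.h>

typedef void (*sad_loop_fn)(const uint8_t *src, int src_stride,
                            const uint8_t *ref_aligned, int ref_stride,
                            int *res);

/* table[k] is a loop body that loads aligned 16-byte pairs and
 * realigns them with palignr(..., k), for misalignment k = 0..15. */
static void sad16x16x3_dispatch(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
                                int *res, const sad_loop_fn table[16]) {
  const size_t k = (uintptr_t)ref & 15;                 /* and rdx, 0xf  */
  table[k](src, src_stride, ref - k, ref_stride, res);  /* sub rdi, k; jmp */
}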