From d36659cec7fab96cedc67db4d511ed7135637d0e Mon Sep 17 00:00:00 2001
From: James Zern
Date: Fri, 11 Dec 2015 12:40:53 -0800
Subject: move vp9_avg to vpx_dsp

Change-Id: I7bc991abea383db1f86c1bb0f2e849837b54d90f
---
 vp9/encoder/arm/neon/vp9_avg_neon.c      | 199 ---------------
 vp9/encoder/mips/msa/vp9_avg_msa.c       |  56 ----
 vp9/encoder/vp9_avg.c                    | 230 -----------------
 vp9/encoder/vp9_encodeframe.c            |  30 +--
 vp9/encoder/vp9_mcomp.c                  |  18 +-
 vp9/encoder/vp9_pickmode.c               |   6 +-
 vp9/encoder/x86/vp9_avg_intrin_sse2.c    | 423 -------------------------------
 vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm | 121 ---------
 8 files changed, 27 insertions(+), 1056 deletions(-)
 delete mode 100644 vp9/encoder/arm/neon/vp9_avg_neon.c
 delete mode 100644 vp9/encoder/mips/msa/vp9_avg_msa.c
 delete mode 100644 vp9/encoder/vp9_avg.c
 delete mode 100644 vp9/encoder/x86/vp9_avg_intrin_sse2.c
 delete mode 100644 vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm

diff --git a/vp9/encoder/arm/neon/vp9_avg_neon.c b/vp9/encoder/arm/neon/vp9_avg_neon.c
deleted file mode 100644
index 78467cebd..000000000
--- a/vp9/encoder/arm/neon/vp9_avg_neon.c
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <arm_neon.h>
-#include <assert.h>
-
-#include "./vp9_rtcd.h"
-#include "./vpx_config.h"
-
-#include "vpx/vpx_integer.h"
-
-static INLINE unsigned int horizontal_add_u16x8(const uint16x8_t v_16x8) {
-  const uint32x4_t a = vpaddlq_u16(v_16x8);
-  const uint64x2_t b = vpaddlq_u32(a);
-  const uint32x2_t c = vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
-                                vreinterpret_u32_u64(vget_high_u64(b)));
-  return vget_lane_u32(c, 0);
-}
-
-unsigned int vp9_avg_4x4_neon(const uint8_t *s, int p) {
-  uint16x8_t v_sum;
-  uint32x2_t v_s0 = vdup_n_u32(0);
-  uint32x2_t v_s1 = vdup_n_u32(0);
-  v_s0 = vld1_lane_u32((const uint32_t *)s, v_s0, 0);
-  v_s0 = vld1_lane_u32((const uint32_t *)(s + p), v_s0, 1);
-  v_s1 = vld1_lane_u32((const uint32_t *)(s + 2 * p), v_s1, 0);
-  v_s1 = vld1_lane_u32((const uint32_t *)(s + 3 * p), v_s1, 1);
-  v_sum = vaddl_u8(vreinterpret_u8_u32(v_s0), vreinterpret_u8_u32(v_s1));
-  return (horizontal_add_u16x8(v_sum) + 8) >> 4;
-}
-
-unsigned int vp9_avg_8x8_neon(const uint8_t *s, int p) {
-  uint8x8_t v_s0 = vld1_u8(s);
-  const uint8x8_t v_s1 = vld1_u8(s + p);
-  uint16x8_t v_sum = vaddl_u8(v_s0, v_s1);
-
-  v_s0 = vld1_u8(s + 2 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 3 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 4 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 5 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 6 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  v_s0 = vld1_u8(s + 7 * p);
-  v_sum = vaddw_u8(v_sum, v_s0);
-
-  return (horizontal_add_u16x8(v_sum) + 32) >> 6;
-}
-
-// coeff: 16 bits, dynamic range [-32640, 32640].
-// length: value range {16, 64, 256, 1024}.
-int vp9_satd_neon(const int16_t *coeff, int length) { - const int16x4_t zero = vdup_n_s16(0); - int32x4_t accum = vdupq_n_s32(0); - - do { - const int16x8_t src0 = vld1q_s16(coeff); - const int16x8_t src8 = vld1q_s16(coeff + 8); - accum = vabal_s16(accum, vget_low_s16(src0), zero); - accum = vabal_s16(accum, vget_high_s16(src0), zero); - accum = vabal_s16(accum, vget_low_s16(src8), zero); - accum = vabal_s16(accum, vget_high_s16(src8), zero); - length -= 16; - coeff += 16; - } while (length != 0); - - { - // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] - const int64x2_t s0 = vpaddlq_s32(accum); // cascading summation of 'accum'. - const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)), - vreinterpret_s32_s64(vget_high_s64(s0))); - const int satd = vget_lane_s32(s1, 0); - return satd; - } -} - -void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, - const int ref_stride, const int height) { - int i; - uint16x8_t vec_sum_lo = vdupq_n_u16(0); - uint16x8_t vec_sum_hi = vdupq_n_u16(0); - const int shift_factor = ((height >> 5) + 3) * -1; - const int16x8_t vec_shift = vdupq_n_s16(shift_factor); - - for (i = 0; i < height; i += 8) { - const uint8x16_t vec_row1 = vld1q_u8(ref); - const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride); - const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2); - const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3); - const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4); - const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5); - const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6); - const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7)); - - vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8)); - vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8)); - - ref += ref_stride * 8; - } - - vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift); - vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift); - - vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo)); - hbuf += 8; - vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi)); -} - -int16_t vp9_int_pro_col_neon(uint8_t const *ref, const int width) { - int i; - uint16x8_t vec_sum = vdupq_n_u16(0); - - for (i = 0; i < width; i += 16) { - const uint8x16_t vec_row = vld1q_u8(ref); - vec_sum = vaddw_u8(vec_sum, vget_low_u8(vec_row)); - vec_sum = vaddw_u8(vec_sum, vget_high_u8(vec_row)); - ref += 16; - } - - return horizontal_add_u16x8(vec_sum); -} - -// ref, src = [0, 510] - max diff = 16-bits -// bwl = {2, 3, 4}, width = {16, 32, 64} -int vp9_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) { - int width = 4 << bwl; - int32x4_t sse = vdupq_n_s32(0); - int16x8_t total = vdupq_n_s16(0); - - assert(width >= 
8); - assert((width % 8) == 0); - - do { - const int16x8_t r = vld1q_s16(ref); - const int16x8_t s = vld1q_s16(src); - const int16x8_t diff = vsubq_s16(r, s); // [-510, 510], 10 bits. - const int16x4_t diff_lo = vget_low_s16(diff); - const int16x4_t diff_hi = vget_high_s16(diff); - sse = vmlal_s16(sse, diff_lo, diff_lo); // dynamic range 26 bits. - sse = vmlal_s16(sse, diff_hi, diff_hi); - total = vaddq_s16(total, diff); // dynamic range 16 bits. - - ref += 8; - src += 8; - width -= 8; - } while (width != 0); - - { - // Note: 'total''s pairwise addition could be implemented similarly to - // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired - // with the summation of 'sse' performed better on a Cortex-A15. - const int32x4_t t0 = vpaddlq_s16(total); // cascading summation of 'total' - const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0)); - const int32x2_t t2 = vpadd_s32(t1, t1); - const int t = vget_lane_s32(t2, 0); - const int64x2_t s0 = vpaddlq_s32(sse); // cascading summation of 'sse'. - const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)), - vreinterpret_s32_s64(vget_high_s64(s0))); - const int s = vget_lane_s32(s1, 0); - const int shift_factor = bwl + 2; - return s - ((t * t) >> shift_factor); - } -} diff --git a/vp9/encoder/mips/msa/vp9_avg_msa.c b/vp9/encoder/mips/msa/vp9_avg_msa.c deleted file mode 100644 index 611adb1a2..000000000 --- a/vp9/encoder/mips/msa/vp9_avg_msa.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ - -#include "./vp9_rtcd.h" -#include "vpx_dsp/mips/macros_msa.h" - -uint32_t vp9_avg_8x8_msa(const uint8_t *src, int32_t src_stride) { - uint32_t sum_out; - v16u8 src0, src1, src2, src3, src4, src5, src6, src7; - v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7; - v4u32 sum = { 0 }; - - LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - HADD_UB4_UH(src0, src1, src2, src3, sum0, sum1, sum2, sum3); - HADD_UB4_UH(src4, src5, src6, src7, sum4, sum5, sum6, sum7); - ADD4(sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7, sum0, sum2, sum4, sum6); - ADD2(sum0, sum2, sum4, sum6, sum0, sum4); - sum0 += sum4; - - sum = __msa_hadd_u_w(sum0, sum0); - sum0 = (v8u16)__msa_pckev_h((v8i16)sum, (v8i16)sum); - sum = __msa_hadd_u_w(sum0, sum0); - sum = (v4u32)__msa_srari_w((v4i32)sum, 6); - sum_out = __msa_copy_u_w((v4i32)sum, 0); - - return sum_out; -} - -uint32_t vp9_avg_4x4_msa(const uint8_t *src, int32_t src_stride) { - uint32_t sum_out; - uint32_t src0, src1, src2, src3; - v16u8 vec = { 0 }; - v8u16 sum0; - v4u32 sum1; - v2u64 sum2; - - LW4(src, src_stride, src0, src1, src2, src3); - INSERT_W4_UB(src0, src1, src2, src3, vec); - - sum0 = __msa_hadd_u_h(vec, vec); - sum1 = __msa_hadd_u_w(sum0, sum0); - sum0 = (v8u16)__msa_pckev_h((v8i16)sum1, (v8i16)sum1); - sum1 = __msa_hadd_u_w(sum0, sum0); - sum2 = __msa_hadd_u_d(sum1, sum1); - sum1 = (v4u32)__msa_srari_w((v4i32)sum2, 4); - sum_out = __msa_copy_u_w((v4i32)sum1, 0); - - return sum_out; -} diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c deleted file mode 100644 index 7baa09ae5..000000000 --- a/vp9/encoder/vp9_avg.c +++ /dev/null @@ -1,230 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. 
- */ -#include "./vp9_rtcd.h" -#include "vp9/common/vp9_common.h" -#include "vpx_ports/mem.h" - -unsigned int vp9_avg_8x8_c(const uint8_t *s, int p) { - int i, j; - int sum = 0; - for (i = 0; i < 8; ++i, s+=p) - for (j = 0; j < 8; sum += s[j], ++j) {} - - return (sum + 32) >> 6; -} - -unsigned int vp9_avg_4x4_c(const uint8_t *s, int p) { - int i, j; - int sum = 0; - for (i = 0; i < 4; ++i, s+=p) - for (j = 0; j < 4; sum += s[j], ++j) {} - - return (sum + 8) >> 4; -} - -// src_diff: first pass, 9 bit, dynamic range [-255, 255] -// second pass, 12 bit, dynamic range [-2040, 2040] -static void hadamard_col8(const int16_t *src_diff, int src_stride, - int16_t *coeff) { - int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride]; - int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride]; - int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride]; - int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride]; - int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride]; - int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride]; - int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride]; - int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride]; - - int16_t c0 = b0 + b2; - int16_t c1 = b1 + b3; - int16_t c2 = b0 - b2; - int16_t c3 = b1 - b3; - int16_t c4 = b4 + b6; - int16_t c5 = b5 + b7; - int16_t c6 = b4 - b6; - int16_t c7 = b5 - b7; - - coeff[0] = c0 + c4; - coeff[7] = c1 + c5; - coeff[3] = c2 + c6; - coeff[4] = c3 + c7; - coeff[2] = c0 - c4; - coeff[6] = c1 - c5; - coeff[1] = c2 - c6; - coeff[5] = c3 - c7; -} - -void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride, - int16_t *coeff) { - int idx; - int16_t buffer[64]; - int16_t *tmp_buf = &buffer[0]; - for (idx = 0; idx < 8; ++idx) { - hadamard_col8(src_diff, src_stride, tmp_buf); // src_diff: 9 bit - // dynamic range [-255, 255] - tmp_buf += 8; - ++src_diff; - } - - tmp_buf = &buffer[0]; - for (idx = 0; idx < 8; ++idx) { - hadamard_col8(tmp_buf, 8, coeff); // tmp_buf: 12 bit - // dynamic range [-2040, 2040] - coeff += 8; // coeff: 15 bit - // dynamic range [-16320, 16320] - ++tmp_buf; - } -} - -// In place 16x16 2D Hadamard transform -void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride, - int16_t *coeff) { - int idx; - for (idx = 0; idx < 4; ++idx) { - // src_diff: 9 bit, dynamic range [-255, 255] - int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride - + (idx & 0x01) * 8; - vp9_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64); - } - - // coeff: 15 bit, dynamic range [-16320, 16320] - for (idx = 0; idx < 64; ++idx) { - int16_t a0 = coeff[0]; - int16_t a1 = coeff[64]; - int16_t a2 = coeff[128]; - int16_t a3 = coeff[192]; - - int16_t b0 = (a0 + a1) >> 1; // (a0 + a1): 16 bit, [-32640, 32640] - int16_t b1 = (a0 - a1) >> 1; // b0-b3: 15 bit, dynamic range - int16_t b2 = (a2 + a3) >> 1; // [-16320, 16320] - int16_t b3 = (a2 - a3) >> 1; - - coeff[0] = b0 + b2; // 16 bit, [-32640, 32640] - coeff[64] = b1 + b3; - coeff[128] = b0 - b2; - coeff[192] = b1 - b3; - - ++coeff; - } -} - -// coeff: 16 bits, dynamic range [-32640, 32640]. -// length: value range {16, 64, 256, 1024}. -int vp9_satd_c(const int16_t *coeff, int length) { - int i; - int satd = 0; - for (i = 0; i < length; ++i) - satd += abs(coeff[i]); - - // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024] - return satd; -} - -// Integer projection onto row vectors. -// height: value range {16, 32, 64}. 
-void vp9_int_pro_row_c(int16_t hbuf[16], uint8_t const *ref, - const int ref_stride, const int height) { - int idx; - const int norm_factor = height >> 1; - for (idx = 0; idx < 16; ++idx) { - int i; - hbuf[idx] = 0; - // hbuf[idx]: 14 bit, dynamic range [0, 16320]. - for (i = 0; i < height; ++i) - hbuf[idx] += ref[i * ref_stride]; - // hbuf[idx]: 9 bit, dynamic range [0, 510]. - hbuf[idx] /= norm_factor; - ++ref; - } -} - -// width: value range {16, 32, 64}. -int16_t vp9_int_pro_col_c(uint8_t const *ref, const int width) { - int idx; - int16_t sum = 0; - // sum: 14 bit, dynamic range [0, 16320] - for (idx = 0; idx < width; ++idx) - sum += ref[idx]; - return sum; -} - -// ref: [0 - 510] -// src: [0 - 510] -// bwl: {2, 3, 4} -int vp9_vector_var_c(int16_t const *ref, int16_t const *src, - const int bwl) { - int i; - int width = 4 << bwl; - int sse = 0, mean = 0, var; - - for (i = 0; i < width; ++i) { - int diff = ref[i] - src[i]; // diff: dynamic range [-510, 510], 10 bits. - mean += diff; // mean: dynamic range 16 bits. - sse += diff * diff; // sse: dynamic range 26 bits. - } - - // (mean * mean): dynamic range 31 bits. - var = sse - ((mean * mean) >> (bwl + 2)); - return var; -} - -void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp, - int *min, int *max) { - int i, j; - *min = 255; - *max = 0; - for (i = 0; i < 8; ++i, s += p, d += dp) { - for (j = 0; j < 8; ++j) { - int diff = abs(s[j]-d[j]); - *min = diff < *min ? diff : *min; - *max = diff > *max ? diff : *max; - } - } -} - -#if CONFIG_VP9_HIGHBITDEPTH -unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) { - int i, j; - int sum = 0; - const uint16_t* s = CONVERT_TO_SHORTPTR(s8); - for (i = 0; i < 8; ++i, s+=p) - for (j = 0; j < 8; sum += s[j], ++j) {} - - return (sum + 32) >> 6; -} - -unsigned int vp9_highbd_avg_4x4_c(const uint8_t *s8, int p) { - int i, j; - int sum = 0; - const uint16_t* s = CONVERT_TO_SHORTPTR(s8); - for (i = 0; i < 4; ++i, s+=p) - for (j = 0; j < 4; sum += s[j], ++j) {} - - return (sum + 8) >> 4; -} - -void vp9_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, - int dp, int *min, int *max) { - int i, j; - const uint16_t* s = CONVERT_TO_SHORTPTR(s8); - const uint16_t* d = CONVERT_TO_SHORTPTR(d8); - *min = 255; - *max = 0; - for (i = 0; i < 8; ++i, s += p, d += dp) { - for (j = 0; j < 8; ++j) { - int diff = abs(s[j]-d[j]); - *min = diff < *min ? diff : *min; - *max = diff > *max ? 
diff : *max; - } - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH - - diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index c643b1848..fc5eb1bbe 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -556,16 +556,16 @@ static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d, if (x8_idx < pixels_wide && y8_idx < pixels_high) { #if CONFIG_VP9_HIGHBITDEPTH if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp, + vpx_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx, dp, &min, &max); } else { - vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp, + vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx, dp, &min, &max); } #else - vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp, + vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp, d + y8_idx * dp + x8_idx, dp, &min, &max); #endif @@ -597,18 +597,18 @@ static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d, int d_avg = 128; #if CONFIG_VP9_HIGHBITDEPTH if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { - s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp); + s_avg = vpx_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp); if (!is_key_frame) - d_avg = vp9_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp); + d_avg = vpx_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp); } else { - s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp); + s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp); if (!is_key_frame) - d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp); + d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp); } #else - s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp); + s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp); if (!is_key_frame) - d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp); + d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp); #endif sum = s_avg - d_avg; sse = sum * sum; @@ -636,18 +636,18 @@ static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d, int d_avg = 128; #if CONFIG_VP9_HIGHBITDEPTH if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) { - s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp); + s_avg = vpx_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp); if (!is_key_frame) - d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp); + d_avg = vpx_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp); } else { - s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp); + s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp); if (!is_key_frame) - d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp); + d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp); } #else - s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp); + s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp); if (!is_key_frame) - d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp); + d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp); #endif sum = s_avg - d_avg; sse = sum * sum; diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 327ac1985..a84202bb4 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -1755,7 +1755,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) { int center, offset = 0; int bw = 4 << bwl; // redundant variable, to be changed in the experiments. 
for (d = 0; d <= bw; d += 16) { - this_sad = vp9_vector_var(&ref[d], src, bwl); + this_sad = vpx_vector_var(&ref[d], src, bwl); if (this_sad < best_sad) { best_sad = this_sad; offset = d; @@ -1768,7 +1768,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) { // check limit if (this_pos < 0 || this_pos > bw) continue; - this_sad = vp9_vector_var(&ref[this_pos], src, bwl); + this_sad = vpx_vector_var(&ref[this_pos], src, bwl); if (this_sad < best_sad) { best_sad = this_sad; center = this_pos; @@ -1781,7 +1781,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) { // check limit if (this_pos < 0 || this_pos > bw) continue; - this_sad = vp9_vector_var(&ref[this_pos], src, bwl); + this_sad = vpx_vector_var(&ref[this_pos], src, bwl); if (this_sad < best_sad) { best_sad = this_sad; center = this_pos; @@ -1794,7 +1794,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) { // check limit if (this_pos < 0 || this_pos > bw) continue; - this_sad = vp9_vector_var(&ref[this_pos], src, bwl); + this_sad = vpx_vector_var(&ref[this_pos], src, bwl); if (this_sad < best_sad) { best_sad = this_sad; center = this_pos; @@ -1807,7 +1807,7 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) { // check limit if (this_pos < 0 || this_pos > bw) continue; - this_sad = vp9_vector_var(&ref[this_pos], src, bwl); + this_sad = vpx_vector_var(&ref[this_pos], src, bwl); if (this_sad < best_sad) { best_sad = this_sad; center = this_pos; @@ -1876,25 +1876,25 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, // Set up prediction 1-D reference set ref_buf = xd->plane[0].pre[0].buf - (bw >> 1); for (idx = 0; idx < search_width; idx += 16) { - vp9_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh); + vpx_int_pro_row(&hbuf[idx], ref_buf, ref_stride, bh); ref_buf += 16; } ref_buf = xd->plane[0].pre[0].buf - (bh >> 1) * ref_stride; for (idx = 0; idx < search_height; ++idx) { - vbuf[idx] = vp9_int_pro_col(ref_buf, bw) >> norm_factor; + vbuf[idx] = vpx_int_pro_col(ref_buf, bw) >> norm_factor; ref_buf += ref_stride; } // Set up src 1-D reference set for (idx = 0; idx < bw; idx += 16) { src_buf = x->plane[0].src.buf + idx; - vp9_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh); + vpx_int_pro_row(&src_hbuf[idx], src_buf, src_stride, bh); } src_buf = x->plane[0].src.buf; for (idx = 0; idx < bh; ++idx) { - src_vbuf[idx] = vp9_int_pro_col(src_buf, bw) >> norm_factor; + src_vbuf[idx] = vpx_int_pro_col(src_buf, bw) >> norm_factor; src_buf += src_stride; } diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 90650dbe8..0d86518a5 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -619,14 +619,14 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist, scan_order->scan, scan_order->iscan); break; case TX_16X16: - vp9_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff); + vpx_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff); vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp, p->quant_fp, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; case TX_8X8: - vp9_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff); + vpx_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff); vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp, p->quant_fp, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, @@ -673,7 +673,7 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist, if (*eob == 1) *rate += 
(int)abs(qcoeff[0]); else if (*eob > 1) - *rate += vp9_satd((const int16_t *)qcoeff, step << 4); + *rate += vpx_satd((const int16_t *)qcoeff, step << 4); *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift; } diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c deleted file mode 100644 index 441487130..000000000 --- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c +++ /dev/null @@ -1,423 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include - -#include "./vp9_rtcd.h" -#include "vpx_ports/mem.h" - -void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp, - int *min, int *max) { - __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff; - u0 = _mm_setzero_si128(); - // Row 0 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff0 = _mm_max_epi16(diff, negdiff); - // Row 1 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(absdiff0, absdiff); - minabsdiff = _mm_min_epi16(absdiff0, absdiff); - // Row 2 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - // Row 3 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - // Row 4 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - // Row 5 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - // Row 6 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = 
_mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - // Row 7 - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); - d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0); - diff = _mm_subs_epi16(s0, d0); - negdiff = _mm_subs_epi16(u0, diff); - absdiff = _mm_max_epi16(diff, negdiff); - maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff); - minabsdiff = _mm_min_epi16(minabsdiff, absdiff); - - maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8)); - maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32)); - maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16)); - *max = _mm_extract_epi16(maxabsdiff, 0); - - minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8)); - minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32)); - minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16)); - *min = _mm_extract_epi16(minabsdiff, 0); -} - -unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) { - __m128i s0, s1, u0; - unsigned int avg = 0; - u0 = _mm_setzero_si128(); - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - - s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8)); - s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32)); - s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); - avg = _mm_extract_epi16(s0, 0); - return (avg + 32) >> 6; -} - -unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) { - __m128i s0, s1, u0; - unsigned int avg = 0; - u0 = _mm_setzero_si128(); - s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0); - s0 = _mm_adds_epu16(s0, s1); - - s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4)); - s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16)); - avg = _mm_extract_epi16(s0, 0); - return (avg + 8) >> 4; -} - -static void hadamard_col8_sse2(__m128i *in, int iter) { - __m128i a0 = in[0]; - __m128i a1 = in[1]; - __m128i a2 = in[2]; - __m128i a3 = in[3]; - __m128i a4 = in[4]; - __m128i a5 = in[5]; - __m128i a6 = in[6]; - __m128i a7 = in[7]; - - __m128i b0 = _mm_add_epi16(a0, a1); - __m128i b1 = _mm_sub_epi16(a0, a1); - __m128i b2 = _mm_add_epi16(a2, a3); - __m128i b3 = _mm_sub_epi16(a2, a3); - __m128i b4 = _mm_add_epi16(a4, a5); - __m128i b5 = _mm_sub_epi16(a4, a5); - __m128i b6 = _mm_add_epi16(a6, a7); - __m128i b7 = _mm_sub_epi16(a6, a7); - - a0 = _mm_add_epi16(b0, b2); - a1 = _mm_add_epi16(b1, b3); - a2 = _mm_sub_epi16(b0, b2); - a3 = 
_mm_sub_epi16(b1, b3); - a4 = _mm_add_epi16(b4, b6); - a5 = _mm_add_epi16(b5, b7); - a6 = _mm_sub_epi16(b4, b6); - a7 = _mm_sub_epi16(b5, b7); - - if (iter == 0) { - b0 = _mm_add_epi16(a0, a4); - b7 = _mm_add_epi16(a1, a5); - b3 = _mm_add_epi16(a2, a6); - b4 = _mm_add_epi16(a3, a7); - b2 = _mm_sub_epi16(a0, a4); - b6 = _mm_sub_epi16(a1, a5); - b1 = _mm_sub_epi16(a2, a6); - b5 = _mm_sub_epi16(a3, a7); - - a0 = _mm_unpacklo_epi16(b0, b1); - a1 = _mm_unpacklo_epi16(b2, b3); - a2 = _mm_unpackhi_epi16(b0, b1); - a3 = _mm_unpackhi_epi16(b2, b3); - a4 = _mm_unpacklo_epi16(b4, b5); - a5 = _mm_unpacklo_epi16(b6, b7); - a6 = _mm_unpackhi_epi16(b4, b5); - a7 = _mm_unpackhi_epi16(b6, b7); - - b0 = _mm_unpacklo_epi32(a0, a1); - b1 = _mm_unpacklo_epi32(a4, a5); - b2 = _mm_unpackhi_epi32(a0, a1); - b3 = _mm_unpackhi_epi32(a4, a5); - b4 = _mm_unpacklo_epi32(a2, a3); - b5 = _mm_unpacklo_epi32(a6, a7); - b6 = _mm_unpackhi_epi32(a2, a3); - b7 = _mm_unpackhi_epi32(a6, a7); - - in[0] = _mm_unpacklo_epi64(b0, b1); - in[1] = _mm_unpackhi_epi64(b0, b1); - in[2] = _mm_unpacklo_epi64(b2, b3); - in[3] = _mm_unpackhi_epi64(b2, b3); - in[4] = _mm_unpacklo_epi64(b4, b5); - in[5] = _mm_unpackhi_epi64(b4, b5); - in[6] = _mm_unpacklo_epi64(b6, b7); - in[7] = _mm_unpackhi_epi64(b6, b7); - } else { - in[0] = _mm_add_epi16(a0, a4); - in[7] = _mm_add_epi16(a1, a5); - in[3] = _mm_add_epi16(a2, a6); - in[4] = _mm_add_epi16(a3, a7); - in[2] = _mm_sub_epi16(a0, a4); - in[6] = _mm_sub_epi16(a1, a5); - in[1] = _mm_sub_epi16(a2, a6); - in[5] = _mm_sub_epi16(a3, a7); - } -} - -void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride, - int16_t *coeff) { - __m128i src[8]; - src[0] = _mm_load_si128((const __m128i *)src_diff); - src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride)); - - hadamard_col8_sse2(src, 0); - hadamard_col8_sse2(src, 1); - - _mm_store_si128((__m128i *)coeff, src[0]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[1]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[2]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[3]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[4]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[5]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[6]); - coeff += 8; - _mm_store_si128((__m128i *)coeff, src[7]); -} - -void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride, - int16_t *coeff) { - int idx; - for (idx = 0; idx < 4; ++idx) { - int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride - + (idx & 0x01) * 8; - vp9_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64); - } - - for (idx = 0; idx < 64; idx += 8) { - __m128i coeff0 = _mm_load_si128((const __m128i *)coeff); - __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64)); - __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128)); - __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192)); - - __m128i b0 = _mm_add_epi16(coeff0, coeff1); - __m128i b1 = _mm_sub_epi16(coeff0, coeff1); - __m128i b2 = _mm_add_epi16(coeff2, coeff3); - __m128i b3 = _mm_sub_epi16(coeff2, coeff3); - - b0 = _mm_srai_epi16(b0, 1); - b1 = 
_mm_srai_epi16(b1, 1); - b2 = _mm_srai_epi16(b2, 1); - b3 = _mm_srai_epi16(b3, 1); - - coeff0 = _mm_add_epi16(b0, b2); - coeff1 = _mm_add_epi16(b1, b3); - _mm_store_si128((__m128i *)coeff, coeff0); - _mm_store_si128((__m128i *)(coeff + 64), coeff1); - - coeff2 = _mm_sub_epi16(b0, b2); - coeff3 = _mm_sub_epi16(b1, b3); - _mm_store_si128((__m128i *)(coeff + 128), coeff2); - _mm_store_si128((__m128i *)(coeff + 192), coeff3); - - coeff += 8; - } -} - -int vp9_satd_sse2(const int16_t *coeff, int length) { - int i; - const __m128i zero = _mm_setzero_si128(); - __m128i accum = zero; - - for (i = 0; i < length; i += 8) { - const __m128i src_line = _mm_load_si128((const __m128i *)coeff); - const __m128i inv = _mm_sub_epi16(zero, src_line); - const __m128i abs = _mm_max_epi16(src_line, inv); // abs(src_line) - const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero); - const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero); - const __m128i sum = _mm_add_epi32(abs_lo, abs_hi); - accum = _mm_add_epi32(accum, sum); - coeff += 8; - } - - { // cascading summation of accum - __m128i hi = _mm_srli_si128(accum, 8); - accum = _mm_add_epi32(accum, hi); - hi = _mm_srli_epi64(accum, 32); - accum = _mm_add_epi32(accum, hi); - } - - return _mm_cvtsi128_si32(accum); -} - -void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref, - const int ref_stride, const int height) { - int idx; - __m128i zero = _mm_setzero_si128(); - __m128i src_line = _mm_loadu_si128((const __m128i *)ref); - __m128i s0 = _mm_unpacklo_epi8(src_line, zero); - __m128i s1 = _mm_unpackhi_epi8(src_line, zero); - __m128i t0, t1; - int height_1 = height - 1; - ref += ref_stride; - - for (idx = 1; idx < height_1; idx += 2) { - src_line = _mm_loadu_si128((const __m128i *)ref); - t0 = _mm_unpacklo_epi8(src_line, zero); - t1 = _mm_unpackhi_epi8(src_line, zero); - s0 = _mm_adds_epu16(s0, t0); - s1 = _mm_adds_epu16(s1, t1); - ref += ref_stride; - - src_line = _mm_loadu_si128((const __m128i *)ref); - t0 = _mm_unpacklo_epi8(src_line, zero); - t1 = _mm_unpackhi_epi8(src_line, zero); - s0 = _mm_adds_epu16(s0, t0); - s1 = _mm_adds_epu16(s1, t1); - ref += ref_stride; - } - - src_line = _mm_loadu_si128((const __m128i *)ref); - t0 = _mm_unpacklo_epi8(src_line, zero); - t1 = _mm_unpackhi_epi8(src_line, zero); - s0 = _mm_adds_epu16(s0, t0); - s1 = _mm_adds_epu16(s1, t1); - - if (height == 64) { - s0 = _mm_srai_epi16(s0, 5); - s1 = _mm_srai_epi16(s1, 5); - } else if (height == 32) { - s0 = _mm_srai_epi16(s0, 4); - s1 = _mm_srai_epi16(s1, 4); - } else { - s0 = _mm_srai_epi16(s0, 3); - s1 = _mm_srai_epi16(s1, 3); - } - - _mm_storeu_si128((__m128i *)hbuf, s0); - hbuf += 8; - _mm_storeu_si128((__m128i *)hbuf, s1); -} - -int16_t vp9_int_pro_col_sse2(uint8_t const *ref, const int width) { - __m128i zero = _mm_setzero_si128(); - __m128i src_line = _mm_load_si128((const __m128i *)ref); - __m128i s0 = _mm_sad_epu8(src_line, zero); - __m128i s1; - int i; - - for (i = 16; i < width; i += 16) { - ref += 16; - src_line = _mm_load_si128((const __m128i *)ref); - s1 = _mm_sad_epu8(src_line, zero); - s0 = _mm_adds_epu16(s0, s1); - } - - s1 = _mm_srli_si128(s0, 8); - s0 = _mm_adds_epu16(s0, s1); - - return _mm_extract_epi16(s0, 0); -} - -int vp9_vector_var_sse2(int16_t const *ref, int16_t const *src, - const int bwl) { - int idx; - int width = 4 << bwl; - int16_t mean; - __m128i v0 = _mm_loadu_si128((const __m128i *)ref); - __m128i v1 = _mm_load_si128((const __m128i *)src); - __m128i diff = _mm_subs_epi16(v0, v1); - __m128i sum = diff; - __m128i sse = _mm_madd_epi16(diff, diff); - - ref 
+= 8; - src += 8; - - for (idx = 8; idx < width; idx += 8) { - v0 = _mm_loadu_si128((const __m128i *)ref); - v1 = _mm_load_si128((const __m128i *)src); - diff = _mm_subs_epi16(v0, v1); - - sum = _mm_add_epi16(sum, diff); - v0 = _mm_madd_epi16(diff, diff); - sse = _mm_add_epi32(sse, v0); - - ref += 8; - src += 8; - } - - v0 = _mm_srli_si128(sum, 8); - sum = _mm_add_epi16(sum, v0); - v0 = _mm_srli_epi64(sum, 32); - sum = _mm_add_epi16(sum, v0); - v0 = _mm_srli_epi32(sum, 16); - sum = _mm_add_epi16(sum, v0); - - v1 = _mm_srli_si128(sse, 8); - sse = _mm_add_epi32(sse, v1); - v1 = _mm_srli_epi64(sse, 32); - sse = _mm_add_epi32(sse, v1); - - mean = _mm_extract_epi16(sum, 0); - - return _mm_cvtsi128_si32(sse) - ((mean * mean) >> (bwl + 2)); -} diff --git a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm deleted file mode 100644 index 74c52df19..000000000 --- a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm +++ /dev/null @@ -1,121 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%define private_prefix vp9 - -%include "third_party/x86inc/x86inc.asm" - -; This file provides SSSE3 version of the forward transformation. Part -; of the macro definitions are originally derived from the ffmpeg project. -; The current version applies to x86 64-bit only. - -SECTION .text - -%if ARCH_X86_64 -; matrix transpose -%macro INTERLEAVE_2X 4 - punpckh%1 m%4, m%2, m%3 - punpckl%1 m%2, m%3 - SWAP %3, %4 -%endmacro - -%macro TRANSPOSE8X8 9 - INTERLEAVE_2X wd, %1, %2, %9 - INTERLEAVE_2X wd, %3, %4, %9 - INTERLEAVE_2X wd, %5, %6, %9 - INTERLEAVE_2X wd, %7, %8, %9 - - INTERLEAVE_2X dq, %1, %3, %9 - INTERLEAVE_2X dq, %2, %4, %9 - INTERLEAVE_2X dq, %5, %7, %9 - INTERLEAVE_2X dq, %6, %8, %9 - - INTERLEAVE_2X qdq, %1, %5, %9 - INTERLEAVE_2X qdq, %3, %7, %9 - INTERLEAVE_2X qdq, %2, %6, %9 - INTERLEAVE_2X qdq, %4, %8, %9 - - SWAP %2, %5 - SWAP %4, %7 -%endmacro - -%macro HMD8_1D 0 - psubw m8, m0, m1 - psubw m9, m2, m3 - paddw m0, m1 - paddw m2, m3 - SWAP 1, 8 - SWAP 3, 9 - psubw m8, m4, m5 - psubw m9, m6, m7 - paddw m4, m5 - paddw m6, m7 - SWAP 5, 8 - SWAP 7, 9 - - psubw m8, m0, m2 - psubw m9, m1, m3 - paddw m0, m2 - paddw m1, m3 - SWAP 2, 8 - SWAP 3, 9 - psubw m8, m4, m6 - psubw m9, m5, m7 - paddw m4, m6 - paddw m5, m7 - SWAP 6, 8 - SWAP 7, 9 - - psubw m8, m0, m4 - psubw m9, m1, m5 - paddw m0, m4 - paddw m1, m5 - SWAP 4, 8 - SWAP 5, 9 - psubw m8, m2, m6 - psubw m9, m3, m7 - paddw m2, m6 - paddw m3, m7 - SWAP 6, 8 - SWAP 7, 9 -%endmacro - -INIT_XMM ssse3 -cglobal hadamard_8x8, 3, 5, 10, input, stride, output - lea r3, [2 * strideq] - lea r4, [4 * strideq] - - mova m0, [inputq] - mova m1, [inputq + r3] - lea inputq, [inputq + r4] - mova m2, [inputq] - mova m3, [inputq + r3] - lea inputq, [inputq + r4] - mova m4, [inputq] - mova m5, [inputq + r3] - lea inputq, [inputq + r4] - mova m6, [inputq] - mova m7, [inputq + r3] - - HMD8_1D - TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9 - HMD8_1D - - mova [outputq + 0], m0 - mova [outputq + 16], m1 - mova [outputq + 32], m2 - mova [outputq + 48], m3 - mova [outputq + 64], m4 - mova [outputq + 80], m5 - mova [outputq + 96], m6 - mova [outputq + 112], m7 - - RET -%endif -- cgit v1.2.3
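
Note on the rename: the helpers touched by this patch keep their C reference semantics; only the vp9_ prefix becomes vpx_ as the code moves into vpx_dsp. Below is a minimal standalone sketch (not part of the patch) of two of those reference behaviors; the *_ref names are hypothetical stand-ins for the vpx_dsp entry points vpx_avg_8x8_c() and vpx_satd_c().

/* Standalone sketch: C reference behavior of two of the moved helpers. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static unsigned int avg_8x8_ref(const uint8_t *s, int p) {
  int i, j;
  int sum = 0;
  for (i = 0; i < 8; ++i, s += p)
    for (j = 0; j < 8; ++j) sum += s[j];
  return (sum + 32) >> 6;  /* rounded mean of the 64 pixels */
}

static int satd_ref(const int16_t *coeff, int length) {
  int i;
  int satd = 0;
  for (i = 0; i < length; ++i) satd += abs(coeff[i]);
  return satd;  /* sum of absolute transform coefficients */
}

int main(void) {
  uint8_t block[8 * 8];
  int16_t coeff[64];
  int i;
  for (i = 0; i < 64; ++i) {
    block[i] = (uint8_t)(i * 3);             /* deterministic test pattern */
    coeff[i] = (int16_t)((i & 1) ? -i : i);
  }
  printf("avg_8x8 = %u\n", avg_8x8_ref(block, 8));  /* prints 95 */
  printf("satd    = %d\n", satd_ref(coeff, 64));    /* prints 2016 */
  return 0;
}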