diff options
Diffstat (limited to 'vp9/encoder')
-rw-r--r-- | vp9/encoder/arm/neon/vp9_subtract_neon.c | 81 | ||||
-rw-r--r-- | vp9/encoder/mips/msa/vp9_subtract_msa.c | 264 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodemb.c | 60 | ||||
-rw-r--r-- | vp9/encoder/vp9_rdopt.c | 11 | ||||
-rw-r--r-- | vp9/encoder/x86/vp9_subtract_sse2.asm | 127 |
5 files changed, 17 insertions, 526 deletions
diff --git a/vp9/encoder/arm/neon/vp9_subtract_neon.c b/vp9/encoder/arm/neon/vp9_subtract_neon.c deleted file mode 100644 index b4bf567db..000000000 --- a/vp9/encoder/arm/neon/vp9_subtract_neon.c +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> -#include "./vp9_rtcd.h" -#include "./vpx_config.h" - -#include "vpx/vpx_integer.h" - -void vp9_subtract_block_neon(int rows, int cols, - int16_t *diff, ptrdiff_t diff_stride, - const uint8_t *src, ptrdiff_t src_stride, - const uint8_t *pred, ptrdiff_t pred_stride) { - int r, c; - - if (cols > 16) { - for (r = 0; r < rows; ++r) { - for (c = 0; c < cols; c += 32) { - const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); - const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); - const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); - const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); - const uint16x8_t v_diff_lo_00 = vsubl_u8(vget_low_u8(v_src_00), - vget_low_u8(v_pred_00)); - const uint16x8_t v_diff_hi_00 = vsubl_u8(vget_high_u8(v_src_00), - vget_high_u8(v_pred_00)); - const uint16x8_t v_diff_lo_16 = vsubl_u8(vget_low_u8(v_src_16), - vget_low_u8(v_pred_16)); - const uint16x8_t v_diff_hi_16 = vsubl_u8(vget_high_u8(v_src_16), - vget_high_u8(v_pred_16)); - vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); - vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); - vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); - vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); - } - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } - } else if (cols > 8) { - for (r = 0; r < rows; ++r) { - const uint8x16_t v_src = vld1q_u8(&src[0]); - const uint8x16_t v_pred = vld1q_u8(&pred[0]); - const uint16x8_t v_diff_lo = vsubl_u8(vget_low_u8(v_src), - vget_low_u8(v_pred)); - const uint16x8_t v_diff_hi = vsubl_u8(vget_high_u8(v_src), - vget_high_u8(v_pred)); - vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); - vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } - } else if (cols > 4) { - for (r = 0; r < rows; ++r) { - const uint8x8_t v_src = vld1_u8(&src[0]); - const uint8x8_t v_pred = vld1_u8(&pred[0]); - const uint16x8_t v_diff = vsubl_u8(v_src, v_pred); - vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } - } else { - for (r = 0; r < rows; ++r) { - for (c = 0; c < cols; ++c) - diff[c] = src[c] - pred[c]; - - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } - } -} diff --git a/vp9/encoder/mips/msa/vp9_subtract_msa.c b/vp9/encoder/mips/msa/vp9_subtract_msa.c deleted file mode 100644 index 1b8b694ce..000000000 --- a/vp9/encoder/mips/msa/vp9_subtract_msa.c +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vp9_rtcd.h" -#include "vp9/common/mips/msa/vp9_macros_msa.h" - -static void sub_blk_4x4_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *pred_ptr, int32_t pred_stride, - int16_t *diff_ptr, int32_t diff_stride) { - uint32_t src0, src1, src2, src3; - uint32_t pred0, pred1, pred2, pred3; - v16i8 src = { 0 }; - v16i8 pred = { 0 }; - v16u8 src_l0, src_l1; - v8i16 diff0, diff1; - - LW4(src_ptr, src_stride, src0, src1, src2, src3); - LW4(pred_ptr, pred_stride, pred0, pred1, pred2, pred3); - INSERT_W4_SB(src0, src1, src2, src3, src); - INSERT_W4_SB(pred0, pred1, pred2, pred3, pred); - ILVRL_B2_UB(src, pred, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST8x4_UB(diff0, diff1, diff_ptr, (2 * diff_stride)); -} - -static void sub_blk_8x8_msa(const uint8_t *src_ptr, int32_t src_stride, - const uint8_t *pred_ptr, int32_t pred_stride, - int16_t *diff_ptr, int32_t diff_stride) { - uint32_t loop_cnt; - uint64_t src0, src1, pred0, pred1; - v16i8 src = { 0 }; - v16i8 pred = { 0 }; - v16u8 src_l0, src_l1; - v8i16 diff0, diff1; - - for (loop_cnt = 4; loop_cnt--;) { - LD2(src_ptr, src_stride, src0, src1); - src_ptr += (2 * src_stride); - LD2(pred_ptr, pred_stride, pred0, pred1); - pred_ptr += (2 * pred_stride); - - INSERT_D2_SB(src0, src1, src); - INSERT_D2_SB(pred0, pred1, pred); - ILVRL_B2_UB(src, pred, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff_ptr, diff_stride); - diff_ptr += (2 * diff_stride); - } -} - -static void sub_blk_16x16_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *pred, int32_t pred_stride, - int16_t *diff, int32_t diff_stride) { - int8_t count; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7; - v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; - v16u8 src_l0, src_l1; - v8i16 diff0, diff1; - - for (count = 2; count--;) { - LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); - src += (8 * src_stride); - - LD_SB8(pred, pred_stride, - pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7); - pred += (8 * pred_stride); - - ILVRL_B2_UB(src0, pred0, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src1, pred1, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src2, pred2, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src3, pred3, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src4, pred4, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src5, pred5, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src6, pred6, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - - ILVRL_B2_UB(src7, pred7, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - diff += diff_stride; - } -} - -static void sub_blk_32x32_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *pred, int32_t pred_stride, - int16_t *diff, int32_t diff_stride) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7; - v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; - v16u8 src_l0, src_l1; - v8i16 diff0, diff1; - - for (loop_cnt = 8; loop_cnt--;) { - LD_SB2(src, 16, src0, src1); - src += src_stride; - LD_SB2(src, 16, src2, src3); - src += src_stride; - LD_SB2(src, 16, src4, src5); - src += src_stride; - LD_SB2(src, 16, src6, src7); - src += src_stride; - - LD_SB2(pred, 16, pred0, pred1); - pred += pred_stride; - LD_SB2(pred, 16, pred2, pred3); - pred += pred_stride; - LD_SB2(pred, 16, pred4, pred5); - pred += pred_stride; - LD_SB2(pred, 16, pred6, pred7); - pred += pred_stride; - - ILVRL_B2_UB(src0, pred0, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - ILVRL_B2_UB(src1, pred1, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 16, 8); - diff += diff_stride; - - ILVRL_B2_UB(src2, pred2, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - ILVRL_B2_UB(src3, pred3, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 16, 8); - diff += diff_stride; - - ILVRL_B2_UB(src4, pred4, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - ILVRL_B2_UB(src5, pred5, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 16, 8); - diff += diff_stride; - - ILVRL_B2_UB(src6, pred6, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - ILVRL_B2_UB(src7, pred7, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 16, 8); - diff += diff_stride; - } -} - -static void sub_blk_64x64_msa(const uint8_t *src, int32_t src_stride, - const uint8_t *pred, int32_t pred_stride, - int16_t *diff, int32_t diff_stride) { - uint32_t loop_cnt; - v16i8 src0, src1, src2, src3, src4, src5, src6, src7; - v16i8 pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7; - v16u8 src_l0, src_l1; - v8i16 diff0, diff1; - - for (loop_cnt = 32; loop_cnt--;) { - LD_SB4(src, 16, src0, src1, src2, src3); - src += src_stride; - LD_SB4(src, 16, src4, src5, src6, src7); - src += src_stride; - - LD_SB4(pred, 16, pred0, pred1, pred2, pred3); - pred += pred_stride; - LD_SB4(pred, 16, pred4, pred5, pred6, pred7); - pred += pred_stride; - - ILVRL_B2_UB(src0, pred0, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - ILVRL_B2_UB(src1, pred1, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 16, 8); - ILVRL_B2_UB(src2, pred2, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 32, 8); - ILVRL_B2_UB(src3, pred3, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 48, 8); - diff += diff_stride; - - ILVRL_B2_UB(src4, pred4, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff, 8); - ILVRL_B2_UB(src5, pred5, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 16, 8); - ILVRL_B2_UB(src6, pred6, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 32, 8); - ILVRL_B2_UB(src7, pred7, src_l0, src_l1); - HSUB_UB2_SH(src_l0, src_l1, diff0, diff1); - ST_SH2(diff0, diff1, diff + 48, 8); - diff += diff_stride; - } -} - -void vp9_subtract_block_msa(int32_t rows, int32_t cols, - int16_t *diff_ptr, ptrdiff_t diff_stride, - const uint8_t *src_ptr, ptrdiff_t src_stride, - const uint8_t *pred_ptr, ptrdiff_t pred_stride) { - if (rows == cols) { - switch (rows) { - case 4: - sub_blk_4x4_msa(src_ptr, src_stride, pred_ptr, pred_stride, - diff_ptr, diff_stride); - break; - case 8: - sub_blk_8x8_msa(src_ptr, src_stride, pred_ptr, pred_stride, - diff_ptr, diff_stride); - break; - case 16: - sub_blk_16x16_msa(src_ptr, src_stride, pred_ptr, pred_stride, - diff_ptr, diff_stride); - break; - case 32: - sub_blk_32x32_msa(src_ptr, src_stride, pred_ptr, pred_stride, - diff_ptr, diff_stride); - break; - case 64: - sub_blk_64x64_msa(src_ptr, src_stride, pred_ptr, pred_stride, - diff_ptr, diff_stride); - break; - default: - vp9_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, - src_stride, pred_ptr, pred_stride); - break; - } - } else { - vp9_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride, - pred_ptr, pred_stride); - } -} diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 2829365e5..313094140 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -11,6 +11,7 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" @@ -31,45 +32,6 @@ struct optimize_ctx { ENTROPY_CONTEXT tl[MAX_MB_PLANE][16]; }; -void vp9_subtract_block_c(int rows, int cols, - int16_t *diff, ptrdiff_t diff_stride, - const uint8_t *src, ptrdiff_t src_stride, - const uint8_t *pred, ptrdiff_t pred_stride) { - int r, c; - - for (r = 0; r < rows; r++) { - for (c = 0; c < cols; c++) - diff[c] = src[c] - pred[c]; - - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } -} - -#if CONFIG_VP9_HIGHBITDEPTH -void vp9_highbd_subtract_block_c(int rows, int cols, - int16_t *diff, ptrdiff_t diff_stride, - const uint8_t *src8, ptrdiff_t src_stride, - const uint8_t *pred8, ptrdiff_t pred_stride, - int bd) { - int r, c; - uint16_t *src = CONVERT_TO_SHORTPTR(src8); - uint16_t *pred = CONVERT_TO_SHORTPTR(pred8); - (void) bd; - - for (r = 0; r < rows; r++) { - for (c = 0; c < cols; c++) { - diff[c] = src[c] - pred[c]; - } - - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH - void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane]; @@ -79,13 +41,13 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) { #if CONFIG_VP9_HIGHBITDEPTH if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, + vpx_highbd_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, x->e_mbd.bd); return; } #endif // CONFIG_VP9_HIGHBITDEPTH - vp9_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, + vpx_subtract_block(bh, bw, p->src_diff, bw, p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride); } @@ -838,7 +800,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_highbd_subtract_block(32, 32, src_diff, diff_stride, + vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vp9_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, @@ -859,7 +821,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_highbd_subtract_block(16, 16, src_diff, diff_stride, + vpx_highbd_subtract_block(16, 16, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type); vp9_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, @@ -881,7 +843,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_highbd_subtract_block(8, 8, src_diff, diff_stride, + vpx_highbd_subtract_block(8, 8, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type); vp9_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, @@ -904,7 +866,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_highbd_subtract_block(4, 4, src_diff, diff_stride, + vpx_highbd_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); if (tx_type != DCT_DCT) vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type); @@ -946,7 +908,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_subtract_block(32, 32, src_diff, diff_stride, + vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst, dst_stride); fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride); vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, @@ -966,7 +928,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_subtract_block(16, 16, src_diff, diff_stride, + vpx_subtract_block(16, 16, src_diff, diff_stride, src, src_stride, dst, dst_stride); vp9_fht16x16(src_diff, coeff, diff_stride, tx_type); vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, @@ -986,7 +948,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_subtract_block(8, 8, src_diff, diff_stride, + vpx_subtract_block(8, 8, src_diff, diff_stride, src, src_stride, dst, dst_stride); vp9_fht8x8(src_diff, coeff, diff_stride, tx_type); vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant, @@ -1007,7 +969,7 @@ void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize, dst, dst_stride, i, j, plane); if (!x->skip_recode) { - vp9_subtract_block(4, 4, src_diff, diff_stride, + vpx_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, dst_stride); if (tx_type != DCT_DCT) vp9_fht4x4(src_diff, coeff, diff_stride, tx_type); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index e6b7f193a..dc054f0c1 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -12,6 +12,7 @@ #include <math.h> #include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" @@ -832,7 +833,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, x->skip_encode ? src : dst, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, idx, idy, 0); - vp9_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride, + vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride, xd->bd); if (xd->lossless) { const scan_order *so = &vp9_default_scan_orders[TX_4X4]; @@ -932,7 +933,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, x->skip_encode ? src : dst, x->skip_encode ? src_stride : dst_stride, dst, dst_stride, idx, idy, 0); - vp9_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride); + vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride); if (xd->lossless) { const scan_order *so = &vp9_default_scan_orders[TX_4X4]; @@ -1394,16 +1395,16 @@ static int64_t encode_inter_mb_segment(VP9_COMP *cpi, #if CONFIG_VP9_HIGHBITDEPTH if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { - vp9_highbd_subtract_block( + vpx_highbd_subtract_block( height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8, src, p->src.stride, dst, pd->dst.stride, xd->bd); } else { - vp9_subtract_block( + vpx_subtract_block( height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8, src, p->src.stride, dst, pd->dst.stride); } #else - vp9_subtract_block(height, width, + vpx_subtract_block(height, width, vp9_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff), 8, src, p->src.stride, dst, pd->dst.stride); #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/x86/vp9_subtract_sse2.asm b/vp9/encoder/x86/vp9_subtract_sse2.asm deleted file mode 100644 index 982408083..000000000 --- a/vp9/encoder/x86/vp9_subtract_sse2.asm +++ /dev/null @@ -1,127 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION .text - -; void vp9_subtract_block(int rows, int cols, -; int16_t *diff, ptrdiff_t diff_stride, -; const uint8_t *src, ptrdiff_t src_stride, -; const uint8_t *pred, ptrdiff_t pred_stride) - -INIT_XMM sse2 -cglobal subtract_block, 7, 7, 8, \ - rows, cols, diff, diff_stride, src, src_stride, \ - pred, pred_stride -%define pred_str colsq - pxor m7, m7 ; dedicated zero register - cmp colsd, 4 - je .case_4 - cmp colsd, 8 - je .case_8 - cmp colsd, 16 - je .case_16 - cmp colsd, 32 - je .case_32 - -%macro loop16 6 - mova m0, [srcq+%1] - mova m4, [srcq+%2] - mova m1, [predq+%3] - mova m5, [predq+%4] - punpckhbw m2, m0, m7 - punpckhbw m3, m1, m7 - punpcklbw m0, m7 - punpcklbw m1, m7 - psubw m2, m3 - psubw m0, m1 - punpckhbw m1, m4, m7 - punpckhbw m3, m5, m7 - punpcklbw m4, m7 - punpcklbw m5, m7 - psubw m1, m3 - psubw m4, m5 - mova [diffq+mmsize*0+%5], m0 - mova [diffq+mmsize*1+%5], m2 - mova [diffq+mmsize*0+%6], m4 - mova [diffq+mmsize*1+%6], m1 -%endmacro - - mov pred_str, pred_stridemp -.loop_64: - loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize - loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize - lea diffq, [diffq+diff_strideq*2] - add predq, pred_str - add srcq, src_strideq - dec rowsd - jg .loop_64 - RET - -.case_32: - mov pred_str, pred_stridemp -.loop_32: - loop16 0, mmsize, 0, mmsize, 0, 2*mmsize - lea diffq, [diffq+diff_strideq*2] - add predq, pred_str - add srcq, src_strideq - dec rowsd - jg .loop_32 - RET - -.case_16: - mov pred_str, pred_stridemp -.loop_16: - loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2 - lea diffq, [diffq+diff_strideq*4] - lea predq, [predq+pred_str*2] - lea srcq, [srcq+src_strideq*2] - sub rowsd, 2 - jg .loop_16 - RET - -%macro loop_h 0 - movh m0, [srcq] - movh m2, [srcq+src_strideq] - movh m1, [predq] - movh m3, [predq+pred_str] - punpcklbw m0, m7 - punpcklbw m1, m7 - punpcklbw m2, m7 - punpcklbw m3, m7 - psubw m0, m1 - psubw m2, m3 - mova [diffq], m0 - mova [diffq+diff_strideq*2], m2 -%endmacro - -.case_8: - mov pred_str, pred_stridemp -.loop_8: - loop_h - lea diffq, [diffq+diff_strideq*4] - lea srcq, [srcq+src_strideq*2] - lea predq, [predq+pred_str*2] - sub rowsd, 2 - jg .loop_8 - RET - -INIT_MMX -.case_4: - mov pred_str, pred_stridemp -.loop_4: - loop_h - lea diffq, [diffq+diff_strideq*4] - lea srcq, [srcq+src_strideq*2] - lea predq, [predq+pred_str*2] - sub rowsd, 2 - jg .loop_4 - RET |