diff options
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/arm/neon/vp9_reconintra_neon.c | 822 | ||||
-rw-r--r-- | vp9/common/arm/neon/vp9_reconintra_neon_asm.asm | 630 | ||||
-rw-r--r-- | vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c | 332 | ||||
-rw-r--r-- | vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c | 232 | ||||
-rw-r--r-- | vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c | 610 | ||||
-rw-r--r-- | vp9/common/mips/msa/vp9_intra_predict_msa.c | 737 | ||||
-rw-r--r-- | vp9/common/vp9_common.h | 14 | ||||
-rw-r--r-- | vp9/common/vp9_reconintra.c | 679 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.pl | 318 | ||||
-rw-r--r-- | vp9/common/x86/vp9_high_intrapred_sse2.asm | 476 | ||||
-rw-r--r-- | vp9/common/x86/vp9_intrapred_sse2.asm | 667 | ||||
-rw-r--r-- | vp9/common/x86/vp9_intrapred_ssse3.asm | 1036 | ||||
-rw-r--r-- | vp9/vp9_common.mk | 12 |
13 files changed, 1 insertions, 6564 deletions
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c deleted file mode 100644 index 92706bf2c..000000000 --- a/vp9/common/arm/neon/vp9_reconintra_neon.c +++ /dev/null @@ -1,822 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> - -#include "./vp9_rtcd.h" -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" - -//------------------------------------------------------------------------------ -// DC 4x4 - -// 'do_above' and 'do_left' facilitate branch removal when inlined. -static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x8_t A = vld1_u8(above); // top row - const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top - const uint16x4_t p1 = vpadd_u16(p0, p0); - sum_top = vcombine_u16(p1, p1); - } - - if (do_left) { - const uint8x8_t L = vld1_u8(left); // left border - const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left - const uint16x4_t p1 = vpadd_u16(p0, p0); - sum_left = vcombine_u16(p1, p1); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 3); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 2); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 2); - } else { - dc0 = vdup_n_u8(0x80); - } - - { - const uint8x8_t dc = vdup_lane_u8(dc0, 0); - int i; - for (i = 0; i < 4; ++i) { - vst1_lane_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc), 0); - } - } -} - -void vp9_dc_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_4x4(dst, stride, above, left, 1, 1); -} - -void vp9_dc_left_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - dc_4x4(dst, stride, NULL, left, 0, 1); -} - -void vp9_dc_top_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - dc_4x4(dst, stride, above, NULL, 1, 0); -} - -void vp9_dc_128_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - dc_4x4(dst, stride, NULL, NULL, 0, 0); -} - -//------------------------------------------------------------------------------ -// DC 8x8 - -// 'do_above' and 'do_left' facilitate branch removal when inlined. -static INLINE void dc_8x8(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x8_t A = vld1_u8(above); // top row - const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top - const uint16x4_t p1 = vpadd_u16(p0, p0); - const uint16x4_t p2 = vpadd_u16(p1, p1); - sum_top = vcombine_u16(p2, p2); - } - - if (do_left) { - const uint8x8_t L = vld1_u8(left); // left border - const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left - const uint16x4_t p1 = vpadd_u16(p0, p0); - const uint16x4_t p2 = vpadd_u16(p1, p1); - sum_left = vcombine_u16(p2, p2); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 4); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 3); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 3); - } else { - dc0 = vdup_n_u8(0x80); - } - - { - const uint8x8_t dc = vdup_lane_u8(dc0, 0); - int i; - for (i = 0; i < 8; ++i) { - vst1_u32((uint32_t*)(dst + i * stride), vreinterpret_u32_u8(dc)); - } - } -} - -void vp9_dc_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_8x8(dst, stride, above, left, 1, 1); -} - -void vp9_dc_left_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - dc_8x8(dst, stride, NULL, left, 0, 1); -} - -void vp9_dc_top_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - dc_8x8(dst, stride, above, NULL, 1, 0); -} - -void vp9_dc_128_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - dc_8x8(dst, stride, NULL, NULL, 0, 0); -} - -//------------------------------------------------------------------------------ -// DC 16x16 - -// 'do_above' and 'do_left' facilitate branch removal when inlined. -static INLINE void dc_16x16(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x16_t A = vld1q_u8(above); // top row - const uint16x8_t p0 = vpaddlq_u8(A); // cascading summation of the top - const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - const uint16x4_t p2 = vpadd_u16(p1, p1); - const uint16x4_t p3 = vpadd_u16(p2, p2); - sum_top = vcombine_u16(p3, p3); - } - - if (do_left) { - const uint8x16_t L = vld1q_u8(left); // left row - const uint16x8_t p0 = vpaddlq_u8(L); // cascading summation of the left - const uint16x4_t p1 = vadd_u16(vget_low_u16(p0), vget_high_u16(p0)); - const uint16x4_t p2 = vpadd_u16(p1, p1); - const uint16x4_t p3 = vpadd_u16(p2, p2); - sum_left = vcombine_u16(p3, p3); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 5); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 4); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 4); - } else { - dc0 = vdup_n_u8(0x80); - } - - { - const uint8x16_t dc = vdupq_lane_u8(dc0, 0); - int i; - for (i = 0; i < 16; ++i) { - vst1q_u8(dst + i * stride, dc); - } - } -} - -void vp9_dc_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_16x16(dst, stride, above, left, 1, 1); -} - -void vp9_dc_left_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - dc_16x16(dst, stride, NULL, left, 0, 1); -} - -void vp9_dc_top_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)left; - dc_16x16(dst, stride, above, NULL, 1, 0); -} - -void vp9_dc_128_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - (void)left; - dc_16x16(dst, stride, NULL, NULL, 0, 0); -} - -//------------------------------------------------------------------------------ -// DC 32x32 - -// 'do_above' and 'do_left' facilitate branch removal when inlined. -static INLINE void dc_32x32(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left, - int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; - - if (do_above) { - const uint8x16_t A0 = vld1q_u8(above); // top row - const uint8x16_t A1 = vld1q_u8(above + 16); - const uint16x8_t p0 = vpaddlq_u8(A0); // cascading summation of the top - const uint16x8_t p1 = vpaddlq_u8(A1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - const uint16x4_t p4 = vpadd_u16(p3, p3); - const uint16x4_t p5 = vpadd_u16(p4, p4); - sum_top = vcombine_u16(p5, p5); - } - - if (do_left) { - const uint8x16_t L0 = vld1q_u8(left); // left row - const uint8x16_t L1 = vld1q_u8(left + 16); - const uint16x8_t p0 = vpaddlq_u8(L0); // cascading summation of the left - const uint16x8_t p1 = vpaddlq_u8(L1); - const uint16x8_t p2 = vaddq_u16(p0, p1); - const uint16x4_t p3 = vadd_u16(vget_low_u16(p2), vget_high_u16(p2)); - const uint16x4_t p4 = vpadd_u16(p3, p3); - const uint16x4_t p5 = vpadd_u16(p4, p4); - sum_left = vcombine_u16(p5, p5); - } - - if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 6); - } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 5); - } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 5); - } else { - dc0 = vdup_n_u8(0x80); - } - - { - const uint8x16_t dc = vdupq_lane_u8(dc0, 0); - int i; - for (i = 0; i < 32; ++i) { - vst1q_u8(dst + i * stride, dc); - vst1q_u8(dst + i * stride + 16, dc); - } - } -} - -void vp9_dc_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - dc_32x32(dst, stride, above, left, 1, 1); -} - -void vp9_dc_left_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - dc_32x32(dst, stride, NULL, left, 0, 1); -} - -void vp9_dc_top_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)left; - dc_32x32(dst, stride, above, NULL, 1, 0); -} - -void vp9_dc_128_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - (void)left; - dc_32x32(dst, stride, NULL, NULL, 0, 0); -} - -// ----------------------------------------------------------------------------- - -void vp9_d45_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const uint64x1_t A0 = vreinterpret_u64_u8(vld1_u8(above)); // top row - const uint64x1_t A1 = vshr_n_u64(A0, 8); - const uint64x1_t A2 = vshr_n_u64(A0, 16); - const uint8x8_t ABCDEFGH = vreinterpret_u8_u64(A0); - const uint8x8_t BCDEFGH0 = vreinterpret_u8_u64(A1); - const uint8x8_t CDEFGH00 = vreinterpret_u8_u64(A2); - const uint8x8_t avg1 = vhadd_u8(ABCDEFGH, CDEFGH00); - const uint8x8_t avg2 = vrhadd_u8(avg1, BCDEFGH0); - const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); - const uint32x2_t r0 = vreinterpret_u32_u8(avg2); - const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); - const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); - const uint32x2_t r3 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); - (void)left; - vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); - vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); - vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); - vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); - dst[3 * stride + 3] = above[7]; -} - -void vp9_d45_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - static const uint8_t shuffle1[8] = { 1, 2, 3, 4, 5, 6, 7, 7 }; - static const uint8_t shuffle2[8] = { 2, 3, 4, 5, 6, 7, 7, 7 }; - const uint8x8_t sh_12345677 = vld1_u8(shuffle1); - const uint8x8_t sh_23456777 = vld1_u8(shuffle2); - const uint8x8_t A0 = vld1_u8(above); // top row - const uint8x8_t A1 = vtbl1_u8(A0, sh_12345677); - const uint8x8_t A2 = vtbl1_u8(A0, sh_23456777); - const uint8x8_t avg1 = vhadd_u8(A0, A2); - uint8x8_t row = vrhadd_u8(avg1, A1); - int i; - (void)left; - for (i = 0; i < 7; ++i) { - vst1_u8(dst + i * stride, row); - row = vtbl1_u8(row, sh_12345677); - } - vst1_u8(dst + i * stride, row); -} - -void vp9_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const uint8x16_t A0 = vld1q_u8(above); // top row - const uint8x16_t above_right = vld1q_dup_u8(above + 15); - const uint8x16_t A1 = vextq_u8(A0, above_right, 1); - const uint8x16_t A2 = vextq_u8(A0, above_right, 2); - const uint8x16_t avg1 = vhaddq_u8(A0, A2); - uint8x16_t row = vrhaddq_u8(avg1, A1); - int i; - (void)left; - for (i = 0; i < 15; ++i) { - vst1q_u8(dst + i * stride, row); - row = vextq_u8(row, above_right, 1); - } - vst1q_u8(dst + i * stride, row); -} - -// ----------------------------------------------------------------------------- - -void vp9_d135_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const uint8x8_t XABCD_u8 = vld1_u8(above - 1); - const uint64x1_t XABCD = vreinterpret_u64_u8(XABCD_u8); - const uint64x1_t ____XABC = vshl_n_u64(XABCD, 32); - const uint32x2_t zero = vdup_n_u32(0); - const uint32x2_t IJKL = vld1_lane_u32((const uint32_t *)left, zero, 0); - const uint8x8_t IJKL_u8 = vreinterpret_u8_u32(IJKL); - const uint64x1_t LKJI____ = vreinterpret_u64_u8(vrev32_u8(IJKL_u8)); - const uint64x1_t LKJIXABC = vorr_u64(LKJI____, ____XABC); - const uint8x8_t KJIXABC_ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 8)); - const uint8x8_t JIXABC__ = vreinterpret_u8_u64(vshr_n_u64(LKJIXABC, 16)); - const uint8_t D = vget_lane_u8(XABCD_u8, 4); - const uint8x8_t JIXABCD_ = vset_lane_u8(D, JIXABC__, 6); - const uint8x8_t LKJIXABC_u8 = vreinterpret_u8_u64(LKJIXABC); - const uint8x8_t avg1 = vhadd_u8(JIXABCD_, LKJIXABC_u8); - const uint8x8_t avg2 = vrhadd_u8(avg1, KJIXABC_); - const uint64x1_t avg2_u64 = vreinterpret_u64_u8(avg2); - const uint32x2_t r3 = vreinterpret_u32_u8(avg2); - const uint32x2_t r2 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 8)); - const uint32x2_t r1 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 16)); - const uint32x2_t r0 = vreinterpret_u32_u64(vshr_n_u64(avg2_u64, 24)); - vst1_lane_u32((uint32_t *)(dst + 0 * stride), r0, 0); - vst1_lane_u32((uint32_t *)(dst + 1 * stride), r1, 0); - vst1_lane_u32((uint32_t *)(dst + 2 * stride), r2, 0); - vst1_lane_u32((uint32_t *)(dst + 3 * stride), r3, 0); -} - -#if !HAVE_NEON_ASM - -void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int i; - uint32x2_t d0u32 = vdup_n_u32(0); - (void)left; - - d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0); - for (i = 0; i < 4; i++, dst += stride) - vst1_lane_u32((uint32_t *)dst, d0u32, 0); -} - -void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int i; - uint8x8_t d0u8 = vdup_n_u8(0); - (void)left; - - d0u8 = vld1_u8(above); - for (i = 0; i < 8; i++, dst += stride) - vst1_u8(dst, d0u8); -} - -void vp9_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int i; - uint8x16_t q0u8 = vdupq_n_u8(0); - (void)left; - - q0u8 = vld1q_u8(above); - for (i = 0; i < 16; i++, dst += stride) - vst1q_u8(dst, q0u8); -} - -void vp9_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int i; - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); - (void)left; - - q0u8 = vld1q_u8(above); - q1u8 = vld1q_u8(above + 16); - for (i = 0; i < 32; i++, dst += stride) { - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q1u8); - } -} - -void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d1u32 = vdup_n_u32(0); - (void)above; - - d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0); - - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); -} - -void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - uint8x8_t d0u8 = vdup_n_u8(0); - uint64x1_t d1u64 = vdup_n_u64(0); - (void)above; - - d1u64 = vld1_u64((const uint64_t *)left); - - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6); - vst1_u8(dst, d0u8); - dst += stride; - d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7); - vst1_u8(dst, d0u8); -} - -void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int j; - uint8x8_t d2u8 = vdup_n_u8(0); - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); - (void)above; - - q1u8 = vld1q_u8(left); - d2u8 = vget_low_u8(q1u8); - for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { - q0u8 = vdupq_lane_u8(d2u8, 0); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 1); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 2); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 3); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 4); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 5); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 6); - vst1q_u8(dst, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 7); - vst1q_u8(dst, q0u8); - dst += stride; - } -} - -void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int j, k; - uint8x8_t d2u8 = vdup_n_u8(0); - uint8x16_t q0u8 = vdupq_n_u8(0); - uint8x16_t q1u8 = vdupq_n_u8(0); - (void)above; - - for (k = 0; k < 2; k++, left += 16) { - q1u8 = vld1q_u8(left); - d2u8 = vget_low_u8(q1u8); - for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) { - q0u8 = vdupq_lane_u8(d2u8, 0); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 1); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 2); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 3); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 4); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 5); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 6); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - q0u8 = vdupq_lane_u8(d2u8, 7); - vst1q_u8(dst, q0u8); - vst1q_u8(dst + 16, q0u8); - dst += stride; - } - } -} - -void vp9_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int i; - uint16x8_t q1u16, q3u16; - int16x8_t q1s16; - uint8x8_t d0u8 = vdup_n_u8(0); - uint32x2_t d2u32 = vdup_n_u32(0); - - d0u8 = vld1_dup_u8(above - 1); - d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0); - q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8); - for (i = 0; i < 4; i++, dst += stride) { - q1u16 = vdupq_n_u16((uint16_t)left[i]); - q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16), - vreinterpretq_s16_u16(q3u16)); - d0u8 = vqmovun_s16(q1s16); - vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0); - } -} - -void vp9_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int j; - uint16x8_t q0u16, q3u16, q10u16; - int16x8_t q0s16; - uint16x4_t d20u16; - uint8x8_t d0u8, d2u8, d30u8; - - d0u8 = vld1_dup_u8(above - 1); - d30u8 = vld1_u8(left); - d2u8 = vld1_u8(above); - q10u16 = vmovl_u8(d30u8); - q3u16 = vsubl_u8(d2u8, d0u8); - d20u16 = vget_low_u16(q10u16); - for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 1); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 2); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - q0u16 = vdupq_lane_u16(d20u16, 3); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16), - vreinterpretq_s16_u16(q0u16)); - d0u8 = vqmovun_s16(q0s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8)); - dst += stride; - } -} - -void vp9_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int j, k; - uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16; - uint8x16_t q0u8, q1u8; - int16x8_t q0s16, q1s16, q8s16, q11s16; - uint16x4_t d20u16; - uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8; - - q0u8 = vld1q_dup_u8(above - 1); - q1u8 = vld1q_u8(above); - q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); - q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); - for (k = 0; k < 2; k++, left += 8) { - d18u8 = vld1_u8(left); - q10u16 = vmovl_u8(d18u8); - d20u16 = vget_low_u16(q10u16); - for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) { - q0u16 = vdupq_lane_u16(d20u16, 0); - q8u16 = vdupq_lane_u16(d20u16, 1); - q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q2u16)); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q3u16)); - q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q2u16)); - q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d20u16, 2); - q8u16 = vdupq_lane_u16(d20u16, 3); - q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q2u16)); - q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q3u16)); - q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q2u16)); - q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16), - vreinterpretq_s16_u16(q3u16)); - d2u8 = vqmovun_s16(q1s16); - d3u8 = vqmovun_s16(q0s16); - d22u8 = vqmovun_s16(q11s16); - d23u8 = vqmovun_s16(q8s16); - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8)); - dst += stride; - vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8)); - vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8)); - dst += stride; - } - } -} - -void vp9_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int j, k; - uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16; - uint8x16_t q0u8, q1u8, q2u8; - int16x8_t q12s16, q13s16, q14s16, q15s16; - uint16x4_t d6u16; - uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8; - - q0u8 = vld1q_dup_u8(above - 1); - q1u8 = vld1q_u8(above); - q2u8 = vld1q_u8(above + 16); - q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8)); - q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8)); - q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8)); - q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8)); - for (k = 0; k < 4; k++, left += 8) { - d26u8 = vld1_u8(left); - q3u16 = vmovl_u8(d26u8); - d6u16 = vget_low_u16(q3u16); - for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) { - q0u16 = vdupq_lane_u16(d6u16, 0); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 1); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 2); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - - q0u16 = vdupq_lane_u16(d6u16, 3); - q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q8u16)); - q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q9u16)); - q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q10u16)); - q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16), - vreinterpretq_s16_u16(q11u16)); - d0u8 = vqmovun_s16(q12s16); - d1u8 = vqmovun_s16(q13s16); - d2u8 = vqmovun_s16(q14s16); - d3u8 = vqmovun_s16(q15s16); - q0u8 = vcombine_u8(d0u8, d1u8); - q1u8 = vcombine_u8(d2u8, d3u8); - vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8)); - vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8)); - dst += stride; - } - } -} -#endif // !HAVE_NEON_ASM diff --git a/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm b/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm deleted file mode 100644 index 14f574a50..000000000 --- a/vp9/common/arm/neon/vp9_reconintra_neon_asm.asm +++ /dev/null @@ -1,630 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vp9_v_predictor_4x4_neon| - EXPORT |vp9_v_predictor_8x8_neon| - EXPORT |vp9_v_predictor_16x16_neon| - EXPORT |vp9_v_predictor_32x32_neon| - EXPORT |vp9_h_predictor_4x4_neon| - EXPORT |vp9_h_predictor_8x8_neon| - EXPORT |vp9_h_predictor_16x16_neon| - EXPORT |vp9_h_predictor_32x32_neon| - EXPORT |vp9_tm_predictor_4x4_neon| - EXPORT |vp9_tm_predictor_8x8_neon| - EXPORT |vp9_tm_predictor_16x16_neon| - EXPORT |vp9_tm_predictor_32x32_neon| - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vp9_v_predictor_4x4_neon| PROC - vld1.32 {d0[0]}, [r2] - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d0[0]}, [r0], r1 - bx lr - ENDP ; |vp9_v_predictor_4x4_neon| - -;void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vp9_v_predictor_8x8_neon| PROC - vld1.8 {d0}, [r2] - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - vst1.8 {d0}, [r0], r1 - bx lr - ENDP ; |vp9_v_predictor_8x8_neon| - -;void vp9_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vp9_v_predictor_16x16_neon| PROC - vld1.8 {q0}, [r2] - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - vst1.8 {q0}, [r0], r1 - bx lr - ENDP ; |vp9_v_predictor_16x16_neon| - -;void vp9_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vp9_v_predictor_32x32_neon| PROC - vld1.8 {q0, q1}, [r2] - mov r2, #2 -loop_v - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - vst1.8 {q0, q1}, [r0], r1 - subs r2, r2, #1 - bgt loop_v - bx lr - ENDP ; |vp9_v_predictor_32x32_neon| - -;void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vp9_h_predictor_4x4_neon| PROC - vld1.32 {d1[0]}, [r3] - vdup.8 d0, d1[0] - vst1.32 {d0[0]}, [r0], r1 - vdup.8 d0, d1[1] - vst1.32 {d0[0]}, [r0], r1 - vdup.8 d0, d1[2] - vst1.32 {d0[0]}, [r0], r1 - vdup.8 d0, d1[3] - vst1.32 {d0[0]}, [r0], r1 - bx lr - ENDP ; |vp9_h_predictor_4x4_neon| - -;void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vp9_h_predictor_8x8_neon| PROC - vld1.64 {d1}, [r3] - vdup.8 d0, d1[0] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[1] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[2] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[3] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[4] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[5] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[6] - vst1.64 {d0}, [r0], r1 - vdup.8 d0, d1[7] - vst1.64 {d0}, [r0], r1 - bx lr - ENDP ; |vp9_h_predictor_8x8_neon| - -;void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vp9_h_predictor_16x16_neon| PROC - vld1.8 {q1}, [r3] - vdup.8 q0, d2[0] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[1] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[2] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[3] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[4] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[5] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[6] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[7] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[0] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[1] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[2] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[3] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[4] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[5] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[6] - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[7] - vst1.8 {q0}, [r0], r1 - bx lr - ENDP ; |vp9_h_predictor_16x16_neon| - -;void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vp9_h_predictor_32x32_neon| PROC - sub r1, r1, #16 - mov r2, #2 -loop_h - vld1.8 {q1}, [r3]! - vdup.8 q0, d2[0] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[1] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[2] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[3] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[4] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[5] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[6] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d2[7] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[0] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[1] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[2] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[3] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[4] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[5] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[6] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - vdup.8 q0, d3[7] - vst1.8 {q0}, [r0]! - vst1.8 {q0}, [r0], r1 - subs r2, r2, #1 - bgt loop_h - bx lr - ENDP ; |vp9_h_predictor_32x32_neon| - -;void vp9_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vp9_tm_predictor_4x4_neon| PROC - ; Load ytop_left = above[-1]; - sub r12, r2, #1 - vld1.u8 {d0[]}, [r12] - - ; Load above 4 pixels - vld1.32 {d2[0]}, [r2] - - ; Compute above - ytop_left - vsubl.u8 q3, d2, d0 - - ; Load left row by row and compute left + (above - ytop_left) - ; 1st row and 2nd row - vld1.u8 {d2[]}, [r3]! - vld1.u8 {d4[]}, [r3]! - vmovl.u8 q1, d2 - vmovl.u8 q2, d4 - vadd.s16 q1, q1, q3 - vadd.s16 q2, q2, q3 - vqmovun.s16 d0, q1 - vqmovun.s16 d1, q2 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d1[0]}, [r0], r1 - - ; 3rd row and 4th row - vld1.u8 {d2[]}, [r3]! - vld1.u8 {d4[]}, [r3] - vmovl.u8 q1, d2 - vmovl.u8 q2, d4 - vadd.s16 q1, q1, q3 - vadd.s16 q2, q2, q3 - vqmovun.s16 d0, q1 - vqmovun.s16 d1, q2 - vst1.32 {d0[0]}, [r0], r1 - vst1.32 {d1[0]}, [r0], r1 - bx lr - ENDP ; |vp9_tm_predictor_4x4_neon| - -;void vp9_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vp9_tm_predictor_8x8_neon| PROC - ; Load ytop_left = above[-1]; - sub r12, r2, #1 - vld1.8 {d0[]}, [r12] - - ; preload 8 left - vld1.8 {d30}, [r3] - - ; Load above 8 pixels - vld1.64 {d2}, [r2] - - vmovl.u8 q10, d30 - - ; Compute above - ytop_left - vsubl.u8 q3, d2, d0 - - ; Load left row by row and compute left + (above - ytop_left) - ; 1st row and 2nd row - vdup.16 q0, d20[0] - vdup.16 q1, d20[1] - vadd.s16 q0, q3, q0 - vadd.s16 q1, q3, q1 - - ; 3rd row and 4th row - vdup.16 q8, d20[2] - vdup.16 q9, d20[3] - vadd.s16 q8, q3, q8 - vadd.s16 q9, q3, q9 - - vqmovun.s16 d0, q0 - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q8 - vqmovun.s16 d3, q9 - - vst1.64 {d0}, [r0], r1 - vst1.64 {d1}, [r0], r1 - vst1.64 {d2}, [r0], r1 - vst1.64 {d3}, [r0], r1 - - ; 5th row and 6th row - vdup.16 q0, d21[0] - vdup.16 q1, d21[1] - vadd.s16 q0, q3, q0 - vadd.s16 q1, q3, q1 - - ; 7th row and 8th row - vdup.16 q8, d21[2] - vdup.16 q9, d21[3] - vadd.s16 q8, q3, q8 - vadd.s16 q9, q3, q9 - - vqmovun.s16 d0, q0 - vqmovun.s16 d1, q1 - vqmovun.s16 d2, q8 - vqmovun.s16 d3, q9 - - vst1.64 {d0}, [r0], r1 - vst1.64 {d1}, [r0], r1 - vst1.64 {d2}, [r0], r1 - vst1.64 {d3}, [r0], r1 - - bx lr - ENDP ; |vp9_tm_predictor_8x8_neon| - -;void vp9_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vp9_tm_predictor_16x16_neon| PROC - ; Load ytop_left = above[-1]; - sub r12, r2, #1 - vld1.8 {d0[]}, [r12] - - ; Load above 8 pixels - vld1.8 {q1}, [r2] - - ; preload 8 left into r12 - vld1.8 {d18}, [r3]! - - ; Compute above - ytop_left - vsubl.u8 q2, d2, d0 - vsubl.u8 q3, d3, d0 - - vmovl.u8 q10, d18 - - ; Load left row by row and compute left + (above - ytop_left) - ; Process 8 rows in each single loop and loop 2 times to process 16 rows. - mov r2, #2 - -loop_16x16_neon - ; Process two rows. - vdup.16 q0, d20[0] - vdup.16 q8, d20[1] - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vdup.16 q0, d20[2] ; proload next 2 rows data - vdup.16 q8, d20[3] - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - ; Process two rows. - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vdup.16 q0, d21[0] ; proload next 2 rows data - vdup.16 q8, d21[1] - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vdup.16 q0, d21[2] ; proload next 2 rows data - vdup.16 q8, d21[3] - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - - vadd.s16 q1, q0, q2 - vadd.s16 q0, q0, q3 - vadd.s16 q11, q8, q2 - vadd.s16 q8, q8, q3 - vqmovun.s16 d2, q1 - vqmovun.s16 d3, q0 - vqmovun.s16 d22, q11 - vqmovun.s16 d23, q8 - vld1.8 {d18}, [r3]! ; preload 8 left into r12 - vmovl.u8 q10, d18 - vst1.64 {d2,d3}, [r0], r1 - vst1.64 {d22,d23}, [r0], r1 - - subs r2, r2, #1 - bgt loop_16x16_neon - - bx lr - ENDP ; |vp9_tm_predictor_16x16_neon| - -;void vp9_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride, -; const uint8_t *above, -; const uint8_t *left) -; r0 uint8_t *dst -; r1 ptrdiff_t y_stride -; r2 const uint8_t *above -; r3 const uint8_t *left - -|vp9_tm_predictor_32x32_neon| PROC - ; Load ytop_left = above[-1]; - sub r12, r2, #1 - vld1.8 {d0[]}, [r12] - - ; Load above 32 pixels - vld1.8 {q1}, [r2]! - vld1.8 {q2}, [r2] - - ; preload 8 left pixels - vld1.8 {d26}, [r3]! - - ; Compute above - ytop_left - vsubl.u8 q8, d2, d0 - vsubl.u8 q9, d3, d0 - vsubl.u8 q10, d4, d0 - vsubl.u8 q11, d5, d0 - - vmovl.u8 q3, d26 - - ; Load left row by row and compute left + (above - ytop_left) - ; Process 8 rows in each single loop and loop 4 times to process 32 rows. - mov r2, #4 - -loop_32x32_neon - ; Process two rows. - vdup.16 q0, d6[0] - vdup.16 q2, d6[1] - vadd.s16 q12, q0, q8 - vadd.s16 q13, q0, q9 - vadd.s16 q14, q0, q10 - vadd.s16 q15, q0, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vdup.16 q1, d6[2] - vdup.16 q2, d6[3] - vst1.64 {d24-d27}, [r0], r1 - - ; Process two rows. - vadd.s16 q12, q1, q8 - vadd.s16 q13, q1, q9 - vadd.s16 q14, q1, q10 - vadd.s16 q15, q1, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vdup.16 q0, d7[0] - vdup.16 q2, d7[1] - vst1.64 {d24-d27}, [r0], r1 - - ; Process two rows. - vadd.s16 q12, q0, q8 - vadd.s16 q13, q0, q9 - vadd.s16 q14, q0, q10 - vadd.s16 q15, q0, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vdup.16 q0, d7[2] - vdup.16 q2, d7[3] - vst1.64 {d24-d27}, [r0], r1 - - ; Process two rows. - vadd.s16 q12, q0, q8 - vadd.s16 q13, q0, q9 - vadd.s16 q14, q0, q10 - vadd.s16 q15, q0, q11 - vqmovun.s16 d0, q12 - vqmovun.s16 d1, q13 - vadd.s16 q12, q2, q8 - vadd.s16 q13, q2, q9 - vqmovun.s16 d2, q14 - vqmovun.s16 d3, q15 - vadd.s16 q14, q2, q10 - vadd.s16 q15, q2, q11 - vst1.64 {d0-d3}, [r0], r1 - vqmovun.s16 d24, q12 - vqmovun.s16 d25, q13 - vld1.8 {d0}, [r3]! ; preload 8 left pixels - vqmovun.s16 d26, q14 - vqmovun.s16 d27, q15 - vmovl.u8 q3, d0 - vst1.64 {d24-d27}, [r0], r1 - - subs r2, r2, #1 - bgt loop_32x32_neon - - bx lr - ENDP ; |vp9_tm_predictor_32x32_neon| - - END diff --git a/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c b/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c deleted file mode 100644 index b0dc496ae..000000000 --- a/vp9/common/mips/dspr2/vp9_intrapred16_dspr2.c +++ /dev/null @@ -1,332 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include <stdlib.h> - -#include "./vp9_rtcd.h" -#include "vp9/common/vp9_common.h" -#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" - -#if HAVE_DSPR2 -void vp9_h_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; - int32_t tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15, tmp16; - - __asm__ __volatile__ ( - "lb %[tmp1], (%[left]) \n\t" - "lb %[tmp2], 1(%[left]) \n\t" - "lb %[tmp3], 2(%[left]) \n\t" - "lb %[tmp4], 3(%[left]) \n\t" - "lb %[tmp5], 4(%[left]) \n\t" - "lb %[tmp6], 5(%[left]) \n\t" - "lb %[tmp7], 6(%[left]) \n\t" - "lb %[tmp8], 7(%[left]) \n\t" - "lb %[tmp9], 8(%[left]) \n\t" - "lb %[tmp10], 9(%[left]) \n\t" - "lb %[tmp11], 10(%[left]) \n\t" - "lb %[tmp12], 11(%[left]) \n\t" - "lb %[tmp13], 12(%[left]) \n\t" - "lb %[tmp14], 13(%[left]) \n\t" - "lb %[tmp15], 14(%[left]) \n\t" - "lb %[tmp16], 15(%[left]) \n\t" - - "replv.qb %[tmp1], %[tmp1] \n\t" - "replv.qb %[tmp2], %[tmp2] \n\t" - "replv.qb %[tmp3], %[tmp3] \n\t" - "replv.qb %[tmp4], %[tmp4] \n\t" - "replv.qb %[tmp5], %[tmp5] \n\t" - "replv.qb %[tmp6], %[tmp6] \n\t" - "replv.qb %[tmp7], %[tmp7] \n\t" - "replv.qb %[tmp8], %[tmp8] \n\t" - "replv.qb %[tmp9], %[tmp9] \n\t" - "replv.qb %[tmp10], %[tmp10] \n\t" - "replv.qb %[tmp11], %[tmp11] \n\t" - "replv.qb %[tmp12], %[tmp12] \n\t" - "replv.qb %[tmp13], %[tmp13] \n\t" - "replv.qb %[tmp14], %[tmp14] \n\t" - "replv.qb %[tmp15], %[tmp15] \n\t" - "replv.qb %[tmp16], %[tmp16] \n\t" - - "sw %[tmp1], (%[dst]) \n\t" - "sw %[tmp1], 4(%[dst]) \n\t" - "sw %[tmp1], 8(%[dst]) \n\t" - "sw %[tmp1], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp2], (%[dst]) \n\t" - "sw %[tmp2], 4(%[dst]) \n\t" - "sw %[tmp2], 8(%[dst]) \n\t" - "sw %[tmp2], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp3], (%[dst]) \n\t" - "sw %[tmp3], 4(%[dst]) \n\t" - "sw %[tmp3], 8(%[dst]) \n\t" - "sw %[tmp3], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp4], (%[dst]) \n\t" - "sw %[tmp4], 4(%[dst]) \n\t" - "sw %[tmp4], 8(%[dst]) \n\t" - "sw %[tmp4], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp5], (%[dst]) \n\t" - "sw %[tmp5], 4(%[dst]) \n\t" - "sw %[tmp5], 8(%[dst]) \n\t" - "sw %[tmp5], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp6], (%[dst]) \n\t" - "sw %[tmp6], 4(%[dst]) \n\t" - "sw %[tmp6], 8(%[dst]) \n\t" - "sw %[tmp6], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp7], (%[dst]) \n\t" - "sw %[tmp7], 4(%[dst]) \n\t" - "sw %[tmp7], 8(%[dst]) \n\t" - "sw %[tmp7], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp8], (%[dst]) \n\t" - "sw %[tmp8], 4(%[dst]) \n\t" - "sw %[tmp8], 8(%[dst]) \n\t" - "sw %[tmp8], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp9], (%[dst]) \n\t" - "sw %[tmp9], 4(%[dst]) \n\t" - "sw %[tmp9], 8(%[dst]) \n\t" - "sw %[tmp9], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp10], (%[dst]) \n\t" - "sw %[tmp10], 4(%[dst]) \n\t" - "sw %[tmp10], 8(%[dst]) \n\t" - "sw %[tmp10], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp11], (%[dst]) \n\t" - "sw %[tmp11], 4(%[dst]) \n\t" - "sw %[tmp11], 8(%[dst]) \n\t" - "sw %[tmp11], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp12], (%[dst]) \n\t" - "sw %[tmp12], 4(%[dst]) \n\t" - "sw %[tmp12], 8(%[dst]) \n\t" - "sw %[tmp12], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp13], (%[dst]) \n\t" - "sw %[tmp13], 4(%[dst]) \n\t" - "sw %[tmp13], 8(%[dst]) \n\t" - "sw %[tmp13], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp14], (%[dst]) \n\t" - "sw %[tmp14], 4(%[dst]) \n\t" - "sw %[tmp14], 8(%[dst]) \n\t" - "sw %[tmp14], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp15], (%[dst]) \n\t" - "sw %[tmp15], 4(%[dst]) \n\t" - "sw %[tmp15], 8(%[dst]) \n\t" - "sw %[tmp15], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp16], (%[dst]) \n\t" - "sw %[tmp16], 4(%[dst]) \n\t" - "sw %[tmp16], 8(%[dst]) \n\t" - "sw %[tmp16], 12(%[dst]) \n\t" - - : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), - [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4), - [tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7), - [tmp6] "=&r" (tmp6), [tmp8] "=&r" (tmp8), - [tmp9] "=&r" (tmp9), [tmp10] "=&r" (tmp10), - [tmp11] "=&r" (tmp11), [tmp12] "=&r" (tmp12), - [tmp13] "=&r" (tmp13), [tmp14] "=&r" (tmp14), - [tmp15] "=&r" (tmp15), [tmp16] "=&r" (tmp16) - : [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride) - ); -} - -void vp9_dc_predictor_16x16_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t expected_dc; - int32_t average; - int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; - int32_t above2, left2; - - __asm__ __volatile__ ( - "lw %[above1], (%[above]) \n\t" - "lw %[above2], 4(%[above]) \n\t" - "lw %[left1], (%[left]) \n\t" - "lw %[left2], 4(%[left]) \n\t" - - "preceu.ph.qbl %[above_l1], %[above1] \n\t" - "preceu.ph.qbr %[above_r1], %[above1] \n\t" - "preceu.ph.qbl %[left_l1], %[left1] \n\t" - "preceu.ph.qbr %[left_r1], %[left1] \n\t" - - "addu.ph %[average], %[above_r1], %[above_l1] \n\t" - "addu.ph %[average], %[average], %[left_l1] \n\t" - "addu.ph %[average], %[average], %[left_r1] \n\t" - - "preceu.ph.qbl %[above_l1], %[above2] \n\t" - "preceu.ph.qbr %[above_r1], %[above2] \n\t" - "preceu.ph.qbl %[left_l1], %[left2] \n\t" - "preceu.ph.qbr %[left_r1], %[left2] \n\t" - - "addu.ph %[average], %[average], %[above_l1] \n\t" - "addu.ph %[average], %[average], %[above_r1] \n\t" - "addu.ph %[average], %[average], %[left_l1] \n\t" - "addu.ph %[average], %[average], %[left_r1] \n\t" - - "lw %[above1], 8(%[above]) \n\t" - "lw %[above2], 12(%[above]) \n\t" - "lw %[left1], 8(%[left]) \n\t" - "lw %[left2], 12(%[left]) \n\t" - - "preceu.ph.qbl %[above_l1], %[above1] \n\t" - "preceu.ph.qbr %[above_r1], %[above1] \n\t" - "preceu.ph.qbl %[left_l1], %[left1] \n\t" - "preceu.ph.qbr %[left_r1], %[left1] \n\t" - - "addu.ph %[average], %[average], %[above_l1] \n\t" - "addu.ph %[average], %[average], %[above_r1] \n\t" - "addu.ph %[average], %[average], %[left_l1] \n\t" - "addu.ph %[average], %[average], %[left_r1] \n\t" - - "preceu.ph.qbl %[above_l1], %[above2] \n\t" - "preceu.ph.qbr %[above_r1], %[above2] \n\t" - "preceu.ph.qbl %[left_l1], %[left2] \n\t" - "preceu.ph.qbr %[left_r1], %[left2] \n\t" - - "addu.ph %[average], %[average], %[above_l1] \n\t" - "addu.ph %[average], %[average], %[above_r1] \n\t" - "addu.ph %[average], %[average], %[left_l1] \n\t" - "addu.ph %[average], %[average], %[left_r1] \n\t" - - "addiu %[average], %[average], 16 \n\t" - "srl %[tmp], %[average], 16 \n\t" - "addu.ph %[average], %[tmp], %[average] \n\t" - "srl %[expected_dc], %[average], 5 \n\t" - "replv.qb %[expected_dc], %[expected_dc] \n\t" - - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - "sw %[expected_dc], 8(%[dst]) \n\t" - "sw %[expected_dc], 12(%[dst]) \n\t" - - : [left1] "=&r" (left1), [above1] "=&r" (above1), - [left_l1] "=&r" (left_l1), [above_l1] "=&r" (above_l1), - [left_r1] "=&r" (left_r1), [above_r1] "=&r" (above_r1), - [above2] "=&r" (above2), [left2] "=&r" (left2), - [average] "=&r" (average), [tmp] "=&r" (tmp), - [expected_dc] "=&r" (expected_dc) - : [above] "r" (above), [left] "r" (left), - [dst] "r" (dst), [stride] "r" (stride) - ); -} -#endif // #if HAVE_DSPR2 diff --git a/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c b/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c deleted file mode 100644 index a53c62381..000000000 --- a/vp9/common/mips/dspr2/vp9_intrapred4_dspr2.c +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include <stdlib.h> - -#include "./vp9_rtcd.h" -#include "vp9/common/vp9_common.h" -#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" - -#if HAVE_DSPR2 -void vp9_h_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t tmp1, tmp2, tmp3, tmp4; - - __asm__ __volatile__ ( - "lb %[tmp1], (%[left]) \n\t" - "lb %[tmp2], 1(%[left]) \n\t" - "lb %[tmp3], 2(%[left]) \n\t" - "lb %[tmp4], 3(%[left]) \n\t" - "replv.qb %[tmp1], %[tmp1] \n\t" - "replv.qb %[tmp2], %[tmp2] \n\t" - "replv.qb %[tmp3], %[tmp3] \n\t" - "replv.qb %[tmp4], %[tmp4] \n\t" - "sw %[tmp1], (%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp2], (%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp3], (%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp4], (%[dst]) \n\t" - - : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), - [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4) - : [left] "r" (left), [dst] "r" (dst), [stride] "r" (stride) - ); -} - -void vp9_dc_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t expected_dc; - int32_t average; - int32_t tmp, above_c, above_l, above_r, left_c, left_r, left_l; - - __asm__ __volatile__ ( - "lw %[above_c], (%[above]) \n\t" - "lw %[left_c], (%[left]) \n\t" - - "preceu.ph.qbl %[above_l], %[above_c] \n\t" - "preceu.ph.qbr %[above_r], %[above_c] \n\t" - "preceu.ph.qbl %[left_l], %[left_c] \n\t" - "preceu.ph.qbr %[left_r], %[left_c] \n\t" - - "addu.ph %[average], %[above_r], %[above_l] \n\t" - "addu.ph %[average], %[average], %[left_l] \n\t" - "addu.ph %[average], %[average], %[left_r] \n\t" - "addiu %[average], %[average], 4 \n\t" - "srl %[tmp], %[average], 16 \n\t" - "addu.ph %[average], %[tmp], %[average] \n\t" - "srl %[expected_dc], %[average], 3 \n\t" - "replv.qb %[expected_dc], %[expected_dc] \n\t" - - "sw %[expected_dc], (%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - - : [above_c] "=&r" (above_c), [above_l] "=&r" (above_l), - [above_r] "=&r" (above_r), [left_c] "=&r" (left_c), - [left_l] "=&r" (left_l), [left_r] "=&r" (left_r), - [average] "=&r" (average), [tmp] "=&r" (tmp), - [expected_dc] "=&r" (expected_dc) - : [above] "r" (above), [left] "r" (left), - [dst] "r" (dst), [stride] "r" (stride) - ); -} - -void vp9_tm_predictor_4x4_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t abovel, abover; - int32_t left0, left1, left2, left3; - int32_t res0, res1; - int32_t resl; - int32_t resr; - int32_t top_left; - uint8_t *cm = vp9_ff_cropTbl; - - __asm__ __volatile__ ( - "ulw %[resl], (%[above]) \n\t" - - "lbu %[left0], (%[left]) \n\t" - "lbu %[left1], 1(%[left]) \n\t" - "lbu %[left2], 2(%[left]) \n\t" - "lbu %[left3], 3(%[left]) \n\t" - - "lbu %[top_left], -1(%[above]) \n\t" - - "preceu.ph.qbl %[abovel], %[resl] \n\t" - "preceu.ph.qbr %[abover], %[resl] \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "replv.ph %[left1], %[left1] \n\t" - "replv.ph %[left2], %[left2] \n\t" - "replv.ph %[left3], %[left3] \n\t" - - "replv.ph %[top_left], %[top_left] \n\t" - - "addu.ph %[resl], %[abovel], %[left0] \n\t" - "subu.ph %[resl], %[resl], %[top_left] \n\t" - - "addu.ph %[resr], %[abover], %[left0] \n\t" - "subu.ph %[resr], %[resr], %[top_left] \n\t" - - "sll %[res0], %[resr], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - - "sra %[res1], %[resr], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "sb %[res0], (%[dst]) \n\t" - - "sll %[res0], %[resl], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - - "sra %[res1], %[resl], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - - "addu.ph %[resl], %[abovel], %[left1] \n\t" - "subu.ph %[resl], %[resl], %[top_left] \n\t" - - "addu.ph %[resr], %[abover], %[left1] \n\t" - "subu.ph %[resr], %[resr], %[top_left] \n\t" - - "sb %[res0], 2(%[dst]) \n\t" - "sb %[res1], 3(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - - "sll %[res0], %[resr], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - - "sra %[res1], %[resr], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "sb %[res0], (%[dst]) \n\t" - - "sll %[res0], %[resl], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - - "sb %[res1], 1(%[dst]) \n\t" - "sra %[res1], %[resl], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - - "addu.ph %[resl], %[abovel], %[left2] \n\t" - "subu.ph %[resl], %[resl], %[top_left] \n\t" - - "addu.ph %[resr], %[abover], %[left2] \n\t" - "subu.ph %[resr], %[resr], %[top_left] \n\t" - - "sb %[res0], 2(%[dst]) \n\t" - "sb %[res1], 3(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - - "sll %[res0], %[resr], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - - - "sra %[res1], %[resr], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "sb %[res0], (%[dst]) \n\t" - - "sll %[res0], %[resl], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - - - "sb %[res1], 1(%[dst]) \n\t" - "sra %[res1], %[resl], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - - "addu.ph %[resl], %[abovel], %[left3] \n\t" - "subu.ph %[resl], %[resl], %[top_left] \n\t" - - "addu.ph %[resr], %[abover], %[left3] \n\t" - "subu.ph %[resr], %[resr], %[top_left] \n\t" - - "sb %[res0], 2(%[dst]) \n\t" - "sb %[res1], 3(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - - "sll %[res0], %[resr], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - - "sra %[res1], %[resr], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "sb %[res0], (%[dst]) \n\t" - - "sll %[res0], %[resl], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "lbux %[res0], %[res0](%[cm]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - - "sra %[res1], %[resl], 16 \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - - "sb %[res0], 2(%[dst]) \n\t" - "sb %[res1], 3(%[dst]) \n\t" - - : [abovel] "=&r" (abovel), [abover] "=&r" (abover), - [left0] "=&r" (left0), [left1] "=&r" (left1), [left2] "=&r" (left2), - [res0] "=&r" (res0), [res1] "=&r" (res1), [left3] "=&r" (left3), - [resl] "=&r" (resl), [resr] "=&r" (resr), [top_left] "=&r" (top_left) - : [above] "r" (above), [left] "r" (left), - [dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm) - ); -} -#endif // #if HAVE_DSPR2 diff --git a/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c b/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c deleted file mode 100644 index 40d93ae35..000000000 --- a/vp9/common/mips/dspr2/vp9_intrapred8_dspr2.c +++ /dev/null @@ -1,610 +0,0 @@ -/* - * Copyright (c) 2013 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include <stdlib.h> - -#include "./vp9_rtcd.h" -#include "vp9/common/vp9_common.h" -#include "vp9/common/mips/dspr2/vp9_common_dspr2.h" - -#if HAVE_DSPR2 -void vp9_h_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8; - - __asm__ __volatile__ ( - "lb %[tmp1], (%[left]) \n\t" - "lb %[tmp2], 1(%[left]) \n\t" - "lb %[tmp3], 2(%[left]) \n\t" - "lb %[tmp4], 3(%[left]) \n\t" - "lb %[tmp5], 4(%[left]) \n\t" - "lb %[tmp6], 5(%[left]) \n\t" - "lb %[tmp7], 6(%[left]) \n\t" - "lb %[tmp8], 7(%[left]) \n\t" - - "replv.qb %[tmp1], %[tmp1] \n\t" - "replv.qb %[tmp2], %[tmp2] \n\t" - "replv.qb %[tmp3], %[tmp3] \n\t" - "replv.qb %[tmp4], %[tmp4] \n\t" - "replv.qb %[tmp5], %[tmp5] \n\t" - "replv.qb %[tmp6], %[tmp6] \n\t" - "replv.qb %[tmp7], %[tmp7] \n\t" - "replv.qb %[tmp8], %[tmp8] \n\t" - - "sw %[tmp1], (%[dst]) \n\t" - "sw %[tmp1], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp2], (%[dst]) \n\t" - "sw %[tmp2], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp3], (%[dst]) \n\t" - "sw %[tmp3], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp4], (%[dst]) \n\t" - "sw %[tmp4], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp5], (%[dst]) \n\t" - "sw %[tmp5], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp6], (%[dst]) \n\t" - "sw %[tmp6], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp7], (%[dst]) \n\t" - "sw %[tmp7], 4(%[dst]) \n\t" - "add %[dst], %[dst], %[stride] \n\t" - "sw %[tmp8], (%[dst]) \n\t" - "sw %[tmp8], 4(%[dst]) \n\t" - - : [tmp1] "=&r" (tmp1), [tmp2] "=&r" (tmp2), - [tmp3] "=&r" (tmp3), [tmp4] "=&r" (tmp4), - [tmp5] "=&r" (tmp5), [tmp7] "=&r" (tmp7), - [tmp6] "=&r" (tmp6), [tmp8] "=&r" (tmp8) - : [left] "r" (left), [dst] "r" (dst), - [stride] "r" (stride) - ); -} - -void vp9_dc_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t expected_dc; - int32_t average; - int32_t tmp, above1, above_l1, above_r1, left1, left_r1, left_l1; - int32_t above2, above_l2, above_r2, left2, left_r2, left_l2; - - __asm__ __volatile__ ( - "lw %[above1], (%[above]) \n\t" - "lw %[above2], 4(%[above]) \n\t" - "lw %[left1], (%[left]) \n\t" - "lw %[left2], 4(%[left]) \n\t" - - "preceu.ph.qbl %[above_l1], %[above1] \n\t" - "preceu.ph.qbr %[above_r1], %[above1] \n\t" - "preceu.ph.qbl %[left_l1], %[left1] \n\t" - "preceu.ph.qbr %[left_r1], %[left1] \n\t" - - "preceu.ph.qbl %[above_l2], %[above2] \n\t" - "preceu.ph.qbr %[above_r2], %[above2] \n\t" - "preceu.ph.qbl %[left_l2], %[left2] \n\t" - "preceu.ph.qbr %[left_r2], %[left2] \n\t" - - "addu.ph %[average], %[above_r1], %[above_l1] \n\t" - "addu.ph %[average], %[average], %[left_l1] \n\t" - "addu.ph %[average], %[average], %[left_r1] \n\t" - - "addu.ph %[average], %[average], %[above_l2] \n\t" - "addu.ph %[average], %[average], %[above_r2] \n\t" - "addu.ph %[average], %[average], %[left_l2] \n\t" - "addu.ph %[average], %[average], %[left_r2] \n\t" - - "addiu %[average], %[average], 8 \n\t" - - "srl %[tmp], %[average], 16 \n\t" - "addu.ph %[average], %[tmp], %[average] \n\t" - "srl %[expected_dc], %[average], 4 \n\t" - "replv.qb %[expected_dc], %[expected_dc] \n\t" - - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - "add %[dst], %[dst], %[stride] \n\t" - "sw %[expected_dc], (%[dst]) \n\t" - "sw %[expected_dc], 4(%[dst]) \n\t" - - : [above1] "=&r" (above1), [above_l1] "=&r" (above_l1), - [above_r1] "=&r" (above_r1), [left1] "=&r" (left1), - [left_l1] "=&r" (left_l1), [left_r1] "=&r" (left_r1), - [above2] "=&r" (above2), [above_l2] "=&r" (above_l2), - [above_r2] "=&r" (above_r2), [left2] "=&r" (left2), - [left_l2] "=&r" (left_l2), [left_r2] "=&r" (left_r2), - [average] "=&r" (average), [tmp] "=&r" (tmp), - [expected_dc] "=&r" (expected_dc) - : [above] "r" (above), [left] "r" (left), [dst] "r" (dst), - [stride] "r" (stride) - ); -} - -void vp9_tm_predictor_8x8_dspr2(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - int32_t abovel, abover; - int32_t abovel_1, abover_1; - int32_t left0; - int32_t res0, res1, res2, res3; - int32_t reshw; - int32_t top_left; - uint8_t *cm = vp9_ff_cropTbl; - - __asm__ __volatile__ ( - "ulw %[reshw], (%[above]) \n\t" - "ulw %[top_left], 4(%[above]) \n\t" - - "lbu %[left0], (%[left]) \n\t" - - "preceu.ph.qbl %[abovel], %[reshw] \n\t" - "preceu.ph.qbr %[abover], %[reshw] \n\t" - "preceu.ph.qbl %[abovel_1], %[top_left] \n\t" - "preceu.ph.qbr %[abover_1], %[top_left] \n\t" - - "lbu %[top_left], -1(%[above]) \n\t" - "replv.ph %[left0], %[left0] \n\t" - - "replv.ph %[top_left], %[top_left] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 1(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 2(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 3(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 4(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 5(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 6(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbu %[left0], 7(%[left]) \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - "replv.ph %[left0], %[left0] \n\t" - "add %[dst], %[dst], %[stride] \n\t" - - "addu.ph %[reshw], %[abovel], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], (%[dst]) \n\t" - "sb %[res1], 1(%[dst]) \n\t" - "sb %[res2], 2(%[dst]) \n\t" - "sb %[res3], 3(%[dst]) \n\t" - - "addu.ph %[reshw], %[abovel_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res2], %[reshw], 16 \n\t" - "sra %[res2], %[res2], 16 \n\t" - "sra %[res3], %[reshw], 16 \n\t" - - "addu.ph %[reshw], %[abover_1], %[left0] \n\t" - "subu.ph %[reshw], %[reshw], %[top_left] \n\t" - - "sll %[res0], %[reshw], 16 \n\t" - "sra %[res0], %[res0], 16 \n\t" - "sra %[res1], %[reshw], 16 \n\t" - - "lbux %[res0], %[res0](%[cm]) \n\t" - "lbux %[res1], %[res1](%[cm]) \n\t" - "lbux %[res2], %[res2](%[cm]) \n\t" - "lbux %[res3], %[res3](%[cm]) \n\t" - - "sb %[res0], 4(%[dst]) \n\t" - "sb %[res1], 5(%[dst]) \n\t" - "sb %[res2], 6(%[dst]) \n\t" - "sb %[res3], 7(%[dst]) \n\t" - - : [abovel] "=&r" (abovel), [abover] "=&r" (abover), - [abovel_1] "=&r" (abovel_1), [abover_1] "=&r" (abover_1), - [left0] "=&r" (left0), [res2] "=&r" (res2), [res3] "=&r" (res3), - [res0] "=&r" (res0), [res1] "=&r" (res1), - [reshw] "=&r" (reshw), [top_left] "=&r" (top_left) - : [above] "r" (above), [left] "r" (left), - [dst] "r" (dst), [stride] "r" (stride), [cm] "r" (cm) - ); -} -#endif // #if HAVE_DSPR2 diff --git a/vp9/common/mips/msa/vp9_intra_predict_msa.c b/vp9/common/mips/msa/vp9_intra_predict_msa.c deleted file mode 100644 index abf2704ca..000000000 --- a/vp9/common/mips/msa/vp9_intra_predict_msa.c +++ /dev/null @@ -1,737 +0,0 @@ -/* - * Copyright (c) 2015 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vp9_rtcd.h" -#include "vpx_dsp/mips/macros_msa.h" - -#define IPRED_SUBS_UH2_UH(in0, in1, out0, out1) { \ - out0 = __msa_subs_u_h(out0, in0); \ - out1 = __msa_subs_u_h(out1, in1); \ -} - -static void intra_predict_vert_4x4_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t src_data; - - src_data = LW(src); - - SW4(src_data, src_data, src_data, src_data, dst, dst_stride); -} - -static void intra_predict_vert_8x8_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t row; - uint32_t src_data1, src_data2; - - src_data1 = LW(src); - src_data2 = LW(src + 4); - - for (row = 8; row--;) { - SW(src_data1, dst); - SW(src_data2, (dst + 4)); - dst += dst_stride; - } -} - -static void intra_predict_vert_16x16_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t row; - v16u8 src0; - - src0 = LD_UB(src); - - for (row = 16; row--;) { - ST_UB(src0, dst); - dst += dst_stride; - } -} - -static void intra_predict_vert_32x32_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t row; - v16u8 src1, src2; - - src1 = LD_UB(src); - src2 = LD_UB(src + 16); - - for (row = 32; row--;) { - ST_UB2(src1, src2, dst, 16); - dst += dst_stride; - } -} - -static void intra_predict_horiz_4x4_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t out0, out1, out2, out3; - - out0 = src[0] * 0x01010101; - out1 = src[1] * 0x01010101; - out2 = src[2] * 0x01010101; - out3 = src[3] * 0x01010101; - - SW4(out0, out1, out2, out3, dst, dst_stride); -} - -static void intra_predict_horiz_8x8_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint64_t out0, out1, out2, out3, out4, out5, out6, out7; - - out0 = src[0] * 0x0101010101010101ull; - out1 = src[1] * 0x0101010101010101ull; - out2 = src[2] * 0x0101010101010101ull; - out3 = src[3] * 0x0101010101010101ull; - out4 = src[4] * 0x0101010101010101ull; - out5 = src[5] * 0x0101010101010101ull; - out6 = src[6] * 0x0101010101010101ull; - out7 = src[7] * 0x0101010101010101ull; - - SD4(out0, out1, out2, out3, dst, dst_stride); - dst += (4 * dst_stride); - SD4(out4, out5, out6, out7, dst, dst_stride); -} - -static void intra_predict_horiz_16x16_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t row; - uint8_t inp0, inp1, inp2, inp3; - v16u8 src0, src1, src2, src3; - - for (row = 4; row--;) { - inp0 = src[0]; - inp1 = src[1]; - inp2 = src[2]; - inp3 = src[3]; - src += 4; - - src0 = (v16u8)__msa_fill_b(inp0); - src1 = (v16u8)__msa_fill_b(inp1); - src2 = (v16u8)__msa_fill_b(inp2); - src3 = (v16u8)__msa_fill_b(inp3); - - ST_UB4(src0, src1, src2, src3, dst, dst_stride); - dst += (4 * dst_stride); - } -} - -static void intra_predict_horiz_32x32_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t row; - uint8_t inp0, inp1, inp2, inp3; - v16u8 src0, src1, src2, src3; - - for (row = 8; row--;) { - inp0 = src[0]; - inp1 = src[1]; - inp2 = src[2]; - inp3 = src[3]; - src += 4; - - src0 = (v16u8)__msa_fill_b(inp0); - src1 = (v16u8)__msa_fill_b(inp1); - src2 = (v16u8)__msa_fill_b(inp2); - src3 = (v16u8)__msa_fill_b(inp3); - - ST_UB2(src0, src0, dst, 16); - dst += dst_stride; - ST_UB2(src1, src1, dst, 16); - dst += dst_stride; - ST_UB2(src2, src2, dst, 16); - dst += dst_stride; - ST_UB2(src3, src3, dst, 16); - dst += dst_stride; - } -} - -static void intra_predict_dc_4x4_msa(const uint8_t *src_top, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { - uint32_t val0, val1; - v16i8 store, src = { 0 }; - v8u16 sum_h; - v4u32 sum_w; - v2u64 sum_d; - - val0 = LW(src_top); - val1 = LW(src_left); - INSERT_W2_SB(val0, val1, src); - sum_h = __msa_hadd_u_h((v16u8)src, (v16u8)src); - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); - store = __msa_splati_b((v16i8)sum_w, 0); - val0 = __msa_copy_u_w((v4i32)store, 0); - - SW4(val0, val0, val0, val0, dst, dst_stride); -} - -static void intra_predict_dc_tl_4x4_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t val0; - v16i8 store, data = { 0 }; - v8u16 sum_h; - v4u32 sum_w; - - val0 = LW(src); - data = (v16i8)__msa_insert_w((v4i32)data, 0, val0); - sum_h = __msa_hadd_u_h((v16u8)data, (v16u8)data); - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_w, 2); - store = __msa_splati_b((v16i8)sum_w, 0); - val0 = __msa_copy_u_w((v4i32)store, 0); - - SW4(val0, val0, val0, val0, dst, dst_stride); -} - -static void intra_predict_128dc_4x4_msa(uint8_t *dst, int32_t dst_stride) { - uint32_t out; - const v16i8 store = __msa_ldi_b(128); - - out = __msa_copy_u_w((v4i32)store, 0); - - SW4(out, out, out, out, dst, dst_stride); -} - -static void intra_predict_dc_8x8_msa(const uint8_t *src_top, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { - uint64_t val0, val1; - v16i8 store; - v16u8 src = { 0 }; - v8u16 sum_h; - v4u32 sum_w; - v2u64 sum_d; - - val0 = LD(src_top); - val1 = LD(src_left); - INSERT_D2_UB(val0, val1, src); - sum_h = __msa_hadd_u_h(src, src); - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); - store = __msa_splati_b((v16i8)sum_w, 0); - val0 = __msa_copy_u_d((v2i64)store, 0); - - SD4(val0, val0, val0, val0, dst, dst_stride); - dst += (4 * dst_stride); - SD4(val0, val0, val0, val0, dst, dst_stride); -} - -static void intra_predict_dc_tl_8x8_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint64_t val0; - v16i8 store; - v16u8 data = { 0 }; - v8u16 sum_h; - v4u32 sum_w; - v2u64 sum_d; - - val0 = LD(src); - data = (v16u8)__msa_insert_d((v2i64)data, 0, val0); - sum_h = __msa_hadd_u_h(data, data); - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 3); - store = __msa_splati_b((v16i8)sum_w, 0); - val0 = __msa_copy_u_d((v2i64)store, 0); - - SD4(val0, val0, val0, val0, dst, dst_stride); - dst += (4 * dst_stride); - SD4(val0, val0, val0, val0, dst, dst_stride); -} - -static void intra_predict_128dc_8x8_msa(uint8_t *dst, int32_t dst_stride) { - uint64_t out; - const v16i8 store = __msa_ldi_b(128); - - out = __msa_copy_u_d((v2i64)store, 0); - - SD4(out, out, out, out, dst, dst_stride); - dst += (4 * dst_stride); - SD4(out, out, out, out, dst, dst_stride); -} - -static void intra_predict_dc_16x16_msa(const uint8_t *src_top, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { - v16u8 top, left, out; - v8u16 sum_h, sum_top, sum_left; - v4u32 sum_w; - v2u64 sum_d; - - top = LD_UB(src_top); - left = LD_UB(src_left); - HADD_UB2_UH(top, left, sum_top, sum_left); - sum_h = sum_top + sum_left; - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); - out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); - - ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); - dst += (8 * dst_stride); - ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); -} - -static void intra_predict_dc_tl_16x16_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - v16u8 data, out; - v8u16 sum_h; - v4u32 sum_w; - v2u64 sum_d; - - data = LD_UB(src); - sum_h = __msa_hadd_u_h(data, data); - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 4); - out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); - - ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); - dst += (8 * dst_stride); - ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); -} - -static void intra_predict_128dc_16x16_msa(uint8_t *dst, int32_t dst_stride) { - const v16u8 out = (v16u8)__msa_ldi_b(128); - - ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); - dst += (8 * dst_stride); - ST_UB8(out, out, out, out, out, out, out, out, dst, dst_stride); -} - -static void intra_predict_dc_32x32_msa(const uint8_t *src_top, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { - uint32_t row; - v16u8 top0, top1, left0, left1, out; - v8u16 sum_h, sum_top0, sum_top1, sum_left0, sum_left1; - v4u32 sum_w; - v2u64 sum_d; - - LD_UB2(src_top, 16, top0, top1); - LD_UB2(src_left, 16, left0, left1); - HADD_UB2_UH(top0, top1, sum_top0, sum_top1); - HADD_UB2_UH(left0, left1, sum_left0, sum_left1); - sum_h = sum_top0 + sum_top1; - sum_h += sum_left0 + sum_left1; - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 6); - out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); - - for (row = 16; row--;) { - ST_UB2(out, out, dst, 16); - dst += dst_stride; - ST_UB2(out, out, dst, 16); - dst += dst_stride; - } -} - -static void intra_predict_dc_tl_32x32_msa(const uint8_t *src, uint8_t *dst, - int32_t dst_stride) { - uint32_t row; - v16u8 data0, data1, out; - v8u16 sum_h, sum_data0, sum_data1; - v4u32 sum_w; - v2u64 sum_d; - - LD_UB2(src, 16, data0, data1); - HADD_UB2_UH(data0, data1, sum_data0, sum_data1); - sum_h = sum_data0 + sum_data1; - sum_w = __msa_hadd_u_w(sum_h, sum_h); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_pckev_w((v4i32)sum_d, (v4i32)sum_d); - sum_d = __msa_hadd_u_d(sum_w, sum_w); - sum_w = (v4u32)__msa_srari_w((v4i32)sum_d, 5); - out = (v16u8)__msa_splati_b((v16i8)sum_w, 0); - - for (row = 16; row--;) { - ST_UB2(out, out, dst, 16); - dst += dst_stride; - ST_UB2(out, out, dst, 16); - dst += dst_stride; - } -} - -static void intra_predict_128dc_32x32_msa(uint8_t *dst, int32_t dst_stride) { - uint32_t row; - const v16u8 out = (v16u8)__msa_ldi_b(128); - - for (row = 16; row--;) { - ST_UB2(out, out, dst, 16); - dst += dst_stride; - ST_UB2(out, out, dst, 16); - dst += dst_stride; - } -} - -static void intra_predict_tm_4x4_msa(const uint8_t *src_top_ptr, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { - uint32_t val; - uint8_t top_left = src_top_ptr[-1]; - v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; - v16u8 src0, src1, src2, src3; - v8u16 src_top_left, vec0, vec1, vec2, vec3; - - src_top_left = (v8u16)__msa_fill_h(top_left); - val = LW(src_top_ptr); - src_top = (v16i8)__msa_insert_w((v4i32)src_top, 0, val); - - src_left0 = __msa_fill_b(src_left[0]); - src_left1 = __msa_fill_b(src_left[1]); - src_left2 = __msa_fill_b(src_left[2]); - src_left3 = __msa_fill_b(src_left[3]); - - ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, - src_left3, src_top, src0, src1, src2, src3); - HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); - SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); - PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); - ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride); -} - -static void intra_predict_tm_8x8_msa(const uint8_t *src_top_ptr, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { - uint64_t val; - uint8_t top_left = src_top_ptr[-1]; - uint32_t loop_cnt; - v16i8 src_left0, src_left1, src_left2, src_left3, tmp0, tmp1, src_top = { 0 }; - v8u16 src_top_left, vec0, vec1, vec2, vec3; - v16u8 src0, src1, src2, src3; - - val = LD(src_top_ptr); - src_top = (v16i8)__msa_insert_d((v2i64)src_top, 0, val); - src_top_left = (v8u16)__msa_fill_h(top_left); - - for (loop_cnt = 2; loop_cnt--;) { - src_left0 = __msa_fill_b(src_left[0]); - src_left1 = __msa_fill_b(src_left[1]); - src_left2 = __msa_fill_b(src_left[2]); - src_left3 = __msa_fill_b(src_left[3]); - src_left += 4; - - ILVR_B4_UB(src_left0, src_top, src_left1, src_top, src_left2, src_top, - src_left3, src_top, src0, src1, src2, src3); - HADD_UB4_UH(src0, src1, src2, src3, vec0, vec1, vec2, vec3); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec0, vec1); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3); - SAT_UH4_UH(vec0, vec1, vec2, vec3, 7); - PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1); - ST8x4_UB(tmp0, tmp1, dst, dst_stride); - dst += (4 * dst_stride); - } -} - -static void intra_predict_tm_16x16_msa(const uint8_t *src_top_ptr, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { - uint8_t top_left = src_top_ptr[-1]; - uint32_t loop_cnt; - v16i8 src_top, src_left0, src_left1, src_left2, src_left3; - v8u16 src_top_left, res_r, res_l; - - src_top = LD_SB(src_top_ptr); - src_top_left = (v8u16)__msa_fill_h(top_left); - - for (loop_cnt = 4; loop_cnt--;) { - src_left0 = __msa_fill_b(src_left[0]); - src_left1 = __msa_fill_b(src_left[1]); - src_left2 = __msa_fill_b(src_left[2]); - src_left3 = __msa_fill_b(src_left[3]); - src_left += 4; - - ILVRL_B2_UH(src_left0, src_top, res_r, res_l); - HADD_UB2_UH(res_r, res_l, res_r, res_l); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); - - SAT_UH2_UH(res_r, res_l, 7); - PCKEV_ST_SB(res_r, res_l, dst); - dst += dst_stride; - - ILVRL_B2_UH(src_left1, src_top, res_r, res_l); - HADD_UB2_UH(res_r, res_l, res_r, res_l); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); - SAT_UH2_UH(res_r, res_l, 7); - PCKEV_ST_SB(res_r, res_l, dst); - dst += dst_stride; - - ILVRL_B2_UH(src_left2, src_top, res_r, res_l); - HADD_UB2_UH(res_r, res_l, res_r, res_l); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); - SAT_UH2_UH(res_r, res_l, 7); - PCKEV_ST_SB(res_r, res_l, dst); - dst += dst_stride; - - ILVRL_B2_UH(src_left3, src_top, res_r, res_l); - HADD_UB2_UH(res_r, res_l, res_r, res_l); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r, res_l); - SAT_UH2_UH(res_r, res_l, 7); - PCKEV_ST_SB(res_r, res_l, dst); - dst += dst_stride; - } -} - -static void intra_predict_tm_32x32_msa(const uint8_t *src_top, - const uint8_t *src_left, - uint8_t *dst, int32_t dst_stride) { - uint8_t top_left = src_top[-1]; - uint32_t loop_cnt; - v16i8 src_top0, src_top1, src_left0, src_left1, src_left2, src_left3; - v8u16 src_top_left, res_r0, res_r1, res_l0, res_l1; - - LD_SB2(src_top, 16, src_top0, src_top1); - src_top_left = (v8u16)__msa_fill_h(top_left); - - for (loop_cnt = 8; loop_cnt--;) { - src_left0 = __msa_fill_b(src_left[0]); - src_left1 = __msa_fill_b(src_left[1]); - src_left2 = __msa_fill_b(src_left[2]); - src_left3 = __msa_fill_b(src_left[3]); - src_left += 4; - - ILVR_B2_UH(src_left0, src_top0, src_left0, src_top1, res_r0, res_r1); - ILVL_B2_UH(src_left0, src_top0, src_left0, src_top1, res_l0, res_l1); - HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); - SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); - PCKEV_ST_SB(res_r0, res_l0, dst); - PCKEV_ST_SB(res_r1, res_l1, dst + 16); - dst += dst_stride; - - ILVR_B2_UH(src_left1, src_top0, src_left1, src_top1, res_r0, res_r1); - ILVL_B2_UH(src_left1, src_top0, src_left1, src_top1, res_l0, res_l1); - HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); - SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); - PCKEV_ST_SB(res_r0, res_l0, dst); - PCKEV_ST_SB(res_r1, res_l1, dst + 16); - dst += dst_stride; - - ILVR_B2_UH(src_left2, src_top0, src_left2, src_top1, res_r0, res_r1); - ILVL_B2_UH(src_left2, src_top0, src_left2, src_top1, res_l0, res_l1); - HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); - SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); - PCKEV_ST_SB(res_r0, res_l0, dst); - PCKEV_ST_SB(res_r1, res_l1, dst + 16); - dst += dst_stride; - - ILVR_B2_UH(src_left3, src_top0, src_left3, src_top1, res_r0, res_r1); - ILVL_B2_UH(src_left3, src_top0, src_left3, src_top1, res_l0, res_l1); - HADD_UB4_UH(res_r0, res_l0, res_r1, res_l1, res_r0, res_l0, res_r1, res_l1); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r0, res_l0); - IPRED_SUBS_UH2_UH(src_top_left, src_top_left, res_r1, res_l1); - SAT_UH4_UH(res_r0, res_l0, res_r1, res_l1, 7); - PCKEV_ST_SB(res_r0, res_l0, dst); - PCKEV_ST_SB(res_r1, res_l1, dst + 16); - dst += dst_stride; - } -} - -void vp9_v_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_vert_4x4_msa(above, dst, y_stride); -} - -void vp9_v_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_vert_8x8_msa(above, dst, y_stride); -} - -void vp9_v_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_vert_16x16_msa(above, dst, y_stride); -} - -void vp9_v_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_vert_32x32_msa(above, dst, y_stride); -} - -void vp9_h_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - - intra_predict_horiz_4x4_msa(left, dst, y_stride); -} - -void vp9_h_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - - intra_predict_horiz_8x8_msa(left, dst, y_stride); -} - -void vp9_h_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - - intra_predict_horiz_16x16_msa(left, dst, y_stride); -} - -void vp9_h_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - - intra_predict_horiz_32x32_msa(left, dst, y_stride); -} - -void vp9_dc_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_dc_4x4_msa(above, left, dst, y_stride); -} - -void vp9_dc_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_dc_8x8_msa(above, left, dst, y_stride); -} - -void vp9_dc_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_dc_16x16_msa(above, left, dst, y_stride); -} - -void vp9_dc_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_dc_32x32_msa(above, left, dst, y_stride); -} - -void vp9_dc_top_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_dc_tl_4x4_msa(above, dst, y_stride); -} - -void vp9_dc_top_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_dc_tl_8x8_msa(above, dst, y_stride); -} - -void vp9_dc_top_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_dc_tl_16x16_msa(above, dst, y_stride); -} - -void vp9_dc_top_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)left; - - intra_predict_dc_tl_32x32_msa(above, dst, y_stride); -} - -void vp9_dc_left_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - - intra_predict_dc_tl_4x4_msa(left, dst, y_stride); -} - -void vp9_dc_left_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - - intra_predict_dc_tl_8x8_msa(left, dst, y_stride); -} - -void vp9_dc_left_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - - intra_predict_dc_tl_16x16_msa(left, dst, y_stride); -} - -void vp9_dc_left_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, - const uint8_t *left) { - (void)above; - - intra_predict_dc_tl_32x32_msa(left, dst, y_stride); -} - -void vp9_dc_128_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - - intra_predict_128dc_4x4_msa(dst, y_stride); -} - -void vp9_dc_128_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - - intra_predict_128dc_8x8_msa(dst, y_stride); -} - -void vp9_dc_128_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - - intra_predict_128dc_16x16_msa(dst, y_stride); -} - -void vp9_dc_128_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - (void)above; - (void)left; - - intra_predict_128dc_32x32_msa(dst, y_stride); -} - -void vp9_tm_predictor_4x4_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_tm_4x4_msa(above, left, dst, y_stride); -} - -void vp9_tm_predictor_8x8_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_tm_8x8_msa(above, left, dst, y_stride); -} - -void vp9_tm_predictor_16x16_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_tm_16x16_msa(above, left, dst, y_stride); -} - -void vp9_tm_predictor_32x32_msa(uint8_t *dst, ptrdiff_t y_stride, - const uint8_t *above, const uint8_t *left) { - intra_predict_tm_32x32_msa(above, left, dst, y_stride); -} diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h index 42c3a09d7..c249ad4d7 100644 --- a/vp9/common/vp9_common.h +++ b/vp9/common/vp9_common.h @@ -44,20 +44,6 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { return num_values > 0 ? get_msb(num_values) + 1 : 0; } -#if CONFIG_VP9_HIGHBITDEPTH -static INLINE uint16_t clip_pixel_highbd(int val, int bd) { - switch (bd) { - case 8: - default: - return (uint16_t)clamp(val, 0, 255); - case 10: - return (uint16_t)clamp(val, 0, 1023); - case 12: - return (uint16_t)clamp(val, 0, 4095); - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH - #if CONFIG_DEBUG #define CHECK_MEM_ERROR(cm, lval, expr) do { \ lval = (expr); \ diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c index f969ff1a8..6dc83c901 100644 --- a/vp9/common/vp9_reconintra.c +++ b/vp9/common/vp9_reconintra.c @@ -9,7 +9,7 @@ */ #include "./vpx_config.h" -#include "./vp9_rtcd.h" +#include "./vpx_dsp_rtcd.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" @@ -50,683 +50,6 @@ static const uint8_t extend_modes[INTRA_MODES] = { NEED_LEFT | NEED_ABOVE, // TM }; -// This serves as a wrapper function, so that all the prediction functions -// can be unified and accessed as a pointer array. Note that the boundary -// above and left are not necessarily used all the time. -#define intra_pred_sized(type, size) \ - void vp9_##type##_predictor_##size##x##size##_c(uint8_t *dst, \ - ptrdiff_t stride, \ - const uint8_t *above, \ - const uint8_t *left) { \ - type##_predictor(dst, stride, size, above, left); \ - } - -#if CONFIG_VP9_HIGHBITDEPTH -#define intra_pred_highbd_sized(type, size) \ - void vp9_highbd_##type##_predictor_##size##x##size##_c( \ - uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ - const uint16_t *left, int bd) { \ - highbd_##type##_predictor(dst, stride, size, above, left, bd); \ - } - -#define intra_pred_allsizes(type) \ - intra_pred_sized(type, 4) \ - intra_pred_sized(type, 8) \ - intra_pred_sized(type, 16) \ - intra_pred_sized(type, 32) \ - intra_pred_highbd_sized(type, 4) \ - intra_pred_highbd_sized(type, 8) \ - intra_pred_highbd_sized(type, 16) \ - intra_pred_highbd_sized(type, 32) - -#define intra_pred_no_4x4(type) \ - intra_pred_sized(type, 8) \ - intra_pred_sized(type, 16) \ - intra_pred_sized(type, 32) \ - intra_pred_highbd_sized(type, 4) \ - intra_pred_highbd_sized(type, 8) \ - intra_pred_highbd_sized(type, 16) \ - intra_pred_highbd_sized(type, 32) - -#else - -#define intra_pred_allsizes(type) \ - intra_pred_sized(type, 4) \ - intra_pred_sized(type, 8) \ - intra_pred_sized(type, 16) \ - intra_pred_sized(type, 32) - -#define intra_pred_no_4x4(type) \ - intra_pred_sized(type, 8) \ - intra_pred_sized(type, 16) \ - intra_pred_sized(type, 32) -#endif // CONFIG_VP9_HIGHBITDEPTH - -#define DST(x, y) dst[(x) + (y) * stride] -#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) -#define AVG2(a, b) (((a) + (b) + 1) >> 1) - -#if CONFIG_VP9_HIGHBITDEPTH -static INLINE void highbd_d207_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void) above; - (void) bd; - - // First column. - for (r = 0; r < bs - 1; ++r) { - dst[r * stride] = AVG2(left[r], left[r + 1]); - } - dst[(bs - 1) * stride] = left[bs - 1]; - dst++; - - // Second column. - for (r = 0; r < bs - 2; ++r) { - dst[r * stride] = AVG3(left[r], left[r + 1], left[r + 2]); - } - dst[(bs - 2) * stride] = AVG3(left[bs - 2], left[bs - 1], left[bs - 1]); - dst[(bs - 1) * stride] = left[bs - 1]; - dst++; - - // Rest of last row. - for (c = 0; c < bs - 2; ++c) - dst[(bs - 1) * stride + c] = left[bs - 1]; - - for (r = bs - 2; r >= 0; --r) { - for (c = 0; c < bs - 2; ++c) - dst[r * stride + c] = dst[(r + 1) * stride + c - 2]; - } -} - -static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void) left; - (void) bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = r & 1 ? AVG3(above[(r >> 1) + c], above[(r >> 1) + c + 1], - above[(r >> 1) + c + 2]) - : AVG2(above[(r >> 1) + c], above[(r >> 1) + c + 1]); - } - dst += stride; - } -} - -static INLINE void highbd_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs, - const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void) left; - (void) bd; - for (r = 0; r < bs; ++r) { - for (c = 0; c < bs; ++c) { - dst[c] = r + c + 2 < bs * 2 ? AVG3(above[r + c], above[r + c + 1], - above[r + c + 2]) - : above[bs * 2 - 1]; - } - dst += stride; - } -} - -static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void) bd; - - // first row - for (c = 0; c < bs; c++) - dst[c] = AVG2(above[c - 1], above[c]); - dst += stride; - - // second row - dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) - dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); - dst += stride; - - // the rest of first col - dst[0] = AVG3(above[-1], left[0], left[1]); - for (r = 3; r < bs; ++r) - dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]); - - // the rest of the block - for (r = 2; r < bs; ++r) { - for (c = 1; c < bs; c++) - dst[c] = dst[-2 * stride + c - 1]; - dst += stride; - } -} - -static INLINE void highbd_d135_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void) bd; - dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) - dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); - - dst[stride] = AVG3(above[-1], left[0], left[1]); - for (r = 2; r < bs; ++r) - dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); - - dst += stride; - for (r = 1; r < bs; ++r) { - for (c = 1; c < bs; c++) - dst[c] = dst[-stride + c - 1]; - dst += stride; - } -} - -static INLINE void highbd_d153_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - (void) bd; - dst[0] = AVG2(above[-1], left[0]); - for (r = 1; r < bs; r++) - dst[r * stride] = AVG2(left[r - 1], left[r]); - dst++; - - dst[0] = AVG3(left[0], above[-1], above[0]); - dst[stride] = AVG3(above[-1], left[0], left[1]); - for (r = 2; r < bs; r++) - dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); - dst++; - - for (c = 0; c < bs - 2; c++) - dst[c] = AVG3(above[c - 1], above[c], above[c + 1]); - dst += stride; - - for (r = 1; r < bs; ++r) { - for (c = 0; c < bs - 2; c++) - dst[c] = dst[-stride + c - 2]; - dst += stride; - } -} - -static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r; - (void) left; - (void) bd; - for (r = 0; r < bs; r++) { - memcpy(dst, above, bs * sizeof(uint16_t)); - dst += stride; - } -} - -static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r; - (void) above; - (void) bd; - for (r = 0; r < bs; r++) { - vpx_memset16(dst, left[r], bs); - dst += stride; - } -} - -static INLINE void highbd_tm_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r, c; - int ytop_left = above[-1]; - (void) bd; - - for (r = 0; r < bs; r++) { - for (c = 0; c < bs; c++) - dst[c] = clip_pixel_highbd(left[r] + above[c] - ytop_left, bd); - dst += stride; - } -} - -static INLINE void highbd_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int r; - (void) above; - (void) left; - - for (r = 0; r < bs; r++) { - vpx_memset16(dst, 128 << (bd - 8), bs); - dst += stride; - } -} - -static INLINE void highbd_dc_left_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int i, r, expected_dc, sum = 0; - (void) above; - (void) bd; - - for (i = 0; i < bs; i++) - sum += left[i]; - expected_dc = (sum + (bs >> 1)) / bs; - - for (r = 0; r < bs; r++) { - vpx_memset16(dst, expected_dc, bs); - dst += stride; - } -} - -static INLINE void highbd_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int i, r, expected_dc, sum = 0; - (void) left; - (void) bd; - - for (i = 0; i < bs; i++) - sum += above[i]; - expected_dc = (sum + (bs >> 1)) / bs; - - for (r = 0; r < bs; r++) { - vpx_memset16(dst, expected_dc, bs); - dst += stride; - } -} - -static INLINE void highbd_dc_predictor(uint16_t *dst, ptrdiff_t stride, - int bs, const uint16_t *above, - const uint16_t *left, int bd) { - int i, r, expected_dc, sum = 0; - const int count = 2 * bs; - (void) bd; - - for (i = 0; i < bs; i++) { - sum += above[i]; - sum += left[i]; - } - - expected_dc = (sum + (count >> 1)) / count; - - for (r = 0; r < bs; r++) { - vpx_memset16(dst, expected_dc, bs); - dst += stride; - } -} -#endif // CONFIG_VP9_HIGHBITDEPTH - -void vp9_d207_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const int L = left[3]; - (void)above; - DST(0, 0) = AVG2(I, J); - DST(2, 0) = DST(0, 1) = AVG2(J, K); - DST(2, 1) = DST(0, 2) = AVG2(K, L); - DST(1, 0) = AVG3(I, J, K); - DST(3, 0) = DST(1, 1) = AVG3(J, K, L); - DST(3, 1) = DST(1, 2) = AVG3(K, L, L); - DST(3, 2) = DST(2, 2) = - DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; -} - -static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - (void) above; - // first column - for (r = 0; r < bs - 1; ++r) - dst[r * stride] = AVG2(left[r], left[r + 1]); - dst[(bs - 1) * stride] = left[bs - 1]; - dst++; - - // second column - for (r = 0; r < bs - 2; ++r) - dst[r * stride] = AVG3(left[r], left[r + 1], left[r + 2]); - dst[(bs - 2) * stride] = AVG3(left[bs - 2], left[bs - 1], left[bs - 1]); - dst[(bs - 1) * stride] = left[bs - 1]; - dst++; - - // rest of last row - for (c = 0; c < bs - 2; ++c) - dst[(bs - 1) * stride + c] = left[bs - 1]; - - for (r = bs - 2; r >= 0; --r) - for (c = 0; c < bs - 2; ++c) - dst[r * stride + c] = dst[(r + 1) * stride + c - 2]; -} -intra_pred_no_4x4(d207) - -void vp9_d63_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int A = above[0]; - const int B = above[1]; - const int C = above[2]; - const int D = above[3]; - const int E = above[4]; - const int F = above[5]; - const int G = above[6]; - (void)left; - DST(0, 0) = AVG2(A, B); - DST(1, 0) = DST(0, 2) = AVG2(B, C); - DST(2, 0) = DST(1, 2) = AVG2(C, D); - DST(3, 0) = DST(2, 2) = AVG2(D, E); - DST(3, 2) = AVG2(E, F); // differs from vp8 - - DST(0, 1) = AVG3(A, B, C); - DST(1, 1) = DST(0, 3) = AVG3(B, C, D); - DST(2, 1) = DST(1, 3) = AVG3(C, D, E); - DST(3, 1) = DST(2, 3) = AVG3(D, E, F); - DST(3, 3) = AVG3(E, F, G); // differs from vp8 -} - -static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - int size; - (void)left; - for (c = 0; c < bs; ++c) { - dst[c] = AVG2(above[c], above[c + 1]); - dst[stride + c] = AVG3(above[c], above[c + 1], above[c + 2]); - } - for (r = 2, size = bs - 2; r < bs; r += 2, --size) { - memcpy(dst + (r + 0) * stride, dst + (r >> 1), size); - memset(dst + (r + 0) * stride + size, above[bs - 1], bs - size); - memcpy(dst + (r + 1) * stride, dst + stride + (r >> 1), size); - memset(dst + (r + 1) * stride + size, above[bs - 1], bs - size); - } -} -intra_pred_no_4x4(d63) - -void vp9_d45_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int A = above[0]; - const int B = above[1]; - const int C = above[2]; - const int D = above[3]; - const int E = above[4]; - const int F = above[5]; - const int G = above[6]; - const int H = above[7]; - (void)stride; - (void)left; - DST(0, 0) = AVG3(A, B, C); - DST(1, 0) = DST(0, 1) = AVG3(B, C, D); - DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); - DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); - DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); - DST(3, 2) = DST(2, 3) = AVG3(F, G, H); - DST(3, 3) = H; // differs from vp8 -} - -static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - const uint8_t above_right = above[bs - 1]; - const uint8_t *const dst_row0 = dst; - int x, size; - (void)left; - - for (x = 0; x < bs - 1; ++x) { - dst[x] = AVG3(above[x], above[x + 1], above[x + 2]); - } - dst[bs - 1] = above_right; - dst += stride; - for (x = 1, size = bs - 2; x < bs; ++x, --size) { - memcpy(dst, dst_row0 + x, size); - memset(dst + size, above_right, x + 1); - dst += stride; - } -} -intra_pred_no_4x4(d45) - -void vp9_d117_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const int X = above[-1]; - const int A = above[0]; - const int B = above[1]; - const int C = above[2]; - const int D = above[3]; - DST(0, 0) = DST(1, 2) = AVG2(X, A); - DST(1, 0) = DST(2, 2) = AVG2(A, B); - DST(2, 0) = DST(3, 2) = AVG2(B, C); - DST(3, 0) = AVG2(C, D); - - DST(0, 3) = AVG3(K, J, I); - DST(0, 2) = AVG3(J, I, X); - DST(0, 1) = DST(1, 3) = AVG3(I, X, A); - DST(1, 1) = DST(2, 3) = AVG3(X, A, B); - DST(2, 1) = DST(3, 3) = AVG3(A, B, C); - DST(3, 1) = AVG3(B, C, D); -} - -static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - - // first row - for (c = 0; c < bs; c++) - dst[c] = AVG2(above[c - 1], above[c]); - dst += stride; - - // second row - dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) - dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); - dst += stride; - - // the rest of first col - dst[0] = AVG3(above[-1], left[0], left[1]); - for (r = 3; r < bs; ++r) - dst[(r - 2) * stride] = AVG3(left[r - 3], left[r - 2], left[r - 1]); - - // the rest of the block - for (r = 2; r < bs; ++r) { - for (c = 1; c < bs; c++) - dst[c] = dst[-2 * stride + c - 1]; - dst += stride; - } -} -intra_pred_no_4x4(d117) - -void vp9_d135_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const int L = left[3]; - const int X = above[-1]; - const int A = above[0]; - const int B = above[1]; - const int C = above[2]; - const int D = above[3]; - (void)stride; - DST(0, 3) = AVG3(J, K, L); - DST(1, 3) = DST(0, 2) = AVG3(I, J, K); - DST(2, 3) = DST(1, 2) = DST(0, 1) = AVG3(X, I, J); - DST(3, 3) = DST(2, 2) = DST(1, 1) = DST(0, 0) = AVG3(A, X, I); - DST(3, 2) = DST(2, 1) = DST(1, 0) = AVG3(B, A, X); - DST(3, 1) = DST(2, 0) = AVG3(C, B, A); - DST(3, 0) = AVG3(D, C, B); -} - -static INLINE void d135_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - dst[0] = AVG3(left[0], above[-1], above[0]); - for (c = 1; c < bs; c++) - dst[c] = AVG3(above[c - 2], above[c - 1], above[c]); - - dst[stride] = AVG3(above[-1], left[0], left[1]); - for (r = 2; r < bs; ++r) - dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); - - dst += stride; - for (r = 1; r < bs; ++r) { - for (c = 1; c < bs; c++) - dst[c] = dst[-stride + c - 1]; - dst += stride; - } -} -intra_pred_no_4x4(d135) - -void vp9_d153_predictor_4x4_c(uint8_t *dst, ptrdiff_t stride, - const uint8_t *above, const uint8_t *left) { - const int I = left[0]; - const int J = left[1]; - const int K = left[2]; - const int L = left[3]; - const int X = above[-1]; - const int A = above[0]; - const int B = above[1]; - const int C = above[2]; - - DST(0, 0) = DST(2, 1) = AVG2(I, X); - DST(0, 1) = DST(2, 2) = AVG2(J, I); - DST(0, 2) = DST(2, 3) = AVG2(K, J); - DST(0, 3) = AVG2(L, K); - - DST(3, 0) = AVG3(A, B, C); - DST(2, 0) = AVG3(X, A, B); - DST(1, 0) = DST(3, 1) = AVG3(I, X, A); - DST(1, 1) = DST(3, 2) = AVG3(J, I, X); - DST(1, 2) = DST(3, 3) = AVG3(K, J, I); - DST(1, 3) = AVG3(L, K, J); -} - -static INLINE void d153_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - dst[0] = AVG2(above[-1], left[0]); - for (r = 1; r < bs; r++) - dst[r * stride] = AVG2(left[r - 1], left[r]); - dst++; - - dst[0] = AVG3(left[0], above[-1], above[0]); - dst[stride] = AVG3(above[-1], left[0], left[1]); - for (r = 2; r < bs; r++) - dst[r * stride] = AVG3(left[r - 2], left[r - 1], left[r]); - dst++; - - for (c = 0; c < bs - 2; c++) - dst[c] = AVG3(above[c - 1], above[c], above[c + 1]); - dst += stride; - - for (r = 1; r < bs; ++r) { - for (c = 0; c < bs - 2; c++) - dst[c] = dst[-stride + c - 2]; - dst += stride; - } -} -intra_pred_no_4x4(d153) - -static INLINE void v_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r; - (void) left; - - for (r = 0; r < bs; r++) { - memcpy(dst, above, bs); - dst += stride; - } -} -intra_pred_allsizes(v) - -static INLINE void h_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r; - (void) above; - - for (r = 0; r < bs; r++) { - memset(dst, left[r], bs); - dst += stride; - } -} -intra_pred_allsizes(h) - -static INLINE void tm_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r, c; - int ytop_left = above[-1]; - - for (r = 0; r < bs; r++) { - for (c = 0; c < bs; c++) - dst[c] = clip_pixel(left[r] + above[c] - ytop_left); - dst += stride; - } -} -intra_pred_allsizes(tm) - -static INLINE void dc_128_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int r; - (void) above; - (void) left; - - for (r = 0; r < bs; r++) { - memset(dst, 128, bs); - dst += stride; - } -} -intra_pred_allsizes(dc_128) - -static INLINE void dc_left_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, - const uint8_t *left) { - int i, r, expected_dc, sum = 0; - (void) above; - - for (i = 0; i < bs; i++) - sum += left[i]; - expected_dc = (sum + (bs >> 1)) / bs; - - for (r = 0; r < bs; r++) { - memset(dst, expected_dc, bs); - dst += stride; - } -} -intra_pred_allsizes(dc_left) - -static INLINE void dc_top_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int i, r, expected_dc, sum = 0; - (void) left; - - for (i = 0; i < bs; i++) - sum += above[i]; - expected_dc = (sum + (bs >> 1)) / bs; - - for (r = 0; r < bs; r++) { - memset(dst, expected_dc, bs); - dst += stride; - } -} -intra_pred_allsizes(dc_top) - -static INLINE void dc_predictor(uint8_t *dst, ptrdiff_t stride, int bs, - const uint8_t *above, const uint8_t *left) { - int i, r, expected_dc, sum = 0; - const int count = 2 * bs; - - for (i = 0; i < bs; i++) { - sum += above[i]; - sum += left[i]; - } - - expected_dc = (sum + (count >> 1)) / count; - - for (r = 0; r < bs; r++) { - memset(dst, expected_dc, bs); - dst += stride; - } -} -intra_pred_allsizes(dc) -#undef intra_pred_allsizes - typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index f80d31ed7..758d10577 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -61,165 +61,6 @@ if ((vpx_config("HAVE_AVX2") eq "yes") && (vpx_config("HAVE_SSSE3") eq "yes")) { } # -# RECON -# -add_proto qw/void vp9_d207_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d207_predictor_4x4/, "$ssse3_x86inc"; - -add_proto qw/void vp9_d45_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d45_predictor_4x4 neon/, "$ssse3_x86inc"; - -add_proto qw/void vp9_d63_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d63_predictor_4x4/, "$ssse3_x86inc"; - -add_proto qw/void vp9_h_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_4x4 neon dspr2 msa/, "$ssse3_x86inc"; - -add_proto qw/void vp9_d117_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d117_predictor_4x4/; - -add_proto qw/void vp9_d135_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d135_predictor_4x4 neon/; - -add_proto qw/void vp9_d153_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d153_predictor_4x4/, "$ssse3_x86inc"; - -add_proto qw/void vp9_v_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_4x4 neon msa/, "$sse_x86inc"; - -add_proto qw/void vp9_tm_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_4x4 neon dspr2 msa/, "$sse_x86inc"; - -add_proto qw/void vp9_dc_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_predictor_4x4 dspr2 msa neon/, "$sse_x86inc"; - -add_proto qw/void vp9_dc_top_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_top_predictor_4x4 msa neon/, "$sse_x86inc"; - -add_proto qw/void vp9_dc_left_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_left_predictor_4x4 msa neon/, "$sse_x86inc"; - -add_proto qw/void vp9_dc_128_predictor_4x4/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_128_predictor_4x4 msa neon/, "$sse_x86inc"; - -add_proto qw/void vp9_d207_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d207_predictor_8x8/, "$ssse3_x86inc"; - -add_proto qw/void vp9_d45_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d45_predictor_8x8 neon/, "$ssse3_x86inc"; - -add_proto qw/void vp9_d63_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d63_predictor_8x8/, "$ssse3_x86inc"; - -add_proto qw/void vp9_h_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_8x8 neon dspr2 msa/, "$ssse3_x86inc"; - -add_proto qw/void vp9_d117_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d117_predictor_8x8/; - -add_proto qw/void vp9_d135_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d135_predictor_8x8/; - -add_proto qw/void vp9_d153_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d153_predictor_8x8/, "$ssse3_x86inc"; - -add_proto qw/void vp9_v_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_8x8 neon msa/, "$sse_x86inc"; - -add_proto qw/void vp9_tm_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_8x8 neon dspr2 msa/, "$sse2_x86inc"; - -add_proto qw/void vp9_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_predictor_8x8 dspr2 neon msa/, "$sse_x86inc"; - -add_proto qw/void vp9_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_top_predictor_8x8 neon msa/, "$sse_x86inc"; - -add_proto qw/void vp9_dc_left_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_left_predictor_8x8 neon msa/, "$sse_x86inc"; - -add_proto qw/void vp9_dc_128_predictor_8x8/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_128_predictor_8x8 neon msa/, "$sse_x86inc"; - -add_proto qw/void vp9_d207_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d207_predictor_16x16/, "$ssse3_x86inc"; - -add_proto qw/void vp9_d45_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d45_predictor_16x16 neon/, "$ssse3_x86inc"; - -add_proto qw/void vp9_d63_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d63_predictor_16x16/, "$ssse3_x86inc"; - -add_proto qw/void vp9_h_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_16x16 neon dspr2 msa/, "$ssse3_x86inc"; - -add_proto qw/void vp9_d117_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d117_predictor_16x16/; - -add_proto qw/void vp9_d135_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d135_predictor_16x16/; - -add_proto qw/void vp9_d153_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d153_predictor_16x16/, "$ssse3_x86inc"; - -add_proto qw/void vp9_v_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_16x16 neon msa/, "$sse2_x86inc"; - -add_proto qw/void vp9_tm_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_16x16 neon msa/, "$sse2_x86inc"; - -add_proto qw/void vp9_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_predictor_16x16 dspr2 neon msa/, "$sse2_x86inc"; - -add_proto qw/void vp9_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_top_predictor_16x16 neon msa/, "$sse2_x86inc"; - -add_proto qw/void vp9_dc_left_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_left_predictor_16x16 neon msa/, "$sse2_x86inc"; - -add_proto qw/void vp9_dc_128_predictor_16x16/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_128_predictor_16x16 neon msa/, "$sse2_x86inc"; - -add_proto qw/void vp9_d207_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d207_predictor_32x32/, "$ssse3_x86inc"; - -add_proto qw/void vp9_d45_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d45_predictor_32x32/, "$ssse3_x86inc"; - -add_proto qw/void vp9_d63_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d63_predictor_32x32/, "$ssse3_x86inc"; - -add_proto qw/void vp9_h_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_h_predictor_32x32 neon msa/, "$ssse3_x86inc"; - -add_proto qw/void vp9_d117_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d117_predictor_32x32/; - -add_proto qw/void vp9_d135_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d135_predictor_32x32/; - -add_proto qw/void vp9_d153_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_d153_predictor_32x32/, "$ssse3_x86inc"; - -add_proto qw/void vp9_v_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_v_predictor_32x32 neon msa/, "$sse2_x86inc"; - -add_proto qw/void vp9_tm_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_tm_predictor_32x32 neon msa/, "$sse2_x86_64_x86inc"; - -add_proto qw/void vp9_dc_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_predictor_32x32 msa neon/, "$sse2_x86inc"; - -add_proto qw/void vp9_dc_top_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_top_predictor_32x32 msa neon/, "$sse2_x86inc"; - -add_proto qw/void vp9_dc_left_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_left_predictor_32x32 msa neon/, "$sse2_x86inc"; - -add_proto qw/void vp9_dc_128_predictor_32x32/, "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left"; -specialize qw/vp9_dc_128_predictor_32x32 msa neon/, "$sse2_x86inc"; - -# # post proc # if (vpx_config("CONFIG_VP9_POSTPROC") eq "yes") { @@ -439,165 +280,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # High bitdepth functions if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # - # Intra prediction - # - add_proto qw/void vp9_highbd_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d207_predictor_4x4/; - - add_proto qw/void vp9_highbd_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d45_predictor_4x4/; - - add_proto qw/void vp9_highbd_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d63_predictor_4x4/; - - add_proto qw/void vp9_highbd_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_h_predictor_4x4/; - - add_proto qw/void vp9_highbd_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d117_predictor_4x4/; - - add_proto qw/void vp9_highbd_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d135_predictor_4x4/; - - add_proto qw/void vp9_highbd_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d153_predictor_4x4/; - - add_proto qw/void vp9_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_v_predictor_4x4/, "$sse_x86inc"; - - add_proto qw/void vp9_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_tm_predictor_4x4/, "$sse_x86inc"; - - add_proto qw/void vp9_highbd_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_dc_predictor_4x4/, "$sse_x86inc"; - - add_proto qw/void vp9_highbd_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_dc_top_predictor_4x4/; - - add_proto qw/void vp9_highbd_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_dc_left_predictor_4x4/; - - add_proto qw/void vp9_highbd_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_dc_128_predictor_4x4/; - - add_proto qw/void vp9_highbd_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d207_predictor_8x8/; - - add_proto qw/void vp9_highbd_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d45_predictor_8x8/; - - add_proto qw/void vp9_highbd_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d63_predictor_8x8/; - - add_proto qw/void vp9_highbd_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_h_predictor_8x8/; - - add_proto qw/void vp9_highbd_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d117_predictor_8x8/; - - add_proto qw/void vp9_highbd_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d135_predictor_8x8/; - - add_proto qw/void vp9_highbd_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d153_predictor_8x8/; - - add_proto qw/void vp9_highbd_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_v_predictor_8x8/, "$sse2_x86inc"; - - add_proto qw/void vp9_highbd_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_tm_predictor_8x8/, "$sse2_x86inc"; - - add_proto qw/void vp9_highbd_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_dc_predictor_8x8/, "$sse2_x86inc";; - - add_proto qw/void vp9_highbd_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_dc_top_predictor_8x8/; - - add_proto qw/void vp9_highbd_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_dc_left_predictor_8x8/; - - add_proto qw/void vp9_highbd_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_dc_128_predictor_8x8/; - - add_proto qw/void vp9_highbd_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d207_predictor_16x16/; - - add_proto qw/void vp9_highbd_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d45_predictor_16x16/; - - add_proto qw/void vp9_highbd_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d63_predictor_16x16/; - - add_proto qw/void vp9_highbd_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_h_predictor_16x16/; - - add_proto qw/void vp9_highbd_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d117_predictor_16x16/; - - add_proto qw/void vp9_highbd_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d135_predictor_16x16/; - - add_proto qw/void vp9_highbd_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d153_predictor_16x16/; - - add_proto qw/void vp9_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_v_predictor_16x16/, "$sse2_x86inc"; - - add_proto qw/void vp9_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_tm_predictor_16x16/, "$sse2_x86_64_x86inc"; - - add_proto qw/void vp9_highbd_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_dc_predictor_16x16/, "$sse2_x86inc"; - - add_proto qw/void vp9_highbd_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_dc_top_predictor_16x16/; - - add_proto qw/void vp9_highbd_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_dc_left_predictor_16x16/; - - add_proto qw/void vp9_highbd_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_dc_128_predictor_16x16/; - - add_proto qw/void vp9_highbd_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d207_predictor_32x32/; - - add_proto qw/void vp9_highbd_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d45_predictor_32x32/; - - add_proto qw/void vp9_highbd_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d63_predictor_32x32/; - - add_proto qw/void vp9_highbd_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_h_predictor_32x32/; - - add_proto qw/void vp9_highbd_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d117_predictor_32x32/; - - add_proto qw/void vp9_highbd_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d135_predictor_32x32/; - - add_proto qw/void vp9_highbd_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_d153_predictor_32x32/; - - add_proto qw/void vp9_highbd_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_v_predictor_32x32/, "$sse2_x86inc"; - - add_proto qw/void vp9_highbd_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_tm_predictor_32x32/, "$sse2_x86_64_x86inc"; - - add_proto qw/void vp9_highbd_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_dc_predictor_32x32/, "$sse2_x86_64_x86inc"; - - add_proto qw/void vp9_highbd_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_dc_top_predictor_32x32/; - - add_proto qw/void vp9_highbd_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_dc_left_predictor_32x32/; - - add_proto qw/void vp9_highbd_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd"; - specialize qw/vp9_highbd_dc_128_predictor_32x32/; - - # # Sub Pixel Filters # add_proto qw/void vp9_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps"; diff --git a/vp9/common/x86/vp9_high_intrapred_sse2.asm b/vp9/common/x86/vp9_high_intrapred_sse2.asm deleted file mode 100644 index b12d29c0a..000000000 --- a/vp9/common/x86/vp9_high_intrapred_sse2.asm +++ /dev/null @@ -1,476 +0,0 @@ -; -; Copyright (c) 2014 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_4: times 8 dw 4 -pw_8: times 8 dw 8 -pw_16: times 4 dd 16 -pw_32: times 4 dd 32 - -SECTION .text -INIT_MMX sse -cglobal highbd_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset - GET_GOT goffsetq - - movq m0, [aboveq] - movq m2, [leftq] - DEFINE_ARGS dst, stride, one - mov oned, 0x0001 - pxor m1, m1 - movd m3, oned - pshufw m3, m3, 0x0 - paddw m0, m2 - pmaddwd m0, m3 - packssdw m0, m1 - pmaddwd m0, m3 - paddw m0, [GLOBAL(pw_4)] - psraw m0, 3 - pshufw m0, m0, 0x0 - movq [dstq ], m0 - movq [dstq+strideq*2], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq*2], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal highbd_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [leftq] - DEFINE_ARGS dst, stride, stride3, one - mov oned, 0x00010001 - lea stride3q, [strideq*3] - movd m3, oned - pshufd m3, m3, 0x0 - paddw m0, m2 - pmaddwd m0, m3 - packssdw m0, m1 - pmaddwd m0, m3 - packssdw m0, m1 - pmaddwd m0, m3 - paddw m0, [GLOBAL(pw_8)] - psrlw m0, 4 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - mova [dstq ], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*4 ], m0 - mova [dstq+stride3q*2], m0 - lea dstq, [dstq+strideq*8] - mova [dstq ], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*4 ], m0 - mova [dstq+stride3q*2], m0 - - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal highbd_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m3, [aboveq+16] - mova m2, [leftq] - mova m4, [leftq+16] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - paddw m0, m2 - paddw m0, m3 - paddw m0, m4 - movhlps m2, m0 - paddw m0, m2 - punpcklwd m0, m1 - movhlps m2, m0 - paddd m0, m2 - punpckldq m0, m1 - movhlps m2, m0 - paddd m0, m2 - paddd m0, [GLOBAL(pw_16)] - psrad m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2 +16], m0 - mova [dstq+strideq*4 ], m0 - mova [dstq+strideq*4 +16], m0 - mova [dstq+stride3q*2 ], m0 - mova [dstq+stride3q*2+16], m0 - lea dstq, [dstq+strideq*8] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -%if ARCH_X86_64 -INIT_XMM sse2 -cglobal highbd_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [aboveq+16] - mova m3, [aboveq+32] - mova m4, [aboveq+48] - mova m5, [leftq] - mova m6, [leftq+16] - mova m7, [leftq+32] - mova m8, [leftq+48] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - paddw m0, m2 - paddw m0, m3 - paddw m0, m4 - paddw m0, m5 - paddw m0, m6 - paddw m0, m7 - paddw m0, m8 - movhlps m2, m0 - paddw m0, m2 - punpcklwd m0, m1 - movhlps m2, m0 - paddd m0, m2 - punpckldq m0, m1 - movhlps m2, m0 - paddd m0, m2 - paddd m0, [GLOBAL(pw_32)] - psrad m0, 6 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16 ], m0 - mova [dstq +32 ], m0 - mova [dstq +48 ], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16 ], m0 - mova [dstq+strideq*2+32 ], m0 - mova [dstq+strideq*2+48 ], m0 - mova [dstq+strideq*4 ], m0 - mova [dstq+strideq*4+16 ], m0 - mova [dstq+strideq*4+32 ], m0 - mova [dstq+strideq*4+48 ], m0 - mova [dstq+stride3q*2 ], m0 - mova [dstq+stride3q*2 +16], m0 - mova [dstq+stride3q*2 +32], m0 - mova [dstq+stride3q*2 +48], m0 - lea dstq, [dstq+strideq*8] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET -%endif - -INIT_MMX sse -cglobal highbd_v_predictor_4x4, 3, 3, 1, dst, stride, above - movq m0, [aboveq] - movq [dstq ], m0 - movq [dstq+strideq*2], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq*2], m0 - RET - -INIT_XMM sse2 -cglobal highbd_v_predictor_8x8, 3, 3, 1, dst, stride, above - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - mova [dstq ], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*4 ], m0 - mova [dstq+stride3q*2], m0 - lea dstq, [dstq+strideq*8] - mova [dstq ], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*4 ], m0 - mova [dstq+stride3q*2], m0 - RET - -INIT_XMM sse2 -cglobal highbd_v_predictor_16x16, 3, 4, 2, dst, stride, above - mova m0, [aboveq] - mova m1, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, nlines4 - lea stride3q, [strideq*3] - mov nlines4d, 4 -.loop: - mova [dstq ], m0 - mova [dstq +16], m1 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2 +16], m1 - mova [dstq+strideq*4 ], m0 - mova [dstq+strideq*4 +16], m1 - mova [dstq+stride3q*2 ], m0 - mova [dstq+stride3q*2+16], m1 - lea dstq, [dstq+strideq*8] - dec nlines4d - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal highbd_v_predictor_32x32, 3, 4, 4, dst, stride, above - mova m0, [aboveq] - mova m1, [aboveq+16] - mova m2, [aboveq+32] - mova m3, [aboveq+48] - DEFINE_ARGS dst, stride, stride3, nlines4 - lea stride3q, [strideq*3] - mov nlines4d, 8 -.loop: - mova [dstq ], m0 - mova [dstq +16], m1 - mova [dstq +32], m2 - mova [dstq +48], m3 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2 +16], m1 - mova [dstq+strideq*2 +32], m2 - mova [dstq+strideq*2 +48], m3 - mova [dstq+strideq*4 ], m0 - mova [dstq+strideq*4 +16], m1 - mova [dstq+strideq*4 +32], m2 - mova [dstq+strideq*4 +48], m3 - mova [dstq+stride3q*2 ], m0 - mova [dstq+stride3q*2 +16], m1 - mova [dstq+stride3q*2 +32], m2 - mova [dstq+stride3q*2 +48], m3 - lea dstq, [dstq+strideq*8] - dec nlines4d - jnz .loop - REP_RET - -INIT_MMX sse -cglobal highbd_tm_predictor_4x4, 5, 6, 5, dst, stride, above, left, bps, one - movd m1, [aboveq-2] - movq m0, [aboveq] - pshufw m1, m1, 0x0 - ; Get the values to compute the maximum value at this bit depth - mov oned, 1 - movd m3, oned - movd m4, bpsd - pshufw m3, m3, 0x0 - DEFINE_ARGS dst, stride, line, left - mov lineq, -2 - mova m2, m3 - psllw m3, m4 - add leftq, 8 - psubw m3, m2 ; max possible value - pxor m4, m4 ; min possible value - psubw m0, m1 -.loop: - movq m1, [leftq+lineq*4] - movq m2, [leftq+lineq*4+2] - pshufw m1, m1, 0x0 - pshufw m2, m2, 0x0 - paddw m1, m0 - paddw m2, m0 - ;Clamp to the bit-depth - pminsw m1, m3 - pminsw m2, m3 - pmaxsw m1, m4 - pmaxsw m2, m4 - ;Store the values - movq [dstq ], m1 - movq [dstq+strideq*2], m2 - lea dstq, [dstq+strideq*4] - inc lineq - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal highbd_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one - movd m1, [aboveq-2] - mova m0, [aboveq] - pshuflw m1, m1, 0x0 - ; Get the values to compute the maximum value at this bit depth - mov oned, 1 - pxor m3, m3 - pxor m4, m4 - pinsrw m3, oned, 0 - pinsrw m4, bpsd, 0 - pshuflw m3, m3, 0x0 - DEFINE_ARGS dst, stride, line, left - punpcklqdq m3, m3 - mov lineq, -4 - mova m2, m3 - punpcklqdq m1, m1 - psllw m3, m4 - add leftq, 16 - psubw m3, m2 ; max possible value - pxor m4, m4 ; min possible value - psubw m0, m1 -.loop: - movd m1, [leftq+lineq*4] - movd m2, [leftq+lineq*4+2] - pshuflw m1, m1, 0x0 - pshuflw m2, m2, 0x0 - punpcklqdq m1, m1 - punpcklqdq m2, m2 - paddw m1, m0 - paddw m2, m0 - ;Clamp to the bit-depth - pminsw m1, m3 - pminsw m2, m3 - pmaxsw m1, m4 - pmaxsw m2, m4 - ;Store the values - mova [dstq ], m1 - mova [dstq+strideq*2], m2 - lea dstq, [dstq+strideq*4] - inc lineq - jnz .loop - REP_RET - -%if ARCH_X86_64 -INIT_XMM sse2 -cglobal highbd_tm_predictor_16x16, 5, 6, 9, dst, stride, above, left, bps, one - movd m2, [aboveq-2] - mova m0, [aboveq] - mova m1, [aboveq+16] - pshuflw m2, m2, 0x0 - ; Get the values to compute the maximum value at this bit depth - mov oned, 1 - pxor m7, m7 - pxor m8, m8 - pinsrw m7, oned, 0 - pinsrw m8, bpsd, 0 - pshuflw m7, m7, 0x0 - DEFINE_ARGS dst, stride, line, left - punpcklqdq m7, m7 - mov lineq, -8 - mova m5, m7 - punpcklqdq m2, m2 - psllw m7, m8 - add leftq, 32 - psubw m7, m5 ; max possible value - pxor m8, m8 ; min possible value - psubw m0, m2 - psubw m1, m2 -.loop: - movd m2, [leftq+lineq*4] - movd m3, [leftq+lineq*4+2] - pshuflw m2, m2, 0x0 - pshuflw m3, m3, 0x0 - punpcklqdq m2, m2 - punpcklqdq m3, m3 - paddw m4, m2, m0 - paddw m5, m3, m0 - paddw m2, m1 - paddw m3, m1 - ;Clamp to the bit-depth - pminsw m4, m7 - pminsw m5, m7 - pminsw m2, m7 - pminsw m3, m7 - pmaxsw m4, m8 - pmaxsw m5, m8 - pmaxsw m2, m8 - pmaxsw m3, m8 - ;Store the values - mova [dstq ], m4 - mova [dstq+strideq*2 ], m5 - mova [dstq +16], m2 - mova [dstq+strideq*2+16], m3 - lea dstq, [dstq+strideq*4] - inc lineq - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal highbd_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one - movd m0, [aboveq-2] - mova m1, [aboveq] - mova m2, [aboveq+16] - mova m3, [aboveq+32] - mova m4, [aboveq+48] - pshuflw m0, m0, 0x0 - ; Get the values to compute the maximum value at this bit depth - mov oned, 1 - pxor m10, m10 - pxor m11, m11 - pinsrw m10, oned, 0 - pinsrw m11, bpsd, 0 - pshuflw m10, m10, 0x0 - DEFINE_ARGS dst, stride, line, left - punpcklqdq m10, m10 - mov lineq, -16 - mova m5, m10 - punpcklqdq m0, m0 - psllw m10, m11 - add leftq, 64 - psubw m10, m5 ; max possible value - pxor m11, m11 ; min possible value - psubw m1, m0 - psubw m2, m0 - psubw m3, m0 - psubw m4, m0 -.loop: - movd m5, [leftq+lineq*4] - movd m6, [leftq+lineq*4+2] - pshuflw m5, m5, 0x0 - pshuflw m6, m6, 0x0 - punpcklqdq m5, m5 - punpcklqdq m6, m6 - paddw m7, m5, m1 - paddw m8, m5, m2 - paddw m9, m5, m3 - paddw m5, m4 - ;Clamp these values to the bit-depth - pminsw m7, m10 - pminsw m8, m10 - pminsw m9, m10 - pminsw m5, m10 - pmaxsw m7, m11 - pmaxsw m8, m11 - pmaxsw m9, m11 - pmaxsw m5, m11 - ;Store these values - mova [dstq ], m7 - mova [dstq +16], m8 - mova [dstq +32], m9 - mova [dstq +48], m5 - paddw m7, m6, m1 - paddw m8, m6, m2 - paddw m9, m6, m3 - paddw m6, m4 - ;Clamp these values to the bit-depth - pminsw m7, m10 - pminsw m8, m10 - pminsw m9, m10 - pminsw m6, m10 - pmaxsw m7, m11 - pmaxsw m8, m11 - pmaxsw m9, m11 - pmaxsw m6, m11 - ;Store these values - mova [dstq+strideq*2 ], m7 - mova [dstq+strideq*2+16], m8 - mova [dstq+strideq*2+32], m9 - mova [dstq+strideq*2+48], m6 - lea dstq, [dstq+strideq*4] - inc lineq - jnz .loop - REP_RET -%endif diff --git a/vp9/common/x86/vp9_intrapred_sse2.asm b/vp9/common/x86/vp9_intrapred_sse2.asm deleted file mode 100644 index 22b573188..000000000 --- a/vp9/common/x86/vp9_intrapred_sse2.asm +++ /dev/null @@ -1,667 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA -pw_4: times 8 dw 4 -pw_8: times 8 dw 8 -pw_16: times 8 dw 16 -pw_32: times 8 dw 32 -dc_128: times 16 db 128 -pw2_4: times 8 dw 2 -pw2_8: times 8 dw 4 -pw2_16: times 8 dw 8 -pw2_32: times 8 dw 16 - -SECTION .text - -INIT_MMX sse -cglobal dc_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movd m0, [aboveq] - punpckldq m0, [leftq] - psadbw m0, m1 - paddw m0, [GLOBAL(pw_4)] - psraw m0, 3 - pshufw m0, m0, 0x0 - packuswb m0, m0 - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - - RESTORE_GOT - RET - -INIT_MMX sse -cglobal dc_left_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movd m0, [leftq] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_4)] - psraw m0, 2 - pshufw m0, m0, 0x0 - packuswb m0, m0 - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - - RESTORE_GOT - RET - -INIT_MMX sse -cglobal dc_top_predictor_4x4, 4, 5, 2, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movd m0, [aboveq] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_4)] - psraw m0, 2 - pshufw m0, m0, 0x0 - packuswb m0, m0 - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - - RESTORE_GOT - RET - -INIT_MMX sse -cglobal dc_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movq m0, [aboveq] - movq m2, [leftq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - paddw m0, [GLOBAL(pw_8)] - psraw m0, 4 - pshufw m0, m0, 0x0 - packuswb m0, m0 - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - - RESTORE_GOT - RET - -INIT_MMX sse -cglobal dc_top_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movq m0, [aboveq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_8)] - psraw m0, 3 - pshufw m0, m0, 0x0 - packuswb m0, m0 - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - - RESTORE_GOT - RET - -INIT_MMX sse -cglobal dc_left_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - movq m0, [leftq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - psadbw m0, m1 - paddw m0, [GLOBAL(pw2_8)] - psraw m0, 3 - pshufw m0, m0, 0x0 - packuswb m0, m0 - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - - RESTORE_GOT - RET - -INIT_MMX sse -cglobal dc_128_predictor_4x4, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - movd m0, [GLOBAL(dc_128)] - movd [dstq ], m0 - movd [dstq+strideq ], m0 - movd [dstq+strideq*2], m0 - movd [dstq+stride3q ], m0 - RESTORE_GOT - RET - -INIT_MMX sse -cglobal dc_128_predictor_8x8, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - movq m0, [GLOBAL(dc_128)] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - RESTORE_GOT - RET - -INIT_XMM sse2 -cglobal dc_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [leftq] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw_16)] - psraw m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - - -INIT_XMM sse2 -cglobal dc_top_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - pxor m2, m2 - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_16)] - psraw m0, 4 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_left_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - pxor m2, m2 - mova m0, [leftq] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_16)] - psraw m0, 4 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_128_predictor_16x16, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 4 - mova m0, [GLOBAL(dc_128)] -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - RESTORE_GOT - RET - - -INIT_XMM sse2 -cglobal dc_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [aboveq+16] - mova m3, [leftq] - mova m4, [leftq+16] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - psadbw m0, m1 - psadbw m2, m1 - psadbw m3, m1 - psadbw m4, m1 - paddw m0, m2 - paddw m0, m3 - paddw m0, m4 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw_32)] - psraw m0, 6 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_top_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [aboveq] - mova m2, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_32)] - psraw m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_left_predictor_32x32, 4, 5, 5, dst, stride, above, left, goffset - GET_GOT goffsetq - - pxor m1, m1 - mova m0, [leftq] - mova m2, [leftq+16] - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - psadbw m0, m1 - psadbw m2, m1 - paddw m0, m2 - movhlps m2, m0 - paddw m0, m2 - paddw m0, [GLOBAL(pw2_32)] - psraw m0, 5 - pshuflw m0, m0, 0x0 - punpcklqdq m0, m0 - packuswb m0, m0 -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - - RESTORE_GOT - REP_RET - -INIT_XMM sse2 -cglobal dc_128_predictor_32x32, 4, 5, 3, dst, stride, above, left, goffset - GET_GOT goffsetq - - DEFINE_ARGS dst, stride, stride3, lines4 - lea stride3q, [strideq*3] - mov lines4d, 8 - mova m0, [GLOBAL(dc_128)] -.loop: - mova [dstq ], m0 - mova [dstq +16], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m0 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m0 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m0 - lea dstq, [dstq+strideq*4] - dec lines4d - jnz .loop - RESTORE_GOT - RET - -INIT_MMX sse -cglobal v_predictor_4x4, 3, 3, 1, dst, stride, above - movd m0, [aboveq] - movd [dstq ], m0 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - movd [dstq ], m0 - movd [dstq+strideq], m0 - RET - -INIT_MMX sse -cglobal v_predictor_8x8, 3, 3, 1, dst, stride, above - movq m0, [aboveq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq ], m0 - movq [dstq+strideq ], m0 - movq [dstq+strideq*2], m0 - movq [dstq+stride3q ], m0 - RET - -INIT_XMM sse2 -cglobal v_predictor_16x16, 3, 4, 1, dst, stride, above - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3, nlines4 - lea stride3q, [strideq*3] - mov nlines4d, 4 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m0 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - dec nlines4d - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal v_predictor_32x32, 3, 4, 2, dst, stride, above - mova m0, [aboveq] - mova m1, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, nlines4 - lea stride3q, [strideq*3] - mov nlines4d, 8 -.loop: - mova [dstq ], m0 - mova [dstq +16], m1 - mova [dstq+strideq ], m0 - mova [dstq+strideq +16], m1 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m1 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q +16], m1 - lea dstq, [dstq+strideq*4] - dec nlines4d - jnz .loop - REP_RET - -INIT_MMX sse -cglobal tm_predictor_4x4, 4, 4, 4, dst, stride, above, left - pxor m1, m1 - movd m2, [aboveq-1] - movd m0, [aboveq] - punpcklbw m2, m1 - punpcklbw m0, m1 - pshufw m2, m2, 0x0 - DEFINE_ARGS dst, stride, line, left - mov lineq, -2 - add leftq, 4 - psubw m0, m2 -.loop: - movd m2, [leftq+lineq*2] - movd m3, [leftq+lineq*2+1] - punpcklbw m2, m1 - punpcklbw m3, m1 - pshufw m2, m2, 0x0 - pshufw m3, m3, 0x0 - paddw m2, m0 - paddw m3, m0 - packuswb m2, m2 - packuswb m3, m3 - movd [dstq ], m2 - movd [dstq+strideq], m3 - lea dstq, [dstq+strideq*2] - inc lineq - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal tm_predictor_8x8, 4, 4, 4, dst, stride, above, left - pxor m1, m1 - movd m2, [aboveq-1] - movq m0, [aboveq] - punpcklbw m2, m1 - punpcklbw m0, m1 - pshuflw m2, m2, 0x0 - DEFINE_ARGS dst, stride, line, left - mov lineq, -4 - punpcklqdq m2, m2 - add leftq, 8 - psubw m0, m2 -.loop: - movd m2, [leftq+lineq*2] - movd m3, [leftq+lineq*2+1] - punpcklbw m2, m1 - punpcklbw m3, m1 - pshuflw m2, m2, 0x0 - pshuflw m3, m3, 0x0 - punpcklqdq m2, m2 - punpcklqdq m3, m3 - paddw m2, m0 - paddw m3, m0 - packuswb m2, m3 - movq [dstq ], m2 - movhps [dstq+strideq], m2 - lea dstq, [dstq+strideq*2] - inc lineq - jnz .loop - REP_RET - -INIT_XMM sse2 -cglobal tm_predictor_16x16, 4, 4, 7, dst, stride, above, left - pxor m1, m1 - movd m2, [aboveq-1] - mova m0, [aboveq] - punpcklbw m2, m1 - punpckhbw m4, m0, m1 - punpcklbw m0, m1 - pshuflw m2, m2, 0x0 - DEFINE_ARGS dst, stride, line, left - mov lineq, -8 - punpcklqdq m2, m2 - add leftq, 16 - psubw m0, m2 - psubw m4, m2 -.loop: - movd m2, [leftq+lineq*2] - movd m3, [leftq+lineq*2+1] - punpcklbw m2, m1 - punpcklbw m3, m1 - pshuflw m2, m2, 0x0 - pshuflw m3, m3, 0x0 - punpcklqdq m2, m2 - punpcklqdq m3, m3 - paddw m5, m2, m0 - paddw m6, m3, m0 - paddw m2, m4 - paddw m3, m4 - packuswb m5, m2 - packuswb m6, m3 - mova [dstq ], m5 - mova [dstq+strideq], m6 - lea dstq, [dstq+strideq*2] - inc lineq - jnz .loop - REP_RET - -%if ARCH_X86_64 -INIT_XMM sse2 -cglobal tm_predictor_32x32, 4, 4, 10, dst, stride, above, left - pxor m1, m1 - movd m2, [aboveq-1] - mova m0, [aboveq] - mova m4, [aboveq+16] - punpcklbw m2, m1 - punpckhbw m3, m0, m1 - punpckhbw m5, m4, m1 - punpcklbw m0, m1 - punpcklbw m4, m1 - pshuflw m2, m2, 0x0 - DEFINE_ARGS dst, stride, line, left - mov lineq, -16 - punpcklqdq m2, m2 - add leftq, 32 - psubw m0, m2 - psubw m3, m2 - psubw m4, m2 - psubw m5, m2 -.loop: - movd m2, [leftq+lineq*2] - movd m6, [leftq+lineq*2+1] - punpcklbw m2, m1 - punpcklbw m6, m1 - pshuflw m2, m2, 0x0 - pshuflw m6, m6, 0x0 - punpcklqdq m2, m2 - punpcklqdq m6, m6 - paddw m7, m2, m0 - paddw m8, m2, m3 - paddw m9, m2, m4 - paddw m2, m5 - packuswb m7, m8 - packuswb m9, m2 - paddw m2, m6, m0 - paddw m8, m6, m3 - mova [dstq ], m7 - paddw m7, m6, m4 - paddw m6, m5 - mova [dstq +16], m9 - packuswb m2, m8 - packuswb m7, m6 - mova [dstq+strideq ], m2 - mova [dstq+strideq+16], m7 - lea dstq, [dstq+strideq*2] - inc lineq - jnz .loop - REP_RET -%endif diff --git a/vp9/common/x86/vp9_intrapred_ssse3.asm b/vp9/common/x86/vp9_intrapred_ssse3.asm deleted file mode 100644 index 88df9b2d1..000000000 --- a/vp9/common/x86/vp9_intrapred_ssse3.asm +++ /dev/null @@ -1,1036 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - -%include "third_party/x86inc/x86inc.asm" - -SECTION_RODATA - -pb_1: times 16 db 1 -sh_b01234577: db 0, 1, 2, 3, 4, 5, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b12345677: db 1, 2, 3, 4, 5, 6, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b23456777: db 2, 3, 4, 5, 6, 7, 7, 7, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b0123456777777777: db 0, 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7 -sh_b1234567777777777: db 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 -sh_b2345677777777777: db 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7 -sh_b123456789abcdeff: db 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15 -sh_b23456789abcdefff: db 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15 -sh_b32104567: db 3, 2, 1, 0, 4, 5, 6, 7, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b8091a2b345: db 8, 0, 9, 1, 10, 2, 11, 3, 4, 5, 0, 0, 0, 0, 0, 0 -sh_b76543210: db 7, 6, 5, 4, 3, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b65432108: db 6, 5, 4, 3, 2, 1, 0, 8, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b54321089: db 5, 4, 3, 2, 1, 0, 8, 9, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b89abcdef: db 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0 -sh_bfedcba9876543210: db 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 -sh_b1233: db 1, 2, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 -sh_b2333: db 2, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - -SECTION .text - -INIT_MMX ssse3 -cglobal h_predictor_4x4, 2, 4, 3, dst, stride, line, left - movifnidn leftq, leftmp - add leftq, 4 - mov lineq, -2 - pxor m0, m0 -.loop: - movd m1, [leftq+lineq*2 ] - movd m2, [leftq+lineq*2+1] - pshufb m1, m0 - pshufb m2, m0 - movd [dstq ], m1 - movd [dstq+strideq], m2 - lea dstq, [dstq+strideq*2] - inc lineq - jnz .loop - REP_RET - -INIT_MMX ssse3 -cglobal h_predictor_8x8, 2, 4, 3, dst, stride, line, left - movifnidn leftq, leftmp - add leftq, 8 - mov lineq, -4 - pxor m0, m0 -.loop: - movd m1, [leftq+lineq*2 ] - movd m2, [leftq+lineq*2+1] - pshufb m1, m0 - pshufb m2, m0 - movq [dstq ], m1 - movq [dstq+strideq], m2 - lea dstq, [dstq+strideq*2] - inc lineq - jnz .loop - REP_RET - -INIT_XMM ssse3 -cglobal h_predictor_16x16, 2, 4, 3, dst, stride, line, left - movifnidn leftq, leftmp - add leftq, 16 - mov lineq, -8 - pxor m0, m0 -.loop: - movd m1, [leftq+lineq*2 ] - movd m2, [leftq+lineq*2+1] - pshufb m1, m0 - pshufb m2, m0 - mova [dstq ], m1 - mova [dstq+strideq], m2 - lea dstq, [dstq+strideq*2] - inc lineq - jnz .loop - REP_RET - -INIT_XMM ssse3 -cglobal h_predictor_32x32, 2, 4, 3, dst, stride, line, left - movifnidn leftq, leftmp - add leftq, 32 - mov lineq, -16 - pxor m0, m0 -.loop: - movd m1, [leftq+lineq*2 ] - movd m2, [leftq+lineq*2+1] - pshufb m1, m0 - pshufb m2, m0 - mova [dstq ], m1 - mova [dstq +16], m1 - mova [dstq+strideq ], m2 - mova [dstq+strideq+16], m2 - lea dstq, [dstq+strideq*2] - inc lineq - jnz .loop - REP_RET - -INIT_MMX ssse3 -cglobal d45_predictor_4x4, 3, 4, 4, dst, stride, above, goffset - GET_GOT goffsetq - - movq m0, [aboveq] - pshufb m2, m0, [GLOBAL(sh_b23456777)] - pshufb m1, m0, [GLOBAL(sh_b01234577)] - pshufb m0, [GLOBAL(sh_b12345677)] - pavgb m3, m2, m1 - pxor m2, m1 - pand m2, [GLOBAL(pb_1)] - psubb m3, m2 - pavgb m0, m3 - - ; store 4 lines - movd [dstq ], m0 - psrlq m0, 8 - movd [dstq+strideq], m0 - lea dstq, [dstq+strideq*2] - psrlq m0, 8 - movd [dstq ], m0 - psrlq m0, 8 - movd [dstq+strideq], m0 - - RESTORE_GOT - RET - -INIT_MMX ssse3 -cglobal d45_predictor_8x8, 3, 4, 4, dst, stride, above, goffset - GET_GOT goffsetq - - movq m0, [aboveq] - mova m1, [GLOBAL(sh_b12345677)] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - pshufb m2, m0, [GLOBAL(sh_b23456777)] - pavgb m3, m2, m0 - pxor m2, m0 - pshufb m0, m1 - pand m2, [GLOBAL(pb_1)] - psubb m3, m2 - pavgb m0, m3 - - ; store 4 lines - movq [dstq ], m0 - pshufb m0, m1 - movq [dstq+strideq ], m0 - pshufb m0, m1 - movq [dstq+strideq*2], m0 - pshufb m0, m1 - movq [dstq+stride3q ], m0 - pshufb m0, m1 - lea dstq, [dstq+strideq*4] - - ; store next 4 lines - movq [dstq ], m0 - pshufb m0, m1 - movq [dstq+strideq ], m0 - pshufb m0, m1 - movq [dstq+strideq*2], m0 - pshufb m0, m1 - movq [dstq+stride3q ], m0 - - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d45_predictor_16x16, 3, 6, 4, dst, stride, above, dst8, line, goffset - GET_GOT goffsetq - - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3, dst8, line - lea stride3q, [strideq*3] - lea dst8q, [dstq+strideq*8] - mova m1, [GLOBAL(sh_b123456789abcdeff)] - pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] - pavgb m3, m2, m0 - pxor m2, m0 - pshufb m0, m1 - pand m2, [GLOBAL(pb_1)] - psubb m3, m2 - pavgb m0, m3 - - ; first 4 lines and first half of 3rd 4 lines - mov lined, 2 -.loop: - mova [dstq ], m0 - movhps [dst8q ], m0 - pshufb m0, m1 - mova [dstq +strideq ], m0 - movhps [dst8q+strideq ], m0 - pshufb m0, m1 - mova [dstq +strideq*2 ], m0 - movhps [dst8q+strideq*2 ], m0 - pshufb m0, m1 - mova [dstq +stride3q ], m0 - movhps [dst8q+stride3q ], m0 - pshufb m0, m1 - lea dstq, [dstq +strideq*4] - lea dst8q, [dst8q+strideq*4] - dec lined - jnz .loop - - ; bottom-right 8x8 block - movhps [dstq +8], m0 - movhps [dstq+strideq +8], m0 - movhps [dstq+strideq*2+8], m0 - movhps [dstq+stride3q +8], m0 - lea dstq, [dstq+strideq*4] - movhps [dstq +8], m0 - movhps [dstq+strideq +8], m0 - movhps [dstq+strideq*2+8], m0 - movhps [dstq+stride3q +8], m0 - - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d45_predictor_32x32, 3, 6, 7, dst, stride, above, dst16, line, goffset - GET_GOT goffsetq - - mova m0, [aboveq] - mova m4, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, dst16, line - lea stride3q, [strideq*3] - lea dst16q, [dstq +strideq*8] - lea dst16q, [dst16q+strideq*8] - mova m1, [GLOBAL(sh_b123456789abcdeff)] - pshufb m2, m4, [GLOBAL(sh_b23456789abcdefff)] - pavgb m3, m2, m4 - pxor m2, m4 - palignr m5, m4, m0, 1 - palignr m6, m4, m0, 2 - pshufb m4, m1 - pand m2, [GLOBAL(pb_1)] - psubb m3, m2 - pavgb m4, m3 - pavgb m3, m0, m6 - pxor m0, m6 - pand m0, [GLOBAL(pb_1)] - psubb m3, m0 - pavgb m5, m3 - - ; write 4x4 lines (and the first half of the second 4x4 lines) - mov lined, 4 -.loop: - mova [dstq ], m5 - mova [dstq +16], m4 - mova [dst16q ], m4 - palignr m3, m4, m5, 1 - pshufb m4, m1 - mova [dstq +strideq ], m3 - mova [dstq +strideq +16], m4 - mova [dst16q+strideq ], m4 - palignr m5, m4, m3, 1 - pshufb m4, m1 - mova [dstq +strideq*2 ], m5 - mova [dstq +strideq*2+16], m4 - mova [dst16q+strideq*2 ], m4 - palignr m3, m4, m5, 1 - pshufb m4, m1 - mova [dstq +stride3q ], m3 - mova [dstq +stride3q +16], m4 - mova [dst16q+stride3q ], m4 - palignr m5, m4, m3, 1 - pshufb m4, m1 - lea dstq, [dstq +strideq*4] - lea dst16q, [dst16q+strideq*4] - dec lined - jnz .loop - - ; write second half of second 4x4 lines - mova [dstq +16], m4 - mova [dstq +strideq +16], m4 - mova [dstq +strideq*2+16], m4 - mova [dstq +stride3q +16], m4 - lea dstq, [dstq +strideq*4] - mova [dstq +16], m4 - mova [dstq +strideq +16], m4 - mova [dstq +strideq*2+16], m4 - mova [dstq +stride3q +16], m4 - lea dstq, [dstq +strideq*4] - mova [dstq +16], m4 - mova [dstq +strideq +16], m4 - mova [dstq +strideq*2+16], m4 - mova [dstq +stride3q +16], m4 - lea dstq, [dstq +strideq*4] - mova [dstq +16], m4 - mova [dstq +strideq +16], m4 - mova [dstq +strideq*2+16], m4 - mova [dstq +stride3q +16], m4 - - RESTORE_GOT - RET - -; ------------------------------------------ -; input: x, y, z, result -; -; trick from pascal -; (x+2y+z+2)>>2 can be calculated as: -; result = avg(x,z) -; result -= xor(x,z) & 1 -; result = avg(result,y) -; ------------------------------------------ -%macro X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 4 - pavgb %4, %1, %3 - pxor %3, %1 - pand %3, [GLOBAL(pb_1)] - psubb %4, %3 - pavgb %4, %2 -%endmacro - -INIT_XMM ssse3 -cglobal d63_predictor_4x4, 3, 4, 5, dst, stride, above, goffset - GET_GOT goffsetq - - movq m3, [aboveq] - pshufb m1, m3, [GLOBAL(sh_b23456777)] - pshufb m2, m3, [GLOBAL(sh_b12345677)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m2, m1, m4 - pavgb m3, m2 - - ; store 4 lines - movd [dstq ], m3 - movd [dstq+strideq], m4 - lea dstq, [dstq+strideq*2] - psrldq m3, 1 - psrldq m4, 1 - movd [dstq ], m3 - movd [dstq+strideq], m4 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d63_predictor_8x8, 3, 4, 5, dst, stride, above, goffset - GET_GOT goffsetq - - movq m3, [aboveq] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] - pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] - pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] - pshufb m3, [GLOBAL(sh_b0123456777777777)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m4 - pavgb m3, m2 - - ; store 4 lines - movq [dstq ], m3 - movq [dstq+strideq], m4 - psrldq m3, 1 - psrldq m4, 1 - movq [dstq+strideq*2], m3 - movq [dstq+stride3q ], m4 - lea dstq, [dstq+strideq*4] - psrldq m3, 1 - psrldq m4, 1 - - ; store 4 lines - movq [dstq ], m3 - movq [dstq+strideq], m4 - psrldq m3, 1 - psrldq m4, 1 - movq [dstq+strideq*2], m3 - movq [dstq+stride3q ], m4 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d63_predictor_16x16, 3, 5, 5, dst, stride, above, line, goffset - GET_GOT goffsetq - - mova m0, [aboveq] - DEFINE_ARGS dst, stride, stride3, line - lea stride3q, [strideq*3] - mova m1, [GLOBAL(sh_b123456789abcdeff)] - pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] - pshufb m3, m0, m1 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m3, m2, m4 - pavgb m0, m3 - - mov lined, 4 -.loop: - mova [dstq ], m0 - mova [dstq+strideq ], m4 - pshufb m0, m1 - pshufb m4, m1 - mova [dstq+strideq*2], m0 - mova [dstq+stride3q ], m4 - pshufb m0, m1 - pshufb m4, m1 - lea dstq, [dstq+strideq*4] - dec lined - jnz .loop - RESTORE_GOT - REP_RET - -INIT_XMM ssse3 -cglobal d63_predictor_32x32, 3, 5, 8, dst, stride, above, line, goffset - GET_GOT goffsetq - - mova m0, [aboveq] - mova m7, [aboveq+16] - DEFINE_ARGS dst, stride, stride3, line - mova m1, [GLOBAL(sh_b123456789abcdeff)] - lea stride3q, [strideq*3] - pshufb m2, m7, [GLOBAL(sh_b23456789abcdefff)] - pshufb m3, m7, m1 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m2, m4 - palignr m6, m7, m0, 1 - palignr m5, m7, m0, 2 - pavgb m7, m3 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m6, m5, m2 - pavgb m0, m6 - - mov lined, 8 -.loop: - mova [dstq ], m0 - mova [dstq +16], m7 - mova [dstq+strideq ], m2 - mova [dstq+strideq +16], m4 - palignr m3, m7, m0, 1 - palignr m5, m4, m2, 1 - pshufb m7, m1 - pshufb m4, m1 - - mova [dstq+strideq*2 ], m3 - mova [dstq+strideq*2+16], m7 - mova [dstq+stride3q ], m5 - mova [dstq+stride3q +16], m4 - palignr m0, m7, m3, 1 - palignr m2, m4, m5, 1 - pshufb m7, m1 - pshufb m4, m1 - lea dstq, [dstq+strideq*4] - dec lined - jnz .loop - RESTORE_GOT - REP_RET - -INIT_XMM ssse3 -cglobal d153_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset - GET_GOT goffsetq - movd m0, [leftq] ; l1, l2, l3, l4 - movd m1, [aboveq-1] ; tl, t1, t2, t3 - punpckldq m0, m1 ; l1, l2, l3, l4, tl, t1, t2, t3 - pshufb m0, [GLOBAL(sh_b32104567)]; l4, l3, l2, l1, tl, t1, t2, t3 - psrldq m1, m0, 1 ; l3, l2, l1, tl, t1, t2, t3 - psrldq m2, m0, 2 ; l2, l1, tl, t1, t2, t3 - ; comments below are for a predictor like this - ; A1 B1 C1 D1 - ; A2 B2 A1 B1 - ; A3 B3 A2 B2 - ; A4 B4 A3 B3 - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 ; 3-tap avg B4 B3 B2 B1 C1 D1 - pavgb m1, m0 ; 2-tap avg A4 A3 A2 A1 - - punpcklqdq m3, m1 ; B4 B3 B2 B1 C1 D1 x x A4 A3 A2 A1 .. - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - pshufb m3, [GLOBAL(sh_b8091a2b345)] ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 .. - movd [dstq+stride3q ], m3 - psrldq m3, 2 ; A3 B3 A2 B2 A1 B1 C1 D1 .. - movd [dstq+strideq*2], m3 - psrldq m3, 2 ; A2 B2 A1 B1 C1 D1 .. - movd [dstq+strideq ], m3 - psrldq m3, 2 ; A1 B1 C1 D1 .. - movd [dstq ], m3 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d153_predictor_8x8, 4, 5, 8, dst, stride, above, left, goffset - GET_GOT goffsetq - movq m0, [leftq] ; [0- 7] l1-8 [byte] - movhps m0, [aboveq-1] ; [8-15] tl, t1-7 [byte] - pshufb m1, m0, [GLOBAL(sh_b76543210)] ; l8-1 [word] - pshufb m2, m0, [GLOBAL(sh_b65432108)] ; l7-1,tl [word] - pshufb m3, m0, [GLOBAL(sh_b54321089)] ; l6-1,tl,t1 [word] - pshufb m0, [GLOBAL(sh_b89abcdef)] ; tl,t1-7 [word] - psrldq m4, m0, 1 ; t1-7 [word] - psrldq m5, m0, 2 ; t2-7 [word] - ; comments below are for a predictor like this - ; A1 B1 C1 D1 E1 F1 G1 H1 - ; A2 B2 A1 B1 C1 D1 E1 F1 - ; A3 B3 A2 B2 A1 B1 C1 D1 - ; A4 B4 A3 B3 A2 B2 A1 B1 - ; A5 B5 A4 B4 A3 B3 A2 B2 - ; A6 B6 A5 B5 A4 B4 A3 B3 - ; A7 B7 A6 B6 A5 B5 A4 B4 - ; A8 B8 A7 B7 A6 B6 A5 B5 - pavgb m6, m1, m2 ; 2-tap avg A8-A1 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m4, m5, m7 ; 3-tap avg C-H1 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m2, m3, m0 ; 3-tap avg B8-1 - - punpcklbw m6, m0 ; A-B8, A-B7 ... A-B2, A-B1 - - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - - movhps [dstq+stride3q], m6 ; A-B4, A-B3, A-B2, A-B1 - palignr m0, m7, m6, 10 ; A-B3, A-B2, A-B1, C-H1 - movq [dstq+strideq*2], m0 - psrldq m0, 2 ; A-B2, A-B1, C-H1 - movq [dstq+strideq ], m0 - psrldq m0, 2 ; A-H1 - movq [dstq ], m0 - lea dstq, [dstq+strideq*4] - movq [dstq+stride3q ], m6 ; A-B8, A-B7, A-B6, A-B5 - psrldq m6, 2 ; A-B7, A-B6, A-B5, A-B4 - movq [dstq+strideq*2], m6 - psrldq m6, 2 ; A-B6, A-B5, A-B4, A-B3 - movq [dstq+strideq ], m6 - psrldq m6, 2 ; A-B5, A-B4, A-B3, A-B2 - movq [dstq ], m6 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d153_predictor_16x16, 4, 5, 8, dst, stride, above, left, goffset - GET_GOT goffsetq - mova m0, [leftq] - movu m7, [aboveq-1] - ; comments below are for a predictor like this - ; A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 O1 P1 - ; A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 M1 N1 - ; A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 K1 L1 - ; A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 I1 J1 - ; A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 G1 H1 - ; A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 E1 F1 - ; A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 C1 D1 - ; A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 A1 B1 - ; A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 A2 B2 - ; Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 A3 B3 - ; Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 A4 B4 - ; Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 A5 B5 - ; Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 A6 B6 - ; Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 A7 B7 - ; Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 A8 B8 - ; Ag Bg Af Bf Ae Be Ad Bd Ac Bc Ab Bb Aa Ba A9 B9 - pshufb m6, m7, [GLOBAL(sh_bfedcba9876543210)] - palignr m5, m0, m6, 15 - palignr m3, m0, m6, 14 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg - pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] - pavgb m5, m0 ; A1 - Ag - - punpcklbw m0, m4, m5 ; A-B8 ... A-B1 - punpckhbw m4, m5 ; A-B9 ... A-Bg - - pshufb m3, m7, [GLOBAL(sh_b123456789abcdeff)] - pshufb m5, m7, [GLOBAL(sh_b23456789abcdefff)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg C1-P1 - - pshufb m6, m0, [GLOBAL(sh_bfedcba9876543210)] - DEFINE_ARGS dst, stride, stride3 - lea stride3q, [strideq*3] - palignr m2, m1, m6, 14 - mova [dstq ], m2 - palignr m2, m1, m6, 12 - mova [dstq+strideq ], m2 - palignr m2, m1, m6, 10 - mova [dstq+strideq*2], m2 - palignr m2, m1, m6, 8 - mova [dstq+stride3q ], m2 - lea dstq, [dstq+strideq*4] - palignr m2, m1, m6, 6 - mova [dstq ], m2 - palignr m2, m1, m6, 4 - mova [dstq+strideq ], m2 - palignr m2, m1, m6, 2 - mova [dstq+strideq*2], m2 - pshufb m4, [GLOBAL(sh_bfedcba9876543210)] - mova [dstq+stride3q ], m6 - lea dstq, [dstq+strideq*4] - - palignr m2, m6, m4, 14 - mova [dstq ], m2 - palignr m2, m6, m4, 12 - mova [dstq+strideq ], m2 - palignr m2, m6, m4, 10 - mova [dstq+strideq*2], m2 - palignr m2, m6, m4, 8 - mova [dstq+stride3q ], m2 - lea dstq, [dstq+strideq*4] - palignr m2, m6, m4, 6 - mova [dstq ], m2 - palignr m2, m6, m4, 4 - mova [dstq+strideq ], m2 - palignr m2, m6, m4, 2 - mova [dstq+strideq*2], m2 - mova [dstq+stride3q ], m4 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d153_predictor_32x32, 4, 5, 8, dst, stride, above, left, goffset - GET_GOT goffsetq - mova m0, [leftq] - movu m7, [aboveq-1] - movu m1, [aboveq+15] - - pshufb m4, m1, [GLOBAL(sh_b123456789abcdeff)] - pshufb m6, m1, [GLOBAL(sh_b23456789abcdefff)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m4, m6, m2 ; 3-tap avg above [high] - - palignr m3, m1, m7, 1 - palignr m5, m1, m7, 2 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m7, m3, m5, m1 ; 3-tap avg above [low] - - pshufb m7, [GLOBAL(sh_bfedcba9876543210)] - palignr m5, m0, m7, 15 - palignr m3, m0, m7, 14 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m5, m3, m4 ; 3-tap avg B3-Bg - pavgb m5, m0 ; A1 - Ag - punpcklbw m6, m4, m5 ; A-B8 ... A-B1 - punpckhbw m4, m5 ; A-B9 ... A-Bg - pshufb m6, [GLOBAL(sh_bfedcba9876543210)] - pshufb m4, [GLOBAL(sh_bfedcba9876543210)] - - DEFINE_ARGS dst, stride, stride3, left, line - lea stride3q, [strideq*3] - - palignr m5, m2, m1, 14 - palignr m7, m1, m6, 14 - mova [dstq ], m7 - mova [dstq+16 ], m5 - palignr m5, m2, m1, 12 - palignr m7, m1, m6, 12 - mova [dstq+strideq ], m7 - mova [dstq+strideq+16 ], m5 - palignr m5, m2, m1, 10 - palignr m7, m1, m6, 10 - mova [dstq+strideq*2 ], m7 - mova [dstq+strideq*2+16], m5 - palignr m5, m2, m1, 8 - palignr m7, m1, m6, 8 - mova [dstq+stride3q ], m7 - mova [dstq+stride3q+16 ], m5 - lea dstq, [dstq+strideq*4] - palignr m5, m2, m1, 6 - palignr m7, m1, m6, 6 - mova [dstq ], m7 - mova [dstq+16 ], m5 - palignr m5, m2, m1, 4 - palignr m7, m1, m6, 4 - mova [dstq+strideq ], m7 - mova [dstq+strideq+16 ], m5 - palignr m5, m2, m1, 2 - palignr m7, m1, m6, 2 - mova [dstq+strideq*2 ], m7 - mova [dstq+strideq*2+16], m5 - mova [dstq+stride3q ], m6 - mova [dstq+stride3q+16 ], m1 - lea dstq, [dstq+strideq*4] - - palignr m5, m1, m6, 14 - palignr m3, m6, m4, 14 - mova [dstq ], m3 - mova [dstq+16 ], m5 - palignr m5, m1, m6, 12 - palignr m3, m6, m4, 12 - mova [dstq+strideq ], m3 - mova [dstq+strideq+16 ], m5 - palignr m5, m1, m6, 10 - palignr m3, m6, m4, 10 - mova [dstq+strideq*2 ], m3 - mova [dstq+strideq*2+16], m5 - palignr m5, m1, m6, 8 - palignr m3, m6, m4, 8 - mova [dstq+stride3q ], m3 - mova [dstq+stride3q+16 ], m5 - lea dstq, [dstq+strideq*4] - palignr m5, m1, m6, 6 - palignr m3, m6, m4, 6 - mova [dstq ], m3 - mova [dstq+16 ], m5 - palignr m5, m1, m6, 4 - palignr m3, m6, m4, 4 - mova [dstq+strideq ], m3 - mova [dstq+strideq+16 ], m5 - palignr m5, m1, m6, 2 - palignr m3, m6, m4, 2 - mova [dstq+strideq*2 ], m3 - mova [dstq+strideq*2+16], m5 - mova [dstq+stride3q ], m4 - mova [dstq+stride3q+16 ], m6 - lea dstq, [dstq+strideq*4] - - mova m7, [leftq] - mova m3, [leftq+16] - palignr m5, m3, m7, 15 - palignr m0, m3, m7, 14 - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m3, m5, m0, m2 ; 3-tap avg Bh - - pavgb m5, m3 ; Ah - - punpcklbw m3, m2, m5 ; A-B8 ... A-B1 - punpckhbw m2, m5 ; A-B9 ... A-Bg - pshufb m3, [GLOBAL(sh_bfedcba9876543210)] - pshufb m2, [GLOBAL(sh_bfedcba9876543210)] - - palignr m7, m6, m4, 14 - palignr m0, m4, m3, 14 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m6, m4, 12 - palignr m0, m4, m3, 12 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m6, m4, 10 - palignr m0, m4, m3, 10 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - palignr m7, m6, m4, 8 - palignr m0, m4, m3, 8 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q+16 ], m7 - lea dstq, [dstq+strideq*4] - palignr m7, m6, m4, 6 - palignr m0, m4, m3, 6 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m6, m4, 4 - palignr m0, m4, m3, 4 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m6, m4, 2 - palignr m0, m4, m3, 2 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - mova [dstq+stride3q ], m3 - mova [dstq+stride3q+16 ], m4 - lea dstq, [dstq+strideq*4] - - palignr m7, m4, m3, 14 - palignr m0, m3, m2, 14 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m4, m3, 12 - palignr m0, m3, m2, 12 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m4, m3, 10 - palignr m0, m3, m2, 10 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - palignr m7, m4, m3, 8 - palignr m0, m3, m2, 8 - mova [dstq+stride3q ], m0 - mova [dstq+stride3q+16 ], m7 - lea dstq, [dstq+strideq*4] - palignr m7, m4, m3, 6 - palignr m0, m3, m2, 6 - mova [dstq ], m0 - mova [dstq+16 ], m7 - palignr m7, m4, m3, 4 - palignr m0, m3, m2, 4 - mova [dstq+strideq ], m0 - mova [dstq+strideq+16 ], m7 - palignr m7, m4, m3, 2 - palignr m0, m3, m2, 2 - mova [dstq+strideq*2 ], m0 - mova [dstq+strideq*2+16], m7 - mova [dstq+stride3q ], m2 - mova [dstq+stride3q+16 ], m3 - - RESTORE_GOT - RET - -INIT_MMX ssse3 -cglobal d207_predictor_4x4, 4, 5, 4, dst, stride, unused, left, goffset - GET_GOT goffsetq - movd m0, [leftq] ; abcd [byte] - pshufb m1, m0, [GLOBAL(sh_b1233)] ; bcdd [byte] - pshufb m3, m0, [GLOBAL(sh_b2333)] ; cddd - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m3, m2 - pavgb m1, m0 ; ab, bc, cd, d [byte] - - punpcklbw m1, m2 ; ab, a2bc, bc, b2cd, cd, c3d, d, d - movd [dstq ], m1 - psrlq m1, 16 ; bc, b2cd, cd, c3d, d, d - movd [dstq+strideq], m1 - lea dstq, [dstq+strideq*2] - psrlq m1, 16 ; cd, c3d, d, d - movd [dstq ], m1 - pshufw m1, m1, q1111 ; d, d, d, d - movd [dstq+strideq], m1 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d207_predictor_8x8, 4, 5, 4, dst, stride, stride3, left, goffset - GET_GOT goffsetq - movq m3, [leftq] ; abcdefgh [byte] - lea stride3q, [strideq*3] - - pshufb m1, m3, [GLOBAL(sh_b2345677777777777)] - pshufb m0, m3, [GLOBAL(sh_b0123456777777777)] - pshufb m2, m3, [GLOBAL(sh_b1234567777777777)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m2, m1, m3 - pavgb m0, m2 - punpcklbw m0, m3 ; interleaved output - - movq [dstq ], m0 - psrldq m0, 2 - movq [dstq+strideq ], m0 - psrldq m0, 2 - movq [dstq+strideq*2], m0 - psrldq m0, 2 - movq [dstq+stride3q ], m0 - lea dstq, [dstq+strideq*4] - pshufhw m0, m0, q0000 ; de, d2ef, ef, e2fg, fg, f2gh, gh, g3h, 8xh - psrldq m0, 2 - movq [dstq ], m0 - psrldq m0, 2 - movq [dstq+strideq ], m0 - psrldq m0, 2 - movq [dstq+strideq*2], m0 - psrldq m0, 2 - movq [dstq+stride3q ], m0 - RESTORE_GOT - RET - -INIT_XMM ssse3 -cglobal d207_predictor_16x16, 4, 5, 5, dst, stride, stride3, left, goffset - GET_GOT goffsetq - lea stride3q, [strideq*3] - mova m0, [leftq] ; abcdefghijklmnop [byte] - pshufb m1, m0, [GLOBAL(sh_b123456789abcdeff)] ; bcdefghijklmnopp - pshufb m2, m0, [GLOBAL(sh_b23456789abcdefff)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m0, m1, m2, m3 - pavgb m1, m0 ; ab, bc, cd .. no, op, pp [byte] - - punpckhbw m4, m1, m3 ; interleaved input - punpcklbw m1, m3 ; interleaved output - mova [dstq ], m1 - palignr m3, m4, m1, 2 - mova [dstq+strideq ], m3 - palignr m3, m4, m1, 4 - mova [dstq+strideq*2], m3 - palignr m3, m4, m1, 6 - mova [dstq+stride3q ], m3 - lea dstq, [dstq+strideq*4] - palignr m3, m4, m1, 8 - mova [dstq ], m3 - palignr m3, m4, m1, 10 - mova [dstq+strideq ], m3 - palignr m3, m4, m1, 12 - mova [dstq+strideq*2], m3 - palignr m3, m4, m1, 14 - mova [dstq+stride3q ], m3 - DEFINE_ARGS dst, stride, stride3, line - mov lined, 2 - mova m0, [GLOBAL(sh_b23456789abcdefff)] -.loop: - lea dstq, [dstq+strideq*4] - mova [dstq ], m4 - pshufb m4, m0 - mova [dstq+strideq ], m4 - pshufb m4, m0 - mova [dstq+strideq*2], m4 - pshufb m4, m0 - mova [dstq+stride3q ], m4 - pshufb m4, m0 - dec lined - jnz .loop - RESTORE_GOT - REP_RET - -INIT_XMM ssse3 -cglobal d207_predictor_32x32, 4, 5, 8, dst, stride, stride3, left, goffset - GET_GOT goffsetq - lea stride3q, [strideq*3] - mova m1, [leftq] ; 0-15 [byte] - mova m2, [leftq+16] ; 16-31 [byte] - pshufb m0, m2, [GLOBAL(sh_b23456789abcdefff)] - pshufb m4, m2, [GLOBAL(sh_b123456789abcdeff)] - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m2, m4, m0, m3 - palignr m6, m2, m1, 1 - palignr m5, m2, m1, 2 - pavgb m2, m4 ; high 16px even lines - - X_PLUS_2Y_PLUS_Z_PLUS_2_RSH_2 m1, m6, m5, m0 - pavgb m1, m6 ; low 16px even lines - - punpckhbw m6, m1, m0 ; interleaved output 2 - punpcklbw m1, m0 ; interleaved output 1 - - punpckhbw m7, m2, m3 ; interleaved output 4 - punpcklbw m2, m3 ; interleaved output 3 - - ; output 1st 8 lines (and half of 2nd 8 lines) - DEFINE_ARGS dst, stride, stride3, dst8 - lea dst8q, [dstq+strideq*8] - mova [dstq ], m1 - mova [dstq +16], m6 - mova [dst8q ], m6 - palignr m0, m6, m1, 2 - palignr m4, m2, m6, 2 - mova [dstq +strideq ], m0 - mova [dstq +strideq +16], m4 - mova [dst8q+strideq ], m4 - palignr m0, m6, m1, 4 - palignr m4, m2, m6, 4 - mova [dstq +strideq*2 ], m0 - mova [dstq +strideq*2+16], m4 - mova [dst8q+strideq*2 ], m4 - palignr m0, m6, m1, 6 - palignr m4, m2, m6, 6 - mova [dstq +stride3q ], m0 - mova [dstq +stride3q +16], m4 - mova [dst8q+stride3q ], m4 - lea dstq, [dstq +strideq*4] - lea dst8q, [dst8q+strideq*4] - palignr m0, m6, m1, 8 - palignr m4, m2, m6, 8 - mova [dstq ], m0 - mova [dstq +16], m4 - mova [dst8q ], m4 - palignr m0, m6, m1, 10 - palignr m4, m2, m6, 10 - mova [dstq +strideq ], m0 - mova [dstq +strideq +16], m4 - mova [dst8q+strideq ], m4 - palignr m0, m6, m1, 12 - palignr m4, m2, m6, 12 - mova [dstq +strideq*2 ], m0 - mova [dstq +strideq*2+16], m4 - mova [dst8q+strideq*2 ], m4 - palignr m0, m6, m1, 14 - palignr m4, m2, m6, 14 - mova [dstq +stride3q ], m0 - mova [dstq +stride3q +16], m4 - mova [dst8q+stride3q ], m4 - lea dstq, [dstq+strideq*4] - lea dst8q, [dst8q+strideq*4] - - ; output 2nd half of 2nd 8 lines and half of 3rd 8 lines - mova [dstq +16], m2 - mova [dst8q ], m2 - palignr m4, m7, m2, 2 - mova [dstq +strideq +16], m4 - mova [dst8q+strideq ], m4 - palignr m4, m7, m2, 4 - mova [dstq +strideq*2+16], m4 - mova [dst8q+strideq*2 ], m4 - palignr m4, m7, m2, 6 - mova [dstq +stride3q +16], m4 - mova [dst8q+stride3q ], m4 - lea dstq, [dstq+strideq*4] - lea dst8q, [dst8q+strideq*4] - palignr m4, m7, m2, 8 - mova [dstq +16], m4 - mova [dst8q ], m4 - palignr m4, m7, m2, 10 - mova [dstq +strideq +16], m4 - mova [dst8q+strideq ], m4 - palignr m4, m7, m2, 12 - mova [dstq +strideq*2+16], m4 - mova [dst8q+strideq*2 ], m4 - palignr m4, m7, m2, 14 - mova [dstq +stride3q +16], m4 - mova [dst8q+stride3q ], m4 - lea dstq, [dstq+strideq*4] - lea dst8q, [dst8q+strideq*4] - - ; output 2nd half of 3rd 8 lines and half of 4th 8 lines - mova m0, [GLOBAL(sh_b23456789abcdefff)] - mova [dstq +16], m7 - mova [dst8q ], m7 - pshufb m7, m0 - mova [dstq +strideq +16], m7 - mova [dst8q+strideq ], m7 - pshufb m7, m0 - mova [dstq +strideq*2+16], m7 - mova [dst8q+strideq*2 ], m7 - pshufb m7, m0 - mova [dstq +stride3q +16], m7 - mova [dst8q+stride3q ], m7 - pshufb m7, m0 - lea dstq, [dstq+strideq*4] - lea dst8q, [dst8q+strideq*4] - mova [dstq +16], m7 - mova [dst8q ], m7 - pshufb m7, m0 - mova [dstq +strideq +16], m7 - mova [dst8q+strideq ], m7 - pshufb m7, m0 - mova [dstq +strideq*2+16], m7 - mova [dst8q+strideq*2 ], m7 - pshufb m7, m0 - mova [dstq +stride3q +16], m7 - mova [dst8q+stride3q ], m7 - pshufb m7, m0 - lea dstq, [dstq+strideq*4] - - ; output last half of 4th 8 lines - mova [dstq +16], m7 - mova [dstq +strideq +16], m7 - mova [dstq +strideq*2+16], m7 - mova [dstq +stride3q +16], m7 - lea dstq, [dstq+strideq*4] - mova [dstq +16], m7 - mova [dstq +strideq +16], m7 - mova [dstq +strideq*2+16], m7 - mova [dstq +stride3q +16], m7 - - ; done! - RESTORE_GOT - RET diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 50c11b9a0..78ea63fa1 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -84,16 +84,11 @@ endif ifeq ($(CONFIG_USE_X86INC),yes) VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm -VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm endif ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_8t_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_subpixel_bilinear_sse2.asm -ifeq ($(CONFIG_USE_X86INC),yes) -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_intrapred_sse2.asm -endif endif # common (c) @@ -108,9 +103,6 @@ VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_avg_horiz_dspr VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_dspr2.c VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_horiz_dspr2.c VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve8_vert_dspr2.c -VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_intrapred4_dspr2.c -VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_intrapred8_dspr2.c -VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_intrapred16_dspr2.c ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes) VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c @@ -135,7 +127,6 @@ VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct32x32_msa.c VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct_msa.h -VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_intra_predict_msa.c ifeq ($(CONFIG_VP9_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c @@ -174,7 +165,6 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_1_add_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct4x4_add_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_1_add_neon_asm$(ASM) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon_asm$(ASM) -VP9_COMMON_SRCS-yes += common/arm/neon/vp9_reconintra_neon_asm$(ASM) else ifeq ($(HAVE_NEON), yes) VP9_COMMON_SRCS-yes += common/arm/neon/vp9_convolve8_avg_neon.c @@ -194,6 +184,4 @@ VP9_COMMON_SRCS-yes += common/arm/neon/vp9_idct8x8_add_neon.c endif # HAVE_NEON endif # HAVE_NEON_ASM -VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_reconintra_neon.c - $(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl)) |