summaryrefslogtreecommitdiff
path: root/vp9/common
diff options
context:
space:
mode:
Diffstat (limited to 'vp9/common')
-rw-r--r--vp9/common/arm/neon/vp9_reconintra_neon.c766
-rw-r--r--vp9/common/vp9_mfqe.c6
-rw-r--r--vp9/common/vp9_mvref_common.c2
-rw-r--r--vp9/common/vp9_rtcd_defs.pl231
-rw-r--r--vp9/common/x86/convolve.h296
-rw-r--r--vp9/common/x86/vp9_asm_stubs.c416
-rw-r--r--vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c122
-rw-r--r--vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c201
8 files changed, 896 insertions, 1144 deletions
diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.c b/vp9/common/arm/neon/vp9_reconintra_neon.c
index d0beaa720..66cf6600e 100644
--- a/vp9/common/arm/neon/vp9_reconintra_neon.c
+++ b/vp9/common/arm/neon/vp9_reconintra_neon.c
@@ -11,463 +11,415 @@
#include <stddef.h>
#include <arm_neon.h>
-void vp9_v_predictor_4x4_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- int i;
- uint32x2_t d0u32 = vdup_n_u32(0);
- (void)left;
+void vp9_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint32x2_t d0u32 = vdup_n_u32(0);
+ (void)left;
- d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
- for (i = 0; i < 4; i++, dst += y_stride)
- vst1_lane_u32((uint32_t *)dst, d0u32, 0);
- return;
+ d0u32 = vld1_lane_u32((const uint32_t *)above, d0u32, 0);
+ for (i = 0; i < 4; i++, dst += y_stride)
+ vst1_lane_u32((uint32_t *)dst, d0u32, 0);
}
-void vp9_v_predictor_8x8_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- int i;
- uint8x8_t d0u8 = vdup_n_u8(0);
- (void)left;
+void vp9_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ (void)left;
- d0u8 = vld1_u8(above);
- for (i = 0; i < 8; i++, dst += y_stride)
- vst1_u8(dst, d0u8);
- return;
+ d0u8 = vld1_u8(above);
+ for (i = 0; i < 8; i++, dst += y_stride)
+ vst1_u8(dst, d0u8);
}
-void vp9_v_predictor_16x16_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- int i;
- uint8x16_t q0u8 = vdupq_n_u8(0);
- (void)left;
+void vp9_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ (void)left;
- q0u8 = vld1q_u8(above);
- for (i = 0; i < 16; i++, dst += y_stride)
- vst1q_u8(dst, q0u8);
- return;
+ q0u8 = vld1q_u8(above);
+ for (i = 0; i < 16; i++, dst += y_stride)
+ vst1q_u8(dst, q0u8);
}
-void vp9_v_predictor_32x32_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- int i;
- uint8x16_t q0u8 = vdupq_n_u8(0);
- uint8x16_t q1u8 = vdupq_n_u8(0);
- (void)left;
+void vp9_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)left;
- q0u8 = vld1q_u8(above);
- q1u8 = vld1q_u8(above + 16);
- for (i = 0; i < 32; i++, dst += y_stride) {
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q1u8);
- }
- return;
+ q0u8 = vld1q_u8(above);
+ q1u8 = vld1q_u8(above + 16);
+ for (i = 0; i < 32; i++, dst += y_stride) {
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q1u8);
+ }
}
-void vp9_h_predictor_4x4_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- uint8x8_t d0u8 = vdup_n_u8(0);
- uint32x2_t d1u32 = vdup_n_u32(0);
- (void)above;
+void vp9_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint32x2_t d1u32 = vdup_n_u32(0);
+ (void)above;
- d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
+ d1u32 = vld1_lane_u32((const uint32_t *)left, d1u32, 0);
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
- dst += y_stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
- dst += y_stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
- dst += y_stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
- return;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 0);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 1);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 2);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u32(d1u32), 3);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
}
-void vp9_h_predictor_8x8_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- uint8x8_t d0u8 = vdup_n_u8(0);
- uint64x1_t d1u64 = vdup_n_u64(0);
- (void)above;
+void vp9_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint64x1_t d1u64 = vdup_n_u64(0);
+ (void)above;
- d1u64 = vld1_u64((const uint64_t *)left);
+ d1u64 = vld1_u64((const uint64_t *)left);
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
- vst1_u8(dst, d0u8);
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 0);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
+ vst1_u8(dst, d0u8);
+ dst += y_stride;
+ d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
+ vst1_u8(dst, d0u8);
+}
+
+void vp9_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j;
+ uint8x8_t d2u8 = vdup_n_u8(0);
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)above;
+
+ q1u8 = vld1q_u8(left);
+ d2u8 = vget_low_u8(q1u8);
+ for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
+ q0u8 = vdupq_lane_u8(d2u8, 0);
+ vst1q_u8(dst, q0u8);
dst += y_stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 1);
- vst1_u8(dst, d0u8);
+ q0u8 = vdupq_lane_u8(d2u8, 1);
+ vst1q_u8(dst, q0u8);
dst += y_stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 2);
- vst1_u8(dst, d0u8);
+ q0u8 = vdupq_lane_u8(d2u8, 2);
+ vst1q_u8(dst, q0u8);
dst += y_stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 3);
- vst1_u8(dst, d0u8);
+ q0u8 = vdupq_lane_u8(d2u8, 3);
+ vst1q_u8(dst, q0u8);
dst += y_stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 4);
- vst1_u8(dst, d0u8);
+ q0u8 = vdupq_lane_u8(d2u8, 4);
+ vst1q_u8(dst, q0u8);
dst += y_stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 5);
- vst1_u8(dst, d0u8);
+ q0u8 = vdupq_lane_u8(d2u8, 5);
+ vst1q_u8(dst, q0u8);
dst += y_stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 6);
- vst1_u8(dst, d0u8);
+ q0u8 = vdupq_lane_u8(d2u8, 6);
+ vst1q_u8(dst, q0u8);
dst += y_stride;
- d0u8 = vdup_lane_u8(vreinterpret_u8_u64(d1u64), 7);
- vst1_u8(dst, d0u8);
- return;
+ q0u8 = vdupq_lane_u8(d2u8, 7);
+ vst1q_u8(dst, q0u8);
+ dst += y_stride;
+ }
}
-void vp9_h_predictor_16x16_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- int j;
- uint8x8_t d2u8 = vdup_n_u8(0);
- uint8x16_t q0u8 = vdupq_n_u8(0);
- uint8x16_t q1u8 = vdupq_n_u8(0);
- (void)above;
+void vp9_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j, k;
+ uint8x8_t d2u8 = vdup_n_u8(0);
+ uint8x16_t q0u8 = vdupq_n_u8(0);
+ uint8x16_t q1u8 = vdupq_n_u8(0);
+ (void)above;
+ for (k = 0; k < 2; k++, left += 16) {
q1u8 = vld1q_u8(left);
d2u8 = vget_low_u8(q1u8);
for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
- q0u8 = vdupq_lane_u8(d2u8, 0);
- vst1q_u8(dst, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 1);
- vst1q_u8(dst, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 2);
- vst1q_u8(dst, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 3);
- vst1q_u8(dst, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 4);
- vst1q_u8(dst, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 5);
- vst1q_u8(dst, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 6);
- vst1q_u8(dst, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 7);
- vst1q_u8(dst, q0u8);
- dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 0);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 1);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 2);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 3);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 4);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 5);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 6);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
+ q0u8 = vdupq_lane_u8(d2u8, 7);
+ vst1q_u8(dst, q0u8);
+ vst1q_u8(dst + 16, q0u8);
+ dst += y_stride;
}
- return;
+ }
}
-void vp9_h_predictor_32x32_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- int j, k;
- uint8x8_t d2u8 = vdup_n_u8(0);
- uint8x16_t q0u8 = vdupq_n_u8(0);
- uint8x16_t q1u8 = vdupq_n_u8(0);
- (void)above;
+void vp9_tm_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ int i;
+ uint16x8_t q1u16, q3u16;
+ int16x8_t q1s16;
+ uint8x8_t d0u8 = vdup_n_u8(0);
+ uint32x2_t d2u32 = vdup_n_u32(0);
- for (k = 0; k < 2; k++, left += 16) {
- q1u8 = vld1q_u8(left);
- d2u8 = vget_low_u8(q1u8);
- for (j = 0; j < 2; j++, d2u8 = vget_high_u8(q1u8)) {
- q0u8 = vdupq_lane_u8(d2u8, 0);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 1);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 2);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 3);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 4);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 5);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 6);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += y_stride;
- q0u8 = vdupq_lane_u8(d2u8, 7);
- vst1q_u8(dst, q0u8);
- vst1q_u8(dst + 16, q0u8);
- dst += y_stride;
- }
- }
- return;
+ d0u8 = vdup_n_u8(above[-1]);
+ d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
+ q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
+ for (i = 0; i < 4; i++, dst += y_stride) {
+ q1u16 = vdupq_n_u16((uint16_t)left[i]);
+ q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
+ vreinterpretq_s16_u16(q3u16));
+ d0u8 = vqmovun_s16(q1s16);
+ vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
+ }
}
-void vp9_tm_predictor_4x4_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- int i;
- uint16x8_t q1u16, q3u16;
- int16x8_t q1s16;
- uint8x8_t d0u8 = vdup_n_u8(0);
- uint32x2_t d2u32 = vdup_n_u32(0);
+void vp9_tm_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j;
+ uint16x8_t q0u16, q3u16, q10u16;
+ int16x8_t q0s16;
+ uint16x4_t d20u16;
+ uint8x8_t d0u8, d2u8, d30u8;
- d0u8 = vdup_n_u8(above[-1]);
- d2u32 = vld1_lane_u32((const uint32_t *)above, d2u32, 0);
- q3u16 = vsubl_u8(vreinterpret_u8_u32(d2u32), d0u8);
- for (i = 0; i < 4; i++, dst += y_stride) {
- q1u16 = vdupq_n_u16((uint16_t)left[i]);
- q1s16 = vaddq_s16(vreinterpretq_s16_u16(q1u16),
- vreinterpretq_s16_u16(q3u16));
- d0u8 = vqmovun_s16(q1s16);
- vst1_lane_u32((uint32_t *)dst, vreinterpret_u32_u8(d0u8), 0);
- }
- return;
+ d0u8 = vdup_n_u8(above[-1]);
+ d30u8 = vld1_u8(left);
+ d2u8 = vld1_u8(above);
+ q10u16 = vmovl_u8(d30u8);
+ q3u16 = vsubl_u8(d2u8, d0u8);
+ d20u16 = vget_low_u16(q10u16);
+ for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
+ q0u16 = vdupq_lane_u16(d20u16, 0);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += y_stride;
+ q0u16 = vdupq_lane_u16(d20u16, 1);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += y_stride;
+ q0u16 = vdupq_lane_u16(d20u16, 2);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += y_stride;
+ q0u16 = vdupq_lane_u16(d20u16, 3);
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
+ vreinterpretq_s16_u16(q0u16));
+ d0u8 = vqmovun_s16(q0s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
+ dst += y_stride;
+ }
}
-void vp9_tm_predictor_8x8_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- int j;
- uint16x8_t q0u16, q3u16, q10u16;
- int16x8_t q0s16;
- uint16x4_t d20u16;
- uint8x8_t d0u8, d2u8, d30u8;
+void vp9_tm_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j, k;
+ uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
+ uint8x16_t q0u8, q1u8;
+ int16x8_t q0s16, q1s16, q8s16, q11s16;
+ uint16x4_t d20u16;
+ uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
- d0u8 = vdup_n_u8(above[-1]);
- d30u8 = vld1_u8(left);
- d2u8 = vld1_u8(above);
- q10u16 = vmovl_u8(d30u8);
- q3u16 = vsubl_u8(d2u8, d0u8);
+ q0u8 = vdupq_n_u8(above[-1]);
+ q1u8 = vld1q_u8(above);
+ q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+ q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+ for (k = 0; k < 2; k++, left += 8) {
+ d18u8 = vld1_u8(left);
+ q10u16 = vmovl_u8(d18u8);
d20u16 = vget_low_u16(q10u16);
for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
- q0u16 = vdupq_lane_u16(d20u16, 0);
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
- vreinterpretq_s16_u16(q0u16));
- d0u8 = vqmovun_s16(q0s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
- dst += y_stride;
- q0u16 = vdupq_lane_u16(d20u16, 1);
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
- vreinterpretq_s16_u16(q0u16));
- d0u8 = vqmovun_s16(q0s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
- dst += y_stride;
- q0u16 = vdupq_lane_u16(d20u16, 2);
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
- vreinterpretq_s16_u16(q0u16));
- d0u8 = vqmovun_s16(q0s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
- dst += y_stride;
- q0u16 = vdupq_lane_u16(d20u16, 3);
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q3u16),
- vreinterpretq_s16_u16(q0u16));
- d0u8 = vqmovun_s16(q0s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d0u8));
- dst += y_stride;
- }
- return;
-}
-
-void vp9_tm_predictor_16x16_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- int j, k;
- uint16x8_t q0u16, q2u16, q3u16, q8u16, q10u16;
- uint8x16_t q0u8, q1u8;
- int16x8_t q0s16, q1s16, q8s16, q11s16;
- uint16x4_t d20u16;
- uint8x8_t d2u8, d3u8, d18u8, d22u8, d23u8;
-
- q0u8 = vdupq_n_u8(above[-1]);
- q1u8 = vld1q_u8(above);
- q2u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
- q3u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
- for (k = 0; k < 2; k++, left += 8) {
- d18u8 = vld1_u8(left);
- q10u16 = vmovl_u8(d18u8);
- d20u16 = vget_low_u16(q10u16);
- for (j = 0; j < 2; j++, d20u16 = vget_high_u16(q10u16)) {
- q0u16 = vdupq_lane_u16(d20u16, 0);
- q8u16 = vdupq_lane_u16(d20u16, 1);
- q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q2u16));
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q3u16));
- q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
- vreinterpretq_s16_u16(q2u16));
- q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
- vreinterpretq_s16_u16(q3u16));
- d2u8 = vqmovun_s16(q1s16);
- d3u8 = vqmovun_s16(q0s16);
- d22u8 = vqmovun_s16(q11s16);
- d23u8 = vqmovun_s16(q8s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
- vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
- dst += y_stride;
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
- vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
- dst += y_stride;
+ q0u16 = vdupq_lane_u16(d20u16, 0);
+ q8u16 = vdupq_lane_u16(d20u16, 1);
+ q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q2u16));
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q3u16));
+ q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q2u16));
+ q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q3u16));
+ d2u8 = vqmovun_s16(q1s16);
+ d3u8 = vqmovun_s16(q0s16);
+ d22u8 = vqmovun_s16(q11s16);
+ d23u8 = vqmovun_s16(q8s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+ dst += y_stride;
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+ dst += y_stride;
- q0u16 = vdupq_lane_u16(d20u16, 2);
- q8u16 = vdupq_lane_u16(d20u16, 3);
- q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q2u16));
- q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q3u16));
- q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
- vreinterpretq_s16_u16(q2u16));
- q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
- vreinterpretq_s16_u16(q3u16));
- d2u8 = vqmovun_s16(q1s16);
- d3u8 = vqmovun_s16(q0s16);
- d22u8 = vqmovun_s16(q11s16);
- d23u8 = vqmovun_s16(q8s16);
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
- vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
- dst += y_stride;
- vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
- vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
- dst += y_stride;
- }
+ q0u16 = vdupq_lane_u16(d20u16, 2);
+ q8u16 = vdupq_lane_u16(d20u16, 3);
+ q1s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q2u16));
+ q0s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q3u16));
+ q11s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q2u16));
+ q8s16 = vaddq_s16(vreinterpretq_s16_u16(q8u16),
+ vreinterpretq_s16_u16(q3u16));
+ d2u8 = vqmovun_s16(q1s16);
+ d3u8 = vqmovun_s16(q0s16);
+ d22u8 = vqmovun_s16(q11s16);
+ d23u8 = vqmovun_s16(q8s16);
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d2u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d3u8));
+ dst += y_stride;
+ vst1_u64((uint64_t *)dst, vreinterpret_u64_u8(d22u8));
+ vst1_u64((uint64_t *)(dst + 8), vreinterpret_u64_u8(d23u8));
+ dst += y_stride;
}
- return;
+ }
}
-void vp9_tm_predictor_32x32_neon(
- uint8_t *dst,
- ptrdiff_t y_stride,
- const uint8_t *above,
- const uint8_t *left) {
- int j, k;
- uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
- uint8x16_t q0u8, q1u8, q2u8;
- int16x8_t q12s16, q13s16, q14s16, q15s16;
- uint16x4_t d6u16;
- uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
+void vp9_tm_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
+ const uint8_t *above, const uint8_t *left) {
+ int j, k;
+ uint16x8_t q0u16, q3u16, q8u16, q9u16, q10u16, q11u16;
+ uint8x16_t q0u8, q1u8, q2u8;
+ int16x8_t q12s16, q13s16, q14s16, q15s16;
+ uint16x4_t d6u16;
+ uint8x8_t d0u8, d1u8, d2u8, d3u8, d26u8;
- q0u8 = vdupq_n_u8(above[-1]);
- q1u8 = vld1q_u8(above);
- q2u8 = vld1q_u8(above + 16);
- q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
- q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
- q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
- q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
- for (k = 0; k < 4; k++, left += 8) {
- d26u8 = vld1_u8(left);
- q3u16 = vmovl_u8(d26u8);
- d6u16 = vget_low_u16(q3u16);
- for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
- q0u16 = vdupq_lane_u16(d6u16, 0);
- q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q8u16));
- q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q9u16));
- q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q10u16));
- q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q11u16));
- d0u8 = vqmovun_s16(q12s16);
- d1u8 = vqmovun_s16(q13s16);
- d2u8 = vqmovun_s16(q14s16);
- d3u8 = vqmovun_s16(q15s16);
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
- vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
- dst += y_stride;
+ q0u8 = vdupq_n_u8(above[-1]);
+ q1u8 = vld1q_u8(above);
+ q2u8 = vld1q_u8(above + 16);
+ q8u16 = vsubl_u8(vget_low_u8(q1u8), vget_low_u8(q0u8));
+ q9u16 = vsubl_u8(vget_high_u8(q1u8), vget_high_u8(q0u8));
+ q10u16 = vsubl_u8(vget_low_u8(q2u8), vget_low_u8(q0u8));
+ q11u16 = vsubl_u8(vget_high_u8(q2u8), vget_high_u8(q0u8));
+ for (k = 0; k < 4; k++, left += 8) {
+ d26u8 = vld1_u8(left);
+ q3u16 = vmovl_u8(d26u8);
+ d6u16 = vget_low_u16(q3u16);
+ for (j = 0; j < 2; j++, d6u16 = vget_high_u16(q3u16)) {
+ q0u16 = vdupq_lane_u16(d6u16, 0);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += y_stride;
- q0u16 = vdupq_lane_u16(d6u16, 1);
- q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q8u16));
- q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q9u16));
- q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q10u16));
- q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q11u16));
- d0u8 = vqmovun_s16(q12s16);
- d1u8 = vqmovun_s16(q13s16);
- d2u8 = vqmovun_s16(q14s16);
- d3u8 = vqmovun_s16(q15s16);
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
- vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
- dst += y_stride;
+ q0u16 = vdupq_lane_u16(d6u16, 1);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += y_stride;
- q0u16 = vdupq_lane_u16(d6u16, 2);
- q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q8u16));
- q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q9u16));
- q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q10u16));
- q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q11u16));
- d0u8 = vqmovun_s16(q12s16);
- d1u8 = vqmovun_s16(q13s16);
- d2u8 = vqmovun_s16(q14s16);
- d3u8 = vqmovun_s16(q15s16);
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
- vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
- dst += y_stride;
+ q0u16 = vdupq_lane_u16(d6u16, 2);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += y_stride;
- q0u16 = vdupq_lane_u16(d6u16, 3);
- q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q8u16));
- q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q9u16));
- q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q10u16));
- q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
- vreinterpretq_s16_u16(q11u16));
- d0u8 = vqmovun_s16(q12s16);
- d1u8 = vqmovun_s16(q13s16);
- d2u8 = vqmovun_s16(q14s16);
- d3u8 = vqmovun_s16(q15s16);
- q0u8 = vcombine_u8(d0u8, d1u8);
- q1u8 = vcombine_u8(d2u8, d3u8);
- vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
- vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
- dst += y_stride;
- }
+ q0u16 = vdupq_lane_u16(d6u16, 3);
+ q12s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q8u16));
+ q13s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q9u16));
+ q14s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q10u16));
+ q15s16 = vaddq_s16(vreinterpretq_s16_u16(q0u16),
+ vreinterpretq_s16_u16(q11u16));
+ d0u8 = vqmovun_s16(q12s16);
+ d1u8 = vqmovun_s16(q13s16);
+ d2u8 = vqmovun_s16(q14s16);
+ d3u8 = vqmovun_s16(q15s16);
+ q0u8 = vcombine_u8(d0u8, d1u8);
+ q1u8 = vcombine_u8(d2u8, d3u8);
+ vst1q_u64((uint64_t *)dst, vreinterpretq_u64_u8(q0u8));
+ vst1q_u64((uint64_t *)(dst + 16), vreinterpretq_u64_u8(q1u8));
+ dst += y_stride;
}
- return;
+ }
}
diff --git a/vp9/common/vp9_mfqe.c b/vp9/common/vp9_mfqe.c
index 57189df16..bebb37eda 100644
--- a/vp9/common/vp9_mfqe.c
+++ b/vp9/common/vp9_mfqe.c
@@ -171,13 +171,13 @@ static void mfqe_block(BLOCK_SIZE bs, const uint8_t *y, const uint8_t *u,
get_thr(bs, qdiff, &sad_thr, &vdiff_thr);
if (bs == BLOCK_16X16) {
- vdiff = (vp9_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
+ vdiff = (vpx_variance16x16(y, y_stride, yd, yd_stride, &sse) + 128) >> 8;
sad = (vpx_sad16x16(y, y_stride, yd, yd_stride) + 128) >> 8;
} else if (bs == BLOCK_32X32) {
- vdiff = (vp9_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
+ vdiff = (vpx_variance32x32(y, y_stride, yd, yd_stride, &sse) + 512) >> 10;
sad = (vpx_sad32x32(y, y_stride, yd, yd_stride) + 512) >> 10;
} else /* if (bs == BLOCK_64X64) */ {
- vdiff = (vp9_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
+ vdiff = (vpx_variance64x64(y, y_stride, yd, yd_stride, &sse) + 2048) >> 12;
sad = (vpx_sad64x64(y, y_stride, yd, yd_stride) + 2048) >> 12;
}
diff --git a/vp9/common/vp9_mvref_common.c b/vp9/common/vp9_mvref_common.c
index 51e147e00..ce6952752 100644
--- a/vp9/common/vp9_mvref_common.c
+++ b/vp9/common/vp9_mvref_common.c
@@ -223,6 +223,6 @@ void vp9_append_sub8x8_mvs_for_idx(VP9_COMMON *cm, MACROBLOCKD *xd,
break;
}
default:
- assert("Invalid block index.");
+ assert(0 && "Invalid block index.");
}
}
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 2a9736b40..30710ba00 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -797,51 +797,6 @@ if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
# variance
-add_proto qw/unsigned int vp9_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x16 avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x32/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x32 avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x64 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance32x32 avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance64x64 avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x16 avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x8 neon/, "$sse2_x86inc";
-
-add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get8x8var neon/, "$sse2_x86inc";
-
-add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance8x4/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance4x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance4x4/, "$sse2_x86inc";
-
add_proto qw/unsigned int vp9_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_sub_pixel_variance64x64 avx2 neon/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -922,21 +877,6 @@ specialize qw/vp9_sub_pixel_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance4x4/, "$sse_x86inc", "$ssse3_x86inc";
-add_proto qw/unsigned int vp9_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x16 avx2/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse8x16/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse16x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
-specialize qw/vp9_mse8x8/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vp9_get_mb_ss/, "const int16_t *";
-specialize qw/vp9_get_mb_ss/, "$sse2_x86inc";
-
add_proto qw/unsigned int vp9_avg_8x8/, "const uint8_t *, int p";
specialize qw/vp9_avg_8x8 sse2 neon/;
@@ -1141,142 +1081,6 @@ specialize qw/vp9_temporal_filter_apply sse2/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
- # variance
- add_proto qw/unsigned int vp9_highbd_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance32x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance16x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance64x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance32x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance32x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance64x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance16x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance8x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance8x4/;
-
- add_proto qw/unsigned int vp9_highbd_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance4x8/;
-
- add_proto qw/unsigned int vp9_highbd_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_variance4x4/;
-
- add_proto qw/void vp9_highbd_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_get8x8var/, "$sse2_x86inc";
-
- add_proto qw/void vp9_highbd_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_get16x16var/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance32x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance16x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance64x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance32x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance32x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance64x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance16x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance8x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance8x4/;
-
- add_proto qw/unsigned int vp9_highbd_10_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance4x8/;
-
- add_proto qw/unsigned int vp9_highbd_10_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_variance4x4/;
-
- add_proto qw/void vp9_highbd_10_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_10_get8x8var/, "$sse2_x86inc";
-
- add_proto qw/void vp9_highbd_10_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_10_get16x16var/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance32x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance32x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance16x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance16x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance64x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance64x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance32x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance32x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance32x32/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance32x32/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance64x64/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance16x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance8x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance8x4/;
-
- add_proto qw/unsigned int vp9_highbd_12_variance4x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance4x8/;
-
- add_proto qw/unsigned int vp9_highbd_12_variance4x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_variance4x4/;
-
- add_proto qw/void vp9_highbd_12_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_12_get8x8var/, "$sse2_x86inc";
-
- add_proto qw/void vp9_highbd_12_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
- specialize qw/vp9_highbd_12_get16x16var/, "$sse2_x86inc";
-
add_proto qw/unsigned int vp9_highbd_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_highbd_sub_pixel_variance64x64/, "$sse2_x86inc";
@@ -1511,41 +1315,6 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
add_proto qw/unsigned int vp9_highbd_12_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_highbd_12_sub_pixel_avg_variance4x4/;
- add_proto qw/unsigned int vp9_highbd_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_mse16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_mse8x16/;
-
- add_proto qw/unsigned int vp9_highbd_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_mse16x8/;
-
- add_proto qw/unsigned int vp9_highbd_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_mse8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_mse16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_10_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_mse8x16/;
-
- add_proto qw/unsigned int vp9_highbd_10_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_mse16x8/;
-
- add_proto qw/unsigned int vp9_highbd_10_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_10_mse8x8/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_mse16x16/, "$sse2_x86inc";
-
- add_proto qw/unsigned int vp9_highbd_12_mse8x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_mse8x16/;
-
- add_proto qw/unsigned int vp9_highbd_12_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_mse16x8/;
-
- add_proto qw/unsigned int vp9_highbd_12_mse8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse";
- specialize qw/vp9_highbd_12_mse8x8/, "$sse2_x86inc";
# ENCODEMB INVOKE
diff --git a/vp9/common/x86/convolve.h b/vp9/common/x86/convolve.h
new file mode 100644
index 000000000..de2df47e5
--- /dev/null
+++ b/vp9/common/x86/convolve.h
@@ -0,0 +1,296 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VP9_COMMON_X86_CONVOLVE_H_
+#define VP9_COMMON_X86_CONVOLVE_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+
+typedef void filter8_1dfunction (
+ const uint8_t *src_ptr,
+ ptrdiff_t src_pitch,
+ uint8_t *output_ptr,
+ ptrdiff_t out_pitch,
+ uint32_t output_height,
+ const int16_t *filter
+);
+
+#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+ void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
+ uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, \
+ int w, int h) { \
+ if (step_q4 == 16 && filter[3] != 128) { \
+ if (filter[0] || filter[1] || filter[2]) { \
+ while (w >= 16) { \
+ vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else { \
+ while (w >= 16) { \
+ vp9_filter_block1d16_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vp9_filter_block1d8_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vp9_filter_block1d4_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } \
+ } \
+ if (w) { \
+ vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, y_step_q4, \
+ w, h); \
+ } \
+}
+
+#define FUN_CONV_2D(avg, opt) \
+void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+ uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, \
+ int w, int h) { \
+ assert(w <= 64); \
+ assert(h <= 64); \
+ if (x_step_q4 == 16 && y_step_q4 == 16) { \
+ if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
+ filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
+ vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
+ filter_x, x_step_q4, filter_y, y_step_q4, \
+ w, h + 7); \
+ vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, \
+ y_step_q4, w, h); \
+ } else { \
+ DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
+ vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
+ filter_x, x_step_q4, filter_y, y_step_q4, \
+ w, h + 1); \
+ vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, \
+ y_step_q4, w, h); \
+ } \
+ } else { \
+ vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
+ } \
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+typedef void highbd_filter8_1dfunction (
+ const uint16_t *src_ptr,
+ const ptrdiff_t src_pitch,
+ uint16_t *output_ptr,
+ ptrdiff_t out_pitch,
+ unsigned int output_height,
+ const int16_t *filter,
+ int bd
+);
+
+#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
+ void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \
+ ptrdiff_t src_stride, \
+ uint8_t *dst8, \
+ ptrdiff_t dst_stride, \
+ const int16_t *filter_x, \
+ int x_step_q4, \
+ const int16_t *filter_y, \
+ int y_step_q4, \
+ int w, int h, int bd) { \
+ if (step_q4 == 16 && filter[3] != 128) { \
+ uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
+ uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
+ if (filter[0] || filter[1] || filter[2]) { \
+ while (w >= 16) { \
+ vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter, \
+ bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter, \
+ bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter, \
+ bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } else { \
+ while (w >= 16) { \
+ vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter, \
+ bd); \
+ src += 16; \
+ dst += 16; \
+ w -= 16; \
+ } \
+ while (w >= 8) { \
+ vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter, \
+ bd); \
+ src += 8; \
+ dst += 8; \
+ w -= 8; \
+ } \
+ while (w >= 4) { \
+ vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \
+ src_stride, \
+ dst, \
+ dst_stride, \
+ h, \
+ filter, \
+ bd); \
+ src += 4; \
+ dst += 4; \
+ w -= 4; \
+ } \
+ } \
+ } \
+ if (w) { \
+ vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
+ filter_x, x_step_q4, filter_y, y_step_q4, \
+ w, h, bd); \
+ } \
+}
+
+#define HIGH_FUN_CONV_2D(avg, opt) \
+void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
+ uint8_t *dst, ptrdiff_t dst_stride, \
+ const int16_t *filter_x, int x_step_q4, \
+ const int16_t *filter_y, int y_step_q4, \
+ int w, int h, int bd) { \
+ assert(w <= 64); \
+ assert(h <= 64); \
+ if (x_step_q4 == 16 && y_step_q4 == 16) { \
+ if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
+ filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
+ vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+ CONVERT_TO_BYTEPTR(fdata2), 64, \
+ filter_x, x_step_q4, \
+ filter_y, y_step_q4, \
+ w, h + 7, bd); \
+ vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
+ 64, dst, dst_stride, \
+ filter_x, x_step_q4, \
+ filter_y, y_step_q4, \
+ w, h, bd); \
+ } else { \
+ DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
+ vp9_highbd_convolve8_horiz_##opt(src, src_stride, \
+ CONVERT_TO_BYTEPTR(fdata2), 64, \
+ filter_x, x_step_q4, \
+ filter_y, y_step_q4, \
+ w, h + 1, bd); \
+ vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
+ dst, dst_stride, \
+ filter_x, x_step_q4, \
+ filter_y, y_step_q4, \
+ w, h, bd); \
+ } \
+ } else { \
+ vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
+ filter_x, x_step_q4, filter_y, y_step_q4, w, \
+ h, bd); \
+ } \
+}
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+#endif // VP9_COMMON_X86_CONVOLVE_H_
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index 963023c53..fd55fb8c6 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -8,421 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include <assert.h>
-
-#include "./vpx_config.h"
#include "./vp9_rtcd.h"
-#include "vpx_ports/mem.h"
-
-typedef void filter8_1dfunction (
- const unsigned char *src_ptr,
- const ptrdiff_t src_pitch,
- unsigned char *output_ptr,
- ptrdiff_t out_pitch,
- unsigned int output_height,
- const short *filter
-);
-
-#define FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
- void vp9_convolve8_##name##_##opt(const uint8_t *src, ptrdiff_t src_stride, \
- uint8_t *dst, ptrdiff_t dst_stride, \
- const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, \
- int w, int h) { \
- if (step_q4 == 16 && filter[3] != 128) { \
- if (filter[0] || filter[1] || filter[2]) { \
- while (w >= 16) { \
- vp9_filter_block1d16_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vp9_filter_block1d8_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vp9_filter_block1d4_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } else { \
- while (w >= 16) { \
- vp9_filter_block1d16_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vp9_filter_block1d8_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vp9_filter_block1d4_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } \
- } \
- if (w) { \
- vp9_convolve8_##name##_c(src, src_stride, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, y_step_q4, \
- w, h); \
- } \
-}
-
-#define FUN_CONV_2D(avg, opt) \
-void vp9_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
- uint8_t *dst, ptrdiff_t dst_stride, \
- const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, \
- int w, int h) { \
- assert(w <= 64); \
- assert(h <= 64); \
- if (x_step_q4 == 16 && y_step_q4 == 16) { \
- if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
- filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
- DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 71]); \
- vp9_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
- filter_x, x_step_q4, filter_y, y_step_q4, \
- w, h + 7); \
- vp9_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, \
- y_step_q4, w, h); \
- } else { \
- DECLARE_ALIGNED(16, unsigned char, fdata2[64 * 65]); \
- vp9_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
- filter_x, x_step_q4, filter_y, y_step_q4, \
- w, h + 1); \
- vp9_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, \
- y_step_q4, w, h); \
- } \
- } else { \
- vp9_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, y_step_q4, w, h); \
- } \
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-
-typedef void highbd_filter8_1dfunction (
- const uint16_t *src_ptr,
- const ptrdiff_t src_pitch,
- uint16_t *output_ptr,
- ptrdiff_t out_pitch,
- unsigned int output_height,
- const int16_t *filter,
- int bd
-);
-
-#define HIGH_FUN_CONV_1D(name, step_q4, filter, dir, src_start, avg, opt) \
- void vp9_highbd_convolve8_##name##_##opt(const uint8_t *src8, \
- ptrdiff_t src_stride, \
- uint8_t *dst8, \
- ptrdiff_t dst_stride, \
- const int16_t *filter_x, \
- int x_step_q4, \
- const int16_t *filter_y, \
- int y_step_q4, \
- int w, int h, int bd) { \
- if (step_q4 == 16 && filter[3] != 128) { \
- uint16_t *src = CONVERT_TO_SHORTPTR(src8); \
- uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); \
- if (filter[0] || filter[1] || filter[2]) { \
- while (w >= 16) { \
- vp9_highbd_filter_block1d16_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vp9_highbd_filter_block1d8_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vp9_highbd_filter_block1d4_##dir##8_##avg##opt(src_start, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } else { \
- while (w >= 16) { \
- vp9_highbd_filter_block1d16_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 16; \
- dst += 16; \
- w -= 16; \
- } \
- while (w >= 8) { \
- vp9_highbd_filter_block1d8_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 8; \
- dst += 8; \
- w -= 8; \
- } \
- while (w >= 4) { \
- vp9_highbd_filter_block1d4_##dir##2_##avg##opt(src, \
- src_stride, \
- dst, \
- dst_stride, \
- h, \
- filter, \
- bd); \
- src += 4; \
- dst += 4; \
- w -= 4; \
- } \
- } \
- } \
- if (w) { \
- vp9_highbd_convolve8_##name##_c(src8, src_stride, dst8, dst_stride, \
- filter_x, x_step_q4, filter_y, y_step_q4, \
- w, h, bd); \
- } \
-}
-
-#define HIGH_FUN_CONV_2D(avg, opt) \
-void vp9_highbd_convolve8_##avg##opt(const uint8_t *src, ptrdiff_t src_stride, \
- uint8_t *dst, ptrdiff_t dst_stride, \
- const int16_t *filter_x, int x_step_q4, \
- const int16_t *filter_y, int y_step_q4, \
- int w, int h, int bd) { \
- assert(w <= 64); \
- assert(h <= 64); \
- if (x_step_q4 == 16 && y_step_q4 == 16) { \
- if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
- filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
- DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
- vp9_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
- CONVERT_TO_BYTEPTR(fdata2), 64, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h + 7, bd); \
- vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
- 64, dst, dst_stride, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h, bd); \
- } else { \
- DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
- vp9_highbd_convolve8_horiz_##opt(src, src_stride, \
- CONVERT_TO_BYTEPTR(fdata2), 64, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h + 1, bd); \
- vp9_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
- dst, dst_stride, \
- filter_x, x_step_q4, \
- filter_y, y_step_q4, \
- w, h, bd); \
- } \
- } else { \
- vp9_highbd_convolve8_##avg##c(src, src_stride, dst, dst_stride, \
- filter_x, x_step_q4, filter_y, y_step_q4, w, \
- h, bd); \
- } \
-}
-#endif // CONFIG_VP9_HIGHBITDEPTH
-
-#if HAVE_AVX2 && HAVE_SSSE3
-filter8_1dfunction vp9_filter_block1d16_v8_avx2;
-filter8_1dfunction vp9_filter_block1d16_h8_avx2;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-#if ARCH_X86_64
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
-#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
-#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
-#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
-#else // ARCH_X86
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
-#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
-#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
-#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
-#endif // ARCH_X86_64 / ARCH_X86
-filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
-#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
-#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
-#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
-#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3
-#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3
-#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3
-#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3
-// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
-
-// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_2D(, avx2);
-#endif // HAVE_AX2 && HAVE_SSSE3
-#if HAVE_SSSE3
-#if ARCH_X86_64
-filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
-#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
-#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
-#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
-#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
-#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
-#else // ARCH_X86
-filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
-#endif // ARCH_X86_64 / ARCH_X86
-filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
-
-filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
-filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
-filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
-
-// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
-FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
-FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
-FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
- ssse3);
-
-// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
-// uint8_t *dst, ptrdiff_t dst_stride,
-// const int16_t *filter_x, int x_step_q4,
-// const int16_t *filter_y, int y_step_q4,
-// int w, int h);
-FUN_CONV_2D(, ssse3);
-FUN_CONV_2D(avg_ , ssse3);
-#endif // HAVE_SSSE3
+#include "./vpx_config.h"
+#include "vp9/common/x86/convolve.h"
#if HAVE_SSE2
filter8_1dfunction vp9_filter_block1d16_v8_sse2;
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
index 3bc7d3918..cee8d1e76 100644
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_avx2.c
@@ -8,7 +8,14 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+// Due to a header conflict between math.h and intrinsics includes with ceil()
+// in certain configurations under vs9 this include needs to precede
+// immintrin.h.
+#include "./vp9_rtcd.h"
+
#include <immintrin.h>
+
+#include "vp9/common/x86/convolve.h"
#include "vpx_ports/mem.h"
// filters for 16_h8 and 16_v8
@@ -53,23 +60,23 @@ DECLARE_ALIGNED(32, static const uint8_t, filt4_global_avx2[32]) = {
# define MM256_BROADCASTSI128_SI256(x) _mm256_broadcastsi128_si256(x)
#endif // __clang__
-void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- int16_t *filter) {
+static void vp9_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_pixels_per_line,
+ uint8_t *output_ptr,
+ ptrdiff_t output_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
__m128i filtersReg;
__m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
__m256i firstFilters, secondFilters, thirdFilters, forthFilters;
__m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
__m256i srcReg32b1, srcReg32b2, filtersReg32;
unsigned int i;
- unsigned int src_stride, dst_stride;
+ ptrdiff_t src_stride, dst_stride;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((__m128i *)filter);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the same data
// in both lanes of 128 bit register.
filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -104,9 +111,9 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
for (i = output_height; i > 1; i-=2) {
// load the 2 strides of source
srcReg32b1 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr-3)));
+ _mm_loadu_si128((const __m128i *)(src_ptr - 3)));
srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
- _mm_loadu_si128((__m128i *)
+ _mm_loadu_si128((const __m128i *)
(src_ptr+src_pixels_per_line-3)), 1);
// filter the source buffer
@@ -135,9 +142,9 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
// reading 2 strides of the next 16 bytes
// (part of it was being read by earlier read)
srcReg32b2 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr+5)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + 5)));
srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
- _mm_loadu_si128((__m128i *)
+ _mm_loadu_si128((const __m128i *)
(src_ptr+src_pixels_per_line+5)), 1);
// add and saturate the results together
@@ -202,7 +209,7 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
__m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
__m128i srcRegFilt2, srcRegFilt3;
- srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
// filter the source buffer
srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
@@ -237,7 +244,7 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
// reading the next 16 bytes
// (part of it was being read by earlier read)
- srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
// add and saturate the results together
srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
@@ -297,12 +304,12 @@ void vp9_filter_block1d16_h8_avx2(unsigned char *src_ptr,
}
}
-void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- int16_t *filter) {
+static void vp9_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
+ ptrdiff_t src_pitch,
+ uint8_t *output_ptr,
+ ptrdiff_t out_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
__m128i filtersReg;
__m256i addFilterReg64;
__m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
@@ -310,11 +317,11 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
__m256i srcReg32b11, srcReg32b12, filtersReg32;
__m256i firstFilters, secondFilters, thirdFilters, forthFilters;
unsigned int i;
- unsigned int src_stride, dst_stride;
+ ptrdiff_t src_stride, dst_stride;
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((__m128i *)filter);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the
// same data in both lanes of 128 bit register.
filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -344,19 +351,19 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
// load 16 bytes 7 times in stride of src_pitch
srcReg32b1 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr)));
+ _mm_loadu_si128((const __m128i *)(src_ptr)));
srcReg32b2 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
srcReg32b3 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
srcReg32b4 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
srcReg32b5 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
srcReg32b6 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
srcReg32b7 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));
// have each consecutive loads on the same 256 register
srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
@@ -393,11 +400,11 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
// load the last 2 loads of 16 bytes and have every two
// consecutive loads in the same 256 bit register
srcReg32b8 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
_mm256_castsi256_si128(srcReg32b8), 1);
srcReg32b9 = _mm256_castsi128_si256(
- _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*8)));
+ _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
_mm256_castsi256_si128(srcReg32b9), 1);
@@ -476,7 +483,7 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
__m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
__m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
// load the last 16 bytes
- srcRegFilt8 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));
+ srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
// merge the last 2 results together
srcRegFilt4 = _mm_unpacklo_epi8(
@@ -542,3 +549,54 @@ void vp9_filter_block1d16_v8_avx2(unsigned char *src_ptr,
_mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
}
}
+
+#if HAVE_AVX2 && HAVE_SSSE3
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+#if ARCH_X86_64
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_intrin_ssse3
+#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_intrin_ssse3
+#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_intrin_ssse3
+#else // ARCH_X86
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+#define vp9_filter_block1d8_v8_avx2 vp9_filter_block1d8_v8_ssse3
+#define vp9_filter_block1d8_h8_avx2 vp9_filter_block1d8_h8_ssse3
+#define vp9_filter_block1d4_h8_avx2 vp9_filter_block1d4_h8_ssse3
+#endif // ARCH_X86_64
+filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
+#define vp9_filter_block1d4_v8_avx2 vp9_filter_block1d4_v8_ssse3
+#define vp9_filter_block1d16_v2_avx2 vp9_filter_block1d16_v2_ssse3
+#define vp9_filter_block1d16_h2_avx2 vp9_filter_block1d16_h2_ssse3
+#define vp9_filter_block1d8_v2_avx2 vp9_filter_block1d8_v2_ssse3
+#define vp9_filter_block1d8_h2_avx2 vp9_filter_block1d8_h2_ssse3
+#define vp9_filter_block1d4_v2_avx2 vp9_filter_block1d4_v2_ssse3
+#define vp9_filter_block1d4_h2_avx2 vp9_filter_block1d4_h2_ssse3
+// void vp9_convolve8_horiz_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vp9_convolve8_vert_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , avx2);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , avx2);
+
+// void vp9_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_2D(, avx2);
+#endif // HAVE_AX2 && HAVE_SSSE3
diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
index 4ab49e772..5fd2857e1 100644
--- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
+++ b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c
@@ -8,9 +8,14 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+// Due to a header conflict between math.h and intrinsics includes with ceil()
+// in certain configurations under vs9 this include needs to precede
+// tmmintrin.h.
+#include "./vp9_rtcd.h"
+
#include <tmmintrin.h>
-#include "./vp9_rtcd.h"
+#include "vp9/common/x86/convolve.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/emmintrin_compat.h"
@@ -40,12 +45,17 @@ DECLARE_ALIGNED(16, static const uint8_t, filt4_global[16]) = {
6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};
-void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- int16_t *filter) {
+// These are reused by the avx2 intrinsics.
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+
+void vp9_filter_block1d4_h8_intrin_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_pixels_per_line,
+ uint8_t *output_ptr,
+ ptrdiff_t output_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
__m128i firstFilters, secondFilters, shuffle1, shuffle2;
__m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
__m128i addFilterReg64, filtersReg, srcReg, minReg;
@@ -53,7 +63,7 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
addFilterReg64 =_mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((__m128i *)filter);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the same data
// in both lanes of 128 bit register.
filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -74,7 +84,7 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);
for (i = 0; i < output_height; i++) {
- srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
+ srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
// filter the source buffer
srcRegFilt1= _mm_shuffle_epi8(srcReg, shuffle1);
@@ -111,12 +121,12 @@ void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
}
}
-void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- int16_t *filter) {
+void vp9_filter_block1d8_h8_intrin_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_pixels_per_line,
+ uint8_t *output_ptr,
+ ptrdiff_t output_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
__m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
__m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
__m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
@@ -125,7 +135,7 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((__m128i *)filter);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the same data
// in both lanes of 128 bit register.
filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -149,7 +159,7 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
for (i = 0; i < output_height; i++) {
- srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3));
+ srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
// filter the source buffer
srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg);
@@ -191,12 +201,12 @@ void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr,
}
}
-void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- int16_t *filter) {
+static void vp9_filter_block1d16_h8_intrin_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_pixels_per_line,
+ uint8_t *output_ptr,
+ ptrdiff_t output_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
__m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
__m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
__m128i firstFilters, secondFilters, thirdFilters, forthFilters;
@@ -205,7 +215,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((__m128i *)filter);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the same data
// in both lanes of 128 bit register.
filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -229,7 +239,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
filt4Reg = _mm_load_si128((__m128i const *)filt4_global);
for (i = 0; i < output_height; i++) {
- srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3));
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
// filter the source buffer
srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg);
@@ -256,7 +266,7 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
// reading the next 16 bytes.
// (part of it was being read by earlier read)
- srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5));
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
// add and saturate the results together
srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
@@ -308,12 +318,12 @@ void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
}
}
-void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- int16_t *filter) {
+void vp9_filter_block1d8_v8_intrin_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_pitch,
+ uint8_t *output_ptr,
+ ptrdiff_t out_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
__m128i addFilterReg64, filtersReg, minReg;
__m128i firstFilters, secondFilters, thirdFilters, forthFilters;
__m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
@@ -323,7 +333,7 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((__m128i *)filter);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the same data
// in both lanes of 128 bit register.
filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -338,17 +348,17 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
// load the first 7 rows of 8 bytes
- srcReg1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]);
- srcReg2 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch)[0]);
- srcReg3 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 2)[0]);
- srcReg4 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 3)[0]);
- srcReg5 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 4)[0]);
- srcReg6 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 5)[0]);
- srcReg7 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 6)[0]);
+ srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
+ srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
+ srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
+ srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
for (i = 0; i < output_height; i++) {
// load the last 8 bytes
- srcReg8 = _mm_loadl_epi64((__m128i *)&(src_ptr + src_pitch * 7)[0]);
+ srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
// merge the result together
srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
@@ -396,12 +406,12 @@ void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr,
}
}
-void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- int16_t *filter) {
+static void vp9_filter_block1d16_v8_intrin_ssse3(const uint8_t *src_ptr,
+ ptrdiff_t src_pitch,
+ uint8_t *output_ptr,
+ ptrdiff_t out_pitch,
+ uint32_t output_height,
+ const int16_t *filter) {
__m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3;
__m128i firstFilters, secondFilters, thirdFilters, forthFilters;
__m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
@@ -411,7 +421,7 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
// create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
- filtersReg = _mm_loadu_si128((__m128i *)filter);
+ filtersReg = _mm_loadu_si128((const __m128i *)filter);
// converting the 16 bit (short) to 8 bit (byte) and have the same data
// in both lanes of 128 bit register.
filtersReg =_mm_packs_epi16(filtersReg, filtersReg);
@@ -426,17 +436,17 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));
// load the first 7 rows of 16 bytes
- srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr));
- srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch));
- srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2));
- srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3));
- srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4));
- srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5));
- srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6));
+ srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr));
+ srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
+ srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+ srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+ srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+ srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+ srcReg7 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
for (i = 0; i < output_height; i++) {
// load the last 16 bytes
- srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7));
+ srcReg8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
// merge the result together
srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2);
@@ -510,3 +520,82 @@ void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
output_ptr+=out_pitch;
}
}
+
+#if ARCH_X86_64
+filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3;
+#define vp9_filter_block1d16_v8_ssse3 vp9_filter_block1d16_v8_intrin_ssse3
+#define vp9_filter_block1d16_h8_ssse3 vp9_filter_block1d16_h8_intrin_ssse3
+#define vp9_filter_block1d8_v8_ssse3 vp9_filter_block1d8_v8_intrin_ssse3
+#define vp9_filter_block1d8_h8_ssse3 vp9_filter_block1d8_h8_intrin_ssse3
+#define vp9_filter_block1d4_h8_ssse3 vp9_filter_block1d4_h8_intrin_ssse3
+#else // ARCH_X86
+filter8_1dfunction vp9_filter_block1d16_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_ssse3;
+#endif // ARCH_X86_64
+filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3;
+
+filter8_1dfunction vp9_filter_block1d16_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_ssse3;
+filter8_1dfunction vp9_filter_block1d16_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d16_h2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d8_h2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_v2_avg_ssse3;
+filter8_1dfunction vp9_filter_block1d4_h2_avg_ssse3;
+
+// void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_1D(horiz, x_step_q4, filter_x, h, src, , ssse3);
+FUN_CONV_1D(vert, y_step_q4, filter_y, v, src - src_stride * 3, , ssse3);
+FUN_CONV_1D(avg_horiz, x_step_q4, filter_x, h, src, avg_, ssse3);
+FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
+ ssse3);
+
+// void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+// void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+// uint8_t *dst, ptrdiff_t dst_stride,
+// const int16_t *filter_x, int x_step_q4,
+// const int16_t *filter_y, int y_step_q4,
+// int w, int h);
+FUN_CONV_2D(, ssse3);
+FUN_CONV_2D(avg_ , ssse3);