Diffstat (limited to 'vp9')
-rw-r--r--  vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c   160
-rw-r--r--  vp9/common/arm/neon/vp9_iht4x4_add_neon.c          229
-rw-r--r--  vp9/common/arm/neon/vp9_iht8x8_add_neon.c          668
-rw-r--r--  vp9/common/arm/neon/vp9_iht_neon.h                  60
-rw-r--r--  vp9/common/vp9_entropy.c                              2
-rw-r--r--  vp9/common/vp9_entropy.h                              1
-rw-r--r--  vp9/common/vp9_entropymode.c                          3
-rw-r--r--  vp9/common/vp9_entropymv.c                            4
-rw-r--r--  vp9/common/vp9_loopfilter.c                           2
-rw-r--r--  vp9/common/vp9_pred_common.c                          5
-rw-r--r--  vp9/common/vp9_rtcd_defs.pl                          19
-rw-r--r--  vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c       419
-rw-r--r--  vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c         255
-rw-r--r--  vp9/common/x86/vp9_idct_intrin_sse2.c                40
-rw-r--r--  vp9/encoder/vp9_aq_cyclicrefresh.c                    4
-rw-r--r--  vp9/encoder/vp9_context_tree.c                        5
-rw-r--r--  vp9/encoder/vp9_encodeframe.c                        18
-rw-r--r--  vp9/encoder/vp9_encodemb.c                            9
-rw-r--r--  vp9/encoder/vp9_encoder.c                           157
-rw-r--r--  vp9/encoder/vp9_encoder.h                            11
-rw-r--r--  vp9/encoder/vp9_firstpass.c                          68
-rw-r--r--  vp9/encoder/vp9_firstpass.h                          12
-rw-r--r--  vp9/encoder/vp9_mbgraph.h                             4
-rw-r--r--  vp9/encoder/vp9_mcomp.c                              10
-rw-r--r--  vp9/encoder/vp9_pickmode.c                           86
-rw-r--r--  vp9/encoder/vp9_ratectrl.c                           41
-rw-r--r--  vp9/encoder/vp9_ratectrl.h                            8
-rw-r--r--  vp9/encoder/vp9_rdopt.c                               4
-rw-r--r--  vp9/encoder/vp9_svc_layercontext.c                    8
-rw-r--r--  vp9/encoder/vp9_svc_layercontext.h                    2
-rw-r--r--  vp9/encoder/x86/vp9_dct_intrin_sse2.c                12
-rw-r--r--  vp9/encoder/x86/vp9_quantize_avx2.c                 140
-rw-r--r--  vp9/vp9_common.mk                                    16
-rw-r--r--  vp9/vp9_cx_iface.c                                   61
-rw-r--r--  vp9/vp9cx.mk                                          1
35 files changed, 1674 insertions, 870 deletions
diff --git a/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c b/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c
new file mode 100644
index 000000000..46284238d
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_highbd_iht4x4_add_neon.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include <assert.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/arm/neon/vp9_iht_neon.h"
+#include "vpx_dsp/arm/highbd_idct_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void highbd_iadst4(int32x4_t *const io) {
+ const int32_t sinpis[4] = { sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9 };
+ const int32x4_t sinpi = vld1q_s32(sinpis);
+ int32x4_t s[8];
+
+ s[0] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 0);
+ s[1] = vmulq_lane_s32(io[0], vget_low_s32(sinpi), 1);
+ s[2] = vmulq_lane_s32(io[1], vget_high_s32(sinpi), 0);
+ s[3] = vmulq_lane_s32(io[2], vget_high_s32(sinpi), 1);
+ s[4] = vmulq_lane_s32(io[2], vget_low_s32(sinpi), 0);
+ s[5] = vmulq_lane_s32(io[3], vget_low_s32(sinpi), 1);
+ s[6] = vmulq_lane_s32(io[3], vget_high_s32(sinpi), 1);
+ s[7] = vsubq_s32(io[0], io[2]);
+ s[7] = vaddq_s32(s[7], io[3]);
+
+ s[0] = vaddq_s32(s[0], s[3]);
+ s[0] = vaddq_s32(s[0], s[5]);
+ s[1] = vsubq_s32(s[1], s[4]);
+ s[1] = vsubq_s32(s[1], s[6]);
+ s[3] = s[2];
+ s[2] = vmulq_lane_s32(s[7], vget_high_s32(sinpi), 0);
+
+ io[0] = vaddq_s32(s[0], s[3]);
+ io[1] = vaddq_s32(s[1], s[3]);
+ io[2] = s[2];
+ io[3] = vaddq_s32(s[0], s[1]);
+ io[3] = vsubq_s32(io[3], s[3]);
+ io[0] = vrshrq_n_s32(io[0], DCT_CONST_BITS);
+ io[1] = vrshrq_n_s32(io[1], DCT_CONST_BITS);
+ io[2] = vrshrq_n_s32(io[2], DCT_CONST_BITS);
+ io[3] = vrshrq_n_s32(io[3], DCT_CONST_BITS);
+}
+
+void vp9_highbd_iht4x4_16_add_neon(const tran_low_t *input, uint16_t *dest,
+ int stride, int tx_type, int bd) {
+ const int16x8_t max = vdupq_n_s16((1 << bd) - 1);
+ int16x8_t a[2];
+ int32x4_t c[4];
+
+ c[0] = vld1q_s32(input);
+ c[1] = vld1q_s32(input + 4);
+ c[2] = vld1q_s32(input + 8);
+ c[3] = vld1q_s32(input + 12);
+
+ if (bd == 8) {
+ a[0] = vcombine_s16(vmovn_s32(c[0]), vmovn_s32(c[1]));
+ a[1] = vcombine_s16(vmovn_s32(c[2]), vmovn_s32(c[3]));
+ transpose_s16_4x4q(&a[0], &a[1]);
+
+ switch (tx_type) {
+ case DCT_DCT:
+ idct4x4_16_kernel_bd8(a);
+ a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+ transpose_s16_4x4q(&a[0], &a[1]);
+ idct4x4_16_kernel_bd8(a);
+ a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+ break;
+
+ case ADST_DCT:
+ idct4x4_16_kernel_bd8(a);
+ a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+ transpose_s16_4x4q(&a[0], &a[1]);
+ iadst4(a);
+ break;
+
+ case DCT_ADST:
+ iadst4(a);
+ transpose_s16_4x4q(&a[0], &a[1]);
+ idct4x4_16_kernel_bd8(a);
+ a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+ break;
+
+ default:
+ assert(tx_type == ADST_ADST);
+ iadst4(a);
+ transpose_s16_4x4q(&a[0], &a[1]);
+ iadst4(a);
+ break;
+ }
+ a[0] = vrshrq_n_s16(a[0], 4);
+ a[1] = vrshrq_n_s16(a[1], 4);
+ } else {
+ switch (tx_type) {
+ case DCT_DCT: {
+ const int32x4_t cospis = vld1q_s32(kCospi32);
+
+ if (bd == 10) {
+ idct4x4_16_kernel_bd10(cospis, c);
+ idct4x4_16_kernel_bd10(cospis, c);
+ } else {
+ idct4x4_16_kernel_bd12(cospis, c);
+ idct4x4_16_kernel_bd12(cospis, c);
+ }
+ break;
+ }
+
+ case ADST_DCT: {
+ const int32x4_t cospis = vld1q_s32(kCospi32);
+
+ if (bd == 10) {
+ idct4x4_16_kernel_bd10(cospis, c);
+ } else {
+ idct4x4_16_kernel_bd12(cospis, c);
+ }
+ transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]);
+ highbd_iadst4(c);
+ break;
+ }
+
+ case DCT_ADST: {
+ const int32x4_t cospis = vld1q_s32(kCospi32);
+
+ transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]);
+ highbd_iadst4(c);
+ if (bd == 10) {
+ idct4x4_16_kernel_bd10(cospis, c);
+ } else {
+ idct4x4_16_kernel_bd12(cospis, c);
+ }
+ break;
+ }
+
+ default: {
+ assert(tx_type == ADST_ADST);
+ transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]);
+ highbd_iadst4(c);
+ transpose_s32_4x4(&c[0], &c[1], &c[2], &c[3]);
+ highbd_iadst4(c);
+ break;
+ }
+ }
+ a[0] = vcombine_s16(vqrshrn_n_s32(c[0], 4), vqrshrn_n_s32(c[1], 4));
+ a[1] = vcombine_s16(vqrshrn_n_s32(c[2], 4), vqrshrn_n_s32(c[3], 4));
+ }
+
+ highbd_idct4x4_1_add_kernel1(&dest, stride, a[0], max);
+ highbd_idct4x4_1_add_kernel1(&dest, stride, a[1], max);
+}
diff --git a/vp9/common/arm/neon/vp9_iht4x4_add_neon.c b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
index 025254c3f..4f0a90f21 100644
--- a/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
+++ b/vp9/common/arm/neon/vp9_iht4x4_add_neon.c
@@ -14,206 +14,63 @@
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
+#include "vp9/common/arm/neon/vp9_iht_neon.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/txfm_common.h"
-static INLINE void TRANSPOSE4X4(int16x8_t *q8s16, int16x8_t *q9s16) {
- int32x4_t q8s32, q9s32;
- int16x4x2_t d0x2s16, d1x2s16;
- int32x4x2_t q0x2s32;
-
- d0x2s16 = vtrn_s16(vget_low_s16(*q8s16), vget_high_s16(*q8s16));
- d1x2s16 = vtrn_s16(vget_low_s16(*q9s16), vget_high_s16(*q9s16));
-
- q8s32 = vreinterpretq_s32_s16(vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]));
- q9s32 = vreinterpretq_s32_s16(vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]));
- q0x2s32 = vtrnq_s32(q8s32, q9s32);
-
- *q8s16 = vreinterpretq_s16_s32(q0x2s32.val[0]);
- *q9s16 = vreinterpretq_s16_s32(q0x2s32.val[1]);
-}
-
-static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16, int16x4_t *d1s16,
- int16x4_t *d2s16) {
- *d0s16 = vdup_n_s16(cospi_8_64);
- *d1s16 = vdup_n_s16(cospi_16_64);
- *d2s16 = vdup_n_s16(cospi_24_64);
-}
-
-static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16,
- int16x4_t *d5s16, int16x8_t *q3s16) {
- *d3s16 = vdup_n_s16(sinpi_1_9);
- *d4s16 = vdup_n_s16(sinpi_2_9);
- *q3s16 = vdupq_n_s16(sinpi_3_9);
- *d5s16 = vdup_n_s16(sinpi_4_9);
-}
-
-static INLINE void IDCT4x4_1D(int16x4_t *d0s16, int16x4_t *d1s16,
- int16x4_t *d2s16, int16x8_t *q8s16,
- int16x8_t *q9s16) {
- int16x4_t d16s16, d17s16, d18s16, d19s16, d23s16, d24s16;
- int16x4_t d26s16, d27s16, d28s16, d29s16;
- int32x4_t q10s32, q13s32, q14s32, q15s32;
- int16x8_t q13s16, q14s16;
-
- d16s16 = vget_low_s16(*q8s16);
- d17s16 = vget_high_s16(*q8s16);
- d18s16 = vget_low_s16(*q9s16);
- d19s16 = vget_high_s16(*q9s16);
-
- d23s16 = vadd_s16(d16s16, d18s16);
- d24s16 = vsub_s16(d16s16, d18s16);
-
- q15s32 = vmull_s16(d17s16, *d2s16);
- q10s32 = vmull_s16(d17s16, *d0s16);
- q13s32 = vmull_s16(d23s16, *d1s16);
- q14s32 = vmull_s16(d24s16, *d1s16);
- q15s32 = vmlsl_s16(q15s32, d19s16, *d0s16);
- q10s32 = vmlal_s16(q10s32, d19s16, *d2s16);
-
- d26s16 = vrshrn_n_s32(q13s32, 14);
- d27s16 = vrshrn_n_s32(q14s32, 14);
- d29s16 = vrshrn_n_s32(q15s32, 14);
- d28s16 = vrshrn_n_s32(q10s32, 14);
-
- q13s16 = vcombine_s16(d26s16, d27s16);
- q14s16 = vcombine_s16(d28s16, d29s16);
- *q8s16 = vaddq_s16(q13s16, q14s16);
- *q9s16 = vsubq_s16(q13s16, q14s16);
- *q9s16 = vcombine_s16(vget_high_s16(*q9s16), vget_low_s16(*q9s16)); // vswp
-}
-
-static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16,
- int16x4_t *d5s16, int16x8_t *q3s16,
- int16x8_t *q8s16, int16x8_t *q9s16) {
- int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
- int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
-
- d6s16 = vget_low_s16(*q3s16);
-
- d16s16 = vget_low_s16(*q8s16);
- d17s16 = vget_high_s16(*q8s16);
- d18s16 = vget_low_s16(*q9s16);
- d19s16 = vget_high_s16(*q9s16);
-
- q10s32 = vmull_s16(*d3s16, d16s16);
- q11s32 = vmull_s16(*d4s16, d16s16);
- q12s32 = vmull_s16(d6s16, d17s16);
- q13s32 = vmull_s16(*d5s16, d18s16);
- q14s32 = vmull_s16(*d3s16, d18s16);
- q15s32 = vmovl_s16(d16s16);
- q15s32 = vaddw_s16(q15s32, d19s16);
- q8s32 = vmull_s16(*d4s16, d19s16);
- q15s32 = vsubw_s16(q15s32, d18s16);
- q9s32 = vmull_s16(*d5s16, d19s16);
-
- q10s32 = vaddq_s32(q10s32, q13s32);
- q10s32 = vaddq_s32(q10s32, q8s32);
- q11s32 = vsubq_s32(q11s32, q14s32);
- q8s32 = vdupq_n_s32(sinpi_3_9);
- q11s32 = vsubq_s32(q11s32, q9s32);
- q15s32 = vmulq_s32(q15s32, q8s32);
-
- q13s32 = vaddq_s32(q10s32, q12s32);
- q10s32 = vaddq_s32(q10s32, q11s32);
- q14s32 = vaddq_s32(q11s32, q12s32);
- q10s32 = vsubq_s32(q10s32, q12s32);
-
- d16s16 = vrshrn_n_s32(q13s32, 14);
- d17s16 = vrshrn_n_s32(q14s32, 14);
- d18s16 = vrshrn_n_s32(q15s32, 14);
- d19s16 = vrshrn_n_s32(q10s32, 14);
-
- *q8s16 = vcombine_s16(d16s16, d17s16);
- *q9s16 = vcombine_s16(d18s16, d19s16);
-}
-
void vp9_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
- uint8x8_t d26u8, d27u8;
- int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16;
- uint32x2_t d26u32, d27u32;
- int16x8_t q3s16, q8s16, q9s16;
- uint16x8_t q8u16, q9u16;
-
- d26u32 = d27u32 = vdup_n_u32(0);
+ int16x8_t a[2];
+ uint8x8_t s[2], d[2];
+ uint16x8_t sum[2];
- q8s16 = vld1q_s16(input);
- q9s16 = vld1q_s16(input + 8);
+ assert(!((intptr_t)dest % sizeof(uint32_t)));
+ assert(!(stride % sizeof(uint32_t)));
- TRANSPOSE4X4(&q8s16, &q9s16);
+ a[0] = load_tran_low_to_s16q(input);
+ a[1] = load_tran_low_to_s16q(input + 8);
+ transpose_s16_4x4q(&a[0], &a[1]);
switch (tx_type) {
- case 0: // idct_idct is not supported. Fall back to C
- vp9_iht4x4_16_add_c(input, dest, stride, tx_type);
- return;
- case 1: // iadst_idct
- // generate constants
- GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
- GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
-
- // first transform rows
- IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
-
- // transpose the matrix
- TRANSPOSE4X4(&q8s16, &q9s16);
-
- // then transform columns
- IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+ case DCT_DCT:
+ idct4x4_16_kernel_bd8(a);
+ a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+ transpose_s16_4x4q(&a[0], &a[1]);
+ idct4x4_16_kernel_bd8(a);
+ a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
break;
- case 2: // idct_iadst
- // generate constantsyy
- GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16);
- GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
- // first transform rows
- IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
-
- // transpose the matrix
- TRANSPOSE4X4(&q8s16, &q9s16);
-
- // then transform columns
- IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16);
+ case ADST_DCT:
+ idct4x4_16_kernel_bd8(a);
+ a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
+ transpose_s16_4x4q(&a[0], &a[1]);
+ iadst4(a);
break;
- case 3: // iadst_iadst
- // generate constants
- GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16);
-
- // first transform rows
- IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
- // transpose the matrix
- TRANSPOSE4X4(&q8s16, &q9s16);
-
- // then transform columns
- IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16);
+ case DCT_ADST:
+ iadst4(a);
+ transpose_s16_4x4q(&a[0], &a[1]);
+ idct4x4_16_kernel_bd8(a);
+ a[1] = vcombine_s16(vget_high_s16(a[1]), vget_low_s16(a[1]));
break;
- default: // iadst_idct
- assert(0);
+
+ default:
+ assert(tx_type == ADST_ADST);
+ iadst4(a);
+ transpose_s16_4x4q(&a[0], &a[1]);
+ iadst4(a);
break;
}
- q8s16 = vrshrq_n_s16(q8s16, 4);
- q9s16 = vrshrq_n_s16(q9s16, 4);
-
- d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0);
- dest += stride;
- d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1);
- dest += stride;
- d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0);
- dest += stride;
- d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1);
-
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
- q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));
-
- d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
-
- vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1);
- dest -= stride;
- vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0);
- dest -= stride;
- vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1);
- dest -= stride;
- vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0);
+ a[0] = vrshrq_n_s16(a[0], 4);
+ a[1] = vrshrq_n_s16(a[1], 4);
+ s[0] = load_u8(dest, stride);
+ s[1] = load_u8(dest + 2 * stride, stride);
+ sum[0] = vaddw_u8(vreinterpretq_u16_s16(a[0]), s[0]);
+ sum[1] = vaddw_u8(vreinterpretq_u16_s16(a[1]), s[1]);
+ d[0] = vqmovun_s16(vreinterpretq_s16_u16(sum[0]));
+ d[1] = vqmovun_s16(vreinterpretq_s16_u16(sum[1]));
+ store_u8(dest, stride, d[0]);
+ store_u8(dest + 2 * stride, stride, d[1]);
}
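
For reference, the rewritten 4x4 path above keeps the standard VP9 hybrid-transform shape: a 1-D transform across the rows, a transpose, the matching 1-D transform down the columns, a round shift by 4, and a saturating add into the prediction block. A minimal scalar sketch of that shape, assuming hypothetical idct4_1d()/iadst4_1d() helpers that are not part of this patch:

/* Scalar sketch only: idct4_1d()/iadst4_1d() are illustrative 1-D helpers. */
static void iht4x4_add_sketch(const int16_t *input, uint8_t *dest, int stride,
                              int tx_type) {
  int16_t out[4 * 4], row[4], tmp[4];
  int i, j;

  for (i = 0; i < 4; ++i) {  // 1-D transform over rows, stored transposed
    for (j = 0; j < 4; ++j) row[j] = input[i * 4 + j];
    if (tx_type == DCT_DCT || tx_type == ADST_DCT)
      idct4_1d(row, tmp);
    else
      iadst4_1d(row, tmp);
    for (j = 0; j < 4; ++j) out[j * 4 + i] = tmp[j];
  }
  for (i = 0; i < 4; ++i) {  // 1-D transform over columns
    if (tx_type == DCT_DCT || tx_type == DCT_ADST)
      idct4_1d(&out[i * 4], tmp);
    else
      iadst4_1d(&out[i * 4], tmp);
    for (j = 0; j < 4; ++j) {  // round by 4 bits and add to the prediction
      const int v = dest[j * stride + i] + ((tmp[j] + 8) >> 4);
      dest[j * stride + i] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
  }
}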
diff --git a/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
index 1c739861c..19163bc87 100644
--- a/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
+++ b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c
@@ -14,527 +14,199 @@
#include "./vp9_rtcd.h"
#include "./vpx_config.h"
#include "vp9/common/vp9_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/transpose_neon.h"
-static int16_t cospi_2_64 = 16305;
-static int16_t cospi_4_64 = 16069;
-static int16_t cospi_6_64 = 15679;
-static int16_t cospi_8_64 = 15137;
-static int16_t cospi_10_64 = 14449;
-static int16_t cospi_12_64 = 13623;
-static int16_t cospi_14_64 = 12665;
-static int16_t cospi_16_64 = 11585;
-static int16_t cospi_18_64 = 10394;
-static int16_t cospi_20_64 = 9102;
-static int16_t cospi_22_64 = 7723;
-static int16_t cospi_24_64 = 6270;
-static int16_t cospi_26_64 = 4756;
-static int16_t cospi_28_64 = 3196;
-static int16_t cospi_30_64 = 1606;
-
-static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
- int16x8_t *q10s16, int16x8_t *q11s16,
- int16x8_t *q12s16, int16x8_t *q13s16,
- int16x8_t *q14s16, int16x8_t *q15s16) {
- int16x4_t d0s16, d1s16, d2s16, d3s16;
- int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
- int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
- int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
- int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;
-
- d0s16 = vdup_n_s16(cospi_28_64);
- d1s16 = vdup_n_s16(cospi_4_64);
- d2s16 = vdup_n_s16(cospi_12_64);
- d3s16 = vdup_n_s16(cospi_20_64);
-
- d16s16 = vget_low_s16(*q8s16);
- d17s16 = vget_high_s16(*q8s16);
- d18s16 = vget_low_s16(*q9s16);
- d19s16 = vget_high_s16(*q9s16);
- d20s16 = vget_low_s16(*q10s16);
- d21s16 = vget_high_s16(*q10s16);
- d22s16 = vget_low_s16(*q11s16);
- d23s16 = vget_high_s16(*q11s16);
- d24s16 = vget_low_s16(*q12s16);
- d25s16 = vget_high_s16(*q12s16);
- d26s16 = vget_low_s16(*q13s16);
- d27s16 = vget_high_s16(*q13s16);
- d28s16 = vget_low_s16(*q14s16);
- d29s16 = vget_high_s16(*q14s16);
- d30s16 = vget_low_s16(*q15s16);
- d31s16 = vget_high_s16(*q15s16);
-
- q2s32 = vmull_s16(d18s16, d0s16);
- q3s32 = vmull_s16(d19s16, d0s16);
- q5s32 = vmull_s16(d26s16, d2s16);
- q6s32 = vmull_s16(d27s16, d2s16);
-
- q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
- q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
- q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
- q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);
-
- d8s16 = vrshrn_n_s32(q2s32, 14);
- d9s16 = vrshrn_n_s32(q3s32, 14);
- d10s16 = vrshrn_n_s32(q5s32, 14);
- d11s16 = vrshrn_n_s32(q6s32, 14);
- q4s16 = vcombine_s16(d8s16, d9s16);
- q5s16 = vcombine_s16(d10s16, d11s16);
-
- q2s32 = vmull_s16(d18s16, d1s16);
- q3s32 = vmull_s16(d19s16, d1s16);
- q9s32 = vmull_s16(d26s16, d3s16);
- q13s32 = vmull_s16(d27s16, d3s16);
-
- q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
- q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
- q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
- q13s32 = vmlal_s16(q13s32, d23s16, d2s16);
-
- d14s16 = vrshrn_n_s32(q2s32, 14);
- d15s16 = vrshrn_n_s32(q3s32, 14);
- d12s16 = vrshrn_n_s32(q9s32, 14);
- d13s16 = vrshrn_n_s32(q13s32, 14);
- q6s16 = vcombine_s16(d12s16, d13s16);
- q7s16 = vcombine_s16(d14s16, d15s16);
-
- d0s16 = vdup_n_s16(cospi_16_64);
-
- q2s32 = vmull_s16(d16s16, d0s16);
- q3s32 = vmull_s16(d17s16, d0s16);
- q13s32 = vmull_s16(d16s16, d0s16);
- q15s32 = vmull_s16(d17s16, d0s16);
-
- q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
- q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
- q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
- q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);
-
- d0s16 = vdup_n_s16(cospi_24_64);
- d1s16 = vdup_n_s16(cospi_8_64);
-
- d18s16 = vrshrn_n_s32(q2s32, 14);
- d19s16 = vrshrn_n_s32(q3s32, 14);
- d22s16 = vrshrn_n_s32(q13s32, 14);
- d23s16 = vrshrn_n_s32(q15s32, 14);
- *q9s16 = vcombine_s16(d18s16, d19s16);
- *q11s16 = vcombine_s16(d22s16, d23s16);
-
- q2s32 = vmull_s16(d20s16, d0s16);
- q3s32 = vmull_s16(d21s16, d0s16);
- q8s32 = vmull_s16(d20s16, d1s16);
- q12s32 = vmull_s16(d21s16, d1s16);
-
- q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
- q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
- q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
- q12s32 = vmlal_s16(q12s32, d29s16, d0s16);
-
- d26s16 = vrshrn_n_s32(q2s32, 14);
- d27s16 = vrshrn_n_s32(q3s32, 14);
- d30s16 = vrshrn_n_s32(q8s32, 14);
- d31s16 = vrshrn_n_s32(q12s32, 14);
- *q13s16 = vcombine_s16(d26s16, d27s16);
- *q15s16 = vcombine_s16(d30s16, d31s16);
-
- q0s16 = vaddq_s16(*q9s16, *q15s16);
- q1s16 = vaddq_s16(*q11s16, *q13s16);
- q2s16 = vsubq_s16(*q11s16, *q13s16);
- q3s16 = vsubq_s16(*q9s16, *q15s16);
-
- *q13s16 = vsubq_s16(q4s16, q5s16);
- q4s16 = vaddq_s16(q4s16, q5s16);
- *q14s16 = vsubq_s16(q7s16, q6s16);
- q7s16 = vaddq_s16(q7s16, q6s16);
- d26s16 = vget_low_s16(*q13s16);
- d27s16 = vget_high_s16(*q13s16);
- d28s16 = vget_low_s16(*q14s16);
- d29s16 = vget_high_s16(*q14s16);
-
- d16s16 = vdup_n_s16(cospi_16_64);
-
- q9s32 = vmull_s16(d28s16, d16s16);
- q10s32 = vmull_s16(d29s16, d16s16);
- q11s32 = vmull_s16(d28s16, d16s16);
- q12s32 = vmull_s16(d29s16, d16s16);
-
- q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
- q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
- q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
- q12s32 = vmlal_s16(q12s32, d27s16, d16s16);
-
- d10s16 = vrshrn_n_s32(q9s32, 14);
- d11s16 = vrshrn_n_s32(q10s32, 14);
- d12s16 = vrshrn_n_s32(q11s32, 14);
- d13s16 = vrshrn_n_s32(q12s32, 14);
- q5s16 = vcombine_s16(d10s16, d11s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
-
- *q8s16 = vaddq_s16(q0s16, q7s16);
- *q9s16 = vaddq_s16(q1s16, q6s16);
- *q10s16 = vaddq_s16(q2s16, q5s16);
- *q11s16 = vaddq_s16(q3s16, q4s16);
- *q12s16 = vsubq_s16(q3s16, q4s16);
- *q13s16 = vsubq_s16(q2s16, q5s16);
- *q14s16 = vsubq_s16(q1s16, q6s16);
- *q15s16 = vsubq_s16(q0s16, q7s16);
+static INLINE void iadst_half_butterfly_neon(int16x8_t *const x,
+ const int16x4_t c) {
+ const int16x8_t sum = vaddq_s16(x[0], x[1]);
+ const int16x8_t sub = vsubq_s16(x[0], x[1]);
+ int32x4_t t0[2], t1[2];
+
+ t0[0] = vmull_lane_s16(vget_low_s16(sum), c, 0);
+ t0[1] = vmull_lane_s16(vget_high_s16(sum), c, 0);
+ t1[0] = vmull_lane_s16(vget_low_s16(sub), c, 0);
+ t1[1] = vmull_lane_s16(vget_high_s16(sub), c, 0);
+ x[0] = dct_const_round_shift_low_8(t0);
+ x[1] = dct_const_round_shift_low_8(t1);
}
-static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
- int16x8_t *q10s16, int16x8_t *q11s16,
- int16x8_t *q12s16, int16x8_t *q13s16,
- int16x8_t *q14s16, int16x8_t *q15s16) {
- int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
- int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
- int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
- int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
- int16x8_t q2s16, q4s16, q5s16, q6s16;
- int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
- int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;
-
- d16s16 = vget_low_s16(*q8s16);
- d17s16 = vget_high_s16(*q8s16);
- d18s16 = vget_low_s16(*q9s16);
- d19s16 = vget_high_s16(*q9s16);
- d20s16 = vget_low_s16(*q10s16);
- d21s16 = vget_high_s16(*q10s16);
- d22s16 = vget_low_s16(*q11s16);
- d23s16 = vget_high_s16(*q11s16);
- d24s16 = vget_low_s16(*q12s16);
- d25s16 = vget_high_s16(*q12s16);
- d26s16 = vget_low_s16(*q13s16);
- d27s16 = vget_high_s16(*q13s16);
- d28s16 = vget_low_s16(*q14s16);
- d29s16 = vget_high_s16(*q14s16);
- d30s16 = vget_low_s16(*q15s16);
- d31s16 = vget_high_s16(*q15s16);
-
- d14s16 = vdup_n_s16(cospi_2_64);
- d15s16 = vdup_n_s16(cospi_30_64);
-
- q1s32 = vmull_s16(d30s16, d14s16);
- q2s32 = vmull_s16(d31s16, d14s16);
- q3s32 = vmull_s16(d30s16, d15s16);
- q4s32 = vmull_s16(d31s16, d15s16);
-
- d30s16 = vdup_n_s16(cospi_18_64);
- d31s16 = vdup_n_s16(cospi_14_64);
-
- q1s32 = vmlal_s16(q1s32, d16s16, d15s16);
- q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
- q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);
- q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);
-
- q5s32 = vmull_s16(d22s16, d30s16);
- q6s32 = vmull_s16(d23s16, d30s16);
- q7s32 = vmull_s16(d22s16, d31s16);
- q8s32 = vmull_s16(d23s16, d31s16);
-
- q5s32 = vmlal_s16(q5s32, d24s16, d31s16);
- q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
- q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);
- q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);
-
- q11s32 = vaddq_s32(q1s32, q5s32);
- q12s32 = vaddq_s32(q2s32, q6s32);
- q1s32 = vsubq_s32(q1s32, q5s32);
- q2s32 = vsubq_s32(q2s32, q6s32);
-
- d22s16 = vrshrn_n_s32(q11s32, 14);
- d23s16 = vrshrn_n_s32(q12s32, 14);
- *q11s16 = vcombine_s16(d22s16, d23s16);
-
- q12s32 = vaddq_s32(q3s32, q7s32);
- q15s32 = vaddq_s32(q4s32, q8s32);
- q3s32 = vsubq_s32(q3s32, q7s32);
- q4s32 = vsubq_s32(q4s32, q8s32);
-
- d2s16 = vrshrn_n_s32(q1s32, 14);
- d3s16 = vrshrn_n_s32(q2s32, 14);
- d24s16 = vrshrn_n_s32(q12s32, 14);
- d25s16 = vrshrn_n_s32(q15s32, 14);
- d6s16 = vrshrn_n_s32(q3s32, 14);
- d7s16 = vrshrn_n_s32(q4s32, 14);
- *q12s16 = vcombine_s16(d24s16, d25s16);
-
- d0s16 = vdup_n_s16(cospi_10_64);
- d1s16 = vdup_n_s16(cospi_22_64);
- q4s32 = vmull_s16(d26s16, d0s16);
- q5s32 = vmull_s16(d27s16, d0s16);
- q2s32 = vmull_s16(d26s16, d1s16);
- q6s32 = vmull_s16(d27s16, d1s16);
-
- d30s16 = vdup_n_s16(cospi_26_64);
- d31s16 = vdup_n_s16(cospi_6_64);
-
- q4s32 = vmlal_s16(q4s32, d20s16, d1s16);
- q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
- q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);
- q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);
-
- q0s32 = vmull_s16(d18s16, d30s16);
- q13s32 = vmull_s16(d19s16, d30s16);
-
- q0s32 = vmlal_s16(q0s32, d28s16, d31s16);
- q13s32 = vmlal_s16(q13s32, d29s16, d31s16);
-
- q10s32 = vmull_s16(d18s16, d31s16);
- q9s32 = vmull_s16(d19s16, d31s16);
-
- q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);
- q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);
-
- q14s32 = vaddq_s32(q2s32, q10s32);
- q15s32 = vaddq_s32(q6s32, q9s32);
- q2s32 = vsubq_s32(q2s32, q10s32);
- q6s32 = vsubq_s32(q6s32, q9s32);
-
- d28s16 = vrshrn_n_s32(q14s32, 14);
- d29s16 = vrshrn_n_s32(q15s32, 14);
- d4s16 = vrshrn_n_s32(q2s32, 14);
- d5s16 = vrshrn_n_s32(q6s32, 14);
- *q14s16 = vcombine_s16(d28s16, d29s16);
-
- q9s32 = vaddq_s32(q4s32, q0s32);
- q10s32 = vaddq_s32(q5s32, q13s32);
- q4s32 = vsubq_s32(q4s32, q0s32);
- q5s32 = vsubq_s32(q5s32, q13s32);
-
- d30s16 = vdup_n_s16(cospi_8_64);
- d31s16 = vdup_n_s16(cospi_24_64);
-
- d18s16 = vrshrn_n_s32(q9s32, 14);
- d19s16 = vrshrn_n_s32(q10s32, 14);
- d8s16 = vrshrn_n_s32(q4s32, 14);
- d9s16 = vrshrn_n_s32(q5s32, 14);
- *q9s16 = vcombine_s16(d18s16, d19s16);
-
- q5s32 = vmull_s16(d2s16, d30s16);
- q6s32 = vmull_s16(d3s16, d30s16);
- q7s32 = vmull_s16(d2s16, d31s16);
- q0s32 = vmull_s16(d3s16, d31s16);
-
- q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
- q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
- q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
- q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);
-
- q1s32 = vmull_s16(d4s16, d30s16);
- q3s32 = vmull_s16(d5s16, d30s16);
- q10s32 = vmull_s16(d4s16, d31s16);
- q2s32 = vmull_s16(d5s16, d31s16);
-
- q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
- q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
- q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
- q2s32 = vmlal_s16(q2s32, d9s16, d30s16);
-
- *q8s16 = vaddq_s16(*q11s16, *q9s16);
- *q11s16 = vsubq_s16(*q11s16, *q9s16);
- q4s16 = vaddq_s16(*q12s16, *q14s16);
- *q12s16 = vsubq_s16(*q12s16, *q14s16);
-
- q14s32 = vaddq_s32(q5s32, q1s32);
- q15s32 = vaddq_s32(q6s32, q3s32);
- q5s32 = vsubq_s32(q5s32, q1s32);
- q6s32 = vsubq_s32(q6s32, q3s32);
-
- d18s16 = vrshrn_n_s32(q14s32, 14);
- d19s16 = vrshrn_n_s32(q15s32, 14);
- d10s16 = vrshrn_n_s32(q5s32, 14);
- d11s16 = vrshrn_n_s32(q6s32, 14);
- *q9s16 = vcombine_s16(d18s16, d19s16);
-
- q1s32 = vaddq_s32(q7s32, q10s32);
- q3s32 = vaddq_s32(q0s32, q2s32);
- q7s32 = vsubq_s32(q7s32, q10s32);
- q0s32 = vsubq_s32(q0s32, q2s32);
-
- d28s16 = vrshrn_n_s32(q1s32, 14);
- d29s16 = vrshrn_n_s32(q3s32, 14);
- d14s16 = vrshrn_n_s32(q7s32, 14);
- d15s16 = vrshrn_n_s32(q0s32, 14);
- *q14s16 = vcombine_s16(d28s16, d29s16);
-
- d30s16 = vdup_n_s16(cospi_16_64);
-
- d22s16 = vget_low_s16(*q11s16);
- d23s16 = vget_high_s16(*q11s16);
- q2s32 = vmull_s16(d22s16, d30s16);
- q3s32 = vmull_s16(d23s16, d30s16);
- q13s32 = vmull_s16(d22s16, d30s16);
- q1s32 = vmull_s16(d23s16, d30s16);
+static INLINE void iadst_butterfly_lane_0_1_neon(const int16x8_t in0,
+ const int16x8_t in1,
+ const int16x4_t c,
+ int32x4_t *const s0,
+ int32x4_t *const s1) {
+ s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 0);
+ s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 0);
+ s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 1);
+ s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 1);
+
+ s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 1);
+ s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 1);
+ s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 0);
+ s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 0);
+}
- d24s16 = vget_low_s16(*q12s16);
- d25s16 = vget_high_s16(*q12s16);
- q2s32 = vmlal_s16(q2s32, d24s16, d30s16);
- q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
- q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
- q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);
+static INLINE void iadst_butterfly_lane_2_3_neon(const int16x8_t in0,
+ const int16x8_t in1,
+ const int16x4_t c,
+ int32x4_t *const s0,
+ int32x4_t *const s1) {
+ s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
+ s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
+ s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
+ s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
+
+ s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 3);
+ s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 3);
+ s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 2);
+ s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 2);
+}
- d4s16 = vrshrn_n_s32(q2s32, 14);
- d5s16 = vrshrn_n_s32(q3s32, 14);
- d24s16 = vrshrn_n_s32(q13s32, 14);
- d25s16 = vrshrn_n_s32(q1s32, 14);
- q2s16 = vcombine_s16(d4s16, d5s16);
- *q12s16 = vcombine_s16(d24s16, d25s16);
+static INLINE void iadst_butterfly_lane_3_2_neon(const int16x8_t in0,
+ const int16x8_t in1,
+ const int16x4_t c,
+ int32x4_t *const s0,
+ int32x4_t *const s1) {
+ s0[0] = vmull_lane_s16(vget_low_s16(in0), c, 3);
+ s0[1] = vmull_lane_s16(vget_high_s16(in0), c, 3);
+ s1[0] = vmull_lane_s16(vget_low_s16(in0), c, 2);
+ s1[1] = vmull_lane_s16(vget_high_s16(in0), c, 2);
+
+ s0[0] = vmlal_lane_s16(s0[0], vget_low_s16(in1), c, 2);
+ s0[1] = vmlal_lane_s16(s0[1], vget_high_s16(in1), c, 2);
+ s1[0] = vmlsl_lane_s16(s1[0], vget_low_s16(in1), c, 3);
+ s1[1] = vmlsl_lane_s16(s1[1], vget_high_s16(in1), c, 3);
+}
- q13s32 = vmull_s16(d10s16, d30s16);
- q1s32 = vmull_s16(d11s16, d30s16);
- q11s32 = vmull_s16(d10s16, d30s16);
- q0s32 = vmull_s16(d11s16, d30s16);
+static INLINE int16x8_t add_dct_const_round_shift_low_8(
+ const int32x4_t *const in0, const int32x4_t *const in1) {
+ int32x4_t sum[2];
- q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
- q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
- q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
- q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);
+ sum[0] = vaddq_s32(in0[0], in1[0]);
+ sum[1] = vaddq_s32(in0[1], in1[1]);
+ return dct_const_round_shift_low_8(sum);
+}
- d20s16 = vrshrn_n_s32(q13s32, 14);
- d21s16 = vrshrn_n_s32(q1s32, 14);
- d12s16 = vrshrn_n_s32(q11s32, 14);
- d13s16 = vrshrn_n_s32(q0s32, 14);
- *q10s16 = vcombine_s16(d20s16, d21s16);
- q6s16 = vcombine_s16(d12s16, d13s16);
+static INLINE int16x8_t sub_dct_const_round_shift_low_8(
+ const int32x4_t *const in0, const int32x4_t *const in1) {
+ int32x4_t sum[2];
- q5s16 = vdupq_n_s16(0);
+ sum[0] = vsubq_s32(in0[0], in1[0]);
+ sum[1] = vsubq_s32(in0[1], in1[1]);
+ return dct_const_round_shift_low_8(sum);
+}
- *q9s16 = vsubq_s16(q5s16, *q9s16);
- *q11s16 = vsubq_s16(q5s16, q2s16);
- *q13s16 = vsubq_s16(q5s16, q6s16);
- *q15s16 = vsubq_s16(q5s16, q4s16);
+static INLINE void iadst8(int16x8_t *const io) {
+ const int16x4_t c0 =
+ create_s16x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64);
+ const int16x4_t c1 =
+ create_s16x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64);
+ const int16x4_t c2 =
+ create_s16x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64);
+ int16x8_t x[8], t[4];
+ int32x4_t s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
+
+ x[0] = io[7];
+ x[1] = io[0];
+ x[2] = io[5];
+ x[3] = io[2];
+ x[4] = io[3];
+ x[5] = io[4];
+ x[6] = io[1];
+ x[7] = io[6];
+
+ // stage 1
+ iadst_butterfly_lane_0_1_neon(x[0], x[1], c0, s0, s1);
+ iadst_butterfly_lane_2_3_neon(x[2], x[3], c0, s2, s3);
+ iadst_butterfly_lane_0_1_neon(x[4], x[5], c1, s4, s5);
+ iadst_butterfly_lane_2_3_neon(x[6], x[7], c1, s6, s7);
+
+ x[0] = add_dct_const_round_shift_low_8(s0, s4);
+ x[1] = add_dct_const_round_shift_low_8(s1, s5);
+ x[2] = add_dct_const_round_shift_low_8(s2, s6);
+ x[3] = add_dct_const_round_shift_low_8(s3, s7);
+ x[4] = sub_dct_const_round_shift_low_8(s0, s4);
+ x[5] = sub_dct_const_round_shift_low_8(s1, s5);
+ x[6] = sub_dct_const_round_shift_low_8(s2, s6);
+ x[7] = sub_dct_const_round_shift_low_8(s3, s7);
+
+ // stage 2
+ t[0] = x[0];
+ t[1] = x[1];
+ t[2] = x[2];
+ t[3] = x[3];
+ iadst_butterfly_lane_2_3_neon(x[4], x[5], c2, s4, s5);
+ iadst_butterfly_lane_3_2_neon(x[7], x[6], c2, s7, s6);
+
+ x[0] = vaddq_s16(t[0], t[2]);
+ x[1] = vaddq_s16(t[1], t[3]);
+ x[2] = vsubq_s16(t[0], t[2]);
+ x[3] = vsubq_s16(t[1], t[3]);
+ x[4] = add_dct_const_round_shift_low_8(s4, s6);
+ x[5] = add_dct_const_round_shift_low_8(s5, s7);
+ x[6] = sub_dct_const_round_shift_low_8(s4, s6);
+ x[7] = sub_dct_const_round_shift_low_8(s5, s7);
+
+ // stage 3
+ iadst_half_butterfly_neon(x + 2, c2);
+ iadst_half_butterfly_neon(x + 6, c2);
+
+ io[0] = x[0];
+ io[1] = vnegq_s16(x[4]);
+ io[2] = x[6];
+ io[3] = vnegq_s16(x[2]);
+ io[4] = x[3];
+ io[5] = vnegq_s16(x[7]);
+ io[6] = x[5];
+ io[7] = vnegq_s16(x[1]);
}
void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
- int i;
- uint8_t *d1, *d2;
- uint8x8_t d0u8, d1u8, d2u8, d3u8;
- uint64x1_t d0u64, d1u64, d2u64, d3u64;
- int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
- uint16x8_t q8u16, q9u16, q10u16, q11u16;
-
- q8s16 = vld1q_s16(input);
- q9s16 = vld1q_s16(input + 8);
- q10s16 = vld1q_s16(input + 8 * 2);
- q11s16 = vld1q_s16(input + 8 * 3);
- q12s16 = vld1q_s16(input + 8 * 4);
- q13s16 = vld1q_s16(input + 8 * 5);
- q14s16 = vld1q_s16(input + 8 * 6);
- q15s16 = vld1q_s16(input + 8 * 7);
-
- transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
- &q15s16);
+ const int16x8_t cospis = vld1q_s16(kCospi);
+ const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24
+ const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28
+ int16x8_t a[8];
+
+ a[0] = load_tran_low_to_s16q(input + 0 * 8);
+ a[1] = load_tran_low_to_s16q(input + 1 * 8);
+ a[2] = load_tran_low_to_s16q(input + 2 * 8);
+ a[3] = load_tran_low_to_s16q(input + 3 * 8);
+ a[4] = load_tran_low_to_s16q(input + 4 * 8);
+ a[5] = load_tran_low_to_s16q(input + 5 * 8);
+ a[6] = load_tran_low_to_s16q(input + 6 * 8);
+ a[7] = load_tran_low_to_s16q(input + 7 * 8);
+
+ transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
switch (tx_type) {
- case 0: // idct_idct is not supported. Fall back to C
- vp9_iht8x8_64_add_c(input, dest, stride, tx_type);
- return;
- case 1: // iadst_idct
- // generate IDCT constants
- // GENERATE_IDCT_CONSTANTS
-
- // first transform rows
- IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
- &q15s16);
-
- // transpose the matrix
- transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
- &q14s16, &q15s16);
-
- // generate IADST constants
- // GENERATE_IADST_CONSTANTS
-
- // then transform columns
- IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
- &q15s16);
+ case DCT_DCT:
+ idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a);
+ transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+ idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a);
break;
- case 2: // idct_iadst
- // generate IADST constants
- // GENERATE_IADST_CONSTANTS
-
- // first transform rows
- IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
- &q15s16);
-
- // transpose the matrix
- transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
- &q14s16, &q15s16);
- // generate IDCT constants
- // GENERATE_IDCT_CONSTANTS
-
- // then transform columns
- IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
- &q15s16);
+ case ADST_DCT:
+ idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a);
+ transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+ iadst8(a);
break;
- case 3: // iadst_iadst
- // generate IADST constants
- // GENERATE_IADST_CONSTANTS
-
- // first transform rows
- IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
- &q15s16);
-
- // transpose the matrix
- transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16,
- &q14s16, &q15s16);
- // then transform columns
- IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
- &q15s16);
+ case DCT_ADST:
+ iadst8(a);
+ transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+ idct8x8_64_1d_bd8_kernel(cospis0, cospis1, a);
break;
- default: // iadst_idct
- assert(0);
+
+ default:
+ assert(tx_type == ADST_ADST);
+ iadst8(a);
+ transpose_s16_8x8(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]);
+ iadst8(a);
break;
}
- q8s16 = vrshrq_n_s16(q8s16, 5);
- q9s16 = vrshrq_n_s16(q9s16, 5);
- q10s16 = vrshrq_n_s16(q10s16, 5);
- q11s16 = vrshrq_n_s16(q11s16, 5);
- q12s16 = vrshrq_n_s16(q12s16, 5);
- q13s16 = vrshrq_n_s16(q13s16, 5);
- q14s16 = vrshrq_n_s16(q14s16, 5);
- q15s16 = vrshrq_n_s16(q15s16, 5);
-
- for (d1 = d2 = dest, i = 0; i < 2; i++) {
- if (i != 0) {
- q8s16 = q12s16;
- q9s16 = q13s16;
- q10s16 = q14s16;
- q11s16 = q15s16;
- }
-
- d0u64 = vld1_u64((uint64_t *)d1);
- d1 += stride;
- d1u64 = vld1_u64((uint64_t *)d1);
- d1 += stride;
- d2u64 = vld1_u64((uint64_t *)d1);
- d1 += stride;
- d3u64 = vld1_u64((uint64_t *)d1);
- d1 += stride;
-
- q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
- q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
- q10u16 =
- vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
- q11u16 =
- vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));
-
- d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
- d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
- d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
- d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));
-
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
- d2 += stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
- d2 += stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
- d2 += stride;
- vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
- d2 += stride;
- }
+ idct8x8_add8x8_neon(a, dest, stride);
}
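
The iadst_butterfly_*_neon() helpers above compute the 8-point inverse ADST rotations as raw 32-bit cross products; the rounding shift back to 16 bits happens only after pairs of butterflies are added or subtracted (add/sub_dct_const_round_shift_low_8). A scalar sketch of that pattern, with illustrative names and libvpx's 14-bit fixed-point convention assumed:

/* One iadst rotation pair in scalar form: 32-bit cross products first, the
 * rounding shift by DCT_CONST_BITS (14) only after the stage add/sub.
 * Function names here are illustrative, not part of the patch. */
static void iadst_butterfly_sketch(int16_t x0, int16_t x1, int c0, int c1,
                                   int32_t *s0, int32_t *s1) {
  *s0 = c0 * x0 + c1 * x1;
  *s1 = c1 * x0 - c0 * x1;
}

static int16_t add_round_shift_sketch(int32_t a, int32_t b) {
  return (int16_t)((a + b + (1 << 13)) >> 14);  // round to nearest, then >> 14
}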
diff --git a/vp9/common/arm/neon/vp9_iht_neon.h b/vp9/common/arm/neon/vp9_iht_neon.h
new file mode 100644
index 000000000..08daa1a4a
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_iht_neon.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_
+#define VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_
+
+#include <arm_neon.h>
+
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/txfm_common.h"
+
+static INLINE void iadst4(int16x8_t *const io) {
+ const int32x4_t c3 = vdupq_n_s32(sinpi_3_9);
+ int16x4_t x[4];
+ int32x4_t s[8], output[4];
+ const int16x4_t c =
+ create_s16x4_neon(sinpi_1_9, sinpi_2_9, sinpi_3_9, sinpi_4_9);
+
+ x[0] = vget_low_s16(io[0]);
+ x[1] = vget_low_s16(io[1]);
+ x[2] = vget_high_s16(io[0]);
+ x[3] = vget_high_s16(io[1]);
+
+ s[0] = vmull_lane_s16(x[0], c, 0);
+ s[1] = vmull_lane_s16(x[0], c, 1);
+ s[2] = vmull_lane_s16(x[1], c, 2);
+ s[3] = vmull_lane_s16(x[2], c, 3);
+ s[4] = vmull_lane_s16(x[2], c, 0);
+ s[5] = vmull_lane_s16(x[3], c, 1);
+ s[6] = vmull_lane_s16(x[3], c, 3);
+ s[7] = vaddl_s16(x[0], x[3]);
+ s[7] = vsubw_s16(s[7], x[2]);
+
+ s[0] = vaddq_s32(s[0], s[3]);
+ s[0] = vaddq_s32(s[0], s[5]);
+ s[1] = vsubq_s32(s[1], s[4]);
+ s[1] = vsubq_s32(s[1], s[6]);
+ s[3] = s[2];
+ s[2] = vmulq_s32(c3, s[7]);
+
+ output[0] = vaddq_s32(s[0], s[3]);
+ output[1] = vaddq_s32(s[1], s[3]);
+ output[2] = s[2];
+ output[3] = vaddq_s32(s[0], s[1]);
+ output[3] = vsubq_s32(output[3], s[3]);
+ dct_const_round_shift_low_8_dual(output, &io[0], &io[1]);
+}
+
+#endif // VP9_COMMON_ARM_NEON_VP9_IHT_NEON_H_
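
The NEON iadst4() above evaluates the 4-point inverse ADST with 32-bit accumulators and a final rounding shift of DCT_CONST_BITS (14). A scalar sketch of the same arithmetic, assuming the libvpx sinpi_*_9 constants are in scope; the function name is illustrative:

/* Scalar form of the math in iadst4() above; sinpi_*_9 carry 14 fractional
 * bits and the final shift mirrors dct_const_round_shift_low_8_dual(). */
static void iadst4_sketch(const int16_t *in, int16_t *out) {
  const int32_t x0 = in[0], x1 = in[1], x2 = in[2], x3 = in[3];
  const int32_t s0 = sinpi_1_9 * x0 + sinpi_4_9 * x2 + sinpi_2_9 * x3;
  const int32_t s1 = sinpi_2_9 * x0 - sinpi_1_9 * x2 - sinpi_4_9 * x3;
  const int32_t s3 = sinpi_3_9 * x1;
  const int32_t s2 = sinpi_3_9 * (x0 - x2 + x3);

  out[0] = (int16_t)((s0 + s3 + (1 << 13)) >> 14);
  out[1] = (int16_t)((s1 + s3 + (1 << 13)) >> 14);
  out[2] = (int16_t)((s2 + (1 << 13)) >> 14);
  out[3] = (int16_t)((s0 + s1 - s3 + (1 << 13)) >> 14);
}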
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index a575bda72..430b917b8 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -42,6 +42,7 @@ const vpx_prob vp9_cat6_prob_high12[] = { 255, 255, 255, 255, 254, 254,
177, 153, 140, 133, 130, 129 };
#endif
+/* clang-format off */
const uint8_t vp9_coefband_trans_8x8plus[1024] = {
0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5,
// beyond MAXBAND_INDEX+1 all values are filled as 5
@@ -85,6 +86,7 @@ const uint8_t vp9_coefband_trans_8x8plus[1024] = {
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
};
+/* clang-format on */
const uint8_t vp9_coefband_trans_4x4[16] = {
0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5,
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 1da491166..0ab502592 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -137,7 +137,6 @@ static INLINE const uint8_t *get_band_translate(TX_SIZE tx_size) {
// 128 lists of probabilities are stored for the following ONE node probs:
// 1, 3, 5, 7, ..., 253, 255
// In between probabilities are interpolated linearly
-
#define COEFF_PROB_MODELS 255
#define UNCONSTRAINED_NODES 3
diff --git a/vp9/common/vp9_entropymode.c b/vp9/common/vp9_entropymode.c
index 47cd63e94..48cad3318 100644
--- a/vp9/common/vp9_entropymode.c
+++ b/vp9/common/vp9_entropymode.c
@@ -186,16 +186,19 @@ const vpx_prob vp9_kf_partition_probs[PARTITION_CONTEXTS][PARTITION_TYPES - 1] =
{ 93, 24, 99 }, // a split, l not split
{ 85, 119, 44 }, // l split, a not split
{ 62, 59, 67 }, // a/l both split
+
// 16x16 -> 8x8
{ 149, 53, 53 }, // a/l both not split
{ 94, 20, 48 }, // a split, l not split
{ 83, 53, 24 }, // l split, a not split
{ 52, 18, 18 }, // a/l both split
+
// 32x32 -> 16x16
{ 150, 40, 39 }, // a/l both not split
{ 78, 12, 26 }, // a split, l not split
{ 67, 33, 11 }, // l split, a not split
{ 24, 7, 5 }, // a/l both split
+
// 64x64 -> 32x32
{ 174, 35, 49 }, // a/l both not split
{ 68, 11, 27 }, // a split, l not split
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index a18a290cf..b6f052d08 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -22,9 +22,7 @@ const vpx_tree_index vp9_mv_class_tree[TREE_SIZE(MV_CLASSES)] = {
18, -MV_CLASS_7, -MV_CLASS_8, -MV_CLASS_9, -MV_CLASS_10,
};
-const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = {
- -0, -1,
-};
+const vpx_tree_index vp9_mv_class0_tree[TREE_SIZE(CLASS0_SIZE)] = { -0, -1 };
const vpx_tree_index vp9_mv_fp_tree[TREE_SIZE(MV_FP_SIZE)] = { -0, 2, -1,
4, -2, -3 };
diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index c7c343aed..da9180b71 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c
@@ -1174,7 +1174,7 @@ void vp9_filter_block_plane_non420(VP9_COMMON *cm,
}
// Disable filtering on the leftmost column
- border_mask = ~(mi_col == 0);
+ border_mask = ~(mi_col == 0 ? 1 : 0);
#if CONFIG_VP9_HIGHBITDEPTH
if (cm->use_highbitdepth) {
highbd_filter_selectively_vert(
diff --git a/vp9/common/vp9_pred_common.c b/vp9/common/vp9_pred_common.c
index a7ddc0b95..15c7e6376 100644
--- a/vp9/common/vp9_pred_common.c
+++ b/vp9/common/vp9_pred_common.c
@@ -229,9 +229,8 @@ int vp9_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
else
pred_context = 4 * (edge_mi->ref_frame[0] == GOLDEN_FRAME);
} else {
- pred_context = 1 +
- 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME ||
- edge_mi->ref_frame[1] == GOLDEN_FRAME);
+ pred_context = 1 + 2 * (edge_mi->ref_frame[0] == GOLDEN_FRAME ||
+ edge_mi->ref_frame[1] == GOLDEN_FRAME);
}
} else { // inter/inter
const int above_has_second = has_second_ref(above_mi);
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index dd6120266..d048857dd 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -67,13 +67,13 @@ add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *outp
if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
# Note that there are more specializations appended when
# CONFIG_VP9_HIGHBITDEPTH is off.
- specialize qw/vp9_iht4x4_16_add sse2/;
- specialize qw/vp9_iht8x8_64_add sse2/;
+ specialize qw/vp9_iht4x4_16_add neon sse2/;
+ specialize qw/vp9_iht8x8_64_add neon sse2/;
specialize qw/vp9_iht16x16_256_add sse2/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
# Note that these specializations are appended to the above ones.
- specialize qw/vp9_iht4x4_16_add neon dspr2 msa/;
- specialize qw/vp9_iht8x8_64_add neon dspr2 msa/;
+ specialize qw/vp9_iht4x4_16_add dspr2 msa/;
+ specialize qw/vp9_iht8x8_64_add dspr2 msa/;
specialize qw/vp9_iht16x16_256_add dspr2 msa/;
}
}
@@ -97,13 +97,16 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
# Note as optimized versions of these functions are added we need to add a check to ensure
# that when CONFIG_EMULATE_HARDWARE is on, it defaults to the C versions only.
add_proto qw/void vp9_highbd_iht4x4_16_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
- if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
- specialize qw/vp9_highbd_iht4x4_16_add sse4_1/;
- }
add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd";
add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd";
+
+ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
+ specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/;
+ specialize qw/vp9_highbd_iht8x8_64_add sse4_1/;
+ specialize qw/vp9_highbd_iht16x16_256_add sse4_1/;
+ }
}
#
@@ -126,7 +129,7 @@ add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_
add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
+specialize qw/vp9_quantize_fp neon sse2 avx2/, "$ssse3_x86_64";
add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp_32x32 neon/, "$ssse3_x86_64";
diff --git a/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c b/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c
new file mode 100644
index 000000000..57b79a732
--- /dev/null
+++ b/vp9/common/x86/vp9_highbd_iht16x16_add_sse4.c
@@ -0,0 +1,419 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,
+ const int c,
+ __m128i *const s) {
+ const __m128i pair_c = pair_set_epi32(4 * c, 0);
+ __m128i x[2];
+
+ extend_64bit(in, x);
+ s[0] = _mm_mul_epi32(pair_c, x[0]);
+ s[1] = _mm_mul_epi32(pair_c, x[1]);
+}
+
+static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,
+ const __m128i in1,
+ const int c0, const int c1,
+ __m128i *const s0,
+ __m128i *const s1) {
+ const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
+ const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
+ __m128i t00[2], t01[2], t10[2], t11[2];
+ __m128i x0[2], x1[2];
+
+ extend_64bit(in0, x0);
+ extend_64bit(in1, x1);
+ t00[0] = _mm_mul_epi32(pair_c0, x0[0]);
+ t00[1] = _mm_mul_epi32(pair_c0, x0[1]);
+ t01[0] = _mm_mul_epi32(pair_c0, x1[0]);
+ t01[1] = _mm_mul_epi32(pair_c0, x1[1]);
+ t10[0] = _mm_mul_epi32(pair_c1, x0[0]);
+ t10[1] = _mm_mul_epi32(pair_c1, x0[1]);
+ t11[0] = _mm_mul_epi32(pair_c1, x1[0]);
+ t11[1] = _mm_mul_epi32(pair_c1, x1[1]);
+
+ s0[0] = _mm_add_epi64(t00[0], t11[0]);
+ s0[1] = _mm_add_epi64(t00[1], t11[1]);
+ s1[0] = _mm_sub_epi64(t10[0], t01[0]);
+ s1[1] = _mm_sub_epi64(t10[1], t01[1]);
+}
+
+static void highbd_iadst16_4col_sse4_1(__m128i *const io /*io[16]*/) {
+ __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2], s8[2], s9[2],
+ s10[2], s11[2], s12[2], s13[2], s14[2], s15[2];
+ __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2], x8[2], x9[2],
+ x10[2], x11[2], x12[2], x13[2], x14[2], x15[2];
+
+ // stage 1
+ highbd_iadst_butterfly_sse4_1(io[15], io[0], cospi_1_64, cospi_31_64, s0, s1);
+ highbd_iadst_butterfly_sse4_1(io[13], io[2], cospi_5_64, cospi_27_64, s2, s3);
+ highbd_iadst_butterfly_sse4_1(io[11], io[4], cospi_9_64, cospi_23_64, s4, s5);
+ highbd_iadst_butterfly_sse4_1(io[9], io[6], cospi_13_64, cospi_19_64, s6, s7);
+ highbd_iadst_butterfly_sse4_1(io[7], io[8], cospi_17_64, cospi_15_64, s8, s9);
+ highbd_iadst_butterfly_sse4_1(io[5], io[10], cospi_21_64, cospi_11_64, s10,
+ s11);
+ highbd_iadst_butterfly_sse4_1(io[3], io[12], cospi_25_64, cospi_7_64, s12,
+ s13);
+ highbd_iadst_butterfly_sse4_1(io[1], io[14], cospi_29_64, cospi_3_64, s14,
+ s15);
+
+ x0[0] = _mm_add_epi64(s0[0], s8[0]);
+ x0[1] = _mm_add_epi64(s0[1], s8[1]);
+ x1[0] = _mm_add_epi64(s1[0], s9[0]);
+ x1[1] = _mm_add_epi64(s1[1], s9[1]);
+ x2[0] = _mm_add_epi64(s2[0], s10[0]);
+ x2[1] = _mm_add_epi64(s2[1], s10[1]);
+ x3[0] = _mm_add_epi64(s3[0], s11[0]);
+ x3[1] = _mm_add_epi64(s3[1], s11[1]);
+ x4[0] = _mm_add_epi64(s4[0], s12[0]);
+ x4[1] = _mm_add_epi64(s4[1], s12[1]);
+ x5[0] = _mm_add_epi64(s5[0], s13[0]);
+ x5[1] = _mm_add_epi64(s5[1], s13[1]);
+ x6[0] = _mm_add_epi64(s6[0], s14[0]);
+ x6[1] = _mm_add_epi64(s6[1], s14[1]);
+ x7[0] = _mm_add_epi64(s7[0], s15[0]);
+ x7[1] = _mm_add_epi64(s7[1], s15[1]);
+ x8[0] = _mm_sub_epi64(s0[0], s8[0]);
+ x8[1] = _mm_sub_epi64(s0[1], s8[1]);
+ x9[0] = _mm_sub_epi64(s1[0], s9[0]);
+ x9[1] = _mm_sub_epi64(s1[1], s9[1]);
+ x10[0] = _mm_sub_epi64(s2[0], s10[0]);
+ x10[1] = _mm_sub_epi64(s2[1], s10[1]);
+ x11[0] = _mm_sub_epi64(s3[0], s11[0]);
+ x11[1] = _mm_sub_epi64(s3[1], s11[1]);
+ x12[0] = _mm_sub_epi64(s4[0], s12[0]);
+ x12[1] = _mm_sub_epi64(s4[1], s12[1]);
+ x13[0] = _mm_sub_epi64(s5[0], s13[0]);
+ x13[1] = _mm_sub_epi64(s5[1], s13[1]);
+ x14[0] = _mm_sub_epi64(s6[0], s14[0]);
+ x14[1] = _mm_sub_epi64(s6[1], s14[1]);
+ x15[0] = _mm_sub_epi64(s7[0], s15[0]);
+ x15[1] = _mm_sub_epi64(s7[1], s15[1]);
+
+ x0[0] = dct_const_round_shift_64bit(x0[0]);
+ x0[1] = dct_const_round_shift_64bit(x0[1]);
+ x1[0] = dct_const_round_shift_64bit(x1[0]);
+ x1[1] = dct_const_round_shift_64bit(x1[1]);
+ x2[0] = dct_const_round_shift_64bit(x2[0]);
+ x2[1] = dct_const_round_shift_64bit(x2[1]);
+ x3[0] = dct_const_round_shift_64bit(x3[0]);
+ x3[1] = dct_const_round_shift_64bit(x3[1]);
+ x4[0] = dct_const_round_shift_64bit(x4[0]);
+ x4[1] = dct_const_round_shift_64bit(x4[1]);
+ x5[0] = dct_const_round_shift_64bit(x5[0]);
+ x5[1] = dct_const_round_shift_64bit(x5[1]);
+ x6[0] = dct_const_round_shift_64bit(x6[0]);
+ x6[1] = dct_const_round_shift_64bit(x6[1]);
+ x7[0] = dct_const_round_shift_64bit(x7[0]);
+ x7[1] = dct_const_round_shift_64bit(x7[1]);
+ x8[0] = dct_const_round_shift_64bit(x8[0]);
+ x8[1] = dct_const_round_shift_64bit(x8[1]);
+ x9[0] = dct_const_round_shift_64bit(x9[0]);
+ x9[1] = dct_const_round_shift_64bit(x9[1]);
+ x10[0] = dct_const_round_shift_64bit(x10[0]);
+ x10[1] = dct_const_round_shift_64bit(x10[1]);
+ x11[0] = dct_const_round_shift_64bit(x11[0]);
+ x11[1] = dct_const_round_shift_64bit(x11[1]);
+ x12[0] = dct_const_round_shift_64bit(x12[0]);
+ x12[1] = dct_const_round_shift_64bit(x12[1]);
+ x13[0] = dct_const_round_shift_64bit(x13[0]);
+ x13[1] = dct_const_round_shift_64bit(x13[1]);
+ x14[0] = dct_const_round_shift_64bit(x14[0]);
+ x14[1] = dct_const_round_shift_64bit(x14[1]);
+ x15[0] = dct_const_round_shift_64bit(x15[0]);
+ x15[1] = dct_const_round_shift_64bit(x15[1]);
+ x0[0] = pack_4(x0[0], x0[1]);
+ x1[0] = pack_4(x1[0], x1[1]);
+ x2[0] = pack_4(x2[0], x2[1]);
+ x3[0] = pack_4(x3[0], x3[1]);
+ x4[0] = pack_4(x4[0], x4[1]);
+ x5[0] = pack_4(x5[0], x5[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+ x8[0] = pack_4(x8[0], x8[1]);
+ x9[0] = pack_4(x9[0], x9[1]);
+ x10[0] = pack_4(x10[0], x10[1]);
+ x11[0] = pack_4(x11[0], x11[1]);
+ x12[0] = pack_4(x12[0], x12[1]);
+ x13[0] = pack_4(x13[0], x13[1]);
+ x14[0] = pack_4(x14[0], x14[1]);
+ x15[0] = pack_4(x15[0], x15[1]);
+
+ // stage 2
+ s0[0] = x0[0];
+ s1[0] = x1[0];
+ s2[0] = x2[0];
+ s3[0] = x3[0];
+ s4[0] = x4[0];
+ s5[0] = x5[0];
+ s6[0] = x6[0];
+ s7[0] = x7[0];
+ x0[0] = _mm_add_epi32(s0[0], s4[0]);
+ x1[0] = _mm_add_epi32(s1[0], s5[0]);
+ x2[0] = _mm_add_epi32(s2[0], s6[0]);
+ x3[0] = _mm_add_epi32(s3[0], s7[0]);
+ x4[0] = _mm_sub_epi32(s0[0], s4[0]);
+ x5[0] = _mm_sub_epi32(s1[0], s5[0]);
+ x6[0] = _mm_sub_epi32(s2[0], s6[0]);
+ x7[0] = _mm_sub_epi32(s3[0], s7[0]);
+
+ highbd_iadst_butterfly_sse4_1(x8[0], x9[0], cospi_4_64, cospi_28_64, s8, s9);
+ highbd_iadst_butterfly_sse4_1(x10[0], x11[0], cospi_20_64, cospi_12_64, s10,
+ s11);
+ highbd_iadst_butterfly_sse4_1(x13[0], x12[0], cospi_28_64, cospi_4_64, s13,
+ s12);
+ highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_12_64, cospi_20_64, s15,
+ s14);
+
+ x8[0] = _mm_add_epi64(s8[0], s12[0]);
+ x8[1] = _mm_add_epi64(s8[1], s12[1]);
+ x9[0] = _mm_add_epi64(s9[0], s13[0]);
+ x9[1] = _mm_add_epi64(s9[1], s13[1]);
+ x10[0] = _mm_add_epi64(s10[0], s14[0]);
+ x10[1] = _mm_add_epi64(s10[1], s14[1]);
+ x11[0] = _mm_add_epi64(s11[0], s15[0]);
+ x11[1] = _mm_add_epi64(s11[1], s15[1]);
+ x12[0] = _mm_sub_epi64(s8[0], s12[0]);
+ x12[1] = _mm_sub_epi64(s8[1], s12[1]);
+ x13[0] = _mm_sub_epi64(s9[0], s13[0]);
+ x13[1] = _mm_sub_epi64(s9[1], s13[1]);
+ x14[0] = _mm_sub_epi64(s10[0], s14[0]);
+ x14[1] = _mm_sub_epi64(s10[1], s14[1]);
+ x15[0] = _mm_sub_epi64(s11[0], s15[0]);
+ x15[1] = _mm_sub_epi64(s11[1], s15[1]);
+ x8[0] = dct_const_round_shift_64bit(x8[0]);
+ x8[1] = dct_const_round_shift_64bit(x8[1]);
+ x9[0] = dct_const_round_shift_64bit(x9[0]);
+ x9[1] = dct_const_round_shift_64bit(x9[1]);
+ x10[0] = dct_const_round_shift_64bit(x10[0]);
+ x10[1] = dct_const_round_shift_64bit(x10[1]);
+ x11[0] = dct_const_round_shift_64bit(x11[0]);
+ x11[1] = dct_const_round_shift_64bit(x11[1]);
+ x12[0] = dct_const_round_shift_64bit(x12[0]);
+ x12[1] = dct_const_round_shift_64bit(x12[1]);
+ x13[0] = dct_const_round_shift_64bit(x13[0]);
+ x13[1] = dct_const_round_shift_64bit(x13[1]);
+ x14[0] = dct_const_round_shift_64bit(x14[0]);
+ x14[1] = dct_const_round_shift_64bit(x14[1]);
+ x15[0] = dct_const_round_shift_64bit(x15[0]);
+ x15[1] = dct_const_round_shift_64bit(x15[1]);
+ x8[0] = pack_4(x8[0], x8[1]);
+ x9[0] = pack_4(x9[0], x9[1]);
+ x10[0] = pack_4(x10[0], x10[1]);
+ x11[0] = pack_4(x11[0], x11[1]);
+ x12[0] = pack_4(x12[0], x12[1]);
+ x13[0] = pack_4(x13[0], x13[1]);
+ x14[0] = pack_4(x14[0], x14[1]);
+ x15[0] = pack_4(x15[0], x15[1]);
+
+ // stage 3
+ s0[0] = x0[0];
+ s1[0] = x1[0];
+ s2[0] = x2[0];
+ s3[0] = x3[0];
+ highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
+ highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);
+ s8[0] = x8[0];
+ s9[0] = x9[0];
+ s10[0] = x10[0];
+ s11[0] = x11[0];
+ highbd_iadst_butterfly_sse4_1(x12[0], x13[0], cospi_8_64, cospi_24_64, s12,
+ s13);
+ highbd_iadst_butterfly_sse4_1(x15[0], x14[0], cospi_24_64, cospi_8_64, s15,
+ s14);
+
+ x0[0] = _mm_add_epi32(s0[0], s2[0]);
+ x1[0] = _mm_add_epi32(s1[0], s3[0]);
+ x2[0] = _mm_sub_epi32(s0[0], s2[0]);
+ x3[0] = _mm_sub_epi32(s1[0], s3[0]);
+ x4[0] = _mm_add_epi64(s4[0], s6[0]);
+ x4[1] = _mm_add_epi64(s4[1], s6[1]);
+ x5[0] = _mm_add_epi64(s5[0], s7[0]);
+ x5[1] = _mm_add_epi64(s5[1], s7[1]);
+ x6[0] = _mm_sub_epi64(s4[0], s6[0]);
+ x6[1] = _mm_sub_epi64(s4[1], s6[1]);
+ x7[0] = _mm_sub_epi64(s5[0], s7[0]);
+ x7[1] = _mm_sub_epi64(s5[1], s7[1]);
+ x4[0] = dct_const_round_shift_64bit(x4[0]);
+ x4[1] = dct_const_round_shift_64bit(x4[1]);
+ x5[0] = dct_const_round_shift_64bit(x5[0]);
+ x5[1] = dct_const_round_shift_64bit(x5[1]);
+ x6[0] = dct_const_round_shift_64bit(x6[0]);
+ x6[1] = dct_const_round_shift_64bit(x6[1]);
+ x7[0] = dct_const_round_shift_64bit(x7[0]);
+ x7[1] = dct_const_round_shift_64bit(x7[1]);
+ x4[0] = pack_4(x4[0], x4[1]);
+ x5[0] = pack_4(x5[0], x5[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+ x8[0] = _mm_add_epi32(s8[0], s10[0]);
+ x9[0] = _mm_add_epi32(s9[0], s11[0]);
+ x10[0] = _mm_sub_epi32(s8[0], s10[0]);
+ x11[0] = _mm_sub_epi32(s9[0], s11[0]);
+ x12[0] = _mm_add_epi64(s12[0], s14[0]);
+ x12[1] = _mm_add_epi64(s12[1], s14[1]);
+ x13[0] = _mm_add_epi64(s13[0], s15[0]);
+ x13[1] = _mm_add_epi64(s13[1], s15[1]);
+ x14[0] = _mm_sub_epi64(s12[0], s14[0]);
+ x14[1] = _mm_sub_epi64(s12[1], s14[1]);
+ x15[0] = _mm_sub_epi64(s13[0], s15[0]);
+ x15[1] = _mm_sub_epi64(s13[1], s15[1]);
+ x12[0] = dct_const_round_shift_64bit(x12[0]);
+ x12[1] = dct_const_round_shift_64bit(x12[1]);
+ x13[0] = dct_const_round_shift_64bit(x13[0]);
+ x13[1] = dct_const_round_shift_64bit(x13[1]);
+ x14[0] = dct_const_round_shift_64bit(x14[0]);
+ x14[1] = dct_const_round_shift_64bit(x14[1]);
+ x15[0] = dct_const_round_shift_64bit(x15[0]);
+ x15[1] = dct_const_round_shift_64bit(x15[1]);
+ x12[0] = pack_4(x12[0], x12[1]);
+ x13[0] = pack_4(x13[0], x13[1]);
+ x14[0] = pack_4(x14[0], x14[1]);
+ x15[0] = pack_4(x15[0], x15[1]);
+
+ // stage 4
+ s2[0] = _mm_add_epi32(x2[0], x3[0]);
+ s3[0] = _mm_sub_epi32(x2[0], x3[0]);
+ s6[0] = _mm_add_epi32(x7[0], x6[0]);
+ s7[0] = _mm_sub_epi32(x7[0], x6[0]);
+ s10[0] = _mm_add_epi32(x11[0], x10[0]);
+ s11[0] = _mm_sub_epi32(x11[0], x10[0]);
+ s14[0] = _mm_add_epi32(x14[0], x15[0]);
+ s15[0] = _mm_sub_epi32(x14[0], x15[0]);
+ highbd_iadst_half_butterfly_sse4_1(s2[0], -cospi_16_64, s2);
+ highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
+ highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
+ highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);
+ highbd_iadst_half_butterfly_sse4_1(s10[0], cospi_16_64, s10);
+ highbd_iadst_half_butterfly_sse4_1(s11[0], cospi_16_64, s11);
+ highbd_iadst_half_butterfly_sse4_1(s14[0], -cospi_16_64, s14);
+ highbd_iadst_half_butterfly_sse4_1(s15[0], cospi_16_64, s15);
+
+ x2[0] = dct_const_round_shift_64bit(s2[0]);
+ x2[1] = dct_const_round_shift_64bit(s2[1]);
+ x3[0] = dct_const_round_shift_64bit(s3[0]);
+ x3[1] = dct_const_round_shift_64bit(s3[1]);
+ x6[0] = dct_const_round_shift_64bit(s6[0]);
+ x6[1] = dct_const_round_shift_64bit(s6[1]);
+ x7[0] = dct_const_round_shift_64bit(s7[0]);
+ x7[1] = dct_const_round_shift_64bit(s7[1]);
+ x10[0] = dct_const_round_shift_64bit(s10[0]);
+ x10[1] = dct_const_round_shift_64bit(s10[1]);
+ x11[0] = dct_const_round_shift_64bit(s11[0]);
+ x11[1] = dct_const_round_shift_64bit(s11[1]);
+ x14[0] = dct_const_round_shift_64bit(s14[0]);
+ x14[1] = dct_const_round_shift_64bit(s14[1]);
+ x15[0] = dct_const_round_shift_64bit(s15[0]);
+ x15[1] = dct_const_round_shift_64bit(s15[1]);
+ x2[0] = pack_4(x2[0], x2[1]);
+ x3[0] = pack_4(x3[0], x3[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+ x10[0] = pack_4(x10[0], x10[1]);
+ x11[0] = pack_4(x11[0], x11[1]);
+ x14[0] = pack_4(x14[0], x14[1]);
+ x15[0] = pack_4(x15[0], x15[1]);
+
+ io[0] = x0[0];
+ io[1] = _mm_sub_epi32(_mm_setzero_si128(), x8[0]);
+ io[2] = x12[0];
+ io[3] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
+ io[4] = x6[0];
+ io[5] = x14[0];
+ io[6] = x10[0];
+ io[7] = x2[0];
+ io[8] = x3[0];
+ io[9] = x11[0];
+ io[10] = x15[0];
+ io[11] = x7[0];
+ io[12] = x5[0];
+ io[13] = _mm_sub_epi32(_mm_setzero_si128(), x13[0]);
+ io[14] = x9[0];
+ io[15] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
+}
+
+void vp9_highbd_iht16x16_256_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int tx_type, int bd) {
+ int i;
+ __m128i out[16], *in;
+
+ if (bd == 8) {
+ __m128i l[16], r[16];
+
+ in = l;
+ for (i = 0; i < 2; i++) {
+ highbd_load_pack_transpose_32bit_8x8(&input[0], 16, &in[0]);
+ highbd_load_pack_transpose_32bit_8x8(&input[8], 16, &in[8]);
+ if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+ idct16_8col(in, in);
+ } else {
+ vpx_iadst16_8col_sse2(in);
+ }
+ in = r;
+ input += 128;
+ }
+
+ for (i = 0; i < 16; i += 8) {
+ int j;
+ transpose_16bit_8x8(l + i, out);
+ transpose_16bit_8x8(r + i, out + 8);
+ if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+ idct16_8col(out, out);
+ } else {
+ vpx_iadst16_8col_sse2(out);
+ }
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_8(dest + j * stride, out[j], bd);
+ }
+ dest += 8;
+ }
+ } else {
+ __m128i all[4][16];
+
+ for (i = 0; i < 4; i++) {
+ in = all[i];
+ highbd_load_transpose_32bit_8x4(&input[0], 16, &in[0]);
+ highbd_load_transpose_32bit_8x4(&input[8], 16, &in[8]);
+ if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+ vpx_highbd_idct16_4col_sse4_1(in);
+ } else {
+ highbd_iadst16_4col_sse4_1(in);
+ }
+ input += 4 * 16;
+ }
+
+ for (i = 0; i < 16; i += 4) {
+ int j;
+ transpose_32bit_4x4(all[0] + i, out + 0);
+ transpose_32bit_4x4(all[1] + i, out + 4);
+ transpose_32bit_4x4(all[2] + i, out + 8);
+ transpose_32bit_4x4(all[3] + i, out + 12);
+ if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+ vpx_highbd_idct16_4col_sse4_1(out);
+ } else {
+ highbd_iadst16_4col_sse4_1(out);
+ }
+
+ for (j = 0; j < 16; ++j) {
+ highbd_write_buffer_4(dest + j * stride, out[j], bd);
+ }
+ dest += 4;
+ }
+ }
+}
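
Note on the tx_type dispatch above (illustrative sketch only, not part of the change): the first transform pass runs the IDCT only for DCT_DCT and ADST_DCT, the second pass only for DCT_DCT and DCT_ADST, with the IADST used otherwise; this appears to follow vp9's usual <vertical>_<horizontal> naming for hybrid transforms. A minimal C sketch of that selection, using the TX_TYPE values as vp9 defines them:

    /* Sketch only: encodes the dispatch used by both passes above. */
    enum { DCT_DCT = 0, ADST_DCT = 1, DCT_ADST = 2, ADST_ADST = 3 };

    static int first_pass_uses_idct(int tx_type) {  /* first transform pass */
      return tx_type == DCT_DCT || tx_type == ADST_DCT;
    }
    static int second_pass_uses_idct(int tx_type) { /* pass after transpose */
      return tx_type == DCT_DCT || tx_type == DCT_ADST;
    }
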
diff --git a/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c b/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c
new file mode 100644
index 000000000..7d949b6db
--- /dev/null
+++ b/vp9/common/x86/vp9_highbd_iht8x8_add_sse4.c
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_idct.h"
+#include "vpx_dsp/x86/highbd_inv_txfm_sse4.h"
+#include "vpx_dsp/x86/inv_txfm_sse2.h"
+#include "vpx_dsp/x86/transpose_sse2.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+
+static INLINE void highbd_iadst_half_butterfly_sse4_1(const __m128i in,
+ const int c,
+ __m128i *const s) {
+ const __m128i pair_c = pair_set_epi32(4 * c, 0);
+ __m128i x[2];
+
+ extend_64bit(in, x);
+ s[0] = _mm_mul_epi32(pair_c, x[0]);
+ s[1] = _mm_mul_epi32(pair_c, x[1]);
+}
+
+static INLINE void highbd_iadst_butterfly_sse4_1(const __m128i in0,
+ const __m128i in1,
+ const int c0, const int c1,
+ __m128i *const s0,
+ __m128i *const s1) {
+ const __m128i pair_c0 = pair_set_epi32(4 * c0, 0);
+ const __m128i pair_c1 = pair_set_epi32(4 * c1, 0);
+ __m128i t00[2], t01[2], t10[2], t11[2];
+ __m128i x0[2], x1[2];
+
+ extend_64bit(in0, x0);
+ extend_64bit(in1, x1);
+ t00[0] = _mm_mul_epi32(pair_c0, x0[0]);
+ t00[1] = _mm_mul_epi32(pair_c0, x0[1]);
+ t01[0] = _mm_mul_epi32(pair_c0, x1[0]);
+ t01[1] = _mm_mul_epi32(pair_c0, x1[1]);
+ t10[0] = _mm_mul_epi32(pair_c1, x0[0]);
+ t10[1] = _mm_mul_epi32(pair_c1, x0[1]);
+ t11[0] = _mm_mul_epi32(pair_c1, x1[0]);
+ t11[1] = _mm_mul_epi32(pair_c1, x1[1]);
+
+ s0[0] = _mm_add_epi64(t00[0], t11[0]);
+ s0[1] = _mm_add_epi64(t00[1], t11[1]);
+ s1[0] = _mm_sub_epi64(t10[0], t01[0]);
+ s1[1] = _mm_sub_epi64(t10[1], t01[1]);
+}
+
+static void highbd_iadst8_sse4_1(__m128i *const io) {
+ __m128i s0[2], s1[2], s2[2], s3[2], s4[2], s5[2], s6[2], s7[2];
+ __m128i x0[2], x1[2], x2[2], x3[2], x4[2], x5[2], x6[2], x7[2];
+
+ transpose_32bit_4x4x2(io, io);
+
+ // stage 1
+ highbd_iadst_butterfly_sse4_1(io[7], io[0], cospi_2_64, cospi_30_64, s0, s1);
+ highbd_iadst_butterfly_sse4_1(io[3], io[4], cospi_18_64, cospi_14_64, s4, s5);
+ x0[0] = _mm_add_epi64(s0[0], s4[0]);
+ x0[1] = _mm_add_epi64(s0[1], s4[1]);
+ x1[0] = _mm_add_epi64(s1[0], s5[0]);
+ x1[1] = _mm_add_epi64(s1[1], s5[1]);
+ x4[0] = _mm_sub_epi64(s0[0], s4[0]);
+ x4[1] = _mm_sub_epi64(s0[1], s4[1]);
+ x5[0] = _mm_sub_epi64(s1[0], s5[0]);
+ x5[1] = _mm_sub_epi64(s1[1], s5[1]);
+
+ highbd_iadst_butterfly_sse4_1(io[5], io[2], cospi_10_64, cospi_22_64, s2, s3);
+ highbd_iadst_butterfly_sse4_1(io[1], io[6], cospi_26_64, cospi_6_64, s6, s7);
+ x2[0] = _mm_add_epi64(s2[0], s6[0]);
+ x2[1] = _mm_add_epi64(s2[1], s6[1]);
+ x3[0] = _mm_add_epi64(s3[0], s7[0]);
+ x3[1] = _mm_add_epi64(s3[1], s7[1]);
+ x6[0] = _mm_sub_epi64(s2[0], s6[0]);
+ x6[1] = _mm_sub_epi64(s2[1], s6[1]);
+ x7[0] = _mm_sub_epi64(s3[0], s7[0]);
+ x7[1] = _mm_sub_epi64(s3[1], s7[1]);
+
+ x0[0] = dct_const_round_shift_64bit(x0[0]);
+ x0[1] = dct_const_round_shift_64bit(x0[1]);
+ x1[0] = dct_const_round_shift_64bit(x1[0]);
+ x1[1] = dct_const_round_shift_64bit(x1[1]);
+ x2[0] = dct_const_round_shift_64bit(x2[0]);
+ x2[1] = dct_const_round_shift_64bit(x2[1]);
+ x3[0] = dct_const_round_shift_64bit(x3[0]);
+ x3[1] = dct_const_round_shift_64bit(x3[1]);
+ x4[0] = dct_const_round_shift_64bit(x4[0]);
+ x4[1] = dct_const_round_shift_64bit(x4[1]);
+ x5[0] = dct_const_round_shift_64bit(x5[0]);
+ x5[1] = dct_const_round_shift_64bit(x5[1]);
+ x6[0] = dct_const_round_shift_64bit(x6[0]);
+ x6[1] = dct_const_round_shift_64bit(x6[1]);
+ x7[0] = dct_const_round_shift_64bit(x7[0]);
+ x7[1] = dct_const_round_shift_64bit(x7[1]);
+ s0[0] = pack_4(x0[0], x0[1]); // s0 = x0;
+ s1[0] = pack_4(x1[0], x1[1]); // s1 = x1;
+ s2[0] = pack_4(x2[0], x2[1]); // s2 = x2;
+ s3[0] = pack_4(x3[0], x3[1]); // s3 = x3;
+ x4[0] = pack_4(x4[0], x4[1]);
+ x5[0] = pack_4(x5[0], x5[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+
+ // stage 2
+ x0[0] = _mm_add_epi32(s0[0], s2[0]);
+ x1[0] = _mm_add_epi32(s1[0], s3[0]);
+ x2[0] = _mm_sub_epi32(s0[0], s2[0]);
+ x3[0] = _mm_sub_epi32(s1[0], s3[0]);
+
+ highbd_iadst_butterfly_sse4_1(x4[0], x5[0], cospi_8_64, cospi_24_64, s4, s5);
+ highbd_iadst_butterfly_sse4_1(x7[0], x6[0], cospi_24_64, cospi_8_64, s7, s6);
+
+ x4[0] = _mm_add_epi64(s4[0], s6[0]);
+ x4[1] = _mm_add_epi64(s4[1], s6[1]);
+ x5[0] = _mm_add_epi64(s5[0], s7[0]);
+ x5[1] = _mm_add_epi64(s5[1], s7[1]);
+ x6[0] = _mm_sub_epi64(s4[0], s6[0]);
+ x6[1] = _mm_sub_epi64(s4[1], s6[1]);
+ x7[0] = _mm_sub_epi64(s5[0], s7[0]);
+ x7[1] = _mm_sub_epi64(s5[1], s7[1]);
+ x4[0] = dct_const_round_shift_64bit(x4[0]);
+ x4[1] = dct_const_round_shift_64bit(x4[1]);
+ x5[0] = dct_const_round_shift_64bit(x5[0]);
+ x5[1] = dct_const_round_shift_64bit(x5[1]);
+ x6[0] = dct_const_round_shift_64bit(x6[0]);
+ x6[1] = dct_const_round_shift_64bit(x6[1]);
+ x7[0] = dct_const_round_shift_64bit(x7[0]);
+ x7[1] = dct_const_round_shift_64bit(x7[1]);
+ x4[0] = pack_4(x4[0], x4[1]);
+ x5[0] = pack_4(x5[0], x5[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+
+ // stage 3
+ s2[0] = _mm_add_epi32(x2[0], x3[0]);
+ s3[0] = _mm_sub_epi32(x2[0], x3[0]);
+ s6[0] = _mm_add_epi32(x6[0], x7[0]);
+ s7[0] = _mm_sub_epi32(x6[0], x7[0]);
+ highbd_iadst_half_butterfly_sse4_1(s2[0], cospi_16_64, s2);
+ highbd_iadst_half_butterfly_sse4_1(s3[0], cospi_16_64, s3);
+ highbd_iadst_half_butterfly_sse4_1(s6[0], cospi_16_64, s6);
+ highbd_iadst_half_butterfly_sse4_1(s7[0], cospi_16_64, s7);
+
+ x2[0] = dct_const_round_shift_64bit(s2[0]);
+ x2[1] = dct_const_round_shift_64bit(s2[1]);
+ x3[0] = dct_const_round_shift_64bit(s3[0]);
+ x3[1] = dct_const_round_shift_64bit(s3[1]);
+ x6[0] = dct_const_round_shift_64bit(s6[0]);
+ x6[1] = dct_const_round_shift_64bit(s6[1]);
+ x7[0] = dct_const_round_shift_64bit(s7[0]);
+ x7[1] = dct_const_round_shift_64bit(s7[1]);
+ x2[0] = pack_4(x2[0], x2[1]);
+ x3[0] = pack_4(x3[0], x3[1]);
+ x6[0] = pack_4(x6[0], x6[1]);
+ x7[0] = pack_4(x7[0], x7[1]);
+
+ io[0] = x0[0];
+ io[1] = _mm_sub_epi32(_mm_setzero_si128(), x4[0]);
+ io[2] = x6[0];
+ io[3] = _mm_sub_epi32(_mm_setzero_si128(), x2[0]);
+ io[4] = x3[0];
+ io[5] = _mm_sub_epi32(_mm_setzero_si128(), x7[0]);
+ io[6] = x5[0];
+ io[7] = _mm_sub_epi32(_mm_setzero_si128(), x1[0]);
+}
+
+void vp9_highbd_iht8x8_64_add_sse4_1(const tran_low_t *input, uint16_t *dest,
+ int stride, int tx_type, int bd) {
+ __m128i io[16];
+
+ io[0] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 0));
+ io[4] = _mm_load_si128((const __m128i *)(input + 0 * 8 + 4));
+ io[1] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 0));
+ io[5] = _mm_load_si128((const __m128i *)(input + 1 * 8 + 4));
+ io[2] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 0));
+ io[6] = _mm_load_si128((const __m128i *)(input + 2 * 8 + 4));
+ io[3] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 0));
+ io[7] = _mm_load_si128((const __m128i *)(input + 3 * 8 + 4));
+ io[8] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 0));
+ io[12] = _mm_load_si128((const __m128i *)(input + 4 * 8 + 4));
+ io[9] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 0));
+ io[13] = _mm_load_si128((const __m128i *)(input + 5 * 8 + 4));
+ io[10] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 0));
+ io[14] = _mm_load_si128((const __m128i *)(input + 6 * 8 + 4));
+ io[11] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 0));
+ io[15] = _mm_load_si128((const __m128i *)(input + 7 * 8 + 4));
+
+ if (bd == 8) {
+ __m128i io_short[8];
+
+ io_short[0] = _mm_packs_epi32(io[0], io[4]);
+ io_short[1] = _mm_packs_epi32(io[1], io[5]);
+ io_short[2] = _mm_packs_epi32(io[2], io[6]);
+ io_short[3] = _mm_packs_epi32(io[3], io[7]);
+ io_short[4] = _mm_packs_epi32(io[8], io[12]);
+ io_short[5] = _mm_packs_epi32(io[9], io[13]);
+ io_short[6] = _mm_packs_epi32(io[10], io[14]);
+ io_short[7] = _mm_packs_epi32(io[11], io[15]);
+
+ if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+ vpx_idct8_sse2(io_short);
+ } else {
+ iadst8_sse2(io_short);
+ }
+ if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+ vpx_idct8_sse2(io_short);
+ } else {
+ iadst8_sse2(io_short);
+ }
+ round_shift_8x8(io_short, io);
+ } else {
+ __m128i temp[4];
+
+ if (tx_type == DCT_DCT || tx_type == ADST_DCT) {
+ vpx_highbd_idct8x8_half1d_sse4_1(io);
+ vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
+ } else {
+ highbd_iadst8_sse4_1(io);
+ highbd_iadst8_sse4_1(&io[8]);
+ }
+
+ temp[0] = io[4];
+ temp[1] = io[5];
+ temp[2] = io[6];
+ temp[3] = io[7];
+ io[4] = io[8];
+ io[5] = io[9];
+ io[6] = io[10];
+ io[7] = io[11];
+
+ if (tx_type == DCT_DCT || tx_type == DCT_ADST) {
+ vpx_highbd_idct8x8_half1d_sse4_1(io);
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ vpx_highbd_idct8x8_half1d_sse4_1(&io[8]);
+ } else {
+ highbd_iadst8_sse4_1(io);
+ io[8] = temp[0];
+ io[9] = temp[1];
+ io[10] = temp[2];
+ io[11] = temp[3];
+ highbd_iadst8_sse4_1(&io[8]);
+ }
+ highbd_idct8x8_final_round(io);
+ }
+ recon_and_store_8x8(io, dest, stride, bd);
+}
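
For readers following the fixed-point math: highbd_iadst_butterfly_sse4_1() above forms s0 = in0 * c0 + in1 * c1 and s1 = in0 * c1 - in1 * c0 with 64-bit intermediates, and dct_const_round_shift_64bit() brings the combined results back into the Q14 coefficient domain (the SSE4_1 helpers realize this with pre-scaled constants and 64-bit lanes). A scalar sketch of the intended arithmetic, assuming libvpx's usual DCT_CONST_BITS of 14; illustration only, not the production reference:

    #include <stdint.h>

    #define DCT_CONST_BITS 14

    /* s0 = in0 * c0 + in1 * c1,  s1 = in0 * c1 - in1 * c0 (64-bit products). */
    static void iadst_butterfly(int32_t in0, int32_t in1, int c0, int c1,
                                int64_t *s0, int64_t *s1) {
      *s0 = (int64_t)in0 * c0 + (int64_t)in1 * c1;
      *s1 = (int64_t)in0 * c1 - (int64_t)in1 * c0;
    }

    /* Applied after butterfly outputs are added or subtracted pairwise, e.g.
     *   x0 = round_shift(s0 + s4);  x4 = round_shift(s0 - s4);             */
    static int32_t round_shift(int64_t x) {
      return (int32_t)((x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
    }
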
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 6996260e2..ad693718c 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -10,8 +10,6 @@
#include "./vp9_rtcd.h"
#include "vpx_dsp/x86/inv_txfm_sse2.h"
-#include "vpx_dsp/x86/txfm_common_sse2.h"
-#include "vpx_ports/mem.h"
void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
int tx_type) {
@@ -22,23 +20,23 @@ void vp9_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
in[1] = load_input_data8(input + 8);
switch (tx_type) {
- case 0: // DCT_DCT
+ case DCT_DCT:
idct4_sse2(in);
idct4_sse2(in);
break;
- case 1: // ADST_DCT
+ case ADST_DCT:
idct4_sse2(in);
iadst4_sse2(in);
break;
- case 2: // DCT_ADST
+ case DCT_ADST:
iadst4_sse2(in);
idct4_sse2(in);
break;
- case 3: // ADST_ADST
+ default:
+ assert(tx_type == ADST_ADST);
iadst4_sse2(in);
iadst4_sse2(in);
break;
- default: assert(0); break;
}
// Final round and shift
@@ -67,23 +65,23 @@ void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
in[7] = load_input_data8(input + 8 * 7);
switch (tx_type) {
- case 0: // DCT_DCT
- idct8_sse2(in);
- idct8_sse2(in);
+ case DCT_DCT:
+ vpx_idct8_sse2(in);
+ vpx_idct8_sse2(in);
break;
- case 1: // ADST_DCT
- idct8_sse2(in);
+ case ADST_DCT:
+ vpx_idct8_sse2(in);
iadst8_sse2(in);
break;
- case 2: // DCT_ADST
+ case DCT_ADST:
iadst8_sse2(in);
- idct8_sse2(in);
+ vpx_idct8_sse2(in);
break;
- case 3: // ADST_ADST
+ default:
+ assert(tx_type == ADST_ADST);
iadst8_sse2(in);
iadst8_sse2(in);
break;
- default: assert(0); break;
}
// Final rounding and shift
@@ -201,23 +199,23 @@ void vp9_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
load_buffer_8x16(input, in1);
switch (tx_type) {
- case 0: // DCT_DCT
+ case DCT_DCT:
idct16_sse2(in0, in1);
idct16_sse2(in0, in1);
break;
- case 1: // ADST_DCT
+ case ADST_DCT:
idct16_sse2(in0, in1);
iadst16_sse2(in0, in1);
break;
- case 2: // DCT_ADST
+ case DCT_ADST:
iadst16_sse2(in0, in1);
idct16_sse2(in0, in1);
break;
- case 3: // ADST_ADST
+ default:
+ assert(tx_type == ADST_ADST);
iadst16_sse2(in0, in1);
iadst16_sse2(in0, in1);
break;
- default: assert(0); break;
}
write_buffer_8x16(dest, in0, stride);
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 2f2f0055a..ed43de701 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -464,10 +464,6 @@ void vp9_cyclic_refresh_update_parameters(VP9_COMP *const cpi) {
cr->rate_ratio_qdelta = VPXMAX(cr->rate_ratio_qdelta, 2.5);
}
}
- if (cpi->svc.spatial_layer_id > 0) {
- cr->motion_thresh = 4;
- cr->rate_boost_fac = 12;
- }
if (cpi->oxcf.rc_mode == VPX_VBR) {
// To be adjusted for VBR mode, e.g., based on gf period and boost.
// For now use smaller qp-delta (than CBR), no second boosted seg, and
diff --git a/vp9/encoder/vp9_context_tree.c b/vp9/encoder/vp9_context_tree.c
index 2f7e54433..52a81afb5 100644
--- a/vp9/encoder/vp9_context_tree.c
+++ b/vp9/encoder/vp9_context_tree.c
@@ -12,7 +12,10 @@
#include "vp9/encoder/vp9_encoder.h"
static const BLOCK_SIZE square[] = {
- BLOCK_8X8, BLOCK_16X16, BLOCK_32X32, BLOCK_64X64,
+ BLOCK_8X8,
+ BLOCK_16X16,
+ BLOCK_32X32,
+ BLOCK_64X64,
};
static void alloc_mode_context(VP9_COMMON *cm, int num_4x4_blk,
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 682477df1..6517fb358 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1513,9 +1513,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
}
}
}
- if (is_key_frame || (low_res &&
- vt.split[i].split[j].part_variances.none.variance >
- threshold_4x4avg)) {
+ if (is_key_frame ||
+ (low_res && vt.split[i].split[j].part_variances.none.variance >
+ threshold_4x4avg)) {
force_split[split_index] = 0;
// Go down to 4x4 down-sampling for variance.
variance4x4downsample[i2 + j] = 1;
@@ -3403,9 +3403,10 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
// Rate and distortion based partition search termination clause.
if (!cpi->sf.ml_partition_search_early_termination &&
- !x->e_mbd.lossless && ((best_rdc.dist < (dist_breakout_thr >> 2)) ||
- (best_rdc.dist < dist_breakout_thr &&
- best_rdc.rate < rate_breakout_thr))) {
+ !x->e_mbd.lossless &&
+ ((best_rdc.dist < (dist_breakout_thr >> 2)) ||
+ (best_rdc.dist < dist_breakout_thr &&
+ best_rdc.rate < rate_breakout_thr))) {
do_rect = 0;
}
}
@@ -4620,8 +4621,9 @@ void vp9_init_tile_data(VP9_COMP *cpi) {
if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
if (cpi->tile_data != NULL) vpx_free(cpi->tile_data);
- CHECK_MEM_ERROR(cm, cpi->tile_data, vpx_malloc(tile_cols * tile_rows *
- sizeof(*cpi->tile_data)));
+ CHECK_MEM_ERROR(
+ cm, cpi->tile_data,
+ vpx_malloc(tile_cols * tile_rows * sizeof(*cpi->tile_data)));
cpi->allocated_tiles = tile_cols * tile_rows;
for (tile_row = 0; tile_row < tile_rows; ++tile_row)
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index f3c17f255..970077d89 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -50,7 +50,8 @@ void vp9_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
}
static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
- { 10, 6 }, { 8, 5 },
+ { 10, 6 },
+ { 8, 5 },
};
// 'num' can be negative, but 'shift' must be non-negative.
@@ -200,9 +201,9 @@ int vp9_optimize_b(MACROBLOCK *mb, int plane, int block, TX_SIZE tx_size,
const int band_next = band_translate[i + 1];
const int token_next =
(i + 1 != eob) ? vp9_get_token(qcoeff[scan[i + 1]]) : EOB_TOKEN;
- unsigned int(
- *const token_costs_next)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
- token_costs + band_next;
+ unsigned int(*const token_costs_next)[2][COEFF_CONTEXTS]
+ [ENTROPY_TOKENS] =
+ token_costs + band_next;
token_cache[rc] = vp9_pt_energy_class[t0];
ctx_next = get_coef_context(nb, token_cache, i + 1);
token_tree_sel_next = (x == 0);
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 1e91d0155..fd3889dda 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -65,12 +65,12 @@
#define AM_SEGMENT_ID_INACTIVE 7
#define AM_SEGMENT_ID_ACTIVE 0
-#define ALTREF_HIGH_PRECISION_MV 1 // Whether to use high precision mv
- // for altref computation.
-#define HIGH_PRECISION_MV_QTHRESH 200 // Q threshold for high precision
- // mv. Choose a very high value for
- // now so that HIGH_PRECISION is always
- // chosen.
+// Whether to use high precision mv for altref computation.
+#define ALTREF_HIGH_PRECISION_MV 1
+
+// Q threshold for high precision mv. Choose a very high value for now so that
+// HIGH_PRECISION is always chosen.
+#define HIGH_PRECISION_MV_QTHRESH 200
#define FRAME_SIZE_FACTOR 128 // empirical params for context model threshold
#define FRAME_RATE_FACTOR 8
@@ -547,6 +547,74 @@ static void apply_active_map(VP9_COMP *cpi) {
}
}
+static void apply_roi_map(VP9_COMP *cpi) {
+ VP9_COMMON *cm = &cpi->common;
+ struct segmentation *const seg = &cm->seg;
+ vpx_roi_map_t *roi = &cpi->roi;
+ const int *delta_q = roi->delta_q;
+ const int *delta_lf = roi->delta_lf;
+ const int *skip = roi->skip;
+ int ref_frame[8];
+ int internal_delta_q[MAX_SEGMENTS];
+ int i;
+ static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+ VP9_ALT_FLAG };
+
+ // TODO(jianj): Investigate why ROI does not work in speed < 5 or in
+ // non-realtime mode.

+ if (cpi->oxcf.mode != REALTIME || cpi->oxcf.speed < 5) return;
+ if (!roi->enabled) return;
+
+ memcpy(&ref_frame, roi->ref_frame, sizeof(ref_frame));
+
+ vp9_enable_segmentation(seg);
+ vp9_clearall_segfeatures(seg);
+ // Select delta coding method.
+ seg->abs_delta = SEGMENT_DELTADATA;
+
+ memcpy(cpi->segmentation_map, roi->roi_map, (cm->mi_rows * cm->mi_cols));
+
+ for (i = 0; i < MAX_SEGMENTS; ++i) {
+ // Translate the external delta q values to internal values.
+ internal_delta_q[i] = vp9_quantizer_to_qindex(abs(delta_q[i]));
+ if (delta_q[i] < 0) internal_delta_q[i] = -internal_delta_q[i];
+ vp9_disable_segfeature(seg, i, SEG_LVL_ALT_Q);
+ vp9_disable_segfeature(seg, i, SEG_LVL_ALT_LF);
+ if (internal_delta_q[i] != 0) {
+ vp9_enable_segfeature(seg, i, SEG_LVL_ALT_Q);
+ vp9_set_segdata(seg, i, SEG_LVL_ALT_Q, internal_delta_q[i]);
+ }
+ if (delta_lf[i] != 0) {
+ vp9_enable_segfeature(seg, i, SEG_LVL_ALT_LF);
+ vp9_set_segdata(seg, i, SEG_LVL_ALT_LF, delta_lf[i]);
+ }
+ if (skip[i] != 0) {
+ vp9_enable_segfeature(seg, i, SEG_LVL_SKIP);
+ vp9_set_segdata(seg, i, SEG_LVL_SKIP, skip[i]);
+ }
+ if (ref_frame[i] >= 0) {
+ int valid_ref = 1;
+ // ALTREF is not used as reference for nonrd_pickmode with 0 lag.
+ if (ref_frame[i] == ALTREF_FRAME && cpi->sf.use_nonrd_pick_mode)
+ valid_ref = 0;
+ // If GOLDEN is selected, make sure it's set as reference.
+ if (ref_frame[i] == GOLDEN_FRAME &&
+ !(cpi->ref_frame_flags & flag_list[ref_frame[i]])) {
+ valid_ref = 0;
+ }
+ // GOLDEN was updated in the previous encoded frame, so GOLDEN and LAST
+ // are the same reference.
+ if (ref_frame[i] == GOLDEN_FRAME && cpi->rc.frames_since_golden == 0)
+ ref_frame[i] = LAST_FRAME;
+ if (valid_ref) {
+ vp9_enable_segfeature(seg, i, SEG_LVL_REF_FRAME);
+ vp9_set_segdata(seg, i, SEG_LVL_REF_FRAME, ref_frame[i]);
+ }
+ }
+ }
+ roi->enabled = 1;
+}
+
static void init_level_info(Vp9LevelInfo *level_info) {
Vp9LevelStats *const level_stats = &level_info->level_stats;
Vp9LevelSpec *const level_spec = &level_info->level_spec;
@@ -557,6 +625,13 @@ static void init_level_info(Vp9LevelInfo *level_info) {
level_spec->min_altref_distance = INT_MAX;
}
+static int check_seg_range(int seg_data[8], int range) {
+ return !(abs(seg_data[0]) > range || abs(seg_data[1]) > range ||
+ abs(seg_data[2]) > range || abs(seg_data[3]) > range ||
+ abs(seg_data[4]) > range || abs(seg_data[5]) > range ||
+ abs(seg_data[6]) > range || abs(seg_data[7]) > range);
+}
+
VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
int i;
const Vp9LevelSpec *this_level;
@@ -583,6 +658,61 @@ VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec) {
return (i == VP9_LEVELS) ? LEVEL_UNKNOWN : vp9_level_defs[i].level;
}
+int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
+ unsigned int cols, int delta_q[8], int delta_lf[8],
+ int skip[8], int ref_frame[8]) {
+ VP9_COMMON *cm = &cpi->common;
+ vpx_roi_map_t *roi = &cpi->roi;
+ const int range = 63;
+ const int ref_frame_range = 3; // Alt-ref
+ const int skip_range = 1;
+ const int frame_rows = cpi->common.mi_rows;
+ const int frame_cols = cpi->common.mi_cols;
+
+ // Check that the number of rows and columns match.
+ if (frame_rows != (int)rows || frame_cols != (int)cols) {
+ return -1;
+ }
+
+ if (!check_seg_range(delta_q, range) || !check_seg_range(delta_lf, range) ||
+ !check_seg_range(ref_frame, ref_frame_range) ||
+ !check_seg_range(skip, skip_range))
+ return -1;
+
+ // Also disable segmentation if no deltas are specified.
+ if (!map ||
+ (!(delta_q[0] | delta_q[1] | delta_q[2] | delta_q[3] | delta_q[4] |
+ delta_q[5] | delta_q[6] | delta_q[7] | delta_lf[0] | delta_lf[1] |
+ delta_lf[2] | delta_lf[3] | delta_lf[4] | delta_lf[5] | delta_lf[6] |
+ delta_lf[7] | skip[0] | skip[1] | skip[2] | skip[3] | skip[4] |
+ skip[5] | skip[6] | skip[7]) &&
+ (ref_frame[0] == -1 && ref_frame[1] == -1 && ref_frame[2] == -1 &&
+ ref_frame[3] == -1 && ref_frame[4] == -1 && ref_frame[5] == -1 &&
+ ref_frame[6] == -1 && ref_frame[7] == -1))) {
+ vp9_disable_segmentation(&cm->seg);
+ cpi->roi.enabled = 0;
+ return 0;
+ }
+
+ if (roi->roi_map) {
+ vpx_free(roi->roi_map);
+ roi->roi_map = NULL;
+ }
+ CHECK_MEM_ERROR(cm, roi->roi_map, vpx_malloc(rows * cols));
+
+ // Copy to the ROI structure in the compressor.
+ memcpy(roi->roi_map, map, rows * cols);
+ memcpy(&roi->delta_q, delta_q, MAX_SEGMENTS * sizeof(delta_q[0]));
+ memcpy(&roi->delta_lf, delta_lf, MAX_SEGMENTS * sizeof(delta_lf[0]));
+ memcpy(&roi->skip, skip, MAX_SEGMENTS * sizeof(skip[0]));
+ memcpy(&roi->ref_frame, ref_frame, MAX_SEGMENTS * sizeof(ref_frame[0]));
+ roi->enabled = 1;
+ roi->rows = rows;
+ roi->cols = cols;
+
+ return 0;
+}
+
int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows,
int cols) {
if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols) {
@@ -817,6 +947,9 @@ static void dealloc_compressor_data(VP9_COMP *cpi) {
vpx_free(cpi->active_map.map);
cpi->active_map.map = NULL;
+ vpx_free(cpi->roi.roi_map);
+ cpi->roi.roi_map = NULL;
+
vpx_free(cpi->consec_zero_mv);
cpi->consec_zero_mv = NULL;
@@ -1121,8 +1254,9 @@ static void alloc_util_frame_buffers(VP9_COMP *cpi) {
// For 1 pass cbr: allocate scaled_frame that may be used as an intermediate
// buffer for a 2 stage down-sampling: two stages of 1:2 down-sampling for a
- // target of 1/4x1/4.
- if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc) {
+ // target of 1/4x1/4. number_spatial_layers must be greater than 2.
+ if (is_one_pass_cbr_svc(cpi) && !cpi->svc.scaled_temp_is_alloc &&
+ cpi->svc.number_spatial_layers > 2) {
cpi->svc.scaled_temp_is_alloc = 1;
if (vpx_realloc_frame_buffer(
&cpi->svc.scaled_temp, cm->width >> 1, cm->height >> 1,
@@ -2017,8 +2151,9 @@ VP9_COMP *vp9_create_compressor(VP9EncoderConfig *oxcf,
realloc_segmentation_maps(cpi);
- CHECK_MEM_ERROR(cm, cpi->skin_map, vpx_calloc(cm->mi_rows * cm->mi_cols,
- sizeof(cpi->skin_map[0])));
+ CHECK_MEM_ERROR(
+ cm, cpi->skin_map,
+ vpx_calloc(cm->mi_rows * cm->mi_cols, sizeof(cpi->skin_map[0])));
CHECK_MEM_ERROR(cm, cpi->alt_ref_aq, vp9_alt_ref_aq_create());
@@ -3630,6 +3765,8 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
// it may be pretty bad for rate-control,
// and I should handle it somehow
vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi);
+ } else if (cpi->roi.enabled && cm->frame_type != KEY_FRAME) {
+ apply_roi_map(cpi);
}
apply_active_map(cpi);
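
As a usage illustration for the new ROI plumbing (a hedged sketch, not an API documented by this change; build_segment_map() and the delta values are made up, and cpi/cm stand for the encoder and common contexts already in scope): vp9_set_roi_map() takes a segment map at mi (8x8) resolution plus per-segment q and loop-filter deltas, skip flags and reference-frame constraints, and apply_roi_map() later translates them into segmentation features on non-key frames while ROI is enabled.

    int delta_q[8]   = { 0, -10, 0, 0, 0, 0, 0, 0 };  /* segment 1: lower q (higher quality) */
    int delta_lf[8]  = { 0 };
    int skip[8]      = { 0 };
    int ref_frame[8] = { -1, LAST_FRAME, -1, -1, -1, -1, -1, -1 };  /* -1: no constraint */
    unsigned char *map = build_segment_map(cm->mi_rows, cm->mi_cols); /* hypothetical helper */

    if (vp9_set_roi_map(cpi, map, cm->mi_rows, cm->mi_cols, delta_q, delta_lf,
                        skip, ref_frame) != 0) {
      /* Map dimensions or deltas out of range: ROI stays disabled. */
    }
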
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index d723d93cb..2989af35e 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -723,6 +723,8 @@ typedef struct VP9_COMP {
uint8_t *count_arf_frame_usage;
uint8_t *count_lastgolden_frame_usage;
+
+ vpx_roi_map_t roi;
} VP9_COMP;
void vp9_initialize_enc(void);
@@ -868,9 +870,8 @@ static INLINE int is_one_pass_cbr_svc(const struct VP9_COMP *const cpi) {
#if CONFIG_VP9_TEMPORAL_DENOISING
static INLINE int denoise_svc(const struct VP9_COMP *const cpi) {
- return (!cpi->use_svc ||
- (cpi->use_svc &&
- cpi->svc.spatial_layer_id >= cpi->svc.first_layer_denoise));
+ return (!cpi->use_svc || (cpi->use_svc && cpi->svc.spatial_layer_id >=
+ cpi->svc.first_layer_denoise));
}
#endif
@@ -938,6 +939,10 @@ static INLINE int log_tile_cols_from_picsize_level(uint32_t width,
VP9_LEVEL vp9_get_level(const Vp9LevelSpec *const level_spec);
+int vp9_set_roi_map(VP9_COMP *cpi, unsigned char *map, unsigned int rows,
+ unsigned int cols, int delta_q[8], int delta_lf[8],
+ int skip[8], int ref_frame[8]);
+
void vp9_new_framerate(VP9_COMP *cpi, double framerate);
void vp9_set_row_mt(VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 4093b5529..f4fda0965 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -731,9 +731,8 @@ static void first_pass_stat_calc(VP9_COMP *cpi, FIRSTPASS_STATS *fps,
// Exclude any image dead zone
if (fp_acc_data->image_data_start_row > 0) {
fp_acc_data->intra_skip_count =
- VPXMAX(0,
- fp_acc_data->intra_skip_count -
- (fp_acc_data->image_data_start_row * cm->mb_cols * 2));
+ VPXMAX(0, fp_acc_data->intra_skip_count -
+ (fp_acc_data->image_data_start_row * cm->mb_cols * 2));
}
fp_acc_data->intra_factor = fp_acc_data->intra_factor / (double)num_mbs;
@@ -2238,9 +2237,6 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
}
gf_group->arf_update_idx[0] = arf_buffer_indices[0];
gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
-
- // Step over the golden frame / overlay frame
- if (EOF == input_stats(twopass, &frame_stats)) return;
}
// Deduct the boost bits for arf (or gf if it is not a key frame)
@@ -2285,7 +2281,8 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
// Define middle frame
mid_frame_idx = frame_index + (rc->baseline_gf_interval >> 1) - 1;
- normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending);
+ normal_frames =
+ rc->baseline_gf_interval - (key_frame || rc->source_alt_ref_pending);
if (normal_frames > 1)
normal_frame_bits = (int)(total_group_bits / normal_frames);
else
@@ -2551,9 +2548,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
}
// Break out conditions.
- if (
- // Break at active_max_gf_interval unless almost totally static.
- ((i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) ||
+ // Break at the maximum of active_max_gf_interval unless almost totally static.
+ if (((twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) &&
+ (i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) ||
(
// Don't break out with a very short interval.
(i >= active_min_gf_interval) &&
@@ -2573,8 +2570,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
// Should we use the alternate reference frame.
- if (allow_alt_ref && (i < cpi->oxcf.lag_in_frames) &&
- (i >= rc->min_gf_interval)) {
+ if ((twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) && allow_alt_ref &&
+ (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval)) {
const int forward_frames = (rc->frames_to_key - i >= i - 1)
? i - 1
: VPXMAX(0, rc->frames_to_key - i);
@@ -2602,7 +2599,10 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
#endif
// Set the interval until the next gf.
- rc->baseline_gf_interval = i - (is_key_frame || rc->source_alt_ref_pending);
+ rc->baseline_gf_interval =
+ (twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH)
+ ? (i - (is_key_frame || rc->source_alt_ref_pending))
+ : i;
// Only encode alt reference frame in temporal base layer. So
// baseline_gf_interval should be multiple of a temporal layer group
@@ -2700,13 +2700,24 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
#endif
}
+// Intra / Inter threshold very low
+#define VERY_LOW_II 1.5
+// For clean slide transitions we expect a sharp single frame spike in error.
+#define ERROR_SPIKE 5.0
+
// Slide show transition detection.
// Tests for case where there is very low error either side of the current frame
// but much higher just for this frame. This can help detect key frames in
// slide shows even where the slides are pictures of different sizes.
+// Also requires that intra and inter errors are very similar to help eliminate
+// harmful false positives.
// It will not help if the transition is a fade or other multi-frame effect.
-static int slide_transition(double this_err, double last_err, double next_err) {
- return (this_err > (last_err * 5.0)) && (this_err > (next_err * 5.0));
+static int slide_transition(const FIRSTPASS_STATS *this_frame,
+ const FIRSTPASS_STATS *last_frame,
+ const FIRSTPASS_STATS *next_frame) {
+ return (this_frame->intra_error < (this_frame->coded_error * VERY_LOW_II)) &&
+ (this_frame->coded_error > (last_frame->coded_error * ERROR_SPIKE)) &&
+ (this_frame->coded_error > (next_frame->coded_error * ERROR_SPIKE));
}
// Threshold for use of the lagging second reference frame. High second ref
@@ -2753,8 +2764,7 @@ static int test_candidate_kf(TWO_PASS *twopass,
if ((this_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
(next_frame->pcnt_second_ref < SECOND_REF_USEAGE_THRESH) &&
((this_frame->pcnt_inter < VERY_LOW_INTER_THRESH) ||
- (slide_transition(this_frame->coded_error, last_frame->coded_error,
- next_frame->coded_error)) ||
+ (slide_transition(this_frame, last_frame, next_frame)) ||
((pcnt_intra > MIN_INTRA_LEVEL) &&
(pcnt_intra > (INTRA_VS_INTER_THRESH * modified_pcnt_inter)) &&
((this_frame->intra_error /
@@ -3019,8 +3029,14 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
double zm_factor;
// Monitor for static sections.
- zero_motion_accumulator = VPXMIN(
- zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+ // For the first frame in the kf group the second ref indicator is invalid.
+ if (i > 0) {
+ zero_motion_accumulator = VPXMIN(
+ zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+ } else {
+ zero_motion_accumulator =
+ next_frame.pcnt_inter - next_frame.pcnt_motion;
+ }
// Factor 0.75-1.25 based on how much of frame is static.
zm_factor = (0.75 + (zero_motion_accumulator / 2.0));
@@ -3056,10 +3072,16 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
twopass->section_intra_rating = calculate_section_intra_ratio(
start_position, twopass->stats_in_end, rc->frames_to_key);
- // Apply various clamps for min and max boost
- rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3));
- rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST);
- rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST);
+ // Special case for static / slide show content, but don't apply it
+ // if the kf group is very short.
+ if ((zero_motion_accumulator > 0.99) && (rc->frames_to_key > 8)) {
+ rc->kf_boost = VPXMAX((rc->frames_to_key * 100), MAX_KF_TOT_BOOST);
+ } else {
+ // Apply various clamps for min and max boost
+ rc->kf_boost = VPXMAX((int)boost_score, (rc->frames_to_key * 3));
+ rc->kf_boost = VPXMAX(rc->kf_boost, MIN_KF_TOT_BOOST);
+ rc->kf_boost = VPXMIN(rc->kf_boost, MAX_KF_TOT_BOOST);
+ }
// Work out how many bits to allocate for the key frame itself.
kf_bits = calculate_boost_bits((rc->frames_to_key - 1), rc->kf_boost,
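
A quick worked example of the reworked slide_transition() test above, with made-up first-pass numbers: if the neighbouring frames have coded_error of 100 and 120 and the current frame has coded_error of 800 and intra_error of 900, then 800 exceeds both 100 * ERROR_SPIKE (500) and 120 * ERROR_SPIKE (600), and 900 is below 800 * VERY_LOW_II (1200), so the frame is flagged as a key-frame candidate. A gradual fade, where the error rises over several frames rather than spiking for a single frame, fails the spike test and is not flagged.
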
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 000ecd779..aa497e3da 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -120,12 +120,12 @@ typedef enum {
typedef struct {
unsigned char index;
unsigned char first_inter_index;
- RATE_FACTOR_LEVEL rf_level[(MAX_LAG_BUFFERS * 2) + 1];
- FRAME_UPDATE_TYPE update_type[(MAX_LAG_BUFFERS * 2) + 1];
- unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
- unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1];
- unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1];
- int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1];
+ RATE_FACTOR_LEVEL rf_level[MAX_STATIC_GF_GROUP_LENGTH + 1];
+ FRAME_UPDATE_TYPE update_type[MAX_STATIC_GF_GROUP_LENGTH + 1];
+ unsigned char arf_src_offset[MAX_STATIC_GF_GROUP_LENGTH + 1];
+ unsigned char arf_update_idx[MAX_STATIC_GF_GROUP_LENGTH + 1];
+ unsigned char arf_ref_idx[MAX_STATIC_GF_GROUP_LENGTH + 1];
+ int bit_allocation[MAX_STATIC_GF_GROUP_LENGTH + 1];
} GF_GROUP;
typedef struct {
diff --git a/vp9/encoder/vp9_mbgraph.h b/vp9/encoder/vp9_mbgraph.h
index df2fb98ef..c3af972bc 100644
--- a/vp9/encoder/vp9_mbgraph.h
+++ b/vp9/encoder/vp9_mbgraph.h
@@ -25,7 +25,9 @@ typedef struct {
} ref[MAX_REF_FRAMES];
} MBGRAPH_MB_STATS;
-typedef struct { MBGRAPH_MB_STATS *mb_stats; } MBGRAPH_FRAME_STATS;
+typedef struct {
+ MBGRAPH_MB_STATS *mb_stats;
+} MBGRAPH_FRAME_STATS;
struct VP9_COMP;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 44f01be25..37406c232 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1785,7 +1785,10 @@ static int vector_match(int16_t *ref, int16_t *src, int bwl) {
}
static const MV search_pos[4] = {
- { -1, 0 }, { 0, -1 }, { 0, 1 }, { 1, 0 },
+ { -1, 0 },
+ { 0, -1 },
+ { 0, 1 },
+ { 1, 0 },
};
unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
@@ -1876,7 +1879,10 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
{
const uint8_t *const pos[4] = {
- ref_buf - ref_stride, ref_buf - 1, ref_buf + 1, ref_buf + ref_stride,
+ ref_buf - ref_stride,
+ ref_buf - 1,
+ ref_buf + 1,
+ ref_buf + ref_stride,
};
cpi->fn_ptr[bsize].sdx4df(src_buf, src_stride, pos, ref_stride, this_sad);
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index f2f323a28..212c260fa 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -1488,7 +1488,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
int skip_ref_find_pred[4] = { 0 };
unsigned int sse_zeromv_normalized = UINT_MAX;
unsigned int best_sse_sofar = UINT_MAX;
- unsigned int thresh_svc_skip_golden = 500;
#if CONFIG_VP9_TEMPORAL_DENOISING
VP9_PICKMODE_CTX_DEN ctx_den;
int64_t zero_last_cost_orig = INT64_MAX;
@@ -1496,11 +1495,23 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
#endif
INTERP_FILTER filter_gf_svc = EIGHTTAP;
MV_REFERENCE_FRAME best_second_ref_frame = NONE;
+ const struct segmentation *const seg = &cm->seg;
int comp_modes = 0;
int num_inter_modes = (cpi->use_svc) ? RT_INTER_MODES_SVC : RT_INTER_MODES;
int flag_svc_subpel = 0;
int svc_mv_col = 0;
int svc_mv_row = 0;
+ unsigned int thresh_svc_skip_golden = 500;
+ // Lower the skip threshold if the lower spatial layer is of better quality
+ // relative to the current layer.
+ if (cpi->svc.spatial_layer_id > 0 && cm->base_qindex > 150 &&
+ cm->base_qindex > cpi->svc.lower_layer_qindex + 15)
+ thresh_svc_skip_golden = 100;
+ // Increase the skip threshold if the lower spatial layer is of lower quality
+ // relative to the current layer.
+ else if (cpi->svc.spatial_layer_id > 0 && cm->base_qindex < 140 &&
+ cm->base_qindex < cpi->svc.lower_layer_qindex - 20)
+ thresh_svc_skip_golden = 1000;
init_ref_frame_cost(cm, xd, ref_frame_cost);
@@ -1638,6 +1649,16 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
cpi->sf.use_compound_nonrd_pickmode && usable_ref_frame == ALTREF_FRAME)
comp_modes = 2;
+ // If the segment reference frame feature is enabled and it's set to the
+ // GOLDEN reference, then make sure we don't skip checking GOLDEN. This is
+ // to prevent the possibility of not picking any mode.
+ if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) == GOLDEN_FRAME) {
+ usable_ref_frame = GOLDEN_FRAME;
+ skip_ref_find_pred[GOLDEN_FRAME] = 0;
+ thresh_svc_skip_golden = 0;
+ }
+
for (ref_frame = LAST_FRAME; ref_frame <= usable_ref_frame; ++ref_frame) {
if (!skip_ref_find_pred[ref_frame]) {
find_predictors(cpi, x, ref_frame, frame_mv, const_motion,
@@ -1699,6 +1720,12 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (ref_frame > usable_ref_frame) continue;
if (skip_ref_find_pred[ref_frame]) continue;
+ // If the segment reference frame feature is enabled then do nothing if the
+ // current ref frame is not allowed.
+ if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) != (int)ref_frame)
+ continue;
+
if (flag_svc_subpel && ref_frame == GOLDEN_FRAME) {
force_gf_mv = 1;
// Only test mode if NEARESTMV/NEARMV is (svc_mv_col, svc_mv_row),
@@ -1713,7 +1740,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
}
if (comp_pred) {
- const struct segmentation *const seg = &cm->seg;
if (!cpi->allow_comp_inter_inter) continue;
// Skip compound inter modes if ARF is not available.
if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue;
@@ -1785,29 +1811,34 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
continue;
}
- if (sf->reference_masking &&
- !(frame_mv[this_mode][ref_frame].as_int == 0 &&
- ref_frame == LAST_FRAME)) {
- if (usable_ref_frame < ALTREF_FRAME) {
- if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) {
- i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
- if ((cpi->ref_frame_flags & flag_list[i]))
- if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
- ref_frame_skip_mask |= (1 << ref_frame);
+ // Disable this drop out case if the ref frame segment level feature is
+ // enabled for this segment. This is to prevent the possibility that we end
+ // up unable to pick any mode.
+ if (!segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) {
+ if (sf->reference_masking &&
+ !(frame_mv[this_mode][ref_frame].as_int == 0 &&
+ ref_frame == LAST_FRAME)) {
+ if (usable_ref_frame < ALTREF_FRAME) {
+ if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) {
+ i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
+ if ((cpi->ref_frame_flags & flag_list[i]))
+ if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
+ ref_frame_skip_mask |= (1 << ref_frame);
+ }
+ } else if (!cpi->rc.is_src_frame_alt_ref &&
+ !(frame_mv[this_mode][ref_frame].as_int == 0 &&
+ ref_frame == ALTREF_FRAME)) {
+ int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME;
+ int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME;
+ if (((cpi->ref_frame_flags & flag_list[ref1]) &&
+ (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) ||
+ ((cpi->ref_frame_flags & flag_list[ref2]) &&
+ (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1))))
+ ref_frame_skip_mask |= (1 << ref_frame);
}
- } else if (!cpi->rc.is_src_frame_alt_ref &&
- !(frame_mv[this_mode][ref_frame].as_int == 0 &&
- ref_frame == ALTREF_FRAME)) {
- int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME;
- int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME;
- if (((cpi->ref_frame_flags & flag_list[ref1]) &&
- (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) ||
- ((cpi->ref_frame_flags & flag_list[ref2]) &&
- (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1))))
- ref_frame_skip_mask |= (1 << ref_frame);
}
+ if (ref_frame_skip_mask & (1 << ref_frame)) continue;
}
- if (ref_frame_skip_mask & (1 << ref_frame)) continue;
// Select prediction reference frames.
for (i = 0; i < MAX_MB_PLANE; i++) {
@@ -2202,9 +2233,11 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
// For spatial enhancemanent layer: perform intra prediction only if base
// layer is chosen as the reference. Always perform intra prediction if
- // LAST is the only reference or is_key_frame is set.
+ // LAST is the only reference, or is_key_frame is set, or on base
+ // temporal layer.
if (cpi->svc.spatial_layer_id) {
perform_intra_pred =
+ cpi->svc.temporal_layer_id == 0 ||
cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame ||
!(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) ||
(!cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame &&
@@ -2214,6 +2247,13 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (cpi->oxcf.lag_in_frames > 0 && cpi->oxcf.rc_mode == VPX_VBR &&
cpi->rc.is_src_frame_alt_ref)
perform_intra_pred = 0;
+
+ // If the segment reference frame feature is enabled and set then
+ // skip the intra prediction.
+ if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME) &&
+ get_segdata(seg, mi->segment_id, SEG_LVL_REF_FRAME) > 0)
+ perform_intra_pred = 0;
+
// Perform intra prediction search, if the best SAD is above a certain
// threshold.
if (best_rdc.rdcost == INT64_MAX ||
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 4a7ccc4e5..2e4c9d2bf 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -31,10 +31,13 @@
#include "vp9/encoder/vp9_encodemv.h"
#include "vp9/encoder/vp9_ratectrl.h"
-// Max rate target for 1080P and below encodes under normal circumstances
-// (1920 * 1080 / (16 * 16)) * MAX_MB_RATE bits per MB
+// Max rate per frame for 1080P and below encodes if no level requirement given.
+// For larger formats, limit to MAX_MB_RATE bits per MB.
+// 4Mbits is derived from the level requirement for level 4 (1080P 30), which
+// requires that HW can sustain a rate of 16Mbits over a 4 frame group.
+// If a lower level requirement is specified then it may override this value.
#define MAX_MB_RATE 250
-#define MAXRATE_1080P 2025000
+#define MAXRATE_1080P 4000000
#define DEFAULT_KF_BOOST 2000
#define DEFAULT_GF_BOOST 2000
@@ -1100,6 +1103,9 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int *bottom_index,
// Baseline value derived from cpi->active_worst_quality and kf boost.
active_best_quality =
get_kf_active_quality(rc, active_worst_quality, cm->bit_depth);
+ if (cpi->twopass.kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) {
+ active_best_quality /= 4;
+ }
// Allow somewhat lower kf minq with small image formats.
if ((cm->width * cm->height) <= (352 * 288)) {
@@ -1490,6 +1496,9 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) {
if (cm->frame_type != KEY_FRAME) rc->reset_high_source_sad = 0;
rc->last_avg_frame_bandwidth = rc->avg_frame_bandwidth;
+ if (cpi->use_svc &&
+ cpi->svc.spatial_layer_id < cpi->svc.number_spatial_layers - 1)
+ cpi->svc.lower_layer_qindex = cm->base_qindex;
}
void vp9_rc_postencode_update_drop_frame(VP9_COMP *cpi) {
@@ -1584,9 +1593,8 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) {
// Adjust boost and af_ratio based on avg_frame_low_motion, which varies
// between 0 and 100 (stationary, 100% zero/small motion).
rc->gfu_boost =
- VPXMAX(500,
- DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) /
- (rc->avg_frame_low_motion + 100));
+ VPXMAX(500, DEFAULT_GF_BOOST * (rc->avg_frame_low_motion << 1) /
+ (rc->avg_frame_low_motion + 100));
rc->af_ratio_onepass_vbr = VPXMIN(15, VPXMAX(5, 3 * rc->gfu_boost / 400));
}
adjust_gfint_frame_constraint(cpi, rc->frames_to_key);
@@ -1861,13 +1869,8 @@ void vp9_rc_set_gf_interval_range(const VP9_COMP *const cpi,
rc->max_gf_interval = vp9_rc_get_default_max_gf_interval(
cpi->framerate, rc->min_gf_interval);
- // Extended interval for genuinely static scenes
- rc->static_scene_max_gf_interval = MAX_LAG_BUFFERS * 2;
-
- if (is_altref_enabled(cpi)) {
- if (rc->static_scene_max_gf_interval > oxcf->lag_in_frames - 1)
- rc->static_scene_max_gf_interval = oxcf->lag_in_frames - 1;
- }
+ // Extended max interval for genuinely static scenes like slide shows.
+ rc->static_scene_max_gf_interval = MAX_STATIC_GF_GROUP_LENGTH;
if (rc->max_gf_interval > rc->static_scene_max_gf_interval)
rc->max_gf_interval = rc->static_scene_max_gf_interval;
@@ -1911,12 +1914,12 @@ void vp9_rc_update_framerate(VP9_COMP *cpi) {
VPXMAX(rc->min_frame_bandwidth, FRAME_OVERHEAD_BITS);
// A maximum bitrate for a frame is defined.
- // The baseline for this aligns with HW implementations that
- // can support decode of 1080P content up to a bitrate of MAX_MB_RATE bits
- // per 16x16 MB (averaged over a frame). However this limit is extended if
- // a very high rate is given on the command line or the the rate cannnot
- // be acheived because of a user specificed max q (e.g. when the user
- // specifies lossless encode.
+ // However this limit is extended if a very high rate is given on the command
+ // line or the rate cannot be achieved because of a user specified max q
+ // (e.g. when the user specifies lossless encode).
+ //
+ // If a level is specified that requires a lower maximum rate then the level
+ // value takes precedence.
vbr_max_bits =
(int)(((int64_t)rc->avg_frame_bandwidth * oxcf->two_pass_vbrmax_section) /
100);
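
The new MAXRATE_1080P value follows from the arithmetic in the comment above: a level 4 (1080P30) decoder must sustain 16 Mbits over a 4-frame group, so the per-frame cap is 16,000,000 / 4 = 4,000,000 bits, which is why MAXRATE_1080P moves from 2025000 to 4000000.
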
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index c1b210677..3a40e0138 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -34,6 +34,14 @@ extern "C" {
#define FRAME_OVERHEAD_BITS 200
+// Threshold used to define a KF group as static (e.g. a slide show).
+// Essentially this means that no frame in the group has more than 1% of MBs
+// that are not marked as coded with 0,0 motion in the first pass.
+#define STATIC_KF_GROUP_THRESH 99
+
+// The maximum duration of a GF group that is static (for example a slide show).
+#define MAX_STATIC_GF_GROUP_LENGTH 250
+
typedef enum {
INTER_NORMAL = 0,
INTER_HIGH = 1,
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 2ba6378c5..90f06720b 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -59,7 +59,9 @@ typedef struct {
MV_REFERENCE_FRAME ref_frame[2];
} MODE_DEFINITION;
-typedef struct { MV_REFERENCE_FRAME ref_frame[2]; } REF_DEFINITION;
+typedef struct {
+ MV_REFERENCE_FRAME ref_frame[2];
+} REF_DEFINITION;
struct rdcost_block_args {
const VP9_COMP *cpi;
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index eb39bab25..389d48f21 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -45,8 +45,8 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
svc->ext_lst_fb_idx[sl] = 0;
svc->ext_gld_fb_idx[sl] = 1;
svc->ext_alt_fb_idx[sl] = 2;
- svc->downsample_filter_type[sl] = EIGHTTAP;
- svc->downsample_filter_phase[sl] = 0; // Set to 8 for averaging filter.
+ svc->downsample_filter_type[sl] = BILINEAR;
+ svc->downsample_filter_phase[sl] = 8; // Set to 8 for averaging filter.
}
if (cpi->oxcf.error_resilient_mode == 0 && cpi->oxcf.pass == 2) {
@@ -155,6 +155,8 @@ void vp9_update_layer_context_change_config(VP9_COMP *const cpi,
int sl, tl, layer = 0, spatial_layer_target;
float bitrate_alloc = 1.0;
+ cpi->svc.temporal_layering_mode = oxcf->temporal_layering_mode;
+
if (svc->temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING) {
for (sl = 0; sl < oxcf->ss_number_layers; ++sl) {
for (tl = 0; tl < oxcf->ts_number_layers; ++tl) {
@@ -547,6 +549,8 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) {
if (!spatial_id) {
cpi->ref_frame_flags = VP9_LAST_FLAG;
} else {
+ if (spatial_id == cpi->svc.number_spatial_layers - 1)
+ cpi->ext_refresh_alt_ref_frame = 0;
cpi->ref_frame_flags = VP9_LAST_FLAG | VP9_GOLD_FLAG;
}
}
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index 8b708e271..87686fe59 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -103,6 +103,8 @@ typedef struct SVC {
int first_layer_denoise;
int skip_enhancement_layer;
+
+ int lower_layer_qindex;
} SVC;
struct VP9_COMP;
diff --git a/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
index dbd243ac1..293cdcd67 100644
--- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c
@@ -170,13 +170,13 @@ void vp9_fht4x4_sse2(const int16_t *input, tran_low_t *output, int stride,
fadst4_sse2(in);
write_buffer_4x4(output, in);
break;
- case ADST_ADST:
+ default:
+ assert(tx_type == ADST_ADST);
load_buffer_4x4(input, in, stride);
fadst4_sse2(in);
fadst4_sse2(in);
write_buffer_4x4(output, in);
break;
- default: assert(0); break;
}
}
@@ -1097,14 +1097,14 @@ void vp9_fht8x8_sse2(const int16_t *input, tran_low_t *output, int stride,
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
- case ADST_ADST:
+ default:
+ assert(tx_type == ADST_ADST);
load_buffer_8x8(input, in, stride);
fadst8_sse2(in);
fadst8_sse2(in);
right_shift_8x8(in, 1);
write_buffer_8x8(output, in, 8);
break;
- default: assert(0); break;
}
}
@@ -1963,13 +1963,13 @@ void vp9_fht16x16_sse2(const int16_t *input, tran_low_t *output, int stride,
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
- case ADST_ADST:
+ default:
+ assert(tx_type == ADST_ADST);
load_buffer_16x16(input, in0, in1, stride);
fadst16_sse2(in0, in1);
right_shift_16x16(in0, in1);
fadst16_sse2(in0, in1);
write_buffer_16x16(output, in0, in1, 16);
break;
- default: assert(0); break;
}
}
diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c
new file mode 100644
index 000000000..4bebc34d6
--- /dev/null
+++ b/vp9/encoder/x86/vp9_quantize_avx2.c
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h> // AVX2
+
+#include "./vp9_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/bitdepth_conversion_avx2.h"
+#include "vpx_dsp/x86/quantize_x86.h"
+
+// Zero fill 16 positions in the output buffer.
+static INLINE void store_zero_tran_low(tran_low_t *a) {
+ const __m256i zero = _mm256_setzero_si256();
+#if CONFIG_VP9_HIGHBITDEPTH
+ _mm256_storeu_si256((__m256i *)(a), zero);
+ _mm256_storeu_si256((__m256i *)(a + 8), zero);
+#else
+ _mm256_storeu_si256((__m256i *)(a), zero);
+#endif
+}
+
+static INLINE __m256i scan_eob_256(const __m256i *iscan_ptr,
+ __m256i *coeff256) {
+ const __m256i iscan = _mm256_loadu_si256(iscan_ptr);
+ const __m256i zero256 = _mm256_setzero_si256();
+#if CONFIG_VP9_HIGHBITDEPTH
+ // The _mm256_packs_epi32() in load_tran_low() packs the 64 bit coeff as
+ // B1 A1 B0 A0. Shuffle to B1 B0 A1 A0 in order to scan eob correctly.
+ const __m256i _coeff256 = _mm256_permute4x64_epi64(*coeff256, 0xd8);
+ const __m256i zero_coeff0 = _mm256_cmpeq_epi16(_coeff256, zero256);
+#else
+ const __m256i zero_coeff0 = _mm256_cmpeq_epi16(*coeff256, zero256);
+#endif
+ const __m256i nzero_coeff0 = _mm256_cmpeq_epi16(zero_coeff0, zero256);
+ // Add one to convert from indices to counts
+ const __m256i iscan_plus_one = _mm256_sub_epi16(iscan, nzero_coeff0);
+ return _mm256_and_si256(iscan_plus_one, nzero_coeff0);
+}
+
+void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *round_ptr,
+ const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr,
+ const int16_t *iscan_ptr) {
+ __m128i eob;
+ __m256i round256, quant256, dequant256;
+ __m256i eob256, thr256;
+
+ (void)scan_ptr;
+ (void)skip_block;
+ assert(!skip_block);
+
+ coeff_ptr += n_coeffs;
+ iscan_ptr += n_coeffs;
+ qcoeff_ptr += n_coeffs;
+ dqcoeff_ptr += n_coeffs;
+ n_coeffs = -n_coeffs;
+
+ {
+ __m256i coeff256;
+
+ // Setup global values
+ {
+ const __m128i round = _mm_load_si128((const __m128i *)round_ptr);
+ const __m128i quant = _mm_load_si128((const __m128i *)quant_ptr);
+ const __m128i dequant = _mm_load_si128((const __m128i *)dequant_ptr);
+ round256 = _mm256_castsi128_si256(round);
+ round256 = _mm256_permute4x64_epi64(round256, 0x54);
+
+ quant256 = _mm256_castsi128_si256(quant);
+ quant256 = _mm256_permute4x64_epi64(quant256, 0x54);
+
+ dequant256 = _mm256_castsi128_si256(dequant);
+ dequant256 = _mm256_permute4x64_epi64(dequant256, 0x54);
+ }
+
+ {
+ __m256i qcoeff256;
+ __m256i qtmp256;
+ coeff256 = load_tran_low(coeff_ptr + n_coeffs);
+ qcoeff256 = _mm256_abs_epi16(coeff256);
+ qcoeff256 = _mm256_adds_epi16(qcoeff256, round256);
+ qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256);
+ qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256);
+ store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs);
+ coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
+ store_tran_low(coeff256, dqcoeff_ptr + n_coeffs);
+ }
+
+ eob256 = scan_eob_256((const __m256i *)(iscan_ptr + n_coeffs), &coeff256);
+ n_coeffs += 8 * 2;
+ }
+
+ // Switch to ac-only constants: 0x31 selects the high 128 bits of each
+ // operand, which hold the ac values, since all remaining coefficients are
+ // ac coefficients.
+ dequant256 = _mm256_permute2x128_si256(dequant256, dequant256, 0x31);
+ quant256 = _mm256_permute2x128_si256(quant256, quant256, 0x31);
+ round256 = _mm256_permute2x128_si256(round256, round256, 0x31);
+
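+ // Threshold of dequant/2: if no |coeff| in a group of 16 exceeds it, the
+ // group is treated as quantizing to zero and only zero stores are done.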
+ thr256 = _mm256_srai_epi16(dequant256, 1);
+
+ // AC only loop
+ while (n_coeffs < 0) {
+ __m256i coeff256 = load_tran_low(coeff_ptr + n_coeffs);
+ __m256i qcoeff256 = _mm256_abs_epi16(coeff256);
+ int32_t nzflag =
+ _mm256_movemask_epi8(_mm256_cmpgt_epi16(qcoeff256, thr256));
+
+ if (nzflag) {
+ __m256i qtmp256;
+ qcoeff256 = _mm256_adds_epi16(qcoeff256, round256);
+ qtmp256 = _mm256_mulhi_epi16(qcoeff256, quant256);
+ qcoeff256 = _mm256_sign_epi16(qtmp256, coeff256);
+ store_tran_low(qcoeff256, qcoeff_ptr + n_coeffs);
+ coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256);
+ store_tran_low(coeff256, dqcoeff_ptr + n_coeffs);
+ eob256 = _mm256_max_epi16(
+ eob256,
+ scan_eob_256((const __m256i *)(iscan_ptr + n_coeffs), &coeff256));
+ } else {
+ store_zero_tran_low(qcoeff_ptr + n_coeffs);
+ store_zero_tran_low(dqcoeff_ptr + n_coeffs);
+ }
+ n_coeffs += 8 * 2;
+ }
+
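+ // Fold the two 128-bit halves together; accumulate_eob() then reduces the
+ // remaining eight lanes to the final eob value.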
+ eob = _mm_max_epi16(_mm256_castsi256_si128(eob256),
+ _mm256_extracti128_si256(eob256, 1));
+
+ *eob_ptr = accumulate_eob(eob);
+}
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 9819fb641..721e170bf 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -64,10 +64,13 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.h
VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_mfqe.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c
-VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct4x4_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct8x8_msa.c
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_idct16x16_msa.c
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht_neon.h
ifeq ($(CONFIG_VP9_POSTPROC),yes)
VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_mfqe_msa.c
@@ -78,10 +81,11 @@ ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans4_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans8_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_itrans16_dspr2.c
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
-VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
else
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_highbd_iht4x4_add_neon.c
VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht4x4_add_sse4.c
+VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht8x8_add_sse4.c
+VP9_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp9_highbd_iht16x16_add_sse4.c
endif
$(eval $(call rtcd_h_template,vp9_rtcd,vp9/common/vp9_rtcd_defs.pl))
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 5bfe9aa05..40f7ab531 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -1067,12 +1067,11 @@ static vpx_codec_frame_flags_t get_frame_pkt_flags(const VP9_COMP *cpi,
vpx_codec_frame_flags_t flags = lib_flags << 16;
if (lib_flags & FRAMEFLAGS_KEY ||
- (cpi->use_svc &&
- cpi->svc
- .layer_context[cpi->svc.spatial_layer_id *
- cpi->svc.number_temporal_layers +
- cpi->svc.temporal_layer_id]
- .is_key_frame))
+ (cpi->use_svc && cpi->svc
+ .layer_context[cpi->svc.spatial_layer_id *
+ cpi->svc.number_temporal_layers +
+ cpi->svc.temporal_layer_id]
+ .is_key_frame))
flags |= VPX_FRAME_IS_KEY;
if (cpi->droppable) flags |= VPX_FRAME_IS_DROPPABLE;
@@ -1234,6 +1233,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
ctx->pending_frame_magnitude |= size;
cx_data += size;
cx_data_sz -= size;
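+ // Record the current spatial layer's frame size in the packet.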
+ pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width;
+ pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height;
if (ctx->output_cx_pkt_cb.output_cx_pkt) {
pkt.kind = VPX_CODEC_CX_FRAME_PKT;
@@ -1260,8 +1261,8 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
pkt.data.frame.duration = (unsigned long)ticks_to_timebase_units(
timebase, dst_end_time_stamp - dst_time_stamp);
pkt.data.frame.flags = get_frame_pkt_flags(cpi, lib_flags);
- pkt.data.frame.width = cpi->common.width;
- pkt.data.frame.height = cpi->common.height;
+ pkt.data.frame.width[cpi->svc.spatial_layer_id] = cpi->common.width;
+ pkt.data.frame.height[cpi->svc.spatial_layer_id] = cpi->common.height;
if (ctx->pending_cx_data) {
if (size) ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
@@ -1340,9 +1341,8 @@ static vpx_codec_err_t ctrl_set_reference(vpx_codec_alg_priv_t *ctx,
vp9_set_reference_enc(ctx->cpi, ref_frame_to_vp9_reframe(frame->frame_type),
&sd);
return VPX_CODEC_OK;
- } else {
- return VPX_CODEC_INVALID_PARAM;
}
+ return VPX_CODEC_INVALID_PARAM;
}
static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx,
@@ -1356,9 +1356,8 @@ static vpx_codec_err_t ctrl_copy_reference(vpx_codec_alg_priv_t *ctx,
vp9_copy_reference_enc(ctx->cpi,
ref_frame_to_vp9_reframe(frame->frame_type), &sd);
return VPX_CODEC_OK;
- } else {
- return VPX_CODEC_INVALID_PARAM;
}
+ return VPX_CODEC_INVALID_PARAM;
}
static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
@@ -1371,9 +1370,8 @@ static vpx_codec_err_t ctrl_get_reference(vpx_codec_alg_priv_t *ctx,
yuvconfig2image(&frame->img, fb, NULL);
return VPX_CODEC_OK;
- } else {
- return VPX_CODEC_INVALID_PARAM;
}
+ return VPX_CODEC_INVALID_PARAM;
}
static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx,
@@ -1383,9 +1381,8 @@ static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx,
if (config != NULL) {
ctx->preview_ppcfg = *config;
return VPX_CODEC_OK;
- } else {
- return VPX_CODEC_INVALID_PARAM;
}
+ return VPX_CODEC_INVALID_PARAM;
#else
(void)ctx;
(void)args;
@@ -1407,17 +1404,24 @@ static vpx_image_t *encoder_get_preview(vpx_codec_alg_priv_t *ctx) {
if (vp9_get_preview_raw_frame(ctx->cpi, &sd, &flags) == 0) {
yuvconfig2image(&ctx->preview_img, &sd, NULL);
return &ctx->preview_img;
- } else {
- return NULL;
}
+ return NULL;
}
static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx,
va_list args) {
- (void)ctx;
- (void)args;
+ vpx_roi_map_t *data = va_arg(args, vpx_roi_map_t *);
- // TODO(yaowu): Need to re-implement and test for VP9.
+ if (data) {
+ vpx_roi_map_t *roi = (vpx_roi_map_t *)data;
+
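+ // vp9_set_roi_map() returns 0 on success; any other result maps to
+ // VPX_CODEC_INVALID_PARAM.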
+ if (!vp9_set_roi_map(ctx->cpi, roi->roi_map, roi->rows, roi->cols,
+ roi->delta_q, roi->delta_lf, roi->skip,
+ roi->ref_frame)) {
+ return VPX_CODEC_OK;
+ }
+ return VPX_CODEC_INVALID_PARAM;
+ }
return VPX_CODEC_INVALID_PARAM;
}
@@ -1429,11 +1433,10 @@ static vpx_codec_err_t ctrl_set_active_map(vpx_codec_alg_priv_t *ctx,
if (!vp9_set_active_map(ctx->cpi, map->active_map, (int)map->rows,
(int)map->cols))
return VPX_CODEC_OK;
- else
- return VPX_CODEC_INVALID_PARAM;
- } else {
+
return VPX_CODEC_INVALID_PARAM;
}
+ return VPX_CODEC_INVALID_PARAM;
}
static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx,
@@ -1444,11 +1447,10 @@ static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx,
if (!vp9_get_active_map(ctx->cpi, map->active_map, (int)map->rows,
(int)map->cols))
return VPX_CODEC_OK;
- else
- return VPX_CODEC_INVALID_PARAM;
- } else {
+
return VPX_CODEC_INVALID_PARAM;
}
+ return VPX_CODEC_INVALID_PARAM;
}
static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx,
@@ -1460,9 +1462,8 @@ static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx,
vp9_set_internal_size(ctx->cpi, (VPX_SCALING)mode->h_scaling_mode,
(VPX_SCALING)mode->v_scaling_mode);
return (res == 0) ? VPX_CODEC_OK : VPX_CODEC_INVALID_PARAM;
- } else {
- return VPX_CODEC_INVALID_PARAM;
}
+ return VPX_CODEC_INVALID_PARAM;
}
static vpx_codec_err_t ctrl_set_svc(vpx_codec_alg_priv_t *ctx, va_list args) {
@@ -1608,7 +1609,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
// Setters
{ VP8_SET_REFERENCE, ctrl_set_reference },
{ VP8_SET_POSTPROC, ctrl_set_previewpp },
- { VP8E_SET_ROI_MAP, ctrl_set_roi_map },
+ { VP9E_SET_ROI_MAP, ctrl_set_roi_map },
{ VP8E_SET_ACTIVEMAP, ctrl_set_active_map },
{ VP8E_SET_SCALEMODE, ctrl_set_scale_mode },
{ VP8E_SET_CPUUSED, ctrl_set_cpuused },
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index d633ed142..6186d4614 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -103,6 +103,7 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/temporal_filter_sse4.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_quantize_sse2.c
+VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_quantize_avx2.c
VP9_CX_SRCS-$(HAVE_AVX) += encoder/x86/vp9_diamond_search_sad_avx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_highbd_block_error_intrin_sse2.c