-rw-r--r--  test/convolve_test.cc                     52
-rw-r--r--  test/vp8_fdct4x4_test.cc                   4
-rw-r--r--  test/vpx_scale_test.h                     28
-rw-r--r--  vp8/common/mips/mmi/idct_blk_mmi.c        71
-rw-r--r--  vp8/common/rtcd_defs.pl                   10
-rw-r--r--  vp8/encoder/mips/mmi/dct_mmi.c           426
-rw-r--r--  vp8/vp8_common.mk                          1
-rw-r--r--  vp8/vp8cx.mk                               1
-rw-r--r--  vp9/encoder/vp9_encodeframe.c              2
-rw-r--r--  vp9/encoder/vp9_firstpass.c              135
-rw-r--r--  vp9/encoder/vp9_frame_scale.c              2
-rw-r--r--  vp9/encoder/vp9_ratectrl.c                15
-rw-r--r--  vp9/encoder/vp9_ratectrl.h                 3
-rw-r--r--  vp9/encoder/vp9_speed_features.c           6
-rw-r--r--  vp9/encoder/vp9_speed_features.h           3
-rw-r--r--  vp9/encoder/x86/vp9_frame_scale_ssse3.c  226
-rw-r--r--  vpx_dsp/x86/convolve_ssse3.h              61
-rw-r--r--  vpx_dsp/x86/mem_sse2.h                     8
18 files changed, 983 insertions(+), 71 deletions(-)
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 9f6f795c9..08ef57224 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -925,33 +925,51 @@ TEST_P(ConvolveTest, FilterExtremes) {
/* This test exercises that enough rows and columns are filtered with every
   possible initial fractional position and scaling step. */
+#if !CONFIG_VP9_HIGHBITDEPTH
+static const ConvolveFunc scaled_2d_c_funcs[2] = { vpx_scaled_2d_c,
+ vpx_scaled_avg_2d_c };
+
TEST_P(ConvolveTest, CheckScalingFiltering) {
uint8_t *const in = input();
uint8_t *const out = output();
- const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP];
+ uint8_t ref[kOutputStride * kMaxDimension];
- SetConstantInput(127);
+ ::libvpx_test::ACMRandom prng;
+ for (int y = 0; y < Height(); ++y) {
+ for (int x = 0; x < Width(); ++x) {
+ const uint16_t r = prng.Rand8Extremes();
+ assign_val(in, y * kInputStride + x, r);
+ }
+ }
- for (int frac = 0; frac < 16; ++frac) {
- for (int step = 1; step <= 32; ++step) {
- /* Test the horizontal and vertical filters in combination. */
- ASM_REGISTER_STATE_CHECK(
- UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap, frac,
- step, frac, step, Width(), Height()));
-
- CheckGuardBlocks();
-
- for (int y = 0; y < Height(); ++y) {
- for (int x = 0; x < Width(); ++x) {
- ASSERT_EQ(lookup(in, y * kInputStride + x),
- lookup(out, y * kOutputStride + x))
- << "x == " << x << ", y == " << y << ", frac == " << frac
- << ", step == " << step;
+ for (int i = 0; i < 2; ++i) {
+ for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) {
+ const InterpKernel *const eighttap = vp9_filter_kernels[filter_type];
+ for (int frac = 0; frac < 16; ++frac) {
+ for (int step = 1; step <= 32; ++step) {
+ /* Test the horizontal and vertical filters in combination. */
+ scaled_2d_c_funcs[i](in, kInputStride, ref, kOutputStride, eighttap,
+ frac, step, frac, step, Width(), Height());
+ ASM_REGISTER_STATE_CHECK(
+ UUT_->shv8_[i](in, kInputStride, out, kOutputStride, eighttap,
+ frac, step, frac, step, Width(), Height()));
+
+ CheckGuardBlocks();
+
+ for (int y = 0; y < Height(); ++y) {
+ for (int x = 0; x < Width(); ++x) {
+ ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+ lookup(out, y * kOutputStride + x))
+ << "x == " << x << ", y == " << y << ", frac == " << frac
+ << ", step == " << step;
+ }
+ }
}
}
}
}
}
+#endif
using std::tr1::make_tuple;
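
The new CheckScalingFiltering above feeds near-saturated random pixels into each optimized scaler and checks it bit-exactly against vpx_scaled_2d_c / vpx_scaled_avg_2d_c. A minimal sketch of what Rand8Extremes()-style input looks like, assuming the hypothetical helper below (not the test harness's exact code) biases bytes toward the ends of the 8-bit range:

#include <stdint.h>
#include <stdlib.h>

/* Hypothetical stand-in for ACMRandom::Rand8Extremes(): push random bytes
 * toward 0 or 255 so the saturating add/pack paths in the SIMD scalers are
 * actually stressed. */
static uint8_t rand8_extremes(void) {
  const uint8_t r = (uint8_t)(rand() & 0xff);
  return (r < 128) ? (uint8_t)(r >> 4)      /* lands near 0 */
                   : (uint8_t)(r | 0xf0);   /* lands near 255 */
}

Extreme inputs matter here because the old SetConstantInput(127) fill could never trigger clamping in the convolution rounding.
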
diff --git a/test/vp8_fdct4x4_test.cc b/test/vp8_fdct4x4_test.cc
index 9f69ae164..b7697d859 100644
--- a/test/vp8_fdct4x4_test.cc
+++ b/test/vp8_fdct4x4_test.cc
@@ -199,4 +199,8 @@ INSTANTIATE_TEST_CASE_P(SSE2, FdctTest,
INSTANTIATE_TEST_CASE_P(MSA, FdctTest,
::testing::Values(vp8_short_fdct4x4_msa));
#endif // HAVE_MSA
+#if HAVE_MMI
+INSTANTIATE_TEST_CASE_P(MMI, FdctTest,
+ ::testing::Values(vp8_short_fdct4x4_mmi));
+#endif // HAVE_MMI
} // namespace
diff --git a/test/vpx_scale_test.h b/test/vpx_scale_test.h
index 18909d1b5..dcbd02b91 100644
--- a/test/vpx_scale_test.h
+++ b/test/vpx_scale_test.h
@@ -15,11 +15,14 @@
#include "./vpx_config.h"
#include "./vpx_scale_rtcd.h"
+#include "test/acm_random.h"
#include "test/clear_system_state.h"
#include "test/register_state_check.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_scale/yv12config.h"
+using libvpx_test::ACMRandom;
+
namespace libvpx_test {
class VpxScaleBase {
@@ -65,12 +68,12 @@ class VpxScaleBase {
ResetScaleImage(&img_, src_width, src_height);
ResetScaleImage(&ref_img_, dst_width, dst_height);
ResetScaleImage(&dst_img_, dst_width, dst_height);
- FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height,
- img_.y_stride);
- FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height,
- img_.uv_stride);
- FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height,
- img_.uv_stride);
+ FillPlaneExtreme(img_.y_buffer, img_.y_crop_width, img_.y_crop_height,
+ img_.y_stride);
+ FillPlaneExtreme(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height,
+ img_.uv_stride);
+ FillPlaneExtreme(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height,
+ img_.uv_stride);
}
void DeallocImages() {
@@ -89,7 +92,8 @@ class VpxScaleBase {
static const int kBufFiller = 123;
static const int kBufMax = kBufFiller - 1;
- static void FillPlane(uint8_t *buf, int width, int height, int stride) {
+ static void FillPlane(uint8_t *const buf, const int width, const int height,
+ const int stride) {
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
buf[x + (y * stride)] = (x + (width * y)) % kBufMax;
@@ -97,6 +101,16 @@ class VpxScaleBase {
}
}
+ static void FillPlaneExtreme(uint8_t *const buf, const int width,
+ const int height, const int stride) {
+ ACMRandom rnd;
+ for (int y = 0; y < height; ++y) {
+ for (int x = 0; x < width; ++x) {
+ buf[x + (y * stride)] = rnd.Rand8() % 2 ? 255 : 0;
+ }
+ }
+ }
+
static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height,
int width, int height, int stride, int padding) {
// Copy the outermost visible pixel to a distance of at least 'padding.'
diff --git a/vp8/common/mips/mmi/idct_blk_mmi.c b/vp8/common/mips/mmi/idct_blk_mmi.c
new file mode 100644
index 000000000..f6020ab46
--- /dev/null
+++ b/vp8/common/mips/mmi/idct_blk_mmi.c
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst,
+ int stride, int8_t *eobs) {
+ int i, j;
+
+ for (i = 0; i < 4; i++) {
+ for (j = 0; j < 4; j++) {
+ if (*eobs++ > 1) {
+ vp8_dequant_idct_add_mmi(q, dq, dst, stride);
+ } else {
+ vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst, stride, dst, stride);
+ memset(q, 0, 2 * sizeof(q[0]));
+ }
+
+ q += 16;
+ dst += 4;
+ }
+
+ dst += 4 * stride - 16;
+ }
+}
+
+void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dstu,
+ uint8_t *dstv, int stride,
+ int8_t *eobs) {
+ int i, j;
+
+ for (i = 0; i < 2; i++) {
+ for (j = 0; j < 2; j++) {
+ if (*eobs++ > 1) {
+ vp8_dequant_idct_add_mmi(q, dq, dstu, stride);
+ } else {
+ vp8_dc_only_idct_add_mmi(q[0] * dq[0], dstu, stride, dstu, stride);
+ memset(q, 0, 2 * sizeof(q[0]));
+ }
+
+ q += 16;
+ dstu += 4;
+ }
+
+ dstu += 4 * stride - 8;
+ }
+
+ for (i = 0; i < 2; i++) {
+ for (j = 0; j < 2; j++) {
+ if (*eobs++ > 1) {
+ vp8_dequant_idct_add_mmi(q, dq, dstv, stride);
+ } else {
+ vp8_dc_only_idct_add_mmi(q[0] * dq[0], dstv, stride, dstv, stride);
+ memset(q, 0, 2 * sizeof(q[0]));
+ }
+
+ q += 16;
+ dstv += 4;
+ }
+
+ dstv += 4 * stride - 8;
+ }
+}
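
A note on the eobs dispatch above: when a block's end-of-block index is 0 or 1, only the DC coefficient survives, so the full vp8_dequant_idct_add_mmi is skipped in favor of the DC-only path, and the first two coefficients are zeroed before moving on. A scalar sketch of that DC-only path, modeled on vp8_dc_only_idct_add_c() in vp8/common/idctllm.c (a reference sketch, not the MMI code):

#include <stdint.h>

/* Round the dequantized DC (q[0] * dq[0]) by the 4x4 inverse-DCT scale and
 * add it to every pixel of the block, clamping to [0, 255]. */
static void dc_only_idct_add(int16_t input_dc, const uint8_t *pred,
                             int pred_stride, uint8_t *dst, int dst_stride) {
  const int a1 = (input_dc + 4) >> 3;
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      const int v = pred[c] + a1;
      dst[c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }
    pred += pred_stride;
    dst += dst_stride;
  }
}
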
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 3bcfdc0d6..ece2785eb 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -28,10 +28,10 @@ add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char
specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa mmi/;
add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa/;
+specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa mmi/;
add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa/;
+specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa mmi/;
#
# Loopfilter
@@ -176,13 +176,13 @@ if ($opts{arch} =~ /x86/) {
# Forward DCT
#
add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct4x4 sse2 neon msa/;
+specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi/;
add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct8x4 sse2 neon msa/;
+specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi/;
add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_walsh4x4 sse2 neon msa/;
+specialize qw/vp8_short_walsh4x4 sse2 neon msa mmi/;
#
# Quantizer
diff --git a/vp8/encoder/mips/mmi/dct_mmi.c b/vp8/encoder/mips/mmi/dct_mmi.c
new file mode 100644
index 000000000..7e45a1278
--- /dev/null
+++ b/vp8/encoder/mips/mmi/dct_mmi.c
@@ -0,0 +1,426 @@
+/*
+ * Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+/* clang-format off */
+#define TRANSPOSE_4H \
+ MMI_LI(%[tmp0], 0x93) \
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
+ "mtc1 %[tmp0], %[ftmp10] \n\t" \
+ "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
+ "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \
+ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+ "or %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \
+ "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \
+ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+ "or %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \
+ "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
+ "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \
+ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+ "or %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \
+ "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \
+ "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \
+ "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+ "or %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
+ "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \
+ "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \
+ "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \
+ "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
+/* clang-format on */
+
+void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
+ int pitch_half = pitch / 2;
+ uint64_t tmp[1];
+
+#if _MIPS_SIM == _ABIO32
+ register double ftmp0 asm("$f0");
+ register double ftmp1 asm("$f2");
+ register double ftmp2 asm("$f4");
+ register double ftmp3 asm("$f6");
+ register double ftmp4 asm("$f8");
+ register double ftmp5 asm("$f10");
+ register double ftmp6 asm("$f12");
+ register double ftmp7 asm("$f14");
+ register double ftmp8 asm("$f16");
+ register double ftmp9 asm("$f18");
+ register double ftmp10 asm("$f20");
+ register double ftmp11 asm("$f22");
+ register double ftmp12 asm("$f24");
+#else
+ register double ftmp0 asm("$f0");
+ register double ftmp1 asm("$f1");
+ register double ftmp2 asm("$f2");
+ register double ftmp3 asm("$f3");
+ register double ftmp4 asm("$f4");
+ register double ftmp5 asm("$f5");
+ register double ftmp6 asm("$f6");
+ register double ftmp7 asm("$f7");
+ register double ftmp8 asm("$f8");
+ register double ftmp9 asm("$f9");
+ register double ftmp10 asm("$f10");
+ register double ftmp11 asm("$f11");
+ register double ftmp12 asm("$f12");
+#endif // _MIPS_SIM == _ABIO32
+
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_07) = { 0x0007000700070007ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_12000) = { 0x00002ee000002ee0ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL };
+
+ DECLARE_ALIGNED(16, int, a[4]);
+ DECLARE_ALIGNED(16, int, b[4]);
+ DECLARE_ALIGNED(16, int, c[4]);
+ DECLARE_ALIGNED(16, int, d[4]);
+
+ // stage1
+ a[0] = (input[0] + input[3]) * 8;
+ a[1] = (input[0 + pitch_half] + input[3 + pitch_half]) * 8;
+ a[2] = (input[0 + 2 * pitch_half] + input[3 + 2 * pitch_half]) * 8;
+ a[3] = (input[0 + 3 * pitch_half] + input[3 + 3 * pitch_half]) * 8;
+
+ b[0] = (input[1] + input[2]) * 8;
+ b[1] = (input[1 + pitch_half] + input[2 + pitch_half]) * 8;
+ b[2] = (input[1 + 2 * pitch_half] + input[2 + 2 * pitch_half]) * 8;
+ b[3] = (input[1 + 3 * pitch_half] + input[2 + 3 * pitch_half]) * 8;
+
+ c[0] = (input[1] - input[2]) * 8;
+ c[1] = (input[1 + pitch_half] - input[2 + pitch_half]) * 8;
+ c[2] = (input[1 + 2 * pitch_half] - input[2 + 2 * pitch_half]) * 8;
+ c[3] = (input[1 + 3 * pitch_half] - input[2 + 3 * pitch_half]) * 8;
+
+ d[0] = (input[0] - input[3]) * 8;
+ d[1] = (input[0 + pitch_half] - input[3 + pitch_half]) * 8;
+ d[2] = (input[0 + 2 * pitch_half] - input[3 + 2 * pitch_half]) * 8;
+ d[3] = (input[0 + 3 * pitch_half] - input[3 + 3 * pitch_half]) * 8;
+
+ __asm__ volatile (
+ "gslqc1 %[ftmp2], %[ftmp1], 0x00(%[a]) \n\t"
+ "gslqc1 %[ftmp4], %[ftmp3], 0x00(%[b]) \n\t"
+ "gslqc1 %[ftmp6], %[ftmp5], 0x00(%[c]) \n\t"
+ "gslqc1 %[ftmp8], %[ftmp7], 0x00(%[d]) \n\t"
+
+ "paddw %[ftmp9], %[ftmp1], %[ftmp3] \n\t"
+ "paddw %[ftmp10], %[ftmp2], %[ftmp4] \n\t"
+ "psubw %[ftmp11], %[ftmp1], %[ftmp3] \n\t"
+ "psubw %[ftmp12], %[ftmp2], %[ftmp4] \n\t"
+ "packsswh %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
+ "packsswh %[ftmp3], %[ftmp11], %[ftmp12] \n\t"
+ "packsswh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
+ "packsswh %[ftmp4], %[ftmp7], %[ftmp8] \n\t"
+ MMI_LI(%[tmp0], 0x0c)
+ "mov.d %[ftmp7], %[ftmp2] \n\t"
+ "mov.d %[ftmp8], %[ftmp4] \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+
+ "ldc1 %[ftmp12], %[ff_pw_14500] \n\t"
+ "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
+ "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op1] \n\t"
+ "punpckhhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
+ "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op1] \n\t"
+ "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t"
+ "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
+ "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
+ "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
+ "packsswh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
+
+ "ldc1 %[ftmp12], %[ff_pw_7500] \n\t"
+ "punpcklhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t"
+ "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op3] \n\t"
+ "punpckhhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t"
+ "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op3] \n\t"
+ "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t"
+ "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
+ "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
+ "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
+ "packsswh %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
+ TRANSPOSE_4H
+
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t"
+ "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
+ "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t"
+ "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t"
+
+ "pcmpeqh %[ftmp0], %[ftmp8], %[ftmp0] \n\t"
+ "ldc1 %[ftmp9], %[ff_ph_01] \n\t"
+ "paddh %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
+
+ "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+ "psubh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
+ "ldc1 %[ftmp9], %[ff_ph_07] \n\t"
+ MMI_LI(%[tmp0], 0x04)
+ "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "paddh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+ "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+
+ MMI_LI(%[tmp0], 0x10)
+ "ldc1 %[ftmp12], %[ff_pw_12000] \n\t"
+ "mtc1 %[tmp0], %[ftmp9] \n\t"
+
+ "punpcklhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t"
+ "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op1] \n\t"
+ "punpckhhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t"
+ "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op1] \n\t"
+ "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
+ "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t"
+ "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
+ "packsswh %[ftmp3], %[ftmp10], %[ftmp11] \n\t"
+ "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
+
+ "ldc1 %[ftmp12], %[ff_pw_51000] \n\t"
+ "punpcklhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t"
+ "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op3] \n\t"
+ "punpckhhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t"
+ "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op3] \n\t"
+ "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+ "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
+ "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t"
+ "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
+ "packsswh %[ftmp4], %[ftmp10], %[ftmp11] \n\t"
+
+ : [ftmp0] "=&f"(ftmp0), [ftmp1] "=&f"(ftmp1), [ftmp2] "=&f"(ftmp2),
+ [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5),
+ [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8),
+ [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11),
+ [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0])
+ : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07), [a] "r"(a),
+ [b] "r"(b), [c] "r"(c), [d] "r"(d), [ff_ph_op1] "f"(ff_ph_op1),
+ [ff_ph_op3] "f"(ff_ph_op3), [ff_pw_14500] "m"(ff_pw_14500),
+ [ff_pw_7500] "m"(ff_pw_7500), [ff_pw_12000] "m"(ff_pw_12000),
+ [ff_pw_51000] "m"(ff_pw_51000)
+ );
+
+ __asm__ volatile(
+ "gssdlc1 %[ftmp1], 0x07(%[output]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[output]) \n\t"
+ "gssdlc1 %[ftmp3], 0x0f(%[output]) \n\t"
+ "gssdrc1 %[ftmp3], 0x08(%[output]) \n\t"
+ "gssdlc1 %[ftmp2], 0x17(%[output]) \n\t"
+ "gssdrc1 %[ftmp2], 0x10(%[output]) \n\t"
+ "gssdlc1 %[ftmp4], 0x1f(%[output]) \n\t"
+ "gssdrc1 %[ftmp4], 0x18(%[output]) \n\t"
+ :
+ : [ftmp1] "f"(ftmp1), [ftmp2] "f"(ftmp2), [ftmp3] "f"(ftmp3),
+ [ftmp4] "f"(ftmp4), [output] "r"(output)
+ : "memory");
+}
+
+void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) {
+ vp8_short_fdct4x4_mmi(input, output, pitch);
+ vp8_short_fdct4x4_mmi(input + 4, output + 16, pitch);
+}
+
+void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
+ double ftmp[13];
+ uint32_t tmp[1];
+ DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_01) = { 0x0000000100000001ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_03) = { 0x0000000300000003ULL };
+ DECLARE_ALIGNED(8, const uint64_t, ff_pw_mask) = { 0x0001000000010000ULL };
+
+ __asm__ volatile (
+ MMI_LI(%[tmp0], 0x02)
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+
+ "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
+ MMI_ADDU(%[ip], %[ip], %[pitch])
+ "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t"
+ MMI_ADDU(%[ip], %[ip], %[pitch])
+ "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t"
+ MMI_ADDU(%[ip], %[ip], %[pitch])
+ "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t"
+ TRANSPOSE_4H
+
+ "psllh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+ "psllh %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
+ "psllh %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
+ "psllh %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
+ // a
+ "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
+ // d
+ "paddh %[ftmp6], %[ftmp2], %[ftmp4] \n\t"
+ // c
+ "psubh %[ftmp7], %[ftmp2], %[ftmp4] \n\t"
+ // b
+ "psubh %[ftmp8], %[ftmp1], %[ftmp3] \n\t"
+
+ // a + d
+ "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+ // b + c
+ "paddh %[ftmp2], %[ftmp8], %[ftmp7] \n\t"
+ // b - c
+ "psubh %[ftmp3], %[ftmp8], %[ftmp7] \n\t"
+ // a - d
+ "psubh %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
+
+ "pcmpeqh %[ftmp6], %[ftmp5], %[ftmp0] \n\t"
+ "paddh %[ftmp6], %[ftmp6], %[ff_ph_01] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
+ TRANSPOSE_4H
+
+ // op[2], op[0]
+ "pmaddhw %[ftmp5], %[ftmp1], %[ff_pw_01] \n\t"
+ // op[3], op[1]
+ "pmaddhw %[ftmp1], %[ftmp1], %[ff_pw_mask] \n\t"
+
+ // op[6], op[4]
+ "pmaddhw %[ftmp6], %[ftmp2], %[ff_pw_01] \n\t"
+ // op[7], op[5]
+ "pmaddhw %[ftmp2], %[ftmp2], %[ff_pw_mask] \n\t"
+
+ // op[10], op[8]
+ "pmaddhw %[ftmp7], %[ftmp3], %[ff_pw_01] \n\t"
+ // op[11], op[9]
+ "pmaddhw %[ftmp3], %[ftmp3], %[ff_pw_mask] \n\t"
+
+ // op[14], op[12]
+ "pmaddhw %[ftmp8], %[ftmp4], %[ff_pw_01] \n\t"
+ // op[15], op[13]
+ "pmaddhw %[ftmp4], %[ftmp4], %[ff_pw_mask] \n\t"
+
+ // a1, a3
+ "paddw %[ftmp9], %[ftmp5], %[ftmp7] \n\t"
+ // d1, d3
+ "paddw %[ftmp10], %[ftmp6], %[ftmp8] \n\t"
+ // c1, c3
+ "psubw %[ftmp11], %[ftmp6], %[ftmp8] \n\t"
+ // b1, b3
+ "psubw %[ftmp12], %[ftmp5], %[ftmp7] \n\t"
+
+ // a1 + d1, a3 + d3
+ "paddw %[ftmp5], %[ftmp9], %[ftmp10] \n\t"
+ // b1 + c1, b3 + c3
+ "paddw %[ftmp6], %[ftmp12], %[ftmp11] \n\t"
+ // b1 - c1, b3 - c3
+ "psubw %[ftmp7], %[ftmp12], %[ftmp11] \n\t"
+ // a1 - d1, a3 - d3
+ "psubw %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
+
+ // a2, a4
+ "paddw %[ftmp9], %[ftmp1], %[ftmp3] \n\t"
+ // d2, d4
+ "paddw %[ftmp10], %[ftmp2], %[ftmp4] \n\t"
+ // c2, c4
+ "psubw %[ftmp11], %[ftmp2], %[ftmp4] \n\t"
+ // b2, b4
+ "psubw %[ftmp12], %[ftmp1], %[ftmp3] \n\t"
+
+ // a2 + d2, a4 + d4
+ "paddw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
+ // b2 + c2, b4 + c4
+ "paddw %[ftmp2], %[ftmp12], %[ftmp11] \n\t"
+ // b2 - c2, b4 - c4
+ "psubw %[ftmp3], %[ftmp12], %[ftmp11] \n\t"
+ // a2 - d2, a4 - d4
+ "psubw %[ftmp4], %[ftmp9], %[ftmp10] \n\t"
+
+ MMI_LI(%[tmp0], 0x03)
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp1] \n\t"
+ "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+ "paddw %[ftmp1], %[ftmp1], %[ff_pw_03] \n\t"
+ "psraw %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp2] \n\t"
+ "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+ "paddw %[ftmp2], %[ftmp2], %[ff_pw_03] \n\t"
+ "psraw %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp3] \n\t"
+ "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
+ "paddw %[ftmp3], %[ftmp3], %[ff_pw_03] \n\t"
+ "psraw %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp4] \n\t"
+ "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
+ "paddw %[ftmp4], %[ftmp4], %[ff_pw_03] \n\t"
+ "psraw %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp5] \n\t"
+ "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
+ "paddw %[ftmp5], %[ftmp5], %[ff_pw_03] \n\t"
+ "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp6] \n\t"
+ "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp6], %[ftmp6], %[ftmp9] \n\t"
+ "paddw %[ftmp6], %[ftmp6], %[ff_pw_03] \n\t"
+ "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp7] \n\t"
+ "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
+ "paddw %[ftmp7], %[ftmp7], %[ff_pw_03] \n\t"
+ "psraw %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
+
+ "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp8] \n\t"
+ "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+ "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t"
+ "paddw %[ftmp8], %[ftmp8], %[ff_pw_03] \n\t"
+ "psraw %[ftmp8], %[ftmp8], %[ftmp11] \n\t"
+
+ "packsswh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
+ "packsswh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
+ "packsswh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
+ "packsswh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
+
+ MMI_LI(%[tmp0], 0x72)
+ "mtc1 %[tmp0], %[ftmp11] \n\t"
+ "pshufh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+ "pshufh %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
+ "pshufh %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
+ "pshufh %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
+
+ "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t"
+ "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t"
+ "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t"
+ "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t"
+ "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t"
+ "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t"
+ "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t"
+ "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+ [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+ [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+ [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+ [ftmp12]"=&f"(ftmp[12]),
+ [tmp0]"=&r"(tmp[0]),
+ [ip]"+&r"(input)
+ : [op]"r"(output),
+ [ff_pw_01]"f"(ff_pw_01), [pitch]"r"((mips_reg)pitch),
+ [ff_pw_03]"f"(ff_pw_03), [ff_pw_mask]"f"(ff_pw_mask),
+ [ff_ph_01]"f"(ff_ph_01)
+ : "memory"
+ );
+}
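
The MMI kernel above computes stage 1 in plain C (the a/b/c/d arrays) and vectorizes the rest; its packed constants map one-to-one onto the scalar reference: 0x08a9 = 2217 and 0x14e8 = 5352 in ff_ph_op1/ff_ph_op3 (0xeb18 being -5352), and the ff_pw_* words are the 14500/7500/12000/51000 rounders. For orientation, a sketch of that reference, modeled on vp8_short_fdct4x4_c() in vp8/encoder/dct.c:

#include <stdint.h>

/* Scalar sketch of the 4x4 forward DCT the MMI code vectorizes. */
static void short_fdct4x4(int16_t *input, int16_t *output, int pitch) {
  int i;
  int16_t *ip = input;
  int16_t *op = output;

  for (i = 0; i < 4; ++i) {  /* horizontal pass, rows scaled up by 8 */
    const int a1 = (ip[0] + ip[3]) * 8;
    const int b1 = (ip[1] + ip[2]) * 8;
    const int c1 = (ip[1] - ip[2]) * 8;
    const int d1 = (ip[0] - ip[3]) * 8;
    op[0] = (int16_t)(a1 + b1);
    op[2] = (int16_t)(a1 - b1);
    op[1] = (int16_t)((c1 * 2217 + d1 * 5352 + 14500) >> 12);
    op[3] = (int16_t)((d1 * 2217 - c1 * 5352 + 7500) >> 12);
    ip += pitch / 2;
    op += 4;
  }
  ip = output;
  op = output;
  for (i = 0; i < 4; ++i) {  /* vertical pass on columns */
    const int a1 = ip[0] + ip[12];
    const int b1 = ip[4] + ip[8];
    const int c1 = ip[4] - ip[8];
    const int d1 = ip[0] - ip[12];
    op[0] = (int16_t)((a1 + b1 + 7) >> 4);
    op[8] = (int16_t)((a1 - b1 + 7) >> 4);
    /* "+ (d1 != 0)" is what the pcmpeqh/paddh against ff_ph_01 implements. */
    op[4] = (int16_t)(((c1 * 2217 + d1 * 5352 + 12000) >> 16) + (d1 != 0));
    op[12] = (int16_t)((d1 * 2217 - c1 * 5352 + 51000) >> 16);
    ip++;
    op++;
  }
}
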
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 5813c81c4..246fe6a67 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -122,6 +122,7 @@ VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/loopfilter_filters_mmi.c
VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idctllm_mmi.c
VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/dequantize_mmi.c
VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/copymem_mmi.c
+VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idct_blk_mmi.c
ifeq ($(CONFIG_POSTPROC),yes)
VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 23d65d416..0dac0169d 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -111,6 +111,7 @@ VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/vp8_quantize_mmi.c
+VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/dct_mmi.c
ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/denoising_msa.c
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index dee17ade2..aa298acdf 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -3489,7 +3489,7 @@ static TX_MODE select_tx_mode(const VP9_COMP *cpi, MACROBLOCKD *const xd) {
static void hybrid_intra_mode_search(VP9_COMP *cpi, MACROBLOCK *const x,
RD_COST *rd_cost, BLOCK_SIZE bsize,
PICK_MODE_CONTEXT *ctx) {
- if (bsize < BLOCK_16X16)
+ if (!cpi->sf.nonrd_keyframe && bsize < BLOCK_16X16)
vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
else
vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index db15d4021..9d9779f7b 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -41,6 +41,11 @@
#define OUTPUT_FPF 0
#define ARF_STATS_OUTPUT 0
+#define COMPLEXITY_STATS_OUTPUT 0
+
+#ifdef CORPUS_VBR_EXPERIMENT
+#define CORPUS_VBR_MIDPOINT 82.0
+#endif
#define FIRST_PASS_Q 10.0
#define GF_MAX_BOOST 96.0
@@ -239,8 +244,12 @@ static double calculate_active_area(const VP9_COMP *cpi,
static double get_distribution_av_err(TWO_PASS *const twopass) {
const double av_weight =
twopass->total_stats.weight / twopass->total_stats.count;
+#ifdef CORPUS_VBR_EXPERIMENT
+ return av_weight * CORPUS_VBR_MIDPOINT;
+#else
return (twopass->total_stats.coded_error * av_weight) /
twopass->total_stats.count;
+#endif
}
// Calculate a modified Error used in distributing bits between easier and
@@ -1686,7 +1695,7 @@ void calculate_coded_size(VP9_COMP *cpi, int *scaled_frame_width,
void vp9_init_second_pass(VP9_COMP *cpi) {
SVC *const svc = &cpi->svc;
- const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ VP9EncoderConfig *const oxcf = &cpi->oxcf;
const int is_two_pass_svc =
(svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1);
RATE_CONTROL *const rc = &cpi->rc;
@@ -1706,28 +1715,6 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
*stats = *twopass->stats_in_end;
twopass->total_left_stats = *stats;
- frame_rate = 10000000.0 * stats->count / stats->duration;
- // Each frame can have a different duration, as the frame rate in the source
- // isn't guaranteed to be constant. The frame rate prior to the first frame
- // encoded in the second pass is a guess. However, the sum duration is not.
- // It is calculated based on the actual durations of all frames from the
- // first pass.
-
- if (is_two_pass_svc) {
- vp9_update_spatial_layer_framerate(cpi, frame_rate);
- twopass->bits_left =
- (int64_t)(stats->duration *
- svc->layer_context[svc->spatial_layer_id].target_bandwidth /
- 10000000.0);
- } else {
- vp9_new_framerate(cpi, frame_rate);
- twopass->bits_left =
- (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0);
- }
-
- // This variable monitors how far behind the second ref update is lagging.
- twopass->sr_update_lag = 1;
-
// Scan the first pass file and calculate a modified score for each
// frame that is used to distribute bits. The modified score is assumed
// to provide a linear basis for bit allocation. I.e a frame A with a score
@@ -1737,6 +1724,9 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
const FIRSTPASS_STATS *s = twopass->stats_in;
const double av_err = get_distribution_av_err(twopass);
+#ifdef CORPUS_VBR_EXPERIMENT
+ twopass->mean_mod_score = CORPUS_VBR_MIDPOINT;
+#else
// The first scan is unclamped and gives a raw average.
while (s < twopass->stats_in_end) {
modified_score_total += calculate_mod_frame_score(cpi, oxcf, s, av_err);
@@ -1747,6 +1737,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
// error for the rate distribution function.
twopass->mean_mod_score =
modified_score_total / DOUBLE_DIVIDE_CHECK(stats->count);
+#endif
// Second scan using clamps based on the previous cycle average.
// This may modify the total and average somewhat but we don't bother with
@@ -1759,8 +1750,47 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
++s;
}
twopass->normalized_score_left = modified_score_total;
+
+#ifdef CORPUS_VBR_EXPERIMENT
+ // If using Corpus wide VBR mode then update the clip target bandwidth.
+ oxcf->target_bandwidth =
+ (int64_t)((double)oxcf->target_bandwidth *
+ (twopass->normalized_score_left / stats->count));
+#endif
+
+#if COMPLEXITY_STATS_OUTPUT
+ {
+ FILE *compstats;
+ compstats = fopen("complexity_stats.stt", "a");
+ fprintf(compstats, "%10.3lf\n",
+ twopass->normalized_score_left / stats->count);
+ fclose(compstats);
+ }
+#endif
}
+ frame_rate = 10000000.0 * stats->count / stats->duration;
+ // Each frame can have a different duration, as the frame rate in the source
+ // isn't guaranteed to be constant. The frame rate prior to the first frame
+ // encoded in the second pass is a guess. However, the sum duration is not.
+ // It is calculated based on the actual durations of all frames from the
+ // first pass.
+
+ if (is_two_pass_svc) {
+ vp9_update_spatial_layer_framerate(cpi, frame_rate);
+ twopass->bits_left =
+ (int64_t)(stats->duration *
+ svc->layer_context[svc->spatial_layer_id].target_bandwidth /
+ 10000000.0);
+ } else {
+ vp9_new_framerate(cpi, frame_rate);
+ twopass->bits_left =
+ (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0);
+ }
+
+ // This variable monitors how far behind the second ref update is lagging.
+ twopass->sr_update_lag = 1;
+
// Reset the vbr bits off target counters
rc->vbr_bits_off_target = 0;
rc->vbr_bits_off_target_fast = 0;
@@ -2155,6 +2185,28 @@ static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) {
arf_buffer_indices[1] = ARF_SLOT2;
}
+#ifdef CORPUS_VBR_EXPERIMENT
+// Calculates the total normalized group complexity score for a given number
+// of frames starting at the current position in the stats file.
+static double calculate_group_score(VP9_COMP *cpi, double av_score,
+ int frame_count) {
+ VP9EncoderConfig *const oxcf = &cpi->oxcf;
+ TWO_PASS *const twopass = &cpi->twopass;
+ const FIRSTPASS_STATS *s = twopass->stats_in;
+ double score_total = 0.0;
+ int i = 0;
+
+ while ((i < frame_count) && (s < twopass->stats_in_end)) {
+ score_total += calculate_norm_frame_score(cpi, twopass, oxcf, s, av_score);
+ ++s;
+ ++i;
+ }
+ assert(i == frame_count);
+
+ return score_total;
+}
+#endif
+
static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
int gf_arf_bits) {
RATE_CONTROL *const rc = &cpi->rc;
@@ -2175,8 +2227,13 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1;
int normal_frames;
int normal_frame_bits;
- int last_frame_bits;
- int last_frame_reduction;
+ int last_frame_reduction = 0;
+
+#ifdef CORPUS_VBR_EXPERIMENT
+ double av_score = get_distribution_av_err(twopass);
+ double tot_norm_frame_score;
+ double this_frame_score;
+#endif
// Only encode alt reference frame in temporal base layer.
if (has_temporal_layers) alt_frame_index = cpi->svc.number_temporal_layers;
@@ -2249,17 +2306,17 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending);
+#ifndef CORPUS_VBR_EXPERIMENT
// The last frame in the group is used less as a predictor so reduce
// its allocation a little.
if (normal_frames > 1) {
normal_frame_bits = (int)(total_group_bits / normal_frames);
- last_frame_reduction = normal_frame_bits / 16;
- last_frame_bits = normal_frame_bits - last_frame_reduction;
} else {
normal_frame_bits = (int)total_group_bits;
- last_frame_bits = normal_frame_bits;
- last_frame_reduction = 0;
}
+#else
+ tot_norm_frame_score = calculate_group_score(cpi, av_score, normal_frames);
+#endif
// Allocate bits to the other frames in the group.
for (i = 0; i < normal_frames; ++i) {
@@ -2270,11 +2327,18 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
++frame_index;
}
- target_frame_size = (i == (normal_frames - 1))
- ? last_frame_bits
- : (frame_index == mid_frame_idx)
- ? normal_frame_bits + last_frame_reduction
- : normal_frame_bits;
+#ifdef CORPUS_VBR_EXPERIMENT
+ this_frame_score = calculate_norm_frame_score(cpi, twopass, &cpi->oxcf,
+ &frame_stats, av_score);
+ normal_frame_bits = (int)((double)total_group_bits *
+ (this_frame_score / tot_norm_frame_score));
+#endif
+
+ target_frame_size = normal_frame_bits;
+ if ((i == (normal_frames - 1)) && (i >= 1)) {
+ last_frame_reduction = normal_frame_bits / 16;
+ target_frame_size -= last_frame_reduction;
+ }
if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) {
mid_boost_bits += (target_frame_size >> 4);
@@ -2295,6 +2359,9 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
++frame_index;
}
+ // Add in some extra bits for the middle frame in the group.
+ gf_group->bit_allocation[mid_frame_idx] += last_frame_reduction;
+
// Note:
// We need to configure the frame at the end of the sequence + 1 that will be
// the start frame for the next group. Otherwise prior to the call to
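
Under CORPUS_VBR_EXPERIMENT, allocate_gf_group_bits switches from an even split (minus the last-frame reduction) to a complexity-proportional split: each frame's share is total_group_bits scaled by its normalized score over the group total. A worked sketch with made-up numbers:

/* Hedged illustration of the corpus-VBR split above; all values are
 * hypothetical. A frame carrying 1/3 of the group's normalized score
 * receives 1/3 of the group's bits. */
#include <stdio.h>
int main(void) {
  const double total_group_bits = 240000.0;
  const double frame_scores[4] = { 2.0, 4.0, 3.0, 3.0 };
  const double group_score = 2.0 + 4.0 + 3.0 + 3.0; /* tot_norm_frame_score */
  int i;
  for (i = 0; i < 4; ++i) {
    const int bits = (int)(total_group_bits * (frame_scores[i] / group_score));
    printf("frame %d -> %d bits\n", i, bits); /* 40000 80000 60000 60000 */
  }
  return 0;
}
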
diff --git a/vp9/encoder/vp9_frame_scale.c b/vp9/encoder/vp9_frame_scale.c
index 832df18c8..a410d0407 100644
--- a/vp9/encoder/vp9_frame_scale.c
+++ b/vp9/encoder/vp9_frame_scale.c
@@ -28,7 +28,7 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
const InterpKernel *const kernel = vp9_filter_kernels[filter_type];
int x, y, i;
-#if HAVE_NEON
+#if HAVE_SSSE3 || HAVE_NEON
// TODO(linfengz): The 4:3 specialized C code is disabled by default since
// it's much slower than the general version which calls vpx_scaled_2d() even
// if vpx_scaled_2d() is not optimized. It will only be enabled as a reference
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 8c71beaff..73d78a30c 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1970,9 +1970,11 @@ void vp9_set_target_rate(VP9_COMP *cpi) {
else
target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
+#ifndef CORPUS_VBR_EXPERIMENT
// Correction to rate target based on prior over or under shoot.
if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ)
vbr_rate_correction(cpi, &target_rate);
+#endif
vp9_rc_set_frame_target(cpi, target_rate);
}
@@ -2119,7 +2121,7 @@ static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi,
uint64_t avg_source_sad_lag = avg_sad_current;
int high_source_sad_lagindex = -1;
int steady_sad_lagindex = -1;
- uint32_t sad_thresh1 = 60000;
+ uint32_t sad_thresh1 = 70000;
uint32_t sad_thresh2 = 120000;
int low_content = 0;
int high_content = 0;
@@ -2280,8 +2282,10 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
uint64_t avg_sad_current = 0;
uint32_t min_thresh = 4000;
float thresh = 8.0f;
+ uint32_t thresh_key = 140000;
+ if (cpi->oxcf.speed <= 5) thresh_key = 240000;
if (cpi->oxcf.rc_mode == VPX_VBR) {
- min_thresh = 70000;
+ min_thresh = 65000;
thresh = 2.1f;
}
if (cpi->oxcf.lag_in_frames > 0) {
@@ -2307,7 +2311,7 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
rc->high_source_sad = 1;
else
rc->high_source_sad = 0;
- if (rc->high_source_sad && avg_sad_current > min_thresh << 1)
+ if (rc->high_source_sad && avg_sad_current > thresh_key)
scene_cut_force_key_frame = 1;
// Update recursive average for current frame.
if (avg_sad_current > 0)
@@ -2369,7 +2373,7 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
rc->high_source_sad = 1;
else
rc->high_source_sad = 0;
- if (rc->high_source_sad && avg_sad > min_thresh << 1)
+ if (rc->high_source_sad && avg_sad > thresh_key)
scene_cut_force_key_frame = 1;
if (avg_sad > 0 || cpi->oxcf.rc_mode == VPX_CBR)
rc->avg_source_sad[0] = (3 * rc->avg_source_sad[0] + avg_sad) >> 2;
@@ -2402,8 +2406,7 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
cpi->ext_refresh_frame_flags_pending == 0) {
int target;
cpi->refresh_golden_frame = 1;
- if (cpi->oxcf.speed >= 6 && scene_cut_force_key_frame)
- cm->frame_type = KEY_FRAME;
+ if (scene_cut_force_key_frame) cm->frame_type = KEY_FRAME;
rc->source_alt_ref_pending = 0;
if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf)
rc->source_alt_ref_pending = 1;
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index bdae75542..f851e4286 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -24,6 +24,9 @@ extern "C" {
// Used to control aggressive VBR mode.
// #define AGGRESSIVE_VBR 1
+// Used to control the Corpus VBR experiment.
+// #define CORPUS_VBR_EXPERIMENT 1
+
// Bits Per MB at different Q (Multiplied by 512)
#define BPER_MB_NORMBITS 9
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 4d4a579e6..e5499d6dd 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -225,7 +225,11 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
}
if (speed >= 2) {
+#ifdef CORPUS_VBR_EXPERIMENT
+ sf->recode_loop = ALLOW_RECODE_FIRST;
+#else
sf->recode_loop = ALLOW_RECODE_KFARFGF;
+#endif
sf->tx_size_search_method =
frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL;
@@ -366,6 +370,7 @@ static void set_rt_speed_feature_framesize_independent(
sf->use_simple_block_yrd = 0;
sf->adapt_partition_source_sad = 0;
sf->use_altref_onepass = 0;
+ sf->nonrd_keyframe = 0;
if (speed >= 1) {
sf->allow_txfm_domain_distortion = 1;
@@ -598,6 +603,7 @@ static void set_rt_speed_feature_framesize_independent(
if (speed >= 8) {
sf->adaptive_rd_thresh = 4;
sf->skip_encode_sb = 1;
+ sf->nonrd_keyframe = 1;
if (!cpi->use_svc) cpi->max_copied_frame = 4;
if (cpi->row_mt && cpi->oxcf.max_threads > 1)
sf->adaptive_rd_thresh_row_mt = 1;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 517369dae..9e5bf9a24 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -499,6 +499,9 @@ typedef struct SPEED_FEATURES {
// Enable use of alt-refs in 1 pass VBR.
int use_altref_onepass;
+
+ // Always use nonrd_pick_intra for all block sizes on keyframes.
+ int nonrd_keyframe;
} SPEED_FEATURES;
struct VP9_COMP;
diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
index 81e5b4229..7685e7bc3 100644
--- a/vp9/encoder/x86/vp9_frame_scale_ssse3.c
+++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -438,6 +438,202 @@ static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
} while (x);
}
+typedef void (*shuffle_filter_funcs)(const int16_t *const filter,
+ __m128i *const f);
+
+typedef __m128i (*convolve8_funcs)(const __m128i *const s,
+ const __m128i *const f);
+
+static void scale_plane_4_to_3_general(const uint8_t *src, const int src_stride,
+ uint8_t *dst, const int dst_stride,
+ const int w, const int h,
+ const InterpKernel *const coef,
+ const int phase_scaler,
+ uint8_t *const temp_buffer) {
+ static const int step_q4 = 16 * 4 / 3;
+ const int width_hor = (w + 5) - ((w + 5) % 6);
+ const int stride_hor = 2 * width_hor + 4; // store 4 extra pixels
+ const int width_ver = (w + 7) & ~7;
+ // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
+ // above and (SUBPEL_TAPS / 2) extra rows below.
+ const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+ const int height_ver = (h + 5) - ((h + 5) % 6);
+ int x, y = height_hor;
+ uint8_t *t = temp_buffer;
+ __m128i s[12], d[6], dd[4];
+ __m128i f0[4], f1[5], f2[5];
+ // The offset of the first row is always less than 1 pixel.
+ const int offset1_q4 = phase_scaler + 1 * step_q4;
+ const int offset2_q4 = phase_scaler + 2 * step_q4;
+ // offset_idxx indicates the pixel offset is even (0) or odd (1).
+ // It's used to choose the src offset and filter coefficient offset.
+ const int offset_idx1 = (offset1_q4 >> 4) & 1;
+ const int offset_idx2 = (offset2_q4 >> 4) & 1;
+ static const shuffle_filter_funcs shuffle_filter_funcs[2] = {
+ shuffle_filter_ssse3, shuffle_filter_odd_ssse3
+ };
+ static const convolve8_funcs convolve8_funcs[2] = {
+ convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3
+ };
+
+ assert(w && h);
+
+ shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0);
+ shuffle_filter_funcs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1);
+ shuffle_filter_funcs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2);
+
+ // Sub 64 to avoid overflow.
+ // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here.
+ // Coef 128 is in either fx[1] or fx[2] depending on the phase idx.
+ // When the filter phase idx is 1, the two biggest coefficients are shuffled
+ // together, and their sum is always at least 128. Sub 64 here.
+ // After the subtraction, when the sum of all positive coefficients is no
+ // larger than 128, and the sum of all negative coefficients is no
+ // less than -128, there will be no overflow in the convolve8 functions.
+ f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64));
+ f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64));
+ f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64));
+
+ src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1;
+
+ // horizontal 6x8
+ do {
+ load_8bit_8x8(src, src_stride, s);
+ // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
+ // 02 03 12 13 22 23 32 33 42 43 52 53 62 63 72 73
+ // 04 05 14 15 24 25 34 35 44 45 54 55 64 65 74 75
+ // 06 07 16 17 26 27 36 37 46 47 56 57 66 67 76 77
+ transpose_16bit_4x8(s, s);
+ x = width_hor;
+
+ do {
+ src += 8;
+ load_8bit_8x8(src, src_stride, &s[4]);
+ // 08 09 18 19 28 29 38 39 48 49 58 59 68 69 78 79
+ // 0A 0B 1A 1B 2A 2B 3A 3B 4A 4B 5A 5B 6A 6B 7A 7B
+ // 0C 0D 1C 1D 2C 2D 3C 3D 4C 4D 5C 5D 6C 6D 7C 7D
+ // 0E 0F 1E 1F 2E 2F 3E 3F 4E 4F 5E 5F 6E 6F 7E 7F
+ transpose_16bit_4x8(&s[4], &s[4]);
+
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
+ d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
+ d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
+ d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
+ d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+ d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+
+ // 00 10 20 30 40 50 60 70 02 12 22 32 42 52 62 72
+ // 01 11 21 31 41 51 61 71 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74 xx xx xx xx xx xx xx xx
+ // 05 15 25 35 45 55 65 75 xx xx xx xx xx xx xx xx
+ dd[0] = _mm_packus_epi16(d[0], d[2]);
+ dd[1] = _mm_packus_epi16(d[1], d[3]);
+ dd[2] = _mm_packus_epi16(d[4], d[4]);
+ dd[3] = _mm_packus_epi16(d[5], d[5]);
+
+ // 00 10 01 11 20 30 21 31 40 50 41 51 60 70 61 71
+ // 02 12 03 13 22 32 23 33 42 52 43 53 62 72 63 73
+ // 04 14 05 15 24 34 25 35 44 54 45 55 64 74 65 75
+ d[0] = _mm_unpacklo_epi16(dd[0], dd[1]);
+ d[1] = _mm_unpackhi_epi16(dd[0], dd[1]);
+ d[2] = _mm_unpacklo_epi16(dd[2], dd[3]);
+
+ // 00 10 01 11 02 12 03 13 20 30 21 31 22 32 23 33
+ // 40 50 41 51 42 52 43 53 60 70 61 71 62 72 63 73
+ // 04 14 05 15 xx xx xx xx 24 34 25 35 xx xx xx xx
+ // 44 54 45 55 xx xx xx xx 64 74 65 75 xx xx xx xx
+ dd[0] = _mm_unpacklo_epi32(d[0], d[1]);
+ dd[1] = _mm_unpackhi_epi32(d[0], d[1]);
+ dd[2] = _mm_unpacklo_epi32(d[2], d[2]);
+ dd[3] = _mm_unpackhi_epi32(d[2], d[2]);
+
+ // 00 10 01 11 02 12 03 13 04 14 05 15 xx xx xx xx
+ // 20 30 21 31 22 32 23 33 24 34 25 35 xx xx xx xx
+ // 40 50 41 51 42 52 43 53 44 54 45 55 xx xx xx xx
+ // 60 70 61 71 62 72 63 73 64 74 65 75 xx xx xx xx
+ d[0] = _mm_unpacklo_epi64(dd[0], dd[2]);
+ d[1] = _mm_unpackhi_epi64(dd[0], dd[2]);
+ d[2] = _mm_unpacklo_epi64(dd[1], dd[3]);
+ d[3] = _mm_unpackhi_epi64(dd[1], dd[3]);
+
+ // store 4 extra pixels
+ storeu_8bit_16x4(d, t, stride_hor);
+
+ s[0] = s[4];
+ s[1] = s[5];
+ s[2] = s[6];
+ s[3] = s[7];
+
+ t += 12;
+ x -= 6;
+ } while (x);
+ src += 8 * src_stride - 4 * width_hor / 3;
+ t += 3 * stride_hor + 4;
+ y -= 8;
+ } while (y);
+
+ // vertical 8x6
+ x = width_ver;
+ t = temp_buffer;
+ do {
+ // 00 10 01 11 02 12 03 13 04 14 05 15 06 16 07 17
+ // 20 30 21 31 22 32 23 33 24 34 25 35 26 36 27 37
+ // 40 50 41 51 42 52 43 53 44 54 45 55 46 56 47 57
+ // 60 70 61 71 62 72 63 73 64 74 65 75 66 76 67 77
+ loadu_8bit_16x4(t, stride_hor, s);
+ y = height_ver;
+
+ do {
+ // 80 90 81 91 82 92 83 93 84 94 85 95 86 96 87 97
+ // A0 B0 A1 B1 A2 B2 A3 B3 A4 B4 A5 B5 A6 B6 A7 B7
+ // C0 D0 C1 D1 C2 D2 C3 D3 C4 D4 C5 D5 C6 D6 C7 D7
+ // E0 F0 E1 F1 E2 F2 E3 F3 E4 F4 E5 F5 E6 F6 E7 F7
+ t += 4 * stride_hor;
+ loadu_8bit_16x4(t, stride_hor, &s[4]);
+
+ d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
+ d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
+ d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
+ d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
+ d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+ d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+
+ // 00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17
+ // 20 21 22 23 24 25 26 27 30 31 32 33 34 35 36 37
+ // 40 41 42 43 44 45 46 47 50 51 52 53 54 55 56 57
+ d[0] = _mm_packus_epi16(d[0], d[1]);
+ d[2] = _mm_packus_epi16(d[2], d[3]);
+ d[4] = _mm_packus_epi16(d[4], d[5]);
+
+ _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
+ _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
+ _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]);
+ _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]);
+ _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]);
+ _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]);
+
+ s[0] = s[4];
+ s[1] = s[5];
+ s[2] = s[6];
+ s[3] = s[7];
+
+ dst += 6 * dst_stride;
+ y -= 6;
+ } while (y);
+ t -= stride_hor * 2 * height_ver / 3;
+ t += 16;
+ dst -= height_ver * dst_stride;
+ dst += 8;
+ x -= 8;
+ } while (x);
+}
+
static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s,
const __m128i *const f) {
__m128i ss[4], temp;
@@ -652,6 +848,36 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
scaled = 0;
}
}
+ } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
+ // 4 to 3
+ const int buffer_stride_hor = (dst_w + 5) - ((dst_w + 5) % 6) + 2;
+ const int buffer_stride_ver = (dst_w + 7) & ~7;
+ const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+ // When the vertical filter reads more pixels than the horizontal filter
+ // generated in each row, we need extra padding to avoid heap read overflow.
+ // For example, the horizontal filter generates 18 pixels but the vertical
+ // filter reads 24 pixels in a row. The difference is multiplied by 2 since
+ // two rows are interlaced together in the optimization.
+ const int extra_padding = (buffer_stride_ver > buffer_stride_hor)
+ ? 2 * (buffer_stride_ver - buffer_stride_hor)
+ : 0;
+ const int buffer_size = buffer_stride_hor * buffer_height + extra_padding;
+ uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size);
+ if (temp_buffer) {
+ scaled = 1;
+ scale_plane_4_to_3_general(
+ src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+ dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer);
+ scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h,
+ vp9_filter_kernels[filter_type], phase_scaler,
+ temp_buffer);
+ scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer,
+ dst->uv_stride, dst_uv_w, dst_uv_h,
+ vp9_filter_kernels[filter_type], phase_scaler,
+ temp_buffer);
+ free(temp_buffer);
+ }
} else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) {
// 1 to 2
uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_w + 7) & ~7));
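
The 4:3 buffer sizing above interlaces two rows per stride line, which is why extra_padding is doubled. A hedged arithmetic check for a hypothetical 24x24 -> 18x18 scale (dst_w = dst_h = 18, SUBPEL_TAPS = 8):

#include <assert.h>
int main(void) {
  const int dst_w = 18, dst_h = 18, subpel_taps = 8;
  const int stride_hor = (dst_w + 5) - ((dst_w + 5) % 6) + 2;       /* 20 */
  const int stride_ver = (dst_w + 7) & ~7;                          /* 24 */
  const int height = (4 * dst_h / 3 + subpel_taps - 1 + 7) & ~7;    /* 32 */
  const int pad = (stride_ver > stride_hor) ? 2 * (stride_ver - stride_hor)
                                            : 0;                    /* 8 */
  /* The vertical pass reads 24 interlaced pixels per line where the
   * horizontal pass wrote 20; the 8 bytes of padding absorb the overread. */
  assert(stride_hor * height + pad == 648);
  return 0;
}
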
diff --git a/vpx_dsp/x86/convolve_ssse3.h b/vpx_dsp/x86/convolve_ssse3.h
index b71da0e4e..8da28f0b2 100644
--- a/vpx_dsp/x86/convolve_ssse3.h
+++ b/vpx_dsp/x86/convolve_ssse3.h
@@ -11,6 +11,7 @@
#ifndef VPX_DSP_X86_CONVOLVE_SSSE3_H_
#define VPX_DSP_X86_CONVOLVE_SSSE3_H_
+#include <assert.h>
#include <tmmintrin.h> // SSSE3
#include "./vpx_config.h"
@@ -25,6 +26,20 @@ static INLINE void shuffle_filter_ssse3(const int16_t *const filter,
f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
}
+static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter,
+ __m128i *const f) {
+ const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+ // pack and duplicate the filter values
+ // It utilizes the fact that the high byte of filter[3] is always 0 to clean
+ // half of f[0] and f[4].
+ assert(filter[3] >= 0 && filter[3] < 256);
+ f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u));
+ f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u));
+ f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u));
+ f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au));
+ f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu));
+}
+
static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
const __m128i *const f) {
// multiply 2 adjacent elements with the filter and add the result
@@ -45,4 +60,50 @@ static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
return temp;
}
+static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
+ const __m128i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+ const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+ const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+ const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+ // compensate for the subtracted 64 in f[1]. x4 is always non-negative.
+ const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64));
+ // add and saturate the results together
+ __m128i temp = _mm_adds_epi16(x0, x3);
+ temp = _mm_adds_epi16(temp, x1);
+ temp = _mm_adds_epi16(temp, x2);
+ temp = _mm_adds_epi16(temp, x4);
+ // round and shift by 7 bit each 16 bit
+ temp = _mm_adds_epi16(temp, k_64);
+ temp = _mm_srai_epi16(temp, 7);
+ return temp;
+}
+
+static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s,
+ const __m128i *const f) {
+ // multiply 2 adjacent elements with the filter and add the result
+ const __m128i k_64 = _mm_set1_epi16(1 << 6);
+ const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+ const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+ const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+ const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+ const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]);
+ // compensate for the subtracted 64 in f[2]. x5 is always non-negative.
+ const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64));
+ __m128i temp;
+
+ // add and saturate the results together
+ temp = _mm_adds_epi16(x0, x1);
+ temp = _mm_adds_epi16(temp, x2);
+ temp = _mm_adds_epi16(temp, x3);
+ temp = _mm_adds_epi16(temp, x4);
+ temp = _mm_adds_epi16(temp, x5);
+ // round and shift by 7 bit each 16 bit
+ temp = _mm_adds_epi16(temp, k_64);
+ temp = _mm_srai_epi16(temp, 7);
+ return temp;
+}
+
#endif // VPX_DSP_X86_CONVOLVE_SSSE3_H_
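
The 64-subtraction in the helpers above exists because _mm_maddubs_epi16 interprets the filter bytes as signed, so a tap of 128 (or a shuffled pair of taps summing past 127) cannot be represented directly; the helpers peel 64 off the big tap and add it back with a separate madd against a constant-64 vector. The identity being exploited, as a scalar sketch with a hypothetical tap value:

#include <assert.h>
int main(void) {
  const int s = 200;  /* unsigned 8-bit source pixel */
  const int f = 128;  /* filter tap that does not fit in a signed byte */
  /* s*f == s*(f - 64) + s*64, and f - 64 = 64 now fits in [-128, 127]. */
  assert(s * (f - 64) + s * 64 == s * f);
  return 0;
}
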
diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h
index f9f0a48a0..2ce738fb7 100644
--- a/vpx_dsp/x86/mem_sse2.h
+++ b/vpx_dsp/x86/mem_sse2.h
@@ -113,4 +113,12 @@ static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
_mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]);
}
+static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
+ const ptrdiff_t stride) {
+ _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]);
+ _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]);
+ _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]);
+ _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]);
+}
+
#endif // VPX_DSP_X86_MEM_SSE2_H_