-rw-r--r--  test/convolve_test.cc                    |  52
-rw-r--r--  test/vp8_fdct4x4_test.cc                 |   4
-rw-r--r--  test/vpx_scale_test.h                    |  28
-rw-r--r--  vp8/common/mips/mmi/idct_blk_mmi.c       |  71
-rw-r--r--  vp8/common/rtcd_defs.pl                  |  10
-rw-r--r--  vp8/encoder/mips/mmi/dct_mmi.c           | 426
-rw-r--r--  vp8/vp8_common.mk                        |   1
-rw-r--r--  vp8/vp8cx.mk                             |   1
-rw-r--r--  vp9/encoder/vp9_encodeframe.c            |   2
-rw-r--r--  vp9/encoder/vp9_firstpass.c              | 135
-rw-r--r--  vp9/encoder/vp9_frame_scale.c            |   2
-rw-r--r--  vp9/encoder/vp9_ratectrl.c               |  15
-rw-r--r--  vp9/encoder/vp9_ratectrl.h               |   3
-rw-r--r--  vp9/encoder/vp9_speed_features.c         |   6
-rw-r--r--  vp9/encoder/vp9_speed_features.h         |   3
-rw-r--r--  vp9/encoder/x86/vp9_frame_scale_ssse3.c  | 226
-rw-r--r--  vpx_dsp/x86/convolve_ssse3.h             |  61
-rw-r--r--  vpx_dsp/x86/mem_sse2.h                   |   8
18 files changed, 983 insertions(+), 71 deletions(-)
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 9f6f795c9..08ef57224 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -925,33 +925,51 @@ TEST_P(ConvolveTest, FilterExtremes) {
 
 /* This test exercises that enough rows and columns are filtered with every
    possible initial fractional positions and scaling steps. */
+#if !CONFIG_VP9_HIGHBITDEPTH
+static const ConvolveFunc scaled_2d_c_funcs[2] = { vpx_scaled_2d_c,
+                                                   vpx_scaled_avg_2d_c };
+
 TEST_P(ConvolveTest, CheckScalingFiltering) {
   uint8_t *const in = input();
   uint8_t *const out = output();
-  const InterpKernel *const eighttap = vp9_filter_kernels[EIGHTTAP];
+  uint8_t ref[kOutputStride * kMaxDimension];
 
-  SetConstantInput(127);
+  ::libvpx_test::ACMRandom prng;
+  for (int y = 0; y < Height(); ++y) {
+    for (int x = 0; x < Width(); ++x) {
+      const uint16_t r = prng.Rand8Extremes();
+      assign_val(in, y * kInputStride + x, r);
+    }
+  }
 
-  for (int frac = 0; frac < 16; ++frac) {
-    for (int step = 1; step <= 32; ++step) {
-      /* Test the horizontal and vertical filters in combination. */
-      ASM_REGISTER_STATE_CHECK(
-          UUT_->shv8_[0](in, kInputStride, out, kOutputStride, eighttap, frac,
-                         step, frac, step, Width(), Height()));
-
-      CheckGuardBlocks();
-
-      for (int y = 0; y < Height(); ++y) {
-        for (int x = 0; x < Width(); ++x) {
-          ASSERT_EQ(lookup(in, y * kInputStride + x),
-                    lookup(out, y * kOutputStride + x))
-              << "x == " << x << ", y == " << y << ", frac == " << frac
-              << ", step == " << step;
+  for (int i = 0; i < 2; ++i) {
+    for (INTERP_FILTER filter_type = 0; filter_type < 4; ++filter_type) {
+      const InterpKernel *const eighttap = vp9_filter_kernels[filter_type];
+      for (int frac = 0; frac < 16; ++frac) {
+        for (int step = 1; step <= 32; ++step) {
+          /* Test the horizontal and vertical filters in combination. */
+          scaled_2d_c_funcs[i](in, kInputStride, ref, kOutputStride, eighttap,
+                               frac, step, frac, step, Width(), Height());
+          ASM_REGISTER_STATE_CHECK(
+              UUT_->shv8_[i](in, kInputStride, out, kOutputStride, eighttap,
+                             frac, step, frac, step, Width(), Height()));
+
+          CheckGuardBlocks();
+
+          for (int y = 0; y < Height(); ++y) {
+            for (int x = 0; x < Width(); ++x) {
+              ASSERT_EQ(lookup(ref, y * kOutputStride + x),
+                        lookup(out, y * kOutputStride + x))
+                  << "x == " << x << ", y == " << y << ", frac == " << frac
+                  << ", step == " << step;
+            }
+          }
         }
       }
     }
   }
 }
+#endif
 
 using std::tr1::make_tuple;
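Note on the frac/step pair exercised above: the scaled convolve functions take an initial subpel phase and a per-output-pixel step, both in 1/16th-pel (q4) units, so step == 16 means no scaling and step == 21 is roughly a 4-to-3 downscale. A minimal standalone sketch of how each output pixel maps back to a source column and filter phase (map_output_to_source is a hypothetical helper, not libvpx code):

#include <stdio.h>

/* Hypothetical illustration of q4 phase stepping. */
static void map_output_to_source(int frac, int step, int out_w) {
  int x;
  for (x = 0; x < out_w; ++x) {
    const int x_q4 = frac + x * step; /* position in 1/16th pels */
    printf("out %2d <- src col %2d, filter phase %2d\n", x, x_q4 >> 4,
           x_q4 & 15);
  }
}

int main(void) {
  map_output_to_source(8, 21, 6); /* e.g. initial phase 8, 4:3 step */
  return 0;
}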
diff --git a/test/vp8_fdct4x4_test.cc b/test/vp8_fdct4x4_test.cc
index 9f69ae164..b7697d859 100644
--- a/test/vp8_fdct4x4_test.cc
+++ b/test/vp8_fdct4x4_test.cc
@@ -199,4 +199,8 @@ INSTANTIATE_TEST_CASE_P(SSE2, FdctTest,
 INSTANTIATE_TEST_CASE_P(MSA, FdctTest,
                         ::testing::Values(vp8_short_fdct4x4_msa));
 #endif  // HAVE_MSA
+#if HAVE_MMI
+INSTANTIATE_TEST_CASE_P(MMI, FdctTest,
+                        ::testing::Values(vp8_short_fdct4x4_mmi));
+#endif  // HAVE_MMI
 }  // namespace
diff --git a/test/vpx_scale_test.h b/test/vpx_scale_test.h
index 18909d1b5..dcbd02b91 100644
--- a/test/vpx_scale_test.h
+++ b/test/vpx_scale_test.h
@@ -15,11 +15,14 @@
 #include "./vpx_config.h"
 #include "./vpx_scale_rtcd.h"
 
+#include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_scale/yv12config.h"
 
+using libvpx_test::ACMRandom;
+
 namespace libvpx_test {
 
 class VpxScaleBase {
@@ -65,12 +68,12 @@ class VpxScaleBase {
     ResetScaleImage(&img_, src_width, src_height);
     ResetScaleImage(&ref_img_, dst_width, dst_height);
     ResetScaleImage(&dst_img_, dst_width, dst_height);
-    FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height,
-              img_.y_stride);
-    FillPlane(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height,
-              img_.uv_stride);
-    FillPlane(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height,
-              img_.uv_stride);
+    FillPlaneExtreme(img_.y_buffer, img_.y_crop_width, img_.y_crop_height,
+                     img_.y_stride);
+    FillPlaneExtreme(img_.u_buffer, img_.uv_crop_width, img_.uv_crop_height,
+                     img_.uv_stride);
+    FillPlaneExtreme(img_.v_buffer, img_.uv_crop_width, img_.uv_crop_height,
+                     img_.uv_stride);
   }
 
   void DeallocImages() {
@@ -89,7 +92,8 @@ class VpxScaleBase {
   static const int kBufFiller = 123;
   static const int kBufMax = kBufFiller - 1;
 
-  static void FillPlane(uint8_t *buf, int width, int height, int stride) {
+  static void FillPlane(uint8_t *const buf, const int width, const int height,
+                        const int stride) {
     for (int y = 0; y < height; ++y) {
       for (int x = 0; x < width; ++x) {
         buf[x + (y * stride)] = (x + (width * y)) % kBufMax;
@@ -97,6 +101,16 @@ class VpxScaleBase {
     }
   }
 
+  static void FillPlaneExtreme(uint8_t *const buf, const int width,
+                               const int height, const int stride) {
+    ACMRandom rnd;
+    for (int y = 0; y < height; ++y) {
+      for (int x = 0; x < width; ++x) {
+        buf[x + (y * stride)] = rnd.Rand8() % 2 ? 255 : 0;
+      }
+    }
+  }
+
   static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height,
                           int width, int height, int stride, int padding) {
     // Copy the outermost visible pixel to a distance of at least 'padding.'
diff --git a/vp8/common/mips/mmi/idct_blk_mmi.c b/vp8/common/mips/mmi/idct_blk_mmi.c
new file mode 100644
index 000000000..f6020ab46
--- /dev/null
+++ b/vp8/common/mips/mmi/idct_blk_mmi.c
@@ -0,0 +1,71 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp8_dequant_idct_add_y_block_mmi(int16_t *q, int16_t *dq, uint8_t *dst,
+                                      int stride, int8_t *eobs) {
+  int i, j;
+
+  for (i = 0; i < 4; i++) {
+    for (j = 0; j < 4; j++) {
+      if (*eobs++ > 1) {
+        vp8_dequant_idct_add_mmi(q, dq, dst, stride);
+      } else {
+        vp8_dc_only_idct_add_mmi(q[0] * dq[0], dst, stride, dst, stride);
+        memset(q, 0, 2 * sizeof(q[0]));
+      }
+
+      q += 16;
+      dst += 4;
+    }
+
+    dst += 4 * stride - 16;
+  }
+}
+
+void vp8_dequant_idct_add_uv_block_mmi(int16_t *q, int16_t *dq, uint8_t *dstu,
+                                       uint8_t *dstv, int stride,
+                                       int8_t *eobs) {
+  int i, j;
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      if (*eobs++ > 1) {
+        vp8_dequant_idct_add_mmi(q, dq, dstu, stride);
+      } else {
+        vp8_dc_only_idct_add_mmi(q[0] * dq[0], dstu, stride, dstu, stride);
+        memset(q, 0, 2 * sizeof(q[0]));
+      }
+
+      q += 16;
+      dstu += 4;
+    }
+
+    dstu += 4 * stride - 8;
+  }
+
+  for (i = 0; i < 2; i++) {
+    for (j = 0; j < 2; j++) {
+      if (*eobs++ > 1) {
+        vp8_dequant_idct_add_mmi(q, dq, dstv, stride);
+      } else {
+        vp8_dc_only_idct_add_mmi(q[0] * dq[0], dstv, stride, dstv, stride);
+        memset(q, 0, 2 * sizeof(q[0]));
+      }
+
+      q += 16;
+      dstv += 4;
+    }
+
+    dstv += 4 * stride - 8;
+  }
+}
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index 3bcfdc0d6..ece2785eb 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -28,10 +28,10 @@ add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char
 specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa mmi/;
 
 add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa/;
+specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa mmi/;
 
 add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa/;
+specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa mmi/;
 
 #
 # Loopfilter
@@ -176,13 +176,13 @@ if ($opts{arch} =~ /x86/) {
 # Forward DCT
 #
 add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct4x4 sse2 neon msa/;
+specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi/;
 
 add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct8x4 sse2 neon msa/;
+specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi/;
 
 add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_walsh4x4 sse2 neon msa/;
+specialize qw/vp8_short_walsh4x4 sse2 neon msa mmi/;
 
 #
 # Quantizer
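The specialize lines above feed libvpx's RTCD (run-time CPU detection) generator, which emits a dispatch header at build time. A rough sketch of the kind of dispatch it produces for one of these symbols follows; this is a simplified stand-in, the real generated vp8_rtcd.h differs in detail:

extern void vp8_short_fdct4x4_c(short *input, short *output, int pitch);
extern void vp8_short_fdct4x4_mmi(short *input, short *output, int pitch);

/* Each specialized symbol becomes a function pointer that setup-time code
 * points at the best implementation available on the running CPU. */
void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);

static void setup_rtcd_internal(void) {
  vp8_short_fdct4x4 = vp8_short_fdct4x4_c; /* portable C fallback */
#if HAVE_MMI
  vp8_short_fdct4x4 = vp8_short_fdct4x4_mmi; /* Loongson MMI path from this patch */
#endif
}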
diff --git a/vp8/encoder/mips/mmi/dct_mmi.c b/vp8/encoder/mips/mmi/dct_mmi.c
new file mode 100644
index 000000000..7e45a1278
--- /dev/null
+++ b/vp8/encoder/mips/mmi/dct_mmi.c
@@ -0,0 +1,426 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/asmdefs_mmi.h"
+
+/* clang-format off */
+#define TRANSPOSE_4H \
+  MMI_LI(%[tmp0], 0x93) \
+  "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" \
+  "mtc1 %[tmp0], %[ftmp10] \n\t" \
+  "punpcklhw %[ftmp5], %[ftmp1], %[ftmp0] \n\t" \
+  "punpcklhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \
+  "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+  "or %[ftmp5], %[ftmp5], %[ftmp9] \n\t" \
+  "punpckhhw %[ftmp6], %[ftmp1], %[ftmp0] \n\t" \
+  "punpckhhw %[ftmp9], %[ftmp2], %[ftmp0] \n\t" \
+  "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+  "or %[ftmp6], %[ftmp6], %[ftmp9] \n\t" \
+  "punpcklhw %[ftmp7], %[ftmp3], %[ftmp0] \n\t" \
+  "punpcklhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \
+  "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+  "or %[ftmp7], %[ftmp7], %[ftmp9] \n\t" \
+  "punpckhhw %[ftmp8], %[ftmp3], %[ftmp0] \n\t" \
+  "punpckhhw %[ftmp9], %[ftmp4], %[ftmp0] \n\t" \
+  "pshufh %[ftmp9], %[ftmp9], %[ftmp10] \n\t" \
+  "or %[ftmp8], %[ftmp8], %[ftmp9] \n\t" \
+  "punpcklwd %[ftmp1], %[ftmp5], %[ftmp7] \n\t" \
+  "punpckhwd %[ftmp2], %[ftmp5], %[ftmp7] \n\t" \
+  "punpcklwd %[ftmp3], %[ftmp6], %[ftmp8] \n\t" \
+  "punpckhwd %[ftmp4], %[ftmp6], %[ftmp8] \n\t"
+/* clang-format on */
+
+void vp8_short_fdct4x4_mmi(int16_t *input, int16_t *output, int pitch) {
+  int pitch_half = pitch / 2;
+  uint64_t tmp[1];
+
+#if _MIPS_SIM == _ABIO32
+  register double ftmp0 asm("$f0");
+  register double ftmp1 asm("$f2");
+  register double ftmp2 asm("$f4");
+  register double ftmp3 asm("$f6");
+  register double ftmp4 asm("$f8");
+  register double ftmp5 asm("$f10");
+  register double ftmp6 asm("$f12");
+  register double ftmp7 asm("$f14");
+  register double ftmp8 asm("$f16");
+  register double ftmp9 asm("$f18");
+  register double ftmp10 asm("$f20");
+  register double ftmp11 asm("$f22");
+  register double ftmp12 asm("$f24");
+#else
+  register double ftmp0 asm("$f0");
+  register double ftmp1 asm("$f1");
+  register double ftmp2 asm("$f2");
+  register double ftmp3 asm("$f3");
+  register double ftmp4 asm("$f4");
+  register double ftmp5 asm("$f5");
+  register double ftmp6 asm("$f6");
+  register double ftmp7 asm("$f7");
+  register double ftmp8 asm("$f8");
+  register double ftmp9 asm("$f9");
+  register double ftmp10 asm("$f10");
+  register double ftmp11 asm("$f11");
+  register double ftmp12 asm("$f12");
+#endif  // _MIPS_SIM == _ABIO32
+
+  DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_ph_07) = { 0x0007000700070007ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_pw_12000) = { 0x00002ee000002ee0ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_pw_51000) = { 0x0000c7380000c738ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_pw_14500) = { 0x000038a4000038a4ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_pw_7500) = { 0x00001d4c00001d4cULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_ph_op1) = { 0x14e808a914e808a9ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_ph_op3) = { 0xeb1808a9eb1808a9ULL };
+
+  DECLARE_ALIGNED(16, int, a[4]);
+  DECLARE_ALIGNED(16, int, b[4]);
+  DECLARE_ALIGNED(16, int, c[4]);
+  DECLARE_ALIGNED(16, int, d[4]);
+
+  // stage1
+  a[0] = (input[0] + input[3]) * 8;
+  a[1] = (input[0 + pitch_half] + input[3 + pitch_half]) * 8;
+  a[2] = (input[0 + 2 * pitch_half] + input[3 + 2 * pitch_half]) * 8;
+  a[3] = (input[0 + 3 * pitch_half] + input[3 + 3 * pitch_half]) * 8;
+
+  b[0] = (input[1] + input[2]) * 8;
+  b[1] = (input[1 + pitch_half] + input[2 + pitch_half]) * 8;
+  b[2] = (input[1 + 2 * pitch_half] + input[2 + 2 * pitch_half]) * 8;
+  b[3] = (input[1 + 3 * pitch_half] + input[2 + 3 * pitch_half]) * 8;
+
+  c[0] = (input[1] - input[2]) * 8;
+  c[1] = (input[1 + pitch_half] - input[2 + pitch_half]) * 8;
+  c[2] = (input[1 + 2 * pitch_half] - input[2 + 2 * pitch_half]) * 8;
+  c[3] = (input[1 + 3 * pitch_half] - input[2 + 3 * pitch_half]) * 8;
+
+  d[0] = (input[0] - input[3]) * 8;
+  d[1] = (input[0 + pitch_half] - input[3 + pitch_half]) * 8;
+  d[2] = (input[0 + 2 * pitch_half] - input[3 + 2 * pitch_half]) * 8;
+  d[3] = (input[0 + 3 * pitch_half] - input[3 + 3 * pitch_half]) * 8;
+
+  __asm__ volatile (
+      "gslqc1 %[ftmp2], %[ftmp1], 0x00(%[a]) \n\t"
+      "gslqc1 %[ftmp4], %[ftmp3], 0x00(%[b]) \n\t"
+      "gslqc1 %[ftmp6], %[ftmp5], 0x00(%[c]) \n\t"
+      "gslqc1 %[ftmp8], %[ftmp7], 0x00(%[d]) \n\t"
+
+      "paddw %[ftmp9], %[ftmp1], %[ftmp3] \n\t"
+      "paddw %[ftmp10], %[ftmp2], %[ftmp4] \n\t"
+      "psubw %[ftmp11], %[ftmp1], %[ftmp3] \n\t"
+      "psubw %[ftmp12], %[ftmp2], %[ftmp4] \n\t"
+      "packsswh %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
+      "packsswh %[ftmp3], %[ftmp11], %[ftmp12] \n\t"
+      "packsswh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
+      "packsswh %[ftmp4], %[ftmp7], %[ftmp8] \n\t"
+      MMI_LI(%[tmp0], 0x0c)
+      "mov.d %[ftmp7], %[ftmp2] \n\t"
+      "mov.d %[ftmp8], %[ftmp4] \n\t"
+      "mtc1 %[tmp0], %[ftmp11] \n\t"
+
+      "ldc1 %[ftmp12], %[ff_pw_14500] \n\t"
+      "punpcklhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
+      "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op1] \n\t"
+      "punpckhhw %[ftmp9], %[ftmp7], %[ftmp8] \n\t"
+      "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op1] \n\t"
+      "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t"
+      "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
+      "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
+      "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
+      "packsswh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
+
+      "ldc1 %[ftmp12], %[ff_pw_7500] \n\t"
+      "punpcklhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t"
+      "pmaddhw %[ftmp5], %[ftmp9], %[ff_ph_op3] \n\t"
+      "punpckhhw %[ftmp9], %[ftmp8], %[ftmp7] \n\t"
+      "pmaddhw %[ftmp6], %[ftmp9], %[ff_ph_op3] \n\t"
+      "paddw %[ftmp5], %[ftmp5], %[ftmp12] \n\t"
+      "paddw %[ftmp6], %[ftmp6], %[ftmp12] \n\t"
+      "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
+      "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
+      "packsswh %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
+      TRANSPOSE_4H
+
+      "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+      "paddh %[ftmp5], %[ftmp1], %[ftmp4] \n\t"
+      "paddh %[ftmp6], %[ftmp2], %[ftmp3] \n\t"
+      "psubh %[ftmp7], %[ftmp2], %[ftmp3] \n\t"
+      "psubh %[ftmp8], %[ftmp1], %[ftmp4] \n\t"
+
+      "pcmpeqh %[ftmp0], %[ftmp8], %[ftmp0] \n\t"
+      "ldc1 %[ftmp9], %[ff_ph_01] \n\t"
+      "paddh %[ftmp0], %[ftmp0], %[ftmp9] \n\t"
+
+      "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+      "psubh %[ftmp2], %[ftmp5], %[ftmp6] \n\t"
+      "ldc1 %[ftmp9], %[ff_ph_07] \n\t"
+      MMI_LI(%[tmp0], 0x04)
+      "paddh %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+      "paddh %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+      "mtc1 %[tmp0], %[ftmp9] \n\t"
+      "psrah %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+      "psrah %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+
+      MMI_LI(%[tmp0], 0x10)
+      "ldc1 %[ftmp12], %[ff_pw_12000] \n\t"
+      "mtc1 %[tmp0], %[ftmp9] \n\t"
+
+      "punpcklhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t"
+      "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op1] \n\t"
+      "punpckhhw %[ftmp5], %[ftmp7], %[ftmp8] \n\t"
+      "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op1] \n\t"
+      "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+      "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
+      "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t"
+      "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
+      "packsswh %[ftmp3], %[ftmp10], %[ftmp11] \n\t"
+      "paddh %[ftmp3], %[ftmp3], %[ftmp0] \n\t"
+
+      "ldc1 %[ftmp12], %[ff_pw_51000] \n\t"
+      "punpcklhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t"
+      "pmaddhw %[ftmp10], %[ftmp5], %[ff_ph_op3] \n\t"
+      "punpckhhw %[ftmp5], %[ftmp8], %[ftmp7] \n\t"
+      "pmaddhw %[ftmp11], %[ftmp5], %[ff_ph_op3] \n\t"
+      "paddw %[ftmp10], %[ftmp10], %[ftmp12] \n\t"
+      "paddw %[ftmp11], %[ftmp11], %[ftmp12] \n\t"
+      "psraw %[ftmp10], %[ftmp10], %[ftmp9] \n\t"
+      "psraw %[ftmp11], %[ftmp11], %[ftmp9] \n\t"
+      "packsswh %[ftmp4], %[ftmp10], %[ftmp11] \n\t"
+
+      : [ftmp0] "=&f"(ftmp0), [ftmp1] "=&f"(ftmp1), [ftmp2] "=&f"(ftmp2),
+        [ftmp3] "=&f"(ftmp3), [ftmp4] "=&f"(ftmp4), [ftmp5] "=&f"(ftmp5),
+        [ftmp6] "=&f"(ftmp6), [ftmp7] "=&f"(ftmp7), [ftmp8] "=&f"(ftmp8),
+        [ftmp9] "=&f"(ftmp9), [ftmp10] "=&f"(ftmp10), [ftmp11] "=&f"(ftmp11),
+        [ftmp12] "=&f"(ftmp12), [tmp0] "=&r"(tmp[0])
+      : [ff_ph_01] "m"(ff_ph_01), [ff_ph_07] "m"(ff_ph_07), [a] "r"(a),
+        [b] "r"(b), [c] "r"(c), [d] "r"(d), [ff_ph_op1] "f"(ff_ph_op1),
+        [ff_ph_op3] "f"(ff_ph_op3), [ff_pw_14500] "m"(ff_pw_14500),
+        [ff_pw_7500] "m"(ff_pw_7500), [ff_pw_12000] "m"(ff_pw_12000),
+        [ff_pw_51000] "m"(ff_pw_51000)
+  );
+
+  __asm__ volatile(
+      "gssdlc1 %[ftmp1], 0x07(%[output]) \n\t"
+      "gssdrc1 %[ftmp1], 0x00(%[output]) \n\t"
+      "gssdlc1 %[ftmp3], 0x0f(%[output]) \n\t"
+      "gssdrc1 %[ftmp3], 0x08(%[output]) \n\t"
+      "gssdlc1 %[ftmp2], 0x17(%[output]) \n\t"
+      "gssdrc1 %[ftmp2], 0x10(%[output]) \n\t"
+      "gssdlc1 %[ftmp4], 0x1f(%[output]) \n\t"
+      "gssdrc1 %[ftmp4], 0x18(%[output]) \n\t"
+      :
+      : [ftmp1] "f"(ftmp1), [ftmp2] "f"(ftmp2), [ftmp3] "f"(ftmp3),
+        [ftmp4] "f"(ftmp4), [output] "r"(output)
+      : "memory");
+}
+
+void vp8_short_fdct8x4_mmi(int16_t *input, int16_t *output, int pitch) {
+  vp8_short_fdct4x4_mmi(input, output, pitch);
+  vp8_short_fdct4x4_mmi(input + 4, output + 16, pitch);
+}
+
+void vp8_short_walsh4x4_mmi(int16_t *input, int16_t *output, int pitch) {
+  double ftmp[13];
+  uint32_t tmp[1];
+  DECLARE_ALIGNED(8, const uint64_t, ff_ph_01) = { 0x0001000100010001ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_pw_01) = { 0x0000000100000001ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_pw_03) = { 0x0000000300000003ULL };
+  DECLARE_ALIGNED(8, const uint64_t, ff_pw_mask) = { 0x0001000000010000ULL };
+
+  __asm__ volatile (
+      MMI_LI(%[tmp0], 0x02)
+      "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+      "mtc1 %[tmp0], %[ftmp11] \n\t"
+
+      "gsldlc1 %[ftmp1], 0x07(%[ip]) \n\t"
+      "gsldrc1 %[ftmp1], 0x00(%[ip]) \n\t"
+      MMI_ADDU(%[ip], %[ip], %[pitch])
+      "gsldlc1 %[ftmp2], 0x07(%[ip]) \n\t"
+      "gsldrc1 %[ftmp2], 0x00(%[ip]) \n\t"
+      MMI_ADDU(%[ip], %[ip], %[pitch])
+      "gsldlc1 %[ftmp3], 0x07(%[ip]) \n\t"
+      "gsldrc1 %[ftmp3], 0x00(%[ip]) \n\t"
+      MMI_ADDU(%[ip], %[ip], %[pitch])
+      "gsldlc1 %[ftmp4], 0x07(%[ip]) \n\t"
+      "gsldrc1 %[ftmp4], 0x00(%[ip]) \n\t"
+      TRANSPOSE_4H
+
+      "psllh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+      "psllh %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
+      "psllh %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
+      "psllh %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
+      // a
+      "paddh %[ftmp5], %[ftmp1], %[ftmp3] \n\t"
+      // d
+      "paddh %[ftmp6], %[ftmp2], %[ftmp4] \n\t"
+      // c
+      "psubh %[ftmp7], %[ftmp2], %[ftmp4] \n\t"
+      // b
+      "psubh %[ftmp8], %[ftmp1], %[ftmp3] \n\t"
+
+      // a + d
+      "paddh %[ftmp1], %[ftmp5], %[ftmp6] \n\t"
+      // b + c
+      "paddh %[ftmp2], %[ftmp8], %[ftmp7] \n\t"
+      // b - c
+      "psubh %[ftmp3], %[ftmp8], %[ftmp7] \n\t"
+      // a - d
+      "psubh %[ftmp4], %[ftmp5], %[ftmp6] \n\t"
+
+      "pcmpeqh %[ftmp6], %[ftmp5], %[ftmp0] \n\t"
+      "paddh %[ftmp6], %[ftmp6], %[ff_ph_01] \n\t"
+      "paddh %[ftmp1], %[ftmp1], %[ftmp6] \n\t"
+      TRANSPOSE_4H
+
+      // op[2], op[0]
+      "pmaddhw %[ftmp5], %[ftmp1], %[ff_pw_01] \n\t"
+      // op[3], op[1]
+      "pmaddhw %[ftmp1], %[ftmp1], %[ff_pw_mask] \n\t"
+
+      // op[6], op[4]
+      "pmaddhw %[ftmp6], %[ftmp2], %[ff_pw_01] \n\t"
+      // op[7], op[5]
+      "pmaddhw %[ftmp2], %[ftmp2], %[ff_pw_mask] \n\t"
+
+      // op[10], op[8]
+      "pmaddhw %[ftmp7], %[ftmp3], %[ff_pw_01] \n\t"
+      // op[11], op[9]
+      "pmaddhw %[ftmp3], %[ftmp3], %[ff_pw_mask] \n\t"
+
+      // op[14], op[12]
+      "pmaddhw %[ftmp8], %[ftmp4], %[ff_pw_01] \n\t"
+      // op[15], op[13]
+      "pmaddhw %[ftmp4], %[ftmp4], %[ff_pw_mask] \n\t"
+
+      // a1, a3
+      "paddw %[ftmp9], %[ftmp5], %[ftmp7] \n\t"
+      // d1, d3
+      "paddw %[ftmp10], %[ftmp6], %[ftmp8] \n\t"
+      // c1, c3
+      "psubw %[ftmp11], %[ftmp6], %[ftmp8] \n\t"
+      // b1, b3
+      "psubw %[ftmp12], %[ftmp5], %[ftmp7] \n\t"
+
+      // a1 + d1, a3 + d3
+      "paddw %[ftmp5], %[ftmp9], %[ftmp10] \n\t"
+      // b1 + c1, b3 + c3
+      "paddw %[ftmp6], %[ftmp12], %[ftmp11] \n\t"
+      // b1 - c1, b3 - c3
+      "psubw %[ftmp7], %[ftmp12], %[ftmp11] \n\t"
+      // a1 - d1, a3 - d3
+      "psubw %[ftmp8], %[ftmp9], %[ftmp10] \n\t"
+
+      // a2, a4
+      "paddw %[ftmp9], %[ftmp1], %[ftmp3] \n\t"
+      // d2, d4
+      "paddw %[ftmp10], %[ftmp2], %[ftmp4] \n\t"
+      // c2, c4
+      "psubw %[ftmp11], %[ftmp2], %[ftmp4] \n\t"
+      // b2, b4
+      "psubw %[ftmp12], %[ftmp1], %[ftmp3] \n\t"
+
+      // a2 + d2, a4 + d4
+      "paddw %[ftmp1], %[ftmp9], %[ftmp10] \n\t"
+      // b2 + c2, b4 + c4
+      "paddw %[ftmp2], %[ftmp12], %[ftmp11] \n\t"
+      // b2 - c2, b4 - c4
+      "psubw %[ftmp3], %[ftmp12], %[ftmp11] \n\t"
+      // a2 - d2, a4 - d4
+      "psubw %[ftmp4], %[ftmp9], %[ftmp10] \n\t"
+
+      MMI_LI(%[tmp0], 0x03)
+      "mtc1 %[tmp0], %[ftmp11] \n\t"
+
+      "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp1] \n\t"
+      "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+      "paddw %[ftmp1], %[ftmp1], %[ftmp9] \n\t"
+      "paddw %[ftmp1], %[ftmp1], %[ff_pw_03] \n\t"
+      "psraw %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+
+      "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp2] \n\t"
+      "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+      "paddw %[ftmp2], %[ftmp2], %[ftmp9] \n\t"
+      "paddw %[ftmp2], %[ftmp2], %[ff_pw_03] \n\t"
+      "psraw %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
+
+      "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp3] \n\t"
+      "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+      "paddw %[ftmp3], %[ftmp3], %[ftmp9] \n\t"
+      "paddw %[ftmp3], %[ftmp3], %[ff_pw_03] \n\t"
+      "psraw %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
+
+      "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp4] \n\t"
+      "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+      "paddw %[ftmp4], %[ftmp4], %[ftmp9] \n\t"
+      "paddw %[ftmp4], %[ftmp4], %[ff_pw_03] \n\t"
+      "psraw %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
+
+      "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp5] \n\t"
+      "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+      "paddw %[ftmp5], %[ftmp5], %[ftmp9] \n\t"
+      "paddw %[ftmp5], %[ftmp5], %[ff_pw_03] \n\t"
+      "psraw %[ftmp5], %[ftmp5], %[ftmp11] \n\t"
+
+      "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp6] \n\t"
+      "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+      "paddw %[ftmp6], %[ftmp6], %[ftmp9] \n\t"
+      "paddw %[ftmp6], %[ftmp6], %[ff_pw_03] \n\t"
+      "psraw %[ftmp6], %[ftmp6], %[ftmp11] \n\t"
+
+      "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp7] \n\t"
+      "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+      "paddw %[ftmp7], %[ftmp7], %[ftmp9] \n\t"
+      "paddw %[ftmp7], %[ftmp7], %[ff_pw_03] \n\t"
+      "psraw %[ftmp7], %[ftmp7], %[ftmp11] \n\t"
+
+      "pcmpgtw %[ftmp9], %[ftmp0], %[ftmp8] \n\t"
+      "and %[ftmp9], %[ftmp9], %[ff_pw_01] \n\t"
+      "paddw %[ftmp8], %[ftmp8], %[ftmp9] \n\t"
+      "paddw %[ftmp8], %[ftmp8], %[ff_pw_03] \n\t"
+      "psraw %[ftmp8], %[ftmp8], %[ftmp11] \n\t"
+
+      "packsswh %[ftmp1], %[ftmp1], %[ftmp5] \n\t"
+      "packsswh %[ftmp2], %[ftmp2], %[ftmp6] \n\t"
+      "packsswh %[ftmp3], %[ftmp3], %[ftmp7] \n\t"
+      "packsswh %[ftmp4], %[ftmp4], %[ftmp8] \n\t"
+
+      MMI_LI(%[tmp0], 0x72)
+      "mtc1 %[tmp0], %[ftmp11] \n\t"
+      "pshufh %[ftmp1], %[ftmp1], %[ftmp11] \n\t"
+      "pshufh %[ftmp2], %[ftmp2], %[ftmp11] \n\t"
+      "pshufh %[ftmp3], %[ftmp3], %[ftmp11] \n\t"
+      "pshufh %[ftmp4], %[ftmp4], %[ftmp11] \n\t"
+
+      "gssdlc1 %[ftmp1], 0x07(%[op]) \n\t"
+      "gssdrc1 %[ftmp1], 0x00(%[op]) \n\t"
+      "gssdlc1 %[ftmp2], 0x0f(%[op]) \n\t"
+      "gssdrc1 %[ftmp2], 0x08(%[op]) \n\t"
+      "gssdlc1 %[ftmp3], 0x17(%[op]) \n\t"
+      "gssdrc1 %[ftmp3], 0x10(%[op]) \n\t"
+      "gssdlc1 %[ftmp4], 0x1f(%[op]) \n\t"
+      "gssdrc1 %[ftmp4], 0x18(%[op]) \n\t"
+      : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+        [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+        [ftmp4]"=&f"(ftmp[4]), [ftmp5]"=&f"(ftmp[5]),
+        [ftmp6]"=&f"(ftmp[6]), [ftmp7]"=&f"(ftmp[7]),
+        [ftmp8]"=&f"(ftmp[8]), [ftmp9]"=&f"(ftmp[9]),
+        [ftmp10]"=&f"(ftmp[10]), [ftmp11]"=&f"(ftmp[11]),
+        [ftmp12]"=&f"(ftmp[12]),
+        [tmp0]"=&r"(tmp[0]),
+        [ip]"+&r"(input)
+      : [op]"r"(output),
+        [ff_pw_01]"f"(ff_pw_01), [pitch]"r"((mips_reg)pitch),
+        [ff_pw_03]"f"(ff_pw_03), [ff_pw_mask]"f"(ff_pw_mask),
+        [ff_ph_01]"f"(ff_ph_01)
+      : "memory"
+  );
+}
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 5813c81c4..246fe6a67 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -122,6 +122,7 @@ VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/loopfilter_filters_mmi.c
 VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idctllm_mmi.c
 VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/dequantize_mmi.c
 VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/copymem_mmi.c
+VP8_COMMON_SRCS-$(HAVE_MMI) += common/mips/mmi/idct_blk_mmi.c
 
 ifeq ($(CONFIG_POSTPROC),yes)
 VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 23d65d416..0dac0169d 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -111,6 +111,7 @@ VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c
 VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
 
 VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/vp8_quantize_mmi.c
+VP8_CX_SRCS-$(HAVE_MMI) += encoder/mips/mmi/dct_mmi.c
 
 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
 VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/denoising_msa.c
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index dee17ade2..aa298acdf 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -3489,7 +3489,7 @@ static TX_MODE select_tx_mode(const VP9_COMP *cpi, MACROBLOCKD *const xd) {
 static void hybrid_intra_mode_search(VP9_COMP *cpi, MACROBLOCK *const x,
                                      RD_COST *rd_cost, BLOCK_SIZE bsize,
                                      PICK_MODE_CONTEXT *ctx) {
-  if (bsize < BLOCK_16X16)
+  if (!cpi->sf.nonrd_keyframe && bsize < BLOCK_16X16)
     vp9_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, INT64_MAX);
   else
     vp9_pick_intra_mode(cpi, x, rd_cost, bsize, ctx);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index db15d4021..9d9779f7b 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -41,6 +41,11 @@
 
 #define OUTPUT_FPF 0
 #define ARF_STATS_OUTPUT 0
+#define COMPLEXITY_STATS_OUTPUT 0
+
+#ifdef CORPUS_VBR_EXPERIMENT
+#define CORPUS_VBR_MIDPOINT 82.0
+#endif
 
 #define FIRST_PASS_Q 10.0
 #define GF_MAX_BOOST 96.0
@@ -239,8 +244,12 @@ static double calculate_active_area(const VP9_COMP *cpi,
 static double get_distribution_av_err(TWO_PASS *const twopass) {
   const double av_weight =
       twopass->total_stats.weight / twopass->total_stats.count;
 
+#ifdef CORPUS_VBR_EXPERIMENT
+  return av_weight * CORPUS_VBR_MIDPOINT;
+#else
   return (twopass->total_stats.coded_error * av_weight) /
          twopass->total_stats.count;
+#endif
 }
 
 // Calculate a modified Error used in distributing bits between easier and
@@ -1686,7 +1695,7 @@ void calculate_coded_size(VP9_COMP *cpi, int *scaled_frame_width,
 
 void vp9_init_second_pass(VP9_COMP *cpi) {
   SVC *const svc = &cpi->svc;
-  const VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  VP9EncoderConfig *const oxcf = &cpi->oxcf;
   const int is_two_pass_svc =
       (svc->number_spatial_layers > 1) || (svc->number_temporal_layers > 1);
   RATE_CONTROL *const rc = &cpi->rc;
@@ -1706,28 +1715,6 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
   *stats = *twopass->stats_in_end;
   twopass->total_left_stats = *stats;
 
-  frame_rate = 10000000.0 * stats->count / stats->duration;
-  // Each frame can have a different duration, as the frame rate in the source
-  // isn't guaranteed to be constant. The frame rate prior to the first frame
-  // encoded in the second pass is a guess. However, the sum duration is not.
-  // It is calculated based on the actual durations of all frames from the
-  // first pass.
-
-  if (is_two_pass_svc) {
-    vp9_update_spatial_layer_framerate(cpi, frame_rate);
-    twopass->bits_left =
-        (int64_t)(stats->duration *
-                  svc->layer_context[svc->spatial_layer_id].target_bandwidth /
-                  10000000.0);
-  } else {
-    vp9_new_framerate(cpi, frame_rate);
-    twopass->bits_left =
-        (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0);
-  }
-
-  // This variable monitors how far behind the second ref update is lagging.
-  twopass->sr_update_lag = 1;
-
   // Scan the first pass file and calculate a modified score for each
   // frame that is used to distribute bits. The modified score is assumed
   // to provide a linear basis for bit allocation. I.e a frame A with a score
@@ -1737,6 +1724,9 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
     const FIRSTPASS_STATS *s = twopass->stats_in;
     const double av_err = get_distribution_av_err(twopass);
 
+#ifdef CORPUS_VBR_EXPERIMENT
+    twopass->mean_mod_score = CORPUS_VBR_MIDPOINT;
+#else
     // The first scan is unclamped and gives a raw average.
    while (s < twopass->stats_in_end) {
      modified_score_total += calculate_mod_frame_score(cpi, oxcf, s, av_err);
@@ -1747,6 +1737,7 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
     // error for the rate distribution function.
     twopass->mean_mod_score =
         modified_score_total / DOUBLE_DIVIDE_CHECK(stats->count);
+#endif
 
     // Second scan using clamps based on the previous cycle average.
     // This may modify the total and average somewhat but we dont bother with
@@ -1759,8 +1750,47 @@ void vp9_init_second_pass(VP9_COMP *cpi) {
       ++s;
     }
     twopass->normalized_score_left = modified_score_total;
+
+#ifdef CORPUS_VBR_EXPERIMENT
+    // If using Corpus wide VBR mode then update the clip target bandwidth.
+    oxcf->target_bandwidth =
+        (int64_t)((double)oxcf->target_bandwidth *
+                  (twopass->normalized_score_left / stats->count));
+#endif
+
+#if COMPLEXITY_STATS_OUTPUT
+    {
+      FILE *compstats;
+      compstats = fopen("complexity_stats.stt", "a");
+      fprintf(compstats, "%10.3lf\n",
+              twopass->normalized_score_left / stats->count);
+      fclose(compstats);
+    }
+#endif
   }
 
+  frame_rate = 10000000.0 * stats->count / stats->duration;
+  // Each frame can have a different duration, as the frame rate in the source
+  // isn't guaranteed to be constant. The frame rate prior to the first frame
+  // encoded in the second pass is a guess. However, the sum duration is not.
+  // It is calculated based on the actual durations of all frames from the
+  // first pass.
+
+  if (is_two_pass_svc) {
+    vp9_update_spatial_layer_framerate(cpi, frame_rate);
+    twopass->bits_left =
+        (int64_t)(stats->duration *
+                  svc->layer_context[svc->spatial_layer_id].target_bandwidth /
+                  10000000.0);
+  } else {
+    vp9_new_framerate(cpi, frame_rate);
+    twopass->bits_left =
+        (int64_t)(stats->duration * oxcf->target_bandwidth / 10000000.0);
+  }
+
+  // This variable monitors how far behind the second ref update is lagging.
+  twopass->sr_update_lag = 1;
+
   // Reset the vbr bits off target counters
   rc->vbr_bits_off_target = 0;
   rc->vbr_bits_off_target_fast = 0;
@@ -2155,6 +2185,28 @@ static void get_arf_buffer_indices(unsigned char *arf_buffer_indices) {
   arf_buffer_indices[1] = ARF_SLOT2;
 }
 
+#ifdef CORPUS_VBR_EXPERIMENT
+// Calculates the total normalized group complexity score for a given number
+// of frames starting at the current position in the stats file.
+static double calculate_group_score(VP9_COMP *cpi, double av_score,
+                                    int frame_count) {
+  VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  TWO_PASS *const twopass = &cpi->twopass;
+  const FIRSTPASS_STATS *s = twopass->stats_in;
+  double score_total = 0.0;
+  int i = 0;
+
+  while ((i < frame_count) && (s < twopass->stats_in_end)) {
+    score_total += calculate_norm_frame_score(cpi, twopass, oxcf, s, av_score);
+    ++s;
+    ++i;
+  }
+  assert(i == frame_count);
+
+  return score_total;
+}
+#endif
+
 static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
                                    int gf_arf_bits) {
   RATE_CONTROL *const rc = &cpi->rc;
@@ -2175,8 +2227,13 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
       is_two_pass_svc(cpi) && cpi->svc.number_temporal_layers > 1;
   int normal_frames;
   int normal_frame_bits;
-  int last_frame_bits;
-  int last_frame_reduction;
+  int last_frame_reduction = 0;
+
+#ifdef CORPUS_VBR_EXPERIMENT
+  double av_score = get_distribution_av_err(twopass);
+  double tot_norm_frame_score;
+  double this_frame_score;
+#endif
 
   // Only encode alt reference frame in temporal base layer.
   if (has_temporal_layers) alt_frame_index = cpi->svc.number_temporal_layers;
@@ -2249,17 +2306,17 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
 
   normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending);
 
+#ifndef CORPUS_VBR_EXPERIMENT
   // The last frame in the group is used less as a predictor so reduce
   // its allocation a little.
   if (normal_frames > 1) {
     normal_frame_bits = (int)(total_group_bits / normal_frames);
-    last_frame_reduction = normal_frame_bits / 16;
-    last_frame_bits = normal_frame_bits - last_frame_reduction;
   } else {
     normal_frame_bits = (int)total_group_bits;
-    last_frame_bits = normal_frame_bits;
-    last_frame_reduction = 0;
   }
+#else
+  tot_norm_frame_score = calculate_group_score(cpi, av_score, normal_frames);
+#endif
 
   // Allocate bits to the other frames in the group.
   for (i = 0; i < normal_frames; ++i) {
@@ -2270,11 +2327,18 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
       ++frame_index;
     }
 
-    target_frame_size = (i == (normal_frames - 1))
-                            ? last_frame_bits
-                            : (frame_index == mid_frame_idx)
-                                  ? normal_frame_bits + last_frame_reduction
-                                  : normal_frame_bits;
+#ifdef CORPUS_VBR_EXPERIMENT
+    this_frame_score = calculate_norm_frame_score(cpi, twopass, &cpi->oxcf,
+                                                  &frame_stats, av_score);
+    normal_frame_bits = (int)((double)total_group_bits *
+                              (this_frame_score / tot_norm_frame_score));
+#endif
+
+    target_frame_size = normal_frame_bits;
+    if ((i == (normal_frames - 1)) && (i >= 1)) {
+      last_frame_reduction = normal_frame_bits / 16;
+      target_frame_size -= last_frame_reduction;
+    }
 
     if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) {
       mid_boost_bits += (target_frame_size >> 4);
@@ -2295,6 +2359,9 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
     ++frame_index;
   }
 
+  // Add in some extra bits for the middle frame in the group.
+  gf_group->bit_allocation[mid_frame_idx] += last_frame_reduction;
+
   // Note:
   // We need to configure the frame at the end of the sequence + 1 that will be
   // the start frame for the next group. Otherwise prior to the call to
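The CORPUS_VBR_EXPERIMENT path above replaces the flat per-frame split of the group budget with a split proportional to each frame's normalized complexity score: target_i = total_group_bits * score_i / sum(scores). A minimal standalone sketch of that proportional allocation (hypothetical scores and budget; the real scoring comes from calculate_norm_frame_score()):

#include <stdio.h>

int main(void) {
  const double scores[4] = { 0.5, 1.0, 2.0, 0.5 }; /* per-frame complexity */
  const double total_group_bits = 400000.0;        /* group budget in bits */
  double sum = 0.0;
  int i;
  for (i = 0; i < 4; ++i) sum += scores[i];
  /* Harder frames get proportionally more of the group budget. */
  for (i = 0; i < 4; ++i)
    printf("frame %d: %d bits\n", i,
           (int)(total_group_bits * scores[i] / sum));
  return 0;
}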
diff --git a/vp9/encoder/vp9_frame_scale.c b/vp9/encoder/vp9_frame_scale.c
index 832df18c8..a410d0407 100644
--- a/vp9/encoder/vp9_frame_scale.c
+++ b/vp9/encoder/vp9_frame_scale.c
@@ -28,7 +28,7 @@ void vp9_scale_and_extend_frame_c(const YV12_BUFFER_CONFIG *src,
   const InterpKernel *const kernel = vp9_filter_kernels[filter_type];
   int x, y, i;
 
-#if HAVE_NEON
+#if HAVE_SSSE3 || HAVE_NEON
   // TODO(linfengz): The 4:3 specialized C code is disabled by default since
   // it's much slower than the general version which calls vpx_scaled_2d() even
   // if vpx_scaled_2d() is not optimized. It will only be enabled as a reference
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 8c71beaff..73d78a30c 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -1970,9 +1970,11 @@ void vp9_set_target_rate(VP9_COMP *cpi) {
   else
     target_rate = vp9_rc_clamp_pframe_target_size(cpi, target_rate);
 
+#ifndef CORPUS_VBR_EXPERIMENT
   // Correction to rate target based on prior over or under shoot.
   if (cpi->oxcf.rc_mode == VPX_VBR || cpi->oxcf.rc_mode == VPX_CQ)
     vbr_rate_correction(cpi, &target_rate);
+#endif
   vp9_rc_set_frame_target(cpi, target_rate);
 }
 
@@ -2119,7 +2121,7 @@ static void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi,
   uint64_t avg_source_sad_lag = avg_sad_current;
   int high_source_sad_lagindex = -1;
   int steady_sad_lagindex = -1;
-  uint32_t sad_thresh1 = 60000;
+  uint32_t sad_thresh1 = 70000;
   uint32_t sad_thresh2 = 120000;
   int low_content = 0;
   int high_content = 0;
@@ -2280,8 +2282,10 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
     uint64_t avg_sad_current = 0;
     uint32_t min_thresh = 4000;
     float thresh = 8.0f;
+    uint32_t thresh_key = 140000;
+    if (cpi->oxcf.speed <= 5) thresh_key = 240000;
     if (cpi->oxcf.rc_mode == VPX_VBR) {
-      min_thresh = 70000;
+      min_thresh = 65000;
       thresh = 2.1f;
     }
     if (cpi->oxcf.lag_in_frames > 0) {
@@ -2307,7 +2311,7 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
           rc->high_source_sad = 1;
         else
           rc->high_source_sad = 0;
-        if (rc->high_source_sad && avg_sad_current > min_thresh << 1)
+        if (rc->high_source_sad && avg_sad_current > thresh_key)
           scene_cut_force_key_frame = 1;
         // Update recursive average for current frame.
         if (avg_sad_current > 0)
@@ -2369,7 +2373,7 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
         rc->high_source_sad = 1;
       else
        rc->high_source_sad = 0;
-      if (rc->high_source_sad && avg_sad > min_thresh << 1)
+      if (rc->high_source_sad && avg_sad > thresh_key)
        scene_cut_force_key_frame = 1;
      if (avg_sad > 0 || cpi->oxcf.rc_mode == VPX_CBR)
        rc->avg_source_sad[0] = (3 * rc->avg_source_sad[0] + avg_sad) >> 2;
@@ -2402,8 +2406,7 @@ void vp9_scene_detection_onepass(VP9_COMP *cpi) {
       cpi->ext_refresh_frame_flags_pending == 0) {
     int target;
     cpi->refresh_golden_frame = 1;
-    if (cpi->oxcf.speed >= 6 && scene_cut_force_key_frame)
-      cm->frame_type = KEY_FRAME;
+    if (scene_cut_force_key_frame) cm->frame_type = KEY_FRAME;
     rc->source_alt_ref_pending = 0;
     if (cpi->sf.use_altref_onepass && cpi->oxcf.enable_auto_arf)
       rc->source_alt_ref_pending = 1;
diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h
index bdae75542..f851e4286 100644
--- a/vp9/encoder/vp9_ratectrl.h
+++ b/vp9/encoder/vp9_ratectrl.h
@@ -24,6 +24,9 @@ extern "C" {
 // Used to control aggressive VBR mode.
 // #define AGGRESSIVE_VBR 1
 
+// Used to control Corpus VBR experiment
+// #define CORPUS_VBR_EXPERIMENT 1
+
 // Bits Per MB at different Q (Multiplied by 512)
 #define BPER_MB_NORMBITS 9
 
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 4d4a579e6..e5499d6dd 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -225,7 +225,11 @@ static void set_good_speed_feature_framesize_independent(VP9_COMP *cpi,
   }
 
   if (speed >= 2) {
+#ifdef CORPUS_VBR_EXPERIMENT
+    sf->recode_loop = ALLOW_RECODE_FIRST;
+#else
     sf->recode_loop = ALLOW_RECODE_KFARFGF;
+#endif
     sf->tx_size_search_method =
         frame_is_boosted(cpi) ? USE_FULL_RD : USE_LARGESTALL;
 
@@ -366,6 +370,7 @@ static void set_rt_speed_feature_framesize_independent(
   sf->use_simple_block_yrd = 0;
   sf->adapt_partition_source_sad = 0;
   sf->use_altref_onepass = 0;
+  sf->nonrd_keyframe = 0;
 
   if (speed >= 1) {
     sf->allow_txfm_domain_distortion = 1;
@@ -598,6 +603,7 @@ static void set_rt_speed_feature_framesize_independent(
     if (speed >= 8) {
       sf->adaptive_rd_thresh = 4;
       sf->skip_encode_sb = 1;
+      sf->nonrd_keyframe = 1;
       if (!cpi->use_svc) cpi->max_copied_frame = 4;
       if (cpi->row_mt && cpi->oxcf.max_threads > 1)
         sf->adaptive_rd_thresh_row_mt = 1;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 517369dae..9e5bf9a24 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -499,6 +499,9 @@ typedef struct SPEED_FEATURES {
 
   // Enable use of alt-refs in 1 pass VBR.
   int use_altref_onepass;
+
+  // Always use nonrd_pick_intra for all block sizes on keyframes.
+  int nonrd_keyframe;
 } SPEED_FEATURES;
 
 struct VP9_COMP;
diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
index 81e5b4229..7685e7bc3 100644
--- a/vp9/encoder/x86/vp9_frame_scale_ssse3.c
+++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -438,6 +438,202 @@ static void scale_plane_4_to_1_general(const uint8_t *src, const int src_stride,
   } while (x);
 }
 
+typedef void (*shuffle_filter_funcs)(const int16_t *const filter,
+                                     __m128i *const f);
+
+typedef __m128i (*convolve8_funcs)(const __m128i *const s,
+                                   const __m128i *const f);
+
+static void scale_plane_4_to_3_general(const uint8_t *src,
+                                       const int src_stride, uint8_t *dst,
+                                       const int dst_stride, const int w,
+                                       const int h,
+                                       const InterpKernel *const coef,
+                                       const int phase_scaler,
+                                       uint8_t *const temp_buffer) {
+  static const int step_q4 = 16 * 4 / 3;
+  const int width_hor = (w + 5) - ((w + 5) % 6);
+  const int stride_hor = 2 * width_hor + 4;  // store 4 extra pixels
+  const int width_ver = (w + 7) & ~7;
+  // We need (SUBPEL_TAPS - 1) extra rows: (SUBPEL_TAPS / 2 - 1) extra rows
+  // above and (SUBPEL_TAPS / 2) extra rows below.
+  const int height_hor = (4 * h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+  const int height_ver = (h + 5) - ((h + 5) % 6);
+  int x, y = height_hor;
+  uint8_t *t = temp_buffer;
+  __m128i s[12], d[6], dd[4];
+  __m128i f0[4], f1[5], f2[5];
+  // The offset of the first row is always less than 1 pixel.
+  const int offset1_q4 = phase_scaler + 1 * step_q4;
+  const int offset2_q4 = phase_scaler + 2 * step_q4;
+  // offset_idxx indicates the pixel offset is even (0) or odd (1).
+  // It's used to choose the src offset and filter coefficient offset.
+  const int offset_idx1 = (offset1_q4 >> 4) & 1;
+  const int offset_idx2 = (offset2_q4 >> 4) & 1;
+  static const shuffle_filter_funcs shuffle_filter_funcs[2] = {
+    shuffle_filter_ssse3, shuffle_filter_odd_ssse3
+  };
+  static const convolve8_funcs convolve8_funcs[2] = {
+    convolve8_8_even_offset_ssse3, convolve8_8_odd_offset_ssse3
+  };
+
+  assert(w && h);
+
+  shuffle_filter_ssse3(coef[(phase_scaler + 0 * step_q4) & SUBPEL_MASK], f0);
+  shuffle_filter_funcs[offset_idx1](coef[offset1_q4 & SUBPEL_MASK], f1);
+  shuffle_filter_funcs[offset_idx2](coef[offset2_q4 & SUBPEL_MASK], f2);
+
+  // Sub 64 to avoid overflow.
+  // Coef 128 would be treated as -128 in PMADDUBSW. Sub 64 here.
+  // Coef 128 is in either fx[1] or fx[2] depending on the phase idx.
+  // When filter phase idx is 1, the two biggest coefficients are shuffled
+  // together, and the sum of them are always no less than 128. Sub 64 here.
+  // After the subtraction, when the sum of all positive coefficients are no
+  // larger than 128, and the sum of all negative coefficients are no
+  // less than -128, there will be no overflow in the convolve8 functions.
+  f0[1] = _mm_sub_epi8(f0[1], _mm_set1_epi8(64));
+  f1[1 + offset_idx1] = _mm_sub_epi8(f1[1 + offset_idx1], _mm_set1_epi8(64));
+  f2[1 + offset_idx2] = _mm_sub_epi8(f2[1 + offset_idx2], _mm_set1_epi8(64));
+
+  src -= (SUBPEL_TAPS / 2 - 1) * src_stride + SUBPEL_TAPS / 2 - 1;
+
+  // horizontal 6x8
+  do {
+    load_8bit_8x8(src, src_stride, s);
+    // 00 01 10 11 20 21 30 31  40 41 50 51 60 61 70 71
+    // 02 03 12 13 22 23 32 33  42 43 52 53 62 63 72 73
+    // 04 05 14 15 24 25 34 35  44 45 54 55 64 65 74 75
+    // 06 07 16 17 26 27 36 37  46 47 56 57 66 67 76 77
+    transpose_16bit_4x8(s, s);
+    x = width_hor;
+
+    do {
+      src += 8;
+      load_8bit_8x8(src, src_stride, &s[4]);
+      // 08 09 18 19 28 29 38 39  48 49 58 59 68 69 78 79
+      // 0A 0B 1A 1B 2A 2B 3A 3B  4A 4B 5A 5B 6A 6B 7A 7B
+      // 0C 0D 1C 1D 2C 2D 3C 3D  4C 4D 5C 5D 6C 6D 7C 7D
+      // 0E 0F 1E 1F 2E 2F 3E 3F  4E 4F 5E 5F 6E 6F 7E 7F
+      transpose_16bit_4x8(&s[4], &s[4]);
+
+      // 00 10 20 30 40 50 60 70
+      // 01 11 21 31 41 51 61 71
+      // 02 12 22 32 42 52 62 72
+      // 03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74
+      // 05 15 25 35 45 55 65 75
+      d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
+      d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
+      d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
+      d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
+      d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+      d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+
+      // 00 10 20 30 40 50 60 70  02 12 22 32 42 52 62 72
+      // 01 11 21 31 41 51 61 71  03 13 23 33 43 53 63 73
+      // 04 14 24 34 44 54 64 74  xx xx xx xx xx xx xx xx
+      // 05 15 25 35 45 55 65 75  xx xx xx xx xx xx xx xx
+      dd[0] = _mm_packus_epi16(d[0], d[2]);
+      dd[1] = _mm_packus_epi16(d[1], d[3]);
+      dd[2] = _mm_packus_epi16(d[4], d[4]);
+      dd[3] = _mm_packus_epi16(d[5], d[5]);
+
+      // 00 10 01 11 20 30 21 31  40 50 41 51 60 70 61 71
+      // 02 12 03 13 22 32 23 33  42 52 43 53 62 72 63 73
+      // 04 14 05 15 24 34 25 35  44 54 45 55 64 74 65 75
+      d[0] = _mm_unpacklo_epi16(dd[0], dd[1]);
+      d[1] = _mm_unpackhi_epi16(dd[0], dd[1]);
+      d[2] = _mm_unpacklo_epi16(dd[2], dd[3]);
+
+      // 00 10 01 11 02 12 03 13  20 30 21 31 22 32 23 33
+      // 40 50 41 51 42 52 43 53  60 70 61 71 62 72 63 73
+      // 04 14 05 15 xx xx xx xx  24 34 25 35 xx xx xx xx
+      // 44 54 45 55 xx xx xx xx  64 74 65 75 xx xx xx xx
+      dd[0] = _mm_unpacklo_epi32(d[0], d[1]);
+      dd[1] = _mm_unpackhi_epi32(d[0], d[1]);
+      dd[2] = _mm_unpacklo_epi32(d[2], d[2]);
+      dd[3] = _mm_unpackhi_epi32(d[2], d[2]);
+
+      // 00 10 01 11 02 12 03 13  04 14 05 15 xx xx xx xx
+      // 20 30 21 31 22 32 23 33  24 34 25 35 xx xx xx xx
+      // 40 50 41 51 42 52 43 53  44 54 45 55 xx xx xx xx
+      // 60 70 61 71 62 72 63 73  64 74 65 75 xx xx xx xx
+      d[0] = _mm_unpacklo_epi64(dd[0], dd[2]);
+      d[1] = _mm_unpackhi_epi64(dd[0], dd[2]);
+      d[2] = _mm_unpacklo_epi64(dd[1], dd[3]);
+      d[3] = _mm_unpackhi_epi64(dd[1], dd[3]);
+
+      // store 4 extra pixels
+      storeu_8bit_16x4(d, t, stride_hor);
+
+      s[0] = s[4];
+      s[1] = s[5];
+      s[2] = s[6];
+      s[3] = s[7];
+
+      t += 12;
+      x -= 6;
+    } while (x);
+    src += 8 * src_stride - 4 * width_hor / 3;
+    t += 3 * stride_hor + 4;
+    y -= 8;
+  } while (y);
+
+  // vertical 8x6
+  x = width_ver;
+  t = temp_buffer;
+  do {
+    // 00 10 01 11 02 12 03 13  04 14 05 15 06 16 07 17
+    // 20 30 21 31 22 32 23 33  24 34 25 35 26 36 27 37
+    // 40 50 41 51 42 52 43 53  44 54 45 55 46 56 47 57
+    // 60 70 61 71 62 72 63 73  64 74 65 75 66 76 67 77
+    loadu_8bit_16x4(t, stride_hor, s);
+    y = height_ver;
+
+    do {
+      // 80 90 81 91 82 92 83 93  84 94 85 95 86 96 87 97
+      // A0 B0 A1 B1 A2 B2 A3 B3  A4 B4 A5 B5 A6 B6 A7 B7
+      // C0 D0 C1 D1 C2 D2 C3 D3  C4 D4 C5 D5 C6 D6 C7 D7
+      // E0 F0 E1 F1 E2 F2 E3 F3  E4 F4 E5 F5 E6 F6 E7 F7
+      t += 4 * stride_hor;
+      loadu_8bit_16x4(t, stride_hor, &s[4]);
+
+      d[0] = convolve8_8_even_offset_ssse3(&s[0], f0);
+      d[1] = convolve8_funcs[offset_idx1](&s[offset1_q4 >> 5], f1);
+      d[2] = convolve8_funcs[offset_idx2](&s[offset2_q4 >> 5], f2);
+      d[3] = convolve8_8_even_offset_ssse3(&s[2], f0);
+      d[4] = convolve8_funcs[offset_idx1](&s[2 + (offset1_q4 >> 5)], f1);
+      d[5] = convolve8_funcs[offset_idx2](&s[2 + (offset2_q4 >> 5)], f2);
+
+      // 00 01 02 03 04 05 06 07  10 11 12 13 14 15 16 17
+      // 20 21 22 23 24 25 26 27  30 31 32 33 34 35 36 37
+      // 40 41 42 43 44 45 46 47  50 51 52 53 54 55 56 57
+      d[0] = _mm_packus_epi16(d[0], d[1]);
+      d[2] = _mm_packus_epi16(d[2], d[3]);
+      d[4] = _mm_packus_epi16(d[4], d[5]);
+
+      _mm_storel_epi64((__m128i *)(dst + 0 * dst_stride), d[0]);
+      _mm_storeh_epi64((__m128i *)(dst + 1 * dst_stride), d[0]);
+      _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), d[2]);
+      _mm_storeh_epi64((__m128i *)(dst + 3 * dst_stride), d[2]);
+      _mm_storel_epi64((__m128i *)(dst + 4 * dst_stride), d[4]);
+      _mm_storeh_epi64((__m128i *)(dst + 5 * dst_stride), d[4]);
+
+      s[0] = s[4];
+      s[1] = s[5];
+      s[2] = s[6];
+      s[3] = s[7];
+
+      dst += 6 * dst_stride;
+      y -= 6;
+    } while (y);
+    t -= stride_hor * 2 * height_ver / 3;
+    t += 16;
+    dst -= height_ver * dst_stride;
+    dst += 8;
+    x -= 8;
+  } while (x);
+}
+
 static INLINE __m128i scale_1_to_2_phase_0_kernel(const __m128i *const s,
                                                   const __m128i *const f) {
   __m128i ss[4], temp;
@@ -652,6 +848,36 @@ void vp9_scale_and_extend_frame_ssse3(const YV12_BUFFER_CONFIG *src,
           scaled = 0;
         }
       }
+    } else if (4 * dst_w == 3 * src_w && 4 * dst_h == 3 * src_h) {
+      // 4 to 3
+      const int buffer_stride_hor = (dst_w + 5) - ((dst_w + 5) % 6) + 2;
+      const int buffer_stride_ver = (dst_w + 7) & ~7;
+      const int buffer_height = (4 * dst_h / 3 + SUBPEL_TAPS - 1 + 7) & ~7;
+      // When the vertical filter reads more pixels than the horizontal filter
+      // generated in each row, we need extra padding to avoid heap read
+      // overflow. For example, the horizontal filter generates 18 pixels but
+      // the vertical filter reads 24 pixels in a row. The difference is
+      // multiplied by 2 since two rows are interlaced together in the
+      // optimization.
+      const int extra_padding =
+          (buffer_stride_ver > buffer_stride_hor)
+              ? 2 * (buffer_stride_ver - buffer_stride_hor)
+              : 0;
+      const int buffer_size = buffer_stride_hor * buffer_height + extra_padding;
+      uint8_t *const temp_buffer = (uint8_t *)malloc(buffer_size);
+      if (temp_buffer) {
+        scaled = 1;
+        scale_plane_4_to_3_general(
+            src->y_buffer, src->y_stride, dst->y_buffer, dst->y_stride, dst_w,
+            dst_h, vp9_filter_kernels[filter_type], phase_scaler, temp_buffer);
+        scale_plane_4_to_3_general(src->u_buffer, src->uv_stride, dst->u_buffer,
+                                   dst->uv_stride, dst_uv_w, dst_uv_h,
+                                   vp9_filter_kernels[filter_type],
+                                   phase_scaler, temp_buffer);
+        scale_plane_4_to_3_general(src->v_buffer, src->uv_stride, dst->v_buffer,
+                                   dst->uv_stride, dst_uv_w, dst_uv_h,
+                                   vp9_filter_kernels[filter_type],
+                                   phase_scaler, temp_buffer);
+        free(temp_buffer);
+      }
     } else if (dst_w == src_w * 2 && dst_h == src_h * 2 && phase_scaler == 0) {
       // 1 to 2
       uint8_t *const temp_buffer = (uint8_t *)malloc(8 * ((src_w + 7) & ~7));
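A note on the 4:3 phase bookkeeping above: step_q4 = 16 * 4 / 3 = 21, so the three output phases in each repeating group sit at source offsets phase_scaler + 0, +21 and +42 in 1/16th pels, and (offset >> 4) & 1, the parity of the integer pixel offset, selects the even- or odd-offset filter variant. A tiny standalone sketch of that computation (assuming phase_scaler == 0; not part of the patch):

#include <stdio.h>

int main(void) {
  const int step_q4 = 16 * 4 / 3; /* integer division gives 21 */
  int k;
  for (k = 0; k < 3; ++k) {
    const int offset_q4 = 0 + k * step_q4; /* phase_scaler assumed 0 */
    printf("output %d: src pixel %d, subpel %2d, %s offset\n", k,
           offset_q4 >> 4, offset_q4 & 15,
           ((offset_q4 >> 4) & 1) ? "odd" : "even");
  }
  return 0;
}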
diff --git a/vpx_dsp/x86/convolve_ssse3.h b/vpx_dsp/x86/convolve_ssse3.h
index b71da0e4e..8da28f0b2 100644
--- a/vpx_dsp/x86/convolve_ssse3.h
+++ b/vpx_dsp/x86/convolve_ssse3.h
@@ -11,6 +11,7 @@
 #ifndef VPX_DSP_X86_CONVOLVE_SSSE3_H_
 #define VPX_DSP_X86_CONVOLVE_SSSE3_H_
 
+#include <assert.h>
 #include <tmmintrin.h>  // SSSE3
 
 #include "./vpx_config.h"
@@ -25,6 +26,20 @@ static INLINE void shuffle_filter_ssse3(const int16_t *const filter,
   f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
 }
 
+static INLINE void shuffle_filter_odd_ssse3(const int16_t *const filter,
+                                            __m128i *const f) {
+  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
+  // pack and duplicate the filter values
+  // It utilizes the fact that the high byte of filter[3] is always 0 to clean
+  // half of f[0] and f[4].
+  assert(filter[3] >= 0 && filter[3] < 256);
+  f[0] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0007u));
+  f[1] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0402u));
+  f[2] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0806u));
+  f[3] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0c0au));
+  f[4] = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x070eu));
+}
+
 static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
                                         const __m128i *const f) {
   // multiply 2 adjacent elements with the filter and add the result
@@ -45,4 +60,50 @@ static INLINE __m128i convolve8_8_ssse3(const __m128i *const s,
   return temp;
 }
 
+static INLINE __m128i convolve8_8_even_offset_ssse3(const __m128i *const s,
+                                                    const __m128i *const f) {
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i k_64 = _mm_set1_epi16(1 << 6);
+  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+  // compensate the subtracted 64 in f[1]. x4 is always non negative.
+  const __m128i x4 = _mm_maddubs_epi16(s[1], _mm_set1_epi8(64));
+  // add and saturate the results together
+  __m128i temp = _mm_adds_epi16(x0, x3);
+  temp = _mm_adds_epi16(temp, x1);
+  temp = _mm_adds_epi16(temp, x2);
+  temp = _mm_adds_epi16(temp, x4);
+  // round and shift by 7 bit each 16 bit
+  temp = _mm_adds_epi16(temp, k_64);
+  temp = _mm_srai_epi16(temp, 7);
+  return temp;
+}
+
+static INLINE __m128i convolve8_8_odd_offset_ssse3(const __m128i *const s,
+                                                   const __m128i *const f) {
+  // multiply 2 adjacent elements with the filter and add the result
+  const __m128i k_64 = _mm_set1_epi16(1 << 6);
+  const __m128i x0 = _mm_maddubs_epi16(s[0], f[0]);
+  const __m128i x1 = _mm_maddubs_epi16(s[1], f[1]);
+  const __m128i x2 = _mm_maddubs_epi16(s[2], f[2]);
+  const __m128i x3 = _mm_maddubs_epi16(s[3], f[3]);
+  const __m128i x4 = _mm_maddubs_epi16(s[4], f[4]);
+  // compensate the subtracted 64 in f[2]. x5 is always non negative.
+  const __m128i x5 = _mm_maddubs_epi16(s[2], _mm_set1_epi8(64));
+  __m128i temp;
+
+  // add and saturate the results together
+  temp = _mm_adds_epi16(x0, x1);
+  temp = _mm_adds_epi16(temp, x2);
+  temp = _mm_adds_epi16(temp, x3);
+  temp = _mm_adds_epi16(temp, x4);
+  temp = _mm_adds_epi16(temp, x5);
+  // round and shift by 7 bit each 16 bit
+  temp = _mm_adds_epi16(temp, k_64);
+  temp = _mm_srai_epi16(temp, 7);
+  return temp;
+}
+
 #endif  // VPX_DSP_X86_CONVOLVE_SSSE3_H_
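The x4/x5 compensation terms above restore the 64 that scale_plane_4_to_3_general() subtracts from one adjacent coefficient pair so no coefficient exceeds the signed-byte range PMADDUBSW requires. A scalar sanity check of the identity involved (a hypothetical standalone test, not part of the patch):

#include <assert.h>
#include <stdio.h>

int main(void) {
  const int p0 = 200, p1 = 53; /* two adjacent 8-bit pixels */
  const int f0 = 117, f1 = 35; /* an adjacent coefficient pair */
  /* p0*f0 + p1*f1 == p0*(f0-64) + p1*(f1-64) + 64*(p0+p1), which is what
   * the extra _mm_maddubs_epi16(s, _mm_set1_epi8(64)) term restores. */
  const int direct = p0 * f0 + p1 * f1;
  const int split = p0 * (f0 - 64) + p1 * (f1 - 64) + 64 * (p0 + p1);
  assert(direct == split);
  printf("%d == %d\n", direct, split);
  return 0;
}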
diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h
index f9f0a48a0..2ce738fb7 100644
--- a/vpx_dsp/x86/mem_sse2.h
+++ b/vpx_dsp/x86/mem_sse2.h
@@ -113,4 +113,12 @@ static INLINE void store_8bit_8x8(const __m128i *const s, uint8_t *const d,
   _mm_storel_epi64((__m128i *)(d + 7 * stride), s[7]);
 }
 
+static INLINE void storeu_8bit_16x4(const __m128i *const s, uint8_t *const d,
+                                    const ptrdiff_t stride) {
+  _mm_storeu_si128((__m128i *)(d + 0 * stride), s[0]);
+  _mm_storeu_si128((__m128i *)(d + 1 * stride), s[1]);
+  _mm_storeu_si128((__m128i *)(d + 2 * stride), s[2]);
+  _mm_storeu_si128((__m128i *)(d + 3 * stride), s[3]);
+}
+
 #endif  // VPX_DSP_X86_MEM_SSE2_H_