diff options
31 files changed, 2288 insertions, 282 deletions
diff --git a/test/external_frame_buffer_test.cc b/test/external_frame_buffer_test.cc index 44eba3317..70b300928 100644 --- a/test/external_frame_buffer_test.cc +++ b/test/external_frame_buffer_test.cc @@ -71,6 +71,7 @@ class ExternalFrameBufferList { if (ext_fb_list_[idx].size < min_size) { delete [] ext_fb_list_[idx].data; ext_fb_list_[idx].data = new uint8_t[min_size]; + memset(ext_fb_list_[idx].data, 0, min_size); ext_fb_list_[idx].size = min_size; } diff --git a/test/test.mk b/test/test.mk index b92b6da73..abf815cc9 100644 --- a/test/test.mk +++ b/test/test.mk @@ -128,6 +128,7 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct4x4_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += fdct8x8_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc +LIBVPX_TEST_SRCS-$(CONFIG_VP9) += vp9_intrapred_test.cc ifeq ($(CONFIG_VP9_ENCODER),yes) LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc diff --git a/test/vp9_intrapred_test.cc b/test/vp9_intrapred_test.cc new file mode 100644 index 000000000..7d08d9ee4 --- /dev/null +++ b/test/vp9_intrapred_test.cc @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include <string> + +#include "test/acm_random.h" +#include "test/clear_system_state.h" +#include "test/register_state_check.h" +#include "third_party/googletest/src/include/gtest/gtest.h" + +#include "./vpx_config.h" +#include "./vp9_rtcd.h" +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_pred_common.h" +#include "vpx_mem/vpx_mem.h" +#include "test/util.h" + +namespace { + +using libvpx_test::ACMRandom; + +const int count_test_block = 100000; + +// Base class for VP9 intra prediction tests. +class VP9IntraPredBase { + public: + virtual ~VP9IntraPredBase() { libvpx_test::ClearSystemState(); } + + protected: + virtual void Predict(PREDICTION_MODE mode) = 0; + + void CheckPrediction(int test_case_number, int *error_count) const { + // For each pixel ensure that the calculated value is the same as reference. + for (int y = 0; y < block_size_; y++) { + for (int x = 0; x < block_size_; x++) { + *error_count += ref_dst_[x + y * stride_] != dst_[x + y * stride_]; + if (*error_count == 1) { + ASSERT_EQ(ref_dst_[x + y * stride_], dst_[x + y * stride_]) + << " Failed on Test Case Number "<< test_case_number; + } + } + } + } + + void RunTest(uint16_t* left_col, uint16_t* above_data, + uint16_t* dst, uint16_t* ref_dst) { + ACMRandom rnd(ACMRandom::DeterministicSeed()); + left_col_ = left_col; + dst_ = dst; + ref_dst_ = ref_dst; + above_row_ = above_data + 16; + int error_count = 0; + for (int i = 0; i < count_test_block; ++i) { + // Fill edges with random data, try first with saturated values. 
+ for (int x = -1; x <= block_size_*2; x++) { + if (i == 0) { + above_row_[x] = mask_; + } else { + above_row_[x] = rnd.Rand16() & mask_; + } + } + for (int y = 0; y < block_size_; y++) { + if (i == 0) { + left_col_[y] = mask_; + } else { + left_col_[y] = rnd.Rand16() & mask_; + } + } + Predict(DC_PRED); + CheckPrediction(i, &error_count); + } + ASSERT_EQ(0, error_count); + } + + int block_size_; + uint16_t *above_row_; + uint16_t *left_col_; + uint16_t *dst_; + uint16_t *ref_dst_; + ptrdiff_t stride_; + int mask_; +}; + +typedef void (*intra_pred_fn_t)( + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, + const uint16_t *left, int bps); +typedef std::tr1::tuple<intra_pred_fn_t, + intra_pred_fn_t, int, int> intra_pred_params_t; +class VP9IntraPredTest + : public VP9IntraPredBase, + public ::testing::TestWithParam<intra_pred_params_t> { + + virtual void SetUp() { + pred_fn_ = GET_PARAM(0); + ref_fn_ = GET_PARAM(1); + block_size_ = GET_PARAM(2); + bit_depth_ = GET_PARAM(3); + stride_ = block_size_ * 3; + mask_ = (1 << bit_depth_) - 1; + } + + virtual void Predict(PREDICTION_MODE mode) { + const uint16_t *const_above_row = above_row_; + const uint16_t *const_left_col = left_col_; + ref_fn_(ref_dst_, stride_, const_above_row, const_left_col, bit_depth_); + ASM_REGISTER_STATE_CHECK(pred_fn_(dst_, stride_, const_above_row, + const_left_col, bit_depth_)); + } + intra_pred_fn_t pred_fn_; + intra_pred_fn_t ref_fn_; + int bit_depth_; +}; + +TEST_P(VP9IntraPredTest, IntraPredTests) { + // max block size is 32 + DECLARE_ALIGNED_ARRAY(16, uint16_t, left_col, 2*32); + DECLARE_ALIGNED_ARRAY(16, uint16_t, above_data, 2*32+32); + DECLARE_ALIGNED_ARRAY(16, uint16_t, dst, 3 * 32 * 32); + DECLARE_ALIGNED_ARRAY(16, uint16_t, ref_dst, 3 * 32 * 32); + RunTest(left_col, above_data, dst, ref_dst); +} + +using std::tr1::make_tuple; + +#if HAVE_SSE2 +#if CONFIG_VP9_HIGHBITDEPTH +#if ARCH_X86_64 +INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest, + ::testing::Values( + 
make_tuple(&vp9_high_dc_predictor_32x32_sse2, + &vp9_high_dc_predictor_32x32_c, 32, 8), + make_tuple(&vp9_high_tm_predictor_16x16_sse2, + &vp9_high_tm_predictor_16x16_c, 16, 8), + make_tuple(&vp9_high_tm_predictor_32x32_sse2, + &vp9_high_tm_predictor_32x32_c, 32, 8), + make_tuple(&vp9_high_dc_predictor_4x4_sse, + &vp9_high_dc_predictor_4x4_c, 4, 8), + make_tuple(&vp9_high_dc_predictor_8x8_sse2, + &vp9_high_dc_predictor_8x8_c, 8, 8), + make_tuple(&vp9_high_dc_predictor_16x16_sse2, + &vp9_high_dc_predictor_16x16_c, 16, 8), + make_tuple(&vp9_high_v_predictor_4x4_sse, + &vp9_high_v_predictor_4x4_c, 4, 8), + make_tuple(&vp9_high_v_predictor_8x8_sse2, + &vp9_high_v_predictor_8x8_c, 8, 8), + make_tuple(&vp9_high_v_predictor_16x16_sse2, + &vp9_high_v_predictor_16x16_c, 16, 8), + make_tuple(&vp9_high_v_predictor_32x32_sse2, + &vp9_high_v_predictor_32x32_c, 32, 8), + make_tuple(&vp9_high_tm_predictor_4x4_sse, + &vp9_high_tm_predictor_4x4_c, 4, 8), + make_tuple(&vp9_high_tm_predictor_8x8_sse2, + &vp9_high_tm_predictor_8x8_c, 8, 8))); +#else +INSTANTIATE_TEST_CASE_P(SSE2_TO_C_8, VP9IntraPredTest, + ::testing::Values( + make_tuple(&vp9_high_dc_predictor_4x4_sse, + &vp9_high_dc_predictor_4x4_c, 4, 8), + make_tuple(&vp9_high_dc_predictor_8x8_sse2, + &vp9_high_dc_predictor_8x8_c, 8, 8), + make_tuple(&vp9_high_dc_predictor_16x16_sse2, + &vp9_high_dc_predictor_16x16_c, 16, 8), + make_tuple(&vp9_high_v_predictor_4x4_sse, + &vp9_high_v_predictor_4x4_c, 4, 8), + make_tuple(&vp9_high_v_predictor_8x8_sse2, + &vp9_high_v_predictor_8x8_c, 8, 8), + make_tuple(&vp9_high_v_predictor_16x16_sse2, + &vp9_high_v_predictor_16x16_c, 16, 8), + make_tuple(&vp9_high_v_predictor_32x32_sse2, + &vp9_high_v_predictor_32x32_c, 32, 8), + make_tuple(&vp9_high_tm_predictor_4x4_sse, + &vp9_high_tm_predictor_4x4_c, 4, 8), + make_tuple(&vp9_high_tm_predictor_8x8_sse2, + &vp9_high_tm_predictor_8x8_c, 8, 8))); +#endif +#if ARCH_X86_64 +INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest, + ::testing::Values( + 
make_tuple(&vp9_high_dc_predictor_32x32_sse2, + &vp9_high_dc_predictor_32x32_c, 32, 10), + make_tuple(&vp9_high_tm_predictor_16x16_sse2, + &vp9_high_tm_predictor_16x16_c, 16, 10), + make_tuple(&vp9_high_tm_predictor_32x32_sse2, + &vp9_high_tm_predictor_32x32_c, 32, 10), + make_tuple(&vp9_high_dc_predictor_4x4_sse, + &vp9_high_dc_predictor_4x4_c, 4, 10), + make_tuple(&vp9_high_dc_predictor_8x8_sse2, + &vp9_high_dc_predictor_8x8_c, 8, 10), + make_tuple(&vp9_high_dc_predictor_16x16_sse2, + &vp9_high_dc_predictor_16x16_c, 16, 10), + make_tuple(&vp9_high_v_predictor_4x4_sse, + &vp9_high_v_predictor_4x4_c, 4, 10), + make_tuple(&vp9_high_v_predictor_8x8_sse2, + &vp9_high_v_predictor_8x8_c, 8, 10), + make_tuple(&vp9_high_v_predictor_16x16_sse2, + &vp9_high_v_predictor_16x16_c, 16, 10), + make_tuple(&vp9_high_v_predictor_32x32_sse2, + &vp9_high_v_predictor_32x32_c, 32, 10), + make_tuple(&vp9_high_tm_predictor_4x4_sse, + &vp9_high_tm_predictor_4x4_c, 4, 10), + make_tuple(&vp9_high_tm_predictor_8x8_sse2, + &vp9_high_tm_predictor_8x8_c, 8, 10))); +#else +INSTANTIATE_TEST_CASE_P(SSE2_TO_C_10, VP9IntraPredTest, + ::testing::Values( + make_tuple(&vp9_high_dc_predictor_4x4_sse, + &vp9_high_dc_predictor_4x4_c, 4, 10), + make_tuple(&vp9_high_dc_predictor_8x8_sse2, + &vp9_high_dc_predictor_8x8_c, 8, 10), + make_tuple(&vp9_high_dc_predictor_16x16_sse2, + &vp9_high_dc_predictor_16x16_c, 16, 10), + make_tuple(&vp9_high_v_predictor_4x4_sse, + &vp9_high_v_predictor_4x4_c, 4, 10), + make_tuple(&vp9_high_v_predictor_8x8_sse2, + &vp9_high_v_predictor_8x8_c, 8, 10), + make_tuple(&vp9_high_v_predictor_16x16_sse2, + &vp9_high_v_predictor_16x16_c, 16, 10), + make_tuple(&vp9_high_v_predictor_32x32_sse2, + &vp9_high_v_predictor_32x32_c, 32, 10), + make_tuple(&vp9_high_tm_predictor_4x4_sse, + &vp9_high_tm_predictor_4x4_c, 4, 10), + make_tuple(&vp9_high_tm_predictor_8x8_sse2, + &vp9_high_tm_predictor_8x8_c, 8, 10))); +#endif + +#if ARCH_X86_64 +INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest, 
+ ::testing::Values( + make_tuple(&vp9_high_dc_predictor_32x32_sse2, + &vp9_high_dc_predictor_32x32_c, 32, 12), + make_tuple(&vp9_high_tm_predictor_16x16_sse2, + &vp9_high_tm_predictor_16x16_c, 16, 12), + make_tuple(&vp9_high_tm_predictor_32x32_sse2, + &vp9_high_tm_predictor_32x32_c, 32, 12), + make_tuple(&vp9_high_dc_predictor_4x4_sse, + &vp9_high_dc_predictor_4x4_c, 4, 12), + make_tuple(&vp9_high_dc_predictor_8x8_sse2, + &vp9_high_dc_predictor_8x8_c, 8, 12), + make_tuple(&vp9_high_dc_predictor_16x16_sse2, + &vp9_high_dc_predictor_16x16_c, 16, 12), + make_tuple(&vp9_high_v_predictor_4x4_sse, + &vp9_high_v_predictor_4x4_c, 4, 12), + make_tuple(&vp9_high_v_predictor_8x8_sse2, + &vp9_high_v_predictor_8x8_c, 8, 12), + make_tuple(&vp9_high_v_predictor_16x16_sse2, + &vp9_high_v_predictor_16x16_c, 16, 12), + make_tuple(&vp9_high_v_predictor_32x32_sse2, + &vp9_high_v_predictor_32x32_c, 32, 12), + make_tuple(&vp9_high_tm_predictor_4x4_sse, + &vp9_high_tm_predictor_4x4_c, 4, 12), + make_tuple(&vp9_high_tm_predictor_8x8_sse2, + &vp9_high_tm_predictor_8x8_c, 8, 12))); +#else +INSTANTIATE_TEST_CASE_P(SSE2_TO_C_12, VP9IntraPredTest, + ::testing::Values( + make_tuple(&vp9_high_dc_predictor_4x4_sse, + &vp9_high_dc_predictor_4x4_c, 4, 12), + make_tuple(&vp9_high_dc_predictor_8x8_sse2, + &vp9_high_dc_predictor_8x8_c, 8, 12), + make_tuple(&vp9_high_dc_predictor_16x16_sse2, + &vp9_high_dc_predictor_16x16_c, 16, 12), + make_tuple(&vp9_high_v_predictor_4x4_sse, + &vp9_high_v_predictor_4x4_c, 4, 12), + make_tuple(&vp9_high_v_predictor_8x8_sse2, + &vp9_high_v_predictor_8x8_c, 8, 12), + make_tuple(&vp9_high_v_predictor_16x16_sse2, + &vp9_high_v_predictor_16x16_c, 16, 12), + make_tuple(&vp9_high_v_predictor_32x32_sse2, + &vp9_high_v_predictor_32x32_c, 32, 12), + make_tuple(&vp9_high_tm_predictor_4x4_sse, + &vp9_high_tm_predictor_4x4_c, 4, 12), + make_tuple(&vp9_high_tm_predictor_8x8_sse2, + &vp9_high_tm_predictor_8x8_c, 8, 12))); +#endif +#endif // CONFIG_VP9_HIGHBITDEPTH +#endif // 
HAVE_SSE2 +} // namespace diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c index e50d3935f..769c2de6b 100644 --- a/vp8/common/postproc.c +++ b/vp8/common/postproc.c @@ -214,6 +214,7 @@ static int q2mbl(int x) x = 50 + (x - 50) * 10 / 8; return x * x / 3; } + void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit) { int r, c, i; @@ -226,14 +227,14 @@ void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int co int sumsq = 0; int sum = 0; - for (i = -8; i<0; i++) + for (i = -8; i < 0; i++) s[i]=s[0]; /* 17 avoids valgrind warning - we buffer values in c in d * and only write them when we've read 8 ahead... */ - for (i = cols; i<cols+17; i++) - s[i]=s[cols-1]; + for (i = 0; i < 17; i++) + s[i+cols]=s[cols-1]; for (i = -8; i <= 6; i++) { @@ -264,7 +265,6 @@ void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int co } } - void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit) { int r, c, i; @@ -284,8 +284,8 @@ void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, i /* 17 avoids valgrind warning - we buffer values in c in d * and only write them when we've read 8 ahead... 
*/ - for (i = rows; i < rows+17; i++) - s[i*pitch]=s[(rows-1)*pitch]; + for (i = 0; i < 17; i++) + s[(i+rows)*pitch]=s[(rows-1)*pitch]; for (i = -8; i <= 6; i++) { diff --git a/vp8/encoder/arm/neon/vp8_mse16x16_neon.c b/vp8/encoder/arm/neon/vp8_mse16x16_neon.c index 06e4f9479..f806809df 100644 --- a/vp8/encoder/arm/neon/vp8_mse16x16_neon.c +++ b/vp8/encoder/arm/neon/vp8_mse16x16_neon.c @@ -18,7 +18,7 @@ unsigned int vp8_mse16x16_neon( unsigned int *sse) { int i; int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; - int64_t d0s64; + int64x1_t d0s64; uint8x16_t q0u8, q1u8, q2u8, q3u8; int32x4_t q7s32, q8s32, q9s32, q10s32; uint16x8_t q11u16, q12u16, q13u16, q14u16; @@ -82,7 +82,7 @@ unsigned int vp8_get4x4sse_cs_neon( const unsigned char *ref_ptr, int recon_stride) { int16x4_t d22s16, d24s16, d26s16, d28s16; - int64_t d0s64; + int64x1_t d0s64; uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; int32x4_t q7s32, q8s32, q9s32, q10s32; uint16x8_t q11u16, q12u16, q13u16, q14u16; diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h index 5587192e8..8305e7fa6 100644 --- a/vp9/common/vp9_common.h +++ b/vp9/common/vp9_common.h @@ -65,6 +65,18 @@ static INLINE int get_unsigned_bits(unsigned int num_values) { } #if CONFIG_VP9_HIGHBITDEPTH +static INLINE uint16_t clip_pixel_high(int val, int bd) { + switch (bd) { + case 8: + default: + return (uint16_t)clamp(val, 0, 255); + case 10: + return (uint16_t)clamp(val, 0, 1023); + case 12: + return (uint16_t)clamp(val, 0, 4095); + } +} + #define CONVERT_TO_SHORTPTR(x) ((uint16_t*)(((uintptr_t)x) << 1)) #define CONVERT_TO_BYTEPTR(x) ((uint8_t*)(((uintptr_t)x) >> 1 )) #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/common/vp9_frame_buffers.c b/vp9/common/vp9_frame_buffers.c index 733b3a927..34795b74e 100644 --- a/vp9/common/vp9_frame_buffers.c +++ b/vp9/common/vp9_frame_buffers.c @@ -61,6 +61,10 @@ int vp9_get_frame_buffer(void *cb_priv, size_t min_size, if (!int_fb_list->int_fb[i].data) return 
-1; + // This memset is needed for fixing valgrind error from C loop filter + // due to access uninitialized memory in frame border. It could be + // removed if border is totally removed. + vpx_memset(int_fb_list->int_fb[i].data, 0, min_size); int_fb_list->int_fb[i].size = min_size; } diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c index 3332e58e6..564a3eb0c 100644 --- a/vp9/common/vp9_quant_common.c +++ b/vp9/common/vp9_quant_common.c @@ -47,6 +47,78 @@ static const int16_t dc_qlookup[QINDEX_RANGE] = { 1022, 1058, 1098, 1139, 1184, 1232, 1282, 1336, }; +#if CONFIG_VP9_HIGHBITDEPTH +static const int16_t dc_qlookup_10[QINDEX_RANGE] = { + 4, 9, 10, 13, 15, 17, 20, 22, + 25, 28, 31, 34, 37, 40, 43, 47, + 50, 53, 57, 60, 64, 68, 71, 75, + 78, 82, 86, 90, 93, 97, 101, 105, + 109, 113, 116, 120, 124, 128, 132, 136, + 140, 143, 147, 151, 155, 159, 163, 166, + 170, 174, 178, 182, 185, 189, 193, 197, + 200, 204, 208, 212, 215, 219, 223, 226, + 230, 233, 237, 241, 244, 248, 251, 255, + 259, 262, 266, 269, 273, 276, 280, 283, + 287, 290, 293, 297, 300, 304, 307, 310, + 314, 317, 321, 324, 327, 331, 334, 337, + 343, 350, 356, 362, 369, 375, 381, 387, + 394, 400, 406, 412, 418, 424, 430, 436, + 442, 448, 454, 460, 466, 472, 478, 484, + 490, 499, 507, 516, 525, 533, 542, 550, + 559, 567, 576, 584, 592, 601, 609, 617, + 625, 634, 644, 655, 666, 676, 687, 698, + 708, 718, 729, 739, 749, 759, 770, 782, + 795, 807, 819, 831, 844, 856, 868, 880, + 891, 906, 920, 933, 947, 961, 975, 988, + 1001, 1015, 1030, 1045, 1061, 1076, 1090, 1105, + 1120, 1137, 1153, 1170, 1186, 1202, 1218, 1236, + 1253, 1271, 1288, 1306, 1323, 1342, 1361, 1379, + 1398, 1416, 1436, 1456, 1476, 1496, 1516, 1537, + 1559, 1580, 1601, 1624, 1647, 1670, 1692, 1717, + 1741, 1766, 1791, 1817, 1844, 1871, 1900, 1929, + 1958, 1990, 2021, 2054, 2088, 2123, 2159, 2197, + 2236, 2276, 2319, 2363, 2410, 2458, 2508, 2561, + 2616, 2675, 2737, 2802, 2871, 2944, 3020, 3102, + 3188, 3280, 3375, 3478, 
3586, 3702, 3823, 3953, + 4089, 4236, 4394, 4559, 4737, 4929, 5130, 5347, +}; + +static const int16_t dc_qlookup_12[QINDEX_RANGE] = { + 4, 12, 18, 25, 33, 41, 50, 60, + 70, 80, 91, 103, 115, 127, 140, 153, + 166, 180, 194, 208, 222, 237, 251, 266, + 281, 296, 312, 327, 343, 358, 374, 390, + 405, 421, 437, 453, 469, 484, 500, 516, + 532, 548, 564, 580, 596, 611, 627, 643, + 659, 674, 690, 706, 721, 737, 752, 768, + 783, 798, 814, 829, 844, 859, 874, 889, + 904, 919, 934, 949, 964, 978, 993, 1008, + 1022, 1037, 1051, 1065, 1080, 1094, 1108, 1122, + 1136, 1151, 1165, 1179, 1192, 1206, 1220, 1234, + 1248, 1261, 1275, 1288, 1302, 1315, 1329, 1342, + 1368, 1393, 1419, 1444, 1469, 1494, 1519, 1544, + 1569, 1594, 1618, 1643, 1668, 1692, 1717, 1741, + 1765, 1789, 1814, 1838, 1862, 1885, 1909, 1933, + 1957, 1992, 2027, 2061, 2096, 2130, 2165, 2199, + 2233, 2267, 2300, 2334, 2367, 2400, 2434, 2467, + 2499, 2532, 2575, 2618, 2661, 2704, 2746, 2788, + 2830, 2872, 2913, 2954, 2995, 3036, 3076, 3127, + 3177, 3226, 3275, 3324, 3373, 3421, 3469, 3517, + 3565, 3621, 3677, 3733, 3788, 3843, 3897, 3951, + 4005, 4058, 4119, 4181, 4241, 4301, 4361, 4420, + 4479, 4546, 4612, 4677, 4742, 4807, 4871, 4942, + 5013, 5083, 5153, 5222, 5291, 5367, 5442, 5517, + 5591, 5665, 5745, 5825, 5905, 5984, 6063, 6149, + 6234, 6319, 6404, 6495, 6587, 6678, 6769, 6867, + 6966, 7064, 7163, 7269, 7376, 7483, 7599, 7715, + 7832, 7958, 8085, 8214, 8352, 8492, 8635, 8788, + 8945, 9104, 9275, 9450, 9639, 9832, 10031, 10245, + 10465, 10702, 10946, 11210, 11482, 11776, 12081, 12409, + 12750, 13118, 13501, 13913, 14343, 14807, 15290, 15812, + 16356, 16943, 17575, 18237, 18949, 19718, 20521, 21387, +}; +#endif + static const int16_t ac_qlookup[QINDEX_RANGE] = { 4, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, @@ -82,15 +154,116 @@ static const int16_t ac_qlookup[QINDEX_RANGE] = { 1597, 1628, 1660, 1692, 1725, 1759, 1793, 1828, }; -int16_t vp9_dc_quant(int qindex, int delta) { +#if 
CONFIG_VP9_HIGHBITDEPTH +static const int16_t ac_qlookup_10[QINDEX_RANGE] = { + 4, 9, 11, 13, 16, 18, 21, 24, + 27, 30, 33, 37, 40, 44, 48, 51, + 55, 59, 63, 67, 71, 75, 79, 83, + 88, 92, 96, 100, 105, 109, 114, 118, + 122, 127, 131, 136, 140, 145, 149, 154, + 158, 163, 168, 172, 177, 181, 186, 190, + 195, 199, 204, 208, 213, 217, 222, 226, + 231, 235, 240, 244, 249, 253, 258, 262, + 267, 271, 275, 280, 284, 289, 293, 297, + 302, 306, 311, 315, 319, 324, 328, 332, + 337, 341, 345, 349, 354, 358, 362, 367, + 371, 375, 379, 384, 388, 392, 396, 401, + 409, 417, 425, 433, 441, 449, 458, 466, + 474, 482, 490, 498, 506, 514, 523, 531, + 539, 547, 555, 563, 571, 579, 588, 596, + 604, 616, 628, 640, 652, 664, 676, 688, + 700, 713, 725, 737, 749, 761, 773, 785, + 797, 809, 825, 841, 857, 873, 889, 905, + 922, 938, 954, 970, 986, 1002, 1018, 1038, + 1058, 1078, 1098, 1118, 1138, 1158, 1178, 1198, + 1218, 1242, 1266, 1290, 1314, 1338, 1362, 1386, + 1411, 1435, 1463, 1491, 1519, 1547, 1575, 1603, + 1631, 1663, 1695, 1727, 1759, 1791, 1823, 1859, + 1895, 1931, 1967, 2003, 2039, 2079, 2119, 2159, + 2199, 2239, 2283, 2327, 2371, 2415, 2459, 2507, + 2555, 2603, 2651, 2703, 2755, 2807, 2859, 2915, + 2971, 3027, 3083, 3143, 3203, 3263, 3327, 3391, + 3455, 3523, 3591, 3659, 3731, 3803, 3876, 3952, + 4028, 4104, 4184, 4264, 4348, 4432, 4516, 4604, + 4692, 4784, 4876, 4972, 5068, 5168, 5268, 5372, + 5476, 5584, 5692, 5804, 5916, 6032, 6148, 6268, + 6388, 6512, 6640, 6768, 6900, 7036, 7172, 7312, +}; + +static const int16_t ac_qlookup_12[QINDEX_RANGE] = { + 4, 13, 19, 27, 35, 44, 54, 64, + 75, 87, 99, 112, 126, 139, 154, 168, + 183, 199, 214, 230, 247, 263, 280, 297, + 314, 331, 349, 366, 384, 402, 420, 438, + 456, 475, 493, 511, 530, 548, 567, 586, + 604, 623, 642, 660, 679, 698, 716, 735, + 753, 772, 791, 809, 828, 846, 865, 884, + 902, 920, 939, 957, 976, 994, 1012, 1030, + 1049, 1067, 1085, 1103, 1121, 1139, 1157, 1175, + 1193, 1211, 1229, 1246, 1264, 1282, 1299, 1317, + 1335, 1352, 
1370, 1387, 1405, 1422, 1440, 1457, + 1474, 1491, 1509, 1526, 1543, 1560, 1577, 1595, + 1627, 1660, 1693, 1725, 1758, 1791, 1824, 1856, + 1889, 1922, 1954, 1987, 2020, 2052, 2085, 2118, + 2150, 2183, 2216, 2248, 2281, 2313, 2346, 2378, + 2411, 2459, 2508, 2556, 2605, 2653, 2701, 2750, + 2798, 2847, 2895, 2943, 2992, 3040, 3088, 3137, + 3185, 3234, 3298, 3362, 3426, 3491, 3555, 3619, + 3684, 3748, 3812, 3876, 3941, 4005, 4069, 4149, + 4230, 4310, 4390, 4470, 4550, 4631, 4711, 4791, + 4871, 4967, 5064, 5160, 5256, 5352, 5448, 5544, + 5641, 5737, 5849, 5961, 6073, 6185, 6297, 6410, + 6522, 6650, 6778, 6906, 7034, 7162, 7290, 7435, + 7579, 7723, 7867, 8011, 8155, 8315, 8475, 8635, + 8795, 8956, 9132, 9308, 9484, 9660, 9836, 10028, + 10220, 10412, 10604, 10812, 11020, 11228, 11437, 11661, + 11885, 12109, 12333, 12573, 12813, 13053, 13309, 13565, + 13821, 14093, 14365, 14637, 14925, 15213, 15502, 15806, + 16110, 16414, 16734, 17054, 17390, 17726, 18062, 18414, + 18766, 19134, 19502, 19886, 20270, 20670, 21070, 21486, + 21902, 22334, 22766, 23214, 23662, 24126, 24590, 25070, + 25551, 26047, 26559, 27071, 27599, 28143, 28687, 29247, +}; +#endif + +int16_t vp9_dc_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) { +#if CONFIG_VP9_HIGHBITDEPTH + switch (bit_depth) { + case VPX_BITS_8: + return dc_qlookup[clamp(qindex + delta, 0, MAXQ)]; + case VPX_BITS_10: + return dc_qlookup_10[clamp(qindex + delta, 0, MAXQ)]; + case VPX_BITS_12: + return dc_qlookup_12[clamp(qindex + delta, 0, MAXQ)]; + default: + assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; + } +#else + (void) bit_depth; return dc_qlookup[clamp(qindex + delta, 0, MAXQ)]; +#endif } -int16_t vp9_ac_quant(int qindex, int delta) { +int16_t vp9_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth) { +#if CONFIG_VP9_HIGHBITDEPTH + switch (bit_depth) { + case VPX_BITS_8: + return ac_qlookup[clamp(qindex + delta, 0, MAXQ)]; + case VPX_BITS_10: + return 
ac_qlookup_10[clamp(qindex + delta, 0, MAXQ)]; + case VPX_BITS_12: + return ac_qlookup_12[clamp(qindex + delta, 0, MAXQ)]; + default: + assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; + } +#else + (void) bit_depth; return ac_qlookup[clamp(qindex + delta, 0, MAXQ)]; +#endif } - int vp9_get_qindex(const struct segmentation *seg, int segment_id, int base_qindex) { if (vp9_segfeature_active(seg, segment_id, SEG_LVL_ALT_Q)) { diff --git a/vp9/common/vp9_quant_common.h b/vp9/common/vp9_quant_common.h index d1545d93c..b6266059d 100644 --- a/vp9/common/vp9_quant_common.h +++ b/vp9/common/vp9_quant_common.h @@ -11,6 +11,7 @@ #ifndef VP9_COMMON_VP9_QUANT_COMMON_H_ #define VP9_COMMON_VP9_QUANT_COMMON_H_ +#include "vpx/vpx_codec.h" #include "vp9/common/vp9_blockd.h" #ifdef __cplusplus @@ -22,8 +23,8 @@ extern "C" { #define QINDEX_RANGE (MAXQ - MINQ + 1) #define QINDEX_BITS 8 -int16_t vp9_dc_quant(int qindex, int delta); -int16_t vp9_ac_quant(int qindex, int delta); +int16_t vp9_dc_quant(int qindex, int delta, vpx_bit_depth_t bit_depth); +int16_t vp9_ac_quant(int qindex, int delta, vpx_bit_depth_t bit_depth); int vp9_get_qindex(const struct segmentation *seg, int segment_id, int base_qindex); diff --git a/vp9/common/vp9_reconintra.c b/vp9/common/vp9_reconintra.c index 471929aea..7ebd2ea87 100644 --- a/vp9/common/vp9_reconintra.c +++ b/vp9/common/vp9_reconintra.c @@ -40,11 +40,289 @@ const TX_TYPE intra_mode_to_tx_type_lookup[INTRA_MODES] = { type##_predictor(dst, stride, size, above, left); \ } +#if CONFIG_VP9_HIGHBITDEPTH +#define intra_pred_high_sized(type, size) \ + void vp9_high_##type##_predictor_##size##x##size##_c( \ + uint16_t *dst, ptrdiff_t stride, const uint16_t *above, \ + const uint16_t *left, int bd) { \ + high_##type##_predictor(dst, stride, size, above, left, bd); \ + } + +#define intra_pred_allsizes(type) \ + intra_pred_sized(type, 4) \ + intra_pred_sized(type, 8) \ + intra_pred_sized(type, 16) \ + intra_pred_sized(type, 
32) \ + intra_pred_high_sized(type, 4) \ + intra_pred_high_sized(type, 8) \ + intra_pred_high_sized(type, 16) \ + intra_pred_high_sized(type, 32) + +#else + #define intra_pred_allsizes(type) \ intra_pred_sized(type, 4) \ intra_pred_sized(type, 8) \ intra_pred_sized(type, 16) \ intra_pred_sized(type, 32) +#endif // CONFIG_VP9_HIGHBITDEPTH + +#if CONFIG_VP9_HIGHBITDEPTH +static INLINE void high_d207_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void) above; + (void) bd; + + // First column. + for (r = 0; r < bs - 1; ++r) { + dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1], 1); + } + dst[(bs - 1) * stride] = left[bs - 1]; + dst++; + + // Second column. + for (r = 0; r < bs - 2; ++r) { + dst[r * stride] = ROUND_POWER_OF_TWO(left[r] + left[r + 1] * 2 + + left[r + 2], 2); + } + dst[(bs - 2) * stride] = ROUND_POWER_OF_TWO(left[bs - 2] + + left[bs - 1] * 3, 2); + dst[(bs - 1) * stride] = left[bs - 1]; + dst++; + + // Rest of last row. + for (c = 0; c < bs - 2; ++c) + dst[(bs - 1) * stride + c] = left[bs - 1]; + + for (r = bs - 2; r >= 0; --r) { + for (c = 0; c < bs - 2; ++c) + dst[r * stride + c] = dst[(r + 1) * stride + c - 2]; + } +} + +static INLINE void high_d63_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void) left; + (void) bd; + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = r & 1 ? ROUND_POWER_OF_TWO(above[r/2 + c] + + above[r/2 + c + 1] * 2 + + above[r/2 + c + 2], 2) + : ROUND_POWER_OF_TWO(above[r/2 + c] + + above[r/2 + c + 1], 1); + } + dst += stride; + } +} + +static INLINE void high_d45_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void) left; + (void) bd; + for (r = 0; r < bs; ++r) { + for (c = 0; c < bs; ++c) { + dst[c] = r + c + 2 < bs * 2 ? 
ROUND_POWER_OF_TWO(above[r + c] + + above[r + c + 1] * 2 + + above[r + c + 2], 2) + : above[bs * 2 - 1]; + } + dst += stride; + } +} + +static INLINE void high_d117_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void) bd; + + // first row + for (c = 0; c < bs; c++) + dst[c] = ROUND_POWER_OF_TWO(above[c - 1] + above[c], 1); + dst += stride; + + // second row + dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2); + for (c = 1; c < bs; c++) + dst[c] = ROUND_POWER_OF_TWO(above[c - 2] + above[c - 1] * 2 + above[c], 2); + dst += stride; + + // the rest of first col + dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2); + for (r = 3; r < bs; ++r) + dst[(r - 2) * stride] = ROUND_POWER_OF_TWO(left[r - 3] + left[r - 2] * 2 + + left[r - 1], 2); + + // the rest of the block + for (r = 2; r < bs; ++r) { + for (c = 1; c < bs; c++) + dst[c] = dst[-2 * stride + c - 1]; + dst += stride; + } +} + +static INLINE void high_d135_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void) bd; + dst[0] = ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2); + for (c = 1; c < bs; c++) + dst[c] = ROUND_POWER_OF_TWO(above[c - 2] + above[c - 1] * 2 + above[c], 2); + + dst[stride] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2); + for (r = 2; r < bs; ++r) + dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 2] + left[r - 1] * 2 + + left[r], 2); + + dst += stride; + for (r = 1; r < bs; ++r) { + for (c = 1; c < bs; c++) + dst[c] = dst[-stride + c - 1]; + dst += stride; + } +} + +static INLINE void high_d153_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + (void) bd; + dst[0] = ROUND_POWER_OF_TWO(above[-1] + left[0], 1); + for (r = 1; r < bs; r++) + dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 1] + left[r], 1); + dst++; + + dst[0] = 
ROUND_POWER_OF_TWO(left[0] + above[-1] * 2 + above[0], 2); + dst[stride] = ROUND_POWER_OF_TWO(above[-1] + left[0] * 2 + left[1], 2); + for (r = 2; r < bs; r++) + dst[r * stride] = ROUND_POWER_OF_TWO(left[r - 2] + left[r - 1] * 2 + + left[r], 2); + dst++; + + for (c = 0; c < bs - 2; c++) + dst[c] = ROUND_POWER_OF_TWO(above[c - 1] + above[c] * 2 + above[c + 1], 2); + dst += stride; + + for (r = 1; r < bs; ++r) { + for (c = 0; c < bs - 2; c++) + dst[c] = dst[-stride + c - 2]; + dst += stride; + } +} + +static INLINE void high_v_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void) left; + (void) bd; + for (r = 0; r < bs; r++) { + vpx_memcpy(dst, above, bs * sizeof(uint16_t)); + dst += stride; + } +} + +static INLINE void high_h_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, const uint16_t *left, + int bd) { + int r; + (void) above; + (void) bd; + for (r = 0; r < bs; r++) { + vpx_memset16(dst, left[r], bs); + dst += stride; + } +} + +static INLINE void high_tm_predictor(uint16_t *dst, ptrdiff_t stride, int bs, + const uint16_t *above, + const uint16_t *left, int bd) { + int r, c; + int ytop_left = above[-1]; + (void) bd; + + for (r = 0; r < bs; r++) { + for (c = 0; c < bs; c++) + dst[c] = clip_pixel_high(left[r] + above[c] - ytop_left, bd); + dst += stride; + } +} + +static INLINE void high_dc_128_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int r; + (void) above; + (void) left; + + for (r = 0; r < bs; r++) { + vpx_memset16(dst, 128 << (bd - 8), bs); + dst += stride; + } +} + +static INLINE void high_dc_left_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + (void) above; + (void) bd; + + for (i = 0; i < bs; i++) + sum += left[i]; + expected_dc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) { + 
vpx_memset16(dst, expected_dc, bs); + dst += stride; + } +} + +static INLINE void high_dc_top_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + (void) left; + (void) bd; + + for (i = 0; i < bs; i++) + sum += above[i]; + expected_dc = (sum + (bs >> 1)) / bs; + + for (r = 0; r < bs; r++) { + vpx_memset16(dst, expected_dc, bs); + dst += stride; + } +} + +static INLINE void high_dc_predictor(uint16_t *dst, ptrdiff_t stride, + int bs, const uint16_t *above, + const uint16_t *left, int bd) { + int i, r, expected_dc, sum = 0; + const int count = 2 * bs; + (void) bd; + + for (i = 0; i < bs; i++) { + sum += above[i]; + sum += left[i]; + } + + expected_dc = (sum + (count >> 1)) / count; + + for (r = 0; r < bs; r++) { + vpx_memset16(dst, expected_dc, bs); + dst += stride; + } +} +#endif // CONFIG_VP9_HIGHBITDEPTH static INLINE void d207_predictor(uint8_t *dst, ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) { @@ -293,6 +571,14 @@ typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride, static intra_pred_fn pred[INTRA_MODES][TX_SIZES]; static intra_pred_fn dc_pred[2][2][TX_SIZES]; +#if CONFIG_VP9_HIGHBITDEPTH +typedef void (*intra_high_pred_fn)(uint16_t *dst, ptrdiff_t stride, + const uint16_t *above, const uint16_t *left, + int bd); +static intra_high_pred_fn pred_high[INTRA_MODES][4]; +static intra_high_pred_fn dc_pred_high[2][2][4]; +#endif // CONFIG_VP9_HIGHBITDEPTH + void vp9_init_intra_predictors() { #define INIT_ALL_SIZES(p, type) \ p[TX_4X4] = vp9_##type##_predictor_4x4; \ @@ -315,8 +601,163 @@ void vp9_init_intra_predictors() { INIT_ALL_SIZES(dc_pred[1][0], dc_left); INIT_ALL_SIZES(dc_pred[1][1], dc); -#undef INIT_ALL_SIZES +#if CONFIG_VP9_HIGHBITDEPTH + INIT_ALL_SIZES(pred_high[V_PRED], high_v); + INIT_ALL_SIZES(pred_high[H_PRED], high_h); + INIT_ALL_SIZES(pred_high[D207_PRED], high_d207); + INIT_ALL_SIZES(pred_high[D45_PRED], high_d45); + 
INIT_ALL_SIZES(pred_high[D63_PRED], high_d63); + INIT_ALL_SIZES(pred_high[D117_PRED], high_d117); + INIT_ALL_SIZES(pred_high[D135_PRED], high_d135); + INIT_ALL_SIZES(pred_high[D153_PRED], high_d153); + INIT_ALL_SIZES(pred_high[TM_PRED], high_tm); + + INIT_ALL_SIZES(dc_pred_high[0][0], high_dc_128); + INIT_ALL_SIZES(dc_pred_high[0][1], high_dc_top); + INIT_ALL_SIZES(dc_pred_high[1][0], high_dc_left); + INIT_ALL_SIZES(dc_pred_high[1][1], high_dc); +#endif // CONFIG_VP9_HIGHBITDEPTH + +#undef intra_pred_allsizes +} + +#if CONFIG_VP9_HIGHBITDEPTH +static void build_intra_predictors_high(const MACROBLOCKD *xd, + const uint8_t *ref8, + int ref_stride, + uint8_t *dst8, + int dst_stride, + PREDICTION_MODE mode, + TX_SIZE tx_size, + int up_available, + int left_available, + int right_available, + int x, int y, + int plane, int bd) { + int i; + uint16_t *dst = CONVERT_TO_SHORTPTR(dst8); + uint16_t *ref = CONVERT_TO_SHORTPTR(ref8); + DECLARE_ALIGNED_ARRAY(16, uint16_t, left_col, 64); + DECLARE_ALIGNED_ARRAY(16, uint16_t, above_data, 128 + 16); + uint16_t *above_row = above_data + 16; + const uint16_t *const_above_row = above_row; + const int bs = 4 << tx_size; + int frame_width, frame_height; + int x0, y0; + const struct macroblockd_plane *const pd = &xd->plane[plane]; + // int base=128; + int base = 128 << (bd - 8); + // 127 127 127 .. 127 127 127 127 127 127 + // 129 A B .. Y Z + // 129 C D .. W X + // 129 E F .. U V + // 129 G H .. S T T T T T + + // Get current frame pointer, width and height. + if (plane == 0) { + frame_width = xd->cur_buf->y_width; + frame_height = xd->cur_buf->y_height; + } else { + frame_width = xd->cur_buf->uv_width; + frame_height = xd->cur_buf->uv_height; + } + + // Get block position in current frame. 
+ x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x; + y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y; + + // left + if (left_available) { + if (xd->mb_to_bottom_edge < 0) { + /* slower path if the block needs border extension */ + if (y0 + bs <= frame_height) { + for (i = 0; i < bs; ++i) + left_col[i] = ref[i * ref_stride - 1]; + } else { + const int extend_bottom = frame_height - y0; + for (i = 0; i < extend_bottom; ++i) + left_col[i] = ref[i * ref_stride - 1]; + for (; i < bs; ++i) + left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1]; + } + } else { + /* faster path if the block does not need extension */ + for (i = 0; i < bs; ++i) + left_col[i] = ref[i * ref_stride - 1]; + } + } else { + // TODO(Peter): this value should probably change for high bitdepth + vpx_memset16(left_col, base + 1, bs); + } + + // TODO(hkuang) do not extend 2*bs pixels for all modes. + // above + if (up_available) { + const uint16_t *above_ref = ref - ref_stride; + if (xd->mb_to_right_edge < 0) { + /* slower path if the block needs border extension */ + if (x0 + 2 * bs <= frame_width) { + if (right_available && bs == 4) { + vpx_memcpy(above_row, above_ref, 2 * bs * sizeof(uint16_t)); + } else { + vpx_memcpy(above_row, above_ref, bs * sizeof(uint16_t)); + vpx_memset16(above_row + bs, above_row[bs - 1], bs); + } + } else if (x0 + bs <= frame_width) { + const int r = frame_width - x0; + if (right_available && bs == 4) { + vpx_memcpy(above_row, above_ref, r * sizeof(uint16_t)); + vpx_memset16(above_row + r, above_row[r - 1], + x0 + 2 * bs - frame_width); + } else { + vpx_memcpy(above_row, above_ref, bs * sizeof(uint16_t)); + vpx_memset16(above_row + bs, above_row[bs - 1], bs); + } + } else if (x0 <= frame_width) { + const int r = frame_width - x0; + if (right_available && bs == 4) { + vpx_memcpy(above_row, above_ref, r * sizeof(uint16_t)); + vpx_memset16(above_row + r, above_row[r - 1], + x0 + 2 * bs - frame_width); + } else { + vpx_memcpy(above_row, above_ref, r * 
sizeof(uint16_t)); + vpx_memset16(above_row + r, above_row[r - 1], + x0 + 2 * bs - frame_width); + } + } + // TODO(Peter) this value should probably change for high bitdepth + above_row[-1] = left_available ? above_ref[-1] : (base+1); + } else { + /* faster path if the block does not need extension */ + if (bs == 4 && right_available && left_available) { + const_above_row = above_ref; + } else { + vpx_memcpy(above_row, above_ref, bs * sizeof(uint16_t)); + if (bs == 4 && right_available) + vpx_memcpy(above_row + bs, above_ref + bs, bs * sizeof(uint16_t)); + else + vpx_memset16(above_row + bs, above_row[bs - 1], bs); + // TODO(Peter): this value should probably change for high bitdepth + above_row[-1] = left_available ? above_ref[-1] : (base+1); + } + } + } else { + vpx_memset16(above_row, base - 1, bs * 2); + // TODO(Peter): this value should probably change for high bitdepth + above_row[-1] = base - 1; + } + + // predict + if (mode == DC_PRED) { + dc_pred_high[left_available][up_available][tx_size](dst, dst_stride, + const_above_row, + left_col, xd->bd); + } else { + pred_high[mode][tx_size](dst, dst_stride, const_above_row, left_col, + xd->bd); + } } +#endif // CONFIG_VP9_HIGHBITDEPTH static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref, int ref_stride, uint8_t *dst, int dst_stride, @@ -454,6 +895,14 @@ void vp9_predict_intra_block(const MACROBLOCKD *xd, int block_idx, int bwl_in, const int y = loff * 4; assert(bwl >= 0); +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + build_intra_predictors_high(xd, ref, ref_stride, dst, dst_stride, mode, + tx_size, have_top, have_left, have_right, + x, y, plane, xd->bd); + return; + } +#endif build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size, have_top, have_left, have_right, x, y, plane); } diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index c695a5dc0..b75ea64f0 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ 
b/vp9/common/vp9_rtcd_defs.pl @@ -445,61 +445,219 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { specialize qw/vp9_iwht4x4_16_add/; } - # High bitdepth functions if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { -# -# dct -# -add_proto qw/void vp9_high_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; -specialize qw/vp9_high_idct4x4_1_add/; + # + # Intra prediction + # + add_proto qw/void vp9_high_d207_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d207_predictor_4x4/; + + add_proto qw/void vp9_high_d45_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d45_predictor_4x4/; + + add_proto qw/void vp9_high_d63_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d63_predictor_4x4/; + + add_proto qw/void vp9_high_h_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_h_predictor_4x4/; + + add_proto qw/void vp9_high_d117_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d117_predictor_4x4/; + + add_proto qw/void vp9_high_d135_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d135_predictor_4x4/; + + add_proto qw/void vp9_high_d153_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d153_predictor_4x4/; + + add_proto qw/void vp9_high_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_v_predictor_4x4 neon/, "$sse_x86inc"; + + add_proto qw/void vp9_high_tm_predictor_4x4/, "uint16_t *dst, 
ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_tm_predictor_4x4/, "$sse_x86inc"; + + add_proto qw/void vp9_high_dc_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_predictor_4x4/, "$sse_x86inc"; + + add_proto qw/void vp9_high_dc_top_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_top_predictor_4x4/; + + add_proto qw/void vp9_high_dc_left_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_left_predictor_4x4/; + + add_proto qw/void vp9_high_dc_128_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_128_predictor_4x4/; + + add_proto qw/void vp9_high_d207_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d207_predictor_8x8/; + + add_proto qw/void vp9_high_d45_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d45_predictor_8x8/; + + add_proto qw/void vp9_high_d63_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d63_predictor_8x8/; + + add_proto qw/void vp9_high_h_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_h_predictor_8x8/; + + add_proto qw/void vp9_high_d117_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d117_predictor_8x8/; + + add_proto qw/void vp9_high_d135_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + 
specialize qw/vp9_high_d135_predictor_8x8/; + + add_proto qw/void vp9_high_d153_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d153_predictor_8x8/; + + add_proto qw/void vp9_high_v_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_v_predictor_8x8/, "$sse2_x86inc"; + + add_proto qw/void vp9_high_tm_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_tm_predictor_8x8/, "$sse2_x86inc"; + + add_proto qw/void vp9_high_dc_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_predictor_8x8/, "$sse2_x86inc";; + + add_proto qw/void vp9_high_dc_top_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_top_predictor_8x8/; + + add_proto qw/void vp9_high_dc_left_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_left_predictor_8x8/; + + add_proto qw/void vp9_high_dc_128_predictor_8x8/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_128_predictor_8x8/; -add_proto qw/void vp9_high_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; -specialize qw/vp9_high_idct4x4_16_add/; + add_proto qw/void vp9_high_d207_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d207_predictor_16x16/; -add_proto qw/void vp9_high_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; -specialize qw/vp9_high_idct8x8_1_add/; + add_proto qw/void vp9_high_d45_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, 
const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d45_predictor_16x16/; -add_proto qw/void vp9_high_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; -specialize qw/vp9_high_idct8x8_64_add/; + add_proto qw/void vp9_high_d63_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d63_predictor_16x16/; -add_proto qw/void vp9_high_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; -specialize qw/vp9_high_idct8x8_10_add/; + add_proto qw/void vp9_high_h_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_h_predictor_16x16/; -add_proto qw/void vp9_high_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; -specialize qw/vp9_high_idct16x16_1_add/; + add_proto qw/void vp9_high_d117_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d117_predictor_16x16/; -add_proto qw/void vp9_high_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; -specialize qw/vp9_high_idct16x16_256_add/; + add_proto qw/void vp9_high_d135_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d135_predictor_16x16/; -add_proto qw/void vp9_high_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; -specialize qw/vp9_high_idct16x16_10_add/; + add_proto qw/void vp9_high_d153_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d153_predictor_16x16/; -add_proto qw/void vp9_high_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; -specialize qw/vp9_high_idct32x32_1024_add/; + add_proto 
qw/void vp9_high_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_v_predictor_16x16 neon/, "$sse2_x86inc"; -add_proto qw/void vp9_high_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; -specialize qw/vp9_high_idct32x32_34_add/; + add_proto qw/void vp9_high_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_tm_predictor_16x16/, "$sse2_x86_64"; -add_proto qw/void vp9_high_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; -specialize qw/vp9_high_idct32x32_1_add/; + add_proto qw/void vp9_high_dc_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_predictor_16x16/, "$sse2_x86inc"; -add_proto qw/void vp9_high_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; -specialize qw/vp9_high_iht4x4_16_add/; + add_proto qw/void vp9_high_dc_top_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_top_predictor_16x16/; -add_proto qw/void vp9_high_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; -specialize qw/vp9_high_iht8x8_64_add/; + add_proto qw/void vp9_high_dc_left_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_left_predictor_16x16/; -add_proto qw/void vp9_high_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd"; -specialize qw/vp9_high_iht16x16_256_add/; + add_proto qw/void vp9_high_dc_128_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_128_predictor_16x16/; -# dct 
and add + add_proto qw/void vp9_high_d207_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d207_predictor_32x32/; + + add_proto qw/void vp9_high_d45_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d45_predictor_32x32/; + + add_proto qw/void vp9_high_d63_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d63_predictor_32x32/; + + add_proto qw/void vp9_high_h_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_h_predictor_32x32/; + + add_proto qw/void vp9_high_d117_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d117_predictor_32x32/; + + add_proto qw/void vp9_high_d135_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d135_predictor_32x32/; + + add_proto qw/void vp9_high_d153_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_d153_predictor_32x32/; + + add_proto qw/void vp9_high_v_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_v_predictor_32x32/, "$sse2_x86inc"; + + add_proto qw/void vp9_high_tm_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_tm_predictor_32x32/, "$sse2_x86_64"; + + add_proto qw/void vp9_high_dc_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_predictor_32x32/, "$sse2_x86_64"; + + add_proto qw/void 
vp9_high_dc_top_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_top_predictor_32x32/; + + add_proto qw/void vp9_high_dc_left_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_left_predictor_32x32/; + + add_proto qw/void vp9_high_dc_128_predictor_32x32/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bps"; + specialize qw/vp9_high_dc_128_predictor_32x32/; + + # + # dct + # + add_proto qw/void vp9_high_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_high_idct4x4_1_add/; + + add_proto qw/void vp9_high_idct4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_high_idct4x4_16_add/; + + add_proto qw/void vp9_high_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_high_idct8x8_1_add/; + + add_proto qw/void vp9_high_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_high_idct8x8_64_add/; + + add_proto qw/void vp9_high_idct8x8_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_high_idct8x8_10_add/; + + add_proto qw/void vp9_high_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_high_idct16x16_1_add/; + + add_proto qw/void vp9_high_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_high_idct16x16_256_add/; + + add_proto qw/void vp9_high_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_high_idct16x16_10_add/; + + add_proto qw/void vp9_high_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize 
qw/vp9_high_idct32x32_1024_add/; + + add_proto qw/void vp9_high_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_high_idct32x32_34_add/; + + add_proto qw/void vp9_high_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_high_idct32x32_1_add/; + + add_proto qw/void vp9_high_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; + specialize qw/vp9_high_iht4x4_16_add/; + + add_proto qw/void vp9_high_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type, int bd"; + specialize qw/vp9_high_iht8x8_64_add/; + + add_proto qw/void vp9_high_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type, int bd"; + specialize qw/vp9_high_iht16x16_256_add/; + + # dct and add -add_proto qw/void vp9_high_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; -specialize qw/vp9_high_iwht4x4_1_add/; + add_proto qw/void vp9_high_iwht4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_high_iwht4x4_1_add/; -add_proto qw/void vp9_high_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; -specialize qw/vp9_high_iwht4x4_16_add/; + add_proto qw/void vp9_high_iwht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int bd"; + specialize qw/vp9_high_iwht4x4_16_add/; } # diff --git a/vp9/common/x86/vp9_high_intrapred_sse2.asm b/vp9/common/x86/vp9_high_intrapred_sse2.asm new file mode 100644 index 000000000..ff450711e --- /dev/null +++ b/vp9/common/x86/vp9_high_intrapred_sse2.asm @@ -0,0 +1,476 @@ +; +; Copyright (c) 2014 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. 
An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_4: times 8 dw 4 +pw_8: times 8 dw 8 +pw_16: times 4 dd 16 +pw_32: times 4 dd 32 + +SECTION .text +INIT_MMX sse +cglobal high_dc_predictor_4x4, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + movq m0, [aboveq] + movq m2, [leftq] + DEFINE_ARGS dst, stride, one + mov oned, 0x0001 + pxor m1, m1 + movd m3, oned + pshufw m3, m3, 0x0 + paddw m0, m2 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + paddw m0, [GLOBAL(pw_4)] + psraw m0, 3 + pshufw m0, m0, 0x0 + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal high_dc_predictor_8x8, 4, 5, 4, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [leftq] + DEFINE_ARGS dst, stride, stride3, one + mov oned, 0x00010001 + lea stride3q, [strideq*3] + movd m3, oned + pshufd m3, m3, 0x0 + paddw m0, m2 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + packssdw m0, m1 + pmaddwd m0, m3 + paddw m0, [GLOBAL(pw_8)] + psrlw m0, 4 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + + RESTORE_GOT + RET + +INIT_XMM sse2 +cglobal high_dc_predictor_16x16, 4, 5, 5, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m3, [aboveq+16] + mova m2, [leftq] + mova m4, [leftq+16] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 4 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + movhlps m2, m0 + paddw m0, m2 + 
punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_16)] + psrad m0, 5 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET + +%if ARCH_X86_64 +INIT_XMM sse2 +cglobal high_dc_predictor_32x32, 4, 5, 9, dst, stride, above, left, goffset + GET_GOT goffsetq + + pxor m1, m1 + mova m0, [aboveq] + mova m2, [aboveq+16] + mova m3, [aboveq+32] + mova m4, [aboveq+48] + mova m5, [leftq] + mova m6, [leftq+16] + mova m7, [leftq+32] + mova m8, [leftq+48] + DEFINE_ARGS dst, stride, stride3, lines4 + lea stride3q, [strideq*3] + mov lines4d, 8 + paddw m0, m2 + paddw m0, m3 + paddw m0, m4 + paddw m0, m5 + paddw m0, m6 + paddw m0, m7 + paddw m0, m8 + movhlps m2, m0 + paddw m0, m2 + punpcklwd m0, m1 + movhlps m2, m0 + paddd m0, m2 + punpckldq m0, m1 + movhlps m2, m0 + paddd m0, m2 + paddd m0, [GLOBAL(pw_32)] + psrad m0, 6 + pshuflw m0, m0, 0x0 + punpcklqdq m0, m0 +.loop: + mova [dstq ], m0 + mova [dstq +16 ], m0 + mova [dstq +32 ], m0 + mova [dstq +48 ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2+16 ], m0 + mova [dstq+strideq*2+32 ], m0 + mova [dstq+strideq*2+48 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4+16 ], m0 + mova [dstq+strideq*4+32 ], m0 + mova [dstq+strideq*4+48 ], m0 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m0 + mova [dstq+stride3q*2 +32], m0 + mova [dstq+stride3q*2 +48], m0 + lea dstq, [dstq+strideq*8] + dec lines4d + jnz .loop + + RESTORE_GOT + REP_RET +%endif + +INIT_MMX sse +cglobal high_v_predictor_4x4, 3, 3, 1, dst, stride, above + movq m0, [aboveq] + movq [dstq ], m0 + movq [dstq+strideq*2], m0 + lea dstq, [dstq+strideq*4] + movq [dstq ], m0 + movq 
[dstq+strideq*2], m0 + RET + +INIT_XMM sse2 +cglobal high_v_predictor_8x8, 3, 3, 1, dst, stride, above + mova m0, [aboveq] + DEFINE_ARGS dst, stride, stride3 + lea stride3q, [strideq*3] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + lea dstq, [dstq+strideq*8] + mova [dstq ], m0 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*4 ], m0 + mova [dstq+stride3q*2], m0 + RET + +INIT_XMM sse2 +cglobal high_v_predictor_16x16, 3, 4, 2, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 4 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2+16], m1 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal high_v_predictor_32x32, 3, 4, 4, dst, stride, above + mova m0, [aboveq] + mova m1, [aboveq+16] + mova m2, [aboveq+32] + mova m3, [aboveq+48] + DEFINE_ARGS dst, stride, stride3, nlines4 + lea stride3q, [strideq*3] + mov nlines4d, 8 +.loop: + mova [dstq ], m0 + mova [dstq +16], m1 + mova [dstq +32], m2 + mova [dstq +48], m3 + mova [dstq+strideq*2 ], m0 + mova [dstq+strideq*2 +16], m1 + mova [dstq+strideq*2 +32], m2 + mova [dstq+strideq*2 +48], m3 + mova [dstq+strideq*4 ], m0 + mova [dstq+strideq*4 +16], m1 + mova [dstq+strideq*4 +32], m2 + mova [dstq+strideq*4 +48], m3 + mova [dstq+stride3q*2 ], m0 + mova [dstq+stride3q*2 +16], m1 + mova [dstq+stride3q*2 +32], m2 + mova [dstq+stride3q*2 +48], m3 + lea dstq, [dstq+strideq*8] + dec nlines4d + jnz .loop + REP_RET + +INIT_MMX sse +cglobal high_tm_predictor_4x4, 5, 6, 5, dst, stride, above, left, bps, one + movd m1, [aboveq-2] + movq m0, [aboveq] + pshufw m1, m1, 0x0 + ; Get the values to compute the maximum value at this bit depth + mov oned, 1 + movd m3, oned + movd m4, 
bpsd + pshufw m3, m3, 0x0 + DEFINE_ARGS dst, stride, line, left + mov lineq, -2 + mova m2, m3 + psllw m3, m4 + add leftq, 8 + psubw m3, m2 ; max possible value + pxor m4, m4 ; min possible value + psubw m0, m1 +.loop: + movq m1, [leftq+lineq*4] + movq m2, [leftq+lineq*4+2] + pshufw m1, m1, 0x0 + pshufw m2, m2, 0x0 + paddw m1, m0 + paddw m2, m0 + ;Clamp to the bit-depth + pminsw m1, m3 + pminsw m2, m3 + pmaxsw m1, m4 + pmaxsw m2, m4 + ;Store the values + movq [dstq ], m1 + movq [dstq+strideq*2], m2 + lea dstq, [dstq+strideq*4] + inc lineq + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal high_tm_predictor_8x8, 5, 6, 5, dst, stride, above, left, bps, one + movd m1, [aboveq-2] + mova m0, [aboveq] + pshuflw m1, m1, 0x0 + ; Get the values to compute the maximum value at this bit depth + mov oned, 1 + pxor m3, m3 + pxor m4, m4 + pinsrw m3, oned, 0 + pinsrw m4, bpsd, 0 + pshuflw m3, m3, 0x0 + DEFINE_ARGS dst, stride, line, left + punpcklqdq m3, m3 + mov lineq, -4 + mova m2, m3 + punpcklqdq m1, m1 + psllw m3, m4 + add leftq, 16 + psubw m3, m2 ; max possible value + pxor m4, m4 ; min possible value + psubw m0, m1 +.loop: + movd m1, [leftq+lineq*4] + movd m2, [leftq+lineq*4+2] + pshuflw m1, m1, 0x0 + pshuflw m2, m2, 0x0 + punpcklqdq m1, m1 + punpcklqdq m2, m2 + paddw m1, m0 + paddw m2, m0 + ;Clamp to the bit-depth + pminsw m1, m3 + pminsw m2, m3 + pmaxsw m1, m4 + pmaxsw m2, m4 + ;Store the values + mova [dstq ], m1 + mova [dstq+strideq*2], m2 + lea dstq, [dstq+strideq*4] + inc lineq + jnz .loop + REP_RET + +%if ARCH_X86_64 +INIT_XMM sse2 +cglobal high_tm_predictor_16x16, 5, 6, 8, dst, stride, above, left, bps, one + movd m2, [aboveq-2] + mova m0, [aboveq] + mova m1, [aboveq+16] + pshuflw m2, m2, 0x0 + ; Get the values to compute the maximum value at this bit depth + mov oned, 1 + pxor m7, m7 + pxor m8, m8 + pinsrw m7, oned, 0 + pinsrw m8, bpsd, 0 + pshuflw m7, m7, 0x0 + DEFINE_ARGS dst, stride, line, left + punpcklqdq m7, m7 + mov lineq, -8 + mova m5, m7 + punpcklqdq m2, m2 + 
psllw m7, m8 + add leftq, 32 + psubw m7, m5 ; max possible value + pxor m8, m8 ; min possible value + psubw m0, m2 + psubw m1, m2 +.loop: + movd m2, [leftq+lineq*4] + movd m3, [leftq+lineq*4+2] + pshuflw m2, m2, 0x0 + pshuflw m3, m3, 0x0 + punpcklqdq m2, m2 + punpcklqdq m3, m3 + paddw m4, m2, m0 + paddw m5, m3, m0 + paddw m2, m1 + paddw m3, m1 + ;Clamp to the bit-depth + pminsw m4, m7 + pminsw m5, m7 + pminsw m2, m7 + pminsw m3, m7 + pmaxsw m4, m8 + pmaxsw m5, m8 + pmaxsw m2, m8 + pmaxsw m3, m8 + ;Store the values + mova [dstq ], m4 + mova [dstq+strideq*2 ], m5 + mova [dstq +16], m2 + mova [dstq+strideq*2+16], m3 + lea dstq, [dstq+strideq*4] + inc lineq + jnz .loop + REP_RET + +INIT_XMM sse2 +cglobal high_tm_predictor_32x32, 5, 6, 12, dst, stride, above, left, bps, one + movd m0, [aboveq-2] + mova m1, [aboveq] + mova m2, [aboveq+16] + mova m3, [aboveq+32] + mova m4, [aboveq+48] + pshuflw m0, m0, 0x0 + ; Get the values to compute the maximum value at this bit depth + mov oned, 1 + pxor m10, m10 + pxor m11, m11 + pinsrw m10, oned, 0 + pinsrw m11, bpsd, 0 + pshuflw m10, m10, 0x0 + DEFINE_ARGS dst, stride, line, left + punpcklqdq m10, m10 + mov lineq, -16 + mova m5, m10 + punpcklqdq m0, m0 + psllw m10, m11 + add leftq, 64 + psubw m10, m5 ; max possible value + pxor m11, m11 ; min possible value + psubw m1, m0 + psubw m2, m0 + psubw m3, m0 + psubw m4, m0 +.loop: + movd m5, [leftq+lineq*4] + movd m6, [leftq+lineq*4+2] + pshuflw m5, m5, 0x0 + pshuflw m6, m6, 0x0 + punpcklqdq m5, m5 + punpcklqdq m6, m6 + paddw m7, m5, m1 + paddw m8, m5, m2 + paddw m9, m5, m3 + paddw m5, m4 + ;Clamp these values to the bit-depth + pminsw m7, m10 + pminsw m8, m10 + pminsw m9, m10 + pminsw m5, m10 + pmaxsw m7, m11 + pmaxsw m8, m11 + pmaxsw m9, m11 + pmaxsw m5, m11 + ;Store these values + mova [dstq ], m7 + mova [dstq +16], m8 + mova [dstq +32], m9 + mova [dstq +48], m5 + paddw m7, m6, m1 + paddw m8, m6, m2 + paddw m9, m6, m3 + paddw m6, m4 + ;Clamp these values to the bit-depth + pminsw m7, 
m10 + pminsw m8, m10 + pminsw m9, m10 + pminsw m6, m10 + pmaxsw m7, m11 + pmaxsw m8, m11 + pmaxsw m9, m11 + pmaxsw m6, m11 + ;Store these values + mova [dstq+strideq*2 ], m7 + mova [dstq+strideq*2+16], m8 + mova [dstq+strideq*2+32], m9 + mova [dstq+strideq*2+48], m6 + lea dstq, [dstq+strideq*4] + inc lineq + jnz .loop + REP_RET +%endif diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 7615cddda..f99fa7a58 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ b/vp9/decoder/vp9_decodeframe.c @@ -1366,11 +1366,11 @@ void vp9_init_dequantizer(VP9_COMMON *cm) { int q; for (q = 0; q < QINDEX_RANGE; q++) { - cm->y_dequant[q][0] = vp9_dc_quant(q, cm->y_dc_delta_q); - cm->y_dequant[q][1] = vp9_ac_quant(q, 0); + cm->y_dequant[q][0] = vp9_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth); + cm->y_dequant[q][1] = vp9_ac_quant(q, 0, cm->bit_depth); - cm->uv_dequant[q][0] = vp9_dc_quant(q, cm->uv_dc_delta_q); - cm->uv_dequant[q][1] = vp9_ac_quant(q, cm->uv_ac_delta_q); + cm->uv_dequant[q][0] = vp9_dc_quant(q, cm->uv_dc_delta_q, cm->bit_depth); + cm->uv_dequant[q][1] = vp9_ac_quant(q, cm->uv_ac_delta_q, cm->bit_depth); } } diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index 76ca1ae8f..df46f64e7 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -13,6 +13,7 @@ #include "vp9/common/vp9_blockd.h" #include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_entropy.h" #include "vp9/decoder/vp9_detokenize.h" @@ -31,29 +32,31 @@ #define INCREMENT_COUNT(token) \ do { \ if (!cm->frame_parallel_decoding_mode) \ - ++coef_counts[band][ctx][token]; \ + ++coef_counts[band][ctx][token]; \ } while (0) -#define WRITE_COEF_CONTINUE(val, token) \ - { \ - v = (val * dqv) >> dq_shift; \ - dqcoeff[scan[c]] = vp9_read_bit(r) ? 
-v : v; \ - token_cache[scan[c]] = vp9_pt_energy_class[token]; \ - ++c; \ - ctx = get_coef_context(nb, token_cache, c); \ - dqv = dq[1]; \ - continue; \ - } +static INLINE int read_coeff(const vp9_prob *probs, int n, vp9_reader *r) { + int i, val = 0; + for (i = 0; i < n; ++i) + val = (val << 1) | vp9_read(r, probs[i]); + return val; +} -#define ADJUST_COEF(prob, bits_count) \ - do { \ - val += (vp9_read(r, prob) << bits_count); \ - } while (0) +static const vp9_tree_index coeff_subtree_high[TREE_SIZE(ENTROPY_TOKENS)] = { + 2, 6, /* 0 = LOW_VAL */ + -TWO_TOKEN, 4, /* 1 = TWO */ + -THREE_TOKEN, -FOUR_TOKEN, /* 2 = THREE */ + 8, 10, /* 3 = HIGH_LOW */ + -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, /* 4 = CAT_ONE */ + 12, 14, /* 5 = CAT_THREEFOUR */ + -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, /* 6 = CAT_THREE */ + -CATEGORY5_TOKEN, -CATEGORY6_TOKEN /* 7 = CAT_FIVE */ +}; static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type, - tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq, - int ctx, const int16_t *scan, const int16_t *nb, - vp9_reader *r) { + tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq, + int ctx, const int16_t *scan, const int16_t *nb, + vp9_reader *r) { const int max_eob = 16 << (tx_size << 1); const FRAME_CONTEXT *const fc = &cm->fc; FRAME_COUNTS *const counts = &cm->counts; @@ -69,11 +72,11 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type, uint8_t token_cache[32 * 32]; const uint8_t *band_translate = get_band_translate(tx_size); const int dq_shift = (tx_size == TX_32X32); - int v; + int v, token; int16_t dqv = dq[0]; while (c < max_eob) { - int val; + int val = -1; band = *band_translate++; prob = coef_probs[band][ctx]; if (!cm->frame_parallel_decoding_mode) @@ -95,81 +98,46 @@ static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd, PLANE_TYPE type, prob = coef_probs[band][ctx]; } - // ONE_CONTEXT_NODE_0_ if (!vp9_read(r, prob[ONE_CONTEXT_NODE])) { INCREMENT_COUNT(ONE_TOKEN); - 
WRITE_COEF_CONTINUE(1, ONE_TOKEN); - } - - INCREMENT_COUNT(TWO_TOKEN); - - prob = vp9_pareto8_full[prob[PIVOT_NODE] - 1]; - - if (!vp9_read(r, prob[LOW_VAL_CONTEXT_NODE])) { - if (!vp9_read(r, prob[TWO_CONTEXT_NODE])) { - WRITE_COEF_CONTINUE(2, TWO_TOKEN); + token = ONE_TOKEN; + val = 1; + } else { + INCREMENT_COUNT(TWO_TOKEN); + token = vp9_read_tree(r, coeff_subtree_high, + vp9_pareto8_full[prob[PIVOT_NODE] - 1]); + switch (token) { + case TWO_TOKEN: + case THREE_TOKEN: + case FOUR_TOKEN: + val = token; + break; + case CATEGORY1_TOKEN: + val = CAT1_MIN_VAL + read_coeff(vp9_cat1_prob, 1, r); + break; + case CATEGORY2_TOKEN: + val = CAT2_MIN_VAL + read_coeff(vp9_cat2_prob, 2, r); + break; + case CATEGORY3_TOKEN: + val = CAT3_MIN_VAL + read_coeff(vp9_cat3_prob, 3, r); + break; + case CATEGORY4_TOKEN: + val = CAT4_MIN_VAL + read_coeff(vp9_cat4_prob, 4, r); + break; + case CATEGORY5_TOKEN: + val = CAT5_MIN_VAL + read_coeff(vp9_cat5_prob, 5, r); + break; + case CATEGORY6_TOKEN: + val = CAT6_MIN_VAL + read_coeff(vp9_cat6_prob, 14, r); + break; } - if (!vp9_read(r, prob[THREE_CONTEXT_NODE])) { - WRITE_COEF_CONTINUE(3, THREE_TOKEN); - } - WRITE_COEF_CONTINUE(4, FOUR_TOKEN); } - - if (!vp9_read(r, prob[HIGH_LOW_CONTEXT_NODE])) { - if (!vp9_read(r, prob[CAT_ONE_CONTEXT_NODE])) { - val = CAT1_MIN_VAL; - ADJUST_COEF(vp9_cat1_prob[0], 0); - WRITE_COEF_CONTINUE(val, CATEGORY1_TOKEN); - } - val = CAT2_MIN_VAL; - ADJUST_COEF(vp9_cat2_prob[0], 1); - ADJUST_COEF(vp9_cat2_prob[1], 0); - WRITE_COEF_CONTINUE(val, CATEGORY2_TOKEN); - } - - if (!vp9_read(r, prob[CAT_THREEFOUR_CONTEXT_NODE])) { - if (!vp9_read(r, prob[CAT_THREE_CONTEXT_NODE])) { - val = CAT3_MIN_VAL; - ADJUST_COEF(vp9_cat3_prob[0], 2); - ADJUST_COEF(vp9_cat3_prob[1], 1); - ADJUST_COEF(vp9_cat3_prob[2], 0); - WRITE_COEF_CONTINUE(val, CATEGORY3_TOKEN); - } - val = CAT4_MIN_VAL; - ADJUST_COEF(vp9_cat4_prob[0], 3); - ADJUST_COEF(vp9_cat4_prob[1], 2); - ADJUST_COEF(vp9_cat4_prob[2], 1); - ADJUST_COEF(vp9_cat4_prob[3], 0); - 
WRITE_COEF_CONTINUE(val, CATEGORY4_TOKEN); - } - - if (!vp9_read(r, prob[CAT_FIVE_CONTEXT_NODE])) { - val = CAT5_MIN_VAL; - ADJUST_COEF(vp9_cat5_prob[0], 4); - ADJUST_COEF(vp9_cat5_prob[1], 3); - ADJUST_COEF(vp9_cat5_prob[2], 2); - ADJUST_COEF(vp9_cat5_prob[3], 1); - ADJUST_COEF(vp9_cat5_prob[4], 0); - WRITE_COEF_CONTINUE(val, CATEGORY5_TOKEN); - } - val = 0; - val = (val << 1) | vp9_read(r, vp9_cat6_prob[0]); - val = (val << 1) | vp9_read(r, vp9_cat6_prob[1]); - val = (val << 1) | vp9_read(r, vp9_cat6_prob[2]); - val = (val << 1) | vp9_read(r, vp9_cat6_prob[3]); - val = (val << 1) | vp9_read(r, vp9_cat6_prob[4]); - val = (val << 1) | vp9_read(r, vp9_cat6_prob[5]); - val = (val << 1) | vp9_read(r, vp9_cat6_prob[6]); - val = (val << 1) | vp9_read(r, vp9_cat6_prob[7]); - val = (val << 1) | vp9_read(r, vp9_cat6_prob[8]); - val = (val << 1) | vp9_read(r, vp9_cat6_prob[9]); - val = (val << 1) | vp9_read(r, vp9_cat6_prob[10]); - val = (val << 1) | vp9_read(r, vp9_cat6_prob[11]); - val = (val << 1) | vp9_read(r, vp9_cat6_prob[12]); - val = (val << 1) | vp9_read(r, vp9_cat6_prob[13]); - val += CAT6_MIN_VAL; - - WRITE_COEF_CONTINUE(val, CATEGORY6_TOKEN); + v = (val * dqv) >> dq_shift; + dqcoeff[scan[c]] = vp9_read_bit(r) ? 
-v : v; + token_cache[scan[c]] = vp9_pt_energy_class[token]; + ++c; + ctx = get_coef_context(nb, token_cache, c); + dqv = dq[1]; } return c; diff --git a/vp9/encoder/vp9_aq_complexity.c b/vp9/encoder/vp9_aq_complexity.c index 33f92393c..f7fca0cde 100644 --- a/vp9/encoder/vp9_aq_complexity.c +++ b/vp9/encoder/vp9_aq_complexity.c @@ -23,9 +23,9 @@ static const double aq_c_q_adj_factor[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = static const double aq_c_transitions[AQ_C_STRENGTHS][AQ_C_SEGMENTS] = {{1.0, 1.0, 1.0}, {1.0, 0.25, 0.0}, {1.0, 0.5, 0.25}}; -static int get_aq_c_strength(int q_index) { +static int get_aq_c_strength(int q_index, vpx_bit_depth_t bit_depth) { // Approximate base quatizer (truncated to int) - int base_quant = vp9_ac_quant(q_index, 0) / 4; + const int base_quant = vp9_ac_quant(q_index, 0, bit_depth) / 4; return (base_quant > 20) + (base_quant > 45); } @@ -40,7 +40,7 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) { cpi->refresh_alt_ref_frame || (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) { int segment; - const int aq_strength = get_aq_c_strength(cm->base_qindex); + const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth); const int active_segments = aq_c_active_segments[aq_strength]; // Clear down the segment map. @@ -70,7 +70,8 @@ void vp9_setup_in_frame_q_adj(VP9_COMP *cpi) { for (segment = 1; segment < active_segments; ++segment) { int qindex_delta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, cm->base_qindex, - aq_c_q_adj_factor[aq_strength][segment]); + aq_c_q_adj_factor[aq_strength][segment], + cm->bit_depth); // For AQ complexity mode, we dont allow Q0 in a segment if the base // Q is not 0. Q0 (lossless) implies 4x4 only and in AQ mode 2 a segment @@ -115,7 +116,7 @@ void vp9_select_in_frame_q_segment(VP9_COMP *cpi, // It is converted to bits * 256 units. 
const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) / (bw * bh); - const int aq_strength = get_aq_c_strength(cm->base_qindex); + const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth); const int active_segments = aq_c_active_segments[aq_strength]; // The number of segments considered and the transition points used to diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c index e7f0daac3..514ff7a52 100644 --- a/vp9/encoder/vp9_aq_cyclicrefresh.c +++ b/vp9/encoder/vp9_aq_cyclicrefresh.c @@ -200,7 +200,7 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { // Rate target ratio to set q delta. const float rate_ratio_qdelta = 2.0; - const double q = vp9_convert_qindex_to_q(cm->base_qindex); + const double q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth); vp9_clear_system_state(); // Some of these parameters may be set via codec-control function later. cr->max_sbs_perframe = 10; @@ -242,7 +242,8 @@ void vp9_cyclic_refresh_setup(VP9_COMP *const cpi) { // Set the q delta for segment 1. qindex_delta = vp9_compute_qdelta_by_rate(rc, cm->frame_type, cm->base_qindex, - rate_ratio_qdelta); + rate_ratio_qdelta, + cm->bit_depth); // TODO(marpan): Incorporate the actual-vs-target rate over/undershoot from // previous encoded frame. 
if (-qindex_delta > cr->max_qdelta_perc * cm->base_qindex / 100) diff --git a/vp9/encoder/vp9_aq_variance.c b/vp9/encoder/vp9_aq_variance.c index 56db95eb7..b96f00fd1 100644 --- a/vp9/encoder/vp9_aq_variance.c +++ b/vp9/encoder/vp9_aq_variance.c @@ -75,7 +75,7 @@ void vp9_vaq_init() { void vp9_vaq_frame_setup(VP9_COMP *cpi) { VP9_COMMON *cm = &cpi->common; struct segmentation *seg = &cm->seg; - const double base_q = vp9_convert_qindex_to_q(cm->base_qindex); + const double base_q = vp9_convert_qindex_to_q(cm->base_qindex, cm->bit_depth); const int base_rdmult = vp9_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q); int i; @@ -99,7 +99,8 @@ void vp9_vaq_frame_setup(VP9_COMP *cpi) { continue; } - qindex_delta = vp9_compute_qdelta(&cpi->rc, base_q, base_q * Q_RATIO(i)); + qindex_delta = vp9_compute_qdelta(&cpi->rc, base_q, base_q * Q_RATIO(i), + cm->bit_depth); vp9_set_segdata(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q, qindex_delta); vp9_enable_segfeature(seg, SEGMENT_ID(i), SEG_LVL_ALT_Q); diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index e16b0b356..9545ba0f3 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -330,7 +330,8 @@ static void configure_static_seg_features(VP9_COMP *cpi) { seg->update_map = 1; seg->update_data = 1; - qi_delta = vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875); + qi_delta = vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 0.875, + cm->bit_depth); vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta - 2); vp9_set_segdata(seg, 1, SEG_LVL_ALT_LF, -2); @@ -351,7 +352,8 @@ static void configure_static_seg_features(VP9_COMP *cpi) { seg->update_data = 1; seg->abs_delta = SEGMENT_DELTADATA; - qi_delta = vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125); + qi_delta = vp9_compute_qdelta(rc, rc->avg_q, rc->avg_q * 1.125, + cm->bit_depth); vp9_set_segdata(seg, 1, SEG_LVL_ALT_Q, qi_delta + 2); vp9_enable_segfeature(seg, 1, SEG_LVL_ALT_Q); diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c 
index 54b57cf88..df82be5ec 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -62,8 +62,8 @@ static void swap_yv12(YV12_BUFFER_CONFIG *a, YV12_BUFFER_CONFIG *b) { *b = temp; } -static int gfboost_qadjust(int qindex) { - const double q = vp9_convert_qindex_to_q(qindex); +static int gfboost_qadjust(int qindex, vpx_bit_depth_t bit_depth) { + const double q = vp9_convert_qindex_to_q(qindex, bit_depth); return (int)((0.00000828 * q * q * q) + (-0.0055 * q * q) + (1.32 * q) + 79.3); @@ -360,11 +360,11 @@ static BLOCK_SIZE get_bsize(const VP9_COMMON *cm, int mb_row, int mb_col) { } } -static int find_fp_qindex() { +static int find_fp_qindex(vpx_bit_depth_t bit_depth) { int i; for (i = 0; i < QINDEX_RANGE; ++i) - if (vp9_convert_qindex_to_q(i) >= 30.0) + if (vp9_convert_qindex_to_q(i, bit_depth) >= 30.0) break; if (i == QINDEX_RANGE) @@ -434,7 +434,7 @@ void vp9_first_pass(VP9_COMP *cpi, const struct lookahead_entry *source) { vp9_clear_system_state(); set_first_pass_params(cpi); - vp9_set_quantizer(cm, find_fp_qindex()); + vp9_set_quantizer(cm, find_fp_qindex(cm->bit_depth)); if (lc != NULL) { twopass = &lc->twopass; @@ -935,12 +935,13 @@ static double calc_correction_factor(double err_per_mb, double err_divisor, double pt_low, double pt_high, - int q) { + int q, + vpx_bit_depth_t bit_depth) { const double error_term = err_per_mb / err_divisor; // Adjustment based on actual quantizer to power term. - const double power_term = MIN(vp9_convert_qindex_to_q(q) * 0.0125 + pt_low, - pt_high); + const double power_term = + MIN(vp9_convert_qindex_to_q(q, bit_depth) * 0.0125 + pt_low, pt_high); // Calculate correction factor. if (power_term < 1.0) @@ -975,9 +976,11 @@ static int get_twopass_worst_quality(const VP9_COMP *cpi, const double factor = calc_correction_factor(err_per_mb, ERR_DIVISOR, is_svc_upper_layer ? 
SVC_FACTOR_PT_LOW : - FACTOR_PT_LOW, FACTOR_PT_HIGH, q); + FACTOR_PT_LOW, FACTOR_PT_HIGH, q, + cpi->common.bit_depth); const int bits_per_mb = vp9_rc_bits_per_mb(INTER_FRAME, q, - factor * speed_term); + factor * speed_term, + cpi->common.bit_depth); if (bits_per_mb <= target_norm_bits_per_mb) break; } @@ -1594,7 +1597,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // At high Q when there are few bits to spare we are better with a longer // interval to spread the cost of the GF. active_max_gf_interval = - 12 + ((int)vp9_convert_qindex_to_q(rc->last_q[INTER_FRAME]) >> 5); + 12 + ((int)vp9_convert_qindex_to_q(rc->last_q[INTER_FRAME], + cpi->common.bit_depth) >> 5); if (active_max_gf_interval > rc->max_gf_interval) active_max_gf_interval = rc->max_gf_interval; @@ -1736,7 +1740,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Calculate the extra bits to be used for boosted frame(s) { int q = rc->last_q[INTER_FRAME]; - int boost = (rc->gfu_boost * gfboost_qadjust(q)) / 100; + int boost = + (rc->gfu_boost * gfboost_qadjust(q, cpi->common.bit_depth)) / 100; // Set max and minimum boost and hence minimum allocation. 
boost = clamp(boost, 125, (rc->baseline_gf_interval + 1) * 200); @@ -2227,7 +2232,7 @@ void vp9_rc_get_second_pass_params(VP9_COMP *cpi) { section_target_bandwidth); twopass->active_worst_quality = tmp_q; rc->ni_av_qi = tmp_q; - rc->avg_q = vp9_convert_qindex_to_q(tmp_q); + rc->avg_q = vp9_convert_qindex_to_q(tmp_q, cm->bit_depth); } vp9_zero(this_frame); if (EOF == input_stats(twopass, &this_frame)) diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c index 5557d7fe7..2fc05e7fe 100644 --- a/vp9/encoder/vp9_picklpf.c +++ b/vp9/encoder/vp9_picklpf.c @@ -142,7 +142,7 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, } else if (method >= LPF_PICK_FROM_Q) { const int min_filter_level = 0; const int max_filter_level = get_max_filter_level(cpi); - const int q = vp9_ac_quant(cm->base_qindex, 0); + const int q = vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth); // These values were determined by linear fitting the result of the // searched level, filt_guess = q * 0.316206 + 3.87252 int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index a97d77831..2edd52bae 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -440,7 +440,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, unsigned int sse_y = UINT_MAX; const int intra_cost_penalty = - 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q); + 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); const int64_t inter_mode_thresh = RDCOST(x->rdmult, x->rddiv, intra_cost_penalty, 0); const int intra_mode_cost = 50; diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index d49eb956f..2f225d74e 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -40,6 +40,31 @@ void vp9_quantize_dc(const tran_low_t *coeff_ptr, int skip_block, *eob_ptr = eob + 1; } +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_high_quantize_dc(const tran_low_t 
*coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr) { + int eob = -1; + + if (!skip_block) { + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + const int64_t tmp = + (clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) * + quant) >> 16; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr; + if (tmp) + eob = 0; + } + *eob_ptr = eob + 1; +} +#endif + void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, const int16_t *round_ptr, const int16_t quant, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, @@ -62,6 +87,31 @@ void vp9_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, *eob_ptr = eob + 1; } +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_high_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block, + const int16_t *round_ptr, const int16_t quant, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t dequant_ptr, uint16_t *eob_ptr) { + int eob = -1; + + if (!skip_block) { + const int rc = 0; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + const int64_t tmp = + (clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) * + quant) >> 15; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr / 2; + if (tmp) + eob = 0; + } + *eob_ptr = eob + 1; +} +#endif + void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -103,6 +153,51 @@ void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = eob + 1; } +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_high_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t count, + 
int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int i; + int eob = -1; + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)zbin_oq_value; + (void)iscan; + + vpx_memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); + vpx_memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + const int64_t tmp = + (clamp(abs_coeff + round_ptr[rc != 0], INT32_MIN, INT32_MAX) * + quant_ptr[rc != 0]) >> 16; + + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + + if (tmp) + eob = i; + } + } + *eob_ptr = eob + 1; +} +#endif + // TODO(jingning) Refactor this file and combine functions with similar // operations. 
void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, @@ -146,6 +241,51 @@ void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = eob + 1; } +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_high_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int i, eob = -1; + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)zbin_oq_value; + (void)iscan; + + vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + int64_t tmp = 0; + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) { + tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), + INT32_MIN, INT32_MAX); + tmp = (tmp * quant_ptr[rc != 0]) >> 15; + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + } + + if (tmp) + eob = i; + } + } + *eob_ptr = eob + 1; +} +#endif + void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -199,6 +339,62 @@ void vp9_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = eob + 1; } +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_high_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, + const 
int16_t *dequant_ptr, int zbin_oq_value, + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { + int i, non_zero_count = (int)n_coeffs, eob = -1; + const int zbins[2] = { zbin_ptr[0] + zbin_oq_value, + zbin_ptr[1] + zbin_oq_value }; + const int nzbins[2] = { zbins[0] * -1, + zbins[1] * -1 }; + (void)iscan; + + vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Pre-scan pass + for (i = (int)n_coeffs - 1; i >= 0; i--) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + + if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0]) + non_zero_count--; + else + break; + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < non_zero_count; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + + if (abs_coeff >= zbins[rc != 0]) { + int64_t tmp = clamp(abs_coeff + round_ptr[rc != 0], + INT32_MIN, INT32_MAX); + tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> 16; // quantization + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0]; + + if (tmp) + eob = i; + } + } + } + *eob_ptr = eob + 1; +} +#endif + void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, @@ -255,12 +451,84 @@ void vp9_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, *eob_ptr = eob + 1; } +#if CONFIG_VP9_HIGHBITDEPTH +void vp9_high_quantize_b_32x32_c(const tran_low_t *coeff_ptr, + intptr_t n_coeffs, int skip_block, + const int16_t *zbin_ptr, + const int16_t *round_ptr, + const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, + tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t 
*dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + const int zbins[2] = { ROUND_POWER_OF_TWO(zbin_ptr[0] + zbin_oq_value, 1), + ROUND_POWER_OF_TWO(zbin_ptr[1] + zbin_oq_value, 1) }; + const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 }; + + int idx = 0; + int idx_arr[1024]; + int i, eob = -1; + (void)iscan; + + vpx_memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr)); + + if (!skip_block) { + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + const int rc = scan[i]; + const int coeff = coeff_ptr[rc]; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (coeff >= zbins[rc != 0] || coeff <= nzbins[rc != 0]) + idx_arr[idx++] = i; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + const int rc = scan[idx_arr[i]]; + const int coeff = coeff_ptr[rc]; + const int coeff_sign = (coeff >> 31); + const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign; + int64_t tmp = clamp(abs_coeff + + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), + INT32_MIN, INT32_MAX); + tmp = ((((tmp * quant_ptr[rc != 0]) >> 16) + tmp) * + quant_shift_ptr[rc != 0]) >> 15; + + qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign; + dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2; + + if (tmp) + eob = idx_arr[i]; + } + } + *eob_ptr = eob + 1; +} +#endif + void vp9_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block, const int16_t *scan, const int16_t *iscan) { MACROBLOCKD *const xd = &x->e_mbd; struct macroblock_plane *p = &x->plane[plane]; struct macroblockd_plane *pd = &xd->plane[plane]; +#if CONFIG_VP9_HIGHBITDEPTH + if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) { + vp9_high_quantize_b(BLOCK_OFFSET(p->coeff, block), + 16, x->skip_block, + p->zbin, p->round, p->quant, p->quant_shift, + BLOCK_OFFSET(p->qcoeff, block), + 
BLOCK_OFFSET(pd->dqcoeff, block), + pd->dequant, p->zbin_extra, &p->eobs[block], + scan, iscan); + return; + } +#endif vp9_quantize_b(BLOCK_OFFSET(p->coeff, block), 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, @@ -281,9 +549,23 @@ static void invert_quant(int16_t *quant, int16_t *shift, int d) { } static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) { - int quant = vp9_dc_quant(q, 0); + const int quant = vp9_dc_quant(q, 0, bit_depth); +#if CONFIG_VP9_HIGHBITDEPTH + switch (bit_depth) { + case VPX_BITS_8: + return q == 0 ? 64 : (quant < 148 ? 84 : 80); + case VPX_BITS_10: + return q == 0 ? 64 : (quant < 592 ? 84 : 80); + case VPX_BITS_12: + return q == 0 ? 64 : (quant < 2368 ? 84 : 80); + default: + assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; + } +#else (void) bit_depth; return q == 0 ? 64 : (quant < 148 ? 84 : 80); +#endif } void vp9_init_quantizer(VP9_COMP *cpi) { @@ -301,8 +583,8 @@ void vp9_init_quantizer(VP9_COMP *cpi) { qrounding_factor_fp = 64; // y - quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q) - : vp9_ac_quant(q, 0); + quant = i == 0 ? vp9_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth) + : vp9_ac_quant(q, 0, cm->bit_depth); invert_quant(&quants->y_quant[q][i], &quants->y_quant_shift[q][i], quant); quants->y_quant_fp[q][i] = (1 << 16) / quant; quants->y_round_fp[q][i] = (qrounding_factor_fp * quant) >> 7; @@ -311,8 +593,8 @@ void vp9_init_quantizer(VP9_COMP *cpi) { cm->y_dequant[q][i] = quant; // uv - quant = i == 0 ? vp9_dc_quant(q, cm->uv_dc_delta_q) - : vp9_ac_quant(q, cm->uv_ac_delta_q); + quant = i == 0 ? 
vp9_dc_quant(q, cm->uv_dc_delta_q, cm->bit_depth) + : vp9_ac_quant(q, cm->uv_ac_delta_q, cm->bit_depth); invert_quant(&quants->uv_quant[q][i], &quants->uv_quant_shift[q][i], quant); quants->uv_quant_fp[q][i] = (1 << 16) / quant; diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index b607c8559..94c0b64dd 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -42,13 +42,56 @@ #define FRAME_OVERHEAD_BITS 200 +#if CONFIG_VP9_HIGHBITDEPTH +#define ASSIGN_MINQ_TABLE(bit_depth, name) \ + do { \ + switch (bit_depth) { \ + case VPX_BITS_8: \ + name = name##_8; \ + break; \ + case VPX_BITS_10: \ + name = name##_10; \ + break; \ + case VPX_BITS_12: \ + name = name##_12; \ + break; \ + default: \ + assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10" \ + " or VPX_BITS_12"); \ + name = NULL; \ + } \ + } while (0) +#else +#define ASSIGN_MINQ_TABLE(bit_depth, name) \ + do { \ + (void) bit_depth; \ + name = name##_8; \ + } while (0) +#endif + // Tables relating active max Q to active min Q -static int kf_low_motion_minq[QINDEX_RANGE]; -static int kf_high_motion_minq[QINDEX_RANGE]; -static int arfgf_low_motion_minq[QINDEX_RANGE]; -static int arfgf_high_motion_minq[QINDEX_RANGE]; -static int inter_minq[QINDEX_RANGE]; -static int rtc_minq[QINDEX_RANGE]; +static int kf_low_motion_minq_8[QINDEX_RANGE]; +static int kf_high_motion_minq_8[QINDEX_RANGE]; +static int arfgf_low_motion_minq_8[QINDEX_RANGE]; +static int arfgf_high_motion_minq_8[QINDEX_RANGE]; +static int inter_minq_8[QINDEX_RANGE]; +static int rtc_minq_8[QINDEX_RANGE]; + +#if CONFIG_VP9_HIGHBITDEPTH +static int kf_low_motion_minq_10[QINDEX_RANGE]; +static int kf_high_motion_minq_10[QINDEX_RANGE]; +static int arfgf_low_motion_minq_10[QINDEX_RANGE]; +static int arfgf_high_motion_minq_10[QINDEX_RANGE]; +static int inter_minq_10[QINDEX_RANGE]; +static int rtc_minq_10[QINDEX_RANGE]; +static int kf_low_motion_minq_12[QINDEX_RANGE]; +static int kf_high_motion_minq_12[QINDEX_RANGE]; 
+static int arfgf_low_motion_minq_12[QINDEX_RANGE]; +static int arfgf_high_motion_minq_12[QINDEX_RANGE]; +static int inter_minq_12[QINDEX_RANGE]; +static int rtc_minq_12[QINDEX_RANGE]; +#endif + static int gf_high = 2000; static int gf_low = 400; static int kf_high = 5000; @@ -58,7 +101,8 @@ static int kf_low = 400; // formulaic approach to facilitate easier adjustment of the Q tables. // The formulae were derived from computing a 3rd order polynomial best // fit to the original data (after plotting real maxq vs minq (not q index)) -static int get_minq_index(double maxq, double x3, double x2, double x1) { +static int get_minq_index(double maxq, double x3, double x2, double x1, + vpx_bit_depth_t bit_depth) { int i; const double minqtarget = MIN(((x3 * maxq + x2) * maxq + x1) * maxq, maxq); @@ -68,38 +112,69 @@ static int get_minq_index(double maxq, double x3, double x2, double x1) { if (minqtarget <= 2.0) return 0; - for (i = 0; i < QINDEX_RANGE; i++) - if (minqtarget <= vp9_convert_qindex_to_q(i)) + for (i = 0; i < QINDEX_RANGE; i++) { + if (minqtarget <= vp9_convert_qindex_to_q(i, bit_depth)) return i; + } return QINDEX_RANGE - 1; } -void vp9_rc_init_minq_luts() { +static void init_minq_luts(int *kf_low_m, int *kf_high_m, + int *arfgf_low, int *arfgf_high, + int *inter, int *rtc, vpx_bit_depth_t bit_depth) { int i; - for (i = 0; i < QINDEX_RANGE; i++) { - const double maxq = vp9_convert_qindex_to_q(i); - kf_low_motion_minq[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.125); - kf_high_motion_minq[i] = get_minq_index(maxq, 0.000002, -0.0012, 0.50); - arfgf_low_motion_minq[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30); - arfgf_high_motion_minq[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.50); - inter_minq[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90); - rtc_minq[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70); + const double maxq = vp9_convert_qindex_to_q(i, bit_depth); + kf_low_m[i] = get_minq_index(maxq, 0.000001, -0.0004, 0.125, 
bit_depth); + kf_high_m[i] = get_minq_index(maxq, 0.000002, -0.0012, 0.50, bit_depth); + arfgf_low[i] = get_minq_index(maxq, 0.0000015, -0.0009, 0.30, bit_depth); + arfgf_high[i] = get_minq_index(maxq, 0.0000021, -0.00125, 0.50, bit_depth); + inter[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.90, bit_depth); + rtc[i] = get_minq_index(maxq, 0.00000271, -0.00113, 0.70, bit_depth); } } +void vp9_rc_init_minq_luts() { + init_minq_luts(kf_low_motion_minq_8, kf_high_motion_minq_8, + arfgf_low_motion_minq_8, arfgf_high_motion_minq_8, + inter_minq_8, rtc_minq_8, VPX_BITS_8); +#if CONFIG_VP9_HIGHBITDEPTH + init_minq_luts(kf_low_motion_minq_10, kf_high_motion_minq_10, + arfgf_low_motion_minq_10, arfgf_high_motion_minq_10, + inter_minq_10, rtc_minq_10, VPX_BITS_10); + init_minq_luts(kf_low_motion_minq_12, kf_high_motion_minq_12, + arfgf_low_motion_minq_12, arfgf_high_motion_minq_12, + inter_minq_12, rtc_minq_12, VPX_BITS_12); +#endif +} + // These functions use formulaic calculations to make playing with the // quantizer tables easier. 
If necessary they can be replaced by lookup // tables if and when things settle down in the experimental bitstream -double vp9_convert_qindex_to_q(int qindex) { +double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth) { // Convert the index to a real Q value (scaled down to match old Q values) - return vp9_ac_quant(qindex, 0) / 4.0; +#if CONFIG_VP9_HIGHBITDEPTH + switch (bit_depth) { + case VPX_BITS_8: + return vp9_ac_quant(qindex, 0, bit_depth) / 4.0; + case VPX_BITS_10: + return vp9_ac_quant(qindex, 0, bit_depth) / 16.0; + case VPX_BITS_12: + return vp9_ac_quant(qindex, 0, bit_depth) / 64.0; + default: + assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1.0; + } +#else + return vp9_ac_quant(qindex, 0, bit_depth) / 4.0; +#endif } int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, - double correction_factor) { - const double q = vp9_convert_qindex_to_q(qindex); + double correction_factor, + vpx_bit_depth_t bit_depth) { + const double q = vp9_convert_qindex_to_q(qindex, bit_depth); int enumerator = frame_type == KEY_FRAME ? 
3300000 : 2250000; // q based adjustment to baseline enumerator @@ -108,8 +183,10 @@ int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, } static int estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, - double correction_factor) { - const int bpm = (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor)); + double correction_factor, + vpx_bit_depth_t bit_depth) { + const int bpm = (int)(vp9_rc_bits_per_mb(frame_type, q, correction_factor, + bit_depth)); return ((uint64_t)bpm * mbs) >> BPER_MB_NORMBITS; } @@ -227,7 +304,7 @@ void vp9_rc_init(const VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc) { rc->ni_frames = 0; rc->tot_q = 0.0; - rc->avg_q = vp9_convert_qindex_to_q(oxcf->worst_allowed_q); + rc->avg_q = vp9_convert_qindex_to_q(oxcf->worst_allowed_q, oxcf->bit_depth); for (i = 0; i < RATE_FACTOR_LEVELS; ++i) { rc->rate_correction_factors[i] = 1.0; @@ -330,7 +407,8 @@ void vp9_rc_update_rate_correction_factors(VP9_COMP *cpi, int damp_var) { // Stay in double to avoid int overflow when values are large projected_size_based_on_q = estimate_bits_at_q(cm->frame_type, cm->base_qindex, cm->MBs, - rate_correction_factor); + rate_correction_factor, + cm->bit_depth); // Work out a size correction factor. 
if (projected_size_based_on_q > 0) correction_factor = (100 * cpi->rc.projected_frame_size) / @@ -392,7 +470,8 @@ int vp9_rc_regulate_q(const VP9_COMP *cpi, int target_bits_per_frame, do { const int bits_per_mb_at_this_q = (int)vp9_rc_bits_per_mb(cm->frame_type, i, - correction_factor); + correction_factor, + cm->bit_depth); if (bits_per_mb_at_this_q <= target_bits_per_mb) { if ((target_bits_per_mb - bits_per_mb_at_this_q) <= last_error) @@ -424,12 +503,22 @@ static int get_active_quality(int q, int gfu_boost, int low, int high, } } -static int get_kf_active_quality(const RATE_CONTROL *const rc, int q) { +static int get_kf_active_quality(const RATE_CONTROL *const rc, int q, + vpx_bit_depth_t bit_depth) { + int *kf_low_motion_minq; + int *kf_high_motion_minq; + ASSIGN_MINQ_TABLE(bit_depth, kf_low_motion_minq); + ASSIGN_MINQ_TABLE(bit_depth, kf_high_motion_minq); return get_active_quality(q, rc->kf_boost, kf_low, kf_high, kf_low_motion_minq, kf_high_motion_minq); } -static int get_gf_active_quality(const RATE_CONTROL *const rc, int q) { +static int get_gf_active_quality(const RATE_CONTROL *const rc, int q, + vpx_bit_depth_t bit_depth) { + int *arfgf_low_motion_minq; + int *arfgf_high_motion_minq; + ASSIGN_MINQ_TABLE(bit_depth, arfgf_low_motion_minq); + ASSIGN_MINQ_TABLE(bit_depth, arfgf_high_motion_minq); return get_active_quality(q, rc->gfu_boost, gf_low, gf_high, arfgf_low_motion_minq, arfgf_high_motion_minq); } @@ -516,6 +605,8 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, int active_best_quality; int active_worst_quality = calc_active_worst_quality_one_pass_cbr(cpi); int q; + int *rtc_minq; + ASSIGN_MINQ_TABLE(cm->bit_depth, rtc_minq); if (frame_is_intra_only(cm)) { active_best_quality = rc->best_quality; @@ -524,9 +615,10 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, // based on the ambient Q to reduce the risk of popping. 
if (rc->this_key_frame_forced) { int qindex = rc->last_boosted_qindex; - double last_boosted_q = vp9_convert_qindex_to_q(qindex); + double last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q, - (last_boosted_q * 0.75)); + (last_boosted_q * 0.75), + cm->bit_depth); active_best_quality = MAX(qindex + delta_qindex, rc->best_quality); } else if (cm->current_video_frame > 0) { // not first frame of one pass and kf_boost is set @@ -534,7 +626,8 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, double q_val; active_best_quality = - get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME]); + get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], + cm->bit_depth); // Allow somewhat lower kf minq with small image formats. if ((cm->width * cm->height) <= (352 * 288)) { @@ -543,9 +636,10 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, // Convert the adjustment factor to a qindex delta // on active_best_quality. - q_val = vp9_convert_qindex_to_q(active_best_quality); + q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth); active_best_quality += vp9_compute_qdelta(rc, q_val, - q_val * q_adj_factor); + q_val * q_adj_factor, + cm->bit_depth); } } else if (!rc->is_src_frame_alt_ref && !cpi->use_svc && @@ -559,7 +653,7 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, } else { q = active_worst_quality; } - active_best_quality = get_gf_active_quality(rc, q); + active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); } else { // Use the lower of active_worst_quality and recent/average Q. 
if (cm->current_video_frame > 1) { @@ -592,7 +686,8 @@ static int rc_pick_q_and_bounds_one_pass_cbr(const VP9_COMP *cpi, int qdelta = 0; vp9_clear_system_state(); qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, - active_worst_quality, 2.0); + active_worst_quality, 2.0, + cm->bit_depth); *top_index = active_worst_quality + qdelta; *top_index = (*top_index > *bottom_index) ? *top_index : *bottom_index; } @@ -644,6 +739,8 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, int active_best_quality; int active_worst_quality = calc_active_worst_quality_one_pass_vbr(cpi); int q; + int *inter_minq; + ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq); if (frame_is_intra_only(cm)) { @@ -652,9 +749,10 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, // based on the ambient Q to reduce the risk of popping. if (rc->this_key_frame_forced) { int qindex = rc->last_boosted_qindex; - double last_boosted_q = vp9_convert_qindex_to_q(qindex); + double last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q, - last_boosted_q * 0.75); + last_boosted_q * 0.75, + cm->bit_depth); active_best_quality = MAX(qindex + delta_qindex, rc->best_quality); } else { // not first frame of one pass and kf_boost is set @@ -662,7 +760,8 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, double q_val; active_best_quality = - get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME]); + get_kf_active_quality(rc, rc->avg_frame_qindex[KEY_FRAME], + cm->bit_depth); // Allow somewhat lower kf minq with small image formats. if ((cm->width * cm->height) <= (352 * 288)) { @@ -671,9 +770,10 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, // Convert the adjustment factor to a qindex delta // on active_best_quality. 
- q_val = vp9_convert_qindex_to_q(active_best_quality); + q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth); active_best_quality += vp9_compute_qdelta(rc, q_val, - q_val * q_adj_factor); + q_val * q_adj_factor, + cm->bit_depth); } } else if (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { @@ -691,7 +791,7 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, if (q < cq_level) q = cq_level; - active_best_quality = get_gf_active_quality(rc, q); + active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); // Constrained quality use slightly lower active best. active_best_quality = active_best_quality * 15 / 16; @@ -700,10 +800,10 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, if (!cpi->refresh_alt_ref_frame) { active_best_quality = cq_level; } else { - active_best_quality = get_gf_active_quality(rc, q); + active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); } } else { - active_best_quality = get_gf_active_quality(rc, q); + active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); } } else { if (oxcf->rc_mode == VPX_Q) { @@ -742,11 +842,13 @@ static int rc_pick_q_and_bounds_one_pass_vbr(const VP9_COMP *cpi, !rc->this_key_frame_forced && !(cm->current_video_frame == 0)) { qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, - active_worst_quality, 2.0); + active_worst_quality, 2.0, + cm->bit_depth); } else if (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, - active_worst_quality, 1.75); + active_worst_quality, 1.75, + cm->bit_depth); } *top_index = active_worst_quality + qdelta; *top_index = (*top_index > *bottom_index) ? 
*top_index : *bottom_index; @@ -788,6 +890,8 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, int active_best_quality; int active_worst_quality = cpi->twopass.active_worst_quality; int q; + int *inter_minq; + ASSIGN_MINQ_TABLE(cm->bit_depth, inter_minq); if (frame_is_intra_only(cm) || vp9_is_upper_layer_key_frame(cpi)) { // Handle the special case for key frames forced when we have reached @@ -795,16 +899,18 // based on the ambient Q to reduce the risk of popping. if (rc->this_key_frame_forced) { int qindex = rc->last_boosted_qindex; - double last_boosted_q = vp9_convert_qindex_to_q(qindex); + double last_boosted_q = vp9_convert_qindex_to_q(qindex, cm->bit_depth); int delta_qindex = vp9_compute_qdelta(rc, last_boosted_q, - last_boosted_q * 0.75); + last_boosted_q * 0.75, + cm->bit_depth); active_best_quality = MAX(qindex + delta_qindex, rc->best_quality); } else { // Not forced keyframe. double q_adj_factor = 1.0; double q_val; // Baseline value derived from cpi->active_worst_quality and kf boost. - active_best_quality = get_kf_active_quality(rc, active_worst_quality); + active_best_quality = get_kf_active_quality(rc, active_worst_quality, + cm->bit_depth); // Allow somewhat lower kf minq with small image formats. if ((cm->width * cm->height) <= (352 * 288)) { @@ -816,9 +922,10 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, // Convert the adjustment factor to a qindex delta // on active_best_quality.
- q_val = vp9_convert_qindex_to_q(active_best_quality); + q_val = vp9_convert_qindex_to_q(active_best_quality, cm->bit_depth); active_best_quality += vp9_compute_qdelta(rc, q_val, - q_val * q_adj_factor); + q_val * q_adj_factor, + cm->bit_depth); } } else if (!rc->is_src_frame_alt_ref && (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)) { @@ -836,7 +943,7 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, if (q < cq_level) q = cq_level; - active_best_quality = get_gf_active_quality(rc, q); + active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); // Constrained quality use slightly lower active best. active_best_quality = active_best_quality * 15 / 16; @@ -845,10 +952,10 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, if (!cpi->refresh_alt_ref_frame) { active_best_quality = cq_level; } else { - active_best_quality = get_gf_active_quality(rc, q); + active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); } } else { - active_best_quality = get_gf_active_quality(rc, q); + active_best_quality = get_gf_active_quality(rc, q, cm->bit_depth); } } else { if (oxcf->rc_mode == VPX_Q) { @@ -888,7 +995,8 @@ static int rc_pick_q_and_bounds_two_pass(const VP9_COMP *cpi, const double rate_factor = rate_factor_deltas[gf_group->rf_level[gf_group->index]]; int qdelta = vp9_compute_qdelta_by_rate(&cpi->rc, cm->frame_type, - active_worst_quality, rate_factor); + active_worst_quality, rate_factor, + cm->bit_depth); *top_index = active_worst_quality + qdelta; *top_index = (*top_index > *bottom_index) ? 
*top_index : *bottom_index; } @@ -1038,7 +1146,7 @@ void vp9_rc_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { rc->avg_frame_qindex[INTER_FRAME] = ROUND_POWER_OF_TWO(3 * rc->avg_frame_qindex[INTER_FRAME] + qindex, 2); rc->ni_frames++; - rc->tot_q += vp9_convert_qindex_to_q(qindex); + rc->tot_q += vp9_convert_qindex_to_q(qindex, cm->bit_depth); rc->avg_q = rc->tot_q / rc->ni_frames; // Calculate the average Q for normal inter frames (not key or GFU // frames). @@ -1294,7 +1402,8 @@ void vp9_rc_get_one_pass_cbr_params(VP9_COMP *cpi) { rc->baseline_gf_interval = INT_MAX; } -int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget) { +int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget, + vpx_bit_depth_t bit_depth) { int start_index = rc->worst_quality; int target_index = rc->worst_quality; int i; @@ -1302,14 +1411,14 @@ int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget) { // Convert the average q value to an index. 
for (i = rc->best_quality; i < rc->worst_quality; ++i) { start_index = i; - if (vp9_convert_qindex_to_q(i) >= qstart) + if (vp9_convert_qindex_to_q(i, bit_depth) >= qstart) break; } // Convert the q target to an index for (i = rc->best_quality; i < rc->worst_quality; ++i) { target_index = i; - if (vp9_convert_qindex_to_q(i) >= qtarget) + if (vp9_convert_qindex_to_q(i, bit_depth) >= qtarget) break; } @@ -1317,12 +1426,14 @@ int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget) { } int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type, - int qindex, double rate_target_ratio) { + int qindex, double rate_target_ratio, + vpx_bit_depth_t bit_depth) { int target_index = rc->worst_quality; int i; // Look up the current projected bits per block for the base index - const int base_bits_per_mb = vp9_rc_bits_per_mb(frame_type, qindex, 1.0); + const int base_bits_per_mb = vp9_rc_bits_per_mb(frame_type, qindex, 1.0, + bit_depth); // Find the target bits per mb based on the base value and given ratio. 
const int target_bits_per_mb = (int)(rate_target_ratio * base_bits_per_mb); @@ -1330,7 +1441,7 @@ int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type, // Convert the q target to an index for (i = rc->best_quality; i < rc->worst_quality; ++i) { target_index = i; - if (vp9_rc_bits_per_mb(frame_type, i, 1.0) <= target_bits_per_mb ) + if (vp9_rc_bits_per_mb(frame_type, i, 1.0, bit_depth) <= target_bits_per_mb) break; } diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 456daf48d..2ced8e6dd 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -12,6 +12,7 @@ #ifndef VP9_ENCODER_VP9_RATECTRL_H_ #define VP9_ENCODER_VP9_RATECTRL_H_ +#include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" #include "vp9/common/vp9_blockd.h" @@ -104,7 +105,7 @@ struct VP9EncoderConfig; void vp9_rc_init(const struct VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc); -double vp9_convert_qindex_to_q(int qindex); +double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth); void vp9_rc_init_minq_luts(); @@ -167,7 +168,7 @@ int vp9_rc_regulate_q(const struct VP9_COMP *cpi, int target_bits_per_frame, // Estimates bits per mb for a given qindex and correction factor. int vp9_rc_bits_per_mb(FRAME_TYPE frame_type, int qindex, - double correction_factor); + double correction_factor, vpx_bit_depth_t bit_depth); // Clamping utilities for bitrate targets for iframes and pframes. 
int vp9_rc_clamp_iframe_target_size(const struct VP9_COMP *const cpi, @@ -180,12 +181,14 @@ void vp9_rc_set_frame_target(struct VP9_COMP *cpi, int target); // Computes a q delta (in "q index" terms) to get from a starting q value // to a target q value -int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget); +int vp9_compute_qdelta(const RATE_CONTROL *rc, double qstart, double qtarget, + vpx_bit_depth_t bit_depth); // Computes a q delta (in "q index" terms) to get from a starting q value // to a value that should equate to the given rate ratio. int vp9_compute_qdelta_by_rate(const RATE_CONTROL *rc, FRAME_TYPE frame_type, - int qindex, double rate_target_ratio); + int qindex, double rate_target_ratio, + vpx_bit_depth_t bit_depth); void vp9_rc_update_framerate(struct VP9_COMP *cpi); diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 1dd44b4aa..8b7066b13 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -93,34 +93,69 @@ static void fill_token_costs(vp9_coeff_cost *c, } // Values are now correlated to quantizer. -static int sad_per_bit16lut[QINDEX_RANGE]; -static int sad_per_bit4lut[QINDEX_RANGE]; - -void vp9_init_me_luts() { +static int sad_per_bit16lut_8[QINDEX_RANGE]; +static int sad_per_bit4lut_8[QINDEX_RANGE]; + +#if CONFIG_VP9_HIGHBITDEPTH +static int sad_per_bit16lut_10[QINDEX_RANGE]; +static int sad_per_bit4lut_10[QINDEX_RANGE]; +static int sad_per_bit16lut_12[QINDEX_RANGE]; +static int sad_per_bit4lut_12[QINDEX_RANGE]; +#endif + +static void init_me_luts_bd(int *bit16lut, int *bit4lut, int range, + vpx_bit_depth_t bit_depth) { int i; - // Initialize the sad lut tables using a formulaic calculation for now. // This is to make it easier to resolve the impact of experimental changes // to the quantizer tables. 
- for (i = 0; i < QINDEX_RANGE; ++i) { - const double q = vp9_convert_qindex_to_q(i); - sad_per_bit16lut[i] = (int)(0.0418 * q + 2.4107); - sad_per_bit4lut[i] = (int)(0.063 * q + 2.742); + for (i = 0; i < range; i++) { + const double q = vp9_convert_qindex_to_q(i, bit_depth); + bit16lut[i] = (int)(0.0418 * q + 2.4107); + bit4lut[i] = (int)(0.063 * q + 2.742); } } +void vp9_init_me_luts() { + init_me_luts_bd(sad_per_bit16lut_8, sad_per_bit4lut_8, QINDEX_RANGE, + VPX_BITS_8); +#if CONFIG_VP9_HIGHBITDEPTH + init_me_luts_bd(sad_per_bit16lut_10, sad_per_bit4lut_10, QINDEX_RANGE, + VPX_BITS_10); + init_me_luts_bd(sad_per_bit16lut_12, sad_per_bit4lut_12, QINDEX_RANGE, + VPX_BITS_12); +#endif +} + static const int rd_boost_factor[16] = { 64, 32, 32, 32, 24, 16, 12, 12, 8, 8, 4, 4, 2, 2, 1, 0 }; static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = { -128, 144, 128, 128, 144 + 128, 144, 128, 128, 144 }; int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { - const int q = vp9_dc_quant(qindex, 0); + const int q = vp9_dc_quant(qindex, 0, cpi->common.bit_depth); +#if CONFIG_VP9_HIGHBITDEPTH + int rdmult = 0; + switch (cpi->common.bit_depth) { + case VPX_BITS_8: + rdmult = 88 * q * q / 24; + break; + case VPX_BITS_10: + rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4); + break; + case VPX_BITS_12: + rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8); + break; + default: + assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; + } +#else int rdmult = 88 * q * q / 24; - +#endif if (cpi->oxcf.pass == 2 && (cpi->common.frame_type != KEY_FRAME)) { const GF_GROUP *const gf_group = &cpi->twopass.gf_group; const FRAME_UPDATE_TYPE frame_type = gf_group->update_type[gf_group->index]; @@ -132,15 +167,53 @@ int vp9_compute_rd_mult(const VP9_COMP *cpi, int qindex) { return rdmult; } -static int compute_rd_thresh_factor(int qindex) { +static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) { + double q; +#if CONFIG_VP9_HIGHBITDEPTH + 
switch (bit_depth) { + case VPX_BITS_8: + q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0; + break; + case VPX_BITS_10: + q = vp9_dc_quant(qindex, 0, VPX_BITS_10) / 16.0; + break; + case VPX_BITS_12: + q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0; + break; + default: + assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + return -1; + } +#else + (void) bit_depth; + q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0; +#endif // TODO(debargha): Adjust the function below. - const int q = (int)(pow(vp9_dc_quant(qindex, 0) / 4.0, RD_THRESH_POW) * 5.12); - return MAX(q, 8); + return MAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8); } void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex) { - cpi->mb.sadperbit16 = sad_per_bit16lut[qindex]; - cpi->mb.sadperbit4 = sad_per_bit4lut[qindex]; +#if CONFIG_VP9_HIGHBITDEPTH + switch (cpi->common.bit_depth) { + case VPX_BITS_8: + cpi->mb.sadperbit16 = sad_per_bit16lut_8[qindex]; + cpi->mb.sadperbit4 = sad_per_bit4lut_8[qindex]; + break; + case VPX_BITS_10: + cpi->mb.sadperbit16 = sad_per_bit16lut_10[qindex]; + cpi->mb.sadperbit4 = sad_per_bit4lut_10[qindex]; + break; + case VPX_BITS_12: + cpi->mb.sadperbit16 = sad_per_bit16lut_12[qindex]; + cpi->mb.sadperbit4 = sad_per_bit4lut_12[qindex]; + break; + default: + assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + } +#else + cpi->mb.sadperbit16 = sad_per_bit16lut_8[qindex]; + cpi->mb.sadperbit4 = sad_per_bit4lut_8[qindex]; +#endif } static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) { @@ -149,9 +222,8 @@ static void set_block_thresholds(const VP9_COMMON *cm, RD_OPT *rd) { for (segment_id = 0; segment_id < MAX_SEGMENTS; ++segment_id) { const int qindex = clamp(vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex) + - cm->y_dc_delta_q, - 0, MAXQ); - const int q = compute_rd_thresh_factor(qindex); + cm->y_dc_delta_q, 0, MAXQ); + const int q = compute_rd_thresh_factor(qindex, cm->bit_depth); for (bsize = 0; bsize < BLOCK_SIZES; 
++bsize) { // Threshold here seems unnecessarily harsh but fine given actual diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 7be557df9..bf27ba682 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -2582,7 +2582,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t dist_uv[TX_SIZES]; int skip_uv[TX_SIZES]; PREDICTION_MODE mode_uv[TX_SIZES]; - int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q); + const int intra_cost_penalty = + 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); int best_skip2 = 0; uint8_t ref_frame_skip_mask[2] = { 0 }; uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 }; @@ -3312,7 +3313,8 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, int64_t dist_uv; int skip_uv; PREDICTION_MODE mode_uv = DC_PRED; - int intra_cost_penalty = 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q); + const int intra_cost_penalty = + 20 * vp9_dc_quant(cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth); int_mv seg_mvs[4][MAX_REF_FRAMES]; b_mode_info best_bmodes[4]; int best_skip2 = 0; diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 51d6f766b..ff026666b 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -389,10 +389,10 @@ static void adjust_arnr_filter(VP9_COMP *cpi, // Adjust the strength based on active max q. 
if (cpi->common.current_video_frame > 1) q = ((int)vp9_convert_qindex_to_q( - cpi->rc.avg_frame_qindex[INTER_FRAME])); + cpi->rc.avg_frame_qindex[INTER_FRAME], cpi->common.bit_depth)); else q = ((int)vp9_convert_qindex_to_q( - cpi->rc.avg_frame_qindex[KEY_FRAME])); + cpi->rc.avg_frame_qindex[KEY_FRAME], cpi->common.bit_depth)); if (q > 16) { strength = oxcf->arnr_strength; } else { diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 90f03426b..e88060c64 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -89,6 +89,10 @@ VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm endif +ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_high_intrapred_sse2.asm +endif + # common (c) VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_common_dspr2.h VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_convolve2_avg_dspr2.c diff --git a/vpx/vpx_frame_buffer.h b/vpx/vpx_frame_buffer.h index e69df4bc8..41038b10d 100644 --- a/vpx/vpx_frame_buffer.h +++ b/vpx/vpx_frame_buffer.h @@ -43,15 +43,15 @@ typedef struct vpx_codec_frame_buffer { * * This callback is invoked by the decoder to retrieve data for the frame * buffer in order for the decode call to complete. The callback must - * allocate at least min_size in bytes and assign it to fb->data. Then the - * callback must set fb->size to the allocated size. The application does not - * need to align the allocated data. The callback is triggered when the - * decoder needs a frame buffer to decode a compressed image into. This - * function may be called more than once for every call to vpx_codec_decode. - * The application may set fb->priv to some data which will be passed - * back in the ximage and the release function call. |fb| is guaranteed to - * not be NULL. On success the callback must return 0. Any failure the - * callback must return a value less than 0. 
+ * allocate at least min_size in bytes and assign it to fb->data. The callback + * must zero out all the data allocated. Then the callback must set fb->size + * to the allocated size. The application does not need to align the allocated + * data. The callback is triggered when the decoder needs a frame buffer to + * decode a compressed image into. This function may be called more than once + * for every call to vpx_codec_decode. The application may set fb->priv to + * some data which will be passed back in the ximage and the release function + * call. |fb| is guaranteed to not be NULL. On success the callback must + * return 0. Any failure the callback must return a value less than 0. * * \param[in] priv Callback's private data * \param[in] new_size Size in bytes needed by the buffer diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c index 70d7ac0c8..475d231e1 100644 --- a/vpx_scale/generic/yv12config.c +++ b/vpx_scale/generic/yv12config.c @@ -199,11 +199,6 @@ int vp9_realloc_frame_buffer(YV12_BUFFER_CONFIG *ybf, if (fb->data == NULL || fb->size < external_frame_size) return -1; - // This memset is needed for fixing valgrind error from C loop filter - // due to access uninitialized memory in frame border. It could be - // removed if border is totally removed. - vpx_memset(fb->data, 0, fb->size); - ybf->buffer_alloc = (uint8_t *)yv12_align_addr(fb->data, 32); } else if (frame_size > (size_t)ybf->buffer_alloc_sz) { // Allocation to hold larger frame, or first allocation. @@ -384,7 +384,7 @@ int get_vp9_frame_buffer(void *cb_priv, size_t min_size, if (ext_fb_list->ext_fb[i].size < min_size) { free(ext_fb_list->ext_fb[i].data); - ext_fb_list->ext_fb[i].data = (uint8_t *)malloc(min_size); + ext_fb_list->ext_fb[i].data = (uint8_t *)calloc(min_size, sizeof(uint8_t)); if (!ext_fb_list->ext_fb[i].data) return -1; |