diff options
59 files changed, 1646 insertions, 2118 deletions
diff --git a/build/make/rtcd.sh b/build/make/rtcd.sh index 2967b5aed..ed037132a 100755 --- a/build/make/rtcd.sh +++ b/build/make/rtcd.sh @@ -209,6 +209,10 @@ common_top() { #define RTCD_EXTERN extern #endif +#ifdef __cplusplus +extern "C" { +#endif + $(process_forward_decls) $(declare_function_pointers c $ALL_ARCHS) @@ -219,6 +223,11 @@ EOF common_bottom() { cat <<EOF + +#ifdef __cplusplus +} // extern "C" +#endif + #endif EOF } diff --git a/example_xma.c b/example_xma.c deleted file mode 100644 index 7aa879810..000000000 --- a/example_xma.c +++ /dev/null @@ -1,191 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -/* This is a simple program showing how to initialize the decoder in XMA mode */ -#include <stdio.h> -#include <stdlib.h> -#include <stdarg.h> -#include <string.h> -#define VPX_CODEC_DISABLE_COMPAT 1 -#include "vpx_config.h" -#include "vpx/vpx_decoder.h" -#include "vpx/vpx_integer.h" -#if CONFIG_VP9_DECODER -#include "vpx/vp8dx.h" -#endif - -static char *exec_name; -static int verbose = 0; - -static const struct { - const char *name; - const vpx_codec_iface_t *iface; -} ifaces[] = { -#if CONFIG_VP9_DECODER - {"vp9", &vpx_codec_vp8_dx_algo}, -#endif -}; - -static void usage_exit(void) { - int i; - - printf("Usage: %s <options>\n\n" - "Options:\n" - "\t--codec <name>\tCodec to use (default=%s)\n" - "\t-h <height>\tHeight of the simulated video frame, in pixels\n" - "\t-w <width> \tWidth of the simulated video frame, in pixels\n" - "\t-v \tVerbose mode (show individual segment sizes)\n" - "\t--help \tShow this message\n" - "\n" - "Included decoders:\n" - "\n", - exec_name, - 
ifaces[0].name); - - for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++) - printf(" %-6s - %s\n", - ifaces[i].name, - vpx_codec_iface_name(ifaces[i].iface)); - - exit(EXIT_FAILURE); -} - -static void usage_error(const char *fmt, ...) { - va_list ap; - va_start(ap, fmt); - vprintf(fmt, ap); - printf("\n"); - usage_exit(); -} - -void my_mem_dtor(vpx_codec_mmap_t *mmap) { - if (verbose) - printf("freeing segment %d\n", mmap->id); - - free(mmap->priv); -} - -int main(int argc, char **argv) { - vpx_codec_ctx_t decoder; - vpx_codec_iface_t *iface = ifaces[0].iface; - vpx_codec_iter_t iter; - vpx_codec_dec_cfg_t cfg; - vpx_codec_err_t res = VPX_CODEC_OK; - unsigned int alloc_sz = 0; - unsigned int w = 352; - unsigned int h = 288; - int i; - - exec_name = argv[0]; - - for (i = 1; i < argc; i++) { - if (!strcmp(argv[i], "--codec")) { - if (i + 1 < argc) { - int j, k = -1; - - i++; - - for (j = 0; j < sizeof(ifaces) / sizeof(ifaces[0]); j++) - if (!strcmp(ifaces[j].name, argv[i])) - k = j; - - if (k >= 0) - iface = ifaces[k].iface; - else - usage_error("Error: Unrecognized argument (%s) to --codec\n", - argv[i]); - } else - usage_error("Error: Option --codec requires argument.\n"); - } else if (!strcmp(argv[i], "-v")) - verbose = 1; - else if (!strcmp(argv[i], "-h")) - if (i + 1 < argc) { - h = atoi(argv[++i]); - } else - usage_error("Error: Option -h requires argument.\n"); - else if (!strcmp(argv[i], "-w")) - if (i + 1 < argc) { - w = atoi(argv[++i]); - } else - usage_error("Error: Option -w requires argument.\n"); - else if (!strcmp(argv[i], "--help")) - usage_exit(); - else - usage_error("Error: Unrecognized option %s\n\n", argv[i]); - } - - if (argc == 1) - printf("Using built-in defaults. For options, rerun with --help\n\n"); - - /* XMA mode is not supported on all decoders! 
*/ - if (!(vpx_codec_get_caps(iface) & VPX_CODEC_CAP_XMA)) { - printf("%s does not support XMA mode!\n", vpx_codec_iface_name(iface)); - return EXIT_FAILURE; - } - - /* The codec knows how much memory to allocate based on the size of the - * encoded frames. This data can be parsed from the bitstream with - * vpx_codec_peek_stream_info() if a bitstream is available. Otherwise, - * a fixed size can be used that will be the upper limit on the frame - * size the decoder can decode. - */ - cfg.w = w; - cfg.h = h; - - /* Initialize the decoder in XMA mode. */ - if (vpx_codec_dec_init(&decoder, iface, &cfg, VPX_CODEC_USE_XMA)) { - printf("Failed to initialize decoder in XMA mode: %s\n", vpx_codec_error(&decoder)); - return EXIT_FAILURE; - } - - /* Iterate through the list of memory maps, allocating them with the - * requested alignment. - */ - iter = NULL; - - do { - vpx_codec_mmap_t mmap; - unsigned int align; - - res = vpx_codec_get_mem_map(&decoder, &mmap, &iter); - align = mmap.align ? mmap.align - 1 : 0; - - if (!res) { - if (verbose) - printf("Allocating segment %u, size %lu, align %u %s\n", - mmap.id, mmap.sz, mmap.align, - mmap.flags & VPX_CODEC_MEM_ZERO ? 
"(ZEROED)" : ""); - - if (mmap.flags & VPX_CODEC_MEM_ZERO) - mmap.priv = calloc(1, mmap.sz + align); - else - mmap.priv = malloc(mmap.sz + align); - - mmap.base = (void *)((((uintptr_t)mmap.priv) + align) & ~(uintptr_t)align); - mmap.dtor = my_mem_dtor; - alloc_sz += mmap.sz + align; - - if (vpx_codec_set_mem_map(&decoder, &mmap, 1)) { - printf("Failed to set mmap: %s\n", vpx_codec_error(&decoder)); - return EXIT_FAILURE; - } - } else if (res != VPX_CODEC_LIST_END) { - printf("Failed to get mmap: %s\n", vpx_codec_error(&decoder)); - return EXIT_FAILURE; - } - } while (res != VPX_CODEC_LIST_END); - - printf("%s\n %d bytes external memory required for %dx%d.\n", - decoder.name, alloc_sz, cfg.w, cfg.h); - vpx_codec_destroy(&decoder); - return EXIT_SUCCESS; - -} diff --git a/examples.mk b/examples.mk index 66b719ca0..b29ab9c34 100644 --- a/examples.mk +++ b/examples.mk @@ -26,6 +26,7 @@ vpxdec.SRCS += args.c args.h vpxdec.SRCS += ivfdec.c ivfdec.h vpxdec.SRCS += tools_common.c tools_common.h vpxdec.SRCS += webmdec.c webmdec.h +vpxdec.SRCS += y4menc.c y4menc.h vpxdec.SRCS += nestegg/halloc/halloc.h vpxdec.SRCS += nestegg/halloc/src/align.h vpxdec.SRCS += nestegg/halloc/src/halloc.c @@ -109,11 +110,13 @@ GEN_EXAMPLES-$(CONFIG_VP8_ENCODER) += vp8cx_set_ref.c vp8cx_set_ref.GUID = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A vp8cx_set_ref.DESCRIPTION = VP8 set encoder reference frame -# C file is provided, not generated automatically. 
-UTILS-$(CONFIG_MULTI_RES_ENCODING) += vp8_multi_resolution_encoder.c -vp8_multi_resolution_encoder.SRCS += $(LIBYUV_SRCS) -vp8_multi_resolution_encoder.GUID = 04f8738e-63c8-423b-90fa-7c2703a374de -vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding + +ifeq ($(CONFIG_MULTI_RES_ENCODING),yes) +GEN_EXAMPLES-$(CONFIG_VP8_DECODER) += vp8_multi_resolution_encoder.c +vp8_multi_resolution_encoder.SRCS += $(LIBYUV_SRCS) +vp8_multi_resolution_encoder.GUID = 04f8738e-63c8-423b-90fa-7c2703a374de +vp8_multi_resolution_encoder.DESCRIPTION = VP8 Multiple-resolution Encoding +endif # Handle extra library flags depending on codec configuration diff --git a/vp8_multi_resolution_encoder.c b/examples/vp8_multi_resolution_encoder.c index 4c29056e5..4c29056e5 100644 --- a/vp8_multi_resolution_encoder.c +++ b/examples/vp8_multi_resolution_encoder.c diff --git a/test/codec_factory.h b/test/codec_factory.h index 2ca6ff086..c060e86dc 100644 --- a/test/codec_factory.h +++ b/test/codec_factory.h @@ -10,7 +10,6 @@ #ifndef TEST_CODEC_FACTORY_H_ #define TEST_CODEC_FACTORY_H_ -extern "C" { #include "./vpx_config.h" #include "vpx/vpx_decoder.h" #include "vpx/vpx_encoder.h" @@ -20,7 +19,6 @@ extern "C" { #if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER #include "vpx/vp8dx.h" #endif -} #include "test/decode_test_driver.h" #include "test/encode_test_driver.h" diff --git a/test/datarate_test.cc b/test/datarate_test.cc index 0b4ddaece..db7dfdb53 100644 --- a/test/datarate_test.cc +++ b/test/datarate_test.cc @@ -7,11 +7,13 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ +#include "./vpx_config.h" #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" #include "test/util.h" +#include "test/y4m_video_source.h" namespace { @@ -286,6 +288,37 @@ TEST_P(DatarateTestVP9, BasicRateTargeting) { } } +#if CONFIG_NON420 +// Check basic rate targeting, +TEST_P(DatarateTestVP9, BasicRateTargeting444) { + ::libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 140); + + cfg_.g_profile = 1; + cfg_.g_timebase = video.timebase(); + + cfg_.rc_buf_initial_sz = 500; + cfg_.rc_buf_optimal_sz = 500; + cfg_.rc_buf_sz = 1000; + cfg_.rc_dropframe_thresh = 1; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 63; + cfg_.rc_end_usage = VPX_CBR; + + for (int i = 250; i < 900; i += 200) { + cfg_.rc_target_bitrate = i; + ResetModel(); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + ASSERT_GE(static_cast<double>(cfg_.rc_target_bitrate), + effective_datarate_ * 0.85) + << " The datarate for the file exceeds the target by too much!"; + ASSERT_LE(static_cast<double>(cfg_.rc_target_bitrate), + effective_datarate_ * 1.15) + << " The datarate for the file missed the target!" + << cfg_.rc_target_bitrate << " "<< effective_datarate_; + } +} +#endif + // Check that (1) the first dropped frame gets earlier and earlier // as the drop frame threshold is increased, and (2) that the total number of // frame drops does not decrease as we increase frame drop threshold. 
diff --git a/test/idct8x8_test.cc b/test/idct8x8_test.cc index d8c61ffb2..5f4c33a81 100644 --- a/test/idct8x8_test.cc +++ b/test/idct8x8_test.cc @@ -14,9 +14,7 @@ #include "third_party/googletest/src/include/gtest/gtest.h" -extern "C" { #include "./vp9_rtcd.h" -} #include "test/acm_random.h" #include "vpx/vpx_integer.h" diff --git a/test/idct_test.cc b/test/idct_test.cc index 2c7fa0ef8..1bbf80a0a 100644 --- a/test/idct_test.cc +++ b/test/idct_test.cc @@ -8,10 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ -extern "C" { #include "./vpx_config.h" #include "./vp8_rtcd.h" -} #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "third_party/googletest/src/include/gtest/gtest.h" diff --git a/test/pp_filter_test.cc b/test/pp_filter_test.cc index e5ac9db2b..ff7bb08e3 100644 --- a/test/pp_filter_test.cc +++ b/test/pp_filter_test.cc @@ -10,12 +10,10 @@ #include "test/clear_system_state.h" #include "test/register_state_check.h" #include "third_party/googletest/src/include/gtest/gtest.h" -extern "C" { #include "./vpx_config.h" #include "./vp8_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" -} typedef void (*post_proc_func_t)(unsigned char *src_ptr, unsigned char *dst_ptr, diff --git a/test/resize_test.cc b/test/resize_test.cc index e8c2c825b..1963453fd 100644 --- a/test/resize_test.cc +++ b/test/resize_test.cc @@ -208,7 +208,7 @@ class ResizeInternalTest : public ResizeTest { virtual void PSNRPktHook(const vpx_codec_cx_pkt_t *pkt) { if (!frame0_psnr_) frame0_psnr_ = pkt->data.psnr.psnr[0]; - EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 1.5); + EXPECT_NEAR(pkt->data.psnr.psnr[0], frame0_psnr_, 2.0); } virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) { diff --git a/test/sad_test.cc b/test/sad_test.cc index 453b3a84e..4a91b0b60 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -13,7 +13,6 @@ #include <limits.h> #include <stdio.h> -extern "C" { #include "./vpx_config.h" #if 
CONFIG_VP8_ENCODER #include "./vp8_rtcd.h" @@ -22,7 +21,6 @@ extern "C" { #include "./vp9_rtcd.h" #endif #include "vpx_mem/vpx_mem.h" -} #include "test/acm_random.h" #include "test/clear_system_state.h" diff --git a/test/sixtap_predict_test.cc b/test/sixtap_predict_test.cc index 0f5c0a5e8..3434662fb 100644 --- a/test/sixtap_predict_test.cc +++ b/test/sixtap_predict_test.cc @@ -16,12 +16,10 @@ #include "test/register_state_check.h" #include "test/util.h" #include "third_party/googletest/src/include/gtest/gtest.h" -extern "C" { #include "./vpx_config.h" #include "./vp8_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_mem/vpx_mem.h" -} namespace { diff --git a/test/test-data.sha1 b/test/test-data.sha1 index 492705948..6daf69e63 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -1,5 +1,6 @@ d5dfb0151c9051f8c85999255645d7a23916d3c0 hantro_collage_w352h288.yuv b87815bf86020c592ccc7a846ba2e28ec8043902 hantro_odd.yuv +b1f1c3ec79114b9a0651af24ce634afb44a9a419 rush_hour_444.y4m 5184c46ddca8b1fadd16742e8500115bc8f749da vp80-00-comprehensive-001.ivf 65bf1bbbced81b97bd030f376d1b7f61a224793f vp80-00-comprehensive-002.ivf 906b4c1e99eb734504c504b3f1ad8052137ce672 vp80-00-comprehensive-003.ivf diff --git a/test/test.mk b/test/test.mk index 5a1d39de5..178b16210 100644 --- a/test/test.mk +++ b/test/test.mk @@ -24,6 +24,8 @@ LIBVPX_TEST_SRCS-yes += encode_test_driver.cc LIBVPX_TEST_SRCS-yes += encode_test_driver.h LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += error_resilience_test.cc LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += i420_video_source.h +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += y4m_video_source.h +LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += ../y4minput.h ../y4minput.c LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += keyframe_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += resize_test.cc @@ -118,6 +120,7 @@ endif ## LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_collage_w352h288.yuv LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += 
hantro_odd.yuv +LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-002.ivf diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc index 6d93bb88f..4adf9af91 100644 --- a/test/test_vector_test.cc +++ b/test/test_vector_test.cc @@ -19,9 +19,7 @@ #include "test/test_vectors.h" #include "test/util.h" #include "test/webm_video_source.h" -extern "C" { #include "vpx_mem/vpx_mem.h" -} namespace { diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc index 863a3669a..d7144522b 100644 --- a/test/tile_independence_test.cc +++ b/test/tile_independence_test.cc @@ -17,9 +17,7 @@ #include "test/i420_video_source.h" #include "test/util.h" #include "test/md5_helper.h" -extern "C" { #include "vpx_mem/vpx_mem.h" -} namespace { class TileIndependenceTest : public ::libvpx_test::EncoderTest, diff --git a/test/vp8_fdct4x4_test.cc b/test/vp8_fdct4x4_test.cc index 25465c53c..e3c292ea1 100644 --- a/test/vp8_fdct4x4_test.cc +++ b/test/vp8_fdct4x4_test.cc @@ -15,9 +15,7 @@ #include <string.h> #include <sys/types.h> -extern "C" { #include "./vp8_rtcd.h" -} #include "test/acm_random.h" #include "third_party/googletest/src/include/gtest/gtest.h" diff --git a/test/vp9_lossless_test.cc b/test/vp9_lossless_test.cc index 03b89f8df..2282687dc 100644 --- a/test/vp9_lossless_test.cc +++ b/test/vp9_lossless_test.cc @@ -7,12 +7,13 @@ * in the file PATENTS. All contributing project authors may * be found in the AUTHORS file in the root of the source tree. 
*/ - +#include "./vpx_config.h" #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/codec_factory.h" #include "test/encode_test_driver.h" #include "test/i420_video_source.h" #include "test/util.h" +#include "test/y4m_video_source.h" namespace { @@ -71,5 +72,25 @@ TEST_P(LossLessTest, TestLossLessEncoding) { const double psnr_lossless = GetMinPsnr(); EXPECT_GE(psnr_lossless, kMaxPsnr); } + +#if CONFIG_NON420 +TEST_P(LossLessTest, TestLossLessEncoding444) { + libvpx_test::Y4mVideoSource video("rush_hour_444.y4m", 0, 10); + + cfg_.g_profile = 1; + cfg_.g_timebase = video.timebase(); + cfg_.rc_target_bitrate = 2000; + cfg_.g_lag_in_frames = 25; + cfg_.rc_min_quantizer = 0; + cfg_.rc_max_quantizer = 0; + + init_flags_ = VPX_CODEC_USE_PSNR; + + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); + const double psnr_lossless = GetMinPsnr(); + EXPECT_GE(psnr_lossless, kMaxPsnr); +} +#endif + VP9_INSTANTIATE_TEST_CASE(LossLessTest, ALL_TEST_MODES); } // namespace diff --git a/test/y4m_video_source.h b/test/y4m_video_source.h new file mode 100644 index 000000000..bd86c2c04 --- /dev/null +++ b/test/y4m_video_source.h @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef TEST_Y4M_VIDEO_SOURCE_H_ +#define TEST_Y4M_VIDEO_SOURCE_H_ +#include <string> + +#include "test/video_source.h" +extern "C" { +#include "./y4minput.h" +} + +namespace libvpx_test { + +// This class extends VideoSource to allow parsing of raw yv12 +// so that we can do actual file encodes. 
+class Y4mVideoSource : public VideoSource { + public: + Y4mVideoSource(const std::string &file_name, + unsigned int start, int limit) + : file_name_(file_name), + input_file_(NULL), + img_(new vpx_image_t()), + start_(start), + limit_(limit), + frame_(0), + framerate_numerator_(0), + framerate_denominator_(0), + y4m_() { + } + + virtual ~Y4mVideoSource() { + vpx_img_free(img_.get()); + y4m_input_close(&y4m_); + if (input_file_) + fclose(input_file_); + } + + virtual void Begin() { + if (input_file_) + fclose(input_file_); + input_file_ = OpenTestDataFile(file_name_); + ASSERT_TRUE(input_file_ != NULL) << "Input file open failed. Filename: " + << file_name_; + + y4m_input_open(&y4m_, input_file_, NULL, 0, 0); + framerate_numerator_ = y4m_.fps_n; + framerate_denominator_ = y4m_.fps_d; + + frame_ = 0; + for (unsigned int i = 0; i < start_; i++) { + Next(); + } + + FillFrame(); + } + + virtual void Next() { + ++frame_; + FillFrame(); + } + + virtual vpx_image_t *img() const { + return (frame_ < limit_) ? img_.get() : NULL; + } + + // Models a stream where Timebase = 1/FPS, so pts == frame. + virtual vpx_codec_pts_t pts() const { return frame_; } + + virtual unsigned long duration() const { return 1; } + + virtual vpx_rational_t timebase() const { + const vpx_rational_t t = { framerate_denominator_, framerate_numerator_ }; + return t; + } + + virtual unsigned int frame() const { return frame_; } + + virtual unsigned int limit() const { return limit_; } + + virtual void FillFrame() { + ASSERT_TRUE(input_file_ != NULL); + // Read a frame from input_file. 
+ y4m_input_fetch_frame(&y4m_, input_file_, img_.get()); + } + + protected: + std::string file_name_; + FILE *input_file_; + testing::internal::scoped_ptr<vpx_image_t> img_; + unsigned int start_; + unsigned int limit_; + unsigned int frame_; + int framerate_numerator_; + int framerate_denominator_; + y4m_input y4m_; +}; + +} // namespace libvpx_test + +#endif // TEST_Y4M_VIDEO_SOURCE_H_ diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index 209a25d82..119e40cdc 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -39,8 +39,8 @@ extern "C" typedef enum { - USAGE_STREAM_FROM_SERVER = 0x0, - USAGE_LOCAL_FILE_PLAYBACK = 0x1, + USAGE_LOCAL_FILE_PLAYBACK = 0x0, + USAGE_STREAM_FROM_SERVER = 0x1, USAGE_CONSTRAINED_QUALITY = 0x2, USAGE_CONSTANT_QUALITY = 0x3 } END_USAGE; diff --git a/vp8/encoder/arm/neon/denoising_neon.c b/vp8/encoder/arm/neon/denoising_neon.c index d517dfa37..3f8539759 100644 --- a/vp8/encoder/arm/neon/denoising_neon.c +++ b/vp8/encoder/arm/neon/denoising_neon.c @@ -119,8 +119,10 @@ int vp8_denoiser_filter_neon(YV12_BUFFER_CONFIG *mc_running_avg, v_abs_adjustment); v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment); v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment); - v_sum_diff = vqaddq_s8(v_sum_diff, (int8x16_t)v_pos_adjustment); - v_sum_diff = vqsubq_s8(v_sum_diff, (int8x16_t)v_neg_adjustment); + v_sum_diff = vqaddq_s8(v_sum_diff, + vreinterpretq_s8_u8(v_pos_adjustment)); + v_sum_diff = vqsubq_s8(v_sum_diff, + vreinterpretq_s8_u8(v_neg_adjustment)); /* Store results. 
*/ vst1q_u8(running_avg_y, v_running_avg_y); diff --git a/vp9/common/arm/neon/vp9_reconintra_neon.asm b/vp9/common/arm/neon/vp9_reconintra_neon.asm index 98619bb30..71bf24c9f 100644 --- a/vp9/common/arm/neon/vp9_reconintra_neon.asm +++ b/vp9/common/arm/neon/vp9_reconintra_neon.asm @@ -17,6 +17,7 @@ EXPORT |vp9_h_predictor_16x16_neon| EXPORT |vp9_h_predictor_32x32_neon| EXPORT |vp9_tm_predictor_4x4_neon| + EXPORT |vp9_tm_predictor_8x8_neon| ARM REQUIRE8 PRESERVE8 @@ -328,8 +329,78 @@ loop_h vqshrun.s16 d1, q2, #0 vst1.32 {d0[0]}, [r0], r1 vst1.32 {d1[0]}, [r0], r1 - bx lr ENDP ; |vp9_tm_predictor_4x4_neon| +;void vp9_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride, +; const uint8_t *above, +; const uint8_t *left) +; r0 uint8_t *dst +; r1 ptrdiff_t y_stride +; r2 const uint8_t *above +; r3 const uint8_t *left + +|vp9_tm_predictor_8x8_neon| PROC + ; Load ytop_left = above[-1]; + sub r12, r2, #1 + ldrb r12, [r12] + vdup.u8 d0, r12 + + ; Load above 8 pixels + vld1.64 {d2}, [r2] + + ; Compute above - ytop_left + vsubl.u8 q3, d2, d0 + + ; Load left row by row and compute left + (above - ytop_left) + ; 1st row and 2nd row + ldrb r12, [r3], #1 + ldrb r2, [r3], #1 + vdup.u16 q1, r12 + vdup.u16 q2, r2 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqshrun.s16 d0, q1, #0 + vqshrun.s16 d1, q2, #0 + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + + ; 3rd row and 4th row + ldrb r12, [r3], #1 + ldrb r2, [r3], #1 + vdup.u16 q1, r12 + vdup.u16 q2, r2 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqshrun.s16 d0, q1, #0 + vqshrun.s16 d1, q2, #0 + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + + ; 5th row and 6th row + ldrb r12, [r3], #1 + ldrb r2, [r3], #1 + vdup.u16 q1, r12 + vdup.u16 q2, r2 + vadd.s16 q1, q1, q3 + vadd.s16 q2, q2, q3 + vqshrun.s16 d0, q1, #0 + vqshrun.s16 d1, q2, #0 + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + + ; 7rd row and 8th row + ldrb r12, [r3], #1 + ldrb r2, [r3], #1 + vdup.u16 q1, r12 + vdup.u16 q2, r2 + vadd.s16 q1, q1, q3 + vadd.s16 q2, 
q2, q3 + vqshrun.s16 d0, q1, #0 + vqshrun.s16 d1, q2, #0 + vst1.64 {d0}, [r0], r1 + vst1.64 {d1}, [r0], r1 + bx lr + ENDP ; |vp9_tm_predictor_8x8_neon| + END diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c index ff20553d6..ca42090c1 100644 --- a/vp9/common/vp9_alloccommon.c +++ b/vp9/common/vp9_alloccommon.c @@ -85,7 +85,7 @@ int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) { int mi_size; if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y, - VP9BORDERINPIXELS, NULL, NULL, NULL) < 0) + VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL) < 0) goto fail; set_mb_mi(cm, aligned_width, aligned_height); @@ -154,7 +154,7 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { for (i = 0; i < cm->fb_count; i++) { cm->fb_idx_ref_cnt[i] = 0; if (vp9_alloc_frame_buffer(&cm->yv12_fb[i], width, height, ss_x, ss_y, - VP9BORDERINPIXELS) < 0) + VP9_ENC_BORDER_IN_PIXELS) < 0) goto fail; } @@ -167,7 +167,7 @@ int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) { } if (vp9_alloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y, - VP9BORDERINPIXELS) < 0) + VP9_ENC_BORDER_IN_PIXELS) < 0) goto fail; set_mb_mi(cm, aligned_width, aligned_height); diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 21e2b16a4..ad78b0dc4 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -122,7 +122,6 @@ typedef struct { TX_SIZE tx_size; int_mv mv[2]; // for each reference frame used int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES]; - int_mv best_mv[2]; uint8_t mode_context[MAX_REF_FRAMES]; @@ -242,6 +241,9 @@ typedef struct macroblockd { /* pointer to current frame */ const YV12_BUFFER_CONFIG *cur_buf; + /* mc buffer */ + DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]); + int lossless; /* Inverse transform function pointers. 
*/ void (*itxm_add)(const int16_t *input, uint8_t *dest, int stride, int eob); diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h index f43a85f14..ba162fd20 100644 --- a/vp9/common/vp9_entropy.h +++ b/vp9/common/vp9_entropy.h @@ -112,8 +112,8 @@ static INLINE void reset_skip_context(MACROBLOCKD *xd, BLOCK_SIZE bsize) { // This macro is currently unused but may be used by certain implementations #define MAXBAND_INDEX 21 -extern const uint8_t vp9_coefband_trans_8x8plus[1024]; -extern const uint8_t vp9_coefband_trans_4x4[16]; +extern DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_8x8plus[1024]); +extern DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_4x4[16]); static const uint8_t *get_band_translate(TX_SIZE tx_size) { return tx_size == TX_4X4 ? vp9_coefband_trans_4x4 diff --git a/vp9/common/vp9_mvref_common.h b/vp9/common/vp9_mvref_common.h index 06adbabaa..cd89390d5 100644 --- a/vp9/common/vp9_mvref_common.h +++ b/vp9/common/vp9_mvref_common.h @@ -32,8 +32,10 @@ static INLINE void vp9_find_mv_refs(const VP9_COMMON *cm, const MACROBLOCKD *xd, mv_ref_list, -1, mi_row, mi_col); } -#define LEFT_TOP_MARGIN ((VP9BORDERINPIXELS - VP9_INTERP_EXTEND) << 3) -#define RIGHT_BOTTOM_MARGIN ((VP9BORDERINPIXELS - VP9_INTERP_EXTEND) << 3) +#define LEFT_TOP_MARGIN ((VP9_ENC_BORDER_IN_PIXELS \ + - VP9_INTERP_EXTEND) << 3) +#define RIGHT_BOTTOM_MARGIN ((VP9_ENC_BORDER_IN_PIXELS \ + - VP9_INTERP_EXTEND) << 3) // check a list of motion vectors by sad score using a number rows of pixels // above and a number cols of pixels in the left to select the one with best diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c index 397f446f3..b5a9248c3 100644 --- a/vp9/common/vp9_reconinter.c +++ b/vp9/common/vp9_reconinter.c @@ -20,15 +20,16 @@ #include "vp9/common/vp9_reconinter.h" #include "vp9/common/vp9_reconintra.h" -static void build_mc_border(const uint8_t *src, uint8_t *dst, int stride, - int x, int y, int b_w, int b_h, int w, int h) { +static 
void build_mc_border(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + int x, int y, int b_w, int b_h, int w, int h) { // Get a pointer to the start of the real data for this row. - const uint8_t *ref_row = src - x - y * stride; + const uint8_t *ref_row = src - x - y * src_stride; if (y >= h) - ref_row += (h - 1) * stride; + ref_row += (h - 1) * src_stride; else if (y > 0) - ref_row += y * stride; + ref_row += y * src_stride; do { int right = 0, copy; @@ -49,16 +50,16 @@ static void build_mc_border(const uint8_t *src, uint8_t *dst, int stride, memset(dst, ref_row[0], left); if (copy) - memmove(dst + left, ref_row + x + left, copy); + memcpy(dst + left, ref_row + x + left, copy); if (right) memset(dst + left + copy, ref_row[w - 1], right); - dst += stride; + dst += dst_stride; ++y; if (y > 0 && y < h) - ref_row += stride; + ref_row += src_stride; } while (--b_h); } @@ -281,7 +282,7 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, MV32 scaled_mv; int xs, ys, x0, y0, x0_16, y0_16, x1, y1, frame_width, - frame_height, subpel_x, subpel_y; + frame_height, subpel_x, subpel_y, buf_stride; uint8_t *ref_frame, *buf_ptr; const YV12_BUFFER_CONFIG *ref_buf = xd->block_refs[ref]->buf; @@ -308,7 +309,7 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf); xs = sf->x_step_q4; ys = sf->y_step_q4; - // Get block position in the scaled reference frame. + // Map the top left corner of the block into the reference frame. x0 = sf->scale_value_x(x0, sf); y0 = sf->scale_value_y(y0, sf); x0_16 = sf->scale_value_x(x0_16, sf); @@ -321,7 +322,7 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, subpel_x = scaled_mv.col & SUBPEL_MASK; subpel_y = scaled_mv.row & SUBPEL_MASK; - // Get reference block top left coordinate. + // Calculate the top left corner of the best matching block in the reference frame. 
x0 += scaled_mv.col >> SUBPEL_BITS; y0 += scaled_mv.row >> SUBPEL_BITS; x0_16 += scaled_mv.col; @@ -329,24 +330,28 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, // Get reference block bottom right coordinate. x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1; - y1 = ((y0_16 + (h - 1) * xs) >> SUBPEL_BITS) + 1; + y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1; // Get reference block pointer. buf_ptr = ref_frame + y0 * pre_buf->stride + x0; + buf_stride = pre_buf->stride; - // Do border extension if there is motion or + // Do border extension if there is motion or the // width/height is not a multiple of 8 pixels. if (scaled_mv.col || scaled_mv.row || (frame_width & 0x7) || (frame_height & 0x7)) { + int x_pad = 0, y_pad = 0; - if (subpel_x) { + if (subpel_x || (sf->x_step_q4 & SUBPEL_MASK)) { x0 -= VP9_INTERP_EXTEND - 1; x1 += VP9_INTERP_EXTEND; + x_pad = 1; } - if (subpel_y) { + if (subpel_y || (sf->y_step_q4 & SUBPEL_MASK)) { y0 -= VP9_INTERP_EXTEND - 1; y1 += VP9_INTERP_EXTEND; + y_pad = 1; } // Skip border extension if block is inside the frame. @@ -354,12 +359,14 @@ static void dec_build_inter_predictors(MACROBLOCKD *xd, int plane, int block, y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) { uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0; // Extend the border. 
- build_mc_border(buf_ptr1, buf_ptr1, pre_buf->stride, x0, y0, x1 - x0, - y1 - y0, frame_width, frame_height); + build_mc_border(buf_ptr1, pre_buf->stride, xd->mc_buf, x1 - x0, + x0, y0, x1 - x0, y1 - y0, frame_width, frame_height); + buf_stride = x1 - x0; + buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3; } } - inter_predictor(buf_ptr, pre_buf->stride, dst, dst_buf->stride, subpel_x, + inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x, subpel_y, sf, w, h, ref, &xd->subpix, xs, ys); } } diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 8f858f47c..caa6947b3 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -96,7 +96,7 @@ prototype void vp9_v_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint specialize vp9_v_predictor_8x8 $sse_x86inc neon prototype void vp9_tm_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" -specialize vp9_tm_predictor_8x8 $sse2_x86inc dspr2 +specialize vp9_tm_predictor_8x8 $sse2_x86inc neon dspr2 prototype void vp9_dc_predictor_8x8 "uint8_t *dst, ptrdiff_t y_stride, const uint8_t *above, const uint8_t *left" specialize vp9_dc_predictor_8x8 $sse_x86inc dspr2 @@ -742,7 +742,7 @@ specialize vp9_full_search_sad sse3 sse4_1 vp9_full_search_sad_sse3=vp9_full_search_sadx3 vp9_full_search_sad_sse4_1=vp9_full_search_sadx8 -prototype int vp9_refining_search_sad "struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv" +prototype int vp9_refining_search_sad "const struct macroblock *x, struct mv *ref_mv, int sad_per_bit, int distance, struct vp9_variance_vtable *fn_ptr, DEC_MVCOSTS, const struct mv *center_mv" specialize vp9_refining_search_sad sse3 vp9_refining_search_sad_sse3=vp9_refining_search_sadx4 @@ -756,9 +756,5 @@ specialize vp9_full_range_search prototype void vp9_temporal_filter_apply "uint8_t *frame1, unsigned int stride, 
uint8_t *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count" specialize vp9_temporal_filter_apply sse2 -prototype void vp9_yv12_copy_partial_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction" -specialize vp9_yv12_copy_partial_frame - - fi # end encoder functions diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index f4f758297..f95423678 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -23,105 +23,20 @@ typedef void filter8_1dfunction ( const short *filter ); -#if (HAVE_SSSE3) +#if HAVE_SSSE3 +filter8_1dfunction vp9_filter_block1d16_v8_ssse3; +filter8_1dfunction vp9_filter_block1d16_h8_ssse3; +filter8_1dfunction vp9_filter_block1d8_v8_ssse3; +filter8_1dfunction vp9_filter_block1d8_h8_ssse3; +filter8_1dfunction vp9_filter_block1d4_v8_ssse3; +filter8_1dfunction vp9_filter_block1d4_h8_ssse3; filter8_1dfunction vp9_filter_block1d16_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d16_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d8_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d8_h8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_v8_avg_ssse3; filter8_1dfunction vp9_filter_block1d4_h8_avg_ssse3; -#if (ARCH_X86_64) -filter8_1dfunction vp9_filter_block1d16_v8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d16_h8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d8_v8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d4_v8_intrin_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_intrin_ssse3; - -void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - /* Ensure the filter can be compressed to int16_t. 
*/ - if (x_step_q4 == 16 && filter_x[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_h8_intrin_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_h8_intrin_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_h8_intrin_ssse3(src, src_stride, - dst, dst_stride, - h, filter_x); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} -void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, - uint8_t *dst, ptrdiff_t dst_stride, - const int16_t *filter_x, int x_step_q4, - const int16_t *filter_y, int y_step_q4, - int w, int h) { - if (y_step_q4 == 16 && filter_y[3] != 128) { - while (w >= 16) { - vp9_filter_block1d16_v8_intrin_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 16; - dst += 16; - w -= 16; - } - while (w >= 8) { - vp9_filter_block1d8_v8_intrin_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 8; - dst += 8; - w -= 8; - } - while (w >= 4) { - vp9_filter_block1d4_v8_intrin_ssse3(src - src_stride * 3, src_stride, - dst, dst_stride, - h, filter_y); - src += 4; - dst += 4; - w -= 4; - } - } - if (w) { - vp9_convolve8_vert_c(src, src_stride, dst, dst_stride, - filter_x, x_step_q4, filter_y, y_step_q4, - w, h); - } -} - -#else -filter8_1dfunction vp9_filter_block1d16_v8_ssse3; -filter8_1dfunction vp9_filter_block1d16_h8_ssse3; -filter8_1dfunction vp9_filter_block1d8_v8_ssse3; -filter8_1dfunction vp9_filter_block1d8_h8_ssse3; -filter8_1dfunction vp9_filter_block1d4_v8_ssse3; -filter8_1dfunction vp9_filter_block1d4_h8_ssse3; void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, @@ -198,7 +113,6 @@ 
void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride, w, h); } } -#endif void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, diff --git a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c b/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c deleted file mode 100644 index 303fced3b..000000000 --- a/vp9/common/x86/vp9_subpixel_8t_intrin_ssse3.c +++ /dev/null @@ -1,591 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <tmmintrin.h> -#include "vpx_ports/mem.h" -#include "vpx_ports/emmintrin_compat.h" - - -// filters only for the 4_h8 convolution -DECLARE_ALIGNED(16, const unsigned char, -filt1_4_h8[16])= {0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6}; - -DECLARE_ALIGNED(16, const unsigned char, -filt2_4_h8[16])= {4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10}; - -// filters for 8_h8 and 16_h8 -DECLARE_ALIGNED(16, const unsigned char, -filt1_global[16])= {0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8}; - -DECLARE_ALIGNED(16, const unsigned char, -filt2_global[16])= {2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10}; - -DECLARE_ALIGNED(16, const unsigned char, -filt3_global[16])= {4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12}; - -DECLARE_ALIGNED(16, const unsigned char, -filt4_global[16])= {6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14}; - - - -void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - int16_t *filter) { - __m128i firstFilters, secondFilters, thirdFilters, 
forthFilters; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, srcReg, minReg; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 =_mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter into the first lane - firstFilters = _mm_shufflelo_epi16(filtersReg, 0); - // duplicate only the third 16 bit in the filter into the first lane - secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); - // duplicate only the seconds 16 bits in the filter into the second lane - firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); - // duplicate only the forth 16 bits in the filter into the second lane - secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); - - // loading the local filters - thirdFilters =_mm_load_si128((__m128i const *)filt1_4_h8); - forthFilters = _mm_load_si128((__m128i const *)filt2_4_h8); - - for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); - - // filter the source buffer - srcRegFilt1= _mm_shuffle_epi8(srcReg, thirdFilters); - srcRegFilt2= _mm_shuffle_epi8(srcReg, forthFilters); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // extract the higher half of the lane - srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8); - srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8); - - minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2); - - // add and saturate all the results together - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); - - srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2); - - srcRegFilt1 = 
_mm_adds_epi16(srcRegFilt1, minReg); - - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); - - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bits - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - - src_ptr+=src_pixels_per_line; - - // save only 4 bytes - *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); - - output_ptr+=output_pitch; - } -} - - -void vp9_filter_block1d8_h8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - int16_t *filter) { - __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; - __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - __m128i addFilterReg64, filtersReg, minReg; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. 
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 128 bit register - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 128 bit register - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 128 bit register - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 128 bit register - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - filt1Reg = _mm_load_si128((__m128i const *)filt1_global); - filt2Reg = _mm_load_si128((__m128i const *)filt2_global); - filt3Reg = _mm_load_si128((__m128i const *)filt3_global); - filt4Reg = _mm_load_si128((__m128i const *)filt4_global); - - for (i = 0; i < output_height; i++) { - srcReg = _mm_loadu_si128((__m128i *)(src_ptr-3)); - - // filter the source buffer - srcRegFilt1= _mm_shuffle_epi8(srcReg, filt1Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg, filt2Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg, filt3Reg); - srcRegFilt4= _mm_shuffle_epi8(srcReg, filt4Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); - srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); - - // add and saturate all the results together - minReg = _mm_min_epi16(srcRegFilt4, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - - srcRegFilt4= _mm_max_epi16(srcRegFilt4, srcRegFilt3); - - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - - srcRegFilt1 = 
_mm_adds_epi16(srcRegFilt1, srcRegFilt4); - - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bits - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - - src_ptr+=src_pixels_per_line; - - // save only 8 bytes - _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); - - output_ptr+=output_pitch; - } -} - -void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pixels_per_line, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int output_height, - int16_t *filter) { - __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; - __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; - __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. 
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits (first and second byte) - // across 128 bit register - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits (third and forth byte) - // across 128 bit register - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits (fifth and sixth byte) - // across 128 bit register - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits (seventh and eighth byte) - // across 128 bit register - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - filt1Reg = _mm_load_si128((__m128i const *)filt1_global); - filt2Reg = _mm_load_si128((__m128i const *)filt2_global); - filt3Reg = _mm_load_si128((__m128i const *)filt3_global); - filt4Reg = _mm_load_si128((__m128i const *)filt4_global); - - for (i = 0; i < output_height; i++) { - srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); - - // filter the source buffer - srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt2Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt4Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - - // reading the next 16 bytes. 
- // (part of it was being read by earlier read) - srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); - - // add and saturate the results together - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, - _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - - // filter the source buffer - srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt2Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); - - // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); - - // filter the source buffer - srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt4Reg); - srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); - - // add and saturate the results together - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm_min_epi16(srcRegFilt3, srcRegFilt2)); - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, - _mm_max_epi16(srcRegFilt3, srcRegFilt2)); - - srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); - - srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); - srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); - - src_ptr+=src_pixels_per_line; - - // save 16 bytes - _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); - - output_ptr+=output_pitch; - } -} - - - -void vp9_filter_block1d4_v8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char 
*output_ptr, - unsigned int out_pitch, - unsigned int output_height, - int16_t *filter) { - __m128i addFilterReg64, filtersReg, firstFilters, secondFilters; - __m128i minReg, srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter into the first lane - firstFilters = _mm_shufflelo_epi16(filtersReg, 0); - // duplicate only the second 16 bits in the filter into the second lane - firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u); - // duplicate only the third 16 bits in the filter into the first lane - secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu); - // duplicate only the forth 16 bits in the filter into the second lane - secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu); - - for (i = 0; i < output_height; i++) { - // load the first 4 byte - srcRegFilt1 = _mm_cvtsi32_si128(*((int*)&src_ptr[0])); - // load the next 4 bytes in stride of src_pitch - srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch)[0])); - - // merge the result together - srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); - - - srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*2)[0])); - srcRegFilt3 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*3)[0])); - - // merge the result together - srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); - - srcRegFilt3 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*4)[0])); - srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*5)[0])); - - // merge the result together - srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); - srcRegFilt1 = _mm_unpacklo_epi64(srcRegFilt1, srcRegFilt2); - - 
srcRegFilt4 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*6)[0])); - srcRegFilt2 = _mm_cvtsi32_si128(*((int*)&(src_ptr+src_pitch*7)[0])); - - // merge the result together - srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt4, srcRegFilt2); - srcRegFilt3 = _mm_unpacklo_epi64(srcRegFilt3, srcRegFilt4); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); - - // extract the second lane of the 128 bit register - srcRegFilt2 = _mm_srli_si128(srcRegFilt1, 8); - - // add and saturate the results together - minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm_srli_si128(srcRegFilt3, 8)); - srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - - src_ptr+=src_pitch; - - // save only 4 bytes convolve result - *((int*)&output_ptr[0])= _mm_cvtsi128_si32(srcRegFilt1); - - output_ptr+=out_pitch; - } -} - -void vp9_filter_block1d8_v8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - int16_t *filter) { - __m128i addFilterReg64, filtersReg, minReg, srcRegFilt6; - __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4, srcRegFilt5; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same 
data - // in both lanes of 128 bit register. - filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits in the filter - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits in the filter - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits in the filter - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - for (i = 0; i < output_height; i++) { - // load the first 8 bytes - srcRegFilt1 = _mm_loadl_epi64((__m128i *)&src_ptr[0]); - // load the next 8 bytes in stride of src_pitch - srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch)[0]); - srcRegFilt3 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*2)[0]); - srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*3)[0]); - - // merge the result together - srcRegFilt1 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); - srcRegFilt3 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); - - // load the next 8 bytes in stride of src_pitch - srcRegFilt2 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*4)[0]); - srcRegFilt4 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*5)[0]); - srcRegFilt5 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*6)[0]); - srcRegFilt6 = _mm_loadl_epi64((__m128i *)&(src_ptr+src_pitch*7)[0]); - - // merge the result together - srcRegFilt2 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt4); - srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt5, srcRegFilt6); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); - srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); - srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters); - - // add and saturate the results together - 
minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5); - srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits - srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); - - src_ptr+=src_pitch; - - // save only 8 bytes convolve result - _mm_storel_epi64((__m128i*)&output_ptr[0], srcRegFilt1); - - output_ptr+=out_pitch; - } -} - - -void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, - unsigned int src_pitch, - unsigned char *output_ptr, - unsigned int out_pitch, - unsigned int output_height, - int16_t *filter) { - __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3; - __m128i firstFilters, secondFilters, thirdFilters, forthFilters; - __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; - unsigned int i; - - // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 - addFilterReg64 = _mm_set1_epi32((int)0x0400040u); - filtersReg = _mm_loadu_si128((__m128i *)filter); - // converting the 16 bit (short) to 8 bit (byte) and have the same data - // in both lanes of 128 bit register. 
- filtersReg =_mm_packs_epi16(filtersReg, filtersReg); - - // duplicate only the first 16 bits in the filter - firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); - // duplicate only the second 16 bits in the filter - secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); - // duplicate only the third 16 bits in the filter - thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); - // duplicate only the forth 16 bits in the filter - forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); - - - for (i = 0; i < output_height; i++) { - // load the first 16 bytes - srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr)); - // load the next 16 bytes in stride of src_pitch - srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)); - srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)); - srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); - - // merge the result together - srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); - srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); - srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2); - srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); - srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); - srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); - srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); - - - // add and saturate the results together - srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); - - // load the next 16 bytes in stride of two/three src_pitch - srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)); - srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)); - - // merge the result together - srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); - srcRegFilt6 = 
_mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters); - srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters); - - // load the next 16 bytes in stride of four/five src_pitch - srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)); - srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)); - - // merge the result together - srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); - srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); - - // multiply 2 adjacent elements with the filter and add the result - srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters); - srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters); - - - // add and saturate the results together - srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, - _mm_min_epi16(srcRegFilt4, srcRegFilt7)); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm_min_epi16(srcRegFilt6, srcRegFilt8)); - - - // add and saturate the results together - srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, - _mm_max_epi16(srcRegFilt4, srcRegFilt7)); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, - _mm_max_epi16(srcRegFilt6, srcRegFilt8)); - srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64); - srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); - - // shift by 7 bit each 16 bit - srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7); - srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); - - // shrink to 8 bit each 16 bits, the first lane contain the first - // convolve result and the second lane contain the second convolve - // result - srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1); - - src_ptr+=src_pitch; - - // save 16 bytes convolve result - _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); - - output_ptr+=out_pitch; - } -} diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c index 59faccdf7..d66ee2730 100644 --- a/vp9/decoder/vp9_decodeframe.c +++ 
b/vp9/decoder/vp9_decodeframe.c @@ -704,7 +704,7 @@ static void apply_frame_size(VP9D_COMP *pbi, int width, int height) { if (vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, - VP9BORDERINPIXELS, ext_fb, + VP9_DEC_BORDER_IN_PIXELS, ext_fb, cm->realloc_fb_cb, cm->user_priv)) { vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate external frame buffer"); @@ -712,7 +712,7 @@ static void apply_frame_size(VP9D_COMP *pbi, int width, int height) { } else { vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, - VP9BORDERINPIXELS, NULL, NULL, NULL); + VP9_DEC_BORDER_IN_PIXELS, NULL, NULL, NULL); } } @@ -1129,11 +1129,12 @@ static size_t read_uncompressed_header(VP9D_COMP *pbi, cm->show_existing_frame = vp9_rb_read_bit(rb); if (cm->show_existing_frame) { - // show an existing frame directly + // Show an existing frame directly. int frame_to_show = cm->ref_frame_map[vp9_rb_read_literal(rb, 3)]; ref_cnt_fb(cm->fb_idx_ref_cnt, &cm->new_fb_idx, frame_to_show); pbi->refresh_frame_flags = 0; cm->lf.filter_level = 0; + cm->show_frame = 1; return 0; } diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index c81378153..2eb99ea15 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -357,9 +357,9 @@ static void read_intra_block_mode_info(VP9_COMMON *const cm, MODE_INFO *mi, } static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode, - int_mv mv[2], int_mv best_mv[2], - int_mv nearest_mv[2], int_mv near_mv[2], - int is_compound, int allow_hp, vp9_reader *r) { + int_mv mv[2], int_mv ref_mv[2], + int_mv nearest_mv[2], int_mv near_mv[2], + int is_compound, int allow_hp, vp9_reader *r) { int i; int ret = 1; @@ -367,10 +367,10 @@ static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode, case NEWMV: { nmv_context_counts *const mv_counts = cm->frame_parallel_decoding_mode ? 
NULL : &cm->counts.mv; - read_mv(r, &mv[0].as_mv, &best_mv[0].as_mv, + read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, &cm->fc.nmvc, mv_counts, allow_hp); if (is_compound) - read_mv(r, &mv[1].as_mv, &best_mv[1].as_mv, + read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, &cm->fc.nmvc, mv_counts, allow_hp); for (i = 0; i < 1 + is_compound; ++i) { ret = ret && mv[i].as_mv.row < MV_UPP && mv[i].as_mv.row > MV_LOW; @@ -380,17 +380,20 @@ static INLINE int assign_mv(VP9_COMMON *cm, MB_PREDICTION_MODE mode, } case NEARESTMV: { mv[0].as_int = nearest_mv[0].as_int; - if (is_compound) mv[1].as_int = nearest_mv[1].as_int; + if (is_compound) + mv[1].as_int = nearest_mv[1].as_int; break; } case NEARMV: { mv[0].as_int = near_mv[0].as_int; - if (is_compound) mv[1].as_int = near_mv[1].as_int; + if (is_compound) + mv[1].as_int = near_mv[1].as_int; break; } case ZEROMV: { mv[0].as_int = 0; - if (is_compound) mv[1].as_int = 0; + if (is_compound) + mv[1].as_int = 0; break; } default: { @@ -423,7 +426,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, const BLOCK_SIZE bsize = mbmi->sb_type; const int allow_hp = cm->allow_high_precision_mv; - int_mv nearest[2], nearmv[2], best[2]; + int_mv nearestmv[2], nearmv[2]; int inter_mode_ctx, ref, is_compound; read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame); @@ -452,8 +455,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) { for (ref = 0; ref < 1 + is_compound; ++ref) { vp9_find_best_ref_mvs(xd, allow_hp, mbmi->ref_mvs[mbmi->ref_frame[ref]], - &nearest[ref], &nearmv[ref]); - best[ref].as_int = nearest[ref].as_int; + &nearestmv[ref], &nearmv[ref]); } } @@ -466,6 +468,7 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, const int num_4x4_h = num_4x4_blocks_high_lookup[bsize]; // 1 or 2 int idx, idy; int b_mode; + int_mv nearest_sub8x8[2], near_sub8x8[2]; for (idy = 0; idy < 2; idy += num_4x4_h) { for (idx = 0; idx < 2; idx += num_4x4_w) { int_mv block[2]; @@ 
-475,9 +478,11 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, if (b_mode == NEARESTMV || b_mode == NEARMV) for (ref = 0; ref < 1 + is_compound; ++ref) vp9_append_sub8x8_mvs_for_idx(cm, xd, tile, j, ref, mi_row, mi_col, - &nearest[ref], &nearmv[ref]); + &nearest_sub8x8[ref], + &near_sub8x8[ref]); - if (!assign_mv(cm, b_mode, block, best, nearest, nearmv, + if (!assign_mv(cm, b_mode, block, nearestmv, + nearest_sub8x8, near_sub8x8, is_compound, allow_hp, r)) { xd->corrupted |= 1; break; @@ -499,9 +504,8 @@ static void read_inter_block_mode_info(VP9_COMMON *const cm, mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int; mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int; } else { - xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv, - best, nearest, nearmv, - is_compound, allow_hp, r); + xd->corrupted |= !assign_mv(cm, mbmi->mode, mbmi->mv, nearestmv, + nearestmv, nearmv, is_compound, allow_hp, r); } } diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index ec4dc14f4..7188d7674 100644 --- a/vp9/encoder/vp9_bitstream.c +++ b/vp9/encoder/vp9_bitstream.c @@ -260,6 +260,7 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { struct segmentation *seg = &cm->seg; MB_MODE_INFO *const mi = &m->mbmi; const MV_REFERENCE_FRAME rf = mi->ref_frame[0]; + const MV_REFERENCE_FRAME sec_rf = mi->ref_frame[1]; const MB_PREDICTION_MODE mode = mi->mode; const int segment_id = mi->segment_id; int skip_coeff; @@ -355,11 +356,11 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer *bc) { active_section = 11; #endif vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[0].as_mv, - &mi->best_mv[0].as_mv, nmvc, allow_hp); + &mi->ref_mvs[rf][0].as_mv, nmvc, allow_hp); if (has_second_ref(mi)) vp9_encode_mv(cpi, bc, &m->bmi[j].as_mv[1].as_mv, - &mi->best_mv[1].as_mv, nmvc, allow_hp); + &mi->ref_mvs[sec_rf][0].as_mv, nmvc, allow_hp); } } } @@ -368,11 +369,11 @@ static void pack_inter_mode_mvs(VP9_COMP *cpi, MODE_INFO *m, vp9_writer 
*bc) { active_section = 5; #endif vp9_encode_mv(cpi, bc, &mi->mv[0].as_mv, - &mi->best_mv[0].as_mv, nmvc, allow_hp); + &mi->ref_mvs[rf][0].as_mv, nmvc, allow_hp); if (has_second_ref(mi)) vp9_encode_mv(cpi, bc, &mi->mv[1].as_mv, - &mi->best_mv[1].as_mv, nmvc, allow_hp); + &mi->ref_mvs[sec_rf][0].as_mv, nmvc, allow_hp); } } } @@ -745,7 +746,6 @@ static void update_coef_probs(VP9_COMP* cpi, vp9_writer* w) { const TX_MODE tx_mode = cpi->common.tx_mode; const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode]; TX_SIZE tx_size; - vp9_clear_system_state(); for (tx_size = TX_4X4; tx_size <= TX_32X32; ++tx_size) build_tree_distribution(cpi, tx_size); @@ -1295,8 +1295,6 @@ void vp9_pack_bitstream(VP9_COMP *cpi, uint8_t *dest, size_t *size) { active_section = 7; #endif - vp9_clear_system_state(); // __asm emms; - first_part_size = write_compressed_header(cpi, data); data += first_part_size; vp9_wb_write_literal(&saved_wb, first_part_size, 16); diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 75ea64a2f..9d02c8f95 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -500,17 +500,8 @@ static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, if (is_inter_block(mbmi) && (mbmi->sb_type < BLOCK_8X8 || mbmi->mode == NEWMV)) { int_mv best_mv[2]; - const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0]; - const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1]; - best_mv[0].as_int = ctx->best_ref_mv[0].as_int; - best_mv[1].as_int = ctx->best_ref_mv[1].as_int; - if (mbmi->mode == NEWMV) { - best_mv[0].as_int = mbmi->ref_mvs[rf1][0].as_int; - if (rf2 > 0) - best_mv[1].as_int = mbmi->ref_mvs[rf2][0].as_int; - } - mbmi->best_mv[0].as_int = best_mv[0].as_int; - mbmi->best_mv[1].as_int = best_mv[1].as_int; + for (i = 0; i < 1 + has_second_ref(mbmi); ++i) + best_mv[i].as_int = mbmi->ref_mvs[mbmi->ref_frame[i]][0].as_int; vp9_update_mv_count(cpi, x, best_mv); } @@ -630,11 +621,11 @@ static void set_offsets(VP9_COMP *cpi, const 
TileInfo *const tile, } } -static void pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, - int mi_row, int mi_col, - int *totalrate, int64_t *totaldist, - BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, - int64_t best_rd) { +static void rd_pick_sb_modes(VP9_COMP *cpi, const TileInfo *const tile, + int mi_row, int mi_col, + int *totalrate, int64_t *totaldist, + BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx, + int64_t best_rd) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; @@ -1079,35 +1070,35 @@ static void pick_partition_type(VP9_COMP *cpi, switch (partition) { case PARTITION_NONE: - pick_sb_modes(cpi, tile, mi_row, mi_col, rate, dist, - bsize, get_block_context(x, bsize), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, rate, dist, + bsize, get_block_context(x, bsize), INT64_MAX); break; case PARTITION_HORZ: *get_sb_index(x, subsize) = 0; - pick_sb_modes(cpi, tile, mi_row, mi_col, &sub_rate[0], &sub_dist[0], - subsize, get_block_context(x, subsize), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sub_rate[0], &sub_dist[0], + subsize, get_block_context(x, subsize), INT64_MAX); if (bsize >= BLOCK_8X8 && mi_row + num_8x8_subsize < cm->mi_rows) { update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); *get_sb_index(x, subsize) = 1; - pick_sb_modes(cpi, tile, mi_row + num_8x8_subsize, mi_col, - &sub_rate[1], &sub_dist[1], subsize, - get_block_context(x, subsize), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row + num_8x8_subsize, mi_col, + &sub_rate[1], &sub_dist[1], subsize, + get_block_context(x, subsize), INT64_MAX); } *rate = sub_rate[0] + sub_rate[1]; *dist = sub_dist[0] + sub_dist[1]; break; case PARTITION_VERT: *get_sb_index(x, subsize) = 0; - pick_sb_modes(cpi, tile, mi_row, mi_col, &sub_rate[0], &sub_dist[0], - subsize, get_block_context(x, subsize), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sub_rate[0], 
&sub_dist[0], + subsize, get_block_context(x, subsize), INT64_MAX); if (bsize >= BLOCK_8X8 && mi_col + num_8x8_subsize < cm->mi_cols) { update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); *get_sb_index(x, subsize) = 1; - pick_sb_modes(cpi, tile, mi_row, mi_col + num_8x8_subsize, - &sub_rate[1], &sub_dist[1], subsize, - get_block_context(x, subsize), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col + num_8x8_subsize, + &sub_rate[1], &sub_dist[1], subsize, + get_block_context(x, subsize), INT64_MAX); } *rate = sub_rate[0] + sub_rate[1]; *dist = sub_dist[1] + sub_dist[1]; @@ -1244,8 +1235,8 @@ static void rd_use_partition(VP9_COMP *cpi, mi_row + (ms >> 1) < cm->mi_rows && mi_col + (ms >> 1) < cm->mi_cols) { *(get_sb_partitioning(x, bsize)) = bsize; - pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize, - get_block_context(x, bsize), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &none_rate, &none_dist, bsize, + get_block_context(x, bsize), INT64_MAX); pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, @@ -1260,13 +1251,15 @@ static void rd_use_partition(VP9_COMP *cpi, switch (partition) { case PARTITION_NONE: - pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, - bsize, get_block_context(x, bsize), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, + &last_part_dist, bsize, + get_block_context(x, bsize), INT64_MAX); break; case PARTITION_HORZ: *get_sb_index(x, subsize) = 0; - pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, - subsize, get_block_context(x, subsize), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, + &last_part_dist, subsize, + get_block_context(x, subsize), INT64_MAX); if (last_part_rate != INT_MAX && bsize >= BLOCK_8X8 && mi_row + (mh >> 1) < cm->mi_rows) { int rt = 0; @@ -1274,8 +1267,8 @@ static void 
rd_use_partition(VP9_COMP *cpi, update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); *get_sb_index(x, subsize) = 1; - pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, subsize, - get_block_context(x, subsize), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row + (ms >> 1), mi_col, &rt, &dt, + subsize, get_block_context(x, subsize), INT64_MAX); if (rt == INT_MAX || dt == INT_MAX) { last_part_rate = INT_MAX; last_part_dist = INT_MAX; @@ -1288,8 +1281,9 @@ static void rd_use_partition(VP9_COMP *cpi, break; case PARTITION_VERT: *get_sb_index(x, subsize) = 0; - pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, &last_part_dist, - subsize, get_block_context(x, subsize), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &last_part_rate, + &last_part_dist, subsize, + get_block_context(x, subsize), INT64_MAX); if (last_part_rate != INT_MAX && bsize >= BLOCK_8X8 && mi_col + (ms >> 1) < cm->mi_cols) { int rt = 0; @@ -1297,8 +1291,8 @@ static void rd_use_partition(VP9_COMP *cpi, update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); *get_sb_index(x, subsize) = 1; - pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, subsize, - get_block_context(x, subsize), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col + (ms >> 1), &rt, &dt, + subsize, get_block_context(x, subsize), INT64_MAX); if (rt == INT_MAX || dt == INT_MAX) { last_part_rate = INT_MAX; last_part_dist = INT_MAX; @@ -1372,9 +1366,9 @@ static void rd_use_partition(VP9_COMP *cpi, save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt, - split_subsize, get_block_context(x, split_subsize), - INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row + y_idx, mi_col + x_idx, &rt, &dt, + split_subsize, get_block_context(x, split_subsize), + INT64_MAX); restore_context(cpi, mi_row, mi_col, a, l, sa, 
sl, bsize); @@ -1738,8 +1732,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, // PARTITION_NONE if (partition_none_allowed) { - pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize, - get_block_context(x, bsize), best_rd); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &this_rate, &this_dist, bsize, + get_block_context(x, bsize), best_rd); if (this_rate != INT_MAX) { if (bsize >= BLOCK_8X8) { pl = partition_plane_context(cpi->above_seg_context, @@ -1849,8 +1843,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, partition_none_allowed) get_block_context(x, subsize)->pred_filter_type = get_block_context(x, bsize)->mic.mbmi.interp_filter; - pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, - get_block_context(x, subsize), best_rd); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, + get_block_context(x, subsize), best_rd); sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd && mi_row + ms < cm->mi_rows) { @@ -1864,9 +1858,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, partition_none_allowed) get_block_context(x, subsize)->pred_filter_type = get_block_context(x, bsize)->mic.mbmi.interp_filter; - pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate, - &this_dist, subsize, get_block_context(x, subsize), - best_rd - sum_rd); + rd_pick_sb_modes(cpi, tile, mi_row + ms, mi_col, &this_rate, + &this_dist, subsize, get_block_context(x, subsize), + best_rd - sum_rd); if (this_rate == INT_MAX) { sum_rd = INT64_MAX; } else { @@ -1902,8 +1896,8 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, partition_none_allowed) get_block_context(x, subsize)->pred_filter_type = get_block_context(x, bsize)->mic.mbmi.interp_filter; - pick_sb_modes(cpi, tile, mi_row, mi_col, &sum_rate, &sum_dist, subsize, - get_block_context(x, subsize), best_rd); + rd_pick_sb_modes(cpi, tile, mi_row, 
mi_col, &sum_rate, &sum_dist, subsize, + get_block_context(x, subsize), best_rd); sum_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist); if (sum_rd < best_rd && mi_col + ms < cm->mi_cols) { update_state(cpi, get_block_context(x, subsize), subsize, 0); @@ -1916,9 +1910,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, partition_none_allowed) get_block_context(x, subsize)->pred_filter_type = get_block_context(x, bsize)->mic.mbmi.interp_filter; - pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate, - &this_dist, subsize, get_block_context(x, subsize), - best_rd - sum_rd); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col + ms, &this_rate, + &this_dist, subsize, get_block_context(x, subsize), + best_rd - sum_rd); if (this_rate == INT_MAX) { sum_rd = INT64_MAX; } else { @@ -1989,8 +1983,8 @@ static void rd_pick_reference_frame(VP9_COMP *cpi, const TileInfo *const tile, if ((mi_row + (ms >> 1) < cm->mi_rows) && (mi_col + (ms >> 1) < cm->mi_cols)) { cpi->set_ref_frame_mask = 1; - pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64, - get_block_context(x, BLOCK_64X64), INT64_MAX); + rd_pick_sb_modes(cpi, tile, mi_row, mi_col, &r, &d, BLOCK_64X64, + get_block_context(x, BLOCK_64X64), INT64_MAX); pl = partition_plane_context(cpi->above_seg_context, cpi->left_seg_context, mi_row, mi_col, BLOCK_64X64); r += x->partition_cost[pl][PARTITION_NONE]; diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c index af710a8f4..853094b29 100644 --- a/vp9/encoder/vp9_encodemv.c +++ b/vp9/encoder/vp9_encodemv.c @@ -224,13 +224,9 @@ void vp9_encode_mv(VP9_COMP* cpi, vp9_writer* w, } } -void vp9_build_nmv_cost_table(int *mvjoint, - int *mvcost[2], +void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2], const nmv_context* const mvctx, - int usehp, - int mvc_flag_v, - int mvc_flag_h) { - vp9_clear_system_state(); + int usehp, int mvc_flag_v, int mvc_flag_h) { vp9_cost_tokens(mvjoint, mvctx->joints, vp9_mv_joint_tree); if (mvc_flag_v) 
build_nmv_component_cost_table(mvcost[0], &mvctx->comps[0], usehp); diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 538599d58..56872682a 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -317,31 +317,23 @@ static const double weight_table[256] = { 1.000000, 1.000000, 1.000000, 1.000000 }; -static double simple_weight(YV12_BUFFER_CONFIG *source) { +static double simple_weight(const YV12_BUFFER_CONFIG *buf) { int i, j; + double sum = 0.0; + const int w = buf->y_crop_width; + const int h = buf->y_crop_height; + const uint8_t *row = buf->y_buffer; + + for (i = 0; i < h; ++i) { + const uint8_t *pixel = row; + for (j = 0; j < w; ++j) + sum += weight_table[*pixel++]; + row += buf->y_stride; + } - uint8_t *src = source->y_buffer; - double sum_weights = 0.0; - - // Loop through the Y plane examining levels and creating a weight for - // the image. - i = source->y_height; - do { - j = source->y_width; - do { - sum_weights += weight_table[ *src]; - src++; - } while (--j); - src -= source->y_width; - src += source->y_stride; - } while (--i); - - sum_weights /= (source->y_height * source->y_width); - - return sum_weights; + return MAX(0.1, sum / (w * h)); } - // This function returns the maximum target rate per frame. 
static int frame_max_bits(VP9_COMP *cpi) { int64_t max_bits = @@ -394,42 +386,35 @@ static unsigned int zz_motion_search(VP9_COMP *cpi, MACROBLOCK *x, } static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, - MV *ref_mv, MV *best_mv, + const MV *ref_mv, MV *best_mv, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset) { MACROBLOCKD *const xd = &x->e_mbd; - int num00; - MV tmp_mv = {0, 0}; - MV ref_mv_full; - - int tmp_err; + MV ref_mv_full = {ref_mv->row >> 3, ref_mv->col >> 3}; + int num00, tmp_err, n, sr = 0; int step_param = 3; int further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; - int n; - vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[xd->mi_8x8[0]->mbmi.sb_type]; + const BLOCK_SIZE bsize = xd->mi_8x8[0]->mbmi.sb_type; + vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize]; int new_mv_mode_penalty = 256; - - int sr = 0; - int quart_frm = MIN(cpi->common.width, cpi->common.height); + const int quart_frm = MIN(cpi->common.width, cpi->common.height); // refine the motion search range accroding to the frame dimension // for first pass test while ((quart_frm << sr) < MAX_FULL_PEL_VAL) sr++; - step_param += sr; + step_param += sr; further_steps -= sr; // override the default variance function to use MSE - v_fn_ptr.vf = get_block_variance_fn(xd->mi_8x8[0]->mbmi.sb_type); + v_fn_ptr.vf = get_block_variance_fn(bsize); // Set up pointers for this macro block recon buffer xd->plane[0].pre[0].buf = recon_buffer->y_buffer + recon_yoffset; // Initial step/diamond search centred on best mv - ref_mv_full.col = ref_mv->col >> 3; - ref_mv_full.row = ref_mv->row >> 3; tmp_err = cpi->diamond_search_sad(x, &ref_mv_full, &tmp_mv, step_param, x->sadperbit16, &num00, &v_fn_ptr, @@ -504,6 +489,7 @@ void vp9_first_pass(VP9_COMP *cpi) { int new_mv_count = 0; int sum_in_vectors = 0; uint32_t lastmv_as_int = 0; + struct twopass_rc *const twopass = &cpi->twopass; int_mv zero_ref_mv; @@ -792,20 +778,8 @@ void vp9_first_pass(VP9_COMP *cpi) { 
fps.intra_error = intra_error >> 8; fps.coded_error = coded_error >> 8; fps.sr_coded_error = sr_coded_error >> 8; - fps.ssim_weighted_pred_err = fps.coded_error * - MAX(0.1, simple_weight(cpi->Source)); - fps.pcnt_inter = 0.0; - fps.pcnt_motion = 0.0; - fps.MVr = 0.0; - fps.mvr_abs = 0.0; - fps.MVc = 0.0; - fps.mvc_abs = 0.0; - fps.MVrv = 0.0; - fps.MVcv = 0.0; - fps.mv_in_out_count = 0.0; - fps.new_mv_count = 0.0; + fps.ssim_weighted_pred_err = fps.coded_error * simple_weight(cpi->Source); fps.count = 1.0; - fps.pcnt_inter = (double)intercount / cm->MBs; fps.pcnt_second_ref = (double)second_ref_count / cm->MBs; fps.pcnt_neutral = (double)neutral_count / cm->MBs; @@ -821,7 +795,17 @@ void vp9_first_pass(VP9_COMP *cpi) { mvcount; fps.mv_in_out_count = (double)sum_in_vectors / (mvcount * 2); fps.new_mv_count = new_mv_count; - fps.pcnt_motion = 1.0 * (double)mvcount / cpi->common.MBs; + fps.pcnt_motion = (double)mvcount / cpi->common.MBs; + } else { + fps.MVr = 0.0; + fps.mvr_abs = 0.0; + fps.MVc = 0.0; + fps.mvc_abs = 0.0; + fps.MVrv = 0.0; + fps.MVcv = 0.0; + fps.mv_in_out_count = 0.0; + fps.new_mv_count = 0.0; + fps.pcnt_motion = 0.0; } // TODO(paulwilkins): Handle the case when duration is set to 0, or @@ -830,23 +814,22 @@ void vp9_first_pass(VP9_COMP *cpi) { fps.duration = (double)(cpi->source->ts_end - cpi->source->ts_start); // don't want to do output stats with a stack variable! - cpi->twopass.this_frame_stats = fps; - output_stats(cpi, cpi->output_pkt_list, &cpi->twopass.this_frame_stats); - accumulate_stats(&cpi->twopass.total_stats, &fps); + twopass->this_frame_stats = fps; + output_stats(cpi, cpi->output_pkt_list, &twopass->this_frame_stats); + accumulate_stats(&twopass->total_stats, &fps); } // Copy the previous Last Frame back into gf and and arf buffers if // the prediction is good enough... 
but also dont allow it to lag too far - if ((cpi->twopass.sr_update_lag > 3) || + if ((twopass->sr_update_lag > 3) || ((cm->current_video_frame > 0) && - (cpi->twopass.this_frame_stats.pcnt_inter > 0.20) && - ((cpi->twopass.this_frame_stats.intra_error / - DOUBLE_DIVIDE_CHECK(cpi->twopass.this_frame_stats.coded_error)) > - 2.0))) { + (twopass->this_frame_stats.pcnt_inter > 0.20) && + ((twopass->this_frame_stats.intra_error / + DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) { vp8_yv12_copy_frame(lst_yv12, gld_yv12); - cpi->twopass.sr_update_lag = 1; + twopass->sr_update_lag = 1; } else { - cpi->twopass.sr_update_lag++; + twopass->sr_update_lag++; } // swap frame pointers so last frame refers to the frame we just compressed swap_yv12(lst_yv12, new_yv12); @@ -1034,37 +1017,38 @@ extern void vp9_new_framerate(VP9_COMP *cpi, double framerate); void vp9_init_second_pass(VP9_COMP *cpi) { FIRSTPASS_STATS this_frame; FIRSTPASS_STATS *start_pos; + struct twopass_rc *const twopass = &cpi->twopass; - zero_stats(&cpi->twopass.total_stats); - zero_stats(&cpi->twopass.total_left_stats); + zero_stats(&twopass->total_stats); + zero_stats(&twopass->total_left_stats); - if (!cpi->twopass.stats_in_end) + if (!twopass->stats_in_end) return; - cpi->twopass.total_stats = *cpi->twopass.stats_in_end; - cpi->twopass.total_left_stats = cpi->twopass.total_stats; + twopass->total_stats = *twopass->stats_in_end; + twopass->total_left_stats = twopass->total_stats; // each frame can have a different duration, as the frame rate in the source // isn't guaranteed to be constant. The frame rate prior to the first frame // encoded in the second pass is a guess. However the sum duration is not. // Its calculated based on the actual durations of all frames from the first // pass. 
- vp9_new_framerate(cpi, 10000000.0 * cpi->twopass.total_stats.count / - cpi->twopass.total_stats.duration); + vp9_new_framerate(cpi, 10000000.0 * twopass->total_stats.count / + twopass->total_stats.duration); cpi->output_framerate = cpi->oxcf.framerate; - cpi->twopass.bits_left = (int64_t)(cpi->twopass.total_stats.duration * - cpi->oxcf.target_bandwidth / 10000000.0); + twopass->bits_left = (int64_t)(twopass->total_stats.duration * + cpi->oxcf.target_bandwidth / 10000000.0); // Calculate a minimum intra value to be used in determining the IIratio // scores used in the second pass. We have this minimum to make sure // that clips that are static but "low complexity" in the intra domain // are still boosted appropriately for KF/GF/ARF - cpi->twopass.kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs; - cpi->twopass.gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs; + twopass->kf_intra_err_min = KF_MB_INTRA_MIN * cpi->common.MBs; + twopass->gf_intra_err_min = GF_MB_INTRA_MIN * cpi->common.MBs; // This variable monitors how far behind the second ref update is lagging - cpi->twopass.sr_update_lag = 1; + twopass->sr_update_lag = 1; // Scan the first pass file and calculate an average Intra / Inter error score // ratio for the sequence. @@ -1072,43 +1056,43 @@ void vp9_init_second_pass(VP9_COMP *cpi) { double sum_iiratio = 0.0; double IIRatio; - start_pos = cpi->twopass.stats_in; // Note the starting "file" position. + start_pos = twopass->stats_in; // Note the starting "file" position. - while (input_stats(&cpi->twopass, &this_frame) != EOF) { + while (input_stats(twopass, &this_frame) != EOF) { IIRatio = this_frame.intra_error / DOUBLE_DIVIDE_CHECK(this_frame.coded_error); IIRatio = (IIRatio < 1.0) ? 1.0 : (IIRatio > 20.0) ? 
20.0 : IIRatio; sum_iiratio += IIRatio; } - cpi->twopass.avg_iiratio = sum_iiratio / - DOUBLE_DIVIDE_CHECK((double)cpi->twopass.total_stats.count); + twopass->avg_iiratio = sum_iiratio / + DOUBLE_DIVIDE_CHECK((double)twopass->total_stats.count); // Reset file position - reset_fpf_position(&cpi->twopass, start_pos); + reset_fpf_position(twopass, start_pos); } // Scan the first pass file and calculate a modified total error based upon // the bias/power function used to allocate bits. { - double av_error = cpi->twopass.total_stats.ssim_weighted_pred_err / - DOUBLE_DIVIDE_CHECK(cpi->twopass.total_stats.count); + double av_error = twopass->total_stats.ssim_weighted_pred_err / + DOUBLE_DIVIDE_CHECK(twopass->total_stats.count); - start_pos = cpi->twopass.stats_in; // Note starting "file" position + start_pos = twopass->stats_in; // Note starting "file" position - cpi->twopass.modified_error_total = 0.0; - cpi->twopass.modified_error_min = + twopass->modified_error_total = 0.0; + twopass->modified_error_min = (av_error * cpi->oxcf.two_pass_vbrmin_section) / 100; - cpi->twopass.modified_error_max = + twopass->modified_error_max = (av_error * cpi->oxcf.two_pass_vbrmax_section) / 100; - while (input_stats(&cpi->twopass, &this_frame) != EOF) { - cpi->twopass.modified_error_total += + while (input_stats(twopass, &this_frame) != EOF) { + twopass->modified_error_total += calculate_modified_err(cpi, &this_frame); } - cpi->twopass.modified_error_left = cpi->twopass.modified_error_total; + twopass->modified_error_left = twopass->modified_error_total; - reset_fpf_position(&cpi->twopass, start_pos); // Reset file position + reset_fpf_position(twopass, start_pos); } } @@ -1965,7 +1949,10 @@ void vp9_get_one_pass_params(VP9_COMP *cpi) { cpi->rc.frames_to_key == 0 || (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) { cm->frame_type = KEY_FRAME; + cpi->rc.this_key_frame_forced = cm->current_video_frame != 0 && + cpi->rc.frames_to_key == 0; cpi->rc.frames_to_key = 
cpi->key_frame_frequency; + cpi->rc.kf_boost = 300; } else { cm->frame_type = INTER_FRAME; } @@ -1982,7 +1969,10 @@ void vp9_get_one_pass_cbr_params(VP9_COMP *cpi) { cpi->rc.frames_to_key == 0 || (cpi->oxcf.auto_key && test_for_kf_one_pass(cpi)))) { cm->frame_type = KEY_FRAME; + cpi->rc.this_key_frame_forced = cm->current_video_frame != 0 && + cpi->rc.frames_to_key == 0; cpi->rc.frames_to_key = cpi->key_frame_frequency; + cpi->rc.kf_boost = 300; } else { cm->frame_type = INTER_FRAME; } @@ -2054,7 +2044,8 @@ void vp9_get_second_pass_params(VP9_COMP *cpi) { this_frame_coded_error = this_frame.coded_error; // keyframe and section processing ! - if (rc->frames_to_key == 0) { + if (rc->frames_to_key == 0 || + (cpi->common.frame_flags & FRAMEFLAGS_KEY)) { // Define next KF group and assign bits to it this_frame_copy = this_frame; find_next_key_frame(cpi, &this_frame_copy); @@ -2225,12 +2216,13 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { double recent_loop_decay[8] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; RATE_CONTROL *const rc = &cpi->rc; + struct twopass_rc *const twopass = &cpi->twopass; vp9_zero(next_frame); vp9_clear_system_state(); // __asm emms; - start_position = cpi->twopass.stats_in; + start_position = twopass->stats_in; cpi->common.frame_type = KEY_FRAME; // is this a forced key frame by interval @@ -2247,14 +2239,14 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Take a copy of the initial frame details first_frame = *this_frame; - cpi->twopass.kf_group_bits = 0; // Total bits available to kf group - cpi->twopass.kf_group_error_left = 0; // Group modified error score. + twopass->kf_group_bits = 0; // Total bits available to kf group + twopass->kf_group_error_left = 0; // Group modified error score. 
kf_mod_err = calculate_modified_err(cpi, this_frame); // find the next keyframe i = 0; - while (cpi->twopass.stats_in < cpi->twopass.stats_in_end) { + while (twopass->stats_in < twopass->stats_in_end) { // Accumulate kf group error kf_group_err += calculate_modified_err(cpi, this_frame); @@ -2266,11 +2258,11 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // load a the next frame's stats last_frame = *this_frame; - input_stats(&cpi->twopass, this_frame); + input_stats(twopass, this_frame); // Provided that we are not at the end of the file... if (cpi->oxcf.auto_key && - lookup_next_frame_stats(&cpi->twopass, &next_frame) != EOF) { + lookup_next_frame_stats(twopass, &next_frame) != EOF) { // Normal scene cut check if (test_candidate_kf(cpi, &last_frame, this_frame, &next_frame)) break; @@ -2320,7 +2312,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { tmp_frame = first_frame; // Reset to the start of the group - reset_fpf_position(&cpi->twopass, start_position); + reset_fpf_position(twopass, start_position); kf_group_err = 0; kf_group_intra_err = 0; @@ -2334,17 +2326,17 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { kf_group_coded_err += tmp_frame.coded_error; // Load a the next frame's stats - input_stats(&cpi->twopass, &tmp_frame); + input_stats(twopass, &tmp_frame); } rc->next_key_frame_forced = 1; - } else if (cpi->twopass.stats_in == cpi->twopass.stats_in_end) { + } else if (twopass->stats_in == twopass->stats_in_end) { rc->next_key_frame_forced = 1; } else { rc->next_key_frame_forced = 0; } // Special case for the last key frame of the file - if (cpi->twopass.stats_in >= cpi->twopass.stats_in_end) { + if (twopass->stats_in >= twopass->stats_in_end) { // Accumulate kf group error kf_group_err += calculate_modified_err(cpi, this_frame); @@ -2356,8 +2348,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { } // Calculate the number of 
bits that should be assigned to the kf group. - if ((cpi->twopass.bits_left > 0) && - (cpi->twopass.modified_error_left > 0.0)) { + if (twopass->bits_left > 0 && twopass->modified_error_left > 0.0) { // Max for a single normal frame (not key frame) int max_bits = frame_max_bits(cpi); @@ -2366,19 +2357,18 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Default allocation based on bits left and relative // complexity of the section - cpi->twopass.kf_group_bits = (int64_t)(cpi->twopass.bits_left * - (kf_group_err / - cpi->twopass.modified_error_left)); + twopass->kf_group_bits = (int64_t)(twopass->bits_left * + (kf_group_err / twopass->modified_error_left)); // Clip based on maximum per frame rate defined by the user. max_grp_bits = (int64_t)max_bits * (int64_t)rc->frames_to_key; - if (cpi->twopass.kf_group_bits > max_grp_bits) - cpi->twopass.kf_group_bits = max_grp_bits; + if (twopass->kf_group_bits > max_grp_bits) + twopass->kf_group_bits = max_grp_bits; } else { - cpi->twopass.kf_group_bits = 0; + twopass->kf_group_bits = 0; } // Reset the first pass file position - reset_fpf_position(&cpi->twopass, start_position); + reset_fpf_position(twopass, start_position); // Determine how big to make this keyframe based on how well the subsequent // frames use inter blocks. @@ -2390,7 +2380,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { for (i = 0; i < rc->frames_to_key; i++) { double r; - if (EOF == input_stats(&cpi->twopass, &next_frame)) + if (EOF == input_stats(twopass, &next_frame)) break; // Monitor for static sections. @@ -2402,11 +2392,11 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // For the first few frames collect data to decide kf boost. 
if (i <= (rc->max_gf_interval * 2)) { - if (next_frame.intra_error > cpi->twopass.kf_intra_err_min) + if (next_frame.intra_error > twopass->kf_intra_err_min) r = (IIKFACTOR2 * next_frame.intra_error / DOUBLE_DIVIDE_CHECK(next_frame.coded_error)); else - r = (IIKFACTOR2 * cpi->twopass.kf_intra_err_min / + r = (IIKFACTOR2 * twopass->kf_intra_err_min / DOUBLE_DIVIDE_CHECK(next_frame.coded_error)); if (r > RMAX) @@ -2428,21 +2418,21 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { FIRSTPASS_STATS sectionstats; zero_stats(&sectionstats); - reset_fpf_position(&cpi->twopass, start_position); + reset_fpf_position(twopass, start_position); for (i = 0; i < rc->frames_to_key; i++) { - input_stats(&cpi->twopass, &next_frame); + input_stats(twopass, &next_frame); accumulate_stats(&sectionstats, &next_frame); } avg_stats(&sectionstats); - cpi->twopass.section_intra_rating = (int) (sectionstats.intra_error / + twopass->section_intra_rating = (int) (sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error)); } // Reset the first pass file position - reset_fpf_position(&cpi->twopass, start_position); + reset_fpf_position(twopass, start_position); // Work out how many bits to allocate for the key frame itself if (1) { @@ -2459,7 +2449,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Make a note of baseline boost and the zero motion // accumulator value for use elsewhere. rc->kf_boost = kf_boost; - cpi->twopass.kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); + twopass->kf_zeromotion_pct = (int)(zero_motion_accumulator * 100.0); // We do three calculations for kf size. // The first is based on the error score for the whole kf group. @@ -2474,11 +2464,9 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // cpi->rc.frames_to_key-1 because key frame itself is taken // care of by kf_boost.
if (zero_motion_accumulator >= 0.99) { - allocation_chunks = - ((rc->frames_to_key - 1) * 10) + kf_boost; + allocation_chunks = ((rc->frames_to_key - 1) * 10) + kf_boost; } else { - allocation_chunks = - ((rc->frames_to_key - 1) * 100) + kf_boost; + allocation_chunks = ((rc->frames_to_key - 1) * 100) + kf_boost; } // Prevent overflow @@ -2488,58 +2476,54 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { allocation_chunks /= divisor; } - cpi->twopass.kf_group_bits = (cpi->twopass.kf_group_bits < 0) ? 0 - : cpi->twopass.kf_group_bits; + twopass->kf_group_bits = (twopass->kf_group_bits < 0) ? 0 + : twopass->kf_group_bits; // Calculate the number of bits to be spent on the key frame - cpi->twopass.kf_bits = (int)((double)kf_boost * - ((double)cpi->twopass.kf_group_bits / (double)allocation_chunks)); + twopass->kf_bits = (int)((double)kf_boost * + ((double)twopass->kf_group_bits / allocation_chunks)); // If the key frame is actually easier than the average for the // kf group (which does sometimes happen... eg a blank intro frame) // Then use an alternate calculation based on the kf error score // which should give a smaller key frame. 
if (kf_mod_err < kf_group_err / rc->frames_to_key) { - double alt_kf_grp_bits = - ((double)cpi->twopass.bits_left * + double alt_kf_grp_bits = ((double)twopass->bits_left * (kf_mod_err * (double)rc->frames_to_key) / - DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left)); + DOUBLE_DIVIDE_CHECK(twopass->modified_error_left)); alt_kf_bits = (int)((double)kf_boost * (alt_kf_grp_bits / (double)allocation_chunks)); - if (cpi->twopass.kf_bits > alt_kf_bits) { - cpi->twopass.kf_bits = alt_kf_bits; - } + if (twopass->kf_bits > alt_kf_bits) + twopass->kf_bits = alt_kf_bits; } else { // Else if it is much harder than other frames in the group make sure // it at least receives an allocation in keeping with its relative // error score - alt_kf_bits = (int)((double)cpi->twopass.bits_left * - (kf_mod_err / - DOUBLE_DIVIDE_CHECK(cpi->twopass.modified_error_left))); + alt_kf_bits = (int)((double)twopass->bits_left * (kf_mod_err / + DOUBLE_DIVIDE_CHECK(twopass->modified_error_left))); - if (alt_kf_bits > cpi->twopass.kf_bits) { - cpi->twopass.kf_bits = alt_kf_bits; + if (alt_kf_bits > twopass->kf_bits) { + twopass->kf_bits = alt_kf_bits; } } - cpi->twopass.kf_group_bits -= cpi->twopass.kf_bits; + twopass->kf_group_bits -= twopass->kf_bits; // Peer frame bit target for this frame - rc->per_frame_bandwidth = cpi->twopass.kf_bits; + rc->per_frame_bandwidth = twopass->kf_bits; // Convert to a per second bitrate - cpi->target_bandwidth = (int)(cpi->twopass.kf_bits * - cpi->output_framerate); + cpi->target_bandwidth = (int)(twopass->kf_bits * cpi->output_framerate); } // Note the total error score of the kf group minus the key frame itself - cpi->twopass.kf_group_error_left = (int)(kf_group_err - kf_mod_err); + twopass->kf_group_error_left = (int)(kf_group_err - kf_mod_err); // Adjust the count of total modified error left. // The count of bits left is adjusted elsewhere based on real coded frame // sizes. 
- cpi->twopass.modified_error_left -= kf_group_err; + twopass->modified_error_left -= kf_group_err; } void vp9_twopass_postencode_update(VP9_COMP *cpi, uint64_t bytes_used) { diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c index 277bd7db1..ee73ff15a 100644 --- a/vp9/encoder/vp9_lookahead.c +++ b/vp9/encoder/vp9_lookahead.c @@ -73,7 +73,7 @@ struct lookahead_ctx * vp9_lookahead_init(unsigned int width, for (i = 0; i < depth; i++) if (vp9_alloc_frame_buffer(&ctx->buf[i].img, width, height, subsampling_x, subsampling_y, - VP9BORDERINPIXELS)) + VP9_ENC_BORDER_IN_PIXELS)) goto bail; } return ctx; diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index efb5ce16d..ad9cc00b1 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -24,10 +24,15 @@ // #define NEW_DIAMOND_SEARCH void vp9_set_mv_search_range(MACROBLOCK *x, const MV *mv) { - const int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0); - const int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0); - const int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL; - const int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL; + int col_min = (mv->col >> 3) - MAX_FULL_PEL_VAL + (mv->col & 7 ? 1 : 0); + int row_min = (mv->row >> 3) - MAX_FULL_PEL_VAL + (mv->row & 7 ? 1 : 0); + int col_max = (mv->col >> 3) + MAX_FULL_PEL_VAL; + int row_max = (mv->row >> 3) + MAX_FULL_PEL_VAL; + + col_min = MAX(col_min, (MV_LOW >> 3) + 1); + row_min = MAX(row_min, (MV_LOW >> 3) + 1); + col_max = MIN(col_max, (MV_UPP >> 3) - 1); + row_max = MIN(row_max, (MV_UPP >> 3) - 1); // Get intersection of UMV window and valid MV window to reduce # of checks // in diamond search. 
@@ -174,8 +179,10 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) { error_per_bit + 4096) >> 13 : 0) -#define SP(x) (((x) & 7) << 1) // convert motion vector component to offset - // for svf calc +// convert motion vector component to offset for svf calc +static INLINE int sp(int x) { + return (x & 7) << 1; +} #define IFMVCV(r, c, s, e) \ if (c >= minc && c <= maxc && r >= minr && r <= maxr) \ @@ -183,12 +190,14 @@ void vp9_init3smotion_compensation(MACROBLOCK *x, int stride) { else \ e; -/* pointer to predictor base of a motionvector */ -#define PRE(r, c) (y + (((r) >> 3) * y_stride + ((c) >> 3) -(offset))) +static INLINE uint8_t *pre(uint8_t *buf, int stride, int r, int c, int offset) { + return &buf[(r >> 3) * stride + (c >> 3) - offset]; +} /* returns subpixel variance error function */ #define DIST(r, c) \ - vfp->svf(PRE(r, c), y_stride, SP(c), SP(r), z, src_stride, &sse) + vfp->svf(pre(y, y_stride, r, c, offset), y_stride, sp(c), sp(r), z, \ + src_stride, &sse) /* checks if (r, c) has better score than previous best */ #define CHECK_BETTER(v, r, c) \ @@ -358,7 +367,7 @@ int vp9_find_best_sub_pixel_tree(MACROBLOCK *x, #undef DIST /* returns subpixel variance error function */ #define DIST(r, c) \ - vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \ + vfp->svaf(pre(y, y_stride, r, c, offset), y_stride, sp(c), sp(r), \ z, src_stride, &sse, second_pred) int vp9_find_best_sub_pixel_comp_tree(MACROBLOCK *x, @@ -1703,58 +1712,51 @@ int vp9_full_search_sadx8(MACROBLOCK *x, MV *ref_mv, else return INT_MAX; } -int vp9_refining_search_sad_c(MACROBLOCK *x, + +int vp9_refining_search_sad_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv) { - const MACROBLOCKD* const xd = &x->e_mbd; - MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}}; + const MACROBLOCKD *const xd = &x->e_mbd; + const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}}; int i, j; - int 
this_row_offset, this_col_offset; - int what_stride = x->plane[0].src.stride; - int in_what_stride = xd->plane[0].pre[0].stride; - uint8_t *what = x->plane[0].src.buf; - uint8_t *best_address = xd->plane[0].pre[0].buf + - (ref_mv->row * xd->plane[0].pre[0].stride) + - ref_mv->col; - uint8_t *check_here; + const int what_stride = x->plane[0].src.stride; + const uint8_t *const what = x->plane[0].src.buf; + const int in_what_stride = xd->plane[0].pre[0].stride; + const uint8_t *const in_what = xd->plane[0].pre[0].buf; + const uint8_t *best_address = &in_what[ref_mv->row * in_what_stride + + ref_mv->col]; unsigned int thissad; + + const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; MV this_mv; - unsigned int bestsad = INT_MAX; - MV fcenter_mv; - int *mvjsadcost = x->nmvjointsadcost; + const int *mvjsadcost = x->nmvjointsadcost; int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - fcenter_mv.row = center_mv->row >> 3; - fcenter_mv.col = center_mv->col >> 3; - - bestsad = fn_ptr->sdf(what, what_stride, best_address, - in_what_stride, 0x7fffffff) + - mvsad_err_cost(ref_mv, &fcenter_mv, - mvjsadcost, mvsadcost, error_per_bit); + unsigned int bestsad = fn_ptr->sdf(what, what_stride, best_address, + in_what_stride, 0x7fffffff) + + mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); for (i = 0; i < search_range; i++) { int best_site = -1; for (j = 0; j < 4; j++) { - this_row_offset = ref_mv->row + neighbors[j].row; - this_col_offset = ref_mv->col + neighbors[j].col; - - if ((this_col_offset > x->mv_col_min) && - (this_col_offset < x->mv_col_max) && - (this_row_offset > x->mv_row_min) && - (this_row_offset < x->mv_row_max)) { - check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + - best_address; + this_mv.row = ref_mv->row + neighbors[j].row; + this_mv.col = ref_mv->col + neighbors[j].col; + + if ((this_mv.col > x->mv_col_min) && + (this_mv.col < x->mv_col_max) && + (this_mv.row > x->mv_row_min) && + (this_mv.row < 
x->mv_row_max)) { + const uint8_t *check_here = &in_what[this_mv.row * in_what_stride + + this_mv.col]; thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride, bestsad); if (thissad < bestsad) { - this_mv.row = this_row_offset; - this_mv.col = this_col_offset; thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); @@ -1771,8 +1773,7 @@ int vp9_refining_search_sad_c(MACROBLOCK *x, } else { ref_mv->row += neighbors[best_site].row; ref_mv->col += neighbors[best_site].col; - best_address += (neighbors[best_site].row) * in_what_stride + - neighbors[best_site].col; + best_address = &in_what[ref_mv->row * in_what_stride + ref_mv->col]; } } @@ -1782,13 +1783,12 @@ int vp9_refining_search_sad_c(MACROBLOCK *x, if (bestsad < INT_MAX) return fn_ptr->vf(what, what_stride, best_address, in_what_stride, (unsigned int *)(&thissad)) + - mv_err_cost(&this_mv, center_mv, - mvjcost, mvcost, x->errorperbit); + mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit); else return INT_MAX; } -int vp9_refining_search_sadx4(MACROBLOCK *x, +int vp9_refining_search_sadx4(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], @@ -1810,7 +1810,7 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, unsigned int bestsad = INT_MAX; MV fcenter_mv; - int *mvjsadcost = x->nmvjointsadcost; + const int *mvjsadcost = x->nmvjointsadcost; int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; fcenter_mv.row = center_mv->row >> 3; @@ -1903,65 +1903,54 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, return INT_MAX; } -/* This function is called when we do joint motion search in comp_inter_inter - * mode. - */ -int vp9_refining_search_8p_c(MACROBLOCK *x, +// This function is called when we do joint motion search in comp_inter_inter +// mode. 
+int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], const MV *center_mv, const uint8_t *second_pred, int w, int h) { - const MACROBLOCKD* const xd = &x->e_mbd; - MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0}, - {-1, -1}, {1, -1}, {-1, 1}, {1, 1}}; + const MACROBLOCKD *const xd = &x->e_mbd; + const MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0}, + {-1, -1}, {1, -1}, {-1, 1}, {1, 1}}; int i, j; - int this_row_offset, this_col_offset; - int what_stride = x->plane[0].src.stride; - int in_what_stride = xd->plane[0].pre[0].stride; - uint8_t *what = x->plane[0].src.buf; - uint8_t *best_address = xd->plane[0].pre[0].buf + - (ref_mv->row * xd->plane[0].pre[0].stride) + - ref_mv->col; - uint8_t *check_here; + const uint8_t *what = x->plane[0].src.buf; + const int what_stride = x->plane[0].src.stride; + const uint8_t *in_what = xd->plane[0].pre[0].buf; + const int in_what_stride = xd->plane[0].pre[0].stride; + const uint8_t *best_address = &in_what[ref_mv->row * in_what_stride + + ref_mv->col]; unsigned int thissad; MV this_mv; - unsigned int bestsad = INT_MAX; - MV fcenter_mv; + const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3}; - int *mvjsadcost = x->nmvjointsadcost; + const int *mvjsadcost = x->nmvjointsadcost; int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; - fcenter_mv.row = center_mv->row >> 3; - fcenter_mv.col = center_mv->col >> 3; - /* Get compound pred by averaging two pred blocks. 
*/ - bestsad = fn_ptr->sdaf(what, what_stride, best_address, in_what_stride, - second_pred, 0x7fffffff) + - mvsad_err_cost(ref_mv, &fcenter_mv, - mvjsadcost, mvsadcost, error_per_bit); + unsigned int bestsad = fn_ptr->sdaf(what, what_stride, + best_address, in_what_stride, + second_pred, 0x7fffffff) + + mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); - for (i = 0; i < search_range; i++) { + for (i = 0; i < search_range; ++i) { int best_site = -1; for (j = 0; j < 8; j++) { - this_row_offset = ref_mv->row + neighbors[j].row; - this_col_offset = ref_mv->col + neighbors[j].col; + this_mv.row = ref_mv->row + neighbors[j].row; + this_mv.col = ref_mv->col + neighbors[j].col; - if ((this_col_offset > x->mv_col_min) && - (this_col_offset < x->mv_col_max) && - (this_row_offset > x->mv_row_min) && - (this_row_offset < x->mv_row_max)) { - check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + - best_address; + if ((this_mv.col > x->mv_col_min) && + (this_mv.col < x->mv_col_max) && + (this_mv.row > x->mv_row_min) && + (this_mv.row < x->mv_row_max)) { + const uint8_t *check_here = &in_what[this_mv.row * in_what_stride + + this_mv.col]; - /* Get compound block and use it to calculate SAD. 
*/ thissad = fn_ptr->sdaf(what, what_stride, check_here, in_what_stride, second_pred, bestsad); - if (thissad < bestsad) { - this_mv.row = this_row_offset; - this_mv.col = this_col_offset; thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); if (thissad < bestsad) { @@ -1977,8 +1966,7 @@ int vp9_refining_search_8p_c(MACROBLOCK *x, } else { ref_mv->row += neighbors[best_site].row; ref_mv->col += neighbors[best_site].col; - best_address += (neighbors[best_site].row) * in_what_stride + - neighbors[best_site].col; + best_address = &in_what[ref_mv->row * in_what_stride + ref_mv->col]; } } diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index b3d89752d..74035842f 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -108,7 +108,7 @@ typedef int (*vp9_full_search_fn_t)(MACROBLOCK *x, int *mvjcost, int *mvcost[2], const MV *center_mv, int n); -typedef int (*vp9_refining_search_fn_t)(MACROBLOCK *x, +typedef int (*vp9_refining_search_fn_t)(const MACROBLOCK *x, MV *ref_mv, int sad_per_bit, int distance, vp9_variance_fn_ptr_t *fn_ptr, @@ -123,7 +123,7 @@ typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x, int *mvjcost, int *mvcost[2], const MV *center_mv); -int vp9_refining_search_8p_c(MACROBLOCK *x, +int vp9_refining_search_8p_c(const MACROBLOCK *x, MV *ref_mv, int error_per_bit, int search_range, vp9_variance_fn_ptr_t *fn_ptr, int *mvjcost, int *mvcost[2], diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index 42d4196c5..27531d232 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -36,6 +36,7 @@ #include "vp9/encoder/vp9_segmentation.h" #include "vp9/encoder/vp9_temporal_filter.h" #include "vp9/encoder/vp9_vaq.h" +#include "vp9/encoder/vp9_resize.h" #include "vpx_ports/vpx_timer.h" @@ -968,7 +969,7 @@ static void alloc_raw_frame_buffers(VP9_COMP *cpi) { if (vp9_realloc_frame_buffer(&cpi->alt_ref_buffer, cpi->oxcf.width, cpi->oxcf.height, cm->subsampling_x, 
cm->subsampling_y, - VP9BORDERINPIXELS, NULL, NULL, NULL)) + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate altref buffer"); } @@ -983,14 +984,14 @@ void vp9_alloc_compressor_data(VP9_COMP *cpi) { if (vp9_alloc_frame_buffer(&cpi->last_frame_uf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, - VP9BORDERINPIXELS)) + VP9_ENC_BORDER_IN_PIXELS)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate last frame buffer"); if (vp9_alloc_frame_buffer(&cpi->scaled_source, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, - VP9BORDERINPIXELS)) + VP9_ENC_BORDER_IN_PIXELS)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to allocate scaled source buffer"); @@ -1036,14 +1037,14 @@ static void update_frame_size(VP9_COMP *cpi) { if (vp9_realloc_frame_buffer(&cpi->last_frame_uf, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, - VP9BORDERINPIXELS, NULL, NULL, NULL)) + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to reallocate last frame buffer"); if (vp9_realloc_frame_buffer(&cpi->scaled_source, cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, - VP9BORDERINPIXELS, NULL, NULL, NULL)) + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL)) vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR, "Failed to reallocate scaled source buffer"); @@ -2295,6 +2296,42 @@ void vp9_write_yuv_rec_frame(VP9_COMMON *cm) { } #endif +static void scale_and_extend_frame_nonnormative(YV12_BUFFER_CONFIG *src_fb, + YV12_BUFFER_CONFIG *dst_fb) { + const int in_w = src_fb->y_crop_width; + const int in_h = src_fb->y_crop_height; + const int out_w = dst_fb->y_crop_width; + const int out_h = dst_fb->y_crop_height; + const int in_w_uv = src_fb->uv_crop_width; + const int in_h_uv = src_fb->uv_crop_height; + const int out_w_uv = dst_fb->uv_crop_width; + const int out_h_uv = dst_fb->uv_crop_height; + int i; + + uint8_t 
*srcs[4] = {src_fb->y_buffer, src_fb->u_buffer, src_fb->v_buffer, + src_fb->alpha_buffer}; + int src_strides[4] = {src_fb->y_stride, src_fb->uv_stride, src_fb->uv_stride, + src_fb->alpha_stride}; + + uint8_t *dsts[4] = {dst_fb->y_buffer, dst_fb->u_buffer, dst_fb->v_buffer, + dst_fb->alpha_buffer}; + int dst_strides[4] = {dst_fb->y_stride, dst_fb->uv_stride, dst_fb->uv_stride, + dst_fb->alpha_stride}; + + for (i = 0; i < MAX_MB_PLANE; ++i) { + if (i == 0 || i == 3) { + // Y and alpha planes + vp9_resize_plane(srcs[i], in_h, in_w, src_strides[i], + dsts[i], out_h, out_w, dst_strides[i]); + } else { + // Chroma planes + vp9_resize_plane(srcs[i], in_h_uv, in_w_uv, src_strides[i], + dsts[i], out_h_uv, out_w_uv, dst_strides[i]); + } + } + vp8_yv12_extend_frame_borders(dst_fb); +} + static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb, YV12_BUFFER_CONFIG *dst_fb) { const int in_w = src_fb->y_crop_width; @@ -2316,7 +2353,7 @@ static void scale_and_extend_frame(YV12_BUFFER_CONFIG *src_fb, for (y = 0; y < out_h; y += 16) { for (x = 0; x < out_w; x += 16) { for (i = 0; i < MAX_MB_PLANE; ++i) { - const int factor = i == 0 ? 1 : 2; + const int factor = (i == 0 || i == 3 ? 1 : 2); const int x_q4 = x * (16 / factor) * in_w / out_w; const int y_q4 = y * (16 / factor) * in_h / out_h; const int src_stride = src_strides[i]; @@ -2552,7 +2589,7 @@ static void scale_references(VP9_COMP *cpi) { vp9_realloc_frame_buffer(&cm->yv12_fb[new_fb], cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, - VP9BORDERINPIXELS, NULL, NULL, NULL); + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); scale_and_extend_frame(ref, &cm->yv12_fb[new_fb]); cpi->scaled_ref_idx[ref_frame - 1] = new_fb; } else { @@ -2924,7 +2961,8 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, /* Scale the source buffer, if required. 
*/ if (cm->mi_cols * 8 != cpi->un_scaled_source->y_width || cm->mi_rows * 8 != cpi->un_scaled_source->y_height) { - scale_and_extend_frame(cpi->un_scaled_source, &cpi->scaled_source); + scale_and_extend_frame_nonnormative(cpi->un_scaled_source, + &cpi->scaled_source); cpi->Source = &cpi->scaled_source; } else { cpi->Source = cpi->un_scaled_source; @@ -3279,12 +3317,12 @@ static void Pass2Encode(VP9_COMP *cpi, size_t *size, vp9_twopass_postencode_update(cpi, *size); } -static void check_initial_width(VP9_COMP *cpi, YV12_BUFFER_CONFIG *sd) { +static void check_initial_width(VP9_COMP *cpi, int subsampling_x, + int subsampling_y) { VP9_COMMON *const cm = &cpi->common; if (!cpi->initial_width) { - // TODO(agrange) Subsampling defaults to assuming sampled chroma. - cm->subsampling_x = sd != NULL ? (sd->uv_width < sd->y_width) : 1; - cm->subsampling_y = sd != NULL ? (sd->uv_height < sd->y_height) : 1; + cm->subsampling_x = subsampling_x; + cm->subsampling_y = subsampling_y; alloc_raw_frame_buffers(cpi); cpi->initial_width = cm->width; cpi->initial_height = cm->height; @@ -3298,8 +3336,10 @@ int vp9_receive_raw_frame(VP9_PTR ptr, unsigned int frame_flags, VP9_COMP *cpi = (VP9_COMP *) ptr; struct vpx_usec_timer timer; int res = 0; + const int subsampling_x = sd->uv_width < sd->y_width; + const int subsampling_y = sd->uv_height < sd->y_height; - check_initial_width(cpi, sd); + check_initial_width(cpi, subsampling_x, subsampling_y); vpx_usec_timer_start(&timer); if (vp9_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, frame_flags, cpi->active_map_enabled ? 
cpi->active_map : NULL)) @@ -3377,7 +3417,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, struct vpx_usec_timer cmptimer; YV12_BUFFER_CONFIG *force_src_buffer = NULL; MV_REFERENCE_FRAME ref_frame; - // FILE *fp_out = fopen("enc_frame_type.txt", "a"); if (!cpi) return -1; @@ -3499,8 +3538,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, vp9_end_first_pass(cpi); /* get last stats packet */ cpi->twopass.first_pass_done = 1; } - - // fclose(fp_out); return -1; } @@ -3543,7 +3580,7 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, vp9_realloc_frame_buffer(get_frame_new_buffer(cm), cm->width, cm->height, cm->subsampling_x, cm->subsampling_y, - VP9BORDERINPIXELS, NULL, NULL, NULL); + VP9_ENC_BORDER_IN_PIXELS, NULL, NULL, NULL); for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { @@ -3669,7 +3706,6 @@ int vp9_get_compressed_data(VP9_PTR ptr, unsigned int *frame_flags, } #endif - // fclose(fp_out); return 0; } @@ -3689,7 +3725,8 @@ int vp9_get_preview_raw_frame(VP9_PTR comp, YV12_BUFFER_CONFIG *dest, *dest = *cpi->common.frame_to_show; dest->y_width = cpi->common.width; dest->y_height = cpi->common.height; - dest->uv_height = cpi->common.height / 2; + dest->uv_width = cpi->common.width >> cpi->common.subsampling_x; + dest->uv_height = cpi->common.height >> cpi->common.subsampling_y; ret = 0; } else { ret = -1; @@ -3797,7 +3834,7 @@ int vp9_set_size_literal(VP9_PTR comp, unsigned int width, VP9_COMP *cpi = (VP9_COMP *)comp; VP9_COMMON *cm = &cpi->common; - check_initial_width(cpi, NULL); + check_initial_width(cpi, 1, 1); if (width) { cm->width = width; diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c index 7a5282dda..a4ceabdf1 100644 --- a/vp9/encoder/vp9_picklpf.c +++ b/vp9/encoder/vp9_picklpf.c @@ -20,77 +20,43 @@ #include "vp9/common/vp9_loopfilter.h" #include "./vpx_scale_rtcd.h" -void vp9_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc, - YV12_BUFFER_CONFIG 
*dst_ybc, int fraction) { - const int height = src_ybc->y_height; - const int stride = src_ybc->y_stride; - const int offset = stride * ((height >> 5) * 16 - 8); - const int lines_to_copy = MAX(height >> (fraction + 4), 1) << 4; - - assert(src_ybc->y_stride == dst_ybc->y_stride); - vpx_memcpy(dst_ybc->y_buffer + offset, src_ybc->y_buffer + offset, - stride * (lines_to_copy + 16)); -} - -// Enforce a minimum filter level based upon baseline Q static int get_min_filter_level(VP9_COMP *cpi, int base_qindex) { - int min_filter_level; - min_filter_level = 0; - - return min_filter_level; + return 0; } -// Enforce a maximum filter level based upon baseline Q static int get_max_filter_level(VP9_COMP *cpi, int base_qindex) { - int max_filter_level = MAX_LOOP_FILTER; - (void)base_qindex; - - if (cpi->twopass.section_intra_rating > 8) - max_filter_level = MAX_LOOP_FILTER * 3 / 4; - - return max_filter_level; + return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4 + : MAX_LOOP_FILTER; } - // Stub function for now Alt LF not used void vp9_set_alt_lf_level(VP9_COMP *cpi, int filt_val) { } void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) { + MACROBLOCKD *const xd = &cpi->mb.e_mbd; VP9_COMMON *const cm = &cpi->common; struct loopfilter *const lf = &cm->lf; - - int best_err = 0; - int filt_err = 0; const int min_filter_level = get_min_filter_level(cpi, cm->base_qindex); const int max_filter_level = get_max_filter_level(cpi, cm->base_qindex); - - int filter_step; - int filt_high = 0; - // Start search at previous frame filter level - int filt_mid = lf->filter_level; - int filt_low = 0; + int best_err = 0; + int filt_err = 0; int filt_best; int filt_direction = 0; - - int Bias = 0; // Bias against raising loop filter in favor of lowering it. 
- - // Make a copy of the unfiltered / processed recon buffer - vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf); + // Start the search at the previous frame filter level unless it is now out of + // range. + int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level); + int filter_step = filt_mid < 16 ? 4 : filt_mid / 4; lf->sharpness_level = cm->frame_type == KEY_FRAME ? 0 : cpi->oxcf.sharpness; - // Start the search at the previous frame filter level unless it is now out of - // range. - filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level); - - // Define the initial step size - filter_step = filt_mid < 16 ? 4 : filt_mid / 4; + // Make a copy of the unfiltered / processed recon buffer + vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf); // Get baseline error score vp9_set_alt_lf_level(cpi, filt_mid); - vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_mid, 1, partial); + vp9_loop_filter_frame(cm, xd, filt_mid, 1, partial); best_err = vp9_calc_ss_err(sd, cm->frame_to_show); filt_best = filt_mid; @@ -99,35 +65,32 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) { vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); while (filter_step > 0) { - Bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; + const int filt_high = MIN(filt_mid + filter_step, max_filter_level); + const int filt_low = MAX(filt_mid - filter_step, min_filter_level); + + // Bias against raising loop filter in favor of lowering it. + int bias = (best_err >> (15 - (filt_mid / 8))) * filter_step; if (cpi->twopass.section_intra_rating < 20) - Bias = Bias * cpi->twopass.section_intra_rating / 20; + bias = bias * cpi->twopass.section_intra_rating / 20; // yx, bias less for large block size - if (cpi->common.tx_mode != ONLY_4X4) - Bias >>= 1; - - filt_high = ((filt_mid + filter_step) > max_filter_level) - ? max_filter_level - : (filt_mid + filter_step); - filt_low = ((filt_mid - filter_step) < min_filter_level) - ? 
min_filter_level - : (filt_mid - filter_step); + if (cm->tx_mode != ONLY_4X4) + bias >>= 1; - if ((filt_direction <= 0) && (filt_low != filt_mid)) { + if (filt_direction <= 0 && filt_low != filt_mid) { // Get Low filter error score vp9_set_alt_lf_level(cpi, filt_low); - vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_low, 1, partial); + vp9_loop_filter_frame(cm, xd, filt_low, 1, partial); filt_err = vp9_calc_ss_err(sd, cm->frame_to_show); - // Re-instate the unfiltered frame + // Re-instate the unfiltered frame vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); // If value is close to the best so far then bias towards a lower loop // filter value. - if ((filt_err - Bias) < best_err) { + if ((filt_err - bias) < best_err) { // Was it actually better than the previous best? if (filt_err < best_err) best_err = filt_err; @@ -137,9 +100,9 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) { } // Now look at filt_high - if ((filt_direction >= 0) && (filt_high != filt_mid)) { + if (filt_direction >= 0 && filt_high != filt_mid) { vp9_set_alt_lf_level(cpi, filt_high); - vp9_loop_filter_frame(cm, &cpi->mb.e_mbd, filt_high, 1, partial); + vp9_loop_filter_frame(cm, xd, filt_high, 1, partial); filt_err = vp9_calc_ss_err(sd, cm->frame_to_show); @@ -147,7 +110,7 @@ void vp9_pick_filter_level(YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, int partial) { vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show); // Was it better than the previous best? 
- if (filt_err < (best_err - Bias)) { + if (filt_err < (best_err - bias)) { best_err = filt_err; filt_best = filt_high; } diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 17d1f5984..f317f2a0d 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -174,7 +174,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; if (cpi->ref_frame_flags & flag_list[ref_frame]) { - vp9_setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame), + vp9_setup_buffer_inter(cpi, x, tile, ref_frame, block_size, mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); } diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 72ab00f98..3ebf98c0f 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -211,19 +211,16 @@ static int estimate_bits_at_q(int frame_kind, int q, int mbs, static void calc_iframe_target_size(VP9_COMP *cpi) { - // boost defaults to half second + const VP9_CONFIG *oxcf = &cpi->oxcf; + RATE_CONTROL *const rc = &cpi->rc; int target; - // Clear down mmx registers to allow floating point in what follows vp9_clear_system_state(); // __asm emms; - // New Two pass RC - target = cpi->rc.per_frame_bandwidth; - // For 1-pass. if (cpi->pass == 0) { if (cpi->common.current_video_frame == 0) { - target = cpi->oxcf.starting_buffer_level / 2; + target = oxcf->starting_buffer_level / 2; } else { // TODO(marpan): Add in adjustment based on Q. // If this keyframe was forced, use a more recent Q estimate. @@ -235,47 +232,49 @@ static void calc_iframe_target_size(VP9_COMP *cpi) { // Adjustment up based on q: need to fix. // kf_boost = kf_boost * kfboost_qadjust(Q) / 100; // Frame separation adjustment (down). 
- if (cpi->rc.frames_since_key < cpi->output_framerate / 2) { - kf_boost = (int)(kf_boost * cpi->rc.frames_since_key / - (cpi->output_framerate / 2)); + if (rc->frames_since_key < cpi->output_framerate / 2) { + kf_boost = (int)(kf_boost * rc->frames_since_key / + (cpi->output_framerate / 2)); } kf_boost = (kf_boost < 16) ? 16 : kf_boost; - target = ((16 + kf_boost) * cpi->rc.per_frame_bandwidth) >> 4; + target = ((16 + kf_boost) * rc->per_frame_bandwidth) >> 4; } - cpi->rc.active_worst_quality = cpi->rc.worst_quality; + rc->active_worst_quality = rc->worst_quality; + } else { + target = rc->per_frame_bandwidth; } - if (cpi->oxcf.rc_max_intra_bitrate_pct) { - int max_rate = cpi->rc.per_frame_bandwidth - * cpi->oxcf.rc_max_intra_bitrate_pct / 100; - - if (target > max_rate) - target = max_rate; + if (oxcf->rc_max_intra_bitrate_pct) { + const int max_rate = rc->per_frame_bandwidth * + oxcf->rc_max_intra_bitrate_pct / 100; + target = MIN(target, max_rate); } - cpi->rc.this_frame_target = target; + rc->this_frame_target = target; } // Update the buffer level: leaky bucket model. void vp9_update_buffer_level(VP9_COMP *const cpi, int encoded_frame_size) { - VP9_COMMON *const cm = &cpi->common; + const VP9_COMMON *const cm = &cpi->common; + const VP9_CONFIG *oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; + // Non-viewable frames are a special case and are treated as pure overhead. if (!cm->show_frame) { rc->bits_off_target -= encoded_frame_size; } else { rc->bits_off_target += rc->av_per_frame_bandwidth - encoded_frame_size; } + // Clip the buffer level to the maximum specified buffer size. 
- if (rc->bits_off_target > cpi->oxcf.maximum_buffer_size) { - rc->bits_off_target = cpi->oxcf.maximum_buffer_size; - } - rc->buffer_level = rc->bits_off_target; + rc->buffer_level = MIN(rc->bits_off_target, oxcf->maximum_buffer_size); } int vp9_drop_frame(VP9_COMP *const cpi) { + const VP9_CONFIG *oxcf = &cpi->oxcf; RATE_CONTROL *const rc = &cpi->rc; - if (!cpi->oxcf.drop_frames_water_mark) { + + if (!oxcf->drop_frames_water_mark) { return 0; } else { if (rc->buffer_level < 0) { @@ -284,8 +283,8 @@ int vp9_drop_frame(VP9_COMP *const cpi) { } else { // If buffer is below drop_mark, for now just drop every other frame // (starting with the next frame) until it increases back over drop_mark. - int drop_mark = (int)(cpi->oxcf.drop_frames_water_mark * - cpi->oxcf.optimal_buffer_level / 100); + int drop_mark = (int)(oxcf->drop_frames_water_mark * + oxcf->optimal_buffer_level / 100); if ((rc->buffer_level > drop_mark) && (rc->decimation_factor > 0)) { --rc->decimation_factor; @@ -310,14 +309,14 @@ int vp9_drop_frame(VP9_COMP *const cpi) { } // Adjust active_worst_quality level based on buffer level. -static int adjust_active_worst_quality_from_buffer_level(const VP9_COMP *cpi) { +static int adjust_active_worst_quality_from_buffer_level(const VP9_CONFIG *oxcf, + const RATE_CONTROL *rc) { // Adjust active_worst_quality: If buffer is above the optimal/target level, // bring active_worst_quality down depending on fullness over buffer. // If buffer is below the optimal level, let the active_worst_quality go from // ambient Q (at buffer = optimal level) to worst_quality level // (at buffer = critical level). - const RATE_CONTROL *const rc = &cpi->rc; - const VP9_CONFIG *const oxcf = &cpi->oxcf; + int active_worst_quality = rc->active_worst_quality; // Maximum limit for down adjustment, ~20%. 
int max_adjustment_down = active_worst_quality / 5; @@ -354,31 +353,23 @@ static int adjust_active_worst_quality_from_buffer_level(const VP9_COMP *cpi) { } // Adjust target frame size with respect to the buffering constraints: -static int target_size_from_buffer_level(const VP9_COMP *cpi) { - const RATE_CONTROL *const rc = &cpi->rc; - const VP9_CONFIG *const oxcf = &cpi->oxcf; - int this_frame_target = cpi->rc.this_frame_target; - int percent_low = 0; - int percent_high = 0; - int one_percent_bits = (int)(1 + oxcf->optimal_buffer_level / 100); - if (rc->buffer_level < oxcf->optimal_buffer_level) { - percent_low = (int)((oxcf->optimal_buffer_level - rc->buffer_level) / - one_percent_bits); - if (percent_low > oxcf->under_shoot_pct) - percent_low = oxcf->under_shoot_pct; +static int target_size_from_buffer_level(const VP9_CONFIG *oxcf, + const RATE_CONTROL *rc) { + int target = rc->this_frame_target; + const int64_t diff = oxcf->optimal_buffer_level - rc->buffer_level; + const int one_pct_bits = 1 + oxcf->optimal_buffer_level / 100; + if (diff > 0) { // Lower the target bandwidth for this frame. - this_frame_target -= (this_frame_target * percent_low) / 200; - } else if (rc->buffer_level > oxcf->optimal_buffer_level) { - percent_high = (int)((rc->buffer_level - oxcf->optimal_buffer_level) / - one_percent_bits); - if (percent_high > oxcf->over_shoot_pct) - percent_high = oxcf->over_shoot_pct; - + const int pct_low = MIN(diff / one_pct_bits, oxcf->under_shoot_pct); + target -= (target * pct_low) / 200; + } else if (diff < 0) { // Increase the target bandwidth for this frame. 
- this_frame_target += (this_frame_target * percent_high) / 200; + const int pct_high = MIN(-diff / one_pct_bits, oxcf->over_shoot_pct); + target += (target * pct_high) / 200; } - return this_frame_target; + + return target; } static void calc_pframe_target_size(VP9_COMP *const cpi) { @@ -400,10 +391,10 @@ static void calc_pframe_target_size(VP9_COMP *const cpi) { // For now, use: cpi->rc.av_per_frame_bandwidth / 16: min_frame_target = MAX(rc->av_per_frame_bandwidth >> 4, FRAME_OVERHEAD_BITS); - rc->this_frame_target = target_size_from_buffer_level(cpi); + rc->this_frame_target = target_size_from_buffer_level(oxcf, rc); // Adjust qp-max based on buffer level. rc->active_worst_quality = - adjust_active_worst_quality_from_buffer_level(cpi); + adjust_active_worst_quality_from_buffer_level(oxcf, rc); } } @@ -602,7 +593,7 @@ int vp9_rc_pick_q_and_adjust_q_bounds(const VP9_COMP *cpi, (last_boosted_q * 0.75)); active_best_quality = MAX(qindex + delta_qindex, rc->best_quality); } else if (!(cpi->pass == 0 && cm->current_video_frame == 0)) { - // not first frame of one pass + // not first frame of one pass and kf_boost is set double q_adj_factor = 1.0; double q_val; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index fa6b362d4..242aa8710 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -233,8 +233,8 @@ static void set_block_thresholds(VP9_COMP *cpi) { const int q = compute_rd_thresh_factor(qindex); for (bsize = 0; bsize < BLOCK_SIZES; ++bsize) { - // Threshold here seem unecessarily harsh but fine given actual - // range of values used for cpi->sf.thresh_mult[] + // Threshold here seems unnecessarily harsh but fine given actual + // range of values used for cpi->sf.thresh_mult[]. 
const int t = q * rd_thresh_block_size_factor[bsize]; const int thresh_max = INT_MAX / t; @@ -419,18 +419,12 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE bsize, struct macroblock_plane *const p = &x->plane[i]; struct macroblockd_plane *const pd = &xd->plane[i]; const BLOCK_SIZE bs = get_plane_block_size(bsize, pd); - int rate; - int64_t dist; (void) cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride, &sse); if (i == 0) x->pred_sse[ref] = sse; - // sse works better than var, since there is no dc prediction used - model_rd_from_var_lapndz(sse, 1 << num_pels_log2_lookup[bs], - pd->dequant[1] >> 3, &rate, &dist); - rate_sum += rate; - dist_sum += (int)dist; + dist_sum += (int)sse; } *out_rate_sum = rate_sum; @@ -2276,41 +2270,38 @@ static void setup_pred_block(const MACROBLOCKD *xd, void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, const TileInfo *const tile, - int idx, MV_REFERENCE_FRAME frame_type, + MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE block_size, int mi_row, int mi_col, int_mv frame_nearest_mv[MAX_REF_FRAMES], int_mv frame_near_mv[MAX_REF_FRAMES], struct buf_2d yv12_mb[4][MAX_MB_PLANE]) { - VP9_COMMON *cm = &cpi->common; - YV12_BUFFER_CONFIG *yv12 = &cm->yv12_fb[cpi->common.ref_frame_map[idx]]; + const VP9_COMMON *cm = &cpi->common; + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); MACROBLOCKD *const xd = &x->e_mbd; - MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi; - const struct scale_factors *const sf = &cm->frame_refs[frame_type - 1].sf; - + MODE_INFO *const mi = xd->mi_8x8[0]; + int_mv *const candidates = mi->mbmi.ref_mvs[ref_frame]; + const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf; // TODO(jkoleszar): Is the UV buffer ever used here? If so, need to make this // use the UV scaling factors. 
- setup_pred_block(xd, yv12_mb[frame_type], yv12, mi_row, mi_col, sf, sf); + setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf); // Gets an initial list of candidate vectors from neighbours and orders them - vp9_find_mv_refs(cm, xd, tile, xd->mi_8x8[0], - xd->last_mi, - frame_type, - mbmi->ref_mvs[frame_type], mi_row, mi_col); + vp9_find_mv_refs(cm, xd, tile, mi, xd->last_mi, ref_frame, candidates, + mi_row, mi_col); // Candidate refinement carried out at encoder and decoder - vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, - mbmi->ref_mvs[frame_type], - &frame_nearest_mv[frame_type], - &frame_near_mv[frame_type]); + vp9_find_best_ref_mvs(xd, cm->allow_high_precision_mv, candidates, + &frame_nearest_mv[ref_frame], + &frame_near_mv[ref_frame]); // Further refinement that is encode side only to test the top few candidates // in full and choose the best as the centre point for subsequent searches. // The current implementation doesn't support scaling. if (!vp9_is_scaled(sf) && block_size >= BLOCK_8X8) - mv_pred(cpi, x, yv12_mb[frame_type][0].buf, yv12->y_stride, - frame_type, block_size); + mv_pred(cpi, x, yv12_mb[ref_frame][0].buf, yv12->y_stride, + ref_frame, block_size); } YV12_BUFFER_CONFIG *vp9_get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) { @@ -3173,7 +3164,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { x->pred_mv_sad[ref_frame] = INT_MAX; if (cpi->ref_frame_flags & flag_list[ref_frame]) { - vp9_setup_buffer_inter(cpi, x, tile, get_ref_frame_idx(cpi, ref_frame), + vp9_setup_buffer_inter(cpi, x, tile, ref_frame, block_size, mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); } @@ -3798,7 +3789,7 @@ int64_t vp9_rd_pick_inter_mode_sub8x8(VP9_COMP *cpi, MACROBLOCK *x, for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { if (cpi->ref_frame_flags & flag_list[ref_frame]) { - vp9_setup_buffer_inter(cpi, x, tile, 
get_ref_frame_idx(cpi, ref_frame), + vp9_setup_buffer_inter(cpi, x, tile, ref_frame, block_size, mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb); diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h index 696cf6b11..9ac1f5404 100644 --- a/vp9/encoder/vp9_rdopt.h +++ b/vp9/encoder/vp9_rdopt.h @@ -39,7 +39,7 @@ void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex); void vp9_setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, const TileInfo *const tile, - int idx, MV_REFERENCE_FRAME frame_type, + MV_REFERENCE_FRAME ref_frame, BLOCK_SIZE block_size, int mi_row, int mi_col, int_mv frame_nearest_mv[MAX_REF_FRAMES], diff --git a/vp9/encoder/vp9_resize.c b/vp9/encoder/vp9_resize.c new file mode 100644 index 000000000..f15abc07d --- /dev/null +++ b/vp9/encoder/vp9_resize.c @@ -0,0 +1,418 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> +#include <limits.h> +#include <math.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include "vp9/common/vp9_common.h" +#include "vp9/encoder/vp9_resize.h" +#include "vpx/vpx_integer.h" + +#define FILTER_BITS 7 + +#define INTERP_TAPS 8 +#define SUBPEL_BITS 5 +#define SUBPEL_MASK ((1 << SUBPEL_BITS) - 1) +#define INTERP_PRECISION_BITS 32 + +#define ROUND_POWER_OF_TWO(value, n) \ + (((value) + (1 << ((n) - 1))) >> (n)) + +typedef int16_t interp_kernel[INTERP_TAPS]; + +// Filters for interpolation - note this also filters integer pels. 
+const interp_kernel vp9_filteredinterp_filters[(1 << SUBPEL_BITS)] = { + {-1, -8, 33, 80, 33, -8, -1, 0}, + {-1, -8, 30, 80, 35, -8, -1, 1}, + {-1, -8, 28, 80, 37, -7, -2, 1}, + {0, -8, 26, 79, 39, -7, -2, 1}, + {0, -8, 24, 79, 41, -7, -2, 1}, + {0, -8, 22, 78, 43, -6, -2, 1}, + {0, -8, 20, 78, 45, -5, -3, 1}, + {0, -8, 18, 77, 48, -5, -3, 1}, + {0, -8, 16, 76, 50, -4, -3, 1}, + {0, -8, 15, 75, 52, -3, -4, 1}, + {0, -7, 13, 74, 54, -3, -4, 1}, + {0, -7, 11, 73, 56, -2, -4, 1}, + {0, -7, 10, 71, 58, -1, -4, 1}, + {1, -7, 8, 70, 60, 0, -5, 1}, + {1, -6, 6, 68, 62, 1, -5, 1}, + {1, -6, 5, 67, 63, 2, -5, 1}, + {1, -6, 4, 65, 65, 4, -6, 1}, + {1, -5, 2, 63, 67, 5, -6, 1}, + {1, -5, 1, 62, 68, 6, -6, 1}, + {1, -5, 0, 60, 70, 8, -7, 1}, + {1, -4, -1, 58, 71, 10, -7, 0}, + {1, -4, -2, 56, 73, 11, -7, 0}, + {1, -4, -3, 54, 74, 13, -7, 0}, + {1, -4, -3, 52, 75, 15, -8, 0}, + {1, -3, -4, 50, 76, 16, -8, 0}, + {1, -3, -5, 48, 77, 18, -8, 0}, + {1, -3, -5, 45, 78, 20, -8, 0}, + {1, -2, -6, 43, 78, 22, -8, 0}, + {1, -2, -7, 41, 79, 24, -8, 0}, + {1, -2, -7, 39, 79, 26, -8, 0}, + {1, -2, -7, 37, 80, 28, -8, -1}, + {1, -1, -8, 35, 80, 30, -8, -1}, +}; + +// Filters for factor of 2 downsampling. +static const int16_t vp9_down2_symeven_half_filter[] = {56, 12, -3, -1}; +static const int16_t vp9_down2_symodd_half_filter[] = {64, 35, 0, -3}; + +static void interpolate(const uint8_t *const input, int inlength, + uint8_t *output, int outlength) { + const int64_t delta = (((uint64_t)inlength << 32) + outlength / 2) / + outlength; + const int64_t offset = inlength > outlength ? 
+ (((int64_t)(inlength - outlength) << 31) + outlength / 2) / outlength : + -(((int64_t)(outlength - inlength) << 31) + outlength / 2) / outlength; + uint8_t *optr = output; + int x, x1, x2, sum, k, int_pel, sub_pel; + int64_t y; + + x = 0; + y = offset; + while ((y >> INTERP_PRECISION_BITS) < (INTERP_TAPS / 2 - 1)) { + x++; + y += delta; + } + x1 = x; + x = outlength - 1; + y = delta * x + offset; + while ((y >> INTERP_PRECISION_BITS) + + (int64_t)(INTERP_TAPS / 2) >= inlength) { + x--; + y -= delta; + } + x2 = x; + if (x1 > x2) { + for (x = 0, y = offset; x < outlength; ++x, y += delta) { + const int16_t *filter; + int_pel = y >> INTERP_PRECISION_BITS; + sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK; + filter = vp9_filteredinterp_filters[sub_pel]; + sum = 0; + for (k = 0; k < INTERP_TAPS; ++k) { + const int pk = int_pel - INTERP_TAPS / 2 + 1 + k; + sum += filter[k] * input[(pk < 0 ? 0 : + (pk >= inlength ? inlength - 1 : pk))]; + } + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + } else { + // Initial part. + for (x = 0, y = offset; x < x1; ++x, y += delta) { + const int16_t *filter; + int_pel = y >> INTERP_PRECISION_BITS; + sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK; + filter = vp9_filteredinterp_filters[sub_pel]; + sum = 0; + for (k = 0; k < INTERP_TAPS; ++k) + sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k < 0 ? + 0 : + int_pel - INTERP_TAPS / 2 + 1 + k)]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + // Middle part. + for (; x <= x2; ++x, y += delta) { + const int16_t *filter; + int_pel = y >> INTERP_PRECISION_BITS; + sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK; + filter = vp9_filteredinterp_filters[sub_pel]; + sum = 0; + for (k = 0; k < INTERP_TAPS; ++k) + sum += filter[k] * input[int_pel - INTERP_TAPS / 2 + 1 + k]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + // End part. 
+ for (; x < outlength; ++x, y += delta) { + const int16_t *filter; + int_pel = y >> INTERP_PRECISION_BITS; + sub_pel = (y >> (INTERP_PRECISION_BITS - SUBPEL_BITS)) & SUBPEL_MASK; + filter = vp9_filteredinterp_filters[sub_pel]; + sum = 0; + for (k = 0; k < INTERP_TAPS; ++k) + sum += filter[k] * input[(int_pel - INTERP_TAPS / 2 + 1 + k >= + inlength ? inlength - 1 : + int_pel - INTERP_TAPS / 2 + 1 + k)]; + *optr++ = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)); + } + } +} + +static void down2_symeven(const uint8_t *const input, int length, + uint8_t *output) { + // Actual filter len = 2 * filter_len_half. + static const int16_t *filter = vp9_down2_symeven_half_filter; + const int filter_len_half = sizeof(vp9_down2_symeven_half_filter) / 2; + int i, j; + uint8_t *optr = output; + int l1 = filter_len_half; + int l2 = (length - filter_len_half); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[(i - j < 0 ? 0 : i - j)] + + input[(i + 1 + j >= length ? length - 1 : i + 1 + j)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } else { + // Initial part. + for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + 1 + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)); + for (j = 0; j < filter_len_half; ++j) { + sum += (input[i - j] + + input[(i + 1 + j >= length ? 
length - 1 : i + 1 + j)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } +} + +static void down2_symodd(const uint8_t *const input, int length, + uint8_t *output) { + // Actual filter len = 2 * filter_len_half - 1. + static const int16_t *filter = vp9_down2_symodd_half_filter; + const int filter_len_half = sizeof(vp9_down2_symodd_half_filter) / 2; + int i, j; + uint8_t *optr = output; + int l1 = filter_len_half - 1; + int l2 = (length - filter_len_half + 1); + l1 += (l1 & 1); + l2 += (l2 & 1); + if (l1 > l2) { + // Short input length. + for (i = 0; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[(i - j < 0 ? 0 : i - j)] + + input[(i + j >= length ? length - 1 : i + j)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } else { + // Initial part. + for (i = 0; i < l1; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[(i - j < 0 ? 0 : i - j)] + input[i + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // Middle part. + for (; i < l2; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[i - j] + input[i + j]) * filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + // End part. + for (; i < length; i += 2) { + int sum = (1 << (FILTER_BITS - 1)) + input[i] * filter[0]; + for (j = 1; j < filter_len_half; ++j) { + sum += (input[i - j] + input[(i + j >= length ? 
length - 1 : i + j)]) * + filter[j]; + } + sum >>= FILTER_BITS; + *optr++ = clip_pixel(sum); + } + } +} + +static int get_down2_length(int length, int steps) { + int s; + for (s = 0; s < steps; ++s) + length = (length + 1) >> 1; + return length; +} + +int get_down2_steps(int in_length, int out_length) { + int steps = 0; + int proj_in_length; + while ((proj_in_length = get_down2_length(in_length, 1)) >= out_length) { + ++steps; + in_length = proj_in_length; + } + return steps; +} + +static void resize_multistep(const uint8_t *const input, + int length, + uint8_t *output, + int olength, + uint8_t *buf) { + int steps; + if (length == olength) { + memcpy(output, input, sizeof(uint8_t) * length); + return; + } + steps = get_down2_steps(length, olength); + + if (steps > 0) { + int s; + uint8_t *out = NULL; + uint8_t *tmpbuf = NULL; + uint8_t *otmp, *otmp2; + int filteredlength = length; + if (!tmpbuf) { + tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) * length); + otmp = tmpbuf; + } else { + otmp = buf; + } + otmp2 = otmp + get_down2_length(length, 1); + for (s = 0; s < steps; ++s) { + const int proj_filteredlength = get_down2_length(filteredlength, 1); + const uint8_t *const in = (s == 0 ? input : out); + if (s == steps - 1 && proj_filteredlength == olength) + out = output; + else + out = (s & 1 ? 
otmp2 : otmp); + if (filteredlength & 1) + down2_symodd(in, filteredlength, out); + else + down2_symeven(in, filteredlength, out); + filteredlength = proj_filteredlength; + } + if (filteredlength != olength) { + interpolate(out, filteredlength, output, olength); + } + if (tmpbuf) + free(tmpbuf); + } else { + interpolate(input, length, output, olength); + } +} + +static void fill_col_to_arr(uint8_t *img, int stride, int len, uint8_t *arr) { + int i; + uint8_t *iptr = img; + uint8_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *aptr++ = *iptr; + } +} + +static void fill_arr_to_col(uint8_t *img, int stride, int len, uint8_t *arr) { + int i; + uint8_t *iptr = img; + uint8_t *aptr = arr; + for (i = 0; i < len; ++i, iptr += stride) { + *iptr = *aptr++; + } +} + +void vp9_resize_plane(const uint8_t *const input, + int height, + int width, + int in_stride, + uint8_t *output, + int height2, + int width2, + int out_stride) { + int i; + uint8_t *intbuf = (uint8_t *)malloc(sizeof(uint8_t) * width2 * height); + uint8_t *tmpbuf = (uint8_t *)malloc(sizeof(uint8_t) * + (width < height ? 
height : width)); + uint8_t *arrbuf = (uint8_t *)malloc(sizeof(uint8_t) * (height + height2)); + for (i = 0; i < height; ++i) + resize_multistep(input + in_stride * i, width, + intbuf + width2 * i, width2, tmpbuf); + for (i = 0; i < width2; ++i) { + fill_col_to_arr(intbuf + i, width2, height, arrbuf); + resize_multistep(arrbuf, height, arrbuf + height, height2, tmpbuf); + fill_arr_to_col(output + i, out_stride, height2, arrbuf + height); + } + free(intbuf); + free(tmpbuf); + free(arrbuf); +} + +void vp9_resize_frame420(const uint8_t *const y, + int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, + int height, int width, + uint8_t *oy, int oy_stride, + uint8_t *ou, uint8_t *ov, int ouv_stride, + int oheight, int owidth) { + vp9_resize_plane(y, height, width, y_stride, + oy, oheight, owidth, oy_stride); + vp9_resize_plane(u, height / 2, width / 2, uv_stride, + ou, oheight / 2, owidth / 2, ouv_stride); + vp9_resize_plane(v, height / 2, width / 2, uv_stride, + ov, oheight / 2, owidth / 2, ouv_stride); +} + +void vp9_resize_frame422(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, + int height, int width, + uint8_t *oy, int oy_stride, + uint8_t *ou, uint8_t *ov, int ouv_stride, + int oheight, int owidth) { + vp9_resize_plane(y, height, width, y_stride, + oy, oheight, owidth, oy_stride); + vp9_resize_plane(u, height, width / 2, uv_stride, + ou, oheight, owidth / 2, ouv_stride); + vp9_resize_plane(v, height, width / 2, uv_stride, + ov, oheight, owidth / 2, ouv_stride); +} + +void vp9_resize_frame444(const uint8_t *const y, int y_stride, + const uint8_t *const u, const uint8_t *const v, + int uv_stride, + int height, int width, + uint8_t *oy, int oy_stride, + uint8_t *ou, uint8_t *ov, int ouv_stride, + int oheight, int owidth) { + vp9_resize_plane(y, height, width, y_stride, + oy, oheight, owidth, oy_stride); + vp9_resize_plane(u, height, width, uv_stride, + ou, oheight, owidth, 
ouv_stride); + vp9_resize_plane(v, height, width, uv_stride, + ov, oheight, owidth, ouv_stride); +} diff --git a/vp9/encoder/vp9_resize.h b/vp9/encoder/vp9_resize.h new file mode 100644 index 000000000..c67595a3f --- /dev/null +++ b/vp9/encoder/vp9_resize.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VP9_ENCODER_VP9_RESIZE_H_ +#define VP9_ENCODER_VP9_RESIZE_H_ + +#include <stdio.h> + +void vp9_resize_plane(const uint8_t *const input, + int height, + int width, + int in_stride, + uint8_t *output, + int height2, + int width2, + int out_stride); +void vp9_resize_frame420(const uint8_t *const y, + int y_stride, + const uint8_t *const u, + const uint8_t *const v, + int uv_stride, + int height, + int width, + uint8_t *oy, + int oy_stride, + uint8_t *ou, + uint8_t *ov, + int ouv_stride, + int oheight, + int owidth); +void vp9_resize_frame422(const uint8_t *const y, + int y_stride, + const uint8_t *const u, + const uint8_t *const v, + int uv_stride, + int height, + int width, + uint8_t *oy, + int oy_stride, + uint8_t *ou, + uint8_t *ov, + int ouv_stride, + int oheight, + int owidth); +void vp9_resize_frame444(const uint8_t *const y, + int y_stride, + const uint8_t *const u, + const uint8_t *const v, + int uv_stride, + int height, + int width, + uint8_t *oy, + int oy_stride, + uint8_t *ou, + uint8_t *ov, + int ouv_stride, + int oheight, + int owidth); + +#endif // VP9_ENCODER_VP9_RESIZE_H_ diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad_c.c index 55d595baf..58c5df47e 100644 --- a/vp9/encoder/vp9_sad_c.c +++ b/vp9/encoder/vp9_sad_c.c @@ -8,31 +8,44 @@ * be found in the 
AUTHORS file in the root of the source tree. */ - #include <stdlib.h> + #include "./vp9_rtcd.h" #include "./vpx_config.h" -#include "vp9/encoder/vp9_sadmxn.h" -#include "vp9/encoder/vp9_variance.h" + #include "vpx/vpx_integer.h" +#include "vp9/encoder/vp9_variance.h" + +static INLINE unsigned int sad(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int width, int height) { + int y, x; + unsigned int sad = 0; + + for (y = 0; y < height; y++) { + for (x = 0; x < width; x++) + sad += abs(a[x] - b[x]); + + a += a_stride; + b += b_stride; + } + + return sad; +} #define sad_mxn_func(m, n) \ -unsigned int vp9_sad##m##x##n##_c(const uint8_t *src_ptr, \ - int src_stride, \ - const uint8_t *ref_ptr, \ - int ref_stride, \ +unsigned int vp9_sad##m##x##n##_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ unsigned int max_sad) { \ - return sad_mx_n_c(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ + return sad(src_ptr, src_stride, ref_ptr, ref_stride, m, n); \ } \ -unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src_ptr, \ - int src_stride, \ - const uint8_t *ref_ptr, \ - int ref_stride, \ +unsigned int vp9_sad##m##x##n##_avg_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *ref_ptr, int ref_stride, \ const uint8_t *second_pred, \ unsigned int max_sad) { \ uint8_t comp_pred[m * n]; \ comp_avg_pred(comp_pred, second_pred, m, n, ref_ptr, ref_stride); \ - return sad_mx_n_c(src_ptr, src_stride, comp_pred, m, m, n); \ + return sad(src_ptr, src_stride, comp_pred, m, m, n); \ } sad_mxn_func(64, 64) @@ -49,567 +62,263 @@ sad_mxn_func(8, 4) sad_mxn_func(4, 8) sad_mxn_func(4, 4) -void vp9_sad64x32x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, +void vp9_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, + const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad64x32(src_ptr, src_stride, - ref_ptr[0], ref_stride, 
0x7fffffff); - sad_array[1] = vp9_sad64x32(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad64x32(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad64x32(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + int i; + for (i = 0; i < 4; ++i) + sad_array[i] = vp9_sad64x32(src_ptr, src_stride, ref_ptr[i], ref_stride, + 0x7fffffff); } -void vp9_sad32x64x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, +void vp9_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, + const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad32x64(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad32x64(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad32x64(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad32x64(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + int i; + for (i = 0; i < 4; ++i) + sad_array[i] = vp9_sad32x64(src_ptr, src_stride, ref_ptr[i], ref_stride, + 0x7fffffff); } -void vp9_sad32x16x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, +void vp9_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, + const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad32x16(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad32x16(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad32x16(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad32x16(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + int i; + for (i = 0; i < 4; ++i) + sad_array[i] = vp9_sad32x16(src_ptr, src_stride, ref_ptr[i], ref_stride, + 0x7fffffff); } -void vp9_sad16x32x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const 
ref_ptr[], - int ref_stride, +void vp9_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, + const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad16x32(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad16x32(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad16x32(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad16x32(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + int i; + for (i = 0; i < 4; ++i) + sad_array[i] = vp9_sad16x32(src_ptr, src_stride, ref_ptr[i], ref_stride, + 0x7fffffff); } -void vp9_sad64x64x3_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, +void vp9_sad64x64x3_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad64x64(src_ptr, src_stride, ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + 2, ref_stride, - 0x7fffffff); + int i; + for (i = 0; i < 3; ++i) + sad_array[i] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + i, ref_stride, + 0x7fffffff); } -void vp9_sad32x32x3_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, +void vp9_sad32x32x3_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); + int i; + for (i = 0; i < 3; ++i) + sad_array[i] = vp9_sad32x32(src_ptr, src_stride, ref_ptr + i, ref_stride, + 0x7fffffff); } -void vp9_sad64x64x8_c(const uint8_t *src_ptr, - int src_stride, - 
const uint8_t *ref_ptr, - int ref_stride, +void vp9_sad64x64x8_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + int i; + for (i = 0; i < 8; ++i) + sad_array[i] = vp9_sad64x64(src_ptr, src_stride, ref_ptr + i, ref_stride, + 0x7fffffff); } -void vp9_sad32x32x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, +void vp9_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr + 7, 
ref_stride, - 0x7fffffff); + int i; + for (i = 0; i < 8; ++i) + sad_array[i] = vp9_sad32x32(src_ptr, src_stride, ref_ptr + i, ref_stride, + 0x7fffffff); } -void vp9_sad16x16x3_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, +void vp9_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); + int i; + for (i = 0; i < 3; ++i) + sad_array[i] = vp9_sad16x16(src_ptr, src_stride, ref_ptr + i, ref_stride, + 0x7fffffff); } -void vp9_sad16x16x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, +void vp9_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array) { - sad_array[0] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + int i; + for (i = 0; i < 8; ++i) + sad_array[i] = vp9_sad16x16(src_ptr, src_stride, ref_ptr + i, ref_stride, + 0x7fffffff); } -void vp9_sad16x8x3_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int 
ref_stride, +void vp9_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); + int i; + for (i = 0; i < 3; ++i) + sad_array[i] = vp9_sad16x8(src_ptr, src_stride, ref_ptr + i, ref_stride, + 0x7fffffff); } -void vp9_sad16x8x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, +void vp9_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array) { - sad_array[0] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + int i; + for (i = 0; i < 8; ++i) + sad_array[i] = vp9_sad16x8(src_ptr, src_stride, ref_ptr + i, ref_stride, + 0x7fffffff); } -void vp9_sad8x8x3_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, +void vp9_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad8x8(src_ptr, 
src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); + int i; + for (i = 0; i < 3; ++i) + sad_array[i] = vp9_sad8x8(src_ptr, src_stride, ref_ptr + i, ref_stride, + 0x7fffffff); } -void vp9_sad8x8x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, +void vp9_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array) { - sad_array[0] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + int i; + for (i = 0; i < 8; ++i) + sad_array[i] = vp9_sad8x8(src_ptr, src_stride, ref_ptr + i, ref_stride, + 0x7fffffff); } -void vp9_sad8x16x3_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, +void vp9_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); + int i; + for (i = 0; i < 3; ++i) + sad_array[i] = vp9_sad8x16(src_ptr, src_stride, ref_ptr + i, ref_stride, + 0x7fffffff); } -void 
vp9_sad8x16x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, +void vp9_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array) { - sad_array[0] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + int i; + for (i = 0; i < 8; ++i) + sad_array[i] = vp9_sad8x16(src_ptr, src_stride, ref_ptr + i, ref_stride, + 0x7fffffff); } -void vp9_sad4x4x3_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, +void vp9_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr, ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 1, ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 2, ref_stride, 0x7fffffff); + int i; + for (i = 0; i < 3; ++i) + sad_array[i] = vp9_sad4x4(src_ptr, src_stride, ref_ptr + i, ref_stride, + 0x7fffffff); } -void vp9_sad4x4x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, +void vp9_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array) { - sad_array[0] = vp9_sad4x4(src_ptr, src_stride, - 
ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); + int i; + for (i = 0; i < 8; ++i) + sad_array[i] = vp9_sad4x4(src_ptr, src_stride, ref_ptr + i, ref_stride, + 0x7fffffff); } -void vp9_sad64x64x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, +void vp9_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, + const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad64x64(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + int i; + for (i = 0; i < 4; ++i) + sad_array[i] = vp9_sad64x64(src_ptr, src_stride, ref_ptr[i], ref_stride, + 0x7fffffff); } -void vp9_sad32x32x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, +void vp9_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, + const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] 
= vp9_sad32x32(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad32x32(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + int i; + for (i = 0; i < 4; ++i) + sad_array[i] = vp9_sad32x32(src_ptr, src_stride, ref_ptr[i], ref_stride, + 0x7fffffff); } -void vp9_sad16x16x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, +void vp9_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, + const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad16x16(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + int i; + for (i = 0; i < 4; ++i) + sad_array[i] = vp9_sad16x16(src_ptr, src_stride, ref_ptr[i], ref_stride, + 0x7fffffff); } -void vp9_sad16x8x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, +void vp9_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, + const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad16x8(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + int i; + for (i = 0; i < 4; ++i) + sad_array[i] = vp9_sad16x8(src_ptr, src_stride, ref_ptr[i], ref_stride, + 0x7fffffff); } -void vp9_sad8x8x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, +void vp9_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, + const uint8_t* const ref_ptr[], int 
ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad8x8(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + int i; + for (i = 0; i < 4; ++i) + sad_array[i] = vp9_sad8x8(src_ptr, src_stride, ref_ptr[i], ref_stride, + 0x7fffffff); } -void vp9_sad8x16x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, +void vp9_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, + const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad8x16(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + int i; + for (i = 0; i < 4; ++i) + sad_array[i] = vp9_sad8x16(src_ptr, src_stride, ref_ptr[i], ref_stride, + 0x7fffffff); } -void vp9_sad8x4x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, +void vp9_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, + const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + int i; + for (i = 0; i < 4; ++i) + sad_array[i] = vp9_sad8x4(src_ptr, src_stride, ref_ptr[i], ref_stride, + 0x7fffffff); } -void 
vp9_sad8x4x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - uint32_t *sad_array) { - sad_array[0] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = vp9_sad8x4(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); +void vp9_sad8x4x8_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sad_array) { + int i; + for (i = 0; i < 8; ++i) + sad_array[i] = vp9_sad8x4(src_ptr, src_stride, ref_ptr + i, ref_stride, + 0x7fffffff); } -void vp9_sad4x8x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, +void vp9_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, + const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + int i; + for (i = 0; i < 4; ++i) + sad_array[i] = vp9_sad4x8(src_ptr, src_stride, ref_ptr[i], ref_stride, + 0x7fffffff); } -void vp9_sad4x8x8_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - uint32_t *sad_array) { - sad_array[0] = 
vp9_sad4x8(src_ptr, src_stride, - ref_ptr, ref_stride, - 0x7fffffff); - sad_array[1] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr + 1, ref_stride, - 0x7fffffff); - sad_array[2] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr + 2, ref_stride, - 0x7fffffff); - sad_array[3] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr + 3, ref_stride, - 0x7fffffff); - sad_array[4] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr + 4, ref_stride, - 0x7fffffff); - sad_array[5] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr + 5, ref_stride, - 0x7fffffff); - sad_array[6] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr + 6, ref_stride, - 0x7fffffff); - sad_array[7] = vp9_sad4x8(src_ptr, src_stride, - ref_ptr + 7, ref_stride, - 0x7fffffff); +void vp9_sad4x8x8_c(const uint8_t *src_ptr, int src_stride, + const uint8_t *ref_ptr, int ref_stride, + uint32_t *sad_array) { + int i; + for (i = 0; i < 8; ++i) + sad_array[i] = vp9_sad4x8(src_ptr, src_stride, ref_ptr + i, ref_stride, + 0x7fffffff); } -void vp9_sad4x4x4d_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t* const ref_ptr[], - int ref_stride, +void vp9_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, + const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array) { - sad_array[0] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr[0], ref_stride, 0x7fffffff); - sad_array[1] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr[1], ref_stride, 0x7fffffff); - sad_array[2] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr[2], ref_stride, 0x7fffffff); - sad_array[3] = vp9_sad4x4(src_ptr, src_stride, - ref_ptr[3], ref_stride, 0x7fffffff); + int i; + for (i = 0; i < 4; ++i) + sad_array[i] = vp9_sad4x4(src_ptr, src_stride, ref_ptr[i], ref_stride, + 0x7fffffff); } diff --git a/vp9/encoder/vp9_sadmxn.h b/vp9/encoder/vp9_sadmxn.h deleted file mode 100644 index 1bae4dd67..000000000 --- a/vp9/encoder/vp9_sadmxn.h +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. 
- * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP9_ENCODER_VP9_SADMXN_H_ -#define VP9_ENCODER_VP9_SADMXN_H_ - -#include "./vpx_config.h" -#include "vpx/vpx_integer.h" - -static INLINE unsigned int sad_mx_n_c(const uint8_t *src_ptr, - int src_stride, - const uint8_t *ref_ptr, - int ref_stride, - int m, - int n) { - int r, c; - unsigned int sad = 0; - - for (r = 0; r < n; r++) { - for (c = 0; c < m; c++) { - sad += abs(src_ptr[c] - ref_ptr[c]); - } - - src_ptr += src_stride; - ref_ptr += ref_stride; - } - - return sad; -} - -#endif // VP9_ENCODER_VP9_SADMXN_H_ diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index 876219268..c2eea0aaa 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -227,7 +227,7 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, for (mb_row = 0; mb_row < mb_rows; mb_row++) { #if ALT_REF_MC_ENABLED // Source frames are extended to 16 pixels. This is different than - // L/A/G reference frames that have a border of 32 (VP9BORDERINPIXELS) + // L/A/G reference frames that have a border of 32 (VP9ENCBORDERINPIXELS) // A 6/8 tap filter is used for motion search. This requires 2 pixels // before and 3 pixels after. So the largest Y mv on a border would // then be 16 - VP9_INTERP_EXTEND. 
The UV blocks are half the size of the diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index 478b45ac0..b1c029cba 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -74,9 +74,6 @@ VP9_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_subpixel_8t_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm -ifeq ($(ARCH_X86_64),yes) -VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_intrin_ssse3.c -endif ifeq ($(CONFIG_VP9_POSTPROC),yes) VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_postproc_mmx.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_postproc_sse2.asm diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 5c14b2e40..897ecd702 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -513,10 +513,8 @@ static vpx_codec_err_t vp9e_common_init(vpx_codec_ctx_t *ctx) { priv->vp8_cfg = extracfg_map[i].cfg; priv->vp8_cfg.pkt_list = &priv->pkt_list.head; - // TODO(agrange) Check the limits set on this buffer, or the check that is - // applied in vp9e_encode. + // Maximum buffer size approximated based on having multiple ARF. priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 8; -// priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 2; if (priv->cx_data_sz < 4096) priv->cx_data_sz = 4096; @@ -692,7 +690,7 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx, } } - /* Initialize the encoder instance on the first frame*/ + /* Initialize the encoder instance on the first frame. 
*/ if (!res && ctx->cpi) { unsigned int lib_flags; YV12_BUFFER_CONFIG sd; @@ -704,9 +702,6 @@ static vpx_codec_err_t vp9e_encode(vpx_codec_alg_priv_t *ctx, if (ctx->base.init_flags & VPX_CODEC_USE_PSNR) ((VP9_COMP *)ctx->cpi)->b_calculate_psnr = 1; - // if (ctx->base.init_flags & VPX_CODEC_USE_OUTPUT_PARTITION) - // ((VP9_COMP *)ctx->cpi)->output_partition = 1; - /* Convert API flags to internal codec lib flags */ lib_flags = (flags & VPX_EFLAG_FORCE_KF) ? FRAMEFLAGS_KEY : 0; diff --git a/vp9/vp9_iface_common.h b/vp9/vp9_iface_common.h index ed0122c1b..58256b22b 100644 --- a/vp9/vp9_iface_common.h +++ b/vp9/vp9_iface_common.h @@ -29,7 +29,7 @@ static void yuvconfig2image(vpx_image_t *img, const YV12_BUFFER_CONFIG *yv12, img->fmt = VPX_IMG_FMT_I420; } img->w = yv12->y_stride; - img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9BORDERINPIXELS, 3); + img->h = ALIGN_POWER_OF_TWO(yv12->y_height + 2 * VP9_ENC_BORDER_IN_PIXELS, 3); img->d_w = yv12->y_crop_width; img->d_h = yv12->y_crop_height; img->x_chroma_shift = yv12->uv_width < yv12->y_width; @@ -75,7 +75,7 @@ static vpx_codec_err_t image2yuvconfig(const vpx_image_t *img, yv12->border = (img->stride[VPX_PLANE_Y] - img->w) / 2; #if CONFIG_ALPHA - // For development purposes, force alpha to hold the same data a Y for now. + // For development purposes, force alpha to hold the same data as Y for now. 
yv12->alpha_buffer = yv12->y_buffer; yv12->alpha_width = yv12->y_width; yv12->alpha_height = yv12->y_height; diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 9ea0f549f..63003b9c2 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -44,7 +44,6 @@ VP9_CX_SRCS-yes += encoder/vp9_quantize.h VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h VP9_CX_SRCS-yes += encoder/vp9_rdopt.h VP9_CX_SRCS-yes += encoder/vp9_pickmode.h -VP9_CX_SRCS-yes += encoder/vp9_sadmxn.h VP9_CX_SRCS-yes += encoder/vp9_tokenize.h VP9_CX_SRCS-yes += encoder/vp9_treewriter.h VP9_CX_SRCS-yes += encoder/vp9_variance.h @@ -62,6 +61,8 @@ VP9_CX_SRCS-yes += encoder/vp9_segmentation.c VP9_CX_SRCS-yes += encoder/vp9_segmentation.h VP9_CX_SRCS-yes += encoder/vp9_subexp.c VP9_CX_SRCS-yes += encoder/vp9_subexp.h +VP9_CX_SRCS-yes += encoder/vp9_resize.c +VP9_CX_SRCS-yes += encoder/vp9_resize.h VP9_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/vp9_ssim.c VP9_CX_SRCS-yes += encoder/vp9_tokenize.c VP9_CX_SRCS-yes += encoder/vp9_treewriter.c diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h index bf5fc0779..610e7d280 100644 --- a/vpx_scale/yv12config.h +++ b/vpx_scale/yv12config.h @@ -18,10 +18,11 @@ extern "C" { #include "vpx/vpx_external_frame_buffer.h" #include "vpx/vpx_integer.h" -#define VP8BORDERINPIXELS 32 -#define VP9INNERBORDERINPIXELS 96 -#define VP9BORDERINPIXELS 160 -#define VP9_INTERP_EXTEND 4 +#define VP8BORDERINPIXELS 32 +#define VP9INNERBORDERINPIXELS 96 +#define VP9_INTERP_EXTEND 4 +#define VP9_ENC_BORDER_IN_PIXELS 160 +#define VP9_DEC_BORDER_IN_PIXELS 32 typedef struct yv12_buffer_config { int y_width; @@ -33,6 +33,7 @@ #include "./tools_common.h" #include "./webmdec.h" +#include "./y4menc.h" static const char *exec_name; @@ -131,6 +132,21 @@ static const arg_def_t *vp8_pp_args[] = { }; #endif +static int vpx_image_scale(vpx_image_t *src, vpx_image_t *dst, + FilterMode mode) { + assert(src->fmt == VPX_IMG_FMT_I420); + assert(dst->fmt == VPX_IMG_FMT_I420); + return I420Scale(src->planes[VPX_PLANE_Y], 
src->stride[VPX_PLANE_Y], + src->planes[VPX_PLANE_U], src->stride[VPX_PLANE_U], + src->planes[VPX_PLANE_V], src->stride[VPX_PLANE_V], + src->d_w, src->d_h, + dst->planes[VPX_PLANE_Y], dst->stride[VPX_PLANE_Y], + dst->planes[VPX_PLANE_U], dst->stride[VPX_PLANE_U], + dst->planes[VPX_PLANE_V], dst->stride[VPX_PLANE_V], + dst->d_w, dst->d_h, + mode); +} + void usage_exit() { int i; @@ -229,47 +245,51 @@ static int read_frame(struct VpxDecInputContext *input, uint8_t **buf, } } -void *out_open(const char *out_fn, int do_md5) { - void *out = NULL; +static int get_image_plane_width(int plane, const vpx_image_t *img) { + return (plane > 0 && img->x_chroma_shift > 0) ? + (img->d_w + 1) >> img->x_chroma_shift : + img->d_w; +} - if (do_md5) { - MD5Context *md5_ctx = out = malloc(sizeof(MD5Context)); - (void)out_fn; - MD5Init(md5_ctx); - } else { - FILE *outfile = out = strcmp("-", out_fn) ? fopen(out_fn, "wb") - : set_binary_mode(stdout); +static int get_image_plane_height(int plane, const vpx_image_t *img) { + return (plane > 0 && img->y_chroma_shift > 0) ? 
+ (img->d_h + 1) >> img->y_chroma_shift : + img->d_h; +} - if (!outfile) { - fatal("Failed to output file"); - } - } +static void update_image_md5(const vpx_image_t *img, const int planes[3], + MD5Context *md5) { + int i, y; - return out; -} + for (i = 0; i < 3; ++i) { + const int plane = planes[i]; + const unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + const int w = get_image_plane_width(plane, img); + const int h = get_image_plane_height(plane, img); -void out_put(void *out, const uint8_t *buf, unsigned int len, int do_md5) { - if (do_md5) { - MD5Update(out, buf, len); - } else { - (void) fwrite(buf, 1, len, out); + for (y = 0; y < h; ++y) { + MD5Update(md5, buf, w); + buf += stride; + } } } -void out_close(void *out, const char *out_fn, int do_md5) { - if (do_md5) { - uint8_t md5[16]; - int i; +static void write_image_file(const vpx_image_t *img, const int planes[3], + FILE *file) { + int i, y; - MD5Final(md5, out); - free(out); + for (i = 0; i < 3; ++i) { + const int plane = planes[i]; + const unsigned char *buf = img->planes[plane]; + const int stride = img->stride[plane]; + const int w = get_image_plane_width(plane, img); + const int h = get_image_plane_height(plane, img); - for (i = 0; i < 16; i++) - printf("%02x", md5[i]); - - printf(" %s\n", out_fn); - } else { - fclose(out); + for (y = 0; y < h; ++y) { + fwrite(buf, 1, w, file); + buf += stride; + } } } @@ -413,6 +433,39 @@ void generate_filename(const char *pattern, char *out, size_t q_len, } while (*p); } +static int is_single_file(const char *outfile_pattern) { + const char *p = outfile_pattern; + + do { + p = strchr(p, '%'); + if (p && p[1] >= '1' && p[1] <= '9') + return 0; // pattern contains sequence number, so it's not unique + if (p) + p++; + } while (p); + + return 1; +} + +static void print_md5(unsigned char digest[16], const char *filename) { + int i; + + for (i = 0; i < 16; ++i) + printf("%02x", digest[i]); + printf(" %s\n", filename); +} + +static FILE 
*open_outfile(const char *name) { + if (strcmp("-", name) == 0) { + set_binary_mode(stdout); + return stdout; + } else { + FILE *file = fopen(name, "wb"); + if (!file) + fatal("Failed to output file %s", name); + return file; + } +} int main_loop(int argc, const char **argv_) { vpx_codec_ctx_t decoder; @@ -430,11 +483,9 @@ int main_loop(int argc, const char **argv_) { unsigned long dx_time = 0; struct arg arg; char **argv, **argi, **argj; - const char *outfile_pattern = 0; - char outfile[PATH_MAX]; + int single_file; int use_y4m = 1; - void *out = NULL; vpx_codec_dec_cfg_t cfg = {0}; #if CONFIG_VP8_DECODER vp8_postproc_cfg_t vp8_pp_cfg = {0}; @@ -451,8 +502,13 @@ int main_loop(int argc, const char **argv_) { int num_external_frame_buffers = 0; int fb_lru_cache = 0; vpx_codec_frame_buffer_t *frame_buffers = NULL; - int display_width = 0; - int display_height = 0; + + const char *outfile_pattern = NULL; + char outfile_name[PATH_MAX] = {0}; + FILE *outfile = NULL; + + MD5Context md5_ctx; + unsigned char md5_digest[16]; struct VpxDecInputContext input = {0}; struct VpxInputContext vpx_input_ctx = {0}; @@ -588,8 +644,7 @@ int main_loop(int argc, const char **argv_) { infile = strcmp(fn, "-") ? fopen(fn, "rb") : set_binary_mode(stdin); if (!infile) { - fprintf(stderr, "Failed to open file '%s'", - strcmp(fn, "-") ? fn : "stdin"); + fprintf(stderr, "Failed to open file '%s'", strcmp(fn, "-") ? fn : "stdin"); return EXIT_FAILURE; } #if CONFIG_OS_SUPPORT @@ -613,58 +668,32 @@ int main_loop(int argc, const char **argv_) { return EXIT_FAILURE; } - /* If the output file is not set or doesn't have a sequence number in - * it, then we only open it once. - */ outfile_pattern = outfile_pattern ? outfile_pattern : "-"; - single_file = 1; - { - const char *p = outfile_pattern; - do { - p = strchr(p, '%'); - if (p && p[1] >= '1' && p[1] <= '9') { - /* pattern contains sequence number, so it's not unique. 
*/ - single_file = 0; - break; - } - if (p) - p++; - } while (p); - } + single_file = is_single_file(outfile_pattern); - if (single_file && !noblit) { - generate_filename(outfile_pattern, outfile, sizeof(outfile) - 1, + if (!noblit && single_file) { + generate_filename(outfile_pattern, outfile_name, PATH_MAX, vpx_input_ctx.width, vpx_input_ctx.height, 0); - out = out_open(outfile, do_md5); + if (do_md5) + MD5Init(&md5_ctx); + else + outfile = open_outfile(outfile_name); } if (use_y4m && !noblit) { - char buffer[128]; - if (!single_file) { fprintf(stderr, "YUV4MPEG2 not supported with output patterns," " try --i420 or --yv12.\n"); return EXIT_FAILURE; } - if (vpx_input_ctx.file_type == FILE_TYPE_WEBM) + if (vpx_input_ctx.file_type == FILE_TYPE_WEBM) { if (webm_guess_framerate(input.webm_ctx, input.vpx_input_ctx)) { fprintf(stderr, "Failed to guess framerate -- error parsing " "webm file?\n"); return EXIT_FAILURE; } - - - /*Note: We can't output an aspect ratio here because IVF doesn't - store one, and neither does VP8. - That will have to wait until these tools support WebM natively.*/ - snprintf(buffer, sizeof(buffer), "YUV4MPEG2 W%u H%u F%u:%u I%c ", - vpx_input_ctx.width, vpx_input_ctx.height, - vpx_input_ctx.framerate.numerator, - vpx_input_ctx.framerate.denominator, - 'p'); - out_put(out, (unsigned char *)buffer, - (unsigned int)strlen(buffer), do_md5); + } } /* Try to determine the codec from the fourcc. */ @@ -811,25 +840,20 @@ int main_loop(int argc, const char **argv_) { show_progress(frame_in, frame_out, dx_time); if (!noblit) { - if (frame_out == 1 && img && use_y4m) { - /* Write out the color format to terminate the header line */ - const char *color = - img->fmt == VPX_IMG_FMT_444A ? "C444alpha\n" : - img->fmt == VPX_IMG_FMT_I444 ? "C444\n" : - img->fmt == VPX_IMG_FMT_I422 ? 
"C422\n" : - "C420jpeg\n"; - - out_put(out, (const unsigned char*)color, strlen(color), do_md5); - } + if (frame_out == 1 && img && single_file && !do_md5 && use_y4m) + y4m_write_file_header(outfile, + vpx_input_ctx.width, vpx_input_ctx.height, + &vpx_input_ctx.framerate, img->fmt); if (img && do_scale) { if (frame_out == 1) { // If the output frames are to be scaled to a fixed display size then // use the width and height specified in the container. If either of // these is set to 0, use the display size set in the first frame - // header. - display_width = vpx_input_ctx.width; - display_height = vpx_input_ctx.height; + // header. If that is unavailable, use the raw decoded size of the + // first decoded frame. + int display_width = vpx_input_ctx.width; + int display_height = vpx_input_ctx.height; if (!display_width || !display_height) { int display_size[2]; if (vpx_codec_control(&decoder, VP9D_GET_DISPLAY_SIZE, @@ -846,67 +870,40 @@ int main_loop(int argc, const char **argv_) { display_height, 16); } - if (img->d_w != display_width || img->d_h != display_height) { - assert(img->fmt == VPX_IMG_FMT_I420); - I420Scale(img->planes[VPX_PLANE_Y], img->stride[VPX_PLANE_Y], - img->planes[VPX_PLANE_U], img->stride[VPX_PLANE_U], - img->planes[VPX_PLANE_V], img->stride[VPX_PLANE_V], - img->d_w, img->d_h, - scaled_img->planes[VPX_PLANE_Y], - scaled_img->stride[VPX_PLANE_Y], - scaled_img->planes[VPX_PLANE_U], - scaled_img->stride[VPX_PLANE_U], - scaled_img->planes[VPX_PLANE_V], - scaled_img->stride[VPX_PLANE_V], - display_width, display_height, - kFilterBox); + if (img->d_w != scaled_img->d_w || img->d_h != scaled_img->d_h) { + vpx_image_scale(img, scaled_img, kFilterBox); img = scaled_img; } } + if (img) { - unsigned int y; - char out_fn[PATH_MAX]; - uint8_t *buf; - unsigned int c_w = - img->x_chroma_shift ? (1 + img->d_w) >> img->x_chroma_shift - : img->d_w; - unsigned int c_h = - img->y_chroma_shift ? 
(1 + img->d_h) >> img->y_chroma_shift - : img->d_h; + const int PLANES_YUV[] = {VPX_PLANE_Y, VPX_PLANE_U, VPX_PLANE_V}; + const int PLANES_YVU[] = {VPX_PLANE_Y, VPX_PLANE_V, VPX_PLANE_U}; - if (!single_file) { - size_t len = sizeof(out_fn) - 1; + const int *planes = flipuv ? PLANES_YVU : PLANES_YUV; - out_fn[len] = '\0'; - generate_filename(outfile_pattern, out_fn, len - 1, + if (!single_file) { + generate_filename(outfile_pattern, outfile_name, PATH_MAX, img->d_w, img->d_h, frame_in); - out = out_open(out_fn, do_md5); - } else if (use_y4m) - out_put(out, (unsigned char *)"FRAME\n", 6, do_md5); - - buf = img->planes[VPX_PLANE_Y]; - - for (y = 0; y < img->d_h; y++) { - out_put(out, buf, img->d_w, do_md5); - buf += img->stride[VPX_PLANE_Y]; - } - - buf = img->planes[flipuv ? VPX_PLANE_V : VPX_PLANE_U]; - - for (y = 0; y < c_h; y++) { - out_put(out, buf, c_w, do_md5); - buf += img->stride[VPX_PLANE_U]; - } - - buf = img->planes[flipuv ? VPX_PLANE_U : VPX_PLANE_V]; - - for (y = 0; y < c_h; y++) { - out_put(out, buf, c_w, do_md5); - buf += img->stride[VPX_PLANE_V]; + if (do_md5) { + MD5Init(&md5_ctx); + update_image_md5(img, planes, &md5_ctx); + MD5Final(md5_digest, &md5_ctx); + print_md5(md5_digest, outfile_name); + } else { + outfile = open_outfile(outfile_name); + write_image_file(img, planes, outfile); + fclose(outfile); + } + } else { + if (do_md5) { + update_image_md5(img, planes, &md5_ctx); + } else { + if (use_y4m) + y4m_write_frame_header(outfile); + write_image_file(img, planes, outfile); + } } - - if (!single_file) - out_close(out, out_fn, do_md5); } } @@ -930,8 +927,14 @@ fail: return EXIT_FAILURE; } - if (single_file && !noblit) - out_close(out, outfile, do_md5); + if (!noblit && single_file) { + if (do_md5) { + MD5Final(md5_digest, &md5_ctx); + print_md5(md5_digest, outfile_name); + } else { + fclose(outfile); + } + } if (input.vpx_input_ctx->file_type == FILE_TYPE_WEBM) webm_free(input.webm_ctx); @@ -833,8 +833,8 @@ static int compare_img(vpx_image_t 
*img1, vpx_image_t *img2) unsigned int i; match &= (img1->fmt == img2->fmt); - match &= (img1->w == img2->w); - match &= (img1->h == img2->h); + match &= (img1->d_w == img2->d_w); + match &= (img1->d_h == img2->d_h); for (i = 0; i < img1->d_h; i++) match &= (memcmp(img1->planes[VPX_PLANE_Y]+i*img1->stride[VPX_PLANE_Y], diff --git a/y4menc.c b/y4menc.c new file mode 100644 index 000000000..8321b432e --- /dev/null +++ b/y4menc.c @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "./y4menc.h" + +void y4m_write_file_header(FILE *file, int width, int height, + const struct VpxRational *framerate, + vpx_img_fmt_t fmt) { + const char *color = fmt == VPX_IMG_FMT_444A ? "C444alpha\n" : + fmt == VPX_IMG_FMT_I444 ? "C444\n" : + fmt == VPX_IMG_FMT_I422 ? "C422\n" : + "C420jpeg\n"; + + // Note: We can't output an aspect ratio here because IVF doesn't + // store one, and neither does VP8. + // That will have to wait until these tools support WebM natively.*/ + fprintf(file, "YUV4MPEG2 W%u H%u F%u:%u I%c %s", width, height, + framerate->numerator, framerate->denominator, 'p', color); +} + +void y4m_write_frame_header(FILE *file) { + fprintf(file, "FRAME\n"); +} diff --git a/y4menc.h b/y4menc.h new file mode 100644 index 000000000..e5f7978a7 --- /dev/null +++ b/y4menc.h @@ -0,0 +1,27 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef Y4MENC_H_ +#define Y4MENC_H_ + +#include <stdio.h> + +#include "./tools_common.h" + +#include "vpx/vpx_decoder.h" + +void y4m_write_file_header(FILE *file, int width, int height, + const struct VpxRational *framerate, + vpx_img_fmt_t fmt); + +void y4m_write_frame_header(FILE *file); + + +#endif // Y4MENC_H_ |