diff options
-rw-r--r-- | examples.mk | 2 | ||||
-rw-r--r-- | libs.mk | 1 | ||||
-rw-r--r-- | test/encode_api_test.cc | 60 | ||||
-rw-r--r-- | test/encode_test_driver.h | 2 | ||||
-rw-r--r-- | test/fdct8x8_test.cc | 12 | ||||
-rw-r--r-- | test/video_source.h | 11 | ||||
-rw-r--r-- | vp9/encoder/vp9_encoder.h | 1 | ||||
-rw-r--r-- | vp9/vp9_cx_iface.c | 2 | ||||
-rw-r--r-- | vpx/exports_com | 3 | ||||
-rw-r--r-- | vpx/src/vpx_tpl.c | 107 | ||||
-rw-r--r-- | vpx/vpx_codec.mk | 3 | ||||
-rw-r--r-- | vpx/vpx_encoder.h | 32 | ||||
-rw-r--r-- | vpx/vpx_tpl.h | 99 | ||||
-rw-r--r-- | vpx_dsp/arm/fdct_partial_neon.c | 12 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_common.h | 12 | ||||
-rw-r--r-- | vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c | 226 |
16 files changed, 392 insertions, 193 deletions
diff --git a/examples.mk b/examples.mk index 9e506dcd4..22726a3d4 100644 --- a/examples.mk +++ b/examples.mk @@ -82,8 +82,6 @@ ifeq ($(CONFIG_LIBYUV),yes) $(BUILD_PFX)third_party/libyuv/%.cc.o: CXXFLAGS += ${LIBYUV_CXXFLAGS} endif ifeq ($(CONFIG_WEBM_IO),yes) - vpxdec.SRCS += $(LIBWEBM_COMMON_SRCS) - vpxdec.SRCS += $(LIBWEBM_MUXER_SRCS) vpxdec.SRCS += $(LIBWEBM_PARSER_SRCS) vpxdec.SRCS += webmdec.cc webmdec.h endif @@ -178,6 +178,7 @@ INSTALL-LIBS-yes += include/vpx/vpx_image.h INSTALL-LIBS-yes += include/vpx/vpx_integer.h INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder.h INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_encoder.h +INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_tpl.h ifeq ($(CONFIG_EXTERNAL_BUILD),yes) ifeq ($(CONFIG_MSVS),yes) INSTALL-LIBS-yes += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB).lib) diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc index 2b0aa1fdf..e8a044ae1 100644 --- a/test/encode_api_test.cc +++ b/test/encode_api_test.cc @@ -11,6 +11,7 @@ #include <climits> #include <cstring> #include <initializer_list> +#include <new> #include "third_party/googletest/src/include/gtest/gtest.h" #include "test/codec_factory.h" @@ -20,7 +21,7 @@ #include "./vpx_config.h" #include "vpx/vp8cx.h" -#include "vpx/vpx_encoder.h" +#include "vpx/vpx_tpl.h" namespace { @@ -368,7 +369,7 @@ class EncodeApiGetTplStatsTest : public ::libvpx_test::EncoderTest, public ::testing::TestWithParam<const libvpx_test::CodecFactory *> { public: - EncodeApiGetTplStatsTest() : EncoderTest(GetParam()) {} + EncodeApiGetTplStatsTest() : EncoderTest(GetParam()), test_io_(false) {} ~EncodeApiGetTplStatsTest() override {} protected: @@ -396,6 +397,34 @@ class EncodeApiGetTplStatsTest return VPX_CODEC_OK; } + void CompareTplGopStats(const VpxTplGopStats &ref_gop_stats, + const VpxTplGopStats &test_gop_stats) { + ASSERT_EQ(ref_gop_stats.size, test_gop_stats.size); + for (int frame = 0; frame < ref_gop_stats.size; frame++) { + const VpxTplFrameStats &ref_frame_stats = + ref_gop_stats.frame_stats_list[frame]; + const VpxTplFrameStats &test_frame_stats = + test_gop_stats.frame_stats_list[frame]; + ASSERT_EQ(ref_frame_stats.num_blocks, test_frame_stats.num_blocks); + ASSERT_EQ(ref_frame_stats.frame_width, test_frame_stats.frame_width); + ASSERT_EQ(ref_frame_stats.frame_height, test_frame_stats.frame_height); + for (int block = 0; block < ref_frame_stats.num_blocks; block++) { + const VpxTplBlockStats &ref_block_stats = + ref_frame_stats.block_stats_list[block]; + const VpxTplBlockStats &test_block_stats = + test_frame_stats.block_stats_list[block]; + ASSERT_EQ(ref_block_stats.inter_cost, test_block_stats.inter_cost); + ASSERT_EQ(ref_block_stats.intra_cost, test_block_stats.intra_cost); + ASSERT_EQ(ref_block_stats.mv_c, test_block_stats.mv_c); + ASSERT_EQ(ref_block_stats.mv_r, test_block_stats.mv_r); + ASSERT_EQ(ref_block_stats.recrf_dist, test_block_stats.recrf_dist); + ASSERT_EQ(ref_block_stats.recrf_rate, test_block_stats.recrf_rate); + ASSERT_EQ(ref_block_stats.ref_frame_index, + test_block_stats.ref_frame_index); + } + } + } + void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override { ::libvpx_test::CxDataIterator iter = encoder->GetCxData(); while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) { @@ -416,7 +445,21 @@ class EncodeApiGetTplStatsTest } } ASSERT_TRUE(stats_not_all_zero); - // Free the memory right away now as this is only a test. + if (test_io_ && tpl_stats.size > 0) { + libvpx_test::TempOutFile *temp_out_file = + new (std::nothrow) libvpx_test::TempOutFile("w+"); + ASSERT_NE(temp_out_file, nullptr); + ASSERT_NE(temp_out_file->file(), nullptr); + vpx_write_tpl_gop_stats(temp_out_file->file(), &tpl_stats); + rewind(temp_out_file->file()); + VpxTplGopStats gop_stats_io; + ASSERT_EQ( + vpx_read_tpl_gop_stats(temp_out_file->file(), &gop_stats_io), + VPX_CODEC_OK); + CompareTplGopStats(gop_stats_io, tpl_stats); + vpx_free_tpl_gop_stats(&gop_stats_io); + delete temp_out_file; + } free(tpl_stats.frame_stats_list); break; } @@ -427,6 +470,7 @@ class EncodeApiGetTplStatsTest int width_; int height_; + bool test_io_; }; TEST_P(EncodeApiGetTplStatsTest, GetTplStats) { @@ -438,6 +482,16 @@ TEST_P(EncodeApiGetTplStatsTest, GetTplStats) { ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); } +TEST_P(EncodeApiGetTplStatsTest, GetTplStatsIO) { + cfg_.g_lag_in_frames = 25; + width_ = 352; + height_ = 288; + test_io_ = true; + ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", width_, + height_, 30, 1, 0, 50); + ASSERT_NO_FATAL_FAILURE(RunLoop(&video)); +} + INSTANTIATE_TEST_SUITE_P( VP9, EncodeApiGetTplStatsTest, ::testing::Values( diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h index 922c49f42..165fcfabf 100644 --- a/test/encode_test_driver.h +++ b/test/encode_test_driver.h @@ -19,7 +19,7 @@ #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER #include "vpx/vp8cx.h" #endif -#include "vpx/vpx_encoder.h" +#include "vpx/vpx_tpl.h" namespace libvpx_test { diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index fcc84690a..21f8dcffa 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -132,6 +132,15 @@ void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) { #endif // HAVE_SSE2 #endif // CONFIG_VP9_HIGHBITDEPTH +// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled +// produces invalid code in RunExtremalCheck() and RunInvAccuracyCheck(). +// See: +// https://developercommunity.visualstudio.com/t/1770-preview-1:-Misoptimization-for-AR/10369786 +// TODO(jzern): check the compiler version after a fix for the issue is +// released. +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", off) +#endif class FwdTrans8x8TestBase { public: virtual ~FwdTrans8x8TestBase() {} @@ -523,6 +532,9 @@ class FwdTrans8x8TestBase { vpx_bit_depth_t bit_depth_; int mask_; }; +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", on) +#endif class FwdTrans8x8DCT : public FwdTrans8x8TestBase, public ::testing::TestWithParam<Dct8x8Param> { diff --git a/test/video_source.h b/test/video_source.h index a10ff6fb0..5ed99d063 100644 --- a/test/video_source.h +++ b/test/video_source.h @@ -64,7 +64,7 @@ inline FILE *OpenTestDataFile(const std::string &file_name) { return fopen(path_to_source.c_str(), "rb"); } -static FILE *GetTempOutFile(std::string *file_name) { +static FILE *GetTempOutFile(std::string *file_name, const char *io_mode) { file_name->clear(); #if defined(_WIN32) char fname[MAX_PATH]; @@ -73,7 +73,7 @@ static FILE *GetTempOutFile(std::string *file_name) { // Assume for now that the filename generated is unique per process if (GetTempFileNameA(tmppath, "lvx", 0, fname)) { file_name->assign(fname); - return fopen(fname, "wb+"); + return fopen(fname, io_mode); } } return nullptr; @@ -94,13 +94,16 @@ static FILE *GetTempOutFile(std::string *file_name) { const int fd = mkstemp(temp_file_name.get()); if (fd == -1) return nullptr; *file_name = temp_file_name.get(); - return fdopen(fd, "wb+"); + return fdopen(fd, io_mode); #endif } class TempOutFile { public: - TempOutFile() { file_ = GetTempOutFile(&file_name_); } + TempOutFile() { file_ = GetTempOutFile(&file_name_, "wb+"); } + TempOutFile(const char *io_mode) { + file_ = GetTempOutFile(&file_name_, io_mode); + } ~TempOutFile() { CloseFile(); if (!file_name_.empty()) { diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 2528bc231..2e0c4db9e 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -18,6 +18,7 @@ #include "vpx/internal/vpx_codec_internal.h" #include "vpx/vpx_ext_ratectrl.h" #include "vpx/vp8cx.h" +#include "vpx/vpx_tpl.h" #if CONFIG_INTERNAL_STATS #include "vpx_dsp/ssim.h" #endif diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 8f157274f..409069b4e 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -29,6 +29,8 @@ #include "vp9/vp9_cx_iface.h" #include "vp9/vp9_iface_common.h" +#include "vpx/vpx_tpl.h" + typedef struct vp9_extracfg { int cpu_used; // available cpu percentage in 1/16 unsigned int enable_auto_alt_ref; diff --git a/vpx/exports_com b/vpx/exports_com index 2ab05099f..f0b46aa17 100644 --- a/vpx/exports_com +++ b/vpx/exports_com @@ -14,3 +14,6 @@ text vpx_img_flip text vpx_img_free text vpx_img_set_rect text vpx_img_wrap +text vpx_free_tpl_gop_stats +text vpx_read_tpl_gop_stats +text vpx_write_tpl_gop_stats diff --git a/vpx/src/vpx_tpl.c b/vpx/src/vpx_tpl.c new file mode 100644 index 000000000..9cdb4a0a0 --- /dev/null +++ b/vpx/src/vpx_tpl.c @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <stdlib.h> + +#include "vpx/vpx_codec.h" +#include "vpx/vpx_tpl.h" +#include "vpx_mem/vpx_mem.h" + +#define CHECK_FPRINTF_ERROR(expr) \ + do { \ + if (expr < 0) { \ + return VPX_CODEC_ERROR; \ + } \ + } while (0) + +#define CHECK_FSCANF_ERROR(expr, expected_value) \ + do { \ + if (expr != expected_value) { \ + return VPX_CODEC_ERROR; \ + } \ + } while (0) + +vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file, + const VpxTplGopStats *tpl_gop_stats) { + int i; + if (tpl_file == NULL || tpl_gop_stats == NULL) return VPX_CODEC_INVALID_PARAM; + CHECK_FPRINTF_ERROR(fprintf(tpl_file, "%d\n", tpl_gop_stats->size)); + + for (i = 0; i < tpl_gop_stats->size; i++) { + VpxTplFrameStats frame_stats = tpl_gop_stats->frame_stats_list[i]; + const int num_blocks = frame_stats.num_blocks; + int block; + CHECK_FPRINTF_ERROR(fprintf(tpl_file, "%d %d %d\n", frame_stats.frame_width, + frame_stats.frame_height, num_blocks)); + for (block = 0; block < num_blocks; block++) { + VpxTplBlockStats block_stats = frame_stats.block_stats_list[block]; + CHECK_FPRINTF_ERROR( + fprintf(tpl_file, + "%" PRId64 " %" PRId64 " %" PRId16 " %" PRId16 " %" PRId64 + " %" PRId64 " %d\n", + block_stats.inter_cost, block_stats.intra_cost, + block_stats.mv_c, block_stats.mv_r, block_stats.recrf_dist, + block_stats.recrf_rate, block_stats.ref_frame_index)); + } + } + + return VPX_CODEC_OK; +} + +vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file, + VpxTplGopStats *tpl_gop_stats) { + int i, frame_list_size; + if (tpl_file == NULL || tpl_gop_stats == NULL) return VPX_CODEC_INVALID_PARAM; + CHECK_FSCANF_ERROR(fscanf(tpl_file, "%d\n", &frame_list_size), 1); + tpl_gop_stats->size = frame_list_size; + tpl_gop_stats->frame_stats_list = (VpxTplFrameStats *)vpx_calloc( + frame_list_size, sizeof(tpl_gop_stats->frame_stats_list[0])); + if (tpl_gop_stats->frame_stats_list == NULL) { + return VPX_CODEC_MEM_ERROR; + } + for (i = 0; i < frame_list_size; i++) { + VpxTplFrameStats *frame_stats = &tpl_gop_stats->frame_stats_list[i]; + int num_blocks, width, height, block; + CHECK_FSCANF_ERROR( + fscanf(tpl_file, "%d %d %d\n", &width, &height, &num_blocks), 3); + frame_stats->num_blocks = num_blocks; + frame_stats->frame_width = width; + frame_stats->frame_height = height; + frame_stats->block_stats_list = (VpxTplBlockStats *)vpx_calloc( + num_blocks, sizeof(frame_stats->block_stats_list[0])); + if (frame_stats->block_stats_list == NULL) { + vpx_free_tpl_gop_stats(tpl_gop_stats); + return VPX_CODEC_MEM_ERROR; + } + for (block = 0; block < num_blocks; block++) { + VpxTplBlockStats *block_stats = &frame_stats->block_stats_list[block]; + CHECK_FSCANF_ERROR( + fscanf(tpl_file, + "%" SCNd64 " %" SCNd64 " %" SCNd16 " %" SCNd16 " %" SCNd64 + " %" SCNd64 " %d\n", + &block_stats->inter_cost, &block_stats->intra_cost, + &block_stats->mv_c, &block_stats->mv_r, + &block_stats->recrf_dist, &block_stats->recrf_rate, + &block_stats->ref_frame_index), + 7); + } + } + + return VPX_CODEC_OK; +} + +void vpx_free_tpl_gop_stats(VpxTplGopStats *data) { + int frame; + if (data == NULL) return; + for (frame = 0; frame < data->size; frame++) { + vpx_free(data->frame_stats_list[frame].block_stats_list); + } + vpx_free(data->frame_stats_list); +} diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk index de86579d5..25c815ef5 100644 --- a/vpx/vpx_codec.mk +++ b/vpx/vpx_codec.mk @@ -27,6 +27,7 @@ API_DOC_SRCS-yes += vpx_encoder.h API_DOC_SRCS-yes += vpx_ext_ratectrl.h API_DOC_SRCS-yes += vpx_frame_buffer.h API_DOC_SRCS-yes += vpx_image.h +API_DOC_SRCS-yes += vpx_tpl.h API_SRCS-yes += src/vpx_decoder.c API_SRCS-yes += vpx_decoder.h @@ -36,9 +37,11 @@ API_SRCS-yes += internal/vpx_codec_internal.h API_SRCS-yes += internal/vpx_ratectrl_rtc.h API_SRCS-yes += src/vpx_codec.c API_SRCS-yes += src/vpx_image.c +API_SRCS-yes += src/vpx_tpl.c API_SRCS-yes += vpx_codec.h API_SRCS-yes += vpx_codec.mk API_SRCS-yes += vpx_frame_buffer.h API_SRCS-yes += vpx_image.h API_SRCS-yes += vpx_integer.h API_SRCS-yes += vpx_ext_ratectrl.h +API_SRCS-yes += vpx_tpl.h diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index fb95723dd..c45d1a2ba 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -31,6 +31,7 @@ extern "C" { #include "./vpx_codec.h" #include "./vpx_ext_ratectrl.h" +#include "./vpx_tpl.h" /*! Temporal Scalability: Maximum length of the sequence defining frame * layer membership @@ -57,9 +58,9 @@ extern "C" { * types, removing or reassigning enums, adding/removing/rearranging * fields to structures */ -#define VPX_ENCODER_ABI_VERSION \ - (16 + VPX_CODEC_ABI_VERSION + \ - VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/ +#define VPX_ENCODER_ABI_VERSION \ + (16 + VPX_CODEC_ABI_VERSION + VPX_EXT_RATECTRL_ABI_VERSION + \ + VPX_TPL_ABI_VERSION) /**<\hideinitializer*/ /*! \brief Encoder capabilities bitfield * @@ -252,31 +253,6 @@ enum vpx_kf_mode { VPX_KF_DISABLED = 0 /**< Encoder does not place keyframes. */ }; -/*!\brief Temporal dependency model stats for each block before propagation */ -typedef struct VpxTplBlockStats { - int64_t intra_cost; /**< Intra cost */ - int64_t inter_cost; /**< Inter cost */ - int16_t mv_r; /**< Motion vector row */ - int16_t mv_c; /**< Motion vector col */ - int64_t recrf_rate; /**< Rate from reconstructed ref frame */ - int64_t recrf_dist; /**< Distortion from reconstructed ref frame */ - int ref_frame_index; /**< Ref frame index */ -} VpxTplBlockStats; - -/*!\brief Temporal dependency model stats for each frame before propagation */ -typedef struct VpxTplFrameStats { - int frame_width; /**< Frame width */ - int frame_height; /**< Frame height */ - int num_blocks; /**< Number of blocks. Size of block_stats_list */ - VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */ -} VpxTplFrameStats; - -/*!\brief Temporal dependency model stats for each GOP before propagation */ -typedef struct VpxTplGopStats { - int size; /**< GOP size, also the size of frame_stats_list. */ - VpxTplFrameStats *frame_stats_list; /**< List of tpl stats for each frame */ -} VpxTplGopStats; - /*!\brief Encoded Frame Flags * * This type indicates a bitfield to be passed to vpx_codec_encode(), defining diff --git a/vpx/vpx_tpl.h b/vpx/vpx_tpl.h new file mode 100644 index 000000000..50aec49eb --- /dev/null +++ b/vpx/vpx_tpl.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2023 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +/*!\file + * \brief Describes the TPL stats descriptor and associated operations + * + */ +#ifndef VPX_VPX_VPX_TPL_H_ +#define VPX_VPX_VPX_TPL_H_ + +#include <stdio.h> + +#include "./vpx_integer.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/*!\brief Current ABI version number + * + * \internal + * If this file is altered in any way that changes the ABI, this value + * must be bumped. Examples include, but are not limited to, changing + * types, removing or reassigning enums, adding/removing/rearranging + * fields to structures + */ +#define VPX_TPL_ABI_VERSION (1) /**<\hideinitializer*/ + +/*!\brief Temporal dependency model stats for each block before propagation */ +typedef struct VpxTplBlockStats { + int64_t intra_cost; /**< Intra cost */ + int64_t inter_cost; /**< Inter cost */ + int16_t mv_r; /**< Motion vector row */ + int16_t mv_c; /**< Motion vector col */ + int64_t recrf_rate; /**< Rate from reconstructed ref frame */ + int64_t recrf_dist; /**< Distortion from reconstructed ref frame */ + int ref_frame_index; /**< Ref frame index */ +} VpxTplBlockStats; + +/*!\brief Temporal dependency model stats for each frame before propagation */ +typedef struct VpxTplFrameStats { + int frame_width; /**< Frame width */ + int frame_height; /**< Frame height */ + int num_blocks; /**< Number of blocks. Size of block_stats_list */ + VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */ +} VpxTplFrameStats; + +/*!\brief Temporal dependency model stats for each GOP before propagation */ +typedef struct VpxTplGopStats { + int size; /**< GOP size, also the size of frame_stats_list. */ + VpxTplFrameStats *frame_stats_list; /**< List of tpl stats for each frame */ +} VpxTplGopStats; + +/*!\brief Write VpxTplGopStats to file + * + * Accepts an opened file handle and writes \p tpl_gop_stats. + * + * \param[in] tpl_file A FILE pointer that's already been opened. + * \param[in] tpl_gop_stats VpxTplGopStats that contains TPL stats for the + * whole GOP. + * + * \return VPX_CODEC_OK if TPL stats are successfully written. + */ +vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file, + const VpxTplGopStats *tpl_gop_stats); + +/*!\brief Read VpxTplGopStats from file + * + * Accepts an opened file handle and reads TPL stats and stores them into + * \p tpl_gop_stats. Allocates memory for TPL stats. + * + * \param[in] tpl_file A FILE pointer that's already been opened. + * \param[out] tpl_gop_stats VpxTplGopStats that contains TPL stats for the + * whole GOP. + * + * \return VPX_CODEC_OK if TPL stats are successfully read from file. + */ +vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file, + VpxTplGopStats *tpl_gop_stats); + +/*!\brief Free the memory allocated for VpxTplGopStats + * + * \param[in] tpl_gop_stats VpxTplGopStats that contains TPL stats for the + * whole GOP. + */ +void vpx_free_tpl_gop_stats(VpxTplGopStats *tpl_gop_stats); + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif // VPX_VPX_VPX_TPL_H_ diff --git a/vpx_dsp/arm/fdct_partial_neon.c b/vpx_dsp/arm/fdct_partial_neon.c index 718dba0d9..df0da543c 100644 --- a/vpx_dsp/arm/fdct_partial_neon.c +++ b/vpx_dsp/arm/fdct_partial_neon.c @@ -37,6 +37,15 @@ void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) { output[1] = 0; } +// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled +// will fail with an internal compiler error. +// See: +// https://developercommunity.visualstudio.com/t/Compiler-crash-C1001-when-building-a-for/10346110 +// TODO(jzern): check the compiler version after a fix for the issue is +// released. +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", off) +#endif void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { int r; int16x8_t sum = vld1q_s16(&input[0]); @@ -49,6 +58,9 @@ void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) { output[0] = (tran_low_t)horizontal_add_int16x8(sum); output[1] = 0; } +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +#pragma optimize("", on) +#endif void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output, int stride) { diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h index 2de449546..4b946d756 100644 --- a/vpx_dsp/vpx_dsp_common.h +++ b/vpx_dsp/vpx_dsp_common.h @@ -45,9 +45,21 @@ typedef int16_t tran_low_t; typedef int16_t tran_coef_t; +// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled +// produces invalid code for clip_pixel() when the return type is uint8_t. +// See: +// https://developercommunity.visualstudio.com/t/Misoptimization-for-ARM64-in-VS-2022-17/10363361 +// TODO(jzern): check the compiler version after a fix for the issue is +// released. +#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__) +static INLINE int clip_pixel(int val) { + return (val > 255) ? 255 : (val < 0) ? 0 : val; +} +#else static INLINE uint8_t clip_pixel(int val) { return (val > 255) ? 255 : (val < 0) ? 0 : val; } +#endif static INLINE int clamp(int value, int low, int high) { return value < low ? low : (value > high ? high : value); diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 2498bba17..526c28382 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -46,7 +46,7 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[64]) = { }; #define CALC_CONVOLVE8_HORZ_ROW \ - srcReg = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3); \ + srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch); \ s1[0] = _mm256_shuffle_epi8(srcReg, filt[0]); \ s1[1] = _mm256_shuffle_epi8(srcReg, filt[1]); \ s1[2] = _mm256_shuffle_epi8(srcReg, filt[2]); \ @@ -60,16 +60,6 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[64]) = { _mm256_extractf128_si256(s1[0], 1)); \ output_ptr += output_pitch; -// 0 0 0 0 hi3 hi2 hi1 hi0 | 0 0 0 0 lo3 lo2 lo1 lo0 -static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) { - // 0 0 0 0 0 0 0 0 | 0 0 0 0 lo3 lo2 lo1 lo0 - __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo))); - - // 0 0 0 0 hi3 hi2 hi1 hi0 | 0 0 0 0 lo3 lo2 lo1 lo0 - a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1); - return a; -} - static INLINE void vpx_filter_block1d16_h8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter, @@ -93,12 +83,7 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2( __m256i srcReg; // load the 2 strides of source - srcReg = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3))); - srcReg = _mm256_inserti128_si256( - srcReg, - _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)), - 1); + srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr + src_pixels_per_line - 3); // filter the source buffer s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); @@ -109,12 +94,7 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2( // reading 2 strides of the next 16 bytes // (part of it was being read by earlier read) - srcReg = - _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5))); - srcReg = _mm256_inserti128_si256( - srcReg, - _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)), - 1); + srcReg = mm256_loadu2_si128(src_ptr + 5, src_ptr + src_pixels_per_line + 5); // filter the source buffer s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); @@ -129,60 +109,37 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2( src_ptr += src_stride; - // average if necessary - outReg1 = _mm256_castsi256_si128(outReg32b1); - outReg2 = _mm256_extractf128_si256(outReg32b1, 1); if (avg) { - outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); - outReg2 = _mm_avg_epu8( - outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch))); + const __m256i outReg = mm256_loadu2_si128( + (__m128i *)output_ptr, (__m128i *)(output_ptr + output_pitch)); + outReg32b1 = _mm256_avg_epu8(outReg32b1, outReg); } - - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, outReg1); - - // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2); - + mm256_store2_si128((__m128i *)output_ptr, + (__m128i *)(output_ptr + output_pitch), &outReg32b1); output_ptr += dst_stride; } // if the number of strides is odd. // process only 16 bytes if (i > 0) { - __m128i srcReg; - - // load the first 16 bytes of the last row - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); + const __m128i srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + const __m256i srcReg = + _mm256_inserti128_si256(_mm256_castsi128_si256(srcReg1), srcReg2, 1); // filter the source buffer - s[0] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); - s[1] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); - s[2] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); - s[3] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); - outReg1 = convolve8_8_avx2(s, f); - - // reading the next 16 bytes - // (part of it was being read by earlier read) - srcReg = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + s[0] = _mm256_shuffle_epi8(srcReg, filt[0]); + s[1] = _mm256_shuffle_epi8(srcReg, filt[1]); + s[2] = _mm256_shuffle_epi8(srcReg, filt[2]); + s[3] = _mm256_shuffle_epi8(srcReg, filt[3]); - // filter the source buffer - s[0] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0]))); - s[1] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1]))); - s[2] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2]))); - s[3] = _mm256_castsi128_si256( - _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3]))); - outReg2 = convolve8_8_avx2(s, f); + // The low and high 128-bits of each lane contain the first and second + // convolve result respectively + outReg32b1 = convolve8_16_avx2(s, f); + outReg1 = _mm256_castsi256_si128(outReg32b1); + outReg2 = _mm256_extractf128_si256(outReg32b1, 1); - // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane - // contain the first and second convolve result respectively + // shrink to 8 bit each 16 bits outReg1 = _mm_packus_epi16(outReg1, outReg2); // average if necessary @@ -266,7 +223,6 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter, const int avg) { - __m128i outReg1, outReg2; __m256i srcRegHead1; unsigned int i; ptrdiff_t src_stride, dst_stride; @@ -345,19 +301,14 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2( src_ptr += src_stride; // average if necessary - outReg1 = _mm256_castsi256_si128(s1[0]); - outReg2 = _mm256_extractf128_si256(s1[0], 1); if (avg) { - outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr)); - outReg2 = _mm_avg_epu8( - outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch))); + const __m256i outReg = mm256_loadu2_si128( + (__m128i *)output_ptr, (__m128i *)(output_ptr + out_pitch)); + s1[0] = _mm256_avg_epu8(s1[0], outReg); } - // save 16 bytes - _mm_store_si128((__m128i *)output_ptr, outReg1); - - // save the next 16 bits - _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2); + mm256_store2_si128((__m128i *)output_ptr, + (__m128i *)(output_ptr + out_pitch), s1); output_ptr += dst_stride; @@ -1094,7 +1045,7 @@ static void vpx_filter_block1d4_h8_avx2( // load the 2 strides of source // r115 r114 ...... r15 r14 r13 r12 r11 r10 | r015 r014 r013 ...... r07 // r06 r05 r04 r03 r02 r01 r00 - srcReg32b1 = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3); + srcReg32b1 = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch); // filter the source buffer // r16 r15 r14 r13 r15 r14 r13 r12 r14 r13 r12 r11 r13 r12 r11 r10 | r06 @@ -1188,8 +1139,7 @@ static void vpx_filter_block1d4_v8_avx2( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m256i f[4], ss[4]; - __m256i r[8]; - __m128i r1[10]; + __m256i r[9], rr[2]; __m128i s[11]; unsigned int y = output_height; @@ -1210,48 +1160,35 @@ static void vpx_filter_block1d4_v8_avx2( s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch)); s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch)); - // R1-0 xxxx .. . . x| r13 r12 r11 r10 r03 r02 r01 r00 - r1[0] = _mm_unpacklo_epi32(s[0], s[1]); - - // R2-1 xxxx .. . . x| r23 r22 r21 r20 r13 r12 r11 r10 - r1[1] = _mm_unpacklo_epi32(s[1], s[2]); - - // R3-2 xxxx .. . . x| r33 r32 r31 r30 r23 r22 r21 r20 - r1[2] = _mm_unpacklo_epi32(s[2], s[3]); - - // R4-3 xxxx .. . . x| r43 r42 r41 r40 r33 r32 r31 r30 - r1[3] = _mm_unpacklo_epi32(s[3], s[4]); - - // R5-4 xxxx .. . . x| r53 r52 r51 r50 r43 r42 r41 r40 - r1[4] = _mm_unpacklo_epi32(s[4], s[5]); - - // R6-5 xxxx .. . . x| r63 r62 r61 r60 r53 r52 r51 r50 - r1[5] = _mm_unpacklo_epi32(s[5], s[6]); + r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[2], 1); + r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[3], 1); + r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[4], 1); + r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[5], 1); + r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[6], 1); - // 00000000 r33 r32 r31 r30|r23 r22 r21 r20||00000000|r13 r12 r11 r10|r03 r02 - // r01 r00 - r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[0]), r1[2], 1); + // r37.....r24..r33..r31 r30 r23 r22 r21 r20|r17....r14 r07..r05 r04 r13 r12 + // r11 r10 r03 r02 r01 r00 + rr[0] = _mm256_unpacklo_epi32(r[0], r[1]); - // 00000000 r43 r42 r41 r40|r33 r32 r31 r30||00000000|r23 r22 r21 r20|r13 r12 - // r11 r10 - r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[1]), r1[3], 1); - - // 00000000 r53 r52 r51 r50|r43 r42 r41 r40||00000000|r33 r32 r31 r30|r23 r22 - // r21 r20 - r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[2]), r1[4], 1); - - // 00000000 r63 r62 r61 r60|r53 r52 r51 r50||00000000|r43 r42 r41 r40|r33 r32 - // r31 r30 - r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[3]), r1[5], 1); + // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22 + // r21 r20 r13 r12 r11 r10 + rr[1] = _mm256_unpacklo_epi32(r[1], r[2]); // r43 r33....r40 r30|r33 r23....r30 r20||r23 r13....r20 r10|r13 r03....r10 // r00| - ss[0] = _mm256_unpacklo_epi8(r[0], r[1]); + ss[0] = _mm256_unpacklo_epi8(rr[0], rr[1]); + + // r37.....r24..r33..r31 r30 r23 r22 r21 r20||r17....r14 r07..r05 r04 r13 r12 + // r11 r10 r03 r02 r01 r00 + rr[0] = _mm256_unpacklo_epi32(r[2], r[3]); + + // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22 + // r21 r20 r13 r12 r11 r10 + rr[1] = _mm256_unpacklo_epi32(r[3], r[4]); // r63 r53....r60 r50|r53 r43....r50 r40||r43 r33....r40 r30|r33 r23....r30 // r20| - ss[1] = _mm256_unpacklo_epi8(r[2], r[3]); - + ss[1] = _mm256_unpacklo_epi8(rr[0], rr[1]); // Process 4 rows at a time while (y >= 4) { s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); @@ -1259,41 +1196,17 @@ static void vpx_filter_block1d4_v8_avx2( s[9] = _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_pitch)); s[10] = _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_pitch)); - // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60 - r1[6] = _mm_unpacklo_epi32(s[6], s[7]); - - // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70 - r1[7] = _mm_unpacklo_epi32(s[7], s[8]); - - // R9-8 xxxx .. . . x| r93 r92 r91 r90 r83 r82 r81 r80 - r1[8] = _mm_unpacklo_epi32(s[8], s[9]); - - // R10-9 xxxx .. . . x| r10-3 r10-2 r10-1 r10-0 r93 r92 r91 r90 - r1[9] = _mm_unpacklo_epi32(s[9], s[10]); - - // 00000000 r73 r72 r71 r70|r63 r62 r61 r60||00000000|r53 r52 r51 r50|r43 - // r42 r41 r40 - r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[4]), r1[6], 1); - - // 00000000 r83 r82 r81 r80|r73 r72 r71 r70||00000000|r63 r62 r61 r60|r53 - // r52 r51 r50 - r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[5]), r1[7], 1); + r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[7], 1); + r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[8], 1); + rr[0] = _mm256_unpacklo_epi32(r[4], r[5]); + rr[1] = _mm256_unpacklo_epi32(r[5], r[6]); + ss[2] = _mm256_unpacklo_epi8(rr[0], rr[1]); - // 00000000 r93 r92 r91 r90|r83 r82 r81 r80||00000000|r73 r72 r71 r70|r63 - // r62 r61 r60 - r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[6]), r1[8], 1); - - // 00000000 r10-3 r10-2 r10-1 r10-0|r93 r92 r91 r90||00000000|r83 r82 r81 - // r80|r73 r72 r71 r70 - r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[7]), r1[9], 1); - - // r83 r73....r80 r70|r73 r63....r70 r60||r63 r53....r60 r50|r53 r43....r50 - // r40| - ss[2] = _mm256_unpacklo_epi8(r[4], r[5]); - - // r10-3 r10-3....r10-0 r10-0|r93 r83....r90 r80||r83 r73....r80 r70|r73 - // r63....r70 r60| - ss[3] = _mm256_unpacklo_epi8(r[6], r[7]); + r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[9], 1); + r[8] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[8]), s[10], 1); + rr[0] = _mm256_unpacklo_epi32(r[6], r[7]); + rr[1] = _mm256_unpacklo_epi32(r[7], r[8]); + ss[3] = _mm256_unpacklo_epi8(rr[0], rr[1]); ss[0] = convolve8_16_avx2(ss, f); @@ -1315,17 +1228,17 @@ static void vpx_filter_block1d4_v8_avx2( ss[1] = ss[3]; s[6] = s[10]; + s[5] = s[9]; - r1[4] = r1[8]; - r1[5] = r1[9]; - + r[4] = r[8]; y -= 4; } // Process 2 rows if (y == 2) { - __m128i ss1[4], f1[4]; + __m128i ss1[4], f1[4], r1[4]; + s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch)); s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch)); s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch)); @@ -1334,11 +1247,14 @@ static void vpx_filter_block1d4_v8_avx2( f1[2] = _mm256_castsi256_si128(f[2]); f1[3] = _mm256_castsi256_si128(f[3]); + r1[0] = _mm_unpacklo_epi32(s[4], s[5]); + r1[1] = _mm_unpacklo_epi32(s[5], s[6]); + // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60 - r1[6] = _mm_unpacklo_epi32(s[6], s[7]); + r1[2] = _mm_unpacklo_epi32(s[6], s[7]); // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70 - r1[7] = _mm_unpacklo_epi32(s[7], s[8]); + r1[3] = _mm_unpacklo_epi32(s[7], s[8]); // r23 r13....r20 r10|r13 r03....r10 r00 ss1[0] = _mm256_castsi256_si128(ss[0]); @@ -1347,10 +1263,10 @@ static void vpx_filter_block1d4_v8_avx2( ss1[1] = _mm256_castsi256_si128(ss[1]); // r63 r53....r60 r50|r53 r43....r50 r40 - ss1[2] = _mm_unpacklo_epi8(r1[4], r1[5]); + ss1[2] = _mm_unpacklo_epi8(r1[0], r1[1]); // r83 r73....r80 r70|r73 r63....r70 r60 - ss1[3] = _mm_unpacklo_epi8(r1[6], r1[7]); + ss1[3] = _mm_unpacklo_epi8(r1[2], r1[3]); ss1[0] = convolve8_8_ssse3(ss1, f1); |