summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--examples.mk2
-rw-r--r--libs.mk1
-rw-r--r--test/encode_api_test.cc60
-rw-r--r--test/encode_test_driver.h2
-rw-r--r--test/fdct8x8_test.cc12
-rw-r--r--test/video_source.h11
-rw-r--r--vp9/encoder/vp9_encoder.h1
-rw-r--r--vp9/vp9_cx_iface.c2
-rw-r--r--vpx/exports_com3
-rw-r--r--vpx/src/vpx_tpl.c107
-rw-r--r--vpx/vpx_codec.mk3
-rw-r--r--vpx/vpx_encoder.h32
-rw-r--r--vpx/vpx_tpl.h99
-rw-r--r--vpx_dsp/arm/fdct_partial_neon.c12
-rw-r--r--vpx_dsp/vpx_dsp_common.h12
-rw-r--r--vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c226
16 files changed, 392 insertions, 193 deletions
diff --git a/examples.mk b/examples.mk
index 9e506dcd4..22726a3d4 100644
--- a/examples.mk
+++ b/examples.mk
@@ -82,8 +82,6 @@ ifeq ($(CONFIG_LIBYUV),yes)
$(BUILD_PFX)third_party/libyuv/%.cc.o: CXXFLAGS += ${LIBYUV_CXXFLAGS}
endif
ifeq ($(CONFIG_WEBM_IO),yes)
- vpxdec.SRCS += $(LIBWEBM_COMMON_SRCS)
- vpxdec.SRCS += $(LIBWEBM_MUXER_SRCS)
vpxdec.SRCS += $(LIBWEBM_PARSER_SRCS)
vpxdec.SRCS += webmdec.cc webmdec.h
endif
diff --git a/libs.mk b/libs.mk
index 1411fee9a..f6f6cc94c 100644
--- a/libs.mk
+++ b/libs.mk
@@ -178,6 +178,7 @@ INSTALL-LIBS-yes += include/vpx/vpx_image.h
INSTALL-LIBS-yes += include/vpx/vpx_integer.h
INSTALL-LIBS-$(CONFIG_DECODERS) += include/vpx/vpx_decoder.h
INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_encoder.h
+INSTALL-LIBS-$(CONFIG_ENCODERS) += include/vpx/vpx_tpl.h
ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
ifeq ($(CONFIG_MSVS),yes)
INSTALL-LIBS-yes += $(foreach p,$(VS_PLATFORMS),$(LIBSUBDIR)/$(p)/$(CODEC_LIB).lib)
diff --git a/test/encode_api_test.cc b/test/encode_api_test.cc
index 2b0aa1fdf..e8a044ae1 100644
--- a/test/encode_api_test.cc
+++ b/test/encode_api_test.cc
@@ -11,6 +11,7 @@
#include <climits>
#include <cstring>
#include <initializer_list>
+#include <new>
#include "third_party/googletest/src/include/gtest/gtest.h"
#include "test/codec_factory.h"
@@ -20,7 +21,7 @@
#include "./vpx_config.h"
#include "vpx/vp8cx.h"
-#include "vpx/vpx_encoder.h"
+#include "vpx/vpx_tpl.h"
namespace {
@@ -368,7 +369,7 @@ class EncodeApiGetTplStatsTest
: public ::libvpx_test::EncoderTest,
public ::testing::TestWithParam<const libvpx_test::CodecFactory *> {
public:
- EncodeApiGetTplStatsTest() : EncoderTest(GetParam()) {}
+ EncodeApiGetTplStatsTest() : EncoderTest(GetParam()), test_io_(false) {}
~EncodeApiGetTplStatsTest() override {}
protected:
@@ -396,6 +397,34 @@ class EncodeApiGetTplStatsTest
return VPX_CODEC_OK;
}
+ // Asserts that |ref_gop_stats| and |test_gop_stats| describe the same GOP:
+ // equal frame count, equal per-frame block count and dimensions, and equal
+ // values for every field of every block. Each check is a fatal ASSERT_EQ, so
+ // this helper returns at the first mismatch.
+ void CompareTplGopStats(const VpxTplGopStats &ref_gop_stats,
+ const VpxTplGopStats &test_gop_stats) {
+ ASSERT_EQ(ref_gop_stats.size, test_gop_stats.size);
+ for (int frame = 0; frame < ref_gop_stats.size; frame++) {
+ const VpxTplFrameStats &ref_frame_stats =
+ ref_gop_stats.frame_stats_list[frame];
+ const VpxTplFrameStats &test_frame_stats =
+ test_gop_stats.frame_stats_list[frame];
+ // num_blocks is checked before the block loop below, which indexes both
+ // block lists using the reference frame's count.
+ ASSERT_EQ(ref_frame_stats.num_blocks, test_frame_stats.num_blocks);
+ ASSERT_EQ(ref_frame_stats.frame_width, test_frame_stats.frame_width);
+ ASSERT_EQ(ref_frame_stats.frame_height, test_frame_stats.frame_height);
+ for (int block = 0; block < ref_frame_stats.num_blocks; block++) {
+ const VpxTplBlockStats &ref_block_stats =
+ ref_frame_stats.block_stats_list[block];
+ const VpxTplBlockStats &test_block_stats =
+ test_frame_stats.block_stats_list[block];
+ ASSERT_EQ(ref_block_stats.inter_cost, test_block_stats.inter_cost);
+ ASSERT_EQ(ref_block_stats.intra_cost, test_block_stats.intra_cost);
+ ASSERT_EQ(ref_block_stats.mv_c, test_block_stats.mv_c);
+ ASSERT_EQ(ref_block_stats.mv_r, test_block_stats.mv_r);
+ ASSERT_EQ(ref_block_stats.recrf_dist, test_block_stats.recrf_dist);
+ ASSERT_EQ(ref_block_stats.recrf_rate, test_block_stats.recrf_rate);
+ ASSERT_EQ(ref_block_stats.ref_frame_index,
+ test_block_stats.ref_frame_index);
+ }
+ }
+ }
+
void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) override {
::libvpx_test::CxDataIterator iter = encoder->GetCxData();
while (const vpx_codec_cx_pkt_t *pkt = iter.Next()) {
@@ -416,7 +445,21 @@ class EncodeApiGetTplStatsTest
}
}
ASSERT_TRUE(stats_not_all_zero);
- // Free the memory right away now as this is only a test.
+ if (test_io_ && tpl_stats.size > 0) {
+ libvpx_test::TempOutFile *temp_out_file =
+ new (std::nothrow) libvpx_test::TempOutFile("w+");
+ ASSERT_NE(temp_out_file, nullptr);
+ ASSERT_NE(temp_out_file->file(), nullptr);
+ vpx_write_tpl_gop_stats(temp_out_file->file(), &tpl_stats);
+ rewind(temp_out_file->file());
+ VpxTplGopStats gop_stats_io;
+ ASSERT_EQ(
+ vpx_read_tpl_gop_stats(temp_out_file->file(), &gop_stats_io),
+ VPX_CODEC_OK);
+ CompareTplGopStats(gop_stats_io, tpl_stats);
+ vpx_free_tpl_gop_stats(&gop_stats_io);
+ delete temp_out_file;
+ }
free(tpl_stats.frame_stats_list);
break;
}
@@ -427,6 +470,7 @@ class EncodeApiGetTplStatsTest
int width_;
int height_;
+ bool test_io_;
};
TEST_P(EncodeApiGetTplStatsTest, GetTplStats) {
@@ -438,6 +482,16 @@ TEST_P(EncodeApiGetTplStatsTest, GetTplStats) {
ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
}
+// Runs the encode loop with test_io_ enabled. With that flag set,
+// PostEncodeFrameHook() also writes the retrieved TPL stats to a temporary
+// file, reads them back and compares the two copies.
+TEST_P(EncodeApiGetTplStatsTest, GetTplStatsIO) {
+  test_io_ = true;
+  width_ = 352;
+  height_ = 288;
+  cfg_.g_lag_in_frames = 25;
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", width_,
+                                       height_, 30, 1, 0, 50);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+
INSTANTIATE_TEST_SUITE_P(
VP9, EncodeApiGetTplStatsTest,
::testing::Values(
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 922c49f42..165fcfabf 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -19,7 +19,7 @@
#if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
#include "vpx/vp8cx.h"
#endif
-#include "vpx/vpx_encoder.h"
+#include "vpx/vpx_tpl.h"
namespace libvpx_test {
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index fcc84690a..21f8dcffa 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -132,6 +132,15 @@ void idct8x8_64_add_12_sse2(const tran_low_t *in, uint8_t *out, int stride) {
#endif // HAVE_SSE2
#endif // CONFIG_VP9_HIGHBITDEPTH
+// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled
+// produces invalid code in RunExtremalCheck() and RunInvAccuracyCheck().
+// See:
+// https://developercommunity.visualstudio.com/t/1770-preview-1:-Misoptimization-for-AR/10369786
+// TODO(jzern): check the compiler version after a fix for the issue is
+// released.
+#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__)
+#pragma optimize("", off)
+#endif
class FwdTrans8x8TestBase {
public:
virtual ~FwdTrans8x8TestBase() {}
@@ -523,6 +532,9 @@ class FwdTrans8x8TestBase {
vpx_bit_depth_t bit_depth_;
int mask_;
};
+#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__)
+#pragma optimize("", on)
+#endif
class FwdTrans8x8DCT : public FwdTrans8x8TestBase,
public ::testing::TestWithParam<Dct8x8Param> {
diff --git a/test/video_source.h b/test/video_source.h
index a10ff6fb0..5ed99d063 100644
--- a/test/video_source.h
+++ b/test/video_source.h
@@ -64,7 +64,7 @@ inline FILE *OpenTestDataFile(const std::string &file_name) {
return fopen(path_to_source.c_str(), "rb");
}
-static FILE *GetTempOutFile(std::string *file_name) {
+static FILE *GetTempOutFile(std::string *file_name, const char *io_mode) {
file_name->clear();
#if defined(_WIN32)
char fname[MAX_PATH];
@@ -73,7 +73,7 @@ static FILE *GetTempOutFile(std::string *file_name) {
// Assume for now that the filename generated is unique per process
if (GetTempFileNameA(tmppath, "lvx", 0, fname)) {
file_name->assign(fname);
- return fopen(fname, "wb+");
+ return fopen(fname, io_mode);
}
}
return nullptr;
@@ -94,13 +94,16 @@ static FILE *GetTempOutFile(std::string *file_name) {
const int fd = mkstemp(temp_file_name.get());
if (fd == -1) return nullptr;
*file_name = temp_file_name.get();
- return fdopen(fd, "wb+");
+ return fdopen(fd, io_mode);
#endif
}
class TempOutFile {
public:
- TempOutFile() { file_ = GetTempOutFile(&file_name_); }
+ TempOutFile() { file_ = GetTempOutFile(&file_name_, "wb+"); }
+  // Creates the temporary file and opens it with the given fopen()-style mode
+  // string (e.g. "w+" for text I/O instead of the default "wb+").
+  // Marked explicit so that a mode string never converts implicitly into a
+  // TempOutFile.
+  explicit TempOutFile(const char *io_mode) {
+    file_ = GetTempOutFile(&file_name_, io_mode);
+  }
~TempOutFile() {
CloseFile();
if (!file_name_.empty()) {
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 2528bc231..2e0c4db9e 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -18,6 +18,7 @@
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx/vpx_ext_ratectrl.h"
#include "vpx/vp8cx.h"
+#include "vpx/vpx_tpl.h"
#if CONFIG_INTERNAL_STATS
#include "vpx_dsp/ssim.h"
#endif
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 8f157274f..409069b4e 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -29,6 +29,8 @@
#include "vp9/vp9_cx_iface.h"
#include "vp9/vp9_iface_common.h"
+#include "vpx/vpx_tpl.h"
+
typedef struct vp9_extracfg {
int cpu_used; // available cpu percentage in 1/16
unsigned int enable_auto_alt_ref;
diff --git a/vpx/exports_com b/vpx/exports_com
index 2ab05099f..f0b46aa17 100644
--- a/vpx/exports_com
+++ b/vpx/exports_com
@@ -14,3 +14,6 @@ text vpx_img_flip
text vpx_img_free
text vpx_img_set_rect
text vpx_img_wrap
+text vpx_free_tpl_gop_stats
+text vpx_read_tpl_gop_stats
+text vpx_write_tpl_gop_stats
diff --git a/vpx/src/vpx_tpl.c b/vpx/src/vpx_tpl.c
new file mode 100644
index 000000000..9cdb4a0a0
--- /dev/null
+++ b/vpx/src/vpx_tpl.c
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_tpl.h"
+#include "vpx_mem/vpx_mem.h"
+
+// Evaluates |expr| (an fprintf() call) once and makes the enclosing function
+// return VPX_CODEC_ERROR if the result is negative. The argument is
+// parenthesized so that any expression can be passed safely.
+#define CHECK_FPRINTF_ERROR(expr) \
+  do {                            \
+    if ((expr) < 0) {             \
+      return VPX_CODEC_ERROR;     \
+    }                             \
+  } while (0)
+
+// Evaluates |expr| (an fscanf() call) once and makes the enclosing function
+// return VPX_CODEC_ERROR unless the result equals |expected_value|, i.e. the
+// number of items that must have been converted. Both arguments are
+// parenthesized so that any expression can be passed safely.
+// Note: this returns immediately, so it must not be used while the caller
+// holds resources that need to be released on failure.
+#define CHECK_FSCANF_ERROR(expr, expected_value) \
+  do {                                           \
+    if ((expr) != (expected_value)) {            \
+      return VPX_CODEC_ERROR;                    \
+    }                                            \
+  } while (0)
+
+// Writes |tpl_gop_stats| to |tpl_file| as text:
+//   - one line with the number of frames,
+//   - per frame, one line with "frame_width frame_height num_blocks",
+//   - per block, one line with "inter_cost intra_cost mv_c mv_r recrf_dist
+//     recrf_rate ref_frame_index".
+// Returns VPX_CODEC_INVALID_PARAM if either argument is NULL, VPX_CODEC_ERROR
+// as soon as an fprintf() call fails, and VPX_CODEC_OK otherwise.
+vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file,
+                                        const VpxTplGopStats *tpl_gop_stats) {
+  int frame_idx;
+  if (tpl_file == NULL || tpl_gop_stats == NULL) {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+
+  // Header line: number of frames in the GOP.
+  CHECK_FPRINTF_ERROR(fprintf(tpl_file, "%d\n", tpl_gop_stats->size));
+
+  for (frame_idx = 0; frame_idx < tpl_gop_stats->size; ++frame_idx) {
+    const VpxTplFrameStats *const frame =
+        &tpl_gop_stats->frame_stats_list[frame_idx];
+    const int block_count = frame->num_blocks;
+    int block_idx;
+
+    CHECK_FPRINTF_ERROR(fprintf(tpl_file, "%d %d %d\n", frame->frame_width,
+                                frame->frame_height, block_count));
+
+    for (block_idx = 0; block_idx < block_count; ++block_idx) {
+      const VpxTplBlockStats *const blk = &frame->block_stats_list[block_idx];
+      CHECK_FPRINTF_ERROR(
+          fprintf(tpl_file,
+                  "%" PRId64 " %" PRId64 " %" PRId16 " %" PRId16 " %" PRId64
+                  " %" PRId64 " %d\n",
+                  blk->inter_cost, blk->intra_cost, blk->mv_c, blk->mv_r,
+                  blk->recrf_dist, blk->recrf_rate, blk->ref_frame_index));
+    }
+  }
+
+  return VPX_CODEC_OK;
+}
+
+// Reads GOP-level TPL stats from |tpl_file| into |tpl_gop_stats|. The input is
+// expected in the text layout produced by vpx_write_tpl_gop_stats(): a frame
+// count, then per frame "frame_width frame_height num_blocks" followed by one
+// line per block.
+//
+// Memory for the frame list and every per-frame block list is allocated here.
+// On success the caller owns it and releases it with
+// vpx_free_tpl_gop_stats(). On any failure everything allocated so far is
+// released and |tpl_gop_stats| is left empty (size 0, NULL list), so nothing
+// leaks and the caller holds no dangling pointer.
+//
+// Returns:
+//   VPX_CODEC_OK            on success.
+//   VPX_CODEC_INVALID_PARAM if |tpl_file| or |tpl_gop_stats| is NULL.
+//   VPX_CODEC_ERROR         if the input cannot be parsed or holds a negative
+//                           frame or block count.
+//   VPX_CODEC_MEM_ERROR     if an allocation fails.
+vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file,
+                                       VpxTplGopStats *tpl_gop_stats) {
+  int i, frame_list_size;
+  vpx_codec_err_t status = VPX_CODEC_OK;
+  if (tpl_file == NULL || tpl_gop_stats == NULL) return VPX_CODEC_INVALID_PARAM;
+
+  // Start from a well-defined empty state; nothing is allocated yet, so the
+  // early returns below cannot leak.
+  tpl_gop_stats->size = 0;
+  tpl_gop_stats->frame_stats_list = NULL;
+
+  CHECK_FSCANF_ERROR(fscanf(tpl_file, "%d\n", &frame_list_size), 1);
+  // A negative count would be converted to a huge size_t by the allocator.
+  if (frame_list_size < 0) return VPX_CODEC_ERROR;
+
+  tpl_gop_stats->frame_stats_list = (VpxTplFrameStats *)vpx_calloc(
+      frame_list_size, sizeof(tpl_gop_stats->frame_stats_list[0]));
+  if (tpl_gop_stats->frame_stats_list == NULL) {
+    return VPX_CODEC_MEM_ERROR;
+  }
+  // Publish the size only once the list it describes actually exists.
+  tpl_gop_stats->size = frame_list_size;
+
+  for (i = 0; i < frame_list_size && status == VPX_CODEC_OK; i++) {
+    VpxTplFrameStats *frame_stats = &tpl_gop_stats->frame_stats_list[i];
+    int num_blocks, width, height, block;
+    if (fscanf(tpl_file, "%d %d %d\n", &width, &height, &num_blocks) != 3 ||
+        num_blocks < 0) {
+      status = VPX_CODEC_ERROR;
+      break;
+    }
+    frame_stats->frame_width = width;
+    frame_stats->frame_height = height;
+    frame_stats->block_stats_list = (VpxTplBlockStats *)vpx_calloc(
+        num_blocks, sizeof(frame_stats->block_stats_list[0]));
+    if (frame_stats->block_stats_list == NULL) {
+      status = VPX_CODEC_MEM_ERROR;
+      break;
+    }
+    frame_stats->num_blocks = num_blocks;
+
+    for (block = 0; block < num_blocks; block++) {
+      VpxTplBlockStats *block_stats = &frame_stats->block_stats_list[block];
+      if (fscanf(tpl_file,
+                 "%" SCNd64 " %" SCNd64 " %" SCNd16 " %" SCNd16 " %" SCNd64
+                 " %" SCNd64 " %d\n",
+                 &block_stats->inter_cost, &block_stats->intra_cost,
+                 &block_stats->mv_c, &block_stats->mv_r,
+                 &block_stats->recrf_dist, &block_stats->recrf_rate,
+                 &block_stats->ref_frame_index) != 7) {
+        status = VPX_CODEC_ERROR;
+        break;
+      }
+    }
+  }
+
+  if (status != VPX_CODEC_OK) {
+    // The frame list was zero-initialized, so frames that were never reached
+    // hold a NULL block list and are safe to pass through the free routine.
+    vpx_free_tpl_gop_stats(tpl_gop_stats);
+    tpl_gop_stats->frame_stats_list = NULL;
+    tpl_gop_stats->size = 0;
+  }
+
+  return status;
+}
+
+// Releases every per-frame block list and then the frame list owned by
+// |tpl_gop_stats|. The VpxTplGopStats object itself is not freed. A NULL
+// argument or a NULL frame list is tolerated, and the struct is reset to an
+// empty state afterwards so that calling this again is harmless.
+void vpx_free_tpl_gop_stats(VpxTplGopStats *tpl_gop_stats) {
+  int frame;
+  if (tpl_gop_stats == NULL) return;
+  if (tpl_gop_stats->frame_stats_list != NULL) {
+    for (frame = 0; frame < tpl_gop_stats->size; frame++) {
+      vpx_free(tpl_gop_stats->frame_stats_list[frame].block_stats_list);
+    }
+    vpx_free(tpl_gop_stats->frame_stats_list);
+  }
+  // Drop the stale pointer and count to rule out double frees.
+  tpl_gop_stats->frame_stats_list = NULL;
+  tpl_gop_stats->size = 0;
+}
diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk
index de86579d5..25c815ef5 100644
--- a/vpx/vpx_codec.mk
+++ b/vpx/vpx_codec.mk
@@ -27,6 +27,7 @@ API_DOC_SRCS-yes += vpx_encoder.h
API_DOC_SRCS-yes += vpx_ext_ratectrl.h
API_DOC_SRCS-yes += vpx_frame_buffer.h
API_DOC_SRCS-yes += vpx_image.h
+API_DOC_SRCS-yes += vpx_tpl.h
API_SRCS-yes += src/vpx_decoder.c
API_SRCS-yes += vpx_decoder.h
@@ -36,9 +37,11 @@ API_SRCS-yes += internal/vpx_codec_internal.h
API_SRCS-yes += internal/vpx_ratectrl_rtc.h
API_SRCS-yes += src/vpx_codec.c
API_SRCS-yes += src/vpx_image.c
+API_SRCS-yes += src/vpx_tpl.c
API_SRCS-yes += vpx_codec.h
API_SRCS-yes += vpx_codec.mk
API_SRCS-yes += vpx_frame_buffer.h
API_SRCS-yes += vpx_image.h
API_SRCS-yes += vpx_integer.h
API_SRCS-yes += vpx_ext_ratectrl.h
+API_SRCS-yes += vpx_tpl.h
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index fb95723dd..c45d1a2ba 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -31,6 +31,7 @@ extern "C" {
#include "./vpx_codec.h"
#include "./vpx_ext_ratectrl.h"
+#include "./vpx_tpl.h"
/*! Temporal Scalability: Maximum length of the sequence defining frame
* layer membership
@@ -57,9 +58,9 @@ extern "C" {
* types, removing or reassigning enums, adding/removing/rearranging
* fields to structures
*/
-#define VPX_ENCODER_ABI_VERSION \
- (16 + VPX_CODEC_ABI_VERSION + \
- VPX_EXT_RATECTRL_ABI_VERSION) /**<\hideinitializer*/
+#define VPX_ENCODER_ABI_VERSION \
+ (16 + VPX_CODEC_ABI_VERSION + VPX_EXT_RATECTRL_ABI_VERSION + \
+ VPX_TPL_ABI_VERSION) /**<\hideinitializer*/
/*! \brief Encoder capabilities bitfield
*
@@ -252,31 +253,6 @@ enum vpx_kf_mode {
VPX_KF_DISABLED = 0 /**< Encoder does not place keyframes. */
};
-/*!\brief Temporal dependency model stats for each block before propagation */
-typedef struct VpxTplBlockStats {
- int64_t intra_cost; /**< Intra cost */
- int64_t inter_cost; /**< Inter cost */
- int16_t mv_r; /**< Motion vector row */
- int16_t mv_c; /**< Motion vector col */
- int64_t recrf_rate; /**< Rate from reconstructed ref frame */
- int64_t recrf_dist; /**< Distortion from reconstructed ref frame */
- int ref_frame_index; /**< Ref frame index */
-} VpxTplBlockStats;
-
-/*!\brief Temporal dependency model stats for each frame before propagation */
-typedef struct VpxTplFrameStats {
- int frame_width; /**< Frame width */
- int frame_height; /**< Frame height */
- int num_blocks; /**< Number of blocks. Size of block_stats_list */
- VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */
-} VpxTplFrameStats;
-
-/*!\brief Temporal dependency model stats for each GOP before propagation */
-typedef struct VpxTplGopStats {
- int size; /**< GOP size, also the size of frame_stats_list. */
- VpxTplFrameStats *frame_stats_list; /**< List of tpl stats for each frame */
-} VpxTplGopStats;
-
/*!\brief Encoded Frame Flags
*
* This type indicates a bitfield to be passed to vpx_codec_encode(), defining
diff --git a/vpx/vpx_tpl.h b/vpx/vpx_tpl.h
new file mode 100644
index 000000000..50aec49eb
--- /dev/null
+++ b/vpx/vpx_tpl.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2023 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/*!\file
+ * \brief Describes the TPL stats descriptor and associated operations
+ *
+ */
+#ifndef VPX_VPX_VPX_TPL_H_
+#define VPX_VPX_VPX_TPL_H_
+
+#include <stdio.h>
+
+#include "./vpx_codec.h"
+#include "./vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*!\brief Current ABI version number
+ *
+ * \internal
+ * If this file is altered in any way that changes the ABI, this value
+ * must be bumped. Examples include, but are not limited to, changing
+ * types, removing or reassigning enums, adding/removing/rearranging
+ * fields to structures
+ */
+#define VPX_TPL_ABI_VERSION (1) /**<\hideinitializer*/
+
+/*!\brief Temporal dependency model stats for each block before propagation
+ *
+ * vpx_write_tpl_gop_stats() and vpx_read_tpl_gop_stats() serialize every
+ * field of this struct, one block per text line.
+ */
+typedef struct VpxTplBlockStats {
+ int64_t intra_cost; /**< Intra cost */
+ int64_t inter_cost; /**< Inter cost */
+ int16_t mv_r; /**< Motion vector row */
+ int16_t mv_c; /**< Motion vector col */
+ int64_t recrf_rate; /**< Rate from reconstructed ref frame */
+ int64_t recrf_dist; /**< Distortion from reconstructed ref frame */
+ int ref_frame_index; /**< Ref frame index */
+} VpxTplBlockStats;
+
+/*!\brief Temporal dependency model stats for each frame before propagation
+ *
+ * When filled in by vpx_read_tpl_gop_stats(), block_stats_list is allocated
+ * with num_blocks entries and is released by vpx_free_tpl_gop_stats().
+ */
+typedef struct VpxTplFrameStats {
+ int frame_width; /**< Frame width */
+ int frame_height; /**< Frame height */
+ int num_blocks; /**< Number of blocks. Size of block_stats_list */
+ VpxTplBlockStats *block_stats_list; /**< List of tpl stats for each block */
+} VpxTplFrameStats;
+
+/*!\brief Temporal dependency model stats for each GOP before propagation
+ *
+ * When filled in by vpx_read_tpl_gop_stats(), frame_stats_list is allocated
+ * with size entries and is released by vpx_free_tpl_gop_stats().
+ */
+typedef struct VpxTplGopStats {
+ int size; /**< GOP size, also the size of frame_stats_list. */
+ VpxTplFrameStats *frame_stats_list; /**< List of tpl stats for each frame */
+} VpxTplGopStats;
+
+/*!\brief Write VpxTplGopStats to file
+ *
+ * Accepts an opened file handle and writes \p tpl_gop_stats.
+ * The stats are written as text: the number of frames, then for each frame
+ * its width, height and block count, followed by one line per block.
+ *
+ * \param[in] tpl_file A FILE pointer that's already been opened.
+ * \param[in] tpl_gop_stats VpxTplGopStats that contains TPL stats for the
+ * whole GOP.
+ *
+ * \return VPX_CODEC_OK if TPL stats are successfully written.
+ * \retval #VPX_CODEC_INVALID_PARAM
+ * \p tpl_file or \p tpl_gop_stats is NULL.
+ * \retval #VPX_CODEC_ERROR
+ * A write to \p tpl_file failed.
+ */
+vpx_codec_err_t vpx_write_tpl_gop_stats(FILE *tpl_file,
+ const VpxTplGopStats *tpl_gop_stats);
+
+/*!\brief Read VpxTplGopStats from file
+ *
+ * Accepts an opened file handle and reads TPL stats and stores them into
+ * \p tpl_gop_stats. Allocates memory for TPL stats.
+ * After a successful read, release that memory with vpx_free_tpl_gop_stats().
+ * The expected input is the text written by vpx_write_tpl_gop_stats().
+ *
+ * \param[in] tpl_file A FILE pointer that's already been opened.
+ * \param[out] tpl_gop_stats VpxTplGopStats that contains TPL stats for the
+ * whole GOP.
+ *
+ * \return VPX_CODEC_OK if TPL stats are successfully read from file.
+ * \retval #VPX_CODEC_INVALID_PARAM
+ * \p tpl_file or \p tpl_gop_stats is NULL.
+ * \retval #VPX_CODEC_ERROR
+ * The contents of \p tpl_file could not be parsed.
+ * \retval #VPX_CODEC_MEM_ERROR
+ * Memory for the TPL stats could not be allocated.
+ */
+vpx_codec_err_t vpx_read_tpl_gop_stats(FILE *tpl_file,
+ VpxTplGopStats *tpl_gop_stats);
+
+/*!\brief Free the memory allocated for VpxTplGopStats
+ *
+ * Frees the block_stats_list of every frame and then frame_stats_list.
+ * The VpxTplGopStats object itself is not freed. A NULL \p tpl_gop_stats is
+ * ignored.
+ *
+ * \param[in] tpl_gop_stats VpxTplGopStats that contains TPL stats for the
+ * whole GOP.
+ */
+void vpx_free_tpl_gop_stats(VpxTplGopStats *tpl_gop_stats);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#endif // VPX_VPX_VPX_TPL_H_
diff --git a/vpx_dsp/arm/fdct_partial_neon.c b/vpx_dsp/arm/fdct_partial_neon.c
index 718dba0d9..df0da543c 100644
--- a/vpx_dsp/arm/fdct_partial_neon.c
+++ b/vpx_dsp/arm/fdct_partial_neon.c
@@ -37,6 +37,15 @@ void vpx_fdct4x4_1_neon(const int16_t *input, tran_low_t *output, int stride) {
output[1] = 0;
}
+// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled
+// will fail with an internal compiler error.
+// See:
+// https://developercommunity.visualstudio.com/t/Compiler-crash-C1001-when-building-a-for/10346110
+// TODO(jzern): check the compiler version after a fix for the issue is
+// released.
+#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__)
+#pragma optimize("", off)
+#endif
void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
int r;
int16x8_t sum = vld1q_s16(&input[0]);
@@ -49,6 +58,9 @@ void vpx_fdct8x8_1_neon(const int16_t *input, tran_low_t *output, int stride) {
output[0] = (tran_low_t)horizontal_add_int16x8(sum);
output[1] = 0;
}
+#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__)
+#pragma optimize("", on)
+#endif
void vpx_fdct16x16_1_neon(const int16_t *input, tran_low_t *output,
int stride) {
diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h
index 2de449546..4b946d756 100644
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h
@@ -45,9 +45,21 @@ typedef int16_t tran_low_t;
typedef int16_t tran_coef_t;
+// Visual Studio 2022 (cl.exe) targeting AArch64 with optimizations enabled
+// produces invalid code for clip_pixel() when the return type is uint8_t.
+// See:
+// https://developercommunity.visualstudio.com/t/Misoptimization-for-ARM64-in-VS-2022-17/10363361
+// TODO(jzern): check the compiler version after a fix for the issue is
+// released.
+#if defined(_MSC_VER) && defined(_M_ARM64) && !defined(__clang__)
+static INLINE int clip_pixel(int val) {
+ return (val > 255) ? 255 : (val < 0) ? 0 : val;
+}
+#else
static INLINE uint8_t clip_pixel(int val) {
return (val > 255) ? 255 : (val < 0) ? 0 : val;
}
+#endif
static INLINE int clamp(int value, int low, int high) {
return value < low ? low : (value > high ? high : value);
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 2498bba17..526c28382 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -46,7 +46,7 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[64]) = {
};
#define CALC_CONVOLVE8_HORZ_ROW \
- srcReg = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3); \
+ srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch); \
s1[0] = _mm256_shuffle_epi8(srcReg, filt[0]); \
s1[1] = _mm256_shuffle_epi8(srcReg, filt[1]); \
s1[2] = _mm256_shuffle_epi8(srcReg, filt[2]); \
@@ -60,16 +60,6 @@ DECLARE_ALIGNED(32, static const uint8_t, filt_d4_global_avx2[64]) = {
_mm256_extractf128_si256(s1[0], 1)); \
output_ptr += output_pitch;
-// 0 0 0 0 hi3 hi2 hi1 hi0 | 0 0 0 0 lo3 lo2 lo1 lo0
-static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
- // 0 0 0 0 0 0 0 0 | 0 0 0 0 lo3 lo2 lo1 lo0
- __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
-
- // 0 0 0 0 hi3 hi2 hi1 hi0 | 0 0 0 0 lo3 lo2 lo1 lo0
- a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
- return a;
-}
-
static INLINE void vpx_filter_block1d16_h8_x_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter,
@@ -93,12 +83,7 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2(
__m256i srcReg;
// load the 2 strides of source
- srcReg =
- _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr - 3)));
- srcReg = _mm256_inserti128_si256(
- srcReg,
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
- 1);
+ srcReg = mm256_loadu2_si128(src_ptr - 3, src_ptr + src_pixels_per_line - 3);
// filter the source buffer
s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
@@ -109,12 +94,7 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2(
// reading 2 strides of the next 16 bytes
// (part of it was being read by earlier read)
- srcReg =
- _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(src_ptr + 5)));
- srcReg = _mm256_inserti128_si256(
- srcReg,
- _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
- 1);
+ srcReg = mm256_loadu2_si128(src_ptr + 5, src_ptr + src_pixels_per_line + 5);
// filter the source buffer
s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
@@ -129,60 +109,37 @@ static INLINE void vpx_filter_block1d16_h8_x_avx2(
src_ptr += src_stride;
- // average if necessary
- outReg1 = _mm256_castsi256_si128(outReg32b1);
- outReg2 = _mm256_extractf128_si256(outReg32b1, 1);
if (avg) {
- outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
- outReg2 = _mm_avg_epu8(
- outReg2, _mm_load_si128((__m128i *)(output_ptr + output_pitch)));
+ const __m256i outReg = mm256_loadu2_si128(
+ (__m128i *)output_ptr, (__m128i *)(output_ptr + output_pitch));
+ outReg32b1 = _mm256_avg_epu8(outReg32b1, outReg);
}
-
- // save 16 bytes
- _mm_store_si128((__m128i *)output_ptr, outReg1);
-
- // save the next 16 bits
- _mm_store_si128((__m128i *)(output_ptr + output_pitch), outReg2);
-
+ mm256_store2_si128((__m128i *)output_ptr,
+ (__m128i *)(output_ptr + output_pitch), &outReg32b1);
output_ptr += dst_stride;
}
// if the number of strides is odd.
// process only 16 bytes
if (i > 0) {
- __m128i srcReg;
-
- // load the first 16 bytes of the last row
- srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+ const __m128i srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
+ const __m128i srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+ const __m256i srcReg =
+ _mm256_inserti128_si256(_mm256_castsi128_si256(srcReg1), srcReg2, 1);
// filter the source buffer
- s[0] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])));
- s[1] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])));
- s[2] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])));
- s[3] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])));
- outReg1 = convolve8_8_avx2(s, f);
-
- // reading the next 16 bytes
- // (part of it was being read by earlier read)
- srcReg = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
+ s[0] = _mm256_shuffle_epi8(srcReg, filt[0]);
+ s[1] = _mm256_shuffle_epi8(srcReg, filt[1]);
+ s[2] = _mm256_shuffle_epi8(srcReg, filt[2]);
+ s[3] = _mm256_shuffle_epi8(srcReg, filt[3]);
- // filter the source buffer
- s[0] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[0])));
- s[1] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[1])));
- s[2] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[2])));
- s[3] = _mm256_castsi128_si256(
- _mm_shuffle_epi8(srcReg, _mm256_castsi256_si128(filt[3])));
- outReg2 = convolve8_8_avx2(s, f);
+ // The low and high 128-bits of each lane contain the first and second
+ // convolve result respectively
+ outReg32b1 = convolve8_16_avx2(s, f);
+ outReg1 = _mm256_castsi256_si128(outReg32b1);
+ outReg2 = _mm256_extractf128_si256(outReg32b1, 1);
- // shrink to 8 bit each 16 bits, the low and high 64-bits of each lane
- // contain the first and second convolve result respectively
+ // shrink to 8 bit each 16 bits
outReg1 = _mm_packus_epi16(outReg1, outReg2);
// average if necessary
@@ -266,7 +223,6 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter,
const int avg) {
- __m128i outReg1, outReg2;
__m256i srcRegHead1;
unsigned int i;
ptrdiff_t src_stride, dst_stride;
@@ -345,19 +301,14 @@ static INLINE void vpx_filter_block1d16_v8_x_avx2(
src_ptr += src_stride;
// average if necessary
- outReg1 = _mm256_castsi256_si128(s1[0]);
- outReg2 = _mm256_extractf128_si256(s1[0], 1);
if (avg) {
- outReg1 = _mm_avg_epu8(outReg1, _mm_load_si128((__m128i *)output_ptr));
- outReg2 = _mm_avg_epu8(
- outReg2, _mm_load_si128((__m128i *)(output_ptr + out_pitch)));
+ const __m256i outReg = mm256_loadu2_si128(
+ (__m128i *)output_ptr, (__m128i *)(output_ptr + out_pitch));
+ s1[0] = _mm256_avg_epu8(s1[0], outReg);
}
- // save 16 bytes
- _mm_store_si128((__m128i *)output_ptr, outReg1);
-
- // save the next 16 bits
- _mm_store_si128((__m128i *)(output_ptr + out_pitch), outReg2);
+ mm256_store2_si128((__m128i *)output_ptr,
+ (__m128i *)(output_ptr + out_pitch), s1);
output_ptr += dst_stride;
@@ -1094,7 +1045,7 @@ static void vpx_filter_block1d4_h8_avx2(
// load the 2 strides of source
// r115 r114 ...... r15 r14 r13 r12 r11 r10 | r015 r014 r013 ...... r07
// r06 r05 r04 r03 r02 r01 r00
- srcReg32b1 = xx_loadu2_mi128(src_ptr - 3 + src_pitch, src_ptr - 3);
+ srcReg32b1 = mm256_loadu2_si128(src_ptr - 3, src_ptr - 3 + src_pitch);
// filter the source buffer
// r16 r15 r14 r13 r15 r14 r13 r12 r14 r13 r12 r11 r13 r12 r11 r10 | r06
@@ -1188,8 +1139,7 @@ static void vpx_filter_block1d4_v8_avx2(
const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
__m256i f[4], ss[4];
- __m256i r[8];
- __m128i r1[10];
+ __m256i r[9], rr[2];
__m128i s[11];
unsigned int y = output_height;
@@ -1210,48 +1160,35 @@ static void vpx_filter_block1d4_v8_avx2(
s[5] = _mm_loadl_epi64((const __m128i *)(src_ptr + 5 * src_pitch));
s[6] = _mm_loadl_epi64((const __m128i *)(src_ptr + 6 * src_pitch));
- // R1-0 xxxx .. . . x| r13 r12 r11 r10 r03 r02 r01 r00
- r1[0] = _mm_unpacklo_epi32(s[0], s[1]);
-
- // R2-1 xxxx .. . . x| r23 r22 r21 r20 r13 r12 r11 r10
- r1[1] = _mm_unpacklo_epi32(s[1], s[2]);
-
- // R3-2 xxxx .. . . x| r33 r32 r31 r30 r23 r22 r21 r20
- r1[2] = _mm_unpacklo_epi32(s[2], s[3]);
-
- // R4-3 xxxx .. . . x| r43 r42 r41 r40 r33 r32 r31 r30
- r1[3] = _mm_unpacklo_epi32(s[3], s[4]);
-
- // R5-4 xxxx .. . . x| r53 r52 r51 r50 r43 r42 r41 r40
- r1[4] = _mm_unpacklo_epi32(s[4], s[5]);
-
- // R6-5 xxxx .. . . x| r63 r62 r61 r60 r53 r52 r51 r50
- r1[5] = _mm_unpacklo_epi32(s[5], s[6]);
+ r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[0]), s[2], 1);
+ r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[1]), s[3], 1);
+ r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[2]), s[4], 1);
+ r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[3]), s[5], 1);
+ r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[4]), s[6], 1);
- // 00000000 r33 r32 r31 r30|r23 r22 r21 r20||00000000|r13 r12 r11 r10|r03 r02
- // r01 r00
- r[0] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[0]), r1[2], 1);
+ // r37.....r24..r33..r31 r30 r23 r22 r21 r20|r17....r14 r07..r05 r04 r13 r12
+ // r11 r10 r03 r02 r01 r00
+ rr[0] = _mm256_unpacklo_epi32(r[0], r[1]);
- // 00000000 r43 r42 r41 r40|r33 r32 r31 r30||00000000|r23 r22 r21 r20|r13 r12
- // r11 r10
- r[1] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[1]), r1[3], 1);
-
- // 00000000 r53 r52 r51 r50|r43 r42 r41 r40||00000000|r33 r32 r31 r30|r23 r22
- // r21 r20
- r[2] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[2]), r1[4], 1);
-
- // 00000000 r63 r62 r61 r60|r53 r52 r51 r50||00000000|r43 r42 r41 r40|r33 r32
- // r31 r30
- r[3] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[3]), r1[5], 1);
+ // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22
+ // r21 r20 r13 r12 r11 r10
+ rr[1] = _mm256_unpacklo_epi32(r[1], r[2]);
// r43 r33....r40 r30|r33 r23....r30 r20||r23 r13....r20 r10|r13 r03....r10
// r00|
- ss[0] = _mm256_unpacklo_epi8(r[0], r[1]);
+ ss[0] = _mm256_unpacklo_epi8(rr[0], rr[1]);
+
+ // r37.....r24..r33..r31 r30 r23 r22 r21 r20||r17....r14 r07..r05 r04 r13 r12
+ // r11 r10 r03 r02 r01 r00
+ rr[0] = _mm256_unpacklo_epi32(r[2], r[3]);
+
+ // r47.....r34..r43..r41 r40 r33 r32 r31 r30|r27....r24 r17..r15 r14 r23 r22
+ // r21 r20 r13 r12 r11 r10
+ rr[1] = _mm256_unpacklo_epi32(r[3], r[4]);
// r63 r53....r60 r50|r53 r43....r50 r40||r43 r33....r40 r30|r33 r23....r30
// r20|
- ss[1] = _mm256_unpacklo_epi8(r[2], r[3]);
-
+ ss[1] = _mm256_unpacklo_epi8(rr[0], rr[1]);
// Process 4 rows at a time
while (y >= 4) {
s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
@@ -1259,41 +1196,17 @@ static void vpx_filter_block1d4_v8_avx2(
s[9] = _mm_loadl_epi64((const __m128i *)(src_ptr + 9 * src_pitch));
s[10] = _mm_loadl_epi64((const __m128i *)(src_ptr + 10 * src_pitch));
- // R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60
- r1[6] = _mm_unpacklo_epi32(s[6], s[7]);
-
- // R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70
- r1[7] = _mm_unpacklo_epi32(s[7], s[8]);
-
- // R9-8 xxxx .. . . x| r93 r92 r91 r90 r83 r82 r81 r80
- r1[8] = _mm_unpacklo_epi32(s[8], s[9]);
-
- // R10-9 xxxx .. . . x| r10-3 r10-2 r10-1 r10-0 r93 r92 r91 r90
- r1[9] = _mm_unpacklo_epi32(s[9], s[10]);
-
- // 00000000 r73 r72 r71 r70|r63 r62 r61 r60||00000000|r53 r52 r51 r50|r43
- // r42 r41 r40
- r[4] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[4]), r1[6], 1);
-
- // 00000000 r83 r82 r81 r80|r73 r72 r71 r70||00000000|r63 r62 r61 r60|r53
- // r52 r51 r50
- r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[5]), r1[7], 1);
+ r[5] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[5]), s[7], 1);
+ r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[6]), s[8], 1);
+ rr[0] = _mm256_unpacklo_epi32(r[4], r[5]);
+ rr[1] = _mm256_unpacklo_epi32(r[5], r[6]);
+ ss[2] = _mm256_unpacklo_epi8(rr[0], rr[1]);
- // 00000000 r93 r92 r91 r90|r83 r82 r81 r80||00000000|r73 r72 r71 r70|r63
- // r62 r61 r60
- r[6] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[6]), r1[8], 1);
-
- // 00000000 r10-3 r10-2 r10-1 r10-0|r93 r92 r91 r90||00000000|r83 r82 r81
- // r80|r73 r72 r71 r70
- r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(r1[7]), r1[9], 1);
-
- // r83 r73....r80 r70|r73 r63....r70 r60||r63 r53....r60 r50|r53 r43....r50
- // r40|
- ss[2] = _mm256_unpacklo_epi8(r[4], r[5]);
-
- // r10-3 r10-3....r10-0 r10-0|r93 r83....r90 r80||r83 r73....r80 r70|r73
- // r63....r70 r60|
- ss[3] = _mm256_unpacklo_epi8(r[6], r[7]);
+ r[7] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[7]), s[9], 1);
+ r[8] = _mm256_inserti128_si256(_mm256_castsi128_si256(s[8]), s[10], 1);
+ rr[0] = _mm256_unpacklo_epi32(r[6], r[7]);
+ rr[1] = _mm256_unpacklo_epi32(r[7], r[8]);
+ ss[3] = _mm256_unpacklo_epi8(rr[0], rr[1]);
ss[0] = convolve8_16_avx2(ss, f);
@@ -1315,17 +1228,17 @@ static void vpx_filter_block1d4_v8_avx2(
ss[1] = ss[3];
s[6] = s[10];
+ s[5] = s[9];
- r1[4] = r1[8];
- r1[5] = r1[9];
-
+ r[4] = r[8];
y -= 4;
}
// Process 2 rows
if (y == 2) {
- __m128i ss1[4], f1[4];
+ __m128i ss1[4], f1[4], r1[4];
+ s[4] = _mm_loadl_epi64((const __m128i *)(src_ptr + 4 * src_pitch));
s[7] = _mm_loadl_epi64((const __m128i *)(src_ptr + 7 * src_pitch));
s[8] = _mm_loadl_epi64((const __m128i *)(src_ptr + 8 * src_pitch));
@@ -1334,11 +1247,14 @@ static void vpx_filter_block1d4_v8_avx2(
f1[2] = _mm256_castsi256_si128(f[2]);
f1[3] = _mm256_castsi256_si128(f[3]);
+ r1[0] = _mm_unpacklo_epi32(s[4], s[5]);
+ r1[1] = _mm_unpacklo_epi32(s[5], s[6]);
+
// R7-6 xxxx .. . . x| r73 r72 r71 r70 r63 r62 r61 r60
- r1[6] = _mm_unpacklo_epi32(s[6], s[7]);
+ r1[2] = _mm_unpacklo_epi32(s[6], s[7]);
// R8-7 xxxx .. . . x| r83 r82 r81 r80 r73 r72 r71 r70
- r1[7] = _mm_unpacklo_epi32(s[7], s[8]);
+ r1[3] = _mm_unpacklo_epi32(s[7], s[8]);
// r23 r13....r20 r10|r13 r03....r10 r00
ss1[0] = _mm256_castsi256_si128(ss[0]);
@@ -1347,10 +1263,10 @@ static void vpx_filter_block1d4_v8_avx2(
ss1[1] = _mm256_castsi256_si128(ss[1]);
// r63 r53....r60 r50|r53 r43....r50 r40
- ss1[2] = _mm_unpacklo_epi8(r1[4], r1[5]);
+ ss1[2] = _mm_unpacklo_epi8(r1[0], r1[1]);
// r83 r73....r80 r70|r73 r63....r70 r60
- ss1[3] = _mm_unpacklo_epi8(r1[6], r1[7]);
+ ss1[3] = _mm_unpacklo_epi8(r1[2], r1[3]);
ss1[0] = convolve8_8_ssse3(ss1, f1);