-rw-r--r-- build/make/Makefile 30
-rw-r--r--[-rwxr-xr-x] build/make/configure.sh 43
-rwxr-xr-x configure 23
-rw-r--r-- examples/vpx_temporal_svc_encoder.c 2
-rw-r--r-- libs.mk 8
-rw-r--r-- test/dct16x16_test.cc 25
-rwxr-xr-x test/decode_to_md5.sh 4
-rw-r--r-- test/fdct4x4_test.cc 15
-rw-r--r-- test/fdct8x8_test.cc 16
-rw-r--r-- test/test-data.sha1 3
-rw-r--r-- test/test.mk 2
-rwxr-xr-x test/tools_common.sh 32
-rw-r--r-- test/variance_test.cc 12
-rwxr-xr-x test/vp9_spatial_svc_encoder.sh 57
-rw-r--r-- test/vp9_subtract_test.cc 5
-rw-r--r-- test/vpx_scale_test.cc (renamed from test/scale_border_test.cc) 164
-rwxr-xr-x test/vpxdec.sh 32
-rw-r--r-- third_party/libmkv/EbmlIDs.h 231
-rw-r--r-- third_party/libmkv/EbmlWriter.c 157
-rw-r--r-- third_party/libmkv/EbmlWriter.h 42
-rw-r--r-- vp8/common/onyx.h 4
-rw-r--r-- vp8/encoder/denoising.c 50
-rw-r--r-- vp8/encoder/denoising.h 26
-rw-r--r-- vp8/encoder/encodeframe.c 13
-rw-r--r-- vp8/encoder/ethreading.c 15
-rw-r--r-- vp8/encoder/onyx_if.c 31
-rw-r--r-- vp8/encoder/onyx_int.h 2
-rw-r--r-- vp8/encoder/pickinter.c 10
-rw-r--r-- vp8/vp8_cx_iface.c 2
-rw-r--r-- vp8/vp8_dx_iface.c 1
-rw-r--r-- vp9/common/vp9_rtcd_defs.pl 20
-rw-r--r-- vp9/encoder/arm/neon/vp9_dct_neon.c 223
-rw-r--r-- vp9/encoder/arm/neon/vp9_quantize_neon.c 102
-rw-r--r-- vp9/encoder/arm/neon/vp9_subtract_neon.c 81
-rw-r--r-- vp9/encoder/arm/neon/vp9_variance_neon.c 129
-rw-r--r-- vp9/encoder/vp9_encodeframe.c 146
-rw-r--r-- vp9/encoder/vp9_encoder.h 4
-rw-r--r-- vp9/encoder/vp9_firstpass.c 56
-rw-r--r-- vp9/encoder/vp9_firstpass.h 14
-rw-r--r-- vp9/encoder/vp9_pickmode.c 7
-rw-r--r-- vp9/encoder/vp9_rdopt.c 3
-rw-r--r-- vp9/encoder/vp9_speed_features.c 7
-rw-r--r-- vp9/encoder/vp9_speed_features.h 2
-rw-r--r-- vp9/encoder/x86/vp9_dct_avx2.c 2566
-rw-r--r-- vp9/vp9_cx_iface.c 40
-rw-r--r-- vp9/vp9_dx_iface.c 1
-rw-r--r-- vp9/vp9cx.mk 4
-rw-r--r-- vpx/internal/vpx_codec_internal.h 1
-rw-r--r-- vpx/src/vpx_encoder.c 4
-rw-r--r-- vpx_ports/vpx_once.h 27
-rw-r--r-- vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm 233
-rw-r--r-- vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm 259
-rw-r--r-- vpx_scale/arm/neon/yv12extend_arm.c 21
-rw-r--r-- vpx_scale/vpx_scale.mk 5
-rw-r--r-- vpx_scale/vpx_scale_rtcd.pl 2
55 files changed, 1233 insertions, 3781 deletions
diff --git a/build/make/Makefile b/build/make/Makefile
index 9efa0ec02..ed90397f0 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -118,20 +118,26 @@ testdata::
utiltest:
# Add compiler flags for intrinsic files
+ifeq ($(TOOLCHAIN), x86-os2-gcc)
+STACKREALIGN=-mstackrealign
+else
+STACKREALIGN=
+endif
+
$(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx
$(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx
-$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2
-$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2
-$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3
-$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3
-$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3
-$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3
-$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1
-$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1
-$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx
-$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx
-$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2
-$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2
+$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2 $(STACKREALIGN)
+$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2 $(STACKREALIGN)
+$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3 $(STACKREALIGN)
+$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3 $(STACKREALIGN)
+$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3 $(STACKREALIGN)
+$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3 $(STACKREALIGN)
+$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1 $(STACKREALIGN)
+$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1 $(STACKREALIGN)
+$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx $(STACKREALIGN)
+$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx $(STACKREALIGN)
+$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 $(STACKREALIGN)
+$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 $(STACKREALIGN)
$(BUILD_PFX)%.c.d: %.c
$(if $(quiet),@echo " [DEP] $@")
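
Note on the Makefile hunk above: SSE and AVX code assumes a 16-byte-aligned stack, which the OS/2 ABI does not guarantee, so the x86-os2-gcc toolchain adds -mstackrealign to every intrinsics pattern rule and GCC realigns the stack in each function prologue. A minimal illustrative C sketch (not part of this commit) of the kind of code that motivates the flag:

    /* Hypothetical example: a 16-byte __m128i local may be spilled to the
     * stack with aligned stores (movdqa), which faults on an ABI that only
     * guarantees 4-byte stack alignment unless -mstackrealign is used. */
    #include <emmintrin.h>

    int sad16_sse2_sketch(const unsigned char *a, const unsigned char *b) {
      const __m128i va = _mm_loadu_si128((const __m128i *)a);
      const __m128i vb = _mm_loadu_si128((const __m128i *)b);
      const __m128i sad = _mm_sad_epu8(va, vb);  /* two partial 8-byte SADs */
      return _mm_cvtsi128_si32(sad) + _mm_extract_epi16(sad, 4);
    }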
diff --git a/build/make/configure.sh b/build/make/configure.sh
index d25f31333..ab6687f73 100755..100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -252,7 +252,7 @@ tolower(){
#
source_path=${0%/*}
enable_feature source_path_used
-if test -z "$source_path" -o "$source_path" = "." ; then
+if [ -z "$source_path" ] || [ "$source_path" = "." ]; then
source_path="`pwd`"
disable_feature source_path_used
fi
@@ -381,8 +381,8 @@ EOF
# tests for -m$1 toggling the feature given in $2. If $2 is empty $1 is used.
check_gcc_machine_option() {
- local opt="$1"
- local feature="$2"
+ opt="$1"
+ feature="$2"
[ -n "$feature" ] || feature="$opt"
if enabled gcc && ! disabled "$feature" && ! check_cflags "-m$opt"; then
@@ -419,8 +419,8 @@ true
}
write_common_target_config_mk() {
- local CC="${CC}"
- local CXX="${CXX}"
+ saved_CC="${CC}"
+ saved_CXX="${CXX}"
enabled ccache && CC="ccache ${CC}"
enabled ccache && CXX="ccache ${CXX}"
print_webm_license $1 "##" ""
@@ -470,6 +470,8 @@ EOF
enabled msvs && echo "CONFIG_VS_VERSION=${vs_version}" >> "${1}"
+ CC="${saved_CC}"
+ CXX="${saved_CXX}"
}
@@ -547,7 +549,8 @@ process_common_cmdline() {
alt_libc="${optval}"
;;
--as=*)
- [ "${optval}" = yasm -o "${optval}" = nasm -o "${optval}" = auto ] \
+ [ "${optval}" = yasm ] || [ "${optval}" = nasm ] \
+ || [ "${optval}" = auto ] \
|| die "Must be yasm, nasm or auto: ${optval}"
alt_as="${optval}"
;;
@@ -555,8 +558,8 @@ process_common_cmdline() {
w="${optval%%x*}"
h="${optval##*x}"
VAR_LIST="DECODE_WIDTH_LIMIT ${w} DECODE_HEIGHT_LIMIT ${h}"
- [ ${w} -gt 0 -a ${h} -gt 0 ] || die "Invalid size-limit: too small."
- [ ${w} -lt 65536 -a ${h} -lt 65536 ] \
+ [ ${w} -gt 0 ] && [ ${h} -gt 0 ] || die "Invalid size-limit: too small."
+ [ ${w} -lt 65536 ] && [ ${h} -lt 65536 ] \
|| die "Invalid size-limit: too big."
enable_feature size_limit
;;
@@ -1150,7 +1153,7 @@ EOF
auto|"")
which nasm >/dev/null 2>&1 && AS=nasm
which yasm >/dev/null 2>&1 && AS=yasm
- [ "${AS}" = auto -o -z "${AS}" ] \
+ [ "${AS}" = auto ] || [ -z "${AS}" ] \
&& die "Neither yasm nor nasm have been found"
;;
esac
@@ -1314,8 +1317,9 @@ process_toolchain() {
}
print_config_mk() {
- local prefix=$1
- local makefile=$2
+ saved_prefix="${prefix}"
+ prefix=$1
+ makefile=$2
shift 2
for cfg; do
if enabled $cfg; then
@@ -1323,11 +1327,13 @@ print_config_mk() {
echo "${prefix}_${upname}=yes" >> $makefile
fi
done
+ prefix="${saved_prefix}"
}
print_config_h() {
- local prefix=$1
- local header=$2
+ saved_prefix="${prefix}"
+ prefix=$1
+ header=$2
shift 2
for cfg; do
upname="`toupper $cfg`"
@@ -1337,10 +1343,11 @@ print_config_h() {
echo "#define ${prefix}_${upname} 0" >> $header
fi
done
+ prefix="${saved_prefix}"
}
print_config_vars_h() {
- local header=$1
+ header=$1
shift
while [ $# -gt 0 ]; do
upname="`toupper $1`"
@@ -1350,9 +1357,10 @@ print_config_vars_h() {
}
print_webm_license() {
- local destination=$1
- local prefix="$2"
- local suffix="$3"
+ saved_prefix="${prefix}"
+ destination=$1
+ prefix="$2"
+ suffix="$3"
shift 3
cat <<EOF > ${destination}
${prefix} Copyright (c) 2011 The WebM project authors. All Rights Reserved.${suffix}
@@ -1363,6 +1371,7 @@ ${prefix} tree. An additional intellectual property rights grant can be found${s
${prefix} in the file PATENTS. All contributing project authors may${suffix}
${prefix} be found in the AUTHORS file in the root of the source tree.${suffix}
EOF
+ prefix="${saved_prefix}"
}
process_targets() {
diff --git a/configure b/configure
index d570081d4..789e6c30a 100755
--- a/configure
+++ b/configure
@@ -67,10 +67,10 @@ Codecs:
EOF
#restore editor state '
- local family;
- local last_family;
- local c;
- local str;
+ family="";
+ last_family="";
+ c="";
+ str="";
for c in ${CODECS}; do
family=${c%_*}
if [ "${family}" != "${last_family}" ]; then
@@ -272,7 +272,6 @@ HAVE_LIST="
unistd_h
"
EXPERIMENT_LIST="
- multiple_arf
spatial_svc
vp9_temporal_denoising
fp_mb_stats
@@ -412,7 +411,7 @@ process_cmdline() {
}
post_process_cmdline() {
- local c
+ c=""
# If the codec family is disabled, disable all components of that family.
# If the codec family is enabled, enable all components of that family.
@@ -459,8 +458,8 @@ process_targets() {
enabled universal && echo "FAT_ARCHS=${fat_bin_archs}" >> config.mk
# Calculate the default distribution name, based on the enabled features
- local cf
- local DIST_DIR=vpx
+ cf=""
+ DIST_DIR=vpx
for cf in $CODEC_FAMILIES; do
if enabled ${cf}_encoder && enabled ${cf}_decoder; then
DIST_DIR="${DIST_DIR}-${cf}"
@@ -482,7 +481,7 @@ process_targets() {
;;
esac
if [ -f "${source_path}/build/make/version.sh" ]; then
- local ver=`"$source_path/build/make/version.sh" --bare "$source_path"`
+ ver=`"$source_path/build/make/version.sh" --bare "$source_path"`
DIST_DIR="${DIST_DIR}-${ver}"
VERSION_STRING=${ver}
ver=${ver%%-*}
@@ -516,7 +515,7 @@ EOF
# Write makefiles for all enabled targets
#
for tgt in libs examples docs solution; do
- local tgt_fn="$tgt-$toolchain.mk"
+ tgt_fn="$tgt-$toolchain.mk"
if enabled $tgt; then
echo "Creating makefiles for ${toolchain} ${tgt}"
@@ -555,7 +554,7 @@ process_detect() {
true;
;;
*)
- local result=false
+ result=false
for d in "$@"; do
[ -f "${d##-I}/$header" ] && result=true && break
done
@@ -604,7 +603,7 @@ process_toolchain() {
# Handle universal binaries for this architecture
case $toolchain in
universal-darwin*)
- local darwin_ver=${tgt_os##darwin}
+ darwin_ver=${tgt_os##darwin}
# Snow Leopard (10.6/darwin10) dropped support for PPC
# Include PPC support for all prior versions
diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c
index a7ad9f0c6..be3e7b2f1 100644
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -579,7 +579,7 @@ int main(int argc, char **argv) {
if (strncmp(encoder->name, "vp8", 3) == 0) {
vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
- vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYOnly);
+ vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYOnly);
} else if (strncmp(encoder->name, "vp9", 3) == 0) {
vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
diff --git a/libs.mk b/libs.mk
index 1e01639c7..25fbc2cb9 100644
--- a/libs.mk
+++ b/libs.mk
@@ -409,12 +409,16 @@ $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1
curl -L -o $@ $(call libvpx_test_data_url,$(@F))
testdata:: $(LIBVPX_TEST_DATA)
- $(qexec)if [ -x "$$(which sha1sum)" ]; then\
+ $(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\
+ [ -x "$$(which shasum)" ] && sha1sum=shasum;\
+ [ -x "$$(which sha1)" ] && sha1sum=sha1;\
+ if [ -n "$${sha1sum}" ]; then\
+ set -e;\
echo "Checking test data:";\
if [ -n "$(LIBVPX_TEST_DATA)" ]; then\
for f in $(call enabled,LIBVPX_TEST_DATA); do\
grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\
- (cd $(LIBVPX_TEST_DATA_PATH); sha1sum -c);\
+ (cd $(LIBVPX_TEST_DATA_PATH); $${sha1sum} -c);\
done; \
fi; \
else\
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index bd7ddd8b6..ee417ce2e 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -606,29 +606,4 @@ INSTANTIATE_TEST_CASE_P(
::testing::Values(
make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_ssse3, 0)));
#endif
-
-#if HAVE_AVX2
-// TODO(jzern): these prototypes can be removed after the avx2 versions are
-// reenabled in vp9_rtcd_defs.pl.
-extern "C" {
-void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride);
-void vp9_fht16x16_avx2(const int16_t *input, int16_t *output, int stride,
- int tx_type);
-}
-INSTANTIATE_TEST_CASE_P(
- DISABLED_AVX2, Trans16x16DCT,
- ::testing::Values(
- make_tuple(&vp9_fdct16x16_avx2,
- &vp9_idct16x16_256_add_c, 0)));
-INSTANTIATE_TEST_CASE_P(
- AVX2, Trans16x16HT,
- ::testing::Values(
- make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 3)));
-INSTANTIATE_TEST_CASE_P(
- DISABLED_AVX2, Trans16x16HT,
- ::testing::Values(
- make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 0),
- make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 1),
- make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 2)));
-#endif
} // namespace
diff --git a/test/decode_to_md5.sh b/test/decode_to_md5.sh
index 6cb7d0e6e..854b74f84 100755
--- a/test/decode_to_md5.sh
+++ b/test/decode_to_md5.sh
@@ -44,8 +44,8 @@ decode_to_md5() {
[ -e "${output_file}" ] || return 1
- local md5_last_frame=$(tail -n1 "${output_file}")
- local actual_md5=$(echo "${md5_last_frame% *}" | tr -d [:space:])
+ local md5_last_frame="$(tail -n1 "${output_file}" | awk '{print $1}')"
+ local actual_md5="$(echo "${md5_last_frame}" | awk '{print $1}')"
[ "${actual_md5}" = "${expected_md5}" ] || return 1
}
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index bea19f188..7c4826086 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -376,19 +376,4 @@ INSTANTIATE_TEST_CASE_P(
make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3)));
#endif
-#if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
- AVX2, Trans4x4DCT,
- ::testing::Values(
- make_tuple(&vp9_fdct4x4_avx2,
- &vp9_idct4x4_16_add_c, 0)));
-INSTANTIATE_TEST_CASE_P(
- AVX2, Trans4x4HT,
- ::testing::Values(
- make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 0),
- make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 1),
- make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 2),
- make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 3)));
-#endif
-
} // namespace
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 25908374a..567e5f698 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -337,7 +337,7 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
NEON, FwdTrans8x8DCT,
::testing::Values(
- make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_neon, 0)));
+ make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0)));
INSTANTIATE_TEST_CASE_P(
DISABLED_NEON, FwdTrans8x8HT,
::testing::Values(
@@ -367,18 +367,4 @@ INSTANTIATE_TEST_CASE_P(
::testing::Values(
make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0)));
#endif
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
- AVX2, FwdTrans8x8DCT,
- ::testing::Values(
- make_tuple(&vp9_fdct8x8_avx2, &vp9_idct8x8_64_add_c, 0)));
-INSTANTIATE_TEST_CASE_P(
- AVX2, FwdTrans8x8HT,
- ::testing::Values(
- make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 0),
- make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 1),
- make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 2),
- make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 3)));
-#endif
} // namespace
diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 98ac0e670..ee6289f1e 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1
@@ -679,4 +679,5 @@ edd86a1f5e62fd9da9a9d46078247759c2638009 tacomasmallcameramovement_640_480_30.y
9a70e8b7d14fba9234d0e51dce876635413ce444 thaloundeskmtg_640_480_30.yuv
e7d315dbf4f3928779e0dc624311196d44491d32 niklas_1280_720_30.yuv
c77e4a26616add298a05dd5d12397be22c0e40c5 vp90-2-18-resize.ivf
-c77e4a26616add298a05dd5d12397be22c0e40c5 vp90-2-18-resize.ivf
+c12918cf0a716417fba2de35c3fc5ab90e52dfce vp90-2-18-resize.ivf.md5
+717da707afcaa1f692ff1946f291054eb75a4f06 screendata.y4m
diff --git a/test/test.mk b/test/test.mk
index 53d40572a..057e1e8e3 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -99,8 +99,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc
LIBVPX_TEST_SRCS-yes += idct_test.cc
LIBVPX_TEST_SRCS-yes += intrapred_test.cc
-LIBVPX_TEST_SRCS-yes += scale_border_test.cc
LIBVPX_TEST_SRCS-yes += sixtap_predict_test.cc
+LIBVPX_TEST_SRCS-yes += vpx_scale_test.cc
endif # VP8
diff --git a/test/tools_common.sh b/test/tools_common.sh
index 3e69c3687..e98beadf8 100755
--- a/test/tools_common.sh
+++ b/test/tools_common.sh
@@ -182,38 +182,6 @@ webm_io_available() {
[ "$(vpx_config_option_enabled CONFIG_WEBM_IO)" = "yes" ] && echo yes
}
-# Echoes yes to stdout when vpxdec exists according to vpx_tool_available().
-vpxdec_available() {
- [ -n $(vpx_tool_available vpxdec) ] && echo yes
-}
-
-# Wrapper function for running vpxdec in noblit mode. Requires that
-# LIBVPX_BIN_PATH points to the directory containing vpxdec. Positional
-# parameter one is used as the input file path. Positional parameter two, when
-# present, is interpreted as a boolean flag that means the input should be sent
-# to vpxdec via pipe from cat instead of directly.
-vpxdec() {
- local input="${1}"
- local pipe_input=${2}
-
- if [ $# -gt 2 ]; then
- # shift away $1 and $2 so the remaining arguments can be passed to vpxdec
- # via $@.
- shift 2
- fi
-
- local decoder="${LIBVPX_BIN_PATH}/vpxdec${VPX_TEST_EXE_SUFFIX}"
-
- if [ -z "${pipe_input}" ]; then
- eval "${VPX_TEST_PREFIX}" "${decoder}" "$input" --summary --noblit "$@" \
- ${devnull}
- else
- cat "${input}" \
- | eval "${VPX_TEST_PREFIX}" "${decoder}" - --summary --noblit "$@" \
- ${devnull}
- fi
-}
-
# Echoes yes to stdout when vpxenc exists according to vpx_tool_available().
vpxenc_available() {
[ -n $(vpx_tool_available vpxenc) ] && echo yes
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 9dc7c6a45..83b7435e6 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -756,6 +756,18 @@ INSTANTIATE_TEST_CASE_P(
::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2),
make_tuple(6, 6, subpel_avg_variance64x64_avx2)));
#endif // HAVE_AVX2
+#if HAVE_NEON
+const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon;
+INSTANTIATE_TEST_CASE_P(
+ NEON, VP9VarianceTest,
+ ::testing::Values(make_tuple(4, 4, variance16x16_neon)));
+
+const vp9_subpixvariance_fn_t subpel_variance16x16_neon =
+ vp9_sub_pixel_variance16x16_neon;
+INSTANTIATE_TEST_CASE_P(
+ NEON, VP9SubpelVarianceTest,
+ ::testing::Values(make_tuple(4, 4, subpel_variance16x16_neon)));
+#endif // HAVE_NEON
#endif // CONFIG_VP9_ENCODER
} // namespace vp9
diff --git a/test/vp9_spatial_svc_encoder.sh b/test/vp9_spatial_svc_encoder.sh
index a5728f677..7a964a920 100755
--- a/test/vp9_spatial_svc_encoder.sh
+++ b/test/vp9_spatial_svc_encoder.sh
@@ -25,12 +25,13 @@ vp9_spatial_svc_encoder_verify_environment() {
# Runs vp9_spatial_svc_encoder. $1 is the test name.
vp9_spatial_svc_encoder() {
- local encoder="${LIBVPX_BIN_PATH}/vp9_spatial_svc_encoder"
- encoder="${encoder}${VPX_TEST_EXE_SUFFIX}"
- local test_name="$1"
- local output_file="${VPX_TEST_OUTPUT_DIR}/vp9_ssvc_encoder${test_name}.ivf"
- local frames_to_encode="10"
- local max_kf="9999"
+ local readonly \
+ encoder="${LIBVPX_BIN_PATH}/vp9_spatial_svc_encoder${VPX_TEST_EXE_SUFFIX}"
+ local readonly test_name="$1"
+ local readonly \
+ output_file="${VPX_TEST_OUTPUT_DIR}/vp9_ssvc_encoder${test_name}.ivf"
+ local readonly frames_to_encode=10
+ local readonly max_kf=9999
shift
@@ -40,8 +41,8 @@ vp9_spatial_svc_encoder() {
fi
eval "${VPX_TEST_PREFIX}" "${encoder}" -w "${YUV_RAW_INPUT_WIDTH}" \
- -h "${YUV_RAW_INPUT_HEIGHT}" -k "${max_kf}" -f "${frames_to_encode}" \
- "$@" "${YUV_RAW_INPUT}" "${output_file}" ${devnull}
+ -h "${YUV_RAW_INPUT_HEIGHT}" -k "${max_kf}" -f "${frames_to_encode}" \
+ "$@" "${YUV_RAW_INPUT}" "${output_file}" ${devnull}
[ -e "${output_file}" ] || return 1
}
@@ -49,43 +50,57 @@ vp9_spatial_svc_encoder() {
# Each mode is run with layer count 1-$vp9_ssvc_test_layers.
vp9_ssvc_test_layers=5
-vp9_spatial_svc_mode_i() {
+DISABLED_vp9_spatial_svc_mode_i() {
if [ "$(vp9_encode_available)" = "yes" ]; then
- local test_name="${FUNCNAME}"
+ local readonly test_name="DISABLED_vp9_spatial_svc_mode_i"
for layers in $(seq 1 ${vp9_ssvc_test_layers}); do
vp9_spatial_svc_encoder "${test_name}" -m i -l ${layers}
done
fi
}
-vp9_spatial_svc_mode_altip() {
+DISABLED_vp9_spatial_svc_mode_altip() {
if [ "$(vp9_encode_available)" = "yes" ]; then
- local test_name="${FUNCNAME}"
+ local readonly test_name="DISABLED_vp9_spatial_svc_mode_altip"
for layers in $(seq 1 ${vp9_ssvc_test_layers}); do
vp9_spatial_svc_encoder "${test_name}" -m "alt-ip" -l ${layers}
done
fi
}
-vp9_spatial_svc_mode_ip() {
+DISABLED_vp9_spatial_svc_mode_ip() {
if [ "$(vp9_encode_available)" = "yes" ]; then
- local test_name="${FUNCNAME}"
+ local readonly test_name="DISABLED_vp9_spatial_svc_mode_ip"
vp9_spatial_svc_encoder "${test_name}" -m ip -l 1
fi
}
-vp9_spatial_svc_mode_gf() {
+DISABLED_vp9_spatial_svc_mode_gf() {
if [ "$(vp9_encode_available)" = "yes" ]; then
- local test_name="${FUNCNAME}"
+ local readonly test_name="DISABLED_vp9_spatial_svc_mode_gf"
for layers in $(seq 1 ${vp9_ssvc_test_layers}); do
vp9_spatial_svc_encoder "${test_name}" -m gf -l ${layers}
done
fi
}
-vp9_spatial_svc_tests="vp9_spatial_svc_mode_i
- vp9_spatial_svc_mode_altip
- vp9_spatial_svc_mode_ip
- vp9_spatial_svc_mode_gf"
+vp9_spatial_svc() {
+ if [ "$(vp9_encode_available)" = "yes" ]; then
+ local readonly test_name="vp9_spatial_svc"
+ for layers in $(seq 1 ${vp9_ssvc_test_layers}); do
+ vp9_spatial_svc_encoder "${test_name}" -l ${layers}
+ done
+ fi
+}
+
+readonly vp9_spatial_svc_tests="DISABLED_vp9_spatial_svc_mode_i
+ DISABLED_vp9_spatial_svc_mode_altip
+ DISABLED_vp9_spatial_svc_mode_ip
+ DISABLED_vp9_spatial_svc_mode_gf
+ vp9_spatial_svc"
-run_tests vp9_spatial_svc_encoder_verify_environment "${vp9_spatial_svc_tests}"
+if [ "$(vpx_config_option_enabled CONFIG_SPATIAL_SVC)" = "yes" ]; then
+ run_tests \
+ vp9_spatial_svc_encoder_verify_environment \
+ "${vp9_spatial_svc_tests}"
+fi
diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc
index 2e9676333..fabb43824 100644
--- a/test/vp9_subtract_test.cc
+++ b/test/vp9_subtract_test.cc
@@ -95,4 +95,9 @@ INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest,
INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest,
::testing::Values(vp9_subtract_block_sse2));
#endif
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, VP9SubtractBlockTest,
+ ::testing::Values(vp9_subtract_block_neon));
+#endif
+
} // namespace vp9
diff --git a/test/scale_border_test.cc b/test/vpx_scale_test.cc
index cc9a69a7d..b3302d942 100644
--- a/test/scale_border_test.cc
+++ b/test/vpx_scale_test.cc
@@ -21,11 +21,12 @@
namespace {
typedef void (*ExtendFrameBorderFunc)(YV12_BUFFER_CONFIG *ybf);
+typedef void (*CopyFrameFunc)(const YV12_BUFFER_CONFIG *src_ybf,
+ YV12_BUFFER_CONFIG *dst_ybf);
-class ExtendBorderTest
- : public ::testing::TestWithParam<ExtendFrameBorderFunc> {
+class VpxScaleBase {
public:
- virtual ~ExtendBorderTest() {
+ virtual ~VpxScaleBase() {
libvpx_test::ClearSystemState();
}
@@ -35,7 +36,6 @@ class ExtendBorderTest
vpx_memset(&img_, 0, sizeof(img_));
ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&img_, width_, height_,
VP8BORDERINPIXELS));
-
vpx_memset(img_.buffer_alloc, kBufFiller, img_.frame_size);
FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height,
img_.y_stride);
@@ -47,31 +47,25 @@ class ExtendBorderTest
vpx_memset(&ref_img_, 0, sizeof(ref_img_));
ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&ref_img_, width_, height_,
VP8BORDERINPIXELS));
-
vpx_memset(ref_img_.buffer_alloc, kBufFiller, ref_img_.frame_size);
- FillPlane(ref_img_.y_buffer, ref_img_.y_crop_width, ref_img_.y_crop_height,
- ref_img_.y_stride);
- FillPlane(ref_img_.u_buffer,
- ref_img_.uv_crop_width, ref_img_.uv_crop_height,
- ref_img_.uv_stride);
- FillPlane(ref_img_.v_buffer,
- ref_img_.uv_crop_width, ref_img_.uv_crop_height,
- ref_img_.uv_stride);
+
+ vpx_memset(&cpy_img_, 0, sizeof(cpy_img_));
+ ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&cpy_img_, width_, height_,
+ VP8BORDERINPIXELS));
+ vpx_memset(cpy_img_.buffer_alloc, kBufFiller, cpy_img_.frame_size);
+ ReferenceCopyFrame();
}
void DeallocImage() {
vp8_yv12_de_alloc_frame_buffer(&img_);
vp8_yv12_de_alloc_frame_buffer(&ref_img_);
+ vp8_yv12_de_alloc_frame_buffer(&cpy_img_);
}
- private:
+ protected:
static const int kBufFiller = 123;
static const int kBufMax = kBufFiller - 1;
- virtual void SetUp() {
- extend_fn_ = GetParam();
- }
-
static void FillPlane(uint8_t *buf, int width, int height, int stride) {
for (int y = 0; y < height; ++y) {
for (int x = 0; x < width; ++x) {
@@ -80,24 +74,6 @@ class ExtendBorderTest
}
}
- void ReferenceExtendBorder() {
- ExtendPlane(ref_img_.y_buffer,
- ref_img_.y_crop_width, ref_img_.y_crop_height,
- ref_img_.y_width, ref_img_.y_height,
- ref_img_.y_stride,
- ref_img_.border);
- ExtendPlane(ref_img_.u_buffer,
- ref_img_.uv_crop_width, ref_img_.uv_crop_height,
- ref_img_.uv_width, ref_img_.uv_height,
- ref_img_.uv_stride,
- ref_img_.border / 2);
- ExtendPlane(ref_img_.v_buffer,
- ref_img_.uv_crop_width, ref_img_.uv_crop_height,
- ref_img_.uv_width, ref_img_.uv_height,
- ref_img_.uv_stride,
- ref_img_.border / 2);
- }
-
static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height,
int width, int height, int stride, int padding) {
// Copy the outermost visible pixel to a distance of at least 'padding.'
@@ -136,17 +112,75 @@ class ExtendBorderTest
}
}
- void ExtendBorder() {
- ASM_REGISTER_STATE_CHECK(extend_fn_(&img_));
+ void ReferenceExtendBorder() {
+ ExtendPlane(ref_img_.y_buffer,
+ ref_img_.y_crop_width, ref_img_.y_crop_height,
+ ref_img_.y_width, ref_img_.y_height,
+ ref_img_.y_stride,
+ ref_img_.border);
+ ExtendPlane(ref_img_.u_buffer,
+ ref_img_.uv_crop_width, ref_img_.uv_crop_height,
+ ref_img_.uv_width, ref_img_.uv_height,
+ ref_img_.uv_stride,
+ ref_img_.border / 2);
+ ExtendPlane(ref_img_.v_buffer,
+ ref_img_.uv_crop_width, ref_img_.uv_crop_height,
+ ref_img_.uv_width, ref_img_.uv_height,
+ ref_img_.uv_stride,
+ ref_img_.border / 2);
}
- void CompareImages() {
+ void ReferenceCopyFrame() {
+ // Copy img_ to ref_img_ and extend frame borders. This will be used for
+ // verifying extend_fn_ as well as copy_frame_fn_.
EXPECT_EQ(ref_img_.frame_size, img_.frame_size);
- EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, img_.buffer_alloc,
+ for (int y = 0; y < img_.y_crop_height; ++y) {
+ for (int x = 0; x < img_.y_crop_width; ++x) {
+ ref_img_.y_buffer[x + y * ref_img_.y_stride] =
+ img_.y_buffer[x + y * img_.y_stride];
+ }
+ }
+
+ for (int y = 0; y < img_.uv_crop_height; ++y) {
+ for (int x = 0; x < img_.uv_crop_width; ++x) {
+ ref_img_.u_buffer[x + y * ref_img_.uv_stride] =
+ img_.u_buffer[x + y * img_.uv_stride];
+ ref_img_.v_buffer[x + y * ref_img_.uv_stride] =
+ img_.v_buffer[x + y * img_.uv_stride];
+ }
+ }
+
+ ReferenceExtendBorder();
+ }
+
+ void CompareImages(const YV12_BUFFER_CONFIG actual) {
+ EXPECT_EQ(ref_img_.frame_size, actual.frame_size);
+ EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc,
ref_img_.frame_size));
}
+ YV12_BUFFER_CONFIG img_;
+ YV12_BUFFER_CONFIG ref_img_;
+ YV12_BUFFER_CONFIG cpy_img_;
+ int width_;
+ int height_;
+};
+
+class ExtendBorderTest
+ : public VpxScaleBase,
+ public ::testing::TestWithParam<ExtendFrameBorderFunc> {
+ public:
+ virtual ~ExtendBorderTest() {}
+
protected:
+ virtual void SetUp() {
+ extend_fn_ = GetParam();
+ }
+
+ void ExtendBorder() {
+ ASM_REGISTER_STATE_CHECK(extend_fn_(&img_));
+ }
+
void RunTest() {
#if ARCH_ARM
// Some arm devices OOM when trying to allocate the largest buffers.
@@ -160,17 +194,13 @@ class ExtendBorderTest
ResetImage(kSizesToTest[w], kSizesToTest[h]);
ExtendBorder();
ReferenceExtendBorder();
- CompareImages();
+ CompareImages(img_);
DeallocImage();
}
}
}
- YV12_BUFFER_CONFIG img_;
- YV12_BUFFER_CONFIG ref_img_;
ExtendFrameBorderFunc extend_fn_;
- int width_;
- int height_;
};
TEST_P(ExtendBorderTest, ExtendBorder) {
@@ -179,4 +209,48 @@ TEST_P(ExtendBorderTest, ExtendBorder) {
INSTANTIATE_TEST_CASE_P(C, ExtendBorderTest,
::testing::Values(vp8_yv12_extend_frame_borders_c));
+
+class CopyFrameTest
+ : public VpxScaleBase,
+ public ::testing::TestWithParam<CopyFrameFunc> {
+ public:
+ virtual ~CopyFrameTest() {}
+
+ protected:
+ virtual void SetUp() {
+ copy_frame_fn_ = GetParam();
+ }
+
+ void CopyFrame() {
+ ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &cpy_img_));
+ }
+
+ void RunTest() {
+#if ARCH_ARM
+ // Some arm devices OOM when trying to allocate the largest buffers.
+ static const int kNumSizesToTest = 6;
+#else
+ static const int kNumSizesToTest = 7;
+#endif
+ static const int kSizesToTest[] = {1, 15, 33, 145, 512, 1025, 16383};
+ for (int h = 0; h < kNumSizesToTest; ++h) {
+ for (int w = 0; w < kNumSizesToTest; ++w) {
+ ResetImage(kSizesToTest[w], kSizesToTest[h]);
+ ReferenceCopyFrame();
+ CopyFrame();
+ CompareImages(cpy_img_);
+ DeallocImage();
+ }
+ }
+ }
+
+ CopyFrameFunc copy_frame_fn_;
+};
+
+TEST_P(CopyFrameTest, CopyFrame) {
+ ASSERT_NO_FATAL_FAILURE(RunTest());
+}
+
+INSTANTIATE_TEST_CASE_P(C, CopyFrameTest,
+ ::testing::Values(vp8_yv12_copy_frame_c));
} // namespace
diff --git a/test/vpxdec.sh b/test/vpxdec.sh
index 093230b69..836b13cce 100755
--- a/test/vpxdec.sh
+++ b/test/vpxdec.sh
@@ -22,6 +22,32 @@ vpxdec_verify_environment() {
fi
}
+# Echoes yes to stdout when vpxdec exists according to vpx_tool_available().
+vpxdec_available() {
+ [ -n "$(vpx_tool_available vpxdec)" ] && echo yes
+}
+
+# Wrapper function for running vpxdec with pipe input. Requires that
+# LIBVPX_BIN_PATH points to the directory containing vpxdec. $1 is used as the
+# input file path and shifted away. All remaining parameters are passed through
+# to vpxdec.
+vpxdec_pipe() {
+ local decoder="${LIBVPX_BIN_PATH}/vpxdec${VPX_TEST_EXE_SUFFIX}"
+ local input="$1"
+ shift
+ cat "${input}" | eval "${VPX_TEST_PREFIX}" "${decoder}" - "$@" ${devnull}
+}
+
+# Wrapper function for running vpxdec. Requires that LIBVPX_BIN_PATH points to
+# the directory containing vpxdec. $1 is used as the input file path and
+# shifted away. All remaining parameters are passed through to vpxdec.
+vpxdec() {
+ local decoder="${LIBVPX_BIN_PATH}/vpxdec${VPX_TEST_EXE_SUFFIX}"
+ local input="${1}"
+ shift
+ eval "${VPX_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull}
+}
+
vpxdec_can_decode_vp8() {
if [ "$(vpxdec_available)" = "yes" ] && \
[ "$(vp8_decode_available)" = "yes" ]; then
@@ -38,20 +64,20 @@ vpxdec_can_decode_vp9() {
vpxdec_vp8_ivf() {
if [ "$(vpxdec_can_decode_vp8)" = "yes" ]; then
- vpxdec "${VP8_IVF_FILE}"
+ vpxdec "${VP8_IVF_FILE}" --summary --noblit
fi
}
vpxdec_vp8_ivf_pipe_input() {
if [ "$(vpxdec_can_decode_vp8)" = "yes" ]; then
- vpxdec "${VP8_IVF_FILE}" -
+ vpxdec_pipe "${VP8_IVF_FILE}" --summary --noblit
fi
}
vpxdec_vp9_webm() {
if [ "$(vpxdec_can_decode_vp9)" = "yes" ] && \
[ "$(webm_io_available)" = "yes" ]; then
- vpxdec "${VP9_WEBM_FILE}"
+ vpxdec "${VP9_WEBM_FILE}" --summary --noblit
fi
}
diff --git a/third_party/libmkv/EbmlIDs.h b/third_party/libmkv/EbmlIDs.h
deleted file mode 100644
index 44d438583..000000000
--- a/third_party/libmkv/EbmlIDs.h
+++ /dev/null
@@ -1,231 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#ifndef MKV_DEFS_HPP
-#define MKV_DEFS_HPP 1
-
-/* Commenting out values not available in webm, but available in matroska */
-
-enum mkv {
- EBML = 0x1A45DFA3,
- EBMLVersion = 0x4286,
- EBMLReadVersion = 0x42F7,
- EBMLMaxIDLength = 0x42F2,
- EBMLMaxSizeLength = 0x42F3,
- DocType = 0x4282,
- DocTypeVersion = 0x4287,
- DocTypeReadVersion = 0x4285,
-/* CRC_32 = 0xBF, */
- Void = 0xEC,
- SignatureSlot = 0x1B538667,
- SignatureAlgo = 0x7E8A,
- SignatureHash = 0x7E9A,
- SignaturePublicKey = 0x7EA5,
- Signature = 0x7EB5,
- SignatureElements = 0x7E5B,
- SignatureElementList = 0x7E7B,
- SignedElement = 0x6532,
- /* segment */
- Segment = 0x18538067,
- /* Meta Seek Information */
- SeekHead = 0x114D9B74,
- Seek = 0x4DBB,
- SeekID = 0x53AB,
- SeekPosition = 0x53AC,
- /* Segment Information */
- Info = 0x1549A966,
-/* SegmentUID = 0x73A4, */
-/* SegmentFilename = 0x7384, */
-/* PrevUID = 0x3CB923, */
-/* PrevFilename = 0x3C83AB, */
-/* NextUID = 0x3EB923, */
-/* NextFilename = 0x3E83BB, */
-/* SegmentFamily = 0x4444, */
-/* ChapterTranslate = 0x6924, */
-/* ChapterTranslateEditionUID = 0x69FC, */
-/* ChapterTranslateCodec = 0x69BF, */
-/* ChapterTranslateID = 0x69A5, */
- TimecodeScale = 0x2AD7B1,
- Segment_Duration = 0x4489,
- DateUTC = 0x4461,
-/* Title = 0x7BA9, */
- MuxingApp = 0x4D80,
- WritingApp = 0x5741,
- /* Cluster */
- Cluster = 0x1F43B675,
- Timecode = 0xE7,
-/* SilentTracks = 0x5854, */
-/* SilentTrackNumber = 0x58D7, */
-/* Position = 0xA7, */
- PrevSize = 0xAB,
- BlockGroup = 0xA0,
- Block = 0xA1,
-/* BlockVirtual = 0xA2, */
- BlockAdditions = 0x75A1,
- BlockMore = 0xA6,
- BlockAddID = 0xEE,
- BlockAdditional = 0xA5,
- BlockDuration = 0x9B,
-/* ReferencePriority = 0xFA, */
- ReferenceBlock = 0xFB,
-/* ReferenceVirtual = 0xFD, */
-/* CodecState = 0xA4, */
-/* Slices = 0x8E, */
-/* TimeSlice = 0xE8, */
- LaceNumber = 0xCC,
-/* FrameNumber = 0xCD, */
-/* BlockAdditionID = 0xCB, */
-/* MkvDelay = 0xCE, */
-/* Cluster_Duration = 0xCF, */
- SimpleBlock = 0xA3,
-/* EncryptedBlock = 0xAF, */
- /* Track */
- Tracks = 0x1654AE6B,
- TrackEntry = 0xAE,
- TrackNumber = 0xD7,
- TrackUID = 0x73C5,
- TrackType = 0x83,
- FlagEnabled = 0xB9,
- FlagDefault = 0x88,
- FlagForced = 0x55AA,
- FlagLacing = 0x9C,
-/* MinCache = 0x6DE7, */
-/* MaxCache = 0x6DF8, */
- DefaultDuration = 0x23E383,
-/* TrackTimecodeScale = 0x23314F, */
-/* TrackOffset = 0x537F, */
- MaxBlockAdditionID = 0x55EE,
- Name = 0x536E,
- Language = 0x22B59C,
- CodecID = 0x86,
- CodecPrivate = 0x63A2,
- CodecName = 0x258688,
-/* AttachmentLink = 0x7446, */
-/* CodecSettings = 0x3A9697, */
-/* CodecInfoURL = 0x3B4040, */
-/* CodecDownloadURL = 0x26B240, */
-/* CodecDecodeAll = 0xAA, */
-/* TrackOverlay = 0x6FAB, */
-/* TrackTranslate = 0x6624, */
-/* TrackTranslateEditionUID = 0x66FC, */
-/* TrackTranslateCodec = 0x66BF, */
-/* TrackTranslateTrackID = 0x66A5, */
- /* video */
- Video = 0xE0,
- FlagInterlaced = 0x9A,
- StereoMode = 0x53B8,
- AlphaMode = 0x53C0,
- PixelWidth = 0xB0,
- PixelHeight = 0xBA,
- PixelCropBottom = 0x54AA,
- PixelCropTop = 0x54BB,
- PixelCropLeft = 0x54CC,
- PixelCropRight = 0x54DD,
- DisplayWidth = 0x54B0,
- DisplayHeight = 0x54BA,
- DisplayUnit = 0x54B2,
- AspectRatioType = 0x54B3,
-/* ColourSpace = 0x2EB524, */
-/* GammaValue = 0x2FB523, */
- FrameRate = 0x2383E3,
- /* end video */
- /* audio */
- Audio = 0xE1,
- SamplingFrequency = 0xB5,
- OutputSamplingFrequency = 0x78B5,
- Channels = 0x9F,
-/* ChannelPositions = 0x7D7B, */
- BitDepth = 0x6264,
- /* end audio */
- /* content encoding */
-/* ContentEncodings = 0x6d80, */
-/* ContentEncoding = 0x6240, */
-/* ContentEncodingOrder = 0x5031, */
-/* ContentEncodingScope = 0x5032, */
-/* ContentEncodingType = 0x5033, */
-/* ContentCompression = 0x5034, */
-/* ContentCompAlgo = 0x4254, */
-/* ContentCompSettings = 0x4255, */
-/* ContentEncryption = 0x5035, */
-/* ContentEncAlgo = 0x47e1, */
-/* ContentEncKeyID = 0x47e2, */
-/* ContentSignature = 0x47e3, */
-/* ContentSigKeyID = 0x47e4, */
-/* ContentSigAlgo = 0x47e5, */
-/* ContentSigHashAlgo = 0x47e6, */
- /* end content encoding */
- /* Cueing Data */
- Cues = 0x1C53BB6B,
- CuePoint = 0xBB,
- CueTime = 0xB3,
- CueTrackPositions = 0xB7,
- CueTrack = 0xF7,
- CueClusterPosition = 0xF1,
- CueBlockNumber = 0x5378
-/* CueCodecState = 0xEA, */
-/* CueReference = 0xDB, */
-/* CueRefTime = 0x96, */
-/* CueRefCluster = 0x97, */
-/* CueRefNumber = 0x535F, */
-/* CueRefCodecState = 0xEB, */
- /* Attachment */
-/* Attachments = 0x1941A469, */
-/* AttachedFile = 0x61A7, */
-/* FileDescription = 0x467E, */
-/* FileName = 0x466E, */
-/* FileMimeType = 0x4660, */
-/* FileData = 0x465C, */
-/* FileUID = 0x46AE, */
-/* FileReferral = 0x4675, */
- /* Chapters */
-/* Chapters = 0x1043A770, */
-/* EditionEntry = 0x45B9, */
-/* EditionUID = 0x45BC, */
-/* EditionFlagHidden = 0x45BD, */
-/* EditionFlagDefault = 0x45DB, */
-/* EditionFlagOrdered = 0x45DD, */
-/* ChapterAtom = 0xB6, */
-/* ChapterUID = 0x73C4, */
-/* ChapterTimeStart = 0x91, */
-/* ChapterTimeEnd = 0x92, */
-/* ChapterFlagHidden = 0x98, */
-/* ChapterFlagEnabled = 0x4598, */
-/* ChapterSegmentUID = 0x6E67, */
-/* ChapterSegmentEditionUID = 0x6EBC, */
-/* ChapterPhysicalEquiv = 0x63C3, */
-/* ChapterTrack = 0x8F, */
-/* ChapterTrackNumber = 0x89, */
-/* ChapterDisplay = 0x80, */
-/* ChapString = 0x85, */
-/* ChapLanguage = 0x437C, */
-/* ChapCountry = 0x437E, */
-/* ChapProcess = 0x6944, */
-/* ChapProcessCodecID = 0x6955, */
-/* ChapProcessPrivate = 0x450D, */
-/* ChapProcessCommand = 0x6911, */
-/* ChapProcessTime = 0x6922, */
-/* ChapProcessData = 0x6933, */
- /* Tagging */
-/* Tags = 0x1254C367, */
-/* Tag = 0x7373, */
-/* Targets = 0x63C0, */
-/* TargetTypeValue = 0x68CA, */
-/* TargetType = 0x63CA, */
-/* Tagging_TrackUID = 0x63C5, */
-/* Tagging_EditionUID = 0x63C9, */
-/* Tagging_ChapterUID = 0x63C4, */
-/* AttachmentUID = 0x63C6, */
-/* SimpleTag = 0x67C8, */
-/* TagName = 0x45A3, */
-/* TagLanguage = 0x447A, */
-/* TagDefault = 0x4484, */
-/* TagString = 0x4487, */
-/* TagBinary = 0x4485, */
-};
-#endif
diff --git a/third_party/libmkv/EbmlWriter.c b/third_party/libmkv/EbmlWriter.c
deleted file mode 100644
index 27cfe861c..000000000
--- a/third_party/libmkv/EbmlWriter.c
+++ /dev/null
@@ -1,157 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#include "EbmlWriter.h"
-#include <stdlib.h>
-#include <wchar.h>
-#include <string.h>
-#include <limits.h>
-#if defined(_MSC_VER)
-#define LITERALU64(n) n
-#else
-#define LITERALU64(n) n##LLU
-#endif
-
-void Ebml_WriteLen(EbmlGlobal *glob, int64_t val) {
- /* TODO check and make sure we are not > than 0x0100000000000000LLU */
- unsigned char size = 8; /* size in bytes to output */
-
- /* mask to compare for byte size */
- int64_t minVal = 0xff;
-
- for (size = 1; size < 8; size ++) {
- if (val < minVal)
- break;
-
- minVal = (minVal << 7);
- }
-
- val |= (((uint64_t)0x80) << ((size - 1) * 7));
-
- Ebml_Serialize(glob, (void *) &val, sizeof(val), size);
-}
-
-void Ebml_WriteString(EbmlGlobal *glob, const char *str) {
- const size_t size_ = strlen(str);
- const uint64_t size = size_;
- Ebml_WriteLen(glob, size);
- /* TODO: it's not clear from the spec whether the nul terminator
- * should be serialized too. For now we omit the null terminator.
- */
- Ebml_Write(glob, str, (unsigned long)size);
-}
-
-void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr) {
- const size_t strlen = wcslen(wstr);
-
- /* TODO: it's not clear from the spec whether the nul terminator
- * should be serialized too. For now we include it.
- */
- const uint64_t size = strlen;
-
- Ebml_WriteLen(glob, size);
- Ebml_Write(glob, wstr, (unsigned long)size);
-}
-
-void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id) {
- int len;
-
- if (class_id >= 0x01000000)
- len = 4;
- else if (class_id >= 0x00010000)
- len = 3;
- else if (class_id >= 0x00000100)
- len = 2;
- else
- len = 1;
-
- Ebml_Serialize(glob, (void *)&class_id, sizeof(class_id), len);
-}
-
-void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui) {
- unsigned char sizeSerialized = 8 | 0x80;
- Ebml_WriteID(glob, class_id);
- Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
- Ebml_Serialize(glob, &ui, sizeof(ui), 8);
-}
-
-void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui) {
- unsigned char size = 8; /* size in bytes to output */
- unsigned char sizeSerialized = 0;
- unsigned long minVal;
-
- Ebml_WriteID(glob, class_id);
- minVal = 0x7fLU; /* mask to compare for byte size */
-
- for (size = 1; size < 4; size ++) {
- if (ui < minVal) {
- break;
- }
-
- minVal <<= 7;
- }
-
- sizeSerialized = 0x80 | size;
- Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1);
- Ebml_Serialize(glob, &ui, sizeof(ui), size);
-}
-/* TODO: perhaps this is a poor name for this id serializer helper function */
-void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long bin) {
- int size;
- for (size = 4; size > 1; size--) {
- if (bin & (unsigned int)0x000000ff << ((size - 1) * 8))
- break;
- }
- Ebml_WriteID(glob, class_id);
- Ebml_WriteLen(glob, size);
- Ebml_WriteID(glob, bin);
-}
-
-void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d) {
- unsigned char len = 0x88;
-
- Ebml_WriteID(glob, class_id);
- Ebml_Serialize(glob, &len, sizeof(len), 1);
- Ebml_Serialize(glob, &d, sizeof(d), 8);
-}
-
-void Ebml_WriteSigned16(EbmlGlobal *glob, short val) {
- signed long out = ((val & 0x003FFFFF) | 0x00200000) << 8;
- Ebml_Serialize(glob, &out, sizeof(out), 3);
-}
-
-void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s) {
- Ebml_WriteID(glob, class_id);
- Ebml_WriteString(glob, s);
-}
-
-void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s) {
- Ebml_WriteID(glob, class_id);
- Ebml_WriteUTF8(glob, s);
-}
-
-void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length) {
- Ebml_WriteID(glob, class_id);
- Ebml_WriteLen(glob, data_length);
- Ebml_Write(glob, data, data_length);
-}
-
-void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize) {
- unsigned char tmp = 0;
- unsigned long i = 0;
-
- Ebml_WriteID(glob, 0xEC);
- Ebml_WriteLen(glob, vSize);
-
- for (i = 0; i < vSize; i++) {
- Ebml_Write(glob, &tmp, 1);
- }
-}
-
-/* TODO Serialize Date */
diff --git a/third_party/libmkv/EbmlWriter.h b/third_party/libmkv/EbmlWriter.h
deleted file mode 100644
index b94f75733..000000000
--- a/third_party/libmkv/EbmlWriter.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-#ifndef EBMLWRITER_HPP
-#define EBMLWRITER_HPP
-#include <stddef.h>
-#include "vpx/vpx_integer.h"
-
-/* note: you must define write and serialize functions as well as your own
- * EBML_GLOBAL
- *
- * These functions MUST be implemented
- */
-
-typedef struct EbmlGlobal EbmlGlobal;
-void Ebml_Serialize(EbmlGlobal *glob, const void *, int, unsigned long);
-void Ebml_Write(EbmlGlobal *glob, const void *, unsigned long);
-
-/*****/
-
-void Ebml_WriteLen(EbmlGlobal *glob, int64_t val);
-void Ebml_WriteString(EbmlGlobal *glob, const char *str);
-void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr);
-void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id);
-void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui);
-void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui);
-void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long ui);
-void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d);
-/* TODO make this more generic to signed */
-void Ebml_WriteSigned16(EbmlGlobal *glob, short val);
-void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s);
-void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s);
-void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length);
-void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize);
-/* TODO need date function */
-#endif
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index 7d9441d54..ef7f61b12 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -108,8 +108,8 @@ extern "C"
* For temporal denoiser: noise_sensitivity = 0 means off,
* noise_sensitivity = 1 means temporal denoiser on for Y channel only,
* noise_sensitivity = 2 means temporal denoiser on for all channels.
- * noise_sensitivity = 3 will be used for aggressive mode in future.
- * Temporal denoiser is enabled via the build option
+ * noise_sensitivity = 3 means aggressive denoising mode.
+ * Temporal denoiser is enabled via the configuration option:
* CONFIG_TEMPORAL_DENOISING.
* For spatial denoiser: noise_sensitivity controls the amount of
* pre-processing blur: noise_sensitivity = 0 means off.
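
Note on the onyx.h hunk above: the newly documented noise_sensitivity = 3 (aggressive) mode is selected through the existing public control, as the temporal-svc example earlier in this diff does for mode 1. A hedged usage sketch, assuming an initialized VP8 encoder context and a build with CONFIG_TEMPORAL_DENOISING:

    #include "vpx/vp8cx.h"

    /* Sketch: request the aggressive temporal denoiser (value 3);
     * 0 = off, 1 = Y channel only, 2 = all channels. */
    static void enable_aggressive_denoising(vpx_codec_ctx_t *codec) {
      vpx_codec_control(codec, VP8E_SET_NOISE_SENSITIVITY, 3);
    }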
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index 1a401a4b9..c4c0de81b 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <limits.h>
+
#include "denoising.h"
#include "vp8/common/reconinter.h"
@@ -333,12 +335,33 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
return FILTER_BLOCK;
}
+void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser) {
+ if (!denoiser->aggressive_mode) {
+ denoiser->denoise_pars.scale_sse_thresh = 1;
+ denoiser->denoise_pars.scale_motion_thresh = 8;
+ denoiser->denoise_pars.scale_increase_filter = 0;
+ denoiser->denoise_pars.denoise_mv_bias = 95;
+ denoiser->denoise_pars.pickmode_mv_bias = 100;
+ denoiser->denoise_pars.qp_thresh = 0;
+ denoiser->denoise_pars.consec_zerolast = UINT_MAX;
+ } else {
+ denoiser->denoise_pars.scale_sse_thresh = 2;
+ denoiser->denoise_pars.scale_motion_thresh = 16;
+ denoiser->denoise_pars.scale_increase_filter = 1;
+ denoiser->denoise_pars.denoise_mv_bias = 60;
+ denoiser->denoise_pars.pickmode_mv_bias = 60;
+ denoiser->denoise_pars.qp_thresh = 100;
+ denoiser->denoise_pars.consec_zerolast = 10;
+ }
+}
+
int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
- int num_mb_rows, int num_mb_cols)
+ int num_mb_rows, int num_mb_cols, int mode)
{
int i;
assert(denoiser);
denoiser->num_mb_cols = num_mb_cols;
+ denoiser->aggressive_mode = mode;
for (i = 0; i < MAX_REF_FRAMES; i++)
{
@@ -369,10 +392,11 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1);
vpx_memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols));
-
+ vp8_denoiser_set_parameters(denoiser);
return 0;
}
+
void vp8_denoiser_free(VP8_DENOISER *denoiser)
{
int i;
@@ -401,6 +425,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
{
int mv_row;
int mv_col;
+ unsigned int motion_threshold;
unsigned int motion_magnitude2;
unsigned int sse_thresh;
int sse_diff_thresh = 0;
@@ -424,7 +449,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
MB_MODE_INFO *mbmi = &filter_xd->mode_info_context->mbmi;
int sse_diff = 0;
// Bias on zero motion vector sse.
- int zero_bias = 95;
+ const int zero_bias = denoiser->denoise_pars.denoise_mv_bias;
zero_mv_sse = (unsigned int)((int64_t)zero_mv_sse * zero_bias / 100);
sse_diff = zero_mv_sse - best_sse;
@@ -502,14 +527,19 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
mv_row = x->best_sse_mv.as_mv.row;
mv_col = x->best_sse_mv.as_mv.col;
motion_magnitude2 = mv_row * mv_row + mv_col * mv_col;
- sse_thresh = SSE_THRESHOLD;
- if (x->increase_denoising) sse_thresh = SSE_THRESHOLD_HIGH;
+ motion_threshold = denoiser->denoise_pars.scale_motion_thresh *
+ NOISE_MOTION_THRESHOLD;
- if (best_sse > sse_thresh || motion_magnitude2
- > 8 * NOISE_MOTION_THRESHOLD)
- {
- decision = COPY_BLOCK;
- }
+ if (motion_magnitude2 <
+ denoiser->denoise_pars.scale_increase_filter * NOISE_MOTION_THRESHOLD)
+ x->increase_denoising = 1;
+
+ sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD;
+ if (x->increase_denoising)
+ sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD_HIGH;
+
+ if (best_sse > sse_thresh || motion_magnitude2 > motion_threshold)
+ decision = COPY_BLOCK;
if (decision == FILTER_BLOCK)
{
diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h
index a1f195b72..1a42f86d3 100644
--- a/vp8/encoder/denoising.h
+++ b/vp8/encoder/denoising.h
@@ -39,16 +39,40 @@ enum vp8_denoiser_filter_state {
kFilterNonZeroMV
};
+typedef struct {
+ // Scale factor on sse threshold above which no denoising is done.
+ unsigned int scale_sse_thresh;
+ // Scale factor on motion magnitude threshold above which no
+ // denoising is done.
+ unsigned int scale_motion_thresh;
+ // Scale factor on motion magnitude below which we increase the strength of
+ // the temporal filter (in function vp8_denoiser_filter).
+ unsigned int scale_increase_filter;
+ // Scale factor to bias to ZEROMV for denoising.
+ unsigned int denoise_mv_bias;
+ // Scale factor to bias to ZEROMV for coding mode selection.
+ unsigned int pickmode_mv_bias;
+ // Quantizer threshold below which we use the segmentation map to switch off
+ // loop filter for blocks that have been coded as ZEROMV-LAST a certain number
+ // (consec_zerolast) of consecutive frames. Note that the delta-QP is set to
+ // 0 when segmentation map is used for shutting off loop filter.
+ unsigned int qp_thresh;
+ // Threshold for number of consecutive frames for blocks coded as ZEROMV-LAST.
+ unsigned int consec_zerolast;
+} denoise_params;
+
typedef struct vp8_denoiser
{
YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES];
YV12_BUFFER_CONFIG yv12_mc_running_avg;
unsigned char* denoise_state;
int num_mb_cols;
+ int aggressive_mode;
+ denoise_params denoise_pars;
} VP8_DENOISER;
int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
- int num_mb_rows, int num_mb_cols);
+ int num_mb_rows, int num_mb_cols, int mode);
void vp8_denoiser_free(VP8_DENOISER *denoiser);
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index e6b0f9b64..aec6b9880 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -522,6 +522,19 @@ void encode_mb_row(VP8_COMP *cpi,
}
#endif
+ // Keep track of how many (consecutive) times a block is coded
+ // as ZEROMV_LASTREF, for base layer frames.
+  // Reset to 0 if it's coded as anything else.
+ if (cpi->current_layer == 0) {
+ if (xd->mode_info_context->mbmi.mode == ZEROMV &&
+ xd->mode_info_context->mbmi.ref_frame == LAST_FRAME) {
+ // Increment, check for wrap-around.
+ if (cpi->consec_zero_last[map_index+mb_col] < 255)
+ cpi->consec_zero_last[map_index+mb_col] += 1;
+ } else {
+ cpi->consec_zero_last[map_index+mb_col] = 0;
+ }
+ }
/* Special case code for cyclic refresh
* If cyclic update enabled then copy xd->mbmi.segment_id; (which
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index d4b17cef1..7b8b51f30 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -206,6 +206,21 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
}
#endif
+ // Keep track of how many (consecutive) times a block
+ // is coded as ZEROMV_LASTREF, for base layer frames.
+                    // Reset to 0 if it's coded as anything else.
+ if (cpi->current_layer == 0) {
+ if (xd->mode_info_context->mbmi.mode == ZEROMV &&
+ xd->mode_info_context->mbmi.ref_frame ==
+ LAST_FRAME) {
+ // Increment, check for wrap-around.
+ if (cpi->consec_zero_last[map_index+mb_col] < 255)
+ cpi->consec_zero_last[map_index+mb_col] +=
+ 1;
+ } else {
+ cpi->consec_zero_last[map_index+mb_col] = 0;
+ }
+ }
/* Special case code for cyclic refresh
* If cyclic update enabled then copy
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 469d0d6e9..298f50f65 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -613,6 +613,24 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
while(block_count && i != cpi->cyclic_refresh_mode_index);
cpi->cyclic_refresh_mode_index = i;
+
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->denoiser.aggressive_mode != 0 &&
+ Q < (int)cpi->denoiser.denoise_pars.qp_thresh) {
+ // Under aggressive denoising mode, use segmentation to turn off loop
+ // filter below some qp thresh. The loop filter is turned off for all
+ // blocks that have been encoded as ZEROMV LAST x frames in a row,
+ // where x is set by cpi->denoiser.denoise_pars.consec_zerolast.
+ // This is to avoid "dot" artifacts that can occur from repeated
+ // loop filtering on noisy input source.
+ cpi->cyclic_refresh_q = Q;
+ lf_adjustment = -MAX_LOOP_FILTER;
+ for (i = 0; i < mbs_in_frame; ++i) {
+ seg_map[i] = (cpi->consec_zero_last[i] >
+ cpi->denoiser.denoise_pars.consec_zerolast) ? 1 : 0;
+ }
+ }
+#endif
}
/* Activate segmentation. */
@@ -1752,7 +1770,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
int width = (cpi->oxcf.Width + 15) & ~15;
int height = (cpi->oxcf.Height + 15) & ~15;
vp8_denoiser_allocate(&cpi->denoiser, width, height,
- cpi->common.mb_rows, cpi->common.mb_cols);
+ cm->mb_rows, cm->mb_cols,
+ ((cpi->oxcf.noise_sensitivity == 3) ? 1 : 0));
}
}
#endif
@@ -1896,6 +1915,9 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
else
cpi->cyclic_refresh_map = (signed char *) NULL;
+ CHECK_MEM_ERROR(cpi->consec_zero_last,
+ vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+
#ifdef VP8_ENTROPY_STATS
init_context_counters();
#endif
@@ -2416,6 +2438,7 @@ void vp8_remove_compressor(VP8_COMP **ptr)
vpx_free(cpi->mb.ss);
vpx_free(cpi->tok);
vpx_free(cpi->cyclic_refresh_map);
+ vpx_free(cpi->consec_zero_last);
vp8_remove_common(&cpi->common);
vpx_free(cpi);
@@ -3478,6 +3501,9 @@ static void encode_frame_to_data_rate
{
cpi->mb.rd_thresh_mult[i] = 128;
}
+
+ // Reset the zero_last counter to 0 on key frame.
+ vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
}
#if 0
@@ -3899,6 +3925,7 @@ static void encode_frame_to_data_rate
#endif
+
#ifdef OUTPUT_YUV_SRC
vp8_write_yuv_frame(yuv_file, cpi->Source);
#endif
@@ -3994,6 +4021,8 @@ static void encode_frame_to_data_rate
else
disable_segmentation(cpi);
}
+ // Reset the consec_zero_last counter on key frame.
+ vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
vp8_set_quantizer(cpi, Q);
}
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index df17dff34..7a8baca77 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -511,6 +511,8 @@ typedef struct VP8_COMP
int cyclic_refresh_mode_index;
int cyclic_refresh_q;
signed char *cyclic_refresh_map;
+    // Count of how many (consecutive) times a macroblock uses ZEROMV_LAST.
+ unsigned char *consec_zero_last;
// Frame counter for the temporal pattern. Counter is reset when the temporal
// layers are changed dynamically (run-time change).
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 86108b70a..ec1ea146f 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -40,7 +40,6 @@ extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);
-
int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
int_mv *bestmv, int_mv *ref_mv,
int error_per_bit,
@@ -694,6 +693,13 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
*/
calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment);
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity) {
+ rd_adjustment = (int)(rd_adjustment *
+ cpi->denoiser.denoise_pars.pickmode_mv_bias / 100);
+ }
+#endif
+
/* if we encode a new mv this is important
* find the best new motion vector
*/
@@ -1168,7 +1174,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
#if CONFIG_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity)
{
- int uv_denoise = (cpi->oxcf.noise_sensitivity == 2) ? 1 : 0;
+ int uv_denoise = (cpi->oxcf.noise_sensitivity >= 2) ? 1 : 0;
int block_index = mb_row * cpi->common.mb_cols + mb_col;
if (x->best_sse_inter_mode == DC_PRED)
{
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index a10c401fe..83938dd3d 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -1326,7 +1326,6 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
0, /* ts_periodicity */
{0}, /* ts_layer_id */
}},
- { -1, {NOT_IMPLEMENTED}}
};
@@ -1352,6 +1351,7 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) =
NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */
},
{
+ 1, /* 1 cfg map */
vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t peek_si; */
vp8e_encode, /* vpx_codec_encode_fn_t encode; */
vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t frame_get; */
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index fb3c236ce..f768b5c47 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -804,6 +804,7 @@ CODEC_INTERFACE(vpx_codec_vp8_dx) =
NOT_IMPLEMENTED,
},
{ /* encoder functions */
+ 0,
NOT_IMPLEMENTED,
NOT_IMPLEMENTED,
NOT_IMPLEMENTED,
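
Note on the two interface hunks above: the added '1,' and '0,' fields give each codec interface an explicit encoder config-map count, replacing the removed '{ -1, {NOT_IMPLEMENTED}}' sentinel entry (the matching one-line change to vpx/internal/vpx_codec_internal.h appears only in the diffstat). A self-contained C sketch of the pattern, with purely illustrative names:

    /* Illustrative only: a usage->config map carried with an explicit count
     * instead of a -1 sentinel, so lookups become bounded loops. */
    typedef struct { int usage; int cfg; } cfg_map_entry;

    static const cfg_map_entry kMap[] = { { 0, 42 } };
    static const int kMapCount = (int)(sizeof(kMap) / sizeof(kMap[0]));

    static int lookup_cfg(int usage, int *out_cfg) {
      int i;
      for (i = 0; i < kMapCount; ++i) {  /* bounded by the explicit count */
        if (kMap[i].usage == usage) {
          *out_cfg = kMap[i].cfg;
          return 0;
        }
      }
      return -1;  /* usage not found */
    }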
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index ef5caf327..4602b7b4f 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -420,7 +420,7 @@ add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int sourc
specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x16 mmx avx2/, "$sse2_x86inc";
+specialize qw/vp9_variance16x16 mmx avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc";
@@ -435,7 +435,7 @@ add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, co
specialize qw/vp9_get8x8var mmx/, "$sse2_x86inc";
add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get16x16var avx2/, "$sse2_x86inc";
+specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance8x4/, "$sse2_x86inc";
@@ -483,7 +483,7 @@ add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_
specialize qw/vp9_sub_pixel_avg_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance16x16 neon/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -712,10 +712,10 @@ add_proto qw/int64_t vp9_block_error/, "const int16_t *coeff, const int16_t *dqc
specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vp9_subtract_block/, "$sse2_x86inc";
+specialize qw/vp9_subtract_block neon/, "$sse2_x86inc";
add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-specialize qw/vp9_quantize_fp/, "$ssse3_x86_64";
+specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64";
add_proto qw/void vp9_quantize_fp_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64";
@@ -739,10 +739,10 @@ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
# fdct functions
add_proto qw/void vp9_fht4x4/, "const int16_t *input, int16_t *output, int stride, int tx_type";
-specialize qw/vp9_fht4x4 sse2 avx2/;
+specialize qw/vp9_fht4x4 sse2/;
add_proto qw/void vp9_fht8x8/, "const int16_t *input, int16_t *output, int stride, int tx_type";
-specialize qw/vp9_fht8x8 sse2 avx2/;
+specialize qw/vp9_fht8x8 sse2/;
add_proto qw/void vp9_fht16x16/, "const int16_t *input, int16_t *output, int stride, int tx_type";
specialize qw/vp9_fht16x16 sse2/;
@@ -754,13 +754,13 @@ add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, int16_t *output, int st
specialize qw/vp9_fdct4x4_1 sse2/;
add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct4x4 sse2 avx2/;
+specialize qw/vp9_fdct4x4 sse2/;
add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct8x8_1 sse2/;
+specialize qw/vp9_fdct8x8_1 sse2 neon/;
add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct8x8 sse2 avx2/, "$ssse3_x86_64";
+specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64";
add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct16x16_1 sse2/;
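
The add_proto/specialize lines above feed the RTCD generator, which emits a runtime dispatcher that selects the best available implementation per function. A simplified sketch of what the generated setup does for one of the newly specialized functions (the real code is machine-generated and guarded by HAVE_*/HAS_* configuration):

    void (*vp9_fdct8x8)(const int16_t *input, int16_t *output, int stride);

    static void setup_rtcd_internal(int flags) {
      vp9_fdct8x8 = vp9_fdct8x8_c;                     /* generic C fallback */
    #if HAVE_SSE2
      if (flags & HAS_SSE2) vp9_fdct8x8 = vp9_fdct8x8_sse2;
    #endif
    #if HAVE_NEON
      if (flags & HAS_NEON) vp9_fdct8x8 = vp9_fdct8x8_neon;
    #endif
    }
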
diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c
new file mode 100644
index 000000000..6c66f5d5b
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+
+void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
+ int r;
+ int16x8_t sum = vld1q_s16(&input[0]);
+ for (r = 1; r < 8; ++r) {
+ const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
+ sum = vaddq_s16(sum, input_00);
+ }
+ {
+ const int32x4_t a = vpaddlq_s16(sum);
+ const int64x2_t b = vpaddlq_s32(a);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);
+ output[1] = 0;
+ }
+}
+
+void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
+ int i;
+ // stage 1
+ int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+ int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+ int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+ int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+ int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+ int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+ int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+ int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+ for (i = 0; i < 2; ++i) {
+ int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
+ const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
+ const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
+ const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
+ const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
+ const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
+ const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
+ const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
+ const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
+ // fdct4(step, step);
+ int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
+ int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
+ int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
+ int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
+ // fdct4(step, step);
+ int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+ int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+ int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+ int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+ int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
+ int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
+ int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
+ int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
+ v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
+ v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
+ v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
+ v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
+ v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
+ v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+ const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+ const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+ const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+ out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43
+ out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63
+ out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47
+ out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67
+ }
+ // Stage 2
+ v_x0 = vsubq_s16(v_s6, v_s5);
+ v_x1 = vaddq_s16(v_s6, v_s5);
+ v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
+ v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x8_t ab = vcombine_s16(a, b);
+ const int16x8_t cd = vcombine_s16(c, d);
+ // Stage 3
+ v_x0 = vaddq_s16(v_s4, ab);
+ v_x1 = vsubq_s16(v_s4, ab);
+ v_x2 = vsubq_s16(v_s7, cd);
+ v_x3 = vaddq_s16(v_s7, cd);
+ }
+ // Stage 4
+ v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
+ v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
+ v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
+ v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
+ v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
+ v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
+ v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
+ v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
+ v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
+ v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
+ v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
+ v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+ const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+ const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+ const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+ out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53
+ out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73
+ out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57
+ out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77
+ }
+ // transpose 8x8
+ {
+ // 00 01 02 03 40 41 42 43
+ // 10 11 12 13 50 51 52 53
+ // 20 21 22 23 60 61 62 63
+ // 30 31 32 33 70 71 72 73
+ // 04 05 06 07 44 45 46 47
+ // 14 15 16 17 54 55 56 57
+ // 24 25 26 27 64 65 66 67
+ // 34 35 36 37 74 75 76 77
+ const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0),
+ vreinterpretq_s32_s16(out_2));
+ const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1),
+ vreinterpretq_s32_s16(out_3));
+ const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4),
+ vreinterpretq_s32_s16(out_6));
+ const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5),
+ vreinterpretq_s32_s16(out_7));
+ const int16x8x2_t r01_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
+ vreinterpretq_s16_s32(r13_s32.val[0]));
+ const int16x8x2_t r23_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
+ vreinterpretq_s16_s32(r13_s32.val[1]));
+ const int16x8x2_t r45_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
+ vreinterpretq_s16_s32(r57_s32.val[0]));
+ const int16x8x2_t r67_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
+ vreinterpretq_s16_s32(r57_s32.val[1]));
+ input_0 = r01_s16.val[0];
+ input_1 = r01_s16.val[1];
+ input_2 = r23_s16.val[0];
+ input_3 = r23_s16.val[1];
+ input_4 = r45_s16.val[0];
+ input_5 = r45_s16.val[1];
+ input_6 = r67_s16.val[0];
+ input_7 = r67_s16.val[1];
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ } // for
+ {
+ // from vp9_dct_sse2.c
+ // Post-condition (division by two)
+ // division of two 16 bits signed numbers using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
+ const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
+ const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
+ const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
+ const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
+ const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
+ const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
+ const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
+ const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
+ input_0 = vhsubq_s16(input_0, sign_in0);
+ input_1 = vhsubq_s16(input_1, sign_in1);
+ input_2 = vhsubq_s16(input_2, sign_in2);
+ input_3 = vhsubq_s16(input_3, sign_in3);
+ input_4 = vhsubq_s16(input_4, sign_in4);
+ input_5 = vhsubq_s16(input_5, sign_in5);
+ input_6 = vhsubq_s16(input_6, sign_in6);
+ input_7 = vhsubq_s16(input_7, sign_in7);
+ // store results
+ vst1q_s16(&final_output[0 * 8], input_0);
+ vst1q_s16(&final_output[1 * 8], input_1);
+ vst1q_s16(&final_output[2 * 8], input_2);
+ vst1q_s16(&final_output[3 * 8], input_3);
+ vst1q_s16(&final_output[4 * 8], input_4);
+ vst1q_s16(&final_output[5 * 8], input_5);
+ vst1q_s16(&final_output[6 * 8], input_6);
+ vst1q_s16(&final_output[7 * 8], input_7);
+ }
+}
+
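
vp9_fdct8x8_1_neon above computes only the DC coefficient: the vpaddl/vadd chain is a horizontal sum of all 64 residual samples. A scalar equivalent for reference (a sketch, not the library's C version verbatim):

    static void fdct8x8_1_ref(const int16_t *input, int16_t *output, int stride) {
      int r, c, sum = 0;
      for (r = 0; r < 8; ++r)
        for (c = 0; c < 8; ++c)
          sum += input[r * stride + c];
      output[0] = (int16_t)sum;  /* DC term */
      output[1] = 0;             /* callers treat the rest as zero */
    }
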
diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c
new file mode 100644
index 000000000..2d5ec79b3
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include <math.h>
+
+#include "vpx_mem/vpx_mem.h"
+
+#include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_seg_common.h"
+
+#include "vp9/encoder/vp9_encoder.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_rd.h"
+
+void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+ int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ int zbin_oq_value, uint16_t *eob_ptr,
+ const int16_t *scan, const int16_t *iscan) {
+ int i;
+  // TODO(jingning): decide whether these arguments are still needed once
+  // the quantization process is finalized.
+ (void)zbin_ptr;
+ (void)quant_shift_ptr;
+ (void)zbin_oq_value;
+ (void)scan;
+
+ if (!skip_block) {
+    // Quantization pass: process eight coefficients per iteration; the
+    // round/quant/dequant vectors carry the DC entry in lane 0 of the
+
+ const int16x8_t v_zero = vdupq_n_s16(0);
+ const int16x8_t v_one = vdupq_n_s16(1);
+ int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1);
+ int16x8_t v_round = vmovq_n_s16(round_ptr[1]);
+ int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]);
+ int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
+ // adjust for dc
+ v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
+ v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0);
+ v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0);
+
+ for (i = 0; i < count; i += 8) {
+ const int16x8_t v_iscan = vld1q_s16(&iscan[i]);
+ const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]);
+ const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15);
+ const int16x8_t v_abs_coeff = vabsq_s16(v_coeff);
+ const int16x8_t v_tmp = vqaddq_s16(v_abs_coeff, v_round);
+ const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp),
+ vget_low_s16(v_quant));
+ const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp),
+ vget_high_s16(v_quant));
+ const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16),
+ vshrn_n_s32(v_tmp_hi, 16));
+ const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero);
+ const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one);
+ const int16x8_t v_nz_iscan =
+ vandq_s16(vmvnq_s16(vreinterpretq_s16_u16(v_nz_mask)), v_iscan_plus1);
+ const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign);
+ const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign);
+ const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant);
+
+ v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan);
+
+ vst1q_s16(&qcoeff_ptr[i], v_qcoeff);
+ vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff);
+ v_round = vmovq_n_s16(round_ptr[1]);
+ v_quant = vmovq_n_s16(quant_ptr[1]);
+ v_dequant = vmovq_n_s16(dequant_ptr[1]);
+ }
+ {
+ const int16x4_t v_eobmax_3210 =
+ vmax_s16(vget_low_s16(v_eobmax_76543210),
+ vget_high_s16(v_eobmax_76543210));
+ const int64x1_t v_eobmax_xx32 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32);
+ const int16x4_t v_eobmax_tmp =
+ vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32));
+ const int64x1_t v_eobmax_xxx3 =
+ vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16);
+ const int16x4_t v_eobmax_final =
+ vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3));
+
+ *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0);
+ }
+ } else {
+ vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t));
+ vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t));
+ *eob_ptr = 0;
+ }
+}
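
The NEON loop above vectorizes a simple fast-quantizer recurrence: saturating-add the rounding term to |coeff|, take the high 16 bits of the product with the quantizer, restore the sign, and track the highest scan position with a nonzero result. A scalar sketch of that arithmetic (illustration only; the DC entries of round/quant/dequant apply to coefficient 0 only):

    static void quantize_fp_ref(const int16_t *coeff, intptr_t count,
                                const int16_t *round, const int16_t *quant,
                                int16_t *qcoeff, int16_t *dqcoeff,
                                const int16_t *dequant, const int16_t *iscan,
                                uint16_t *eob) {
      intptr_t i;
      int last_nz = -1;
      for (i = 0; i < count; ++i) {
        const int rc = (i == 0) ? 0 : 1;  /* DC uses entry 0, AC entry 1 */
        const int abs_c = coeff[i] < 0 ? -coeff[i] : coeff[i];
        int tmp = abs_c + round[rc];
        if (tmp > 32767) tmp = 32767;     /* mirrors the saturating vqaddq_s16 */
        tmp = (tmp * quant[rc]) >> 16;    /* mirrors vmull_s16 + vshrn_n_s32 */
        qcoeff[i] = (int16_t)(coeff[i] < 0 ? -tmp : tmp);
        dqcoeff[i] = (int16_t)(qcoeff[i] * dequant[rc]);
        if (tmp && iscan[i] > last_nz) last_nz = iscan[i];
      }
      *eob = (uint16_t)(last_nz + 1);
    }
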
diff --git a/vp9/encoder/arm/neon/vp9_subtract_neon.c b/vp9/encoder/arm/neon/vp9_subtract_neon.c
new file mode 100644
index 000000000..b4bf567db
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_subtract_neon.c
@@ -0,0 +1,81 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+void vp9_subtract_block_neon(int rows, int cols,
+ int16_t *diff, ptrdiff_t diff_stride,
+ const uint8_t *src, ptrdiff_t src_stride,
+ const uint8_t *pred, ptrdiff_t pred_stride) {
+ int r, c;
+
+ if (cols > 16) {
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; c += 32) {
+ const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
+ const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
+ const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
+ const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
+ const uint16x8_t v_diff_lo_00 = vsubl_u8(vget_low_u8(v_src_00),
+ vget_low_u8(v_pred_00));
+ const uint16x8_t v_diff_hi_00 = vsubl_u8(vget_high_u8(v_src_00),
+ vget_high_u8(v_pred_00));
+ const uint16x8_t v_diff_lo_16 = vsubl_u8(vget_low_u8(v_src_16),
+ vget_low_u8(v_pred_16));
+ const uint16x8_t v_diff_hi_16 = vsubl_u8(vget_high_u8(v_src_16),
+ vget_high_u8(v_pred_16));
+ vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00));
+ vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
+ vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
+ vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
+ }
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+ } else if (cols > 8) {
+ for (r = 0; r < rows; ++r) {
+ const uint8x16_t v_src = vld1q_u8(&src[0]);
+ const uint8x16_t v_pred = vld1q_u8(&pred[0]);
+ const uint16x8_t v_diff_lo = vsubl_u8(vget_low_u8(v_src),
+ vget_low_u8(v_pred));
+ const uint16x8_t v_diff_hi = vsubl_u8(vget_high_u8(v_src),
+ vget_high_u8(v_pred));
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
+ vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+ } else if (cols > 4) {
+ for (r = 0; r < rows; ++r) {
+ const uint8x8_t v_src = vld1_u8(&src[0]);
+ const uint8x8_t v_pred = vld1_u8(&pred[0]);
+ const uint16x8_t v_diff = vsubl_u8(v_src, v_pred);
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+ } else {
+ for (r = 0; r < rows; ++r) {
+ for (c = 0; c < cols; ++c)
+ diff[c] = src[c] - pred[c];
+
+ diff += diff_stride;
+ pred += pred_stride;
+ src += src_stride;
+ }
+ }
+}
diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c
new file mode 100644
index 000000000..f6871188b
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_variance_neon.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_filter.h"
+
+#include "vp9/encoder/vp9_variance.h"
+
+enum { kWidth16 = 16 };
+enum { kHeight16 = 16 };
+enum { kHeight16PlusOne = 17 };
+enum { kPixelStepOne = 1 };
+
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+ const int32x4_t a = vpaddlq_s16(v_16x8);
+ const int64x2_t b = vpaddlq_s32(a);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);
+}
+
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+ const int64x2_t b = vpaddlq_s32(v_32x4);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ return vget_lane_s32(c, 0);
+}
+
+static void variance_neon_w8(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ int w, int h, unsigned int *sse, int *sum) {
+ int i, j;
+ int16x8_t v_sum = vdupq_n_s16(0);
+ int32x4_t v_sse_lo = vdupq_n_s32(0);
+ int32x4_t v_sse_hi = vdupq_n_s32(0);
+
+ for (i = 0; i < h; ++i) {
+ for (j = 0; j < w; j += 8) {
+ const uint8x8_t v_a = vld1_u8(&a[j]);
+ const uint8x8_t v_b = vld1_u8(&b[j]);
+ const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
+ const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
+ v_sum = vaddq_s16(v_sum, sv_diff);
+ v_sse_lo = vmlal_s16(v_sse_lo,
+ vget_low_s16(sv_diff),
+ vget_low_s16(sv_diff));
+ v_sse_hi = vmlal_s16(v_sse_hi,
+ vget_high_s16(sv_diff),
+ vget_high_s16(sv_diff));
+ }
+ a += a_stride;
+ b += b_stride;
+ }
+
+ *sum = horizontal_add_s16x8(v_sum);
+ *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+}
+
+void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse, int *sum) {
+ variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth16,
+ kHeight16, sse, sum);
+}
+
+unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,
+ const uint8_t *b, int b_stride,
+ unsigned int *sse) {
+ int sum;
+ variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum);
+ return *sse - (((int64_t)sum * sum) / (kWidth16 * kHeight16));
+}
+
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
+ uint8_t *output_ptr,
+ unsigned int src_pixels_per_line,
+ int pixel_step,
+ unsigned int output_height,
+ unsigned int output_width,
+ const int16_t *vp9_filter) {
+ const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]);
+ const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]);
+ unsigned int i;
+ for (i = 0; i < output_height; ++i) {
+ const uint8x16_t src_0 = vld1q_u8(&src_ptr[0]);
+ const uint8x16_t src_1 = vld1q_u8(&src_ptr[pixel_step]);
+ const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0);
+ const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1);
+ const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS);
+ const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0);
+ const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1);
+ const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS);
+ vst1q_u8(&output_ptr[0], vcombine_u8(out_lo, out_hi));
+ // Next row...
+ src_ptr += src_pixels_per_line;
+ output_ptr += output_width;
+ }
+}
+
+unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
+ int src_stride,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst,
+ int dst_stride,
+ unsigned int *sse) {
+ DECLARE_ALIGNED_ARRAY(kWidth16, uint8_t, temp2, kHeight16 * kWidth16);
+ DECLARE_ALIGNED_ARRAY(kWidth16, uint8_t, fdata3, kHeight16PlusOne * kWidth16);
+
+ var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,
+ kHeight16PlusOne, kWidth16,
+ BILINEAR_FILTERS_2TAP(xoffset));
+ var_filter_block2d_bil_w16(fdata3, temp2, kWidth16, kWidth16, kHeight16,
+ kWidth16, BILINEAR_FILTERS_2TAP(yoffset));
+ return vp9_variance16x16_neon(temp2, kWidth16, dst, dst_stride, sse);
+}
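
All of the wrappers above reduce to the textbook variance identity var = SSE - sum^2 / N; the sub-pixel variant first applies a horizontal and then a vertical two-tap bilinear filter and measures the variance of the filtered block. A scalar reference for the core computation (a sketch):

    static unsigned int variance_ref(const uint8_t *a, int a_stride,
                                     const uint8_t *b, int b_stride,
                                     int w, int h, unsigned int *sse) {
      int i, j, sum = 0;
      int64_t sq = 0;
      for (i = 0; i < h; ++i) {
        for (j = 0; j < w; ++j) {
          const int d = a[j] - b[j];
          sum += d;
          sq += d * d;
        }
        a += a_stride;
        b += b_stride;
      }
      *sse = (unsigned int)sq;
      return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h));
    }
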
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 584bcb8f5..b1ef3dcac 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1836,7 +1836,7 @@ static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
BLOCK_SIZE max_size = BLOCK_8X8;
int bsl = mi_width_log2(BLOCK_64X64);
const int search_range_ctrl = (((mi_row + mi_col) >> bsl) +
- get_chessboard_index(cm)) % 2;
+ get_chessboard_index(cm->current_video_frame)) & 0x1;
// Trap case where we do not have a prediction.
if (search_range_ctrl &&
(left_in_image || above_in_image || cm->frame_type != KEY_FRAME)) {
@@ -1880,6 +1880,60 @@ static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile,
*max_block_size = max_size;
}
+// TODO(jingning): refactor the functions that set the partition search range.
+static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd,
+ int mi_row, int mi_col, BLOCK_SIZE bsize,
+ BLOCK_SIZE *min_bs, BLOCK_SIZE *max_bs) {
+ int mi_width = num_8x8_blocks_wide_lookup[bsize];
+ int mi_height = num_8x8_blocks_high_lookup[bsize];
+ int idx, idy;
+
+ MODE_INFO *mi;
+ MODE_INFO **prev_mi =
+ &cm->prev_mi_grid_visible[mi_row * cm->mi_stride + mi_col];
+ BLOCK_SIZE bs, min_size, max_size;
+
+ min_size = BLOCK_64X64;
+ max_size = BLOCK_4X4;
+
+ if (prev_mi) {
+ for (idy = 0; idy < mi_height; ++idy) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ mi = prev_mi[idy * cm->mi_stride + idx];
+ bs = mi ? mi->mbmi.sb_type : bsize;
+ min_size = MIN(min_size, bs);
+ max_size = MAX(max_size, bs);
+ }
+ }
+ }
+
+ if (xd->left_available) {
+ for (idy = 0; idy < mi_height; ++idy) {
+ mi = xd->mi[idy * cm->mi_stride - 1];
+ bs = mi ? mi->mbmi.sb_type : bsize;
+ min_size = MIN(min_size, bs);
+ max_size = MAX(max_size, bs);
+ }
+ }
+
+ if (xd->up_available) {
+ for (idx = 0; idx < mi_width; ++idx) {
+ mi = xd->mi[idx - cm->mi_stride];
+ bs = mi ? mi->mbmi.sb_type : bsize;
+ min_size = MIN(min_size, bs);
+ max_size = MAX(max_size, bs);
+ }
+ }
+
+ if (min_size == max_size) {
+ min_size = min_partition_size[min_size];
+ max_size = max_partition_size[max_size];
+ }
+
+ *min_bs = min_size;
+ *max_bs = max_size;
+}
+
static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
vpx_memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv));
}
@@ -1888,13 +1942,22 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
vpx_memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv));
}
+#if CONFIG_FP_MB_STATS
+const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] =
+ {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4};
+const int num_16x16_blocks_high_lookup[BLOCK_SIZES] =
+ {1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4};
+const int qindex_skip_threshold_lookup[BLOCK_SIZES] =
+ {0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120};
+#endif
+
// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
// unlikely to be selected depending on previous rate-distortion optimization
// results, for encoding speed-up.
static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
TOKENEXTRA **tp, int mi_row,
int mi_col, BLOCK_SIZE bsize, int *rate,
- int64_t *dist, int do_recon, int64_t best_rd,
+ int64_t *dist, int64_t best_rd,
PC_TREE *pc_tree) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
@@ -1917,6 +1980,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
const int xss = x->e_mbd.plane[1].subsampling_x;
const int yss = x->e_mbd.plane[1].subsampling_y;
+ BLOCK_SIZE min_size = cpi->sf.min_partition_size;
+ BLOCK_SIZE max_size = cpi->sf.max_partition_size;
+
int partition_none_allowed = !force_horz_split && !force_vert_split;
int partition_horz_allowed = !force_vert_split && yss <= xss &&
bsize >= BLOCK_8X8;
@@ -1931,18 +1997,24 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
set_offsets(cpi, tile, mi_row, mi_col, bsize);
x->mb_energy = vp9_block_energy(cpi, x, bsize);
}
+
+ if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) {
+ int cb_partition_search_ctrl = ((pc_tree->index == 0 || pc_tree->index == 3)
+ + get_chessboard_index(cm->current_video_frame)) & 0x1;
+
+ if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size)
+ set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size);
+ }
+
// Determine partition types in search according to the speed features.
// The threshold set here has to be of square block size.
if (cpi->sf.auto_min_max_partition_size) {
- partition_none_allowed &= (bsize <= cpi->sf.max_partition_size &&
- bsize >= cpi->sf.min_partition_size);
- partition_horz_allowed &= ((bsize <= cpi->sf.max_partition_size &&
- bsize > cpi->sf.min_partition_size) ||
+ partition_none_allowed &= (bsize <= max_size && bsize >= min_size);
+ partition_horz_allowed &= ((bsize <= max_size && bsize > min_size) ||
force_horz_split);
- partition_vert_allowed &= ((bsize <= cpi->sf.max_partition_size &&
- bsize > cpi->sf.min_partition_size) ||
+ partition_vert_allowed &= ((bsize <= max_size && bsize > min_size) ||
force_vert_split);
- do_split &= bsize > cpi->sf.min_partition_size;
+ do_split &= bsize > min_size;
}
if (cpi->sf.use_square_partition_only) {
partition_horz_allowed &= force_horz_split;
@@ -1993,6 +2065,52 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
do_split = 0;
do_rect = 0;
}
+
+#if CONFIG_FP_MB_STATS
+  // Check whether the first-pass statistics of every 16x16 block show
+  // zero motion and a small enough first-pass residue. If so, check the
+  // difference variance between the current frame and the last frame. If
+  // that variance is also small enough, stop further splitting in the RD
+  // optimization.
+ if (cpi->use_fp_mb_stats && do_split != 0 &&
+ cm->base_qindex > qindex_skip_threshold_lookup[bsize]) {
+ VP9_COMMON *cm = &cpi->common;
+ int mb_row = mi_row >> 1;
+ int mb_col = mi_col >> 1;
+ int mb_row_end =
+ MIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows);
+ int mb_col_end =
+ MIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols);
+ int r, c;
+
+ int skip = 1;
+ for (r = mb_row; r < mb_row_end; r++) {
+ for (c = mb_col; c < mb_col_end; c++) {
+ const int mb_index = r * cm->mb_cols + c;
+ if ((cpi->twopass.this_frame_mb_stats[mb_index] &
+ FPMB_NONZERO_MOTION_MASK) ||
+ !(cpi->twopass.this_frame_mb_stats[mb_index] &
+ FPMB_ERROR_LEVEL0_MASK)) {
+ skip = 0;
+ break;
+ }
+ }
+ if (skip == 0) {
+ break;
+ }
+ }
+ if (skip) {
+ unsigned int var;
+ set_offsets(cpi, tile, mi_row, mi_col, bsize);
+ var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src,
+ mi_row, mi_col, bsize);
+ if (var < 8) {
+ do_split = 0;
+ do_rect = 0;
+ }
+ }
+ }
+#endif
}
}
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
@@ -2030,8 +2148,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
if (cpi->sf.adaptive_motion_search)
load_pred_mv(x, ctx);
+ pc_tree->split[i]->index = i;
rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx,
- subsize, &this_rate, &this_dist, i != 3,
+ subsize, &this_rate, &this_dist,
best_rd - sum_rd, pc_tree->split[i]);
if (this_rate == INT_MAX) {
@@ -2168,7 +2287,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile,
*rate = best_rate;
*dist = best_dist;
- if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) {
+ if (best_rate < INT_MAX && best_dist < INT64_MAX && pc_tree->index != 3) {
int output_enabled = (bsize == BLOCK_64X64);
// Check the projected output rate for this SB against its target
@@ -2225,6 +2344,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
}
vp9_zero(cpi->mb.pred_mv);
+ cpi->pc_root->index = 0;
if ((sf->partition_search_type == SEARCH_PARTITION &&
sf->use_lastframe_partitioning) ||
@@ -2278,7 +2398,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
&sf->max_partition_size);
}
rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, INT64_MAX,
+ &dummy_rate, &dummy_dist, INT64_MAX,
cpi->pc_root);
} else {
if (sf->constrain_copy_partition &&
@@ -2300,7 +2420,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile,
&sf->max_partition_size);
}
rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64,
- &dummy_rate, &dummy_dist, 1, INT64_MAX, cpi->pc_root);
+ &dummy_rate, &dummy_dist, INT64_MAX, cpi->pc_root);
}
}
}
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 4b3f2ad56..2a6c4b362 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -551,8 +551,8 @@ static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd,
: 0];
}
-static INLINE int get_chessboard_index(const VP9_COMMON *cm) {
- return cm->current_video_frame % 2;
+static INLINE int get_chessboard_index(const int frame_index) {
+ return frame_index & 0x1;
}
#ifdef __cplusplus
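
With the frame counter now passed in explicitly, callers combine block position and frame parity, so the encodeframe/pickmode/rdopt call sites in this patch evaluate to a checkerboard that inverts every frame: each block takes the more expensive search path on alternate frames. Illustrative form of the shared pattern:

    static int chessboard_on(int mi_row, int mi_col, int bsl, int frame_index) {
      /* Adjacent blocks and consecutive frames land on opposite colors. */
      return (((mi_row + mi_col) >> bsl) + get_chessboard_index(frame_index)) & 0x1;
    }
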
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 3c41f5f73..1b574758b 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -553,6 +553,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
double error_weight = 1.0;
const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
+#if CONFIG_FP_MB_STATS
+ const int mb_index = mb_row * cm->mb_cols + mb_col;
+#endif
vp9_clear_system_state();
@@ -599,7 +602,8 @@ void vp9_first_pass(VP9_COMP *cpi) {
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
- // TODO(pengchong): store some related block statistics here
+      // Initialize the stats entry for this macroblock.
+ cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
}
#endif
@@ -700,6 +704,27 @@ void vp9_first_pass(VP9_COMP *cpi) {
// Start by assuming that intra mode is best.
best_ref_mv.as_int = 0;
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+        // intra prediction statistics
+ cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
+ cpi->twopass.frame_mb_stats_buf[mb_index] &=
+ ~FPMB_NONZERO_MOTION_MASK;
+ if (this_error > FPMB_ERROR_LEVEL4_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL4_MASK;
+ } else if (this_error > FPMB_ERROR_LEVEL3_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL3_MASK;
+ } else if (this_error > FPMB_ERROR_LEVEL2_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL2_MASK;
+ } else if (this_error > FPMB_ERROR_LEVEL1_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL1_MASK;
+ } else {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL0_MASK;
+ }
+ }
+#endif
+
if (motion_error <= this_error) {
// Keep a count of cases where the inter and intra were very close
// and very low. This helps with scene cut detection for example in
@@ -730,13 +755,40 @@ void vp9_first_pass(VP9_COMP *cpi) {
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
- // TODO(pengchong): save some related block statistics here
+          // inter prediction statistics
+ cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+ cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
+ cpi->twopass.frame_mb_stats_buf[mb_index] &=
+ ~FPMB_NONZERO_MOTION_MASK;
+ if (this_error > FPMB_ERROR_LEVEL4_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_ERROR_LEVEL4_MASK;
+ } else if (this_error > FPMB_ERROR_LEVEL3_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_ERROR_LEVEL3_MASK;
+ } else if (this_error > FPMB_ERROR_LEVEL2_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_ERROR_LEVEL2_MASK;
+ } else if (this_error > FPMB_ERROR_LEVEL1_TH) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_ERROR_LEVEL1_MASK;
+ } else {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_ERROR_LEVEL0_MASK;
+ }
}
#endif
if (mv.as_int) {
++mvcount;
+#if CONFIG_FP_MB_STATS
+ if (cpi->use_fp_mb_stats) {
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
+ FPMB_NONZERO_MOTION_MASK;
+ }
+#endif
+
// Non-zero vector, was it different from the last non-zero vector?
if (mv.as_int != lastmv_as_int)
++new_mv_count;
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 33a795f26..e898abc16 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -19,6 +19,20 @@ extern "C" {
#endif
#if CONFIG_FP_MB_STATS
+
+#define FPMB_DCINTRA_MASK 0x01
+#define FPMB_NONZERO_MOTION_MASK 0x02
+#define FPMB_ERROR_LEVEL0_MASK 0x04
+#define FPMB_ERROR_LEVEL1_MASK 0x10
+#define FPMB_ERROR_LEVEL2_MASK 0x20
+#define FPMB_ERROR_LEVEL3_MASK 0x40
+#define FPMB_ERROR_LEVEL4_MASK 0x80
+
+#define FPMB_ERROR_LEVEL1_TH 2000
+#define FPMB_ERROR_LEVEL2_TH 8000
+#define FPMB_ERROR_LEVEL3_TH 24000
+#define FPMB_ERROR_LEVEL4_TH 48000
+
typedef struct {
uint8_t *mb_stats_start;
uint8_t *mb_stats_end;
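
The masks above pack each macroblock's first-pass statistics into a single byte of frame_mb_stats_buf: bit 0 flags DC intra, bit 1 nonzero motion, and one of five mutually exclusive error-level bits is set from the threshold ladder. A sketch of composing and querying such a byte (mirroring the firstpass.c hunks earlier in this diff):

    static uint8_t classify_mb(int this_error, int is_intra, int nonzero_mv) {
      uint8_t stats = 0;
      if (is_intra)   stats |= FPMB_DCINTRA_MASK;
      if (nonzero_mv) stats |= FPMB_NONZERO_MOTION_MASK;
      if (this_error > FPMB_ERROR_LEVEL4_TH)      stats |= FPMB_ERROR_LEVEL4_MASK;
      else if (this_error > FPMB_ERROR_LEVEL3_TH) stats |= FPMB_ERROR_LEVEL3_MASK;
      else if (this_error > FPMB_ERROR_LEVEL2_TH) stats |= FPMB_ERROR_LEVEL2_MASK;
      else if (this_error > FPMB_ERROR_LEVEL1_TH) stats |= FPMB_ERROR_LEVEL1_MASK;
      else                                        stats |= FPMB_ERROR_LEVEL0_MASK;
      return stats;
    }

    /* The encoder's split-skip test then checks, per 16x16 block, that the
     * motion bit is clear and the LEVEL0 bit is set. */
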
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 30a0e9d0d..7a1600155 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -394,7 +394,8 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
INTERP_FILTER filter_ref = cm->interp_filter;
int bsl = mi_width_log2(bsize);
const int pred_filter_search = cm->interp_filter == SWITCHABLE ?
- (((mi_row + mi_col) >> bsl) + get_chessboard_index(cm)) % 2 : 0;
+ (((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
int const_motion[MAX_REF_FRAMES] = { 0 };
int bh = num_4x4_blocks_high_lookup[bsize] << 2;
int bw = num_4x4_blocks_wide_lookup[bsize] << 2;
@@ -409,6 +410,10 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
PRED_BUFFER *this_mode_pred = NULL;
int i;
+  // CTX is used by the temporal denoiser, which is currently under development.
+  // TODO(jbb): remove the following line once the temporal denoiser is
+  // finished and enabled in the default build.
+ (void) ctx;
if (cpi->sf.reuse_inter_pred_sby) {
for (i = 0; i < 3; i++) {
tmp[i].data = &pred_buf[pixels_in_block * i];
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 6778d69b9..177066b32 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -2106,7 +2106,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
int bsl = mi_width_log2_lookup[bsize];
int pred_filter_search = cpi->sf.cb_pred_filter_search ?
- (((mi_row + mi_col) >> bsl)) & 0x01 : 0;
+ (((mi_row + mi_col) >> bsl) +
+ get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
if (pred_filter_search) {
INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 7315dd454..6ca1bc525 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -110,10 +110,12 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm,
if (speed >= 3) {
sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD
: USE_LARGESTALL;
- if (MIN(cm->width, cm->height) >= 720)
+ if (MIN(cm->width, cm->height) >= 720) {
sf->disable_split_mask = DISABLE_ALL_SPLIT;
- else
+ sf->cb_partition_search = frame_is_boosted(cpi) ? 0 : 1;
+ } else {
sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT;
+ }
sf->adaptive_pred_interp_filter = 0;
sf->cb_pred_filter_search = 1;
@@ -334,6 +336,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->adaptive_motion_search = 0;
sf->adaptive_pred_interp_filter = 0;
sf->cb_pred_filter_search = 0;
+ sf->cb_partition_search = 0;
sf->use_quant_fp = 0;
sf->reference_masking = 0;
sf->partition_search_type = SEARCH_PARTITION;
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 929acaf3e..de731cee1 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -286,6 +286,8 @@ typedef struct SPEED_FEATURES {
// Chessboard pattern prediction filter type search
int cb_pred_filter_search;
+ int cb_partition_search;
+
// Fast quantization process path
int use_quant_fp;
diff --git a/vp9/encoder/x86/vp9_dct_avx2.c b/vp9/encoder/x86/vp9_dct_avx2.c
index b5269ed03..3a19f5274 100644
--- a/vp9/encoder/x86/vp9_dct_avx2.c
+++ b/vp9/encoder/x86/vp9_dct_avx2.c
@@ -12,2572 +12,6 @@
#include "vp9/common/vp9_idct.h" // for cospi constants
#include "vpx_ports/mem.h"
-void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) {
- // The 2D transform is done with two passes which are actually pretty
- // similar. In the first one, we transform the columns and transpose
- // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we transpose the columns (that
- // is the transposed rows) and transpose the results (so that it goes back
- // in normal/row positions).
- int pass;
- // Constants
- // When we use them, in one case, they are all the same. In all others
- // it's a pair of them that we need to repeat four times. This is done
- // by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
- const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
- const __m128i kOne = _mm_set1_epi16(1);
- __m128i in0, in1, in2, in3;
- // Load inputs.
- {
- in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
- in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
- // x = x << 4
- in0 = _mm_slli_epi16(in0, 4);
- in1 = _mm_slli_epi16(in1, 4);
- in2 = _mm_slli_epi16(in2, 4);
- in3 = _mm_slli_epi16(in3, 4);
- // if (i == 0 && input[0]) input[0] += 1;
- {
- // The mask will only contain whether the first value is zero, all
- // other comparison will fail as something shifted by 4 (above << 4)
- // can never be equal to one. To increment in the non-zero case, we
- // add the mask and one for the first element:
- // - if zero, mask = -1, v = v - 1 + 1 = v
- // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
- __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
- in0 = _mm_add_epi16(in0, mask);
- in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
- }
- }
- // Do the two transform/transpose passes
- for (pass = 0; pass < 2; ++pass) {
- // Transform 1/2: Add/subtract
- const __m128i r0 = _mm_add_epi16(in0, in3);
- const __m128i r1 = _mm_add_epi16(in1, in2);
- const __m128i r2 = _mm_sub_epi16(in1, in2);
- const __m128i r3 = _mm_sub_epi16(in0, in3);
- // Transform 1/2: Interleave to do the multiply by constants which gets us
- // into 32 bits.
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
- const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- // Combine and transpose
- const __m128i res0 = _mm_packs_epi32(w0, w2);
- const __m128i res1 = _mm_packs_epi32(w4, w6);
- // 00 01 02 03 20 21 22 23
- // 10 11 12 13 30 31 32 33
- const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
- const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- // 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1
- // 02 12 22 32 03 13 23 33 in2 contains 2 followed by 3
- if (0 == pass) {
- // Extract values in the high part for second pass as transform code
- // only uses the first four values.
- in1 = _mm_unpackhi_epi64(in0, in0);
- in3 = _mm_unpackhi_epi64(in2, in2);
- } else {
- // Post-condition output and store it (v + 1) >> 2, taking advantage
- // of the fact 1/3 are stored just after 0/2.
- __m128i out01 = _mm_add_epi16(in0, kOne);
- __m128i out23 = _mm_add_epi16(in2, kOne);
- out01 = _mm_srai_epi16(out01, 2);
- out23 = _mm_srai_epi16(out23, 2);
- _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
- _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
- }
- }
-}
-
-static INLINE void load_buffer_4x4_avx2(const int16_t *input, __m128i *in,
- int stride) {
- const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
- const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
- __m128i mask;
-
- in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
- in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
- in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
- in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
-
- in[0] = _mm_slli_epi16(in[0], 4);
- in[1] = _mm_slli_epi16(in[1], 4);
- in[2] = _mm_slli_epi16(in[2], 4);
- in[3] = _mm_slli_epi16(in[3], 4);
-
- mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a);
- in[0] = _mm_add_epi16(in[0], mask);
- in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b);
-}
-
-static INLINE void write_buffer_4x4_avx2(int16_t *output, __m128i *res) {
- const __m128i kOne = _mm_set1_epi16(1);
- __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
- __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
- __m128i out01 = _mm_add_epi16(in01, kOne);
- __m128i out23 = _mm_add_epi16(in23, kOne);
- out01 = _mm_srai_epi16(out01, 2);
- out23 = _mm_srai_epi16(out23, 2);
- _mm_store_si128((__m128i *)(output + 0 * 8), out01);
- _mm_store_si128((__m128i *)(output + 1 * 8), out23);
-}
-
-static INLINE void transpose_4x4_avx2(__m128i *res) {
- // Combine and transpose
- // 00 01 02 03 20 21 22 23
- // 10 11 12 13 30 31 32 33
- const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
- const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
-
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
- res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
-
- // 00 10 20 30 01 11 21 31
- // 02 12 22 32 03 13 23 33
- // only use the first 4 16-bit integers
- res[1] = _mm_unpackhi_epi64(res[0], res[0]);
- res[3] = _mm_unpackhi_epi64(res[2], res[2]);
-}
-
-void fdct4_avx2(__m128i *in) {
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- __m128i u[4], v[4];
- u[0]=_mm_unpacklo_epi16(in[0], in[1]);
- u[1]=_mm_unpacklo_epi16(in[3], in[2]);
-
- v[0] = _mm_add_epi16(u[0], u[1]);
- v[1] = _mm_sub_epi16(u[0], u[1]);
-
- u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0
- u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2
- u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1
- u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
- in[0] = _mm_packs_epi32(u[0], u[1]);
- in[1] = _mm_packs_epi32(u[2], u[3]);
- transpose_4x4_avx2(in);
-}
-
-void fadst4_avx2(__m128i *in) {
- const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9);
- const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9);
- const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9);
- const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9);
- const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9);
- const __m128i kZero = _mm_set1_epi16(0);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i u[8], v[8];
- __m128i in7 = _mm_add_epi16(in[0], in[1]);
-
- u[0] = _mm_unpacklo_epi16(in[0], in[1]);
- u[1] = _mm_unpacklo_epi16(in[2], in[3]);
- u[2] = _mm_unpacklo_epi16(in7, kZero);
- u[3] = _mm_unpacklo_epi16(in[2], kZero);
- u[4] = _mm_unpacklo_epi16(in[3], kZero);
-
- v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2
- v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5
- v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1
- v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3
- v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6
- v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4
- v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
-
- u[0] = _mm_add_epi32(v[0], v[1]);
- u[1] = _mm_sub_epi32(v[2], v[6]);
- u[2] = _mm_add_epi32(v[3], v[4]);
- u[3] = _mm_sub_epi32(u[2], u[0]);
- u[4] = _mm_slli_epi32(v[5], 2);
- u[5] = _mm_sub_epi32(u[4], v[5]);
- u[6] = _mm_add_epi32(u[3], u[5]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-
- in[0] = _mm_packs_epi32(u[0], u[2]);
- in[1] = _mm_packs_epi32(u[1], u[3]);
- transpose_4x4_avx2(in);
-}
-
-void vp9_fht4x4_avx2(const int16_t *input, int16_t *output,
- int stride, int tx_type) {
- __m128i in[4];
-
- switch (tx_type) {
- case DCT_DCT:
- vp9_fdct4x4_avx2(input, output, stride);
- break;
- case ADST_DCT:
- load_buffer_4x4_avx2(input, in, stride);
- fadst4_avx2(in);
- fdct4_avx2(in);
- write_buffer_4x4_avx2(output, in);
- break;
- case DCT_ADST:
- load_buffer_4x4_avx2(input, in, stride);
- fdct4_avx2(in);
- fadst4_avx2(in);
- write_buffer_4x4_avx2(output, in);
- break;
- case ADST_ADST:
- load_buffer_4x4_avx2(input, in, stride);
- fadst4_avx2(in);
- fadst4_avx2(in);
- write_buffer_4x4_avx2(output, in);
- break;
- default:
- assert(0);
- break;
- }
-}
-
-void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) {
- int pass;
- // Constants
- // When we use them, in one case, they are all the same. In all others
- // it's a pair of them that we need to repeat four times. This is done
- // by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- // Load input
- __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride));
- __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride));
- __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride));
- __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride));
- __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride));
- __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride));
- __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride));
- __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride));
- // Pre-condition input (shift by two)
- in0 = _mm_slli_epi16(in0, 2);
- in1 = _mm_slli_epi16(in1, 2);
- in2 = _mm_slli_epi16(in2, 2);
- in3 = _mm_slli_epi16(in3, 2);
- in4 = _mm_slli_epi16(in4, 2);
- in5 = _mm_slli_epi16(in5, 2);
- in6 = _mm_slli_epi16(in6, 2);
- in7 = _mm_slli_epi16(in7, 2);
-
- // We do two passes, first the columns, then the rows. The results of the
- // first pass are transposed so that the same column code can be reused. The
- // results of the second pass are also transposed so that the rows (processed
- // as columns) are put back in row positions.
- for (pass = 0; pass < 2; pass++) {
- // To store results of each pass before the transpose.
- __m128i res0, res1, res2, res3, res4, res5, res6, res7;
- // Add/subtract
- const __m128i q0 = _mm_add_epi16(in0, in7);
- const __m128i q1 = _mm_add_epi16(in1, in6);
- const __m128i q2 = _mm_add_epi16(in2, in5);
- const __m128i q3 = _mm_add_epi16(in3, in4);
- const __m128i q4 = _mm_sub_epi16(in3, in4);
- const __m128i q5 = _mm_sub_epi16(in2, in5);
- const __m128i q6 = _mm_sub_epi16(in1, in6);
- const __m128i q7 = _mm_sub_epi16(in0, in7);
- // Work on first four results
- {
- // Add/subtract
- const __m128i r0 = _mm_add_epi16(q0, q3);
- const __m128i r1 = _mm_add_epi16(q1, q2);
- const __m128i r2 = _mm_sub_epi16(q1, q2);
- const __m128i r3 = _mm_sub_epi16(q0, q3);
- // Interleave to do the multiply by constants which gets us into 32bits
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
- const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
- const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
- const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res0 = _mm_packs_epi32(w0, w1);
- res4 = _mm_packs_epi32(w2, w3);
- res2 = _mm_packs_epi32(w4, w5);
- res6 = _mm_packs_epi32(w6, w7);
- }
- // Work on next four results
- {
- // Interleave to do the multiply by constants which gets us into 32 bits
- const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
- const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
- const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
- const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
- const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
- const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
- const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
- const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
- const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
- const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
- const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
- const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
- const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
- // Combine
- const __m128i r0 = _mm_packs_epi32(s0, s1);
- const __m128i r1 = _mm_packs_epi32(s2, s3);
- // Add/subtract
- const __m128i x0 = _mm_add_epi16(q4, r0);
- const __m128i x1 = _mm_sub_epi16(q4, r0);
- const __m128i x2 = _mm_sub_epi16(q7, r1);
- const __m128i x3 = _mm_add_epi16(q7, r1);
- // Interleave to do the multiply by constants which gets us into 32 bits
- const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
- const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
- const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
- const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res1 = _mm_packs_epi32(w0, w1);
- res7 = _mm_packs_epi32(w2, w3);
- res5 = _mm_packs_epi32(w4, w5);
- res3 = _mm_packs_epi32(w6, w7);
- }
- // Transpose the 8x8.
- {
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- // 40 41 42 43 44 45 46 47
- // 50 51 52 53 54 55 56 57
- // 60 61 62 63 64 65 66 67
- // 70 71 72 73 74 75 76 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
- const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
- const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
- const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
- const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
- const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
- const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
- const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 25 35
- // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
- in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
- in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
- in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
- in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
- in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
- in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
- in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- }
- }
- // Post-condition output and store it
- {
- // Post-condition (division by two)
- // dividing a 16-bit signed number by two using shifts:
- // n / 2 = (n - (n >> 15)) >> 1
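- // e.g. n = -5: n >> 15 = -1, so (-5 - (-1)) >> 1 = -4 >> 1 = -2,
- // matching C truncating division (-5 / 2 == -2), whereas a plain
- // arithmetic shift would give -5 >> 1 == -3.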
- const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
- const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
- const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
- const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
- const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
- const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
- const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
- const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
- in0 = _mm_sub_epi16(in0, sign_in0);
- in1 = _mm_sub_epi16(in1, sign_in1);
- in2 = _mm_sub_epi16(in2, sign_in2);
- in3 = _mm_sub_epi16(in3, sign_in3);
- in4 = _mm_sub_epi16(in4, sign_in4);
- in5 = _mm_sub_epi16(in5, sign_in5);
- in6 = _mm_sub_epi16(in6, sign_in6);
- in7 = _mm_sub_epi16(in7, sign_in7);
- in0 = _mm_srai_epi16(in0, 1);
- in1 = _mm_srai_epi16(in1, 1);
- in2 = _mm_srai_epi16(in2, 1);
- in3 = _mm_srai_epi16(in3, 1);
- in4 = _mm_srai_epi16(in4, 1);
- in5 = _mm_srai_epi16(in5, 1);
- in6 = _mm_srai_epi16(in6, 1);
- in7 = _mm_srai_epi16(in7, 1);
- // store results
- _mm_store_si128((__m128i *)(output + 0 * 8), in0);
- _mm_store_si128((__m128i *)(output + 1 * 8), in1);
- _mm_store_si128((__m128i *)(output + 2 * 8), in2);
- _mm_store_si128((__m128i *)(output + 3 * 8), in3);
- _mm_store_si128((__m128i *)(output + 4 * 8), in4);
- _mm_store_si128((__m128i *)(output + 5 * 8), in5);
- _mm_store_si128((__m128i *)(output + 6 * 8), in6);
- _mm_store_si128((__m128i *)(output + 7 * 8), in7);
- }
-}
-
-// load 8x8 array
-static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m128i *in,
- int stride) {
- in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride));
- in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride));
- in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride));
- in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride));
- in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride));
- in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride));
- in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride));
- in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride));
-
- in[0] = _mm_slli_epi16(in[0], 2);
- in[1] = _mm_slli_epi16(in[1], 2);
- in[2] = _mm_slli_epi16(in[2], 2);
- in[3] = _mm_slli_epi16(in[3], 2);
- in[4] = _mm_slli_epi16(in[4], 2);
- in[5] = _mm_slli_epi16(in[5], 2);
- in[6] = _mm_slli_epi16(in[6], 2);
- in[7] = _mm_slli_epi16(in[7], 2);
-}
-
-// right shift and rounding
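-// computes res[i] = (res[i] + rnd - (res[i] >> 15)) >> bit, where
-// rnd = 1 << (bit - 2) if bit >= 2 and 0 otherwise; subtracting the
-// sign makes the shift truncate toward zero, as in the 8x8 path above.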
-static INLINE void right_shift_8x8_avx2(__m128i *res, int const bit) {
- const __m128i kOne = _mm_set1_epi16(1);
- const int bit_m02 = bit - 2;
- __m128i sign0 = _mm_srai_epi16(res[0], 15);
- __m128i sign1 = _mm_srai_epi16(res[1], 15);
- __m128i sign2 = _mm_srai_epi16(res[2], 15);
- __m128i sign3 = _mm_srai_epi16(res[3], 15);
- __m128i sign4 = _mm_srai_epi16(res[4], 15);
- __m128i sign5 = _mm_srai_epi16(res[5], 15);
- __m128i sign6 = _mm_srai_epi16(res[6], 15);
- __m128i sign7 = _mm_srai_epi16(res[7], 15);
-
- if (bit_m02 >= 0) {
- __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02);
- res[0] = _mm_add_epi16(res[0], k_const_rounding);
- res[1] = _mm_add_epi16(res[1], k_const_rounding);
- res[2] = _mm_add_epi16(res[2], k_const_rounding);
- res[3] = _mm_add_epi16(res[3], k_const_rounding);
- res[4] = _mm_add_epi16(res[4], k_const_rounding);
- res[5] = _mm_add_epi16(res[5], k_const_rounding);
- res[6] = _mm_add_epi16(res[6], k_const_rounding);
- res[7] = _mm_add_epi16(res[7], k_const_rounding);
- }
-
- res[0] = _mm_sub_epi16(res[0], sign0);
- res[1] = _mm_sub_epi16(res[1], sign1);
- res[2] = _mm_sub_epi16(res[2], sign2);
- res[3] = _mm_sub_epi16(res[3], sign3);
- res[4] = _mm_sub_epi16(res[4], sign4);
- res[5] = _mm_sub_epi16(res[5], sign5);
- res[6] = _mm_sub_epi16(res[6], sign6);
- res[7] = _mm_sub_epi16(res[7], sign7);
-
- res[0] = _mm_srai_epi16(res[0], bit);
- res[1] = _mm_srai_epi16(res[1], bit);
- res[2] = _mm_srai_epi16(res[2], bit);
- res[3] = _mm_srai_epi16(res[3], bit);
- res[4] = _mm_srai_epi16(res[4], bit);
- res[5] = _mm_srai_epi16(res[5], bit);
- res[6] = _mm_srai_epi16(res[6], bit);
- res[7] = _mm_srai_epi16(res[7], bit);
-}
-
-// write 8x8 array
-static INLINE void write_buffer_8x8_avx2(int16_t *output, __m128i *res,
- int stride) {
- _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);
- _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);
- _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
- _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
- _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
- _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
- _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
- _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
-}
-
-// perform 8x8 transpose; safe to call in-place (res == in)
-static INLINE void array_transpose_8x8_avx2(__m128i *in, __m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
- const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
- const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
- const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
- const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 25 35
- // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1);
- res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1);
- res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3);
- res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3);
- res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5);
- res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5);
- res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7);
- res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
-}
-
-void fdct8_avx2(__m128i *in) {
- // constants
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- __m128i u0, u1, u2, u3, u4, u5, u6, u7;
- __m128i v0, v1, v2, v3, v4, v5, v6, v7;
- __m128i s0, s1, s2, s3, s4, s5, s6, s7;
-
- // stage 1
- s0 = _mm_add_epi16(in[0], in[7]);
- s1 = _mm_add_epi16(in[1], in[6]);
- s2 = _mm_add_epi16(in[2], in[5]);
- s3 = _mm_add_epi16(in[3], in[4]);
- s4 = _mm_sub_epi16(in[3], in[4]);
- s5 = _mm_sub_epi16(in[2], in[5]);
- s6 = _mm_sub_epi16(in[1], in[6]);
- s7 = _mm_sub_epi16(in[0], in[7]);
-
- u0 = _mm_add_epi16(s0, s3);
- u1 = _mm_add_epi16(s1, s2);
- u2 = _mm_sub_epi16(s1, s2);
- u3 = _mm_sub_epi16(s0, s3);
- // interleave and perform butterfly multiplication/addition
- v0 = _mm_unpacklo_epi16(u0, u1);
- v1 = _mm_unpackhi_epi16(u0, u1);
- v2 = _mm_unpacklo_epi16(u2, u3);
- v3 = _mm_unpackhi_epi16(u2, u3);
-
- u0 = _mm_madd_epi16(v0, k__cospi_p16_p16);
- u1 = _mm_madd_epi16(v1, k__cospi_p16_p16);
- u2 = _mm_madd_epi16(v0, k__cospi_p16_m16);
- u3 = _mm_madd_epi16(v1, k__cospi_p16_m16);
- u4 = _mm_madd_epi16(v2, k__cospi_p24_p08);
- u5 = _mm_madd_epi16(v3, k__cospi_p24_p08);
- u6 = _mm_madd_epi16(v2, k__cospi_m08_p24);
- u7 = _mm_madd_epi16(v3, k__cospi_m08_p24);
-
- // shift and rounding
- v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
-
- u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
- in[0] = _mm_packs_epi32(u0, u1);
- in[2] = _mm_packs_epi32(u4, u5);
- in[4] = _mm_packs_epi32(u2, u3);
- in[6] = _mm_packs_epi32(u6, u7);
-
- // stage 2
- // interleave and perform butterfly multiplication/addition
- u0 = _mm_unpacklo_epi16(s6, s5);
- u1 = _mm_unpackhi_epi16(s6, s5);
- v0 = _mm_madd_epi16(u0, k__cospi_p16_m16);
- v1 = _mm_madd_epi16(u1, k__cospi_p16_m16);
- v2 = _mm_madd_epi16(u0, k__cospi_p16_p16);
- v3 = _mm_madd_epi16(u1, k__cospi_p16_p16);
-
- // shift and rounding
- u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
- u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
- u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
- u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
-
- v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
- v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
- v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
- v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
-
- u0 = _mm_packs_epi32(v0, v1);
- u1 = _mm_packs_epi32(v2, v3);
-
- // stage 3
- s0 = _mm_add_epi16(s4, u0);
- s1 = _mm_sub_epi16(s4, u0);
- s2 = _mm_sub_epi16(s7, u1);
- s3 = _mm_add_epi16(s7, u1);
-
- // stage 4
- u0 = _mm_unpacklo_epi16(s0, s3);
- u1 = _mm_unpackhi_epi16(s0, s3);
- u2 = _mm_unpacklo_epi16(s1, s2);
- u3 = _mm_unpackhi_epi16(s1, s2);
-
- v0 = _mm_madd_epi16(u0, k__cospi_p28_p04);
- v1 = _mm_madd_epi16(u1, k__cospi_p28_p04);
- v2 = _mm_madd_epi16(u2, k__cospi_p12_p20);
- v3 = _mm_madd_epi16(u3, k__cospi_p12_p20);
- v4 = _mm_madd_epi16(u2, k__cospi_m20_p12);
- v5 = _mm_madd_epi16(u3, k__cospi_m20_p12);
- v6 = _mm_madd_epi16(u0, k__cospi_m04_p28);
- v7 = _mm_madd_epi16(u1, k__cospi_m04_p28);
-
- // shift and rounding
- u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
- u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
- u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
- u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
- u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
- u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
- u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
- u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
- v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
- v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
- v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
- v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
- v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
- v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
- v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
- v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
- in[1] = _mm_packs_epi32(v0, v1);
- in[3] = _mm_packs_epi32(v4, v5);
- in[5] = _mm_packs_epi32(v2, v3);
- in[7] = _mm_packs_epi32(v6, v7);
-
- // transpose
- array_transpose_8x8_avx2(in, in);
-}
-
-void fadst8_avx2(__m128i *in) {
- // Constants
- const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64);
- const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64);
- const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64);
- const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64);
- const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64);
- const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64);
- const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__const_0 = _mm_set1_epi16(0);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15;
- __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15;
- __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15;
- __m128i s0, s1, s2, s3, s4, s5, s6, s7;
- __m128i in0, in1, in2, in3, in4, in5, in6, in7;
-
- // reorder the inputs to feed the butterfly stages
- in0 = in[7];
- in1 = in[0];
- in2 = in[5];
- in3 = in[2];
- in4 = in[3];
- in5 = in[4];
- in6 = in[1];
- in7 = in[6];
-
- // column transformation
- // stage 1
- // interleave and multiply/add into 32-bit integers
- s0 = _mm_unpacklo_epi16(in0, in1);
- s1 = _mm_unpackhi_epi16(in0, in1);
- s2 = _mm_unpacklo_epi16(in2, in3);
- s3 = _mm_unpackhi_epi16(in2, in3);
- s4 = _mm_unpacklo_epi16(in4, in5);
- s5 = _mm_unpackhi_epi16(in4, in5);
- s6 = _mm_unpacklo_epi16(in6, in7);
- s7 = _mm_unpackhi_epi16(in6, in7);
-
- u0 = _mm_madd_epi16(s0, k__cospi_p02_p30);
- u1 = _mm_madd_epi16(s1, k__cospi_p02_p30);
- u2 = _mm_madd_epi16(s0, k__cospi_p30_m02);
- u3 = _mm_madd_epi16(s1, k__cospi_p30_m02);
- u4 = _mm_madd_epi16(s2, k__cospi_p10_p22);
- u5 = _mm_madd_epi16(s3, k__cospi_p10_p22);
- u6 = _mm_madd_epi16(s2, k__cospi_p22_m10);
- u7 = _mm_madd_epi16(s3, k__cospi_p22_m10);
- u8 = _mm_madd_epi16(s4, k__cospi_p18_p14);
- u9 = _mm_madd_epi16(s5, k__cospi_p18_p14);
- u10 = _mm_madd_epi16(s4, k__cospi_p14_m18);
- u11 = _mm_madd_epi16(s5, k__cospi_p14_m18);
- u12 = _mm_madd_epi16(s6, k__cospi_p26_p06);
- u13 = _mm_madd_epi16(s7, k__cospi_p26_p06);
- u14 = _mm_madd_epi16(s6, k__cospi_p06_m26);
- u15 = _mm_madd_epi16(s7, k__cospi_p06_m26);
-
- // addition
- w0 = _mm_add_epi32(u0, u8);
- w1 = _mm_add_epi32(u1, u9);
- w2 = _mm_add_epi32(u2, u10);
- w3 = _mm_add_epi32(u3, u11);
- w4 = _mm_add_epi32(u4, u12);
- w5 = _mm_add_epi32(u5, u13);
- w6 = _mm_add_epi32(u6, u14);
- w7 = _mm_add_epi32(u7, u15);
- w8 = _mm_sub_epi32(u0, u8);
- w9 = _mm_sub_epi32(u1, u9);
- w10 = _mm_sub_epi32(u2, u10);
- w11 = _mm_sub_epi32(u3, u11);
- w12 = _mm_sub_epi32(u4, u12);
- w13 = _mm_sub_epi32(u5, u13);
- w14 = _mm_sub_epi32(u6, u14);
- w15 = _mm_sub_epi32(u7, u15);
-
- // shift and rounding
- v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
- v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
- v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
- v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
- v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
- v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
- v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
- v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
- v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
- v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
- v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
- v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING);
- v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING);
- v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING);
- v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
- v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
-
- u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
- u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
- u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
- u11 = _mm_srai_epi32(v11, DCT_CONST_BITS);
- u12 = _mm_srai_epi32(v12, DCT_CONST_BITS);
- u13 = _mm_srai_epi32(v13, DCT_CONST_BITS);
- u14 = _mm_srai_epi32(v14, DCT_CONST_BITS);
- u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
-
- // back to 16-bit and pack 8 integers into __m128i
- in[0] = _mm_packs_epi32(u0, u1);
- in[1] = _mm_packs_epi32(u2, u3);
- in[2] = _mm_packs_epi32(u4, u5);
- in[3] = _mm_packs_epi32(u6, u7);
- in[4] = _mm_packs_epi32(u8, u9);
- in[5] = _mm_packs_epi32(u10, u11);
- in[6] = _mm_packs_epi32(u12, u13);
- in[7] = _mm_packs_epi32(u14, u15);
-
- // stage 2
- s0 = _mm_add_epi16(in[0], in[2]);
- s1 = _mm_add_epi16(in[1], in[3]);
- s2 = _mm_sub_epi16(in[0], in[2]);
- s3 = _mm_sub_epi16(in[1], in[3]);
- u0 = _mm_unpacklo_epi16(in[4], in[5]);
- u1 = _mm_unpackhi_epi16(in[4], in[5]);
- u2 = _mm_unpacklo_epi16(in[6], in[7]);
- u3 = _mm_unpackhi_epi16(in[6], in[7]);
-
- v0 = _mm_madd_epi16(u0, k__cospi_p08_p24);
- v1 = _mm_madd_epi16(u1, k__cospi_p08_p24);
- v2 = _mm_madd_epi16(u0, k__cospi_p24_m08);
- v3 = _mm_madd_epi16(u1, k__cospi_p24_m08);
- v4 = _mm_madd_epi16(u2, k__cospi_m24_p08);
- v5 = _mm_madd_epi16(u3, k__cospi_m24_p08);
- v6 = _mm_madd_epi16(u2, k__cospi_p08_p24);
- v7 = _mm_madd_epi16(u3, k__cospi_p08_p24);
-
- w0 = _mm_add_epi32(v0, v4);
- w1 = _mm_add_epi32(v1, v5);
- w2 = _mm_add_epi32(v2, v6);
- w3 = _mm_add_epi32(v3, v7);
- w4 = _mm_sub_epi32(v0, v4);
- w5 = _mm_sub_epi32(v1, v5);
- w6 = _mm_sub_epi32(v2, v6);
- w7 = _mm_sub_epi32(v3, v7);
-
- v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
- v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
- v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
- v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
- v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
- v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
- v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
- v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
-
- u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
-
- // back to 16-bit integers
- s4 = _mm_packs_epi32(u0, u1);
- s5 = _mm_packs_epi32(u2, u3);
- s6 = _mm_packs_epi32(u4, u5);
- s7 = _mm_packs_epi32(u6, u7);
-
- // stage 3
- u0 = _mm_unpacklo_epi16(s2, s3);
- u1 = _mm_unpackhi_epi16(s2, s3);
- u2 = _mm_unpacklo_epi16(s6, s7);
- u3 = _mm_unpackhi_epi16(s6, s7);
-
- v0 = _mm_madd_epi16(u0, k__cospi_p16_p16);
- v1 = _mm_madd_epi16(u1, k__cospi_p16_p16);
- v2 = _mm_madd_epi16(u0, k__cospi_p16_m16);
- v3 = _mm_madd_epi16(u1, k__cospi_p16_m16);
- v4 = _mm_madd_epi16(u2, k__cospi_p16_p16);
- v5 = _mm_madd_epi16(u3, k__cospi_p16_p16);
- v6 = _mm_madd_epi16(u2, k__cospi_p16_m16);
- v7 = _mm_madd_epi16(u3, k__cospi_p16_m16);
-
- u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
- u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
- u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
- u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
- u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
- u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
- u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
- u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
-
- v0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
- v1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
- v2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
- v3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
- v4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
- v5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
- v6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
- v7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
-
- s2 = _mm_packs_epi32(v0, v1);
- s3 = _mm_packs_epi32(v2, v3);
- s6 = _mm_packs_epi32(v4, v5);
- s7 = _mm_packs_epi32(v6, v7);
-
- // FIXME(jingning): do subtract using bit inversion?
- in[0] = s0;
- in[1] = _mm_sub_epi16(k__const_0, s4);
- in[2] = s6;
- in[3] = _mm_sub_epi16(k__const_0, s2);
- in[4] = s3;
- in[5] = _mm_sub_epi16(k__const_0, s7);
- in[6] = s5;
- in[7] = _mm_sub_epi16(k__const_0, s1);
-
- // transpose
- array_transpose_8x8_avx2(in, in);
-}
-
-void vp9_fht8x8_avx2(const int16_t *input, int16_t *output,
- int stride, int tx_type) {
- __m128i in[8];
-
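- // fdct8_avx2/fadst8_avx2 each transform columns and then transpose, so
- // in the hybrid cases the first call is the vertical transform and the
- // second the horizontal one (e.g. ADST_DCT = vertical ADST, then
- // horizontal DCT).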
- switch (tx_type) {
- case DCT_DCT:
- vp9_fdct8x8_avx2(input, output, stride);
- break;
- case ADST_DCT:
- load_buffer_8x8_avx2(input, in, stride);
- fadst8_avx2(in);
- fdct8_avx2(in);
- right_shift_8x8_avx2(in, 1);
- write_buffer_8x8_avx2(output, in, 8);
- break;
- case DCT_ADST:
- load_buffer_8x8_avx2(input, in, stride);
- fdct8_avx2(in);
- fadst8_avx2(in);
- right_shift_8x8_avx2(in, 1);
- write_buffer_8x8_avx2(output, in, 8);
- break;
- case ADST_ADST:
- load_buffer_8x8_avx2(input, in, stride);
- fadst8_avx2(in);
- fadst8_avx2(in);
- right_shift_8x8_avx2(in, 1);
- write_buffer_8x8_avx2(output, in, 8);
- break;
- default:
- assert(0);
- break;
- }
-}
-
-void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) {
- // The 2D transform is done with two passes which are actually pretty
- // similar. In the first one, we transform the columns and transpose
- // the results. In the second one, we transform the rows (which the
- // first transpose left stored as columns) and transpose the results
- // again so that the output lands back in row order.
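- // Pass 0 also pre-scales the input by 4 (x = x << 2) for precision;
- // pass 1 rounds the intermediate back down (x = (x + 1) >> 2) before
- // transforming the rows, as the two load branches below show.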
- int pass;
- // We need an intermediate buffer between passes.
- DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256);
- const int16_t *in = input;
- int16_t *out = intermediate;
- // Constants
- // In one case all eight 16-bit lanes hold the same value; in all the
- // others a pair of constants is repeated four times, built by
- // constructing the 32-bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
- const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
- const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
- const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
- const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
- const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
- const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kOne = _mm_set1_epi16(1);
- // Do the two transform/transpose passes
- for (pass = 0; pass < 2; ++pass) {
- // We process eight columns (transposed rows in second pass) at a time.
- int column_start;
- for (column_start = 0; column_start < 16; column_start += 8) {
- __m128i in00, in01, in02, in03, in04, in05, in06, in07;
- __m128i in08, in09, in10, in11, in12, in13, in14, in15;
- __m128i input0, input1, input2, input3, input4, input5, input6, input7;
- __m128i step1_0, step1_1, step1_2, step1_3;
- __m128i step1_4, step1_5, step1_6, step1_7;
- __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
- __m128i step3_0, step3_1, step3_2, step3_3;
- __m128i step3_4, step3_5, step3_6, step3_7;
- __m128i res00, res01, res02, res03, res04, res05, res06, res07;
- __m128i res08, res09, res10, res11, res12, res13, res14, res15;
- // Load and pre-condition input.
- if (0 == pass) {
- in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
- in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
- in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
- in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
- in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
- in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
- in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
- in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
- in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
- in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
- in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
- in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
- in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
- in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
- in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
- in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
- // x = x << 2
- in00 = _mm_slli_epi16(in00, 2);
- in01 = _mm_slli_epi16(in01, 2);
- in02 = _mm_slli_epi16(in02, 2);
- in03 = _mm_slli_epi16(in03, 2);
- in04 = _mm_slli_epi16(in04, 2);
- in05 = _mm_slli_epi16(in05, 2);
- in06 = _mm_slli_epi16(in06, 2);
- in07 = _mm_slli_epi16(in07, 2);
- in08 = _mm_slli_epi16(in08, 2);
- in09 = _mm_slli_epi16(in09, 2);
- in10 = _mm_slli_epi16(in10, 2);
- in11 = _mm_slli_epi16(in11, 2);
- in12 = _mm_slli_epi16(in12, 2);
- in13 = _mm_slli_epi16(in13, 2);
- in14 = _mm_slli_epi16(in14, 2);
- in15 = _mm_slli_epi16(in15, 2);
- } else {
- in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
- in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
- in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
- in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
- in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
- in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
- in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
- in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
- in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
- in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
- in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
- in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
- in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
- in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
- in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
- in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
- // x = (x + 1) >> 2
- in00 = _mm_add_epi16(in00, kOne);
- in01 = _mm_add_epi16(in01, kOne);
- in02 = _mm_add_epi16(in02, kOne);
- in03 = _mm_add_epi16(in03, kOne);
- in04 = _mm_add_epi16(in04, kOne);
- in05 = _mm_add_epi16(in05, kOne);
- in06 = _mm_add_epi16(in06, kOne);
- in07 = _mm_add_epi16(in07, kOne);
- in08 = _mm_add_epi16(in08, kOne);
- in09 = _mm_add_epi16(in09, kOne);
- in10 = _mm_add_epi16(in10, kOne);
- in11 = _mm_add_epi16(in11, kOne);
- in12 = _mm_add_epi16(in12, kOne);
- in13 = _mm_add_epi16(in13, kOne);
- in14 = _mm_add_epi16(in14, kOne);
- in15 = _mm_add_epi16(in15, kOne);
- in00 = _mm_srai_epi16(in00, 2);
- in01 = _mm_srai_epi16(in01, 2);
- in02 = _mm_srai_epi16(in02, 2);
- in03 = _mm_srai_epi16(in03, 2);
- in04 = _mm_srai_epi16(in04, 2);
- in05 = _mm_srai_epi16(in05, 2);
- in06 = _mm_srai_epi16(in06, 2);
- in07 = _mm_srai_epi16(in07, 2);
- in08 = _mm_srai_epi16(in08, 2);
- in09 = _mm_srai_epi16(in09, 2);
- in10 = _mm_srai_epi16(in10, 2);
- in11 = _mm_srai_epi16(in11, 2);
- in12 = _mm_srai_epi16(in12, 2);
- in13 = _mm_srai_epi16(in13, 2);
- in14 = _mm_srai_epi16(in14, 2);
- in15 = _mm_srai_epi16(in15, 2);
- }
- in += 8;
- // Calculate input for the first 8 results.
- {
- input0 = _mm_add_epi16(in00, in15);
- input1 = _mm_add_epi16(in01, in14);
- input2 = _mm_add_epi16(in02, in13);
- input3 = _mm_add_epi16(in03, in12);
- input4 = _mm_add_epi16(in04, in11);
- input5 = _mm_add_epi16(in05, in10);
- input6 = _mm_add_epi16(in06, in09);
- input7 = _mm_add_epi16(in07, in08);
- }
- // Calculate input for the next 8 results.
- {
- step1_0 = _mm_sub_epi16(in07, in08);
- step1_1 = _mm_sub_epi16(in06, in09);
- step1_2 = _mm_sub_epi16(in05, in10);
- step1_3 = _mm_sub_epi16(in04, in11);
- step1_4 = _mm_sub_epi16(in03, in12);
- step1_5 = _mm_sub_epi16(in02, in13);
- step1_6 = _mm_sub_epi16(in01, in14);
- step1_7 = _mm_sub_epi16(in00, in15);
- }
- // Work on the first eight values; fdct8(input, even_results);
- {
- // Add/subtract
- const __m128i q0 = _mm_add_epi16(input0, input7);
- const __m128i q1 = _mm_add_epi16(input1, input6);
- const __m128i q2 = _mm_add_epi16(input2, input5);
- const __m128i q3 = _mm_add_epi16(input3, input4);
- const __m128i q4 = _mm_sub_epi16(input3, input4);
- const __m128i q5 = _mm_sub_epi16(input2, input5);
- const __m128i q6 = _mm_sub_epi16(input1, input6);
- const __m128i q7 = _mm_sub_epi16(input0, input7);
- // Work on first four results
- {
- // Add/subtract
- const __m128i r0 = _mm_add_epi16(q0, q3);
- const __m128i r1 = _mm_add_epi16(q1, q2);
- const __m128i r2 = _mm_sub_epi16(q1, q2);
- const __m128i r3 = _mm_sub_epi16(q0, q3);
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
- const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
- const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
- const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res00 = _mm_packs_epi32(w0, w1);
- res08 = _mm_packs_epi32(w2, w3);
- res04 = _mm_packs_epi32(w4, w5);
- res12 = _mm_packs_epi32(w6, w7);
- }
- // Work on next four results
- {
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
- const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
- const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
- const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
- const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
- const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
- const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
- const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
- const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
- const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
- const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
- const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
- const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
- // Combine
- const __m128i r0 = _mm_packs_epi32(s0, s1);
- const __m128i r1 = _mm_packs_epi32(s2, s3);
- // Add/subtract
- const __m128i x0 = _mm_add_epi16(q4, r0);
- const __m128i x1 = _mm_sub_epi16(q4, r0);
- const __m128i x2 = _mm_sub_epi16(q7, r1);
- const __m128i x3 = _mm_add_epi16(q7, r1);
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
- const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
- const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
- const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
- const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
- const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
- const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
- const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
- const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
- const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
- const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
- const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
- const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
- const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
- const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
- const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
- // Combine
- res02 = _mm_packs_epi32(w0, w1);
- res14 = _mm_packs_epi32(w2, w3);
- res10 = _mm_packs_epi32(w4, w5);
- res06 = _mm_packs_epi32(w6, w7);
- }
- }
- // Work on the next eight values; step1 -> odd_results
- {
- // step 2
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
- const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
- const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
- const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- step2_2 = _mm_packs_epi32(w0, w1);
- step2_3 = _mm_packs_epi32(w2, w3);
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
- const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
- const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
- const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- step2_5 = _mm_packs_epi32(w0, w1);
- step2_4 = _mm_packs_epi32(w2, w3);
- }
- // step 3
- {
- step3_0 = _mm_add_epi16(step1_0, step2_3);
- step3_1 = _mm_add_epi16(step1_1, step2_2);
- step3_2 = _mm_sub_epi16(step1_1, step2_2);
- step3_3 = _mm_sub_epi16(step1_0, step2_3);
- step3_4 = _mm_sub_epi16(step1_7, step2_4);
- step3_5 = _mm_sub_epi16(step1_6, step2_5);
- step3_6 = _mm_add_epi16(step1_6, step2_5);
- step3_7 = _mm_add_epi16(step1_7, step2_4);
- }
- // step 4
- {
- const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
- const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
- const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
- const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- step2_1 = _mm_packs_epi32(w0, w1);
- step2_2 = _mm_packs_epi32(w2, w3);
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
- const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
- const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
- const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- step2_6 = _mm_packs_epi32(w0, w1);
- step2_5 = _mm_packs_epi32(w2, w3);
- }
- // step 5
- {
- step1_0 = _mm_add_epi16(step3_0, step2_1);
- step1_1 = _mm_sub_epi16(step3_0, step2_1);
- step1_2 = _mm_sub_epi16(step3_3, step2_2);
- step1_3 = _mm_add_epi16(step3_3, step2_2);
- step1_4 = _mm_add_epi16(step3_4, step2_5);
- step1_5 = _mm_sub_epi16(step3_4, step2_5);
- step1_6 = _mm_sub_epi16(step3_7, step2_6);
- step1_7 = _mm_add_epi16(step3_7, step2_6);
- }
- // step 6
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
- const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
- const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
- const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- res01 = _mm_packs_epi32(w0, w1);
- res09 = _mm_packs_epi32(w2, w3);
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
- const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
- const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
- const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- res05 = _mm_packs_epi32(w0, w1);
- res13 = _mm_packs_epi32(w2, w3);
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
- const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
- const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
- const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- res11 = _mm_packs_epi32(w0, w1);
- res03 = _mm_packs_epi32(w2, w3);
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
- const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
- const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
- const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
- const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
- const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
- const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
- const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
- // dct_const_round_shift
- const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
- const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
- const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
- const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
- const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
- const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
- const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
- const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
- // Combine
- res15 = _mm_packs_epi32(w0, w1);
- res07 = _mm_packs_epi32(w2, w3);
- }
- }
- // Transpose the results, do it as two 8x8 transposes.
- {
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- // 40 41 42 43 44 45 46 47
- // 50 51 52 53 54 55 56 57
- // 60 61 62 63 64 65 66 67
- // 70 71 72 73 74 75 76 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
- const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
- const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
- const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
- const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
- const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
- const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
- const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 25 35
- // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
- const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
- const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
- const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
- const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
- const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
- const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
- const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
- _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
- _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
- _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
- _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
- _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
- _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
- _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
- }
- {
- // 00 01 02 03 04 05 06 07
- // 10 11 12 13 14 15 16 17
- // 20 21 22 23 24 25 26 27
- // 30 31 32 33 34 35 36 37
- // 40 41 42 43 44 45 46 47
- // 50 51 52 53 54 55 56 57
- // 60 61 62 63 64 65 66 67
- // 70 71 72 73 74 75 76 77
- const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
- const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
- const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
- const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
- const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
- const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
- const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
- const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
- // 00 10 01 11 02 12 03 13
- // 20 30 21 31 22 32 23 33
- // 04 14 05 15 06 16 07 17
- // 24 34 25 35 26 36 27 37
- // 40 50 41 51 42 52 43 53
- // 60 70 61 71 62 72 63 73
- // 44 54 45 55 46 56 47 57
- // 64 74 65 75 66 76 67 77
- const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
- const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
- const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
- const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
- const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
- const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
- const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
- const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
- // 00 10 20 30 01 11 21 31
- // 40 50 60 70 41 51 61 71
- // 02 12 22 32 03 13 23 33
- // 42 52 62 72 43 53 63 73
- // 04 14 24 34 05 15 25 35
- // 44 54 64 74 45 55 65 75
- // 06 16 26 36 07 17 27 37
- // 46 56 66 76 47 57 67 77
- const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
- const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
- const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
- const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
- const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
- const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
- const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
- const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
- // 00 10 20 30 40 50 60 70
- // 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72
- // 03 13 23 33 43 53 63 73
- // 04 14 24 34 44 54 64 74
- // 05 15 25 35 45 55 65 75
- // 06 16 26 36 46 56 66 76
- // 07 17 27 37 47 57 67 77
- // Store results
- _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
- _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
- _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
- _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
- _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
- _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
- _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
- _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
- }
- out += 8 * 16;
- }
- // Setup in/out for next pass.
- in = intermediate;
- out = output;
- }
-}
-
-static INLINE void load_buffer_16x16_avx2(const int16_t* input, __m128i *in0,
- __m128i *in1, int stride) {
- // load first 8 columns
- load_buffer_8x8_avx2(input, in0, stride);
- load_buffer_8x8_avx2(input + 8 * stride, in0 + 8, stride);
-
- input += 8;
- // load second 8 columns
- load_buffer_8x8_avx2(input, in1, stride);
- load_buffer_8x8_avx2(input + 8 * stride, in1 + 8, stride);
-}
-
-static INLINE void write_buffer_16x16_avx2(int16_t *output, __m128i *in0,
- __m128i *in1, int stride) {
- // write first 8 columns
- write_buffer_8x8_avx2(output, in0, stride);
- write_buffer_8x8_avx2(output + 8 * stride, in0 + 8, stride);
- // write second 8 columns
- output += 8;
- write_buffer_8x8_avx2(output, in1, stride);
- write_buffer_8x8_avx2(output + 8 * stride, in1 + 8, stride);
-}
-
-static INLINE void array_transpose_16x16_avx2(__m128i *res0, __m128i *res1) {
- __m128i tbuf[8];
- array_transpose_8x8_avx2(res0, res0);
- array_transpose_8x8_avx2(res1, tbuf);
- array_transpose_8x8_avx2(res0 + 8, res1);
- array_transpose_8x8_avx2(res1 + 8, res1 + 8);
-
- res0[8] = tbuf[0];
- res0[9] = tbuf[1];
- res0[10] = tbuf[2];
- res0[11] = tbuf[3];
- res0[12] = tbuf[4];
- res0[13] = tbuf[5];
- res0[14] = tbuf[6];
- res0[15] = tbuf[7];
-}
-
-static INLINE void right_shift_16x16_avx2(__m128i *res0, __m128i *res1) {
- // perform rounding operations
- right_shift_8x8_avx2(res0, 2);
- right_shift_8x8_avx2(res0 + 8, 2);
- right_shift_8x8_avx2(res1, 2);
- right_shift_8x8_avx2(res1 + 8, 2);
-}
-
-void fdct16_8col_avx2(__m128i *in) {
- // perform 16x16 1-D DCT for 8 columns
- __m128i i[8], s[8], p[8], t[8], u[16], v[16];
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
- const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
- const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
- const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
- const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
- const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
- const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-
- // stage 1
- i[0] = _mm_add_epi16(in[0], in[15]);
- i[1] = _mm_add_epi16(in[1], in[14]);
- i[2] = _mm_add_epi16(in[2], in[13]);
- i[3] = _mm_add_epi16(in[3], in[12]);
- i[4] = _mm_add_epi16(in[4], in[11]);
- i[5] = _mm_add_epi16(in[5], in[10]);
- i[6] = _mm_add_epi16(in[6], in[9]);
- i[7] = _mm_add_epi16(in[7], in[8]);
-
- s[0] = _mm_sub_epi16(in[7], in[8]);
- s[1] = _mm_sub_epi16(in[6], in[9]);
- s[2] = _mm_sub_epi16(in[5], in[10]);
- s[3] = _mm_sub_epi16(in[4], in[11]);
- s[4] = _mm_sub_epi16(in[3], in[12]);
- s[5] = _mm_sub_epi16(in[2], in[13]);
- s[6] = _mm_sub_epi16(in[1], in[14]);
- s[7] = _mm_sub_epi16(in[0], in[15]);
-
- p[0] = _mm_add_epi16(i[0], i[7]);
- p[1] = _mm_add_epi16(i[1], i[6]);
- p[2] = _mm_add_epi16(i[2], i[5]);
- p[3] = _mm_add_epi16(i[3], i[4]);
- p[4] = _mm_sub_epi16(i[3], i[4]);
- p[5] = _mm_sub_epi16(i[2], i[5]);
- p[6] = _mm_sub_epi16(i[1], i[6]);
- p[7] = _mm_sub_epi16(i[0], i[7]);
-
- u[0] = _mm_add_epi16(p[0], p[3]);
- u[1] = _mm_add_epi16(p[1], p[2]);
- u[2] = _mm_sub_epi16(p[1], p[2]);
- u[3] = _mm_sub_epi16(p[0], p[3]);
-
- v[0] = _mm_unpacklo_epi16(u[0], u[1]);
- v[1] = _mm_unpackhi_epi16(u[0], u[1]);
- v[2] = _mm_unpacklo_epi16(u[2], u[3]);
- v[3] = _mm_unpackhi_epi16(u[2], u[3]);
-
- u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16);
- u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16);
- u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16);
- u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16);
- u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08);
- u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08);
- u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24);
- u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-
- in[0] = _mm_packs_epi32(u[0], u[1]);
- in[4] = _mm_packs_epi32(u[4], u[5]);
- in[8] = _mm_packs_epi32(u[2], u[3]);
- in[12] = _mm_packs_epi32(u[6], u[7]);
-
- u[0] = _mm_unpacklo_epi16(p[5], p[6]);
- u[1] = _mm_unpackhi_epi16(p[5], p[6]);
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
-
- u[0] = _mm_packs_epi32(v[0], v[1]);
- u[1] = _mm_packs_epi32(v[2], v[3]);
-
- t[0] = _mm_add_epi16(p[4], u[0]);
- t[1] = _mm_sub_epi16(p[4], u[0]);
- t[2] = _mm_sub_epi16(p[7], u[1]);
- t[3] = _mm_add_epi16(p[7], u[1]);
-
- u[0] = _mm_unpacklo_epi16(t[0], t[3]);
- u[1] = _mm_unpackhi_epi16(t[0], t[3]);
- u[2] = _mm_unpacklo_epi16(t[1], t[2]);
- u[3] = _mm_unpackhi_epi16(t[1], t[2]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04);
- v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20);
- v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12);
- v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28);
- v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- in[2] = _mm_packs_epi32(v[0], v[1]);
- in[6] = _mm_packs_epi32(v[4], v[5]);
- in[10] = _mm_packs_epi32(v[2], v[3]);
- in[14] = _mm_packs_epi32(v[6], v[7]);
-
- // stage 2
- u[0] = _mm_unpacklo_epi16(s[2], s[5]);
- u[1] = _mm_unpackhi_epi16(s[2], s[5]);
- u[2] = _mm_unpacklo_epi16(s[3], s[4]);
- u[3] = _mm_unpackhi_epi16(s[3], s[4]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16);
- v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
- v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
- v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- t[2] = _mm_packs_epi32(v[0], v[1]);
- t[3] = _mm_packs_epi32(v[2], v[3]);
- t[4] = _mm_packs_epi32(v[4], v[5]);
- t[5] = _mm_packs_epi32(v[6], v[7]);
-
- // stage 3
- p[0] = _mm_add_epi16(s[0], t[3]);
- p[1] = _mm_add_epi16(s[1], t[2]);
- p[2] = _mm_sub_epi16(s[1], t[2]);
- p[3] = _mm_sub_epi16(s[0], t[3]);
- p[4] = _mm_sub_epi16(s[7], t[4]);
- p[5] = _mm_sub_epi16(s[6], t[5]);
- p[6] = _mm_add_epi16(s[6], t[5]);
- p[7] = _mm_add_epi16(s[7], t[4]);
-
- // stage 4
- u[0] = _mm_unpacklo_epi16(p[1], p[6]);
- u[1] = _mm_unpackhi_epi16(p[1], p[6]);
- u[2] = _mm_unpacklo_epi16(p[2], p[5]);
- u[3] = _mm_unpackhi_epi16(p[2], p[5]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
- v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
- v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
- v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
- v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
-
- t[1] = _mm_packs_epi32(v[0], v[1]);
- t[2] = _mm_packs_epi32(v[2], v[3]);
- t[5] = _mm_packs_epi32(v[4], v[5]);
- t[6] = _mm_packs_epi32(v[6], v[7]);
-
- // stage 5
- s[0] = _mm_add_epi16(p[0], t[1]);
- s[1] = _mm_sub_epi16(p[0], t[1]);
- s[2] = _mm_sub_epi16(p[3], t[2]);
- s[3] = _mm_add_epi16(p[3], t[2]);
- s[4] = _mm_add_epi16(p[4], t[5]);
- s[5] = _mm_sub_epi16(p[4], t[5]);
- s[6] = _mm_sub_epi16(p[7], t[6]);
- s[7] = _mm_add_epi16(p[7], t[6]);
-
- // stage 6
- u[0] = _mm_unpacklo_epi16(s[0], s[7]);
- u[1] = _mm_unpackhi_epi16(s[0], s[7]);
- u[2] = _mm_unpacklo_epi16(s[1], s[6]);
- u[3] = _mm_unpackhi_epi16(s[1], s[6]);
- u[4] = _mm_unpacklo_epi16(s[2], s[5]);
- u[5] = _mm_unpackhi_epi16(s[2], s[5]);
- u[6] = _mm_unpacklo_epi16(s[3], s[4]);
- u[7] = _mm_unpackhi_epi16(s[3], s[4]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02);
- v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18);
- v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18);
- v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10);
- v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10);
- v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26);
- v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26);
- v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06);
- v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06);
- v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22);
- v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22);
- v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14);
- v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14);
- v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30);
- v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- in[1] = _mm_packs_epi32(v[0], v[1]);
- in[9] = _mm_packs_epi32(v[2], v[3]);
- in[5] = _mm_packs_epi32(v[4], v[5]);
- in[13] = _mm_packs_epi32(v[6], v[7]);
- in[3] = _mm_packs_epi32(v[8], v[9]);
- in[11] = _mm_packs_epi32(v[10], v[11]);
- in[7] = _mm_packs_epi32(v[12], v[13]);
- in[15] = _mm_packs_epi32(v[14], v[15]);
-}
-
-void fadst16_8col_avx2(__m128i *in) {
- // perform 16x16 1-D ADST for 8 columns
- __m128i s[16], x[16], u[32], v[32];
- const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64);
- const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64);
- const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64);
- const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64);
- const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64);
- const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64);
- const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64);
- const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64);
- const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64);
- const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64);
- const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64);
- const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64);
- const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64);
- const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64);
- const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64);
- const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64);
- const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64);
- const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64);
- const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64);
- const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64);
- const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64);
- const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64);
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kZero = _mm_set1_epi16(0);
-
- u[0] = _mm_unpacklo_epi16(in[15], in[0]);
- u[1] = _mm_unpackhi_epi16(in[15], in[0]);
- u[2] = _mm_unpacklo_epi16(in[13], in[2]);
- u[3] = _mm_unpackhi_epi16(in[13], in[2]);
- u[4] = _mm_unpacklo_epi16(in[11], in[4]);
- u[5] = _mm_unpackhi_epi16(in[11], in[4]);
- u[6] = _mm_unpacklo_epi16(in[9], in[6]);
- u[7] = _mm_unpackhi_epi16(in[9], in[6]);
- u[8] = _mm_unpacklo_epi16(in[7], in[8]);
- u[9] = _mm_unpackhi_epi16(in[7], in[8]);
- u[10] = _mm_unpacklo_epi16(in[5], in[10]);
- u[11] = _mm_unpackhi_epi16(in[5], in[10]);
- u[12] = _mm_unpacklo_epi16(in[3], in[12]);
- u[13] = _mm_unpackhi_epi16(in[3], in[12]);
- u[14] = _mm_unpacklo_epi16(in[1], in[14]);
- u[15] = _mm_unpackhi_epi16(in[1], in[14]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09);
- v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19);
- v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13);
- v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15);
- v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15);
- v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17);
- v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17);
- v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11);
- v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11);
- v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21);
- v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21);
- v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07);
- v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07);
- v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25);
- v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25);
- v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03);
- v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03);
- v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29);
- v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29);
-
- u[0] = _mm_add_epi32(v[0], v[16]);
- u[1] = _mm_add_epi32(v[1], v[17]);
- u[2] = _mm_add_epi32(v[2], v[18]);
- u[3] = _mm_add_epi32(v[3], v[19]);
- u[4] = _mm_add_epi32(v[4], v[20]);
- u[5] = _mm_add_epi32(v[5], v[21]);
- u[6] = _mm_add_epi32(v[6], v[22]);
- u[7] = _mm_add_epi32(v[7], v[23]);
- u[8] = _mm_add_epi32(v[8], v[24]);
- u[9] = _mm_add_epi32(v[9], v[25]);
- u[10] = _mm_add_epi32(v[10], v[26]);
- u[11] = _mm_add_epi32(v[11], v[27]);
- u[12] = _mm_add_epi32(v[12], v[28]);
- u[13] = _mm_add_epi32(v[13], v[29]);
- u[14] = _mm_add_epi32(v[14], v[30]);
- u[15] = _mm_add_epi32(v[15], v[31]);
- u[16] = _mm_sub_epi32(v[0], v[16]);
- u[17] = _mm_sub_epi32(v[1], v[17]);
- u[18] = _mm_sub_epi32(v[2], v[18]);
- u[19] = _mm_sub_epi32(v[3], v[19]);
- u[20] = _mm_sub_epi32(v[4], v[20]);
- u[21] = _mm_sub_epi32(v[5], v[21]);
- u[22] = _mm_sub_epi32(v[6], v[22]);
- u[23] = _mm_sub_epi32(v[7], v[23]);
- u[24] = _mm_sub_epi32(v[8], v[24]);
- u[25] = _mm_sub_epi32(v[9], v[25]);
- u[26] = _mm_sub_epi32(v[10], v[26]);
- u[27] = _mm_sub_epi32(v[11], v[27]);
- u[28] = _mm_sub_epi32(v[12], v[28]);
- u[29] = _mm_sub_epi32(v[13], v[29]);
- u[30] = _mm_sub_epi32(v[14], v[30]);
- u[31] = _mm_sub_epi32(v[15], v[31]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
- v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
- v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
- v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
- v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING);
- v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING);
- v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING);
- v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING);
- v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING);
- v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING);
- v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING);
- v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING);
- v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING);
- v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING);
- v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING);
- v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
- v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
- u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
- u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
- u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
- u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
- u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
- u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
- u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
- u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
- u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS);
- u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS);
- u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS);
- u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS);
- u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS);
- u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS);
- u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
- u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
-
- s[0] = _mm_packs_epi32(u[0], u[1]);
- s[1] = _mm_packs_epi32(u[2], u[3]);
- s[2] = _mm_packs_epi32(u[4], u[5]);
- s[3] = _mm_packs_epi32(u[6], u[7]);
- s[4] = _mm_packs_epi32(u[8], u[9]);
- s[5] = _mm_packs_epi32(u[10], u[11]);
- s[6] = _mm_packs_epi32(u[12], u[13]);
- s[7] = _mm_packs_epi32(u[14], u[15]);
- s[8] = _mm_packs_epi32(u[16], u[17]);
- s[9] = _mm_packs_epi32(u[18], u[19]);
- s[10] = _mm_packs_epi32(u[20], u[21]);
- s[11] = _mm_packs_epi32(u[22], u[23]);
- s[12] = _mm_packs_epi32(u[24], u[25]);
- s[13] = _mm_packs_epi32(u[26], u[27]);
- s[14] = _mm_packs_epi32(u[28], u[29]);
- s[15] = _mm_packs_epi32(u[30], u[31]);
-
- // stage 2
- u[0] = _mm_unpacklo_epi16(s[8], s[9]);
- u[1] = _mm_unpackhi_epi16(s[8], s[9]);
- u[2] = _mm_unpacklo_epi16(s[10], s[11]);
- u[3] = _mm_unpackhi_epi16(s[10], s[11]);
- u[4] = _mm_unpacklo_epi16(s[12], s[13]);
- u[5] = _mm_unpackhi_epi16(s[12], s[13]);
- u[6] = _mm_unpacklo_epi16(s[14], s[15]);
- u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20);
- v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04);
- v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12);
-
- u[0] = _mm_add_epi32(v[0], v[8]);
- u[1] = _mm_add_epi32(v[1], v[9]);
- u[2] = _mm_add_epi32(v[2], v[10]);
- u[3] = _mm_add_epi32(v[3], v[11]);
- u[4] = _mm_add_epi32(v[4], v[12]);
- u[5] = _mm_add_epi32(v[5], v[13]);
- u[6] = _mm_add_epi32(v[6], v[14]);
- u[7] = _mm_add_epi32(v[7], v[15]);
- u[8] = _mm_sub_epi32(v[0], v[8]);
- u[9] = _mm_sub_epi32(v[1], v[9]);
- u[10] = _mm_sub_epi32(v[2], v[10]);
- u[11] = _mm_sub_epi32(v[3], v[11]);
- u[12] = _mm_sub_epi32(v[4], v[12]);
- u[13] = _mm_sub_epi32(v[5], v[13]);
- u[14] = _mm_sub_epi32(v[6], v[14]);
- u[15] = _mm_sub_epi32(v[7], v[15]);
-
- v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
- u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
- u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
- u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
- u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
- u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
- u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
- u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
- u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
- u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
- u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
- u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
- u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
- u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
- u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
- u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
-
- x[0] = _mm_add_epi16(s[0], s[4]);
- x[1] = _mm_add_epi16(s[1], s[5]);
- x[2] = _mm_add_epi16(s[2], s[6]);
- x[3] = _mm_add_epi16(s[3], s[7]);
- x[4] = _mm_sub_epi16(s[0], s[4]);
- x[5] = _mm_sub_epi16(s[1], s[5]);
- x[6] = _mm_sub_epi16(s[2], s[6]);
- x[7] = _mm_sub_epi16(s[3], s[7]);
- x[8] = _mm_packs_epi32(u[0], u[1]);
- x[9] = _mm_packs_epi32(u[2], u[3]);
- x[10] = _mm_packs_epi32(u[4], u[5]);
- x[11] = _mm_packs_epi32(u[6], u[7]);
- x[12] = _mm_packs_epi32(u[8], u[9]);
- x[13] = _mm_packs_epi32(u[10], u[11]);
- x[14] = _mm_packs_epi32(u[12], u[13]);
- x[15] = _mm_packs_epi32(u[14], u[15]);
-
- // stage 3
- u[0] = _mm_unpacklo_epi16(x[4], x[5]);
- u[1] = _mm_unpackhi_epi16(x[4], x[5]);
- u[2] = _mm_unpacklo_epi16(x[6], x[7]);
- u[3] = _mm_unpackhi_epi16(x[6], x[7]);
- u[4] = _mm_unpacklo_epi16(x[12], x[13]);
- u[5] = _mm_unpackhi_epi16(x[12], x[13]);
- u[6] = _mm_unpacklo_epi16(x[14], x[15]);
- u[7] = _mm_unpackhi_epi16(x[14], x[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24);
- v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
- v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08);
- v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08);
- v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24);
- v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24);
- v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08);
- v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24);
-
- u[0] = _mm_add_epi32(v[0], v[4]);
- u[1] = _mm_add_epi32(v[1], v[5]);
- u[2] = _mm_add_epi32(v[2], v[6]);
- u[3] = _mm_add_epi32(v[3], v[7]);
- u[4] = _mm_sub_epi32(v[0], v[4]);
- u[5] = _mm_sub_epi32(v[1], v[5]);
- u[6] = _mm_sub_epi32(v[2], v[6]);
- u[7] = _mm_sub_epi32(v[3], v[7]);
- u[8] = _mm_add_epi32(v[8], v[12]);
- u[9] = _mm_add_epi32(v[9], v[13]);
- u[10] = _mm_add_epi32(v[10], v[14]);
- u[11] = _mm_add_epi32(v[11], v[15]);
- u[12] = _mm_sub_epi32(v[8], v[12]);
- u[13] = _mm_sub_epi32(v[9], v[13]);
- u[14] = _mm_sub_epi32(v[10], v[14]);
- u[15] = _mm_sub_epi32(v[11], v[15]);
-
- u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- s[0] = _mm_add_epi16(x[0], x[2]);
- s[1] = _mm_add_epi16(x[1], x[3]);
- s[2] = _mm_sub_epi16(x[0], x[2]);
- s[3] = _mm_sub_epi16(x[1], x[3]);
- s[4] = _mm_packs_epi32(v[0], v[1]);
- s[5] = _mm_packs_epi32(v[2], v[3]);
- s[6] = _mm_packs_epi32(v[4], v[5]);
- s[7] = _mm_packs_epi32(v[6], v[7]);
- s[8] = _mm_add_epi16(x[8], x[10]);
- s[9] = _mm_add_epi16(x[9], x[11]);
- s[10] = _mm_sub_epi16(x[8], x[10]);
- s[11] = _mm_sub_epi16(x[9], x[11]);
- s[12] = _mm_packs_epi32(v[8], v[9]);
- s[13] = _mm_packs_epi32(v[10], v[11]);
- s[14] = _mm_packs_epi32(v[12], v[13]);
- s[15] = _mm_packs_epi32(v[14], v[15]);
-
- // stage 4
- u[0] = _mm_unpacklo_epi16(s[2], s[3]);
- u[1] = _mm_unpackhi_epi16(s[2], s[3]);
- u[2] = _mm_unpacklo_epi16(s[6], s[7]);
- u[3] = _mm_unpackhi_epi16(s[6], s[7]);
- u[4] = _mm_unpacklo_epi16(s[10], s[11]);
- u[5] = _mm_unpackhi_epi16(s[10], s[11]);
- u[6] = _mm_unpacklo_epi16(s[14], s[15]);
- u[7] = _mm_unpackhi_epi16(s[14], s[15]);
-
- v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16);
- v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16);
- v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
- v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16);
- v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16);
- v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16);
- v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16);
- v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16);
- v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16);
- v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16);
- v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16);
- v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16);
- v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16);
- v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16);
- v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16);
- v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16);
-
- u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING);
- u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING);
- u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING);
- u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING);
- u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING);
- u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING);
- u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING);
- u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING);
- u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
- u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
- u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
- u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
- u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
- u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
- u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
- u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
-
- v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS);
- v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS);
- v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
- v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
- v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS);
- v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS);
- v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS);
- v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS);
- v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS);
- v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS);
- v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS);
- v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS);
- v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS);
- v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS);
- v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
- v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
-
- in[0] = s[0];
- in[1] = _mm_sub_epi16(kZero, s[8]);
- in[2] = s[12];
- in[3] = _mm_sub_epi16(kZero, s[4]);
- in[4] = _mm_packs_epi32(v[4], v[5]);
- in[5] = _mm_packs_epi32(v[12], v[13]);
- in[6] = _mm_packs_epi32(v[8], v[9]);
- in[7] = _mm_packs_epi32(v[0], v[1]);
- in[8] = _mm_packs_epi32(v[2], v[3]);
- in[9] = _mm_packs_epi32(v[10], v[11]);
- in[10] = _mm_packs_epi32(v[14], v[15]);
- in[11] = _mm_packs_epi32(v[6], v[7]);
- in[12] = s[5];
- in[13] = _mm_sub_epi16(kZero, s[13]);
- in[14] = s[9];
- in[15] = _mm_sub_epi16(kZero, s[1]);
-}
-
-void fdct16_avx2(__m128i *in0, __m128i *in1) {
- fdct16_8col_avx2(in0);
- fdct16_8col_avx2(in1);
- array_transpose_16x16_avx2(in0, in1);
-}
-
-void fadst16_avx2(__m128i *in0, __m128i *in1) {
- fadst16_8col_avx2(in0);
- fadst16_8col_avx2(in1);
- array_transpose_16x16_avx2(in0, in1);
-}
-
-void vp9_fht16x16_avx2(const int16_t *input, int16_t *output,
- int stride, int tx_type) {
- __m128i in0[16], in1[16];
-
- switch (tx_type) {
- case DCT_DCT:
- vp9_fdct16x16_avx2(input, output, stride);
- break;
- case ADST_DCT:
- load_buffer_16x16_avx2(input, in0, in1, stride);
- fadst16_avx2(in0, in1);
- right_shift_16x16_avx2(in0, in1);
- fdct16_avx2(in0, in1);
- write_buffer_16x16_avx2(output, in0, in1, 16);
- break;
- case DCT_ADST:
- load_buffer_16x16_avx2(input, in0, in1, stride);
- fdct16_avx2(in0, in1);
- right_shift_16x16_avx2(in0, in1);
- fadst16_avx2(in0, in1);
- write_buffer_16x16_avx2(output, in0, in1, 16);
- break;
- case ADST_ADST:
- load_buffer_16x16_avx2(input, in0, in1, stride);
- fadst16_avx2(in0, in1);
- right_shift_16x16_avx2(in0, in1);
- fadst16_avx2(in0, in1);
- write_buffer_16x16_avx2(output, in0, in1, 16);
- break;
- default:
- assert(0);
- break;
- }
-}
#define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2
#define FDCT32x32_HIGH_PRECISION 0
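The hunk above drops the hand-written AVX2 16x16 forward transforms from vp9_dct_avx2.c (fdct16_8col_avx2, fadst16_8col_avx2 and the vp9_fht16x16_avx2 dispatcher). Each 2-D transform there is two 1-D passes over 8-column groups with a 16x16 transpose in between, and the repeated _mm_add_epi32(..., k__DCT_CONST_ROUNDING) / _mm_srai_epi32(..., DCT_CONST_BITS) pairs are rounded fixed-point multiplies, (x * cospi + 2^(DCT_CONST_BITS-1)) >> DCT_CONST_BITS. array_transpose_16x16_avx2() builds the transpose out of four 8x8 tile transposes, swapping the off-diagonal tiles through tbuf[]. A minimal scalar sketch of that tile-transpose idea (names here are illustrative, not libvpx API):

#include <stdint.h>
#include <string.h>

/* Transpose one 8x8 tile from src into dst. */
static void transpose_8x8(const int16_t *src, int src_stride,
                          int16_t *dst, int dst_stride) {
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      dst[c * dst_stride + r] = src[r * src_stride + c];
}

/* A 16x16 transpose as four 8x8 tile transposes: the diagonal tiles
 * transpose in place, the off-diagonal tiles transpose and swap -- the
 * role played by tbuf[] in the removed array_transpose_16x16_avx2(). */
static void transpose_16x16(int16_t m[16][16]) {
  int16_t t[16][16];
  transpose_8x8(&m[0][0], 16, &t[0][0], 16);  /* top-left stays */
  transpose_8x8(&m[0][8], 16, &t[8][0], 16);  /* top-right -> bottom-left */
  transpose_8x8(&m[8][0], 16, &t[0][8], 16);  /* bottom-left -> top-right */
  transpose_8x8(&m[8][8], 16, &t[8][8], 16);  /* bottom-right stays */
  memcpy(m, t, sizeof(t));
}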
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index b3dc0b11a..af32eb3e3 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -305,6 +305,16 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_OK;
}
+static int get_image_bps(const vpx_image_t *img) {
+ switch (img->fmt) {
+ case VPX_IMG_FMT_YV12:
+ case VPX_IMG_FMT_I420: return 12;
+ case VPX_IMG_FMT_I422: return 16;
+ case VPX_IMG_FMT_I444: return 24;
+ default: assert(0 && "Invalid image format");
+ }
+ return 0;
+}
static vpx_codec_err_t set_encoder_config(
VP9EncoderConfig *oxcf,
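The values returned by get_image_bps() follow directly from each format's chroma subsampling, assuming 8-bit samples:

/* Worked check of get_image_bps():
 *   I420/YV12 (4:2:0): 8 + 8/4 + 8/4 = 12 bits per pixel
 *   I422      (4:2:2): 8 + 8/2 + 8/2 = 16 bits per pixel
 *   I444      (4:4:4): 8 + 8   + 8   = 24 bits per pixel
 * so one raw frame occupies w * h * bps / 8 bytes. */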
@@ -672,16 +682,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx,
priv->extra_cfg = extracfg_map[i].cfg;
priv->extra_cfg.pkt_list = &priv->pkt_list.head;
- // Maximum buffer size approximated based on having multiple ARF.
- priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 8;
-
- if (priv->cx_data_sz < 4096)
- priv->cx_data_sz = 4096;
-
- priv->cx_data = (unsigned char *)malloc(priv->cx_data_sz);
- if (priv->cx_data == NULL)
- return VPX_CODEC_MEM_ERROR;
-
vp9_initialize_enc();
res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
@@ -806,8 +806,24 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx,
unsigned long deadline) {
vpx_codec_err_t res = VPX_CODEC_OK;
- if (img)
+ if (img != NULL) {
res = validate_img(ctx, img);
+ // TODO(jzern): the checks related to cpi's validity should be treated as a
+ // failure condition; encoder setup is currently done fully in init().
+ if (res == VPX_CODEC_OK && ctx->cpi != NULL && ctx->cx_data == NULL) {
+ // There's no codec control for multiple alt-refs, so check the encoder
+ // instance for its status to determine the compressed data size.
+ ctx->cx_data_sz = ctx->cfg.g_w * ctx->cfg.g_h *
+ get_image_bps(img) / 8 *
+ (ctx->cpi->multi_arf_allowed ? 8 : 2);
+ if (ctx->cx_data_sz < 4096) ctx->cx_data_sz = 4096;
+
+ ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz);
+ if (ctx->cx_data == NULL) {
+ return VPX_CODEC_MEM_ERROR;
+ }
+ }
+ }
pick_quickcompress_mode(ctx, duration, deadline);
vpx_codec_pkt_list_init(&ctx->pkt_list);
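To make the lazy allocation above concrete, a worked sizing example with a hypothetical 1920x1080 I420 input (the resolution is illustrative, not part of this change):

/* cx_data_sz for a hypothetical 1920x1080 I420 stream:
 *   raw frame = 1920 * 1080 * 12 / 8 = 3110400 bytes
 *   multi-ARF = 3110400 * 8 = 24883200 bytes (~23.7 MiB)
 *   otherwise = 3110400 * 2 =  6220800 bytes (~5.9 MiB)
 * Both results are far above the 4096-byte floor, which only matters for
 * very small frames. */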
@@ -1301,7 +1317,6 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = {
#endif
}
},
- { -1, {NOT_IMPLEMENTED}}
};
#ifndef VERSION_STRING
@@ -1324,6 +1339,7 @@ CODEC_INTERFACE(vpx_codec_vp9_cx) = {
NOT_IMPLEMENTED // vpx_codec_set_fb_fn_t
},
{ // NOLINT
+ 1, // 1 cfg map
encoder_usage_cfg_map, // vpx_codec_enc_cfg_map_t
encoder_encode, // vpx_codec_encode_fn_t
encoder_get_cxdata, // vpx_codec_get_cx_data_fn_t
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index ef9d10d65..c52884084 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -707,6 +707,7 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = {
decoder_set_fb_fn, // vpx_codec_set_fb_fn_t
},
{ // NOLINT
+ 0,
NOT_IMPLEMENTED, // vpx_codec_enc_cfg_map_t
NOT_IMPLEMENTED, // vpx_codec_encode_fn_t
NOT_IMPLEMENTED, // vpx_codec_get_cx_data_fn_t
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 3381cb95a..dc46c4e35 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -130,5 +130,9 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h
index 82d2bc3c0..cdda3406b 100644
--- a/vpx/internal/vpx_codec_internal.h
+++ b/vpx/internal/vpx_codec_internal.h
@@ -340,6 +340,7 @@ struct vpx_codec_iface {
vpx_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::vpx_codec_set_fb_fn_t */
} dec;
struct vpx_codec_enc_iface {
+ int cfg_map_count;
vpx_codec_enc_cfg_map_t *cfg_maps; /**< \copydoc ::vpx_codec_enc_cfg_map_t */
vpx_codec_encode_fn_t encode; /**< \copydoc ::vpx_codec_encode_fn_t */
vpx_codec_get_cx_data_fn_t get_cx_data; /**< \copydoc ::vpx_codec_get_cx_data_fn_t */
diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c
index ece2d0b64..db019957e 100644
--- a/vpx/src/vpx_encoder.c
+++ b/vpx/src/vpx_encoder.c
@@ -162,6 +162,7 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface,
unsigned int usage) {
vpx_codec_err_t res;
vpx_codec_enc_cfg_map_t *map;
+ int i;
if (!iface || !cfg || usage > INT_MAX)
res = VPX_CODEC_INVALID_PARAM;
@@ -170,7 +171,8 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface,
else {
res = VPX_CODEC_INVALID_PARAM;
- for (map = iface->enc.cfg_maps; map->usage >= 0; map++) {
+ for (i = 0; i < iface->enc.cfg_map_count; ++i) {
+ map = iface->enc.cfg_maps + i;
if (map->usage == (int)usage) {
*cfg = map->cfg;
cfg->g_usage = usage;
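Together with the cfg_map_count field added to vpx_codec_internal.h above, this trades the { -1, {NOT_IMPLEMENTED}} sentinel walk for an explicitly bounded loop. A sketch of the lookup as it now behaves (find_cfg is an illustrative name, not a libvpx function):

static const vpx_codec_enc_cfg_map_t *find_cfg(const vpx_codec_iface_t *iface,
                                               unsigned int usage) {
  int i;
  for (i = 0; i < iface->enc.cfg_map_count; ++i) {
    if (iface->enc.cfg_maps[i].usage == (int)usage)
      return &iface->enc.cfg_maps[i];
  }
  return NULL;  /* caller reports VPX_CODEC_INVALID_PARAM */
}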
diff --git a/vpx_ports/vpx_once.h b/vpx_ports/vpx_once.h
index 182892acf..bd9eebd64 100644
--- a/vpx_ports/vpx_once.h
+++ b/vpx_ports/vpx_once.h
@@ -73,6 +73,33 @@ static void once(void (*func)(void))
}
+#elif CONFIG_MULTITHREAD && defined(__OS2__)
+#define INCL_DOS
+#include <os2.h>
+static void once(void (*func)(void))
+{
+ static int done;
+
+ /* If the initialization is complete, return early. */
+ if(done)
+ return;
+
+ /* Causes all other threads in the process to block themselves
+ * and give up their time slice.
+ */
+ DosEnterCritSec();
+
+ if (!done)
+ {
+ func();
+ done = 1;
+ }
+
+ /* Restores normal thread dispatching for the current process. */
+ DosExitCritSec();
+}
+
+
#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
#include <pthread.h>
static void once(void (*func)(void))
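The new OS/2 branch keeps the double-checked shape the other once() implementations in this header use: an unguarded read of done for the fast path, then a re-check inside DosEnterCritSec()/DosExitCritSec() so func() runs exactly once even when threads race. A typical call site looks like this sketch (setup_rtcd is an illustrative name, not an actual libvpx symbol):

static void setup_rtcd(void) {
  /* one-time work, e.g. filling in RTCD function-pointer tables */
}

void codec_entry_point(void) {
  once(setup_rtcd);  /* safe to call from any number of threads */
  /* initialized state may be used from here on */
}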
diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
deleted file mode 100644
index 696f47a7b..000000000
--- a/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm
+++ /dev/null
@@ -1,233 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_yv12_copy_frame_func_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- INCLUDE vpx_scale_asm_offsets.asm
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-
-;void vp8_yv12_copy_frame_func_neon(const YV12_BUFFER_CONFIG *src_ybc,
-; YV12_BUFFER_CONFIG *dst_ybc);
-
-|vp8_yv12_copy_frame_func_neon| PROC
- push {r4 - r11, lr}
- vpush {d8 - d15}
-
- sub sp, sp, #16
-
- ;Copy Y plane
- ldr r8, [r0, #yv12_buffer_config_u_buffer] ;src u ptr
- ldr r9, [r1, #yv12_buffer_config_u_buffer] ;dst u ptr
- ldr r10, [r0, #yv12_buffer_config_v_buffer] ;src v ptr
- ldr r11, [r1, #yv12_buffer_config_v_buffer] ;dst v ptr
-
- ldr r4, [r0, #yv12_buffer_config_y_height]
- ldr r5, [r0, #yv12_buffer_config_y_width]
- ldr r6, [r0, #yv12_buffer_config_y_stride]
- ldr r7, [r1, #yv12_buffer_config_y_stride]
- ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
- ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1
-
- str r8, [sp]
- str r9, [sp, #4]
- str r10, [sp, #8]
- str r11, [sp, #12]
-
- ; copy two rows at one time
- mov lr, r4, lsr #1
-
-cp_src_to_dst_height_loop
- mov r8, r2
- mov r9, r3
- add r10, r2, r6
- add r11, r3, r7
- movs r12, r5, lsr #7
- ble extra_cp_needed ; y_width < 128
-
-cp_src_to_dst_width_loop
- vld1.8 {q0, q1}, [r8]!
- vld1.8 {q8, q9}, [r10]!
- vld1.8 {q2, q3}, [r8]!
- vld1.8 {q10, q11}, [r10]!
- vld1.8 {q4, q5}, [r8]!
- vld1.8 {q12, q13}, [r10]!
- vld1.8 {q6, q7}, [r8]!
- vld1.8 {q14, q15}, [r10]!
-
- subs r12, r12, #1
-
- vst1.8 {q0, q1}, [r9]!
- vst1.8 {q8, q9}, [r11]!
- vst1.8 {q2, q3}, [r9]!
- vst1.8 {q10, q11}, [r11]!
- vst1.8 {q4, q5}, [r9]!
- vst1.8 {q12, q13}, [r11]!
- vst1.8 {q6, q7}, [r9]!
- vst1.8 {q14, q15}, [r11]!
-
- bne cp_src_to_dst_width_loop
-
- subs lr, lr, #1
- add r2, r2, r6, lsl #1
- add r3, r3, r7, lsl #1
-
- bne cp_src_to_dst_height_loop
-
-extra_cp_needed
- ands r10, r5, #0x7f ;check to see if extra copy is needed
- sub r11, r5, r10
- ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
- ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1
- bne extra_cp_src_to_dst_width
-end_of_cp_src_to_dst
-
-;Copy U & V planes
- ldr r2, [sp] ;srcptr1
- ldr r3, [sp, #4] ;dstptr1
- mov r4, r4, lsr #1 ;src uv_height
- mov r5, r5, lsr #1 ;src uv_width
- mov r6, r6, lsr #1 ;src uv_stride
- mov r7, r7, lsr #1 ;dst uv_stride
-
- mov r1, #2
-
-cp_uv_loop
-
- ;copy two rows at one time
- mov lr, r4, lsr #1
-
-cp_src_to_dst_height_uv_loop
- mov r8, r2
- mov r9, r3
- add r10, r2, r6
- add r11, r3, r7
- movs r12, r5, lsr #6
- ble extra_uv_cp_needed
-
-cp_src_to_dst_width_uv_loop
- vld1.8 {q0, q1}, [r8]!
- vld1.8 {q8, q9}, [r10]!
- vld1.8 {q2, q3}, [r8]!
- vld1.8 {q10, q11}, [r10]!
-
- subs r12, r12, #1
-
- vst1.8 {q0, q1}, [r9]!
- vst1.8 {q8, q9}, [r11]!
- vst1.8 {q2, q3}, [r9]!
- vst1.8 {q10, q11}, [r11]!
-
- bne cp_src_to_dst_width_uv_loop
-
- subs lr, lr, #1
- add r2, r2, r6, lsl #1
- add r3, r3, r7, lsl #1
-
- bne cp_src_to_dst_height_uv_loop
-
-extra_uv_cp_needed
- ands r10, r5, #0x3f ;check to see if extra copy is needed
- sub r11, r5, r10
- ldr r2, [sp] ;srcptr1
- ldr r3, [sp, #4] ;dstptr1
- bne extra_cp_src_to_dst_uv_width
-end_of_cp_src_to_dst_uv
-
- subs r1, r1, #1
-
- addne sp, sp, #8
-
- ldrne r2, [sp] ;srcptr1
- ldrne r3, [sp, #4] ;dstptr1
-
- bne cp_uv_loop
-
- add sp, sp, #8
-
- vpop {d8 - d15}
- pop {r4 - r11, pc}
-
-;=============================
-extra_cp_src_to_dst_width
- add r2, r2, r11
- add r3, r3, r11
- add r0, r8, r6
- add r11, r9, r7
-
- mov lr, r4, lsr #1
-extra_cp_src_to_dst_height_loop
- mov r8, r2
- mov r9, r3
- add r0, r8, r6
- add r11, r9, r7
-
- mov r12, r10
-
-extra_cp_src_to_dst_width_loop
- vld1.8 {q0}, [r8]!
- vld1.8 {q1}, [r0]!
-
- subs r12, r12, #16
-
- vst1.8 {q0}, [r9]!
- vst1.8 {q1}, [r11]!
- bne extra_cp_src_to_dst_width_loop
-
- subs lr, lr, #1
-
- add r2, r2, r6, lsl #1
- add r3, r3, r7, lsl #1
-
- bne extra_cp_src_to_dst_height_loop
-
- b end_of_cp_src_to_dst
-
-;=================================
-extra_cp_src_to_dst_uv_width
- add r2, r2, r11
- add r3, r3, r11
- add r0, r8, r6
- add r11, r9, r7
-
- mov lr, r4, lsr #1
-extra_cp_src_to_dst_height_uv_loop
- mov r8, r2
- mov r9, r3
- add r0, r8, r6
- add r11, r9, r7
-
- mov r12, r10
-
-extra_cp_src_to_dst_width_uv_loop
- vld1.8 {d0}, [r8]!
- vld1.8 {d1}, [r0]!
-
- subs r12, r12, #8
-
- vst1.8 {d0}, [r9]!
- vst1.8 {d1}, [r11]!
- bne extra_cp_src_to_dst_width_uv_loop
-
- subs lr, lr, #1
-
- add r2, r2, r6, lsl #1
- add r3, r3, r7, lsl #1
-
- bne extra_cp_src_to_dst_height_uv_loop
-
- b end_of_cp_src_to_dst_uv
-
- ENDP
- END
diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm
deleted file mode 100644
index d3306b669..000000000
--- a/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm
+++ /dev/null
@@ -1,259 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- EXPORT |vp8_yv12_copy_src_frame_func_neon|
- ARM
- REQUIRE8
- PRESERVE8
-
- INCLUDE vpx_scale_asm_offsets.asm
-
- AREA ||.text||, CODE, READONLY, ALIGN=2
-;Note: This function is used to copy source data in src_buffer[i] at the
-;beginning of encoding. The buffer has a width and height of cpi->oxcf.Width
-;and cpi->oxcf.Height, which can be ANY numbers (NOT always multiples of 16 or 4).
-
-;void vp8_yv12_copy_src_frame_func_neon(const YV12_BUFFER_CONFIG *src_ybc,
-; YV12_BUFFER_CONFIG *dst_ybc);
-
-|vp8_yv12_copy_src_frame_func_neon| PROC
- push {r4 - r11, lr}
- vpush {d8 - d15}
-
- ;Copy Y plane
- ldr r4, [r0, #yv12_buffer_config_y_height]
- ldr r5, [r0, #yv12_buffer_config_y_width]
- ldr r6, [r0, #yv12_buffer_config_y_stride]
- ldr r7, [r1, #yv12_buffer_config_y_stride]
- ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1
- ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1
-
- add r10, r2, r6 ;second row src
- add r11, r3, r7 ;second row dst
- mov r6, r6, lsl #1
- mov r7, r7, lsl #1
- sub r6, r6, r5 ;adjust stride
- sub r7, r7, r5
-
- ; copy two rows at one time
- mov lr, r4, lsr #1
-
-cp_src_to_dst_height_loop
- mov r12, r5
-
-cp_width_128_loop
- vld1.8 {q0, q1}, [r2]!
- vld1.8 {q4, q5}, [r10]!
- vld1.8 {q2, q3}, [r2]!
- vld1.8 {q6, q7}, [r10]!
- vld1.8 {q8, q9}, [r2]!
- vld1.8 {q12, q13}, [r10]!
- vld1.8 {q10, q11}, [r2]!
- vld1.8 {q14, q15}, [r10]!
- sub r12, r12, #128
- cmp r12, #128
- vst1.8 {q0, q1}, [r3]!
- vst1.8 {q4, q5}, [r11]!
- vst1.8 {q2, q3}, [r3]!
- vst1.8 {q6, q7}, [r11]!
- vst1.8 {q8, q9}, [r3]!
- vst1.8 {q12, q13}, [r11]!
- vst1.8 {q10, q11}, [r3]!
- vst1.8 {q14, q15}, [r11]!
- bhs cp_width_128_loop
-
- cmp r12, #0
- beq cp_width_done
-
-cp_width_8_loop
- vld1.8 {d0}, [r2]!
- vld1.8 {d1}, [r10]!
- sub r12, r12, #8
- cmp r12, #8
- vst1.8 {d0}, [r3]!
- vst1.8 {d1}, [r11]!
- bhs cp_width_8_loop
-
- cmp r12, #0
- beq cp_width_done
-
-cp_width_1_loop
- ldrb r8, [r2], #1
- subs r12, r12, #1
- strb r8, [r3], #1
- ldrb r8, [r10], #1
- strb r8, [r11], #1
- bne cp_width_1_loop
-
-cp_width_done
- subs lr, lr, #1
- add r2, r2, r6
- add r3, r3, r7
- add r10, r10, r6
- add r11, r11, r7
- bne cp_src_to_dst_height_loop
-
-;copy last line for Y if y_height is odd
- tst r4, #1
- beq cp_width_done_1
- mov r12, r5
-
-cp_width_128_loop_1
- vld1.8 {q0, q1}, [r2]!
- vld1.8 {q2, q3}, [r2]!
- vld1.8 {q8, q9}, [r2]!
- vld1.8 {q10, q11}, [r2]!
- sub r12, r12, #128
- cmp r12, #128
- vst1.8 {q0, q1}, [r3]!
- vst1.8 {q2, q3}, [r3]!
- vst1.8 {q8, q9}, [r3]!
- vst1.8 {q10, q11}, [r3]!
- bhs cp_width_128_loop_1
-
- cmp r12, #0
- beq cp_width_done_1
-
-cp_width_8_loop_1
- vld1.8 {d0}, [r2]!
- sub r12, r12, #8
- cmp r12, #8
- vst1.8 {d0}, [r3]!
- bhs cp_width_8_loop_1
-
- cmp r12, #0
- beq cp_width_done_1
-
-cp_width_1_loop_1
- ldrb r8, [r2], #1
- subs r12, r12, #1
- strb r8, [r3], #1
- bne cp_width_1_loop_1
-cp_width_done_1
-
-;Copy U & V planes
- ldr r4, [r0, #yv12_buffer_config_uv_height]
- ldr r5, [r0, #yv12_buffer_config_uv_width]
- ldr r6, [r0, #yv12_buffer_config_uv_stride]
- ldr r7, [r1, #yv12_buffer_config_uv_stride]
- ldr r2, [r0, #yv12_buffer_config_u_buffer] ;srcptr1
- ldr r3, [r1, #yv12_buffer_config_u_buffer] ;dstptr1
-
- add r10, r2, r6 ;second row src
- add r11, r3, r7 ;second row dst
- mov r6, r6, lsl #1
- mov r7, r7, lsl #1
- sub r6, r6, r5 ;adjust stride
- sub r7, r7, r5
-
- mov r9, #2
-
-cp_uv_loop
- ;copy two rows at one time
- mov lr, r4, lsr #1
-
-cp_src_to_dst_height_uv_loop
- mov r12, r5
-
-cp_width_uv_64_loop
- vld1.8 {q0, q1}, [r2]!
- vld1.8 {q4, q5}, [r10]!
- vld1.8 {q2, q3}, [r2]!
- vld1.8 {q6, q7}, [r10]!
- sub r12, r12, #64
- cmp r12, #64
- vst1.8 {q0, q1}, [r3]!
- vst1.8 {q4, q5}, [r11]!
- vst1.8 {q2, q3}, [r3]!
- vst1.8 {q6, q7}, [r11]!
- bhs cp_width_uv_64_loop
-
- cmp r12, #0
- beq cp_width_uv_done
-
-cp_width_uv_8_loop
- vld1.8 {d0}, [r2]!
- vld1.8 {d1}, [r10]!
- sub r12, r12, #8
- cmp r12, #8
- vst1.8 {d0}, [r3]!
- vst1.8 {d1}, [r11]!
- bhs cp_width_uv_8_loop
-
- cmp r12, #0
- beq cp_width_uv_done
-
-cp_width_uv_1_loop
- ldrb r8, [r2], #1
- subs r12, r12, #1
- strb r8, [r3], #1
- ldrb r8, [r10], #1
- strb r8, [r11], #1
- bne cp_width_uv_1_loop
-
-cp_width_uv_done
- subs lr, lr, #1
- add r2, r2, r6
- add r3, r3, r7
- add r10, r10, r6
- add r11, r11, r7
- bne cp_src_to_dst_height_uv_loop
-
-;copy last line for U & V if uv_height is odd
- tst r4, #1
- beq cp_width_uv_done_1
- mov r12, r5
-
-cp_width_uv_64_loop_1
- vld1.8 {q0, q1}, [r2]!
- vld1.8 {q2, q3}, [r2]!
- sub r12, r12, #64
- cmp r12, #64
- vst1.8 {q0, q1}, [r3]!
- vst1.8 {q2, q3}, [r3]!
- bhs cp_width_uv_64_loop_1
-
- cmp r12, #0
- beq cp_width_uv_done_1
-
-cp_width_uv_8_loop_1
- vld1.8 {d0}, [r2]!
- sub r12, r12, #8
- cmp r12, #8
- vst1.8 {d0}, [r3]!
- bhs cp_width_uv_8_loop_1
-
- cmp r12, #0
- beq cp_width_uv_done_1
-
-cp_width_uv_1_loop_1
- ldrb r8, [r2], #1
- subs r12, r12, #1
- strb r8, [r3], #1
- bne cp_width_uv_1_loop_1
-cp_width_uv_done_1
-
- subs r9, r9, #1
- ldrne r2, [r0, #yv12_buffer_config_v_buffer] ;srcptr1
- ldrne r3, [r1, #yv12_buffer_config_v_buffer] ;dstptr1
- ldrne r10, [r0, #yv12_buffer_config_uv_stride]
- ldrne r11, [r1, #yv12_buffer_config_uv_stride]
-
- addne r10, r2, r10 ;second row src
- addne r11, r3, r11 ;second row dst
-
- bne cp_uv_loop
-
- vpop {d8 - d15}
- pop {r4 - r11, pc}
-
- ENDP
- END
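Functionally, both deleted routines are plane-by-plane row copies (plus, for vp8_yv12_copy_frame, a border extension handled separately). The generic C path that remains behaves like this sketch (copy_plane is an illustrative name, not the exact libvpx helper):

#include <stdint.h>
#include <string.h>

static void copy_plane(const uint8_t *src, int src_stride,
                       uint8_t *dst, int dst_stride, int w, int h) {
  int r;
  for (r = 0; r < h; ++r)
    memcpy(dst + r * dst_stride, src + r * src_stride, w);
}
/* Invoked once each for the Y, U and V planes with the plane's own width,
 * height and stride from the YV12_BUFFER_CONFIG. */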
diff --git a/vpx_scale/arm/neon/yv12extend_arm.c b/vpx_scale/arm/neon/yv12extend_arm.c
deleted file mode 100644
index d408eb311..000000000
--- a/vpx_scale/arm/neon/yv12extend_arm.c
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_scale_rtcd.h"
-
-extern void vp8_yv12_copy_frame_func_neon(
- const struct yv12_buffer_config *src_ybc,
- struct yv12_buffer_config *dst_ybc);
-
-void vp8_yv12_copy_frame_neon(const struct yv12_buffer_config *src_ybc,
- struct yv12_buffer_config *dst_ybc) {
- vp8_yv12_copy_frame_func_neon(src_ybc, dst_ybc);
- vp8_yv12_extend_frame_borders_c(dst_ybc);
-}
diff --git a/vpx_scale/vpx_scale.mk b/vpx_scale/vpx_scale.mk
index 1fa41afba..0a1594bd8 100644
--- a/vpx_scale/vpx_scale.mk
+++ b/vpx_scale/vpx_scale.mk
@@ -9,11 +9,6 @@ SCALE_SRCS-yes += vpx_scale_asm_offsets.c
SCALE_SRCS-yes += vpx_scale_rtcd.c
SCALE_SRCS-yes += vpx_scale_rtcd.pl
-#neon
-SCALE_SRCS-$(HAVE_NEON_ASM) += arm/neon/vp8_vpxyv12_copyframe_func_neon$(ASM)
-SCALE_SRCS-$(HAVE_NEON_ASM) += arm/neon/vp8_vpxyv12_copysrcframe_func_neon$(ASM)
-SCALE_SRCS-$(HAVE_NEON_ASM) += arm/neon/yv12extend_arm.c
-
#mips(dspr2)
SCALE_SRCS-$(HAVE_DSPR2) += mips/dspr2/yv12extend_dspr2.c
diff --git a/vpx_scale/vpx_scale_rtcd.pl b/vpx_scale/vpx_scale_rtcd.pl
index 5a7f973b2..d4a2b81a5 100644
--- a/vpx_scale/vpx_scale_rtcd.pl
+++ b/vpx_scale/vpx_scale_rtcd.pl
@@ -19,8 +19,6 @@ if (vpx_config("CONFIG_SPATIAL_RESAMPLING") eq "yes") {
add_proto qw/void vp8_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf";
add_proto qw/void vp8_yv12_copy_frame/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";
-specialize qw/vp8_yv12_copy_frame neon_asm/;
-$vp8_yv12_copy_frame_neon_asm=vp8_yv12_copy_frame_neon;
add_proto qw/void vpx_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc";