diff options
55 files changed, 1233 insertions, 3781 deletions
diff --git a/build/make/Makefile b/build/make/Makefile index 9efa0ec02..ed90397f0 100644 --- a/build/make/Makefile +++ b/build/make/Makefile @@ -118,20 +118,26 @@ testdata:: utiltest: # Add compiler flags for intrinsic files +ifeq ($(TOOLCHAIN), x86-os2-gcc) +STACKREALIGN=-mstackrealign +else +STACKREALIGN= +endif + $(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx $(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx -$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2 -$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2 -$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3 -$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3 -$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3 -$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3 -$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1 -$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1 -$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx -$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx -$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 -$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 +$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2 $(STACKREALIGN) +$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2 $(STACKREALIGN) +$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3 $(STACKREALIGN) +$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3 $(STACKREALIGN) +$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3 $(STACKREALIGN) +$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3 $(STACKREALIGN) +$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1 $(STACKREALIGN) +$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1 $(STACKREALIGN) +$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx $(STACKREALIGN) +$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx $(STACKREALIGN) +$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 $(STACKREALIGN) +$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 $(STACKREALIGN) $(BUILD_PFX)%.c.d: %.c $(if $(quiet),@echo " [DEP] $@") diff --git a/build/make/configure.sh b/build/make/configure.sh index d25f31333..ab6687f73 100755..100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -252,7 +252,7 @@ tolower(){ # source_path=${0%/*} enable_feature source_path_used -if test -z "$source_path" -o "$source_path" = "." ; then +if [ -z "$source_path" ] || [ "$source_path" = "." ]; then source_path="`pwd`" disable_feature source_path_used fi @@ -381,8 +381,8 @@ EOF # tests for -m$1 toggling the feature given in $2. If $2 is empty $1 is used. check_gcc_machine_option() { - local opt="$1" - local feature="$2" + opt="$1" + feature="$2" [ -n "$feature" ] || feature="$opt" if enabled gcc && ! disabled "$feature" && ! check_cflags "-m$opt"; then @@ -419,8 +419,8 @@ true } write_common_target_config_mk() { - local CC="${CC}" - local CXX="${CXX}" + saved_CC="${CC}" + saved_CXX="${CXX}" enabled ccache && CC="ccache ${CC}" enabled ccache && CXX="ccache ${CXX}" print_webm_license $1 "##" "" @@ -470,6 +470,8 @@ EOF enabled msvs && echo "CONFIG_VS_VERSION=${vs_version}" >> "${1}" + CC="${saved_CC}" + CXX="${saved_CXX}" } @@ -547,7 +549,8 @@ process_common_cmdline() { alt_libc="${optval}" ;; --as=*) - [ "${optval}" = yasm -o "${optval}" = nasm -o "${optval}" = auto ] \ + [ "${optval}" = yasm ] || [ "${optval}" = nasm ] \ + || [ "${optval}" = auto ] \ || die "Must be yasm, nasm or auto: ${optval}" alt_as="${optval}" ;; @@ -555,8 +558,8 @@ process_common_cmdline() { w="${optval%%x*}" h="${optval##*x}" VAR_LIST="DECODE_WIDTH_LIMIT ${w} DECODE_HEIGHT_LIMIT ${h}" - [ ${w} -gt 0 -a ${h} -gt 0 ] || die "Invalid size-limit: too small." - [ ${w} -lt 65536 -a ${h} -lt 65536 ] \ + [ ${w} -gt 0 ] && [ ${h} -gt 0 ] || die "Invalid size-limit: too small." + [ ${w} -lt 65536 ] && [ ${h} -lt 65536 ] \ || die "Invalid size-limit: too big." enable_feature size_limit ;; @@ -1150,7 +1153,7 @@ EOF auto|"") which nasm >/dev/null 2>&1 && AS=nasm which yasm >/dev/null 2>&1 && AS=yasm - [ "${AS}" = auto -o -z "${AS}" ] \ + [ "${AS}" = auto ] || [ -z "${AS}" ] \ && die "Neither yasm nor nasm have been found" ;; esac @@ -1314,8 +1317,9 @@ process_toolchain() { } print_config_mk() { - local prefix=$1 - local makefile=$2 + saved_prefix="${prefix}" + prefix=$1 + makefile=$2 shift 2 for cfg; do if enabled $cfg; then @@ -1323,11 +1327,13 @@ print_config_mk() { echo "${prefix}_${upname}=yes" >> $makefile fi done + prefix="${saved_prefix}" } print_config_h() { - local prefix=$1 - local header=$2 + saved_prefix="${prefix}" + prefix=$1 + header=$2 shift 2 for cfg; do upname="`toupper $cfg`" @@ -1337,10 +1343,11 @@ print_config_h() { echo "#define ${prefix}_${upname} 0" >> $header fi done + prefix="${saved_prefix}" } print_config_vars_h() { - local header=$1 + header=$1 shift while [ $# -gt 0 ]; do upname="`toupper $1`" @@ -1350,9 +1357,10 @@ print_config_vars_h() { } print_webm_license() { - local destination=$1 - local prefix="$2" - local suffix="$3" + saved_prefix="${prefix}" + destination=$1 + prefix="$2" + suffix="$3" shift 3 cat <<EOF > ${destination} ${prefix} Copyright (c) 2011 The WebM project authors. All Rights Reserved.${suffix} @@ -1363,6 +1371,7 @@ ${prefix} tree. An additional intellectual property rights grant can be found${s ${prefix} in the file PATENTS. All contributing project authors may${suffix} ${prefix} be found in the AUTHORS file in the root of the source tree.${suffix} EOF + prefix="${saved_prefix}" } process_targets() { @@ -67,10 +67,10 @@ Codecs: EOF #restore editor state ' - local family; - local last_family; - local c; - local str; + family=""; + last_family=""; + c=""; + str=""; for c in ${CODECS}; do family=${c%_*} if [ "${family}" != "${last_family}" ]; then @@ -272,7 +272,6 @@ HAVE_LIST=" unistd_h " EXPERIMENT_LIST=" - multiple_arf spatial_svc vp9_temporal_denoising fp_mb_stats @@ -412,7 +411,7 @@ process_cmdline() { } post_process_cmdline() { - local c + c="" # If the codec family is disabled, disable all components of that family. # If the codec family is enabled, enable all components of that family. @@ -459,8 +458,8 @@ process_targets() { enabled universal && echo "FAT_ARCHS=${fat_bin_archs}" >> config.mk # Calculate the default distribution name, based on the enabled features - local cf - local DIST_DIR=vpx + cf="" + DIST_DIR=vpx for cf in $CODEC_FAMILIES; do if enabled ${cf}_encoder && enabled ${cf}_decoder; then DIST_DIR="${DIST_DIR}-${cf}" @@ -482,7 +481,7 @@ process_targets() { ;; esac if [ -f "${source_path}/build/make/version.sh" ]; then - local ver=`"$source_path/build/make/version.sh" --bare "$source_path"` + ver=`"$source_path/build/make/version.sh" --bare "$source_path"` DIST_DIR="${DIST_DIR}-${ver}" VERSION_STRING=${ver} ver=${ver%%-*} @@ -516,7 +515,7 @@ EOF # Write makefiles for all enabled targets # for tgt in libs examples docs solution; do - local tgt_fn="$tgt-$toolchain.mk" + tgt_fn="$tgt-$toolchain.mk" if enabled $tgt; then echo "Creating makefiles for ${toolchain} ${tgt}" @@ -555,7 +554,7 @@ process_detect() { true; ;; *) - local result=false + result=false for d in "$@"; do [ -f "${d##-I}/$header" ] && result=true && break done @@ -604,7 +603,7 @@ process_toolchain() { # Handle universal binaries for this architecture case $toolchain in universal-darwin*) - local darwin_ver=${tgt_os##darwin} + darwin_ver=${tgt_os##darwin} # Snow Leopard (10.6/darwin10) dropped support for PPC # Include PPC support for all prior versions diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c index a7ad9f0c6..be3e7b2f1 100644 --- a/examples/vpx_temporal_svc_encoder.c +++ b/examples/vpx_temporal_svc_encoder.c @@ -579,7 +579,7 @@ int main(int argc, char **argv) { if (strncmp(encoder->name, "vp8", 3) == 0) { vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed); - vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYOnly); + vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYOnly); } else if (strncmp(encoder->name, "vp9", 3) == 0) { vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed); vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3); @@ -409,12 +409,16 @@ $(LIBVPX_TEST_DATA): $(SRC_PATH_BARE)/test/test-data.sha1 curl -L -o $@ $(call libvpx_test_data_url,$(@F)) testdata:: $(LIBVPX_TEST_DATA) - $(qexec)if [ -x "$$(which sha1sum)" ]; then\ + $(qexec)[ -x "$$(which sha1sum)" ] && sha1sum=sha1sum;\ + [ -x "$$(which shasum)" ] && sha1sum=shasum;\ + [ -x "$$(which sha1)" ] && sha1sum=sha1;\ + if [ -n "$${sha1sum}" ]; then\ + set -e;\ echo "Checking test data:";\ if [ -n "$(LIBVPX_TEST_DATA)" ]; then\ for f in $(call enabled,LIBVPX_TEST_DATA); do\ grep $$f $(SRC_PATH_BARE)/test/test-data.sha1 |\ - (cd $(LIBVPX_TEST_DATA_PATH); sha1sum -c);\ + (cd $(LIBVPX_TEST_DATA_PATH); $${sha1sum} -c);\ done; \ fi; \ else\ diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index bd7ddd8b6..ee417ce2e 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -606,29 +606,4 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( make_tuple(&vp9_fdct16x16_c, &vp9_idct16x16_256_add_ssse3, 0))); #endif - -#if HAVE_AVX2 -// TODO(jzern): these prototypes can be removed after the avx2 versions are -// reenabled in vp9_rtcd_defs.pl. -extern "C" { -void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride); -void vp9_fht16x16_avx2(const int16_t *input, int16_t *output, int stride, - int tx_type); -} -INSTANTIATE_TEST_CASE_P( - DISABLED_AVX2, Trans16x16DCT, - ::testing::Values( - make_tuple(&vp9_fdct16x16_avx2, - &vp9_idct16x16_256_add_c, 0))); -INSTANTIATE_TEST_CASE_P( - AVX2, Trans16x16HT, - ::testing::Values( - make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 3))); -INSTANTIATE_TEST_CASE_P( - DISABLED_AVX2, Trans16x16HT, - ::testing::Values( - make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 0), - make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 1), - make_tuple(&vp9_fht16x16_avx2, &vp9_iht16x16_256_add_c, 2))); -#endif } // namespace diff --git a/test/decode_to_md5.sh b/test/decode_to_md5.sh index 6cb7d0e6e..854b74f84 100755 --- a/test/decode_to_md5.sh +++ b/test/decode_to_md5.sh @@ -44,8 +44,8 @@ decode_to_md5() { [ -e "${output_file}" ] || return 1 - local md5_last_frame=$(tail -n1 "${output_file}") - local actual_md5=$(echo "${md5_last_frame% *}" | tr -d [:space:]) + local md5_last_frame="$(tail -n1 "${output_file}" | awk '{print $1}')" + local actual_md5="$(echo "${md5_last_frame}" | awk '{print $1}')" [ "${actual_md5}" = "${expected_md5}" ] || return 1 } diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc index bea19f188..7c4826086 100644 --- a/test/fdct4x4_test.cc +++ b/test/fdct4x4_test.cc @@ -376,19 +376,4 @@ INSTANTIATE_TEST_CASE_P( make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3))); #endif -#if HAVE_AVX2 -INSTANTIATE_TEST_CASE_P( - AVX2, Trans4x4DCT, - ::testing::Values( - make_tuple(&vp9_fdct4x4_avx2, - &vp9_idct4x4_16_add_c, 0))); -INSTANTIATE_TEST_CASE_P( - AVX2, Trans4x4HT, - ::testing::Values( - make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 0), - make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 1), - make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 2), - make_tuple(&vp9_fht4x4_avx2, &vp9_iht4x4_16_add_c, 3))); -#endif - } // namespace diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc index 25908374a..567e5f698 100644 --- a/test/fdct8x8_test.cc +++ b/test/fdct8x8_test.cc @@ -337,7 +337,7 @@ INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P( NEON, FwdTrans8x8DCT, ::testing::Values( - make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_neon, 0))); + make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0))); INSTANTIATE_TEST_CASE_P( DISABLED_NEON, FwdTrans8x8HT, ::testing::Values( @@ -367,18 +367,4 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( make_tuple(&vp9_fdct8x8_ssse3, &vp9_idct8x8_64_add_ssse3, 0))); #endif - -#if HAVE_AVX2 -INSTANTIATE_TEST_CASE_P( - AVX2, FwdTrans8x8DCT, - ::testing::Values( - make_tuple(&vp9_fdct8x8_avx2, &vp9_idct8x8_64_add_c, 0))); -INSTANTIATE_TEST_CASE_P( - AVX2, FwdTrans8x8HT, - ::testing::Values( - make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 0), - make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 1), - make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 2), - make_tuple(&vp9_fht8x8_avx2, &vp9_iht8x8_64_add_c, 3))); -#endif } // namespace diff --git a/test/test-data.sha1 b/test/test-data.sha1 index 98ac0e670..ee6289f1e 100644 --- a/test/test-data.sha1 +++ b/test/test-data.sha1 @@ -679,4 +679,5 @@ edd86a1f5e62fd9da9a9d46078247759c2638009 tacomasmallcameramovement_640_480_30.y 9a70e8b7d14fba9234d0e51dce876635413ce444 thaloundeskmtg_640_480_30.yuv e7d315dbf4f3928779e0dc624311196d44491d32 niklas_1280_720_30.yuv c77e4a26616add298a05dd5d12397be22c0e40c5 vp90-2-18-resize.ivf -c77e4a26616add298a05dd5d12397be22c0e40c5 vp90-2-18-resize.ivf +c12918cf0a716417fba2de35c3fc5ab90e52dfce vp90-2-18-resize.ivf.md5 +717da707afcaa1f692ff1946f291054eb75a4f06 screendata.y4m diff --git a/test/test.mk b/test/test.mk index 53d40572a..057e1e8e3 100644 --- a/test/test.mk +++ b/test/test.mk @@ -99,8 +99,8 @@ LIBVPX_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_fdct4x4_test.cc LIBVPX_TEST_SRCS-yes += idct_test.cc LIBVPX_TEST_SRCS-yes += intrapred_test.cc -LIBVPX_TEST_SRCS-yes += scale_border_test.cc LIBVPX_TEST_SRCS-yes += sixtap_predict_test.cc +LIBVPX_TEST_SRCS-yes += vpx_scale_test.cc endif # VP8 diff --git a/test/tools_common.sh b/test/tools_common.sh index 3e69c3687..e98beadf8 100755 --- a/test/tools_common.sh +++ b/test/tools_common.sh @@ -182,38 +182,6 @@ webm_io_available() { [ "$(vpx_config_option_enabled CONFIG_WEBM_IO)" = "yes" ] && echo yes } -# Echoes yes to stdout when vpxdec exists according to vpx_tool_available(). -vpxdec_available() { - [ -n $(vpx_tool_available vpxdec) ] && echo yes -} - -# Wrapper function for running vpxdec in noblit mode. Requires that -# LIBVPX_BIN_PATH points to the directory containing vpxdec. Positional -# parameter one is used as the input file path. Positional parameter two, when -# present, is interpreted as a boolean flag that means the input should be sent -# to vpxdec via pipe from cat instead of directly. -vpxdec() { - local input="${1}" - local pipe_input=${2} - - if [ $# -gt 2 ]; then - # shift away $1 and $2 so the remaining arguments can be passed to vpxdec - # via $@. - shift 2 - fi - - local decoder="${LIBVPX_BIN_PATH}/vpxdec${VPX_TEST_EXE_SUFFIX}" - - if [ -z "${pipe_input}" ]; then - eval "${VPX_TEST_PREFIX}" "${decoder}" "$input" --summary --noblit "$@" \ - ${devnull} - else - cat "${input}" \ - | eval "${VPX_TEST_PREFIX}" "${decoder}" - --summary --noblit "$@" \ - ${devnull} - fi -} - # Echoes yes to stdout when vpxenc exists according to vpx_tool_available(). vpxenc_available() { [ -n $(vpx_tool_available vpxenc) ] && echo yes diff --git a/test/variance_test.cc b/test/variance_test.cc index 9dc7c6a45..83b7435e6 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -756,6 +756,18 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2), make_tuple(6, 6, subpel_avg_variance64x64_avx2))); #endif // HAVE_AVX2 +#if HAVE_NEON +const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon; +INSTANTIATE_TEST_CASE_P( + NEON, VP9VarianceTest, + ::testing::Values(make_tuple(4, 4, variance16x16_neon))); + +const vp9_subpixvariance_fn_t subpel_variance16x16_neon = + vp9_sub_pixel_variance16x16_neon; +INSTANTIATE_TEST_CASE_P( + NEON, VP9SubpelVarianceTest, + ::testing::Values(make_tuple(4, 4, subpel_variance16x16_neon))); +#endif // HAVE_NEON #endif // CONFIG_VP9_ENCODER } // namespace vp9 diff --git a/test/vp9_spatial_svc_encoder.sh b/test/vp9_spatial_svc_encoder.sh index a5728f677..7a964a920 100755 --- a/test/vp9_spatial_svc_encoder.sh +++ b/test/vp9_spatial_svc_encoder.sh @@ -25,12 +25,13 @@ vp9_spatial_svc_encoder_verify_environment() { # Runs vp9_spatial_svc_encoder. $1 is the test name. vp9_spatial_svc_encoder() { - local encoder="${LIBVPX_BIN_PATH}/vp9_spatial_svc_encoder" - encoder="${encoder}${VPX_TEST_EXE_SUFFIX}" - local test_name="$1" - local output_file="${VPX_TEST_OUTPUT_DIR}/vp9_ssvc_encoder${test_name}.ivf" - local frames_to_encode="10" - local max_kf="9999" + local readonly \ + encoder="${LIBVPX_BIN_PATH}/vp9_spatial_svc_encoder${VPX_TEST_EXE_SUFFIX}" + local readonly test_name="$1" + local readonly \ + output_file="${VPX_TEST_OUTPUT_DIR}/vp9_ssvc_encoder${test_name}.ivf" + local readonly frames_to_encode=10 + local readonly max_kf=9999 shift @@ -40,8 +41,8 @@ vp9_spatial_svc_encoder() { fi eval "${VPX_TEST_PREFIX}" "${encoder}" -w "${YUV_RAW_INPUT_WIDTH}" \ - -h "${YUV_RAW_INPUT_HEIGHT}" -k "${max_kf}" -f "${frames_to_encode}" \ - "$@" "${YUV_RAW_INPUT}" "${output_file}" ${devnull} + -h "${YUV_RAW_INPUT_HEIGHT}" -k "${max_kf}" -f "${frames_to_encode}" \ + "$@" "${YUV_RAW_INPUT}" "${output_file}" ${devnull} [ -e "${output_file}" ] || return 1 } @@ -49,43 +50,57 @@ vp9_spatial_svc_encoder() { # Each mode is run with layer count 1-$vp9_ssvc_test_layers. vp9_ssvc_test_layers=5 -vp9_spatial_svc_mode_i() { +DISABLED_vp9_spatial_svc_mode_i() { if [ "$(vp9_encode_available)" = "yes" ]; then - local test_name="${FUNCNAME}" + local readonly test_name="DISABLED_vp9_spatial_svc_mode_i" for layers in $(seq 1 ${vp9_ssvc_test_layers}); do vp9_spatial_svc_encoder "${test_name}" -m i -l ${layers} done fi } -vp9_spatial_svc_mode_altip() { +DISABLED_vp9_spatial_svc_mode_altip() { if [ "$(vp9_encode_available)" = "yes" ]; then - local test_name="${FUNCNAME}" + local readonly test_name="DISABLED_vp9_spatial_svc_mode_altip" for layers in $(seq 1 ${vp9_ssvc_test_layers}); do vp9_spatial_svc_encoder "${test_name}" -m "alt-ip" -l ${layers} done fi } -vp9_spatial_svc_mode_ip() { +DISABLED_vp9_spatial_svc_mode_ip() { if [ "$(vp9_encode_available)" = "yes" ]; then - local test_name="${FUNCNAME}" + local readonly test_name="DISABLED_vp9_spatial_svc_mode_ip" vp9_spatial_svc_encoder "${test_name}" -m ip -l 1 fi } -vp9_spatial_svc_mode_gf() { +DISABLED_vp9_spatial_svc_mode_gf() { if [ "$(vp9_encode_available)" = "yes" ]; then - local test_name="${FUNCNAME}" + local readonly test_name="DISABLED_vp9_spatial_svc_mode_gf" for layers in $(seq 1 ${vp9_ssvc_test_layers}); do vp9_spatial_svc_encoder "${test_name}" -m gf -l ${layers} done fi } -vp9_spatial_svc_tests="vp9_spatial_svc_mode_i - vp9_spatial_svc_mode_altip - vp9_spatial_svc_mode_ip - vp9_spatial_svc_mode_gf" +vp9_spatial_svc() { + if [ "$(vp9_encode_available)" = "yes" ]; then + local readonly test_name="vp9_spatial_svc" + for layers in $(seq 1 ${vp9_ssvc_test_layers}); do + vp9_spatial_svc_encoder "${test_name}" -l ${layers} + done + fi +} + +readonly vp9_spatial_svc_tests="DISABLED_vp9_spatial_svc_mode_i + DISABLED_vp9_spatial_svc_mode_altip + DISABLED_vp9_spatial_svc_mode_ip + DISABLED_vp9_spatial_svc_mode_gf + vp9_spatial_svc" -run_tests vp9_spatial_svc_encoder_verify_environment "${vp9_spatial_svc_tests}" +if [ "$(vpx_config_option_enabled CONFIG_SPATIAL_SVC)" = "yes" ]; then + run_tests \ + vp9_spatial_svc_encoder_verify_environment \ + "${vp9_spatial_svc_tests}" +fi diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc index 2e9676333..fabb43824 100644 --- a/test/vp9_subtract_test.cc +++ b/test/vp9_subtract_test.cc @@ -95,4 +95,9 @@ INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest, INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest, ::testing::Values(vp9_subtract_block_sse2)); #endif +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P(NEON, VP9SubtractBlockTest, + ::testing::Values(vp9_subtract_block_neon)); +#endif + } // namespace vp9 diff --git a/test/scale_border_test.cc b/test/vpx_scale_test.cc index cc9a69a7d..b3302d942 100644 --- a/test/scale_border_test.cc +++ b/test/vpx_scale_test.cc @@ -21,11 +21,12 @@ namespace { typedef void (*ExtendFrameBorderFunc)(YV12_BUFFER_CONFIG *ybf); +typedef void (*CopyFrameFunc)(const YV12_BUFFER_CONFIG *src_ybf, + YV12_BUFFER_CONFIG *dst_ybf); -class ExtendBorderTest - : public ::testing::TestWithParam<ExtendFrameBorderFunc> { +class VpxScaleBase { public: - virtual ~ExtendBorderTest() { + virtual ~VpxScaleBase() { libvpx_test::ClearSystemState(); } @@ -35,7 +36,6 @@ class ExtendBorderTest vpx_memset(&img_, 0, sizeof(img_)); ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&img_, width_, height_, VP8BORDERINPIXELS)); - vpx_memset(img_.buffer_alloc, kBufFiller, img_.frame_size); FillPlane(img_.y_buffer, img_.y_crop_width, img_.y_crop_height, img_.y_stride); @@ -47,31 +47,25 @@ class ExtendBorderTest vpx_memset(&ref_img_, 0, sizeof(ref_img_)); ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&ref_img_, width_, height_, VP8BORDERINPIXELS)); - vpx_memset(ref_img_.buffer_alloc, kBufFiller, ref_img_.frame_size); - FillPlane(ref_img_.y_buffer, ref_img_.y_crop_width, ref_img_.y_crop_height, - ref_img_.y_stride); - FillPlane(ref_img_.u_buffer, - ref_img_.uv_crop_width, ref_img_.uv_crop_height, - ref_img_.uv_stride); - FillPlane(ref_img_.v_buffer, - ref_img_.uv_crop_width, ref_img_.uv_crop_height, - ref_img_.uv_stride); + + vpx_memset(&cpy_img_, 0, sizeof(cpy_img_)); + ASSERT_EQ(0, vp8_yv12_alloc_frame_buffer(&cpy_img_, width_, height_, + VP8BORDERINPIXELS)); + vpx_memset(cpy_img_.buffer_alloc, kBufFiller, cpy_img_.frame_size); + ReferenceCopyFrame(); } void DeallocImage() { vp8_yv12_de_alloc_frame_buffer(&img_); vp8_yv12_de_alloc_frame_buffer(&ref_img_); + vp8_yv12_de_alloc_frame_buffer(&cpy_img_); } - private: + protected: static const int kBufFiller = 123; static const int kBufMax = kBufFiller - 1; - virtual void SetUp() { - extend_fn_ = GetParam(); - } - static void FillPlane(uint8_t *buf, int width, int height, int stride) { for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { @@ -80,24 +74,6 @@ class ExtendBorderTest } } - void ReferenceExtendBorder() { - ExtendPlane(ref_img_.y_buffer, - ref_img_.y_crop_width, ref_img_.y_crop_height, - ref_img_.y_width, ref_img_.y_height, - ref_img_.y_stride, - ref_img_.border); - ExtendPlane(ref_img_.u_buffer, - ref_img_.uv_crop_width, ref_img_.uv_crop_height, - ref_img_.uv_width, ref_img_.uv_height, - ref_img_.uv_stride, - ref_img_.border / 2); - ExtendPlane(ref_img_.v_buffer, - ref_img_.uv_crop_width, ref_img_.uv_crop_height, - ref_img_.uv_width, ref_img_.uv_height, - ref_img_.uv_stride, - ref_img_.border / 2); - } - static void ExtendPlane(uint8_t *buf, int crop_width, int crop_height, int width, int height, int stride, int padding) { // Copy the outermost visible pixel to a distance of at least 'padding.' @@ -136,17 +112,75 @@ class ExtendBorderTest } } - void ExtendBorder() { - ASM_REGISTER_STATE_CHECK(extend_fn_(&img_)); + void ReferenceExtendBorder() { + ExtendPlane(ref_img_.y_buffer, + ref_img_.y_crop_width, ref_img_.y_crop_height, + ref_img_.y_width, ref_img_.y_height, + ref_img_.y_stride, + ref_img_.border); + ExtendPlane(ref_img_.u_buffer, + ref_img_.uv_crop_width, ref_img_.uv_crop_height, + ref_img_.uv_width, ref_img_.uv_height, + ref_img_.uv_stride, + ref_img_.border / 2); + ExtendPlane(ref_img_.v_buffer, + ref_img_.uv_crop_width, ref_img_.uv_crop_height, + ref_img_.uv_width, ref_img_.uv_height, + ref_img_.uv_stride, + ref_img_.border / 2); } - void CompareImages() { + void ReferenceCopyFrame() { + // Copy img_ to ref_img_ and extend frame borders. This will be used for + // verifying extend_fn_ as well as copy_frame_fn_. EXPECT_EQ(ref_img_.frame_size, img_.frame_size); - EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, img_.buffer_alloc, + for (int y = 0; y < img_.y_crop_height; ++y) { + for (int x = 0; x < img_.y_crop_width; ++x) { + ref_img_.y_buffer[x + y * ref_img_.y_stride] = + img_.y_buffer[x + y * img_.y_stride]; + } + } + + for (int y = 0; y < img_.uv_crop_height; ++y) { + for (int x = 0; x < img_.uv_crop_width; ++x) { + ref_img_.u_buffer[x + y * ref_img_.uv_stride] = + img_.u_buffer[x + y * img_.uv_stride]; + ref_img_.v_buffer[x + y * ref_img_.uv_stride] = + img_.v_buffer[x + y * img_.uv_stride]; + } + } + + ReferenceExtendBorder(); + } + + void CompareImages(const YV12_BUFFER_CONFIG actual) { + EXPECT_EQ(ref_img_.frame_size, actual.frame_size); + EXPECT_EQ(0, memcmp(ref_img_.buffer_alloc, actual.buffer_alloc, ref_img_.frame_size)); } + YV12_BUFFER_CONFIG img_; + YV12_BUFFER_CONFIG ref_img_; + YV12_BUFFER_CONFIG cpy_img_; + int width_; + int height_; +}; + +class ExtendBorderTest + : public VpxScaleBase, + public ::testing::TestWithParam<ExtendFrameBorderFunc> { + public: + virtual ~ExtendBorderTest() {} + protected: + virtual void SetUp() { + extend_fn_ = GetParam(); + } + + void ExtendBorder() { + ASM_REGISTER_STATE_CHECK(extend_fn_(&img_)); + } + void RunTest() { #if ARCH_ARM // Some arm devices OOM when trying to allocate the largest buffers. @@ -160,17 +194,13 @@ class ExtendBorderTest ResetImage(kSizesToTest[w], kSizesToTest[h]); ExtendBorder(); ReferenceExtendBorder(); - CompareImages(); + CompareImages(img_); DeallocImage(); } } } - YV12_BUFFER_CONFIG img_; - YV12_BUFFER_CONFIG ref_img_; ExtendFrameBorderFunc extend_fn_; - int width_; - int height_; }; TEST_P(ExtendBorderTest, ExtendBorder) { @@ -179,4 +209,48 @@ TEST_P(ExtendBorderTest, ExtendBorder) { INSTANTIATE_TEST_CASE_P(C, ExtendBorderTest, ::testing::Values(vp8_yv12_extend_frame_borders_c)); + +class CopyFrameTest + : public VpxScaleBase, + public ::testing::TestWithParam<CopyFrameFunc> { + public: + virtual ~CopyFrameTest() {} + + protected: + virtual void SetUp() { + copy_frame_fn_ = GetParam(); + } + + void CopyFrame() { + ASM_REGISTER_STATE_CHECK(copy_frame_fn_(&img_, &cpy_img_)); + } + + void RunTest() { +#if ARCH_ARM + // Some arm devices OOM when trying to allocate the largest buffers. + static const int kNumSizesToTest = 6; +#else + static const int kNumSizesToTest = 7; +#endif + static const int kSizesToTest[] = {1, 15, 33, 145, 512, 1025, 16383}; + for (int h = 0; h < kNumSizesToTest; ++h) { + for (int w = 0; w < kNumSizesToTest; ++w) { + ResetImage(kSizesToTest[w], kSizesToTest[h]); + ReferenceCopyFrame(); + CopyFrame(); + CompareImages(cpy_img_); + DeallocImage(); + } + } + } + + CopyFrameFunc copy_frame_fn_; +}; + +TEST_P(CopyFrameTest, CopyFrame) { + ASSERT_NO_FATAL_FAILURE(RunTest()); +} + +INSTANTIATE_TEST_CASE_P(C, CopyFrameTest, + ::testing::Values(vp8_yv12_copy_frame_c)); } // namespace diff --git a/test/vpxdec.sh b/test/vpxdec.sh index 093230b69..836b13cce 100755 --- a/test/vpxdec.sh +++ b/test/vpxdec.sh @@ -22,6 +22,32 @@ vpxdec_verify_environment() { fi } +# Echoes yes to stdout when vpxdec exists according to vpx_tool_available(). +vpxdec_available() { + [ -n "$(vpx_tool_available vpxdec)" ] && echo yes +} + +# Wrapper function for running vpxdec with pipe input. Requires that +# LIBVPX_BIN_PATH points to the directory containing vpxdec. $1 is used as the +# input file path and shifted away. All remaining parameters are passed through +# to vpxdec. +vpxdec_pipe() { + local decoder="${LIBVPX_BIN_PATH}/vpxdec${VPX_TEST_EXE_SUFFIX}" + local input="$1" + shift + cat "${input}" | eval "${VPX_TEST_PREFIX}" "${decoder}" - "$@" ${devnull} +} + +# Wrapper function for running vpxdec. Requires that LIBVPX_BIN_PATH points to +# the directory containing vpxdec. $1 one is used as the input file path and +# shifted away. All remaining parameters are passed through to vpxdec. +vpxdec() { + local decoder="${LIBVPX_BIN_PATH}/vpxdec${VPX_TEST_EXE_SUFFIX}" + local input="${1}" + shift + eval "${VPX_TEST_PREFIX}" "${decoder}" "$input" "$@" ${devnull} +} + vpxdec_can_decode_vp8() { if [ "$(vpxdec_available)" = "yes" ] && \ [ "$(vp8_decode_available)" = "yes" ]; then @@ -38,20 +64,20 @@ vpxdec_can_decode_vp9() { vpxdec_vp8_ivf() { if [ "$(vpxdec_can_decode_vp8)" = "yes" ]; then - vpxdec "${VP8_IVF_FILE}" + vpxdec "${VP8_IVF_FILE}" --summary --noblit fi } vpxdec_vp8_ivf_pipe_input() { if [ "$(vpxdec_can_decode_vp8)" = "yes" ]; then - vpxdec "${VP8_IVF_FILE}" - + vpxdec_pipe "${VP8_IVF_FILE}" --summary --noblit fi } vpxdec_vp9_webm() { if [ "$(vpxdec_can_decode_vp9)" = "yes" ] && \ [ "$(webm_io_available)" = "yes" ]; then - vpxdec "${VP9_WEBM_FILE}" + vpxdec "${VP9_WEBM_FILE}" --summary --noblit fi } diff --git a/third_party/libmkv/EbmlIDs.h b/third_party/libmkv/EbmlIDs.h deleted file mode 100644 index 44d438583..000000000 --- a/third_party/libmkv/EbmlIDs.h +++ /dev/null @@ -1,231 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#ifndef MKV_DEFS_HPP -#define MKV_DEFS_HPP 1 - -/* Commenting out values not available in webm, but available in matroska */ - -enum mkv { - EBML = 0x1A45DFA3, - EBMLVersion = 0x4286, - EBMLReadVersion = 0x42F7, - EBMLMaxIDLength = 0x42F2, - EBMLMaxSizeLength = 0x42F3, - DocType = 0x4282, - DocTypeVersion = 0x4287, - DocTypeReadVersion = 0x4285, -/* CRC_32 = 0xBF, */ - Void = 0xEC, - SignatureSlot = 0x1B538667, - SignatureAlgo = 0x7E8A, - SignatureHash = 0x7E9A, - SignaturePublicKey = 0x7EA5, - Signature = 0x7EB5, - SignatureElements = 0x7E5B, - SignatureElementList = 0x7E7B, - SignedElement = 0x6532, - /* segment */ - Segment = 0x18538067, - /* Meta Seek Information */ - SeekHead = 0x114D9B74, - Seek = 0x4DBB, - SeekID = 0x53AB, - SeekPosition = 0x53AC, - /* Segment Information */ - Info = 0x1549A966, -/* SegmentUID = 0x73A4, */ -/* SegmentFilename = 0x7384, */ -/* PrevUID = 0x3CB923, */ -/* PrevFilename = 0x3C83AB, */ -/* NextUID = 0x3EB923, */ -/* NextFilename = 0x3E83BB, */ -/* SegmentFamily = 0x4444, */ -/* ChapterTranslate = 0x6924, */ -/* ChapterTranslateEditionUID = 0x69FC, */ -/* ChapterTranslateCodec = 0x69BF, */ -/* ChapterTranslateID = 0x69A5, */ - TimecodeScale = 0x2AD7B1, - Segment_Duration = 0x4489, - DateUTC = 0x4461, -/* Title = 0x7BA9, */ - MuxingApp = 0x4D80, - WritingApp = 0x5741, - /* Cluster */ - Cluster = 0x1F43B675, - Timecode = 0xE7, -/* SilentTracks = 0x5854, */ -/* SilentTrackNumber = 0x58D7, */ -/* Position = 0xA7, */ - PrevSize = 0xAB, - BlockGroup = 0xA0, - Block = 0xA1, -/* BlockVirtual = 0xA2, */ - BlockAdditions = 0x75A1, - BlockMore = 0xA6, - BlockAddID = 0xEE, - BlockAdditional = 0xA5, - BlockDuration = 0x9B, -/* ReferencePriority = 0xFA, */ - ReferenceBlock = 0xFB, -/* ReferenceVirtual = 0xFD, */ -/* CodecState = 0xA4, */ -/* Slices = 0x8E, */ -/* TimeSlice = 0xE8, */ - LaceNumber = 0xCC, -/* FrameNumber = 0xCD, */ -/* BlockAdditionID = 0xCB, */ -/* MkvDelay = 0xCE, */ -/* Cluster_Duration = 0xCF, */ - SimpleBlock = 0xA3, -/* EncryptedBlock = 0xAF, */ - /* Track */ - Tracks = 0x1654AE6B, - TrackEntry = 0xAE, - TrackNumber = 0xD7, - TrackUID = 0x73C5, - TrackType = 0x83, - FlagEnabled = 0xB9, - FlagDefault = 0x88, - FlagForced = 0x55AA, - FlagLacing = 0x9C, -/* MinCache = 0x6DE7, */ -/* MaxCache = 0x6DF8, */ - DefaultDuration = 0x23E383, -/* TrackTimecodeScale = 0x23314F, */ -/* TrackOffset = 0x537F, */ - MaxBlockAdditionID = 0x55EE, - Name = 0x536E, - Language = 0x22B59C, - CodecID = 0x86, - CodecPrivate = 0x63A2, - CodecName = 0x258688, -/* AttachmentLink = 0x7446, */ -/* CodecSettings = 0x3A9697, */ -/* CodecInfoURL = 0x3B4040, */ -/* CodecDownloadURL = 0x26B240, */ -/* CodecDecodeAll = 0xAA, */ -/* TrackOverlay = 0x6FAB, */ -/* TrackTranslate = 0x6624, */ -/* TrackTranslateEditionUID = 0x66FC, */ -/* TrackTranslateCodec = 0x66BF, */ -/* TrackTranslateTrackID = 0x66A5, */ - /* video */ - Video = 0xE0, - FlagInterlaced = 0x9A, - StereoMode = 0x53B8, - AlphaMode = 0x53C0, - PixelWidth = 0xB0, - PixelHeight = 0xBA, - PixelCropBottom = 0x54AA, - PixelCropTop = 0x54BB, - PixelCropLeft = 0x54CC, - PixelCropRight = 0x54DD, - DisplayWidth = 0x54B0, - DisplayHeight = 0x54BA, - DisplayUnit = 0x54B2, - AspectRatioType = 0x54B3, -/* ColourSpace = 0x2EB524, */ -/* GammaValue = 0x2FB523, */ - FrameRate = 0x2383E3, - /* end video */ - /* audio */ - Audio = 0xE1, - SamplingFrequency = 0xB5, - OutputSamplingFrequency = 0x78B5, - Channels = 0x9F, -/* ChannelPositions = 0x7D7B, */ - BitDepth = 0x6264, - /* end audio */ - /* content encoding */ -/* ContentEncodings = 0x6d80, */ -/* ContentEncoding = 0x6240, */ -/* ContentEncodingOrder = 0x5031, */ -/* ContentEncodingScope = 0x5032, */ -/* ContentEncodingType = 0x5033, */ -/* ContentCompression = 0x5034, */ -/* ContentCompAlgo = 0x4254, */ -/* ContentCompSettings = 0x4255, */ -/* ContentEncryption = 0x5035, */ -/* ContentEncAlgo = 0x47e1, */ -/* ContentEncKeyID = 0x47e2, */ -/* ContentSignature = 0x47e3, */ -/* ContentSigKeyID = 0x47e4, */ -/* ContentSigAlgo = 0x47e5, */ -/* ContentSigHashAlgo = 0x47e6, */ - /* end content encoding */ - /* Cueing Data */ - Cues = 0x1C53BB6B, - CuePoint = 0xBB, - CueTime = 0xB3, - CueTrackPositions = 0xB7, - CueTrack = 0xF7, - CueClusterPosition = 0xF1, - CueBlockNumber = 0x5378 -/* CueCodecState = 0xEA, */ -/* CueReference = 0xDB, */ -/* CueRefTime = 0x96, */ -/* CueRefCluster = 0x97, */ -/* CueRefNumber = 0x535F, */ -/* CueRefCodecState = 0xEB, */ - /* Attachment */ -/* Attachments = 0x1941A469, */ -/* AttachedFile = 0x61A7, */ -/* FileDescription = 0x467E, */ -/* FileName = 0x466E, */ -/* FileMimeType = 0x4660, */ -/* FileData = 0x465C, */ -/* FileUID = 0x46AE, */ -/* FileReferral = 0x4675, */ - /* Chapters */ -/* Chapters = 0x1043A770, */ -/* EditionEntry = 0x45B9, */ -/* EditionUID = 0x45BC, */ -/* EditionFlagHidden = 0x45BD, */ -/* EditionFlagDefault = 0x45DB, */ -/* EditionFlagOrdered = 0x45DD, */ -/* ChapterAtom = 0xB6, */ -/* ChapterUID = 0x73C4, */ -/* ChapterTimeStart = 0x91, */ -/* ChapterTimeEnd = 0x92, */ -/* ChapterFlagHidden = 0x98, */ -/* ChapterFlagEnabled = 0x4598, */ -/* ChapterSegmentUID = 0x6E67, */ -/* ChapterSegmentEditionUID = 0x6EBC, */ -/* ChapterPhysicalEquiv = 0x63C3, */ -/* ChapterTrack = 0x8F, */ -/* ChapterTrackNumber = 0x89, */ -/* ChapterDisplay = 0x80, */ -/* ChapString = 0x85, */ -/* ChapLanguage = 0x437C, */ -/* ChapCountry = 0x437E, */ -/* ChapProcess = 0x6944, */ -/* ChapProcessCodecID = 0x6955, */ -/* ChapProcessPrivate = 0x450D, */ -/* ChapProcessCommand = 0x6911, */ -/* ChapProcessTime = 0x6922, */ -/* ChapProcessData = 0x6933, */ - /* Tagging */ -/* Tags = 0x1254C367, */ -/* Tag = 0x7373, */ -/* Targets = 0x63C0, */ -/* TargetTypeValue = 0x68CA, */ -/* TargetType = 0x63CA, */ -/* Tagging_TrackUID = 0x63C5, */ -/* Tagging_EditionUID = 0x63C9, */ -/* Tagging_ChapterUID = 0x63C4, */ -/* AttachmentUID = 0x63C6, */ -/* SimpleTag = 0x67C8, */ -/* TagName = 0x45A3, */ -/* TagLanguage = 0x447A, */ -/* TagDefault = 0x4484, */ -/* TagString = 0x4487, */ -/* TagBinary = 0x4485, */ -}; -#endif diff --git a/third_party/libmkv/EbmlWriter.c b/third_party/libmkv/EbmlWriter.c deleted file mode 100644 index 27cfe861c..000000000 --- a/third_party/libmkv/EbmlWriter.c +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#include "EbmlWriter.h" -#include <stdlib.h> -#include <wchar.h> -#include <string.h> -#include <limits.h> -#if defined(_MSC_VER) -#define LITERALU64(n) n -#else -#define LITERALU64(n) n##LLU -#endif - -void Ebml_WriteLen(EbmlGlobal *glob, int64_t val) { - /* TODO check and make sure we are not > than 0x0100000000000000LLU */ - unsigned char size = 8; /* size in bytes to output */ - - /* mask to compare for byte size */ - int64_t minVal = 0xff; - - for (size = 1; size < 8; size ++) { - if (val < minVal) - break; - - minVal = (minVal << 7); - } - - val |= (((uint64_t)0x80) << ((size - 1) * 7)); - - Ebml_Serialize(glob, (void *) &val, sizeof(val), size); -} - -void Ebml_WriteString(EbmlGlobal *glob, const char *str) { - const size_t size_ = strlen(str); - const uint64_t size = size_; - Ebml_WriteLen(glob, size); - /* TODO: it's not clear from the spec whether the nul terminator - * should be serialized too. For now we omit the null terminator. - */ - Ebml_Write(glob, str, (unsigned long)size); -} - -void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr) { - const size_t strlen = wcslen(wstr); - - /* TODO: it's not clear from the spec whether the nul terminator - * should be serialized too. For now we include it. - */ - const uint64_t size = strlen; - - Ebml_WriteLen(glob, size); - Ebml_Write(glob, wstr, (unsigned long)size); -} - -void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id) { - int len; - - if (class_id >= 0x01000000) - len = 4; - else if (class_id >= 0x00010000) - len = 3; - else if (class_id >= 0x00000100) - len = 2; - else - len = 1; - - Ebml_Serialize(glob, (void *)&class_id, sizeof(class_id), len); -} - -void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui) { - unsigned char sizeSerialized = 8 | 0x80; - Ebml_WriteID(glob, class_id); - Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1); - Ebml_Serialize(glob, &ui, sizeof(ui), 8); -} - -void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui) { - unsigned char size = 8; /* size in bytes to output */ - unsigned char sizeSerialized = 0; - unsigned long minVal; - - Ebml_WriteID(glob, class_id); - minVal = 0x7fLU; /* mask to compare for byte size */ - - for (size = 1; size < 4; size ++) { - if (ui < minVal) { - break; - } - - minVal <<= 7; - } - - sizeSerialized = 0x80 | size; - Ebml_Serialize(glob, &sizeSerialized, sizeof(sizeSerialized), 1); - Ebml_Serialize(glob, &ui, sizeof(ui), size); -} -/* TODO: perhaps this is a poor name for this id serializer helper function */ -void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long bin) { - int size; - for (size = 4; size > 1; size--) { - if (bin & (unsigned int)0x000000ff << ((size - 1) * 8)) - break; - } - Ebml_WriteID(glob, class_id); - Ebml_WriteLen(glob, size); - Ebml_WriteID(glob, bin); -} - -void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d) { - unsigned char len = 0x88; - - Ebml_WriteID(glob, class_id); - Ebml_Serialize(glob, &len, sizeof(len), 1); - Ebml_Serialize(glob, &d, sizeof(d), 8); -} - -void Ebml_WriteSigned16(EbmlGlobal *glob, short val) { - signed long out = ((val & 0x003FFFFF) | 0x00200000) << 8; - Ebml_Serialize(glob, &out, sizeof(out), 3); -} - -void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s) { - Ebml_WriteID(glob, class_id); - Ebml_WriteString(glob, s); -} - -void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s) { - Ebml_WriteID(glob, class_id); - Ebml_WriteUTF8(glob, s); -} - -void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length) { - Ebml_WriteID(glob, class_id); - Ebml_WriteLen(glob, data_length); - Ebml_Write(glob, data, data_length); -} - -void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize) { - unsigned char tmp = 0; - unsigned long i = 0; - - Ebml_WriteID(glob, 0xEC); - Ebml_WriteLen(glob, vSize); - - for (i = 0; i < vSize; i++) { - Ebml_Write(glob, &tmp, 1); - } -} - -/* TODO Serialize Date */ diff --git a/third_party/libmkv/EbmlWriter.h b/third_party/libmkv/EbmlWriter.h deleted file mode 100644 index b94f75733..000000000 --- a/third_party/libmkv/EbmlWriter.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ -#ifndef EBMLWRITER_HPP -#define EBMLWRITER_HPP -#include <stddef.h> -#include "vpx/vpx_integer.h" - -/* note: you must define write and serialize functions as well as your own - * EBML_GLOBAL - * - * These functions MUST be implemented - */ - -typedef struct EbmlGlobal EbmlGlobal; -void Ebml_Serialize(EbmlGlobal *glob, const void *, int, unsigned long); -void Ebml_Write(EbmlGlobal *glob, const void *, unsigned long); - -/*****/ - -void Ebml_WriteLen(EbmlGlobal *glob, int64_t val); -void Ebml_WriteString(EbmlGlobal *glob, const char *str); -void Ebml_WriteUTF8(EbmlGlobal *glob, const wchar_t *wstr); -void Ebml_WriteID(EbmlGlobal *glob, unsigned long class_id); -void Ebml_SerializeUnsigned64(EbmlGlobal *glob, unsigned long class_id, uint64_t ui); -void Ebml_SerializeUnsigned(EbmlGlobal *glob, unsigned long class_id, unsigned long ui); -void Ebml_SerializeBinary(EbmlGlobal *glob, unsigned long class_id, unsigned long ui); -void Ebml_SerializeFloat(EbmlGlobal *glob, unsigned long class_id, double d); -/* TODO make this more generic to signed */ -void Ebml_WriteSigned16(EbmlGlobal *glob, short val); -void Ebml_SerializeString(EbmlGlobal *glob, unsigned long class_id, const char *s); -void Ebml_SerializeUTF8(EbmlGlobal *glob, unsigned long class_id, wchar_t *s); -void Ebml_SerializeData(EbmlGlobal *glob, unsigned long class_id, unsigned char *data, unsigned long data_length); -void Ebml_WriteVoid(EbmlGlobal *glob, unsigned long vSize); -/* TODO need date function */ -#endif diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index 7d9441d54..ef7f61b12 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -108,8 +108,8 @@ extern "C" * For temporal denoiser: noise_sensitivity = 0 means off, * noise_sensitivity = 1 means temporal denoiser on for Y channel only, * noise_sensitivity = 2 means temporal denoiser on for all channels. - * noise_sensitivity = 3 will be used for aggressive mode in future. - * Temporal denoiser is enabled via the build option + * noise_sensitivity = 3 means aggressive denoising mode. + * Temporal denoiser is enabled via the configuration option: * CONFIG_TEMPORAL_DENOISING. * For spatial denoiser: noise_sensitivity controls the amount of * pre-processing blur: noise_sensitivity = 0 means off. diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c index 1a401a4b9..c4c0de81b 100644 --- a/vp8/encoder/denoising.c +++ b/vp8/encoder/denoising.c @@ -8,6 +8,8 @@ * be found in the AUTHORS file in the root of the source tree. */ +#include <limits.h> + #include "denoising.h" #include "vp8/common/reconinter.h" @@ -333,12 +335,33 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, return FILTER_BLOCK; } +void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser) { + if (!denoiser->aggressive_mode) { + denoiser->denoise_pars.scale_sse_thresh = 1; + denoiser->denoise_pars.scale_motion_thresh = 8; + denoiser->denoise_pars.scale_increase_filter = 0; + denoiser->denoise_pars.denoise_mv_bias = 95; + denoiser->denoise_pars.pickmode_mv_bias = 100; + denoiser->denoise_pars.qp_thresh = 0; + denoiser->denoise_pars.consec_zerolast = UINT_MAX; + } else { + denoiser->denoise_pars.scale_sse_thresh = 2; + denoiser->denoise_pars.scale_motion_thresh = 16; + denoiser->denoise_pars.scale_increase_filter = 1; + denoiser->denoise_pars.denoise_mv_bias = 60; + denoiser->denoise_pars.pickmode_mv_bias = 60; + denoiser->denoise_pars.qp_thresh = 100; + denoiser->denoise_pars.consec_zerolast = 10; + } +} + int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height, - int num_mb_rows, int num_mb_cols) + int num_mb_rows, int num_mb_cols, int mode) { int i; assert(denoiser); denoiser->num_mb_cols = num_mb_cols; + denoiser->aggressive_mode = mode; for (i = 0; i < MAX_REF_FRAMES; i++) { @@ -369,10 +392,11 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height, denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1); vpx_memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols)); - + vp8_denoiser_set_parameters(denoiser); return 0; } + void vp8_denoiser_free(VP8_DENOISER *denoiser) { int i; @@ -401,6 +425,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, { int mv_row; int mv_col; + unsigned int motion_threshold; unsigned int motion_magnitude2; unsigned int sse_thresh; int sse_diff_thresh = 0; @@ -424,7 +449,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, MB_MODE_INFO *mbmi = &filter_xd->mode_info_context->mbmi; int sse_diff = 0; // Bias on zero motion vector sse. - int zero_bias = 95; + const int zero_bias = denoiser->denoise_pars.denoise_mv_bias; zero_mv_sse = (unsigned int)((int64_t)zero_mv_sse * zero_bias / 100); sse_diff = zero_mv_sse - best_sse; @@ -502,14 +527,19 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser, mv_row = x->best_sse_mv.as_mv.row; mv_col = x->best_sse_mv.as_mv.col; motion_magnitude2 = mv_row * mv_row + mv_col * mv_col; - sse_thresh = SSE_THRESHOLD; - if (x->increase_denoising) sse_thresh = SSE_THRESHOLD_HIGH; + motion_threshold = denoiser->denoise_pars.scale_motion_thresh * + NOISE_MOTION_THRESHOLD; - if (best_sse > sse_thresh || motion_magnitude2 - > 8 * NOISE_MOTION_THRESHOLD) - { - decision = COPY_BLOCK; - } + if (motion_magnitude2 < + denoiser->denoise_pars.scale_increase_filter * NOISE_MOTION_THRESHOLD) + x->increase_denoising = 1; + + sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD; + if (x->increase_denoising) + sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD_HIGH; + + if (best_sse > sse_thresh || motion_magnitude2 > motion_threshold) + decision = COPY_BLOCK; if (decision == FILTER_BLOCK) { diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h index a1f195b72..1a42f86d3 100644 --- a/vp8/encoder/denoising.h +++ b/vp8/encoder/denoising.h @@ -39,16 +39,40 @@ enum vp8_denoiser_filter_state { kFilterNonZeroMV }; +typedef struct { + // Scale factor on sse threshold above which no denoising is done. + unsigned int scale_sse_thresh; + // Scale factor on motion magnitude threshold above which no + // denoising is done. + unsigned int scale_motion_thresh; + // Scale factor on motion magnitude below which we increase the strength of + // the temporal filter (in function vp8_denoiser_filter). + unsigned int scale_increase_filter; + // Scale factor to bias to ZEROMV for denoising. + unsigned int denoise_mv_bias; + // Scale factor to bias to ZEROMV for coding mode selection. + unsigned int pickmode_mv_bias; + // Quantizer threshold below which we use the segmentation map to switch off + // loop filter for blocks that have been coded as ZEROMV-LAST a certain number + // (consec_zerolast) of consecutive frames. Note that the delta-QP is set to + // 0 when segmentation map is used for shutting off loop filter. + unsigned int qp_thresh; + // Threshold for number of consecutive frames for blocks coded as ZEROMV-LAST. + unsigned int consec_zerolast; +} denoise_params; + typedef struct vp8_denoiser { YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES]; YV12_BUFFER_CONFIG yv12_mc_running_avg; unsigned char* denoise_state; int num_mb_cols; + int aggressive_mode; + denoise_params denoise_pars; } VP8_DENOISER; int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height, - int num_mb_rows, int num_mb_cols); + int num_mb_rows, int num_mb_cols, int mode); void vp8_denoiser_free(VP8_DENOISER *denoiser); diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index e6b0f9b64..aec6b9880 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -522,6 +522,19 @@ void encode_mb_row(VP8_COMP *cpi, } #endif + // Keep track of how many (consecutive) times a block is coded + // as ZEROMV_LASTREF, for base layer frames. + // Reset to 0 if its coded as anything else. + if (cpi->current_layer == 0) { + if (xd->mode_info_context->mbmi.mode == ZEROMV && + xd->mode_info_context->mbmi.ref_frame == LAST_FRAME) { + // Increment, check for wrap-around. + if (cpi->consec_zero_last[map_index+mb_col] < 255) + cpi->consec_zero_last[map_index+mb_col] += 1; + } else { + cpi->consec_zero_last[map_index+mb_col] = 0; + } + } /* Special case code for cyclic refresh * If cyclic update enabled then copy xd->mbmi.segment_id; (which diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index d4b17cef1..7b8b51f30 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -206,6 +206,21 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data) } #endif + // Keep track of how many (consecutive) times a block + // is coded as ZEROMV_LASTREF, for base layer frames. + // Reset to 0 if its coded as anything else. + if (cpi->current_layer == 0) { + if (xd->mode_info_context->mbmi.mode == ZEROMV && + xd->mode_info_context->mbmi.ref_frame == + LAST_FRAME) { + // Increment, check for wrap-around. + if (cpi->consec_zero_last[map_index+mb_col] < 255) + cpi->consec_zero_last[map_index+mb_col] += + 1; + } else { + cpi->consec_zero_last[map_index+mb_col] = 0; + } + } /* Special case code for cyclic refresh * If cyclic update enabled then copy diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 469d0d6e9..298f50f65 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -613,6 +613,24 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment) while(block_count && i != cpi->cyclic_refresh_mode_index); cpi->cyclic_refresh_mode_index = i; + +#if CONFIG_TEMPORAL_DENOISING + if (cpi->denoiser.aggressive_mode != 0 && + Q < (int)cpi->denoiser.denoise_pars.qp_thresh) { + // Under aggressive denoising mode, use segmentation to turn off loop + // filter below some qp thresh. The loop filter is turned off for all + // blocks that have been encoded as ZEROMV LAST x frames in a row, + // where x is set by cpi->denoiser.denoise_pars.consec_zerolast. + // This is to avoid "dot" artifacts that can occur from repeated + // loop filtering on noisy input source. + cpi->cyclic_refresh_q = Q; + lf_adjustment = -MAX_LOOP_FILTER; + for (i = 0; i < mbs_in_frame; ++i) { + seg_map[i] = (cpi->consec_zero_last[i] > + cpi->denoiser.denoise_pars.consec_zerolast) ? 1 : 0; + } + } +#endif } /* Activate segmentation. */ @@ -1752,7 +1770,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf) int width = (cpi->oxcf.Width + 15) & ~15; int height = (cpi->oxcf.Height + 15) & ~15; vp8_denoiser_allocate(&cpi->denoiser, width, height, - cpi->common.mb_rows, cpi->common.mb_cols); + cm->mb_rows, cm->mb_cols, + ((cpi->oxcf.noise_sensitivity == 3) ? 1 : 0)); } } #endif @@ -1896,6 +1915,9 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf) else cpi->cyclic_refresh_map = (signed char *) NULL; + CHECK_MEM_ERROR(cpi->consec_zero_last, + vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1)); + #ifdef VP8_ENTROPY_STATS init_context_counters(); #endif @@ -2416,6 +2438,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) vpx_free(cpi->mb.ss); vpx_free(cpi->tok); vpx_free(cpi->cyclic_refresh_map); + vpx_free(cpi->consec_zero_last); vp8_remove_common(&cpi->common); vpx_free(cpi); @@ -3478,6 +3501,9 @@ static void encode_frame_to_data_rate { cpi->mb.rd_thresh_mult[i] = 128; } + + // Reset the zero_last counter to 0 on key frame. + vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols); } #if 0 @@ -3899,6 +3925,7 @@ static void encode_frame_to_data_rate #endif + #ifdef OUTPUT_YUV_SRC vp8_write_yuv_frame(yuv_file, cpi->Source); #endif @@ -3994,6 +4021,8 @@ static void encode_frame_to_data_rate else disable_segmentation(cpi); } + // Reset the consec_zero_last counter on key frame. + vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols); vp8_set_quantizer(cpi, Q); } diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index df17dff34..7a8baca77 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -511,6 +511,8 @@ typedef struct VP8_COMP int cyclic_refresh_mode_index; int cyclic_refresh_q; signed char *cyclic_refresh_map; + // Count on how many (consecutive) times a macroblock uses ZER0MV_LAST. + unsigned char *consec_zero_last; // Frame counter for the temporal pattern. Counter is rest when the temporal // layers are changed dynamically (run-time change). diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index 86108b70a..ec1ea146f 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -40,7 +40,6 @@ extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES]; extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]); - int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, int error_per_bit, @@ -694,6 +693,13 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, */ calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment); +#if CONFIG_TEMPORAL_DENOISING + if (cpi->oxcf.noise_sensitivity) { + rd_adjustment = (int)(rd_adjustment * + cpi->denoiser.denoise_pars.pickmode_mv_bias / 100); + } +#endif + /* if we encode a new mv this is important * find the best new motion vector */ @@ -1168,7 +1174,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, #if CONFIG_TEMPORAL_DENOISING if (cpi->oxcf.noise_sensitivity) { - int uv_denoise = (cpi->oxcf.noise_sensitivity == 2) ? 1 : 0; + int uv_denoise = (cpi->oxcf.noise_sensitivity >= 2) ? 1 : 0; int block_index = mb_row * cpi->common.mb_cols + mb_col; if (x->best_sse_inter_mode == DC_PRED) { diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index a10c401fe..83938dd3d 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -1326,7 +1326,6 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = 0, /* ts_periodicity */ {0}, /* ts_layer_id */ }}, - { -1, {NOT_IMPLEMENTED}} }; @@ -1352,6 +1351,7 @@ CODEC_INTERFACE(vpx_codec_vp8_cx) = NOT_IMPLEMENTED, /* vpx_codec_frame_get_fn_t frame_get; */ }, { + 1, /* 1 cfg map */ vp8e_usage_cfg_map, /* vpx_codec_enc_cfg_map_t peek_si; */ vp8e_encode, /* vpx_codec_encode_fn_t encode; */ vp8e_get_cxdata, /* vpx_codec_get_cx_data_fn_t frame_get; */ diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c index fb3c236ce..f768b5c47 100644 --- a/vp8/vp8_dx_iface.c +++ b/vp8/vp8_dx_iface.c @@ -804,6 +804,7 @@ CODEC_INTERFACE(vpx_codec_vp8_dx) = NOT_IMPLEMENTED, }, { /* encoder functions */ + 0, NOT_IMPLEMENTED, NOT_IMPLEMENTED, NOT_IMPLEMENTED, diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index ef5caf327..4602b7b4f 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -420,7 +420,7 @@ add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int sourc specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_variance16x16 mmx avx2/, "$sse2_x86inc"; +specialize qw/vp9_variance16x16 mmx avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc"; @@ -435,7 +435,7 @@ add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, co specialize qw/vp9_get8x8var mmx/, "$sse2_x86inc"; add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum"; -specialize qw/vp9_get16x16var avx2/, "$sse2_x86inc"; +specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc"; add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vp9_variance8x4/, "$sse2_x86inc"; @@ -483,7 +483,7 @@ add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_ specialize qw/vp9_sub_pixel_avg_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; -specialize qw/vp9_sub_pixel_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc"; +specialize qw/vp9_sub_pixel_variance16x16 neon/, "$sse2_x86inc", "$ssse3_x86inc"; add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"; specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc"; @@ -712,10 +712,10 @@ add_proto qw/int64_t vp9_block_error/, "const int16_t *coeff, const int16_t *dqc specialize qw/vp9_block_error avx2/, "$sse2_x86inc"; add_proto qw/void vp9_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"; -specialize qw/vp9_subtract_block/, "$sse2_x86inc"; +specialize qw/vp9_subtract_block neon/, "$sse2_x86inc"; add_proto qw/void vp9_quantize_fp/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; -specialize qw/vp9_quantize_fp/, "$ssse3_x86_64"; +specialize qw/vp9_quantize_fp neon/, "$ssse3_x86_64"; add_proto qw/void vp9_quantize_fp_32x32/, "const int16_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, int zbin_oq_value, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vp9_quantize_fp_32x32/, "$ssse3_x86_64"; @@ -739,10 +739,10 @@ if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") { # fdct functions add_proto qw/void vp9_fht4x4/, "const int16_t *input, int16_t *output, int stride, int tx_type"; -specialize qw/vp9_fht4x4 sse2 avx2/; +specialize qw/vp9_fht4x4 sse2/; add_proto qw/void vp9_fht8x8/, "const int16_t *input, int16_t *output, int stride, int tx_type"; -specialize qw/vp9_fht8x8 sse2 avx2/; +specialize qw/vp9_fht8x8 sse2/; add_proto qw/void vp9_fht16x16/, "const int16_t *input, int16_t *output, int stride, int tx_type"; specialize qw/vp9_fht16x16 sse2/; @@ -754,13 +754,13 @@ add_proto qw/void vp9_fdct4x4_1/, "const int16_t *input, int16_t *output, int st specialize qw/vp9_fdct4x4_1 sse2/; add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stride"; -specialize qw/vp9_fdct4x4 sse2 avx2/; +specialize qw/vp9_fdct4x4 sse2/; add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, int16_t *output, int stride"; -specialize qw/vp9_fdct8x8_1 sse2/; +specialize qw/vp9_fdct8x8_1 sse2 neon/; add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride"; -specialize qw/vp9_fdct8x8 sse2 avx2/, "$ssse3_x86_64"; +specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64"; add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, int16_t *output, int stride"; specialize qw/vp9_fdct16x16_1 sse2/; diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c new file mode 100644 index 000000000..6c66f5d5b --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" +#include "./vpx_config.h" + +#include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_idct.h" + +void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) { + int r; + int16x8_t sum = vld1q_s16(&input[0]); + for (r = 1; r < 8; ++r) { + const int16x8_t input_00 = vld1q_s16(&input[r * stride]); + sum = vaddq_s16(sum, input_00); + } + { + const int32x4_t a = vpaddlq_s16(sum); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0); + output[1] = 0; + } +} + +void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) { + int i; + // stage 1 + int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2); + int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2); + int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2); + int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2); + int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2); + int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2); + int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2); + int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2); + for (i = 0; i < 2; ++i) { + int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7; + const int16x8_t v_s0 = vaddq_s16(input_0, input_7); + const int16x8_t v_s1 = vaddq_s16(input_1, input_6); + const int16x8_t v_s2 = vaddq_s16(input_2, input_5); + const int16x8_t v_s3 = vaddq_s16(input_3, input_4); + const int16x8_t v_s4 = vsubq_s16(input_3, input_4); + const int16x8_t v_s5 = vsubq_s16(input_2, input_5); + const int16x8_t v_s6 = vsubq_s16(input_1, input_6); + const int16x8_t v_s7 = vsubq_s16(input_0, input_7); + // fdct4(step, step); + int16x8_t v_x0 = vaddq_s16(v_s0, v_s3); + int16x8_t v_x1 = vaddq_s16(v_s1, v_s2); + int16x8_t v_x2 = vsubq_s16(v_s1, v_s2); + int16x8_t v_x3 = vsubq_s16(v_s0, v_s3); + // fdct4(step, step); + int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1)); + int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1)); + int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64); + int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64); + int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64); + v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64); + v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64); + v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64); + v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64); + v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64); + v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43 + out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63 + out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47 + out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67 + } + // Stage 2 + v_x0 = vsubq_s16(v_s6, v_s5); + v_x1 = vaddq_s16(v_s6, v_s5); + v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x8_t ab = vcombine_s16(a, b); + const int16x8_t cd = vcombine_s16(c, d); + // Stage 3 + v_x0 = vaddq_s16(v_s4, ab); + v_x1 = vsubq_s16(v_s4, ab); + v_x2 = vsubq_s16(v_s7, cd); + v_x3 = vaddq_s16(v_s7, cd); + } + // Stage 4 + v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64); + v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64); + v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64); + v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64); + v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64); + v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64); + v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64); + v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64); + v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64); + v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64); + v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64); + v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64); + v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64); + v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64); + v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64); + v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64); + { + const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS); + const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS); + const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS); + const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS); + const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS); + const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS); + const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS); + const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS); + out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53 + out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73 + out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57 + out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77 + } + // transpose 8x8 + { + // 00 01 02 03 40 41 42 43 + // 10 11 12 13 50 51 52 53 + // 20 21 22 23 60 61 62 63 + // 30 31 32 33 70 71 72 73 + // 04 05 06 07 44 45 46 47 + // 14 15 16 17 54 55 56 57 + // 24 25 26 27 64 65 66 67 + // 34 35 36 37 74 75 76 77 + const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0), + vreinterpretq_s32_s16(out_2)); + const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1), + vreinterpretq_s32_s16(out_3)); + const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4), + vreinterpretq_s32_s16(out_6)); + const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5), + vreinterpretq_s32_s16(out_7)); + const int16x8x2_t r01_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]), + vreinterpretq_s16_s32(r13_s32.val[0])); + const int16x8x2_t r23_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]), + vreinterpretq_s16_s32(r13_s32.val[1])); + const int16x8x2_t r45_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]), + vreinterpretq_s16_s32(r57_s32.val[0])); + const int16x8x2_t r67_s16 = + vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]), + vreinterpretq_s16_s32(r57_s32.val[1])); + input_0 = r01_s16.val[0]; + input_1 = r01_s16.val[1]; + input_2 = r23_s16.val[0]; + input_3 = r23_s16.val[1]; + input_4 = r45_s16.val[0]; + input_5 = r45_s16.val[1]; + input_6 = r67_s16.val[0]; + input_7 = r67_s16.val[1]; + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + } + } // for + { + // from vp9_dct_sse2.c + // Post-condition (division by two) + // division of two 16 bits signed numbers using shifts + // n / 2 = (n - (n >> 15)) >> 1 + const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15); + const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15); + const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15); + const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15); + const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15); + const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15); + const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15); + const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15); + input_0 = vhsubq_s16(input_0, sign_in0); + input_1 = vhsubq_s16(input_1, sign_in1); + input_2 = vhsubq_s16(input_2, sign_in2); + input_3 = vhsubq_s16(input_3, sign_in3); + input_4 = vhsubq_s16(input_4, sign_in4); + input_5 = vhsubq_s16(input_5, sign_in5); + input_6 = vhsubq_s16(input_6, sign_in6); + input_7 = vhsubq_s16(input_7, sign_in7); + // store results + vst1q_s16(&final_output[0 * 8], input_0); + vst1q_s16(&final_output[1 * 8], input_1); + vst1q_s16(&final_output[2 * 8], input_2); + vst1q_s16(&final_output[3 * 8], input_3); + vst1q_s16(&final_output[4 * 8], input_4); + vst1q_s16(&final_output[5 * 8], input_5); + vst1q_s16(&final_output[6 * 8], input_6); + vst1q_s16(&final_output[7 * 8], input_7); + } +} + diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c new file mode 100644 index 000000000..2d5ec79b3 --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include <math.h> + +#include "vpx_mem/vpx_mem.h" + +#include "vp9/common/vp9_quant_common.h" +#include "vp9/common/vp9_seg_common.h" + +#include "vp9/encoder/vp9_encoder.h" +#include "vp9/encoder/vp9_quantize.h" +#include "vp9/encoder/vp9_rd.h" + +void vp9_quantize_fp_neon(const int16_t *coeff_ptr, intptr_t count, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr, + int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, + int zbin_oq_value, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { + int i; + // TODO(jingning) Decide the need of these arguments after the + // quantization process is completed. + (void)zbin_ptr; + (void)quant_shift_ptr; + (void)zbin_oq_value; + (void)scan; + + if (!skip_block) { + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + + const int16x8_t v_zero = vdupq_n_s16(0); + const int16x8_t v_one = vdupq_n_s16(1); + int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); + int16x8_t v_round = vmovq_n_s16(round_ptr[1]); + int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); + int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); + // adjust for dc + v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); + v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); + v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); + + for (i = 0; i < count; i += 8) { + const int16x8_t v_iscan = vld1q_s16(&iscan[i]); + const int16x8_t v_coeff = vld1q_s16(&coeff_ptr[i]); + const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); + const int16x8_t v_abs_coeff = vabsq_s16(v_coeff); + const int16x8_t v_tmp = vqaddq_s16(v_abs_coeff, v_round); + const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp), + vget_low_s16(v_quant)); + const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp), + vget_high_s16(v_quant)); + const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), + vshrn_n_s32(v_tmp_hi, 16)); + const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); + const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); + const int16x8_t v_nz_iscan = + vandq_s16(vmvnq_s16(vreinterpretq_s16_u16(v_nz_mask)), v_iscan_plus1); + const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); + const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); + const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); + + v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); + + vst1q_s16(&qcoeff_ptr[i], v_qcoeff); + vst1q_s16(&dqcoeff_ptr[i], v_dqcoeff); + v_round = vmovq_n_s16(round_ptr[1]); + v_quant = vmovq_n_s16(quant_ptr[1]); + v_dequant = vmovq_n_s16(dequant_ptr[1]); + } + { + const int16x4_t v_eobmax_3210 = + vmax_s16(vget_low_s16(v_eobmax_76543210), + vget_high_s16(v_eobmax_76543210)); + const int64x1_t v_eobmax_xx32 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 32); + const int16x4_t v_eobmax_tmp = + vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); + const int64x1_t v_eobmax_xxx3 = + vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); + const int16x4_t v_eobmax_final = + vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); + + *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); + } + } else { + vpx_memset(qcoeff_ptr, 0, count * sizeof(int16_t)); + vpx_memset(dqcoeff_ptr, 0, count * sizeof(int16_t)); + *eob_ptr = 0; + } +} diff --git a/vp9/encoder/arm/neon/vp9_subtract_neon.c b/vp9/encoder/arm/neon/vp9_subtract_neon.c new file mode 100644 index 000000000..b4bf567db --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_subtract_neon.c @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" +#include "./vpx_config.h" + +#include "vpx/vpx_integer.h" + +void vp9_subtract_block_neon(int rows, int cols, + int16_t *diff, ptrdiff_t diff_stride, + const uint8_t *src, ptrdiff_t src_stride, + const uint8_t *pred, ptrdiff_t pred_stride) { + int r, c; + + if (cols > 16) { + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; c += 32) { + const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); + const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); + const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); + const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); + const uint16x8_t v_diff_lo_00 = vsubl_u8(vget_low_u8(v_src_00), + vget_low_u8(v_pred_00)); + const uint16x8_t v_diff_hi_00 = vsubl_u8(vget_high_u8(v_src_00), + vget_high_u8(v_pred_00)); + const uint16x8_t v_diff_lo_16 = vsubl_u8(vget_low_u8(v_src_16), + vget_low_u8(v_pred_16)); + const uint16x8_t v_diff_hi_16 = vsubl_u8(vget_high_u8(v_src_16), + vget_high_u8(v_pred_16)); + vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); + vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); + vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); + vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); + } + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else if (cols > 8) { + for (r = 0; r < rows; ++r) { + const uint8x16_t v_src = vld1q_u8(&src[0]); + const uint8x16_t v_pred = vld1q_u8(&pred[0]); + const uint16x8_t v_diff_lo = vsubl_u8(vget_low_u8(v_src), + vget_low_u8(v_pred)); + const uint16x8_t v_diff_hi = vsubl_u8(vget_high_u8(v_src), + vget_high_u8(v_pred)); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); + vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else if (cols > 4) { + for (r = 0; r < rows; ++r) { + const uint8x8_t v_src = vld1_u8(&src[0]); + const uint8x8_t v_pred = vld1_u8(&pred[0]); + const uint16x8_t v_diff = vsubl_u8(v_src, v_pred); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } else { + for (r = 0; r < rows; ++r) { + for (c = 0; c < cols; ++c) + diff[c] = src[c] - pred[c]; + + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } + } +} diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c new file mode 100644 index 000000000..f6871188b --- /dev/null +++ b/vp9/encoder/arm/neon/vp9_variance_neon.c @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> +#include "./vp9_rtcd.h" + +#include "vpx_ports/mem.h" +#include "vpx/vpx_integer.h" + +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_filter.h" + +#include "vp9/encoder/vp9_variance.h" + +enum { kWidth16 = 16 }; +enum { kHeight16 = 16 }; +enum { kHeight16PlusOne = 17 }; +enum { kPixelStepOne = 1 }; + +static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) { + const int32x4_t a = vpaddlq_s16(v_16x8); + const int64x2_t b = vpaddlq_s32(a); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) { + const int64x2_t b = vpaddlq_s32(v_32x4); + const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)), + vreinterpret_s32_s64(vget_high_s64(b))); + return vget_lane_s32(c, 0); +} + +static void variance_neon_w8(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + int w, int h, unsigned int *sse, int *sum) { + int i, j; + int16x8_t v_sum = vdupq_n_s16(0); + int32x4_t v_sse_lo = vdupq_n_s32(0); + int32x4_t v_sse_hi = vdupq_n_s32(0); + + for (i = 0; i < h; ++i) { + for (j = 0; j < w; j += 8) { + const uint8x8_t v_a = vld1_u8(&a[j]); + const uint8x8_t v_b = vld1_u8(&b[j]); + const uint16x8_t v_diff = vsubl_u8(v_a, v_b); + const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); + v_sum = vaddq_s16(v_sum, sv_diff); + v_sse_lo = vmlal_s16(v_sse_lo, + vget_low_s16(sv_diff), + vget_low_s16(sv_diff)); + v_sse_hi = vmlal_s16(v_sse_hi, + vget_high_s16(sv_diff), + vget_high_s16(sv_diff)); + } + a += a_stride; + b += b_stride; + } + + *sum = horizontal_add_s16x8(v_sum); + *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); +} + +void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride, + const uint8_t *ref_ptr, int ref_stride, + unsigned int *sse, int *sum) { + variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth16, + kHeight16, sse, sum); +} + +unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride, + const uint8_t *b, int b_stride, + unsigned int *sse) { + int sum; + variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum); + return *sse - (((int64_t)sum * sum) / (kWidth16 * kHeight16)); +} + +static void var_filter_block2d_bil_w16(const uint8_t *src_ptr, + uint8_t *output_ptr, + unsigned int src_pixels_per_line, + int pixel_step, + unsigned int output_height, + unsigned int output_width, + const int16_t *vp9_filter) { + const uint8x8_t f0 = vmov_n_u8((uint8_t)vp9_filter[0]); + const uint8x8_t f1 = vmov_n_u8((uint8_t)vp9_filter[1]); + unsigned int i; + for (i = 0; i < output_height; ++i) { + const uint8x16_t src_0 = vld1q_u8(&src_ptr[0]); + const uint8x16_t src_1 = vld1q_u8(&src_ptr[pixel_step]); + const uint16x8_t a = vmull_u8(vget_low_u8(src_0), f0); + const uint16x8_t b = vmlal_u8(a, vget_low_u8(src_1), f1); + const uint8x8_t out_lo = vrshrn_n_u16(b, FILTER_BITS); + const uint16x8_t c = vmull_u8(vget_high_u8(src_0), f0); + const uint16x8_t d = vmlal_u8(c, vget_high_u8(src_1), f1); + const uint8x8_t out_hi = vrshrn_n_u16(d, FILTER_BITS); + vst1q_u8(&output_ptr[0], vcombine_u8(out_lo, out_hi)); + // Next row... + src_ptr += src_pixels_per_line; + output_ptr += output_width; + } +} + +unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src, + int src_stride, + int xoffset, + int yoffset, + const uint8_t *dst, + int dst_stride, + unsigned int *sse) { + DECLARE_ALIGNED_ARRAY(kWidth16, uint8_t, temp2, kHeight16 * kWidth16); + DECLARE_ALIGNED_ARRAY(kWidth16, uint8_t, fdata3, kHeight16PlusOne * kWidth16); + + var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne, + kHeight16PlusOne, kWidth16, + BILINEAR_FILTERS_2TAP(xoffset)); + var_filter_block2d_bil_w16(fdata3, temp2, kWidth16, kWidth16, kHeight16, + kWidth16, BILINEAR_FILTERS_2TAP(yoffset)); + return vp9_variance16x16_neon(temp2, kWidth16, dst, dst_stride, sse); +} diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 584bcb8f5..b1ef3dcac 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1836,7 +1836,7 @@ static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, BLOCK_SIZE max_size = BLOCK_8X8; int bsl = mi_width_log2(BLOCK_64X64); const int search_range_ctrl = (((mi_row + mi_col) >> bsl) + - get_chessboard_index(cm)) % 2; + get_chessboard_index(cm->current_video_frame)) & 0x1; // Trap case where we do not have a prediction. if (search_range_ctrl && (left_in_image || above_in_image || cm->frame_type != KEY_FRAME)) { @@ -1880,6 +1880,60 @@ static void auto_partition_range(VP9_COMP *cpi, const TileInfo *const tile, *max_block_size = max_size; } +// TODO(jingning) refactor functions setting partition search range +static void set_partition_range(VP9_COMMON *cm, MACROBLOCKD *xd, + int mi_row, int mi_col, BLOCK_SIZE bsize, + BLOCK_SIZE *min_bs, BLOCK_SIZE *max_bs) { + int mi_width = num_8x8_blocks_wide_lookup[bsize]; + int mi_height = num_8x8_blocks_high_lookup[bsize]; + int idx, idy; + + MODE_INFO *mi; + MODE_INFO **prev_mi = + &cm->prev_mi_grid_visible[mi_row * cm->mi_stride + mi_col]; + BLOCK_SIZE bs, min_size, max_size; + + min_size = BLOCK_64X64; + max_size = BLOCK_4X4; + + if (prev_mi) { + for (idy = 0; idy < mi_height; ++idy) { + for (idx = 0; idx < mi_width; ++idx) { + mi = prev_mi[idy * cm->mi_stride + idx]; + bs = mi ? mi->mbmi.sb_type : bsize; + min_size = MIN(min_size, bs); + max_size = MAX(max_size, bs); + } + } + } + + if (xd->left_available) { + for (idy = 0; idy < mi_height; ++idy) { + mi = xd->mi[idy * cm->mi_stride - 1]; + bs = mi ? mi->mbmi.sb_type : bsize; + min_size = MIN(min_size, bs); + max_size = MAX(max_size, bs); + } + } + + if (xd->up_available) { + for (idx = 0; idx < mi_width; ++idx) { + mi = xd->mi[idx - cm->mi_stride]; + bs = mi ? mi->mbmi.sb_type : bsize; + min_size = MIN(min_size, bs); + max_size = MAX(max_size, bs); + } + } + + if (min_size == max_size) { + min_size = min_partition_size[min_size]; + max_size = max_partition_size[max_size]; + } + + *min_bs = min_size; + *max_bs = max_size; +} + static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { vpx_memcpy(ctx->pred_mv, x->pred_mv, sizeof(x->pred_mv)); } @@ -1888,13 +1942,22 @@ static INLINE void load_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) { vpx_memcpy(x->pred_mv, ctx->pred_mv, sizeof(x->pred_mv)); } +#if CONFIG_FP_MB_STATS +const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] = + {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4}; +const int num_16x16_blocks_high_lookup[BLOCK_SIZES] = + {1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4}; +const int qindex_skip_threshold_lookup[BLOCK_SIZES] = + {0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120}; +#endif + // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previous rate-distortion optimization // results, for encoding speed-up. static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE bsize, int *rate, - int64_t *dist, int do_recon, int64_t best_rd, + int64_t *dist, int64_t best_rd, PC_TREE *pc_tree) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; @@ -1917,6 +1980,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, const int xss = x->e_mbd.plane[1].subsampling_x; const int yss = x->e_mbd.plane[1].subsampling_y; + BLOCK_SIZE min_size = cpi->sf.min_partition_size; + BLOCK_SIZE max_size = cpi->sf.max_partition_size; + int partition_none_allowed = !force_horz_split && !force_vert_split; int partition_horz_allowed = !force_vert_split && yss <= xss && bsize >= BLOCK_8X8; @@ -1931,18 +1997,24 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, set_offsets(cpi, tile, mi_row, mi_col, bsize); x->mb_energy = vp9_block_energy(cpi, x, bsize); } + + if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) { + int cb_partition_search_ctrl = ((pc_tree->index == 0 || pc_tree->index == 3) + + get_chessboard_index(cm->current_video_frame)) & 0x1; + + if (cb_partition_search_ctrl && bsize > min_size && bsize < max_size) + set_partition_range(cm, xd, mi_row, mi_col, bsize, &min_size, &max_size); + } + // Determine partition types in search according to the speed features. // The threshold set here has to be of square block size. if (cpi->sf.auto_min_max_partition_size) { - partition_none_allowed &= (bsize <= cpi->sf.max_partition_size && - bsize >= cpi->sf.min_partition_size); - partition_horz_allowed &= ((bsize <= cpi->sf.max_partition_size && - bsize > cpi->sf.min_partition_size) || + partition_none_allowed &= (bsize <= max_size && bsize >= min_size); + partition_horz_allowed &= ((bsize <= max_size && bsize > min_size) || force_horz_split); - partition_vert_allowed &= ((bsize <= cpi->sf.max_partition_size && - bsize > cpi->sf.min_partition_size) || + partition_vert_allowed &= ((bsize <= max_size && bsize > min_size) || force_vert_split); - do_split &= bsize > cpi->sf.min_partition_size; + do_split &= bsize > min_size; } if (cpi->sf.use_square_partition_only) { partition_horz_allowed &= force_horz_split; @@ -1993,6 +2065,52 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, do_split = 0; do_rect = 0; } + +#if CONFIG_FP_MB_STATS + // Check if every 16x16 first pass block statistics has zero + // motion and the corresponding first pass residue is small enough. + // If that is the case, check the difference variance between the + // current frame and the last frame. If the variance is small enough, + // stop further splitting in RD optimization + if (cpi->use_fp_mb_stats && do_split != 0 && + cm->base_qindex > qindex_skip_threshold_lookup[bsize]) { + VP9_COMMON *cm = &cpi->common; + int mb_row = mi_row >> 1; + int mb_col = mi_col >> 1; + int mb_row_end = + MIN(mb_row + num_16x16_blocks_high_lookup[bsize], cm->mb_rows); + int mb_col_end = + MIN(mb_col + num_16x16_blocks_wide_lookup[bsize], cm->mb_cols); + int r, c; + + int skip = 1; + for (r = mb_row; r < mb_row_end; r++) { + for (c = mb_col; c < mb_col_end; c++) { + const int mb_index = r * cm->mb_cols + c; + if ((cpi->twopass.this_frame_mb_stats[mb_index] & + FPMB_NONZERO_MOTION_MASK) || + !(cpi->twopass.this_frame_mb_stats[mb_index] & + FPMB_ERROR_LEVEL0_MASK)) { + skip = 0; + break; + } + } + if (skip == 0) { + break; + } + } + if (skip) { + unsigned int var; + set_offsets(cpi, tile, mi_row, mi_col, bsize); + var = get_sby_perpixel_diff_variance(cpi, &cpi->mb.plane[0].src, + mi_row, mi_col, bsize); + if (var < 8) { + do_split = 0; + do_rect = 0; + } + } + } +#endif } } restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); @@ -2030,8 +2148,9 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, if (cpi->sf.adaptive_motion_search) load_pred_mv(x, ctx); + pc_tree->split[i]->index = i; rd_pick_partition(cpi, tile, tp, mi_row + y_idx, mi_col + x_idx, - subsize, &this_rate, &this_dist, i != 3, + subsize, &this_rate, &this_dist, best_rd - sum_rd, pc_tree->split[i]); if (this_rate == INT_MAX) { @@ -2168,7 +2287,7 @@ static void rd_pick_partition(VP9_COMP *cpi, const TileInfo *const tile, *rate = best_rate; *dist = best_dist; - if (best_rate < INT_MAX && best_dist < INT64_MAX && do_recon) { + if (best_rate < INT_MAX && best_dist < INT64_MAX && pc_tree->index != 3) { int output_enabled = (bsize == BLOCK_64X64); // Check the projected output rate for this SB against it's target @@ -2225,6 +2344,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, } vp9_zero(cpi->mb.pred_mv); + cpi->pc_root->index = 0; if ((sf->partition_search_type == SEARCH_PARTITION && sf->use_lastframe_partitioning) || @@ -2278,7 +2398,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, &sf->max_partition_size); } rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1, INT64_MAX, + &dummy_rate, &dummy_dist, INT64_MAX, cpi->pc_root); } else { if (sf->constrain_copy_partition && @@ -2300,7 +2420,7 @@ static void encode_rd_sb_row(VP9_COMP *cpi, const TileInfo *const tile, &sf->max_partition_size); } rd_pick_partition(cpi, tile, tp, mi_row, mi_col, BLOCK_64X64, - &dummy_rate, &dummy_dist, 1, INT64_MAX, cpi->pc_root); + &dummy_rate, &dummy_dist, INT64_MAX, cpi->pc_root); } } } diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 4b3f2ad56..2a6c4b362 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -551,8 +551,8 @@ static INLINE void set_ref_ptrs(VP9_COMMON *cm, MACROBLOCKD *xd, : 0]; } -static INLINE int get_chessboard_index(const VP9_COMMON *cm) { - return cm->current_video_frame % 2; +static INLINE int get_chessboard_index(const int frame_index) { + return frame_index & 0x1; } #ifdef __cplusplus diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 3c41f5f73..1b574758b 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -553,6 +553,9 @@ void vp9_first_pass(VP9_COMP *cpi) { const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row); double error_weight = 1.0; const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col); +#if CONFIG_FP_MB_STATS + const int mb_index = mb_row * cm->mb_cols + mb_col; +#endif vp9_clear_system_state(); @@ -599,7 +602,8 @@ void vp9_first_pass(VP9_COMP *cpi) { #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { - // TODO(pengchong): store some related block statistics here + // initialization + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; } #endif @@ -700,6 +704,27 @@ void vp9_first_pass(VP9_COMP *cpi) { // Start by assuming that intra mode is best. best_ref_mv.as_int = 0; +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + // intra predication statistics + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] &= + ~FPMB_NONZERO_MOTION_MASK; + if (this_error > FPMB_ERROR_LEVEL4_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL4_MASK; + } else if (this_error > FPMB_ERROR_LEVEL3_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL3_MASK; + } else if (this_error > FPMB_ERROR_LEVEL2_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL2_MASK; + } else if (this_error > FPMB_ERROR_LEVEL1_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL1_MASK; + } else { + cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL0_MASK; + } + } +#endif + if (motion_error <= this_error) { // Keep a count of cases where the inter and intra were very close // and very low. This helps with scene cut detection for example in @@ -730,13 +755,40 @@ void vp9_first_pass(VP9_COMP *cpi) { #if CONFIG_FP_MB_STATS if (cpi->use_fp_mb_stats) { - // TODO(pengchong): save some related block statistics here + // inter predication statistics + cpi->twopass.frame_mb_stats_buf[mb_index] = 0; + cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK; + cpi->twopass.frame_mb_stats_buf[mb_index] &= + ~FPMB_NONZERO_MOTION_MASK; + if (this_error > FPMB_ERROR_LEVEL4_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_ERROR_LEVEL4_MASK; + } else if (this_error > FPMB_ERROR_LEVEL3_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_ERROR_LEVEL3_MASK; + } else if (this_error > FPMB_ERROR_LEVEL2_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_ERROR_LEVEL2_MASK; + } else if (this_error > FPMB_ERROR_LEVEL1_TH) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_ERROR_LEVEL1_MASK; + } else { + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_ERROR_LEVEL0_MASK; + } } #endif if (mv.as_int) { ++mvcount; +#if CONFIG_FP_MB_STATS + if (cpi->use_fp_mb_stats) { + cpi->twopass.frame_mb_stats_buf[mb_index] |= + FPMB_NONZERO_MOTION_MASK; + } +#endif + // Non-zero vector, was it different from the last non zero vector? if (mv.as_int != lastmv_as_int) ++new_mv_count; diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h index 33a795f26..e898abc16 100644 --- a/vp9/encoder/vp9_firstpass.h +++ b/vp9/encoder/vp9_firstpass.h @@ -19,6 +19,20 @@ extern "C" { #endif #if CONFIG_FP_MB_STATS + +#define FPMB_DCINTRA_MASK 0x01 +#define FPMB_NONZERO_MOTION_MASK 0x02 +#define FPMB_ERROR_LEVEL0_MASK 0x04 +#define FPMB_ERROR_LEVEL1_MASK 0x10 +#define FPMB_ERROR_LEVEL2_MASK 0x20 +#define FPMB_ERROR_LEVEL3_MASK 0x40 +#define FPMB_ERROR_LEVEL4_MASK 0x80 + +#define FPMB_ERROR_LEVEL1_TH 2000 +#define FPMB_ERROR_LEVEL2_TH 8000 +#define FPMB_ERROR_LEVEL3_TH 24000 +#define FPMB_ERROR_LEVEL4_TH 48000 + typedef struct { uint8_t *mb_stats_start; uint8_t *mb_stats_end; diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 30a0e9d0d..7a1600155 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -394,7 +394,8 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, INTERP_FILTER filter_ref = cm->interp_filter; int bsl = mi_width_log2(bsize); const int pred_filter_search = cm->interp_filter == SWITCHABLE ? - (((mi_row + mi_col) >> bsl) + get_chessboard_index(cm)) % 2 : 0; + (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & 0x1 : 0; int const_motion[MAX_REF_FRAMES] = { 0 }; int bh = num_4x4_blocks_high_lookup[bsize] << 2; int bw = num_4x4_blocks_wide_lookup[bsize] << 2; @@ -409,6 +410,10 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, PRED_BUFFER *this_mode_pred = NULL; int i; + // CTX is used by the temporal denoiser which is currently being developed. + // TODO(jbb): when temporal denoiser is finished and in the default build + // remove the following line; + (void) ctx; if (cpi->sf.reuse_inter_pred_sby) { for (i = 0; i < 3; i++) { tmp[i].data = &pred_buf[pixels_in_block * i]; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 6778d69b9..177066b32 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -2106,7 +2106,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int bsl = mi_width_log2_lookup[bsize]; int pred_filter_search = cpi->sf.cb_pred_filter_search ? - (((mi_row + mi_col) >> bsl)) & 0x01 : 0; + (((mi_row + mi_col) >> bsl) + + get_chessboard_index(cm->current_video_frame)) & 0x1 : 0; if (pred_filter_search) { INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index 7315dd454..6ca1bc525 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -110,10 +110,12 @@ static void set_good_speed_feature(VP9_COMP *cpi, VP9_COMMON *cm, if (speed >= 3) { sf->tx_size_search_method = frame_is_intra_only(cm) ? USE_FULL_RD : USE_LARGESTALL; - if (MIN(cm->width, cm->height) >= 720) + if (MIN(cm->width, cm->height) >= 720) { sf->disable_split_mask = DISABLE_ALL_SPLIT; - else + sf->cb_partition_search = frame_is_boosted(cpi) ? 0 : 1; + } else { sf->disable_split_mask = DISABLE_ALL_INTER_SPLIT; + } sf->adaptive_pred_interp_filter = 0; sf->cb_pred_filter_search = 1; @@ -334,6 +336,7 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->adaptive_motion_search = 0; sf->adaptive_pred_interp_filter = 0; sf->cb_pred_filter_search = 0; + sf->cb_partition_search = 0; sf->use_quant_fp = 0; sf->reference_masking = 0; sf->partition_search_type = SEARCH_PARTITION; diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 929acaf3e..de731cee1 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -286,6 +286,8 @@ typedef struct SPEED_FEATURES { // Chessboard pattern prediction filter type search int cb_pred_filter_search; + int cb_partition_search; + // Fast quantization process path int use_quant_fp; diff --git a/vp9/encoder/x86/vp9_dct_avx2.c b/vp9/encoder/x86/vp9_dct_avx2.c index b5269ed03..3a19f5274 100644 --- a/vp9/encoder/x86/vp9_dct_avx2.c +++ b/vp9/encoder/x86/vp9_dct_avx2.c @@ -12,2572 +12,6 @@ #include "vp9/common/vp9_idct.h" // for cospi constants #include "vpx_ports/mem.h" -void vp9_fdct4x4_avx2(const int16_t *input, int16_t *output, int stride) { - // The 2D transform is done with two passes which are actually pretty - // similar. In the first one, we transform the columns and transpose - // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we transpose the columns (that - // is the transposed rows) and transpose the results (so that it goes back - // in normal/row positions). - int pass; - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); - const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); - const __m128i kOne = _mm_set1_epi16(1); - __m128i in0, in1, in2, in3; - // Load inputs. - { - in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); - // x = x << 4 - in0 = _mm_slli_epi16(in0, 4); - in1 = _mm_slli_epi16(in1, 4); - in2 = _mm_slli_epi16(in2, 4); - in3 = _mm_slli_epi16(in3, 4); - // if (i == 0 && input[0]) input[0] += 1; - { - // The mask will only contain whether the first value is zero, all - // other comparison will fail as something shifted by 4 (above << 4) - // can never be equal to one. To increment in the non-zero case, we - // add the mask and one for the first element: - // - if zero, mask = -1, v = v - 1 + 1 = v - // - if non-zero, mask = 0, v = v + 0 + 1 = v + 1 - __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a); - in0 = _mm_add_epi16(in0, mask); - in0 = _mm_add_epi16(in0, k__nonzero_bias_b); - } - } - // Do the two transform/transpose passes - for (pass = 0; pass < 2; ++pass) { - // Transform 1/2: Add/subtract - const __m128i r0 = _mm_add_epi16(in0, in3); - const __m128i r1 = _mm_add_epi16(in1, in2); - const __m128i r2 = _mm_sub_epi16(in1, in2); - const __m128i r3 = _mm_sub_epi16(in0, in3); - // Transform 1/2: Interleave to do the multiply by constants which gets us - // into 32 bits. - const __m128i t0 = _mm_unpacklo_epi16(r0, r1); - const __m128i t2 = _mm_unpacklo_epi16(r2, r3); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - // Combine and transpose - const __m128i res0 = _mm_packs_epi32(w0, w2); - const __m128i res1 = _mm_packs_epi32(w4, w6); - // 00 01 02 03 20 21 22 23 - // 10 11 12 13 30 31 32 33 - const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); - const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - in2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - // 00 10 20 30 01 11 21 31 in0 contains 0 followed by 1 - // 02 12 22 32 03 13 23 33 in2 contains 2 followed by 3 - if (0 == pass) { - // Extract values in the high part for second pass as transform code - // only uses the first four values. - in1 = _mm_unpackhi_epi64(in0, in0); - in3 = _mm_unpackhi_epi64(in2, in2); - } else { - // Post-condition output and store it (v + 1) >> 2, taking advantage - // of the fact 1/3 are stored just after 0/2. - __m128i out01 = _mm_add_epi16(in0, kOne); - __m128i out23 = _mm_add_epi16(in2, kOne); - out01 = _mm_srai_epi16(out01, 2); - out23 = _mm_srai_epi16(out23, 2); - _mm_storeu_si128((__m128i *)(output + 0 * 4), out01); - _mm_storeu_si128((__m128i *)(output + 2 * 4), out23); - } - } -} - -static INLINE void load_buffer_4x4_avx2(const int16_t *input, __m128i *in, - int stride) { - const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1); - const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0); - __m128i mask; - - in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride)); - in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride)); - in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride)); - in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride)); - - in[0] = _mm_slli_epi16(in[0], 4); - in[1] = _mm_slli_epi16(in[1], 4); - in[2] = _mm_slli_epi16(in[2], 4); - in[3] = _mm_slli_epi16(in[3], 4); - - mask = _mm_cmpeq_epi16(in[0], k__nonzero_bias_a); - in[0] = _mm_add_epi16(in[0], mask); - in[0] = _mm_add_epi16(in[0], k__nonzero_bias_b); -} - -static INLINE void write_buffer_4x4_avx2(int16_t *output, __m128i *res) { - const __m128i kOne = _mm_set1_epi16(1); - __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]); - __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]); - __m128i out01 = _mm_add_epi16(in01, kOne); - __m128i out23 = _mm_add_epi16(in23, kOne); - out01 = _mm_srai_epi16(out01, 2); - out23 = _mm_srai_epi16(out23, 2); - _mm_store_si128((__m128i *)(output + 0 * 8), out01); - _mm_store_si128((__m128i *)(output + 1 * 8), out23); -} - -static INLINE void transpose_4x4_avx2(__m128i *res) { - // Combine and transpose - // 00 01 02 03 20 21 22 23 - // 10 11 12 13 30 31 32 33 - const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]); - const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]); - - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1); - res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1); - - // 00 10 20 30 01 11 21 31 - // 02 12 22 32 03 13 23 33 - // only use the first 4 16-bit integers - res[1] = _mm_unpackhi_epi64(res[0], res[0]); - res[3] = _mm_unpackhi_epi64(res[2], res[2]); -} - -void fdct4_avx2(__m128i *in) { - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u[4], v[4]; - u[0]=_mm_unpacklo_epi16(in[0], in[1]); - u[1]=_mm_unpacklo_epi16(in[3], in[2]); - - v[0] = _mm_add_epi16(u[0], u[1]); - v[1] = _mm_sub_epi16(u[0], u[1]); - - u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); // 0 - u[1] = _mm_madd_epi16(v[0], k__cospi_p16_m16); // 2 - u[2] = _mm_madd_epi16(v[1], k__cospi_p08_p24); // 1 - u[3] = _mm_madd_epi16(v[1], k__cospi_p24_m08); // 3 - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[1] = _mm_packs_epi32(u[2], u[3]); - transpose_4x4_avx2(in); -} - -void fadst4_avx2(__m128i *in) { - const __m128i k__sinpi_p01_p02 = pair_set_epi16(sinpi_1_9, sinpi_2_9); - const __m128i k__sinpi_p04_m01 = pair_set_epi16(sinpi_4_9, -sinpi_1_9); - const __m128i k__sinpi_p03_p04 = pair_set_epi16(sinpi_3_9, sinpi_4_9); - const __m128i k__sinpi_m03_p02 = pair_set_epi16(-sinpi_3_9, sinpi_2_9); - const __m128i k__sinpi_p03_p03 = _mm_set1_epi16(sinpi_3_9); - const __m128i kZero = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u[8], v[8]; - __m128i in7 = _mm_add_epi16(in[0], in[1]); - - u[0] = _mm_unpacklo_epi16(in[0], in[1]); - u[1] = _mm_unpacklo_epi16(in[2], in[3]); - u[2] = _mm_unpacklo_epi16(in7, kZero); - u[3] = _mm_unpacklo_epi16(in[2], kZero); - u[4] = _mm_unpacklo_epi16(in[3], kZero); - - v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02); // s0 + s2 - v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04); // s4 + s5 - v[2] = _mm_madd_epi16(u[2], k__sinpi_p03_p03); // x1 - v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01); // s1 - s3 - v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02); // -s4 + s6 - v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03); // s4 - v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03); - - u[0] = _mm_add_epi32(v[0], v[1]); - u[1] = _mm_sub_epi32(v[2], v[6]); - u[2] = _mm_add_epi32(v[3], v[4]); - u[3] = _mm_sub_epi32(u[2], u[0]); - u[4] = _mm_slli_epi32(v[5], 2); - u[5] = _mm_sub_epi32(u[4], v[5]); - u[6] = _mm_add_epi32(u[3], u[5]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[2]); - in[1] = _mm_packs_epi32(u[1], u[3]); - transpose_4x4_avx2(in); -} - -void vp9_fht4x4_avx2(const int16_t *input, int16_t *output, - int stride, int tx_type) { - __m128i in[4]; - - switch (tx_type) { - case DCT_DCT: - vp9_fdct4x4_avx2(input, output, stride); - break; - case ADST_DCT: - load_buffer_4x4_avx2(input, in, stride); - fadst4_avx2(in); - fdct4_avx2(in); - write_buffer_4x4_avx2(output, in); - break; - case DCT_ADST: - load_buffer_4x4_avx2(input, in, stride); - fdct4_avx2(in); - fadst4_avx2(in); - write_buffer_4x4_avx2(output, in); - break; - case ADST_ADST: - load_buffer_4x4_avx2(input, in, stride); - fadst4_avx2(in); - fadst4_avx2(in); - write_buffer_4x4_avx2(output, in); - break; - default: - assert(0); - break; - } -} - -void vp9_fdct8x8_avx2(const int16_t *input, int16_t *output, int stride) { - int pass; - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - // Load input - __m128i in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); - __m128i in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); - __m128i in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); - __m128i in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); - __m128i in4 = _mm_load_si128((const __m128i *)(input + 4 * stride)); - __m128i in5 = _mm_load_si128((const __m128i *)(input + 5 * stride)); - __m128i in6 = _mm_load_si128((const __m128i *)(input + 6 * stride)); - __m128i in7 = _mm_load_si128((const __m128i *)(input + 7 * stride)); - // Pre-condition input (shift by two) - in0 = _mm_slli_epi16(in0, 2); - in1 = _mm_slli_epi16(in1, 2); - in2 = _mm_slli_epi16(in2, 2); - in3 = _mm_slli_epi16(in3, 2); - in4 = _mm_slli_epi16(in4, 2); - in5 = _mm_slli_epi16(in5, 2); - in6 = _mm_slli_epi16(in6, 2); - in7 = _mm_slli_epi16(in7, 2); - - // We do two passes, first the columns, then the rows. The results of the - // first pass are transposed so that the same column code can be reused. The - // results of the second pass are also transposed so that the rows (processed - // as columns) are put back in row positions. - for (pass = 0; pass < 2; pass++) { - // To store results of each pass before the transpose. - __m128i res0, res1, res2, res3, res4, res5, res6, res7; - // Add/subtract - const __m128i q0 = _mm_add_epi16(in0, in7); - const __m128i q1 = _mm_add_epi16(in1, in6); - const __m128i q2 = _mm_add_epi16(in2, in5); - const __m128i q3 = _mm_add_epi16(in3, in4); - const __m128i q4 = _mm_sub_epi16(in3, in4); - const __m128i q5 = _mm_sub_epi16(in2, in5); - const __m128i q6 = _mm_sub_epi16(in1, in6); - const __m128i q7 = _mm_sub_epi16(in0, in7); - // Work on first four results - { - // Add/subtract - const __m128i r0 = _mm_add_epi16(q0, q3); - const __m128i r1 = _mm_add_epi16(q1, q2); - const __m128i r2 = _mm_sub_epi16(q1, q2); - const __m128i r3 = _mm_sub_epi16(q0, q3); - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i t0 = _mm_unpacklo_epi16(r0, r1); - const __m128i t1 = _mm_unpackhi_epi16(r0, r1); - const __m128i t2 = _mm_unpacklo_epi16(r2, r3); - const __m128i t3 = _mm_unpackhi_epi16(r2, r3); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res0 = _mm_packs_epi32(w0, w1); - res4 = _mm_packs_epi32(w2, w3); - res2 = _mm_packs_epi32(w4, w5); - res6 = _mm_packs_epi32(w6, w7); - } - // Work on next four results - { - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i d0 = _mm_unpacklo_epi16(q6, q5); - const __m128i d1 = _mm_unpackhi_epi16(q6, q5); - const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); - const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); - const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); - const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); - const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); - const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); - const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); - const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); - const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); - const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); - const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); - // Combine - const __m128i r0 = _mm_packs_epi32(s0, s1); - const __m128i r1 = _mm_packs_epi32(s2, s3); - // Add/subtract - const __m128i x0 = _mm_add_epi16(q4, r0); - const __m128i x1 = _mm_sub_epi16(q4, r0); - const __m128i x2 = _mm_sub_epi16(q7, r1); - const __m128i x3 = _mm_add_epi16(q7, r1); - // Interleave to do the multiply by constants which gets us into 32bits - const __m128i t0 = _mm_unpacklo_epi16(x0, x3); - const __m128i t1 = _mm_unpackhi_epi16(x0, x3); - const __m128i t2 = _mm_unpacklo_epi16(x1, x2); - const __m128i t3 = _mm_unpackhi_epi16(x1, x2); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res1 = _mm_packs_epi32(w0, w1); - res7 = _mm_packs_epi32(w2, w3); - res5 = _mm_packs_epi32(w4, w5); - res3 = _mm_packs_epi32(w6, w7); - } - // Transpose the 8x8. - { - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1); - const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3); - const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1); - const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3); - const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5); - const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7); - const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5); - const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 54 54 55 55 56 56 57 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 21 36 - // 44 54 64 74 45 55 61 76 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - in0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - in1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - in2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - in3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - in4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - in5 = _mm_unpackhi_epi64(tr1_1, tr1_5); - in6 = _mm_unpacklo_epi64(tr1_3, tr1_7); - in7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - } - } - // Post-condition output and store it - { - // Post-condition (division by two) - // division of two 16 bits signed numbers using shifts - // n / 2 = (n - (n >> 15)) >> 1 - const __m128i sign_in0 = _mm_srai_epi16(in0, 15); - const __m128i sign_in1 = _mm_srai_epi16(in1, 15); - const __m128i sign_in2 = _mm_srai_epi16(in2, 15); - const __m128i sign_in3 = _mm_srai_epi16(in3, 15); - const __m128i sign_in4 = _mm_srai_epi16(in4, 15); - const __m128i sign_in5 = _mm_srai_epi16(in5, 15); - const __m128i sign_in6 = _mm_srai_epi16(in6, 15); - const __m128i sign_in7 = _mm_srai_epi16(in7, 15); - in0 = _mm_sub_epi16(in0, sign_in0); - in1 = _mm_sub_epi16(in1, sign_in1); - in2 = _mm_sub_epi16(in2, sign_in2); - in3 = _mm_sub_epi16(in3, sign_in3); - in4 = _mm_sub_epi16(in4, sign_in4); - in5 = _mm_sub_epi16(in5, sign_in5); - in6 = _mm_sub_epi16(in6, sign_in6); - in7 = _mm_sub_epi16(in7, sign_in7); - in0 = _mm_srai_epi16(in0, 1); - in1 = _mm_srai_epi16(in1, 1); - in2 = _mm_srai_epi16(in2, 1); - in3 = _mm_srai_epi16(in3, 1); - in4 = _mm_srai_epi16(in4, 1); - in5 = _mm_srai_epi16(in5, 1); - in6 = _mm_srai_epi16(in6, 1); - in7 = _mm_srai_epi16(in7, 1); - // store results - _mm_store_si128((__m128i *)(output + 0 * 8), in0); - _mm_store_si128((__m128i *)(output + 1 * 8), in1); - _mm_store_si128((__m128i *)(output + 2 * 8), in2); - _mm_store_si128((__m128i *)(output + 3 * 8), in3); - _mm_store_si128((__m128i *)(output + 4 * 8), in4); - _mm_store_si128((__m128i *)(output + 5 * 8), in5); - _mm_store_si128((__m128i *)(output + 6 * 8), in6); - _mm_store_si128((__m128i *)(output + 7 * 8), in7); - } -} - -// load 8x8 array -static INLINE void load_buffer_8x8_avx2(const int16_t *input, __m128i *in, - int stride) { - in[0] = _mm_load_si128((const __m128i *)(input + 0 * stride)); - in[1] = _mm_load_si128((const __m128i *)(input + 1 * stride)); - in[2] = _mm_load_si128((const __m128i *)(input + 2 * stride)); - in[3] = _mm_load_si128((const __m128i *)(input + 3 * stride)); - in[4] = _mm_load_si128((const __m128i *)(input + 4 * stride)); - in[5] = _mm_load_si128((const __m128i *)(input + 5 * stride)); - in[6] = _mm_load_si128((const __m128i *)(input + 6 * stride)); - in[7] = _mm_load_si128((const __m128i *)(input + 7 * stride)); - - in[0] = _mm_slli_epi16(in[0], 2); - in[1] = _mm_slli_epi16(in[1], 2); - in[2] = _mm_slli_epi16(in[2], 2); - in[3] = _mm_slli_epi16(in[3], 2); - in[4] = _mm_slli_epi16(in[4], 2); - in[5] = _mm_slli_epi16(in[5], 2); - in[6] = _mm_slli_epi16(in[6], 2); - in[7] = _mm_slli_epi16(in[7], 2); -} - -// right shift and rounding -static INLINE void right_shift_8x8_avx2(__m128i *res, int const bit) { - const __m128i kOne = _mm_set1_epi16(1); - const int bit_m02 = bit - 2; - __m128i sign0 = _mm_srai_epi16(res[0], 15); - __m128i sign1 = _mm_srai_epi16(res[1], 15); - __m128i sign2 = _mm_srai_epi16(res[2], 15); - __m128i sign3 = _mm_srai_epi16(res[3], 15); - __m128i sign4 = _mm_srai_epi16(res[4], 15); - __m128i sign5 = _mm_srai_epi16(res[5], 15); - __m128i sign6 = _mm_srai_epi16(res[6], 15); - __m128i sign7 = _mm_srai_epi16(res[7], 15); - - if (bit_m02 >= 0) { - __m128i k_const_rounding = _mm_slli_epi16(kOne, bit_m02); - res[0] = _mm_add_epi16(res[0], k_const_rounding); - res[1] = _mm_add_epi16(res[1], k_const_rounding); - res[2] = _mm_add_epi16(res[2], k_const_rounding); - res[3] = _mm_add_epi16(res[3], k_const_rounding); - res[4] = _mm_add_epi16(res[4], k_const_rounding); - res[5] = _mm_add_epi16(res[5], k_const_rounding); - res[6] = _mm_add_epi16(res[6], k_const_rounding); - res[7] = _mm_add_epi16(res[7], k_const_rounding); - } - - res[0] = _mm_sub_epi16(res[0], sign0); - res[1] = _mm_sub_epi16(res[1], sign1); - res[2] = _mm_sub_epi16(res[2], sign2); - res[3] = _mm_sub_epi16(res[3], sign3); - res[4] = _mm_sub_epi16(res[4], sign4); - res[5] = _mm_sub_epi16(res[5], sign5); - res[6] = _mm_sub_epi16(res[6], sign6); - res[7] = _mm_sub_epi16(res[7], sign7); - - res[0] = _mm_srai_epi16(res[0], bit); - res[1] = _mm_srai_epi16(res[1], bit); - res[2] = _mm_srai_epi16(res[2], bit); - res[3] = _mm_srai_epi16(res[3], bit); - res[4] = _mm_srai_epi16(res[4], bit); - res[5] = _mm_srai_epi16(res[5], bit); - res[6] = _mm_srai_epi16(res[6], bit); - res[7] = _mm_srai_epi16(res[7], bit); -} - -// write 8x8 array -static INLINE void write_buffer_8x8_avx2(int16_t *output, __m128i *res, int stride) { - _mm_store_si128((__m128i *)(output + 0 * stride), res[0]); - _mm_store_si128((__m128i *)(output + 1 * stride), res[1]); - _mm_store_si128((__m128i *)(output + 2 * stride), res[2]); - _mm_store_si128((__m128i *)(output + 3 * stride), res[3]); - _mm_store_si128((__m128i *)(output + 4 * stride), res[4]); - _mm_store_si128((__m128i *)(output + 5 * stride), res[5]); - _mm_store_si128((__m128i *)(output + 6 * stride), res[6]); - _mm_store_si128((__m128i *)(output + 7 * stride), res[7]); -} - -// perform in-place transpose -static INLINE void array_transpose_8x8_avx2(__m128i *in, __m128i *res) { - const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]); - const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]); - const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]); - const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]); - const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]); - const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]); - const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]); - const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 44 54 45 55 46 56 47 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 25 35 - // 44 54 64 74 45 55 65 75 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - res[0] = _mm_unpacklo_epi64(tr1_0, tr1_1); - res[1] = _mm_unpackhi_epi64(tr1_0, tr1_1); - res[2] = _mm_unpacklo_epi64(tr1_2, tr1_3); - res[3] = _mm_unpackhi_epi64(tr1_2, tr1_3); - res[4] = _mm_unpacklo_epi64(tr1_4, tr1_5); - res[5] = _mm_unpackhi_epi64(tr1_4, tr1_5); - res[6] = _mm_unpacklo_epi64(tr1_6, tr1_7); - res[7] = _mm_unpackhi_epi64(tr1_6, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 -} - -void fdct8_avx2(__m128i *in) { - // constants - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - __m128i u0, u1, u2, u3, u4, u5, u6, u7; - __m128i v0, v1, v2, v3, v4, v5, v6, v7; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - - // stage 1 - s0 = _mm_add_epi16(in[0], in[7]); - s1 = _mm_add_epi16(in[1], in[6]); - s2 = _mm_add_epi16(in[2], in[5]); - s3 = _mm_add_epi16(in[3], in[4]); - s4 = _mm_sub_epi16(in[3], in[4]); - s5 = _mm_sub_epi16(in[2], in[5]); - s6 = _mm_sub_epi16(in[1], in[6]); - s7 = _mm_sub_epi16(in[0], in[7]); - - u0 = _mm_add_epi16(s0, s3); - u1 = _mm_add_epi16(s1, s2); - u2 = _mm_sub_epi16(s1, s2); - u3 = _mm_sub_epi16(s0, s3); - // interleave and perform butterfly multiplication/addition - v0 = _mm_unpacklo_epi16(u0, u1); - v1 = _mm_unpackhi_epi16(u0, u1); - v2 = _mm_unpacklo_epi16(u2, u3); - v3 = _mm_unpackhi_epi16(u2, u3); - - u0 = _mm_madd_epi16(v0, k__cospi_p16_p16); - u1 = _mm_madd_epi16(v1, k__cospi_p16_p16); - u2 = _mm_madd_epi16(v0, k__cospi_p16_m16); - u3 = _mm_madd_epi16(v1, k__cospi_p16_m16); - u4 = _mm_madd_epi16(v2, k__cospi_p24_p08); - u5 = _mm_madd_epi16(v3, k__cospi_p24_p08); - u6 = _mm_madd_epi16(v2, k__cospi_m08_p24); - u7 = _mm_madd_epi16(v3, k__cospi_m08_p24); - - // shift and rounding - v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u0, u1); - in[2] = _mm_packs_epi32(u4, u5); - in[4] = _mm_packs_epi32(u2, u3); - in[6] = _mm_packs_epi32(u6, u7); - - // stage 2 - // interleave and perform butterfly multiplication/addition - u0 = _mm_unpacklo_epi16(s6, s5); - u1 = _mm_unpackhi_epi16(s6, s5); - v0 = _mm_madd_epi16(u0, k__cospi_p16_m16); - v1 = _mm_madd_epi16(u1, k__cospi_p16_m16); - v2 = _mm_madd_epi16(u0, k__cospi_p16_p16); - v3 = _mm_madd_epi16(u1, k__cospi_p16_p16); - - // shift and rounding - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - - u0 = _mm_packs_epi32(v0, v1); - u1 = _mm_packs_epi32(v2, v3); - - // stage 3 - s0 = _mm_add_epi16(s4, u0); - s1 = _mm_sub_epi16(s4, u0); - s2 = _mm_sub_epi16(s7, u1); - s3 = _mm_add_epi16(s7, u1); - - // stage 4 - u0 = _mm_unpacklo_epi16(s0, s3); - u1 = _mm_unpackhi_epi16(s0, s3); - u2 = _mm_unpacklo_epi16(s1, s2); - u3 = _mm_unpackhi_epi16(s1, s2); - - v0 = _mm_madd_epi16(u0, k__cospi_p28_p04); - v1 = _mm_madd_epi16(u1, k__cospi_p28_p04); - v2 = _mm_madd_epi16(u2, k__cospi_p12_p20); - v3 = _mm_madd_epi16(u3, k__cospi_p12_p20); - v4 = _mm_madd_epi16(u2, k__cospi_m20_p12); - v5 = _mm_madd_epi16(u3, k__cospi_m20_p12); - v6 = _mm_madd_epi16(u0, k__cospi_m04_p28); - v7 = _mm_madd_epi16(u1, k__cospi_m04_p28); - - // shift and rounding - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - in[1] = _mm_packs_epi32(v0, v1); - in[3] = _mm_packs_epi32(v4, v5); - in[5] = _mm_packs_epi32(v2, v3); - in[7] = _mm_packs_epi32(v6, v7); - - // transpose - array_transpose_8x8_avx2(in, in); -} - -void fadst8_avx2(__m128i *in) { - // Constants - const __m128i k__cospi_p02_p30 = pair_set_epi16(cospi_2_64, cospi_30_64); - const __m128i k__cospi_p30_m02 = pair_set_epi16(cospi_30_64, -cospi_2_64); - const __m128i k__cospi_p10_p22 = pair_set_epi16(cospi_10_64, cospi_22_64); - const __m128i k__cospi_p22_m10 = pair_set_epi16(cospi_22_64, -cospi_10_64); - const __m128i k__cospi_p18_p14 = pair_set_epi16(cospi_18_64, cospi_14_64); - const __m128i k__cospi_p14_m18 = pair_set_epi16(cospi_14_64, -cospi_18_64); - const __m128i k__cospi_p26_p06 = pair_set_epi16(cospi_26_64, cospi_6_64); - const __m128i k__cospi_p06_m26 = pair_set_epi16(cospi_6_64, -cospi_26_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__const_0 = _mm_set1_epi16(0); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - __m128i u0, u1, u2, u3, u4, u5, u6, u7, u8, u9, u10, u11, u12, u13, u14, u15; - __m128i v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15; - __m128i w0, w1, w2, w3, w4, w5, w6, w7, w8, w9, w10, w11, w12, w13, w14, w15; - __m128i s0, s1, s2, s3, s4, s5, s6, s7; - __m128i in0, in1, in2, in3, in4, in5, in6, in7; - - // properly aligned for butterfly input - in0 = in[7]; - in1 = in[0]; - in2 = in[5]; - in3 = in[2]; - in4 = in[3]; - in5 = in[4]; - in6 = in[1]; - in7 = in[6]; - - // column transformation - // stage 1 - // interleave and multiply/add into 32-bit integer - s0 = _mm_unpacklo_epi16(in0, in1); - s1 = _mm_unpackhi_epi16(in0, in1); - s2 = _mm_unpacklo_epi16(in2, in3); - s3 = _mm_unpackhi_epi16(in2, in3); - s4 = _mm_unpacklo_epi16(in4, in5); - s5 = _mm_unpackhi_epi16(in4, in5); - s6 = _mm_unpacklo_epi16(in6, in7); - s7 = _mm_unpackhi_epi16(in6, in7); - - u0 = _mm_madd_epi16(s0, k__cospi_p02_p30); - u1 = _mm_madd_epi16(s1, k__cospi_p02_p30); - u2 = _mm_madd_epi16(s0, k__cospi_p30_m02); - u3 = _mm_madd_epi16(s1, k__cospi_p30_m02); - u4 = _mm_madd_epi16(s2, k__cospi_p10_p22); - u5 = _mm_madd_epi16(s3, k__cospi_p10_p22); - u6 = _mm_madd_epi16(s2, k__cospi_p22_m10); - u7 = _mm_madd_epi16(s3, k__cospi_p22_m10); - u8 = _mm_madd_epi16(s4, k__cospi_p18_p14); - u9 = _mm_madd_epi16(s5, k__cospi_p18_p14); - u10 = _mm_madd_epi16(s4, k__cospi_p14_m18); - u11 = _mm_madd_epi16(s5, k__cospi_p14_m18); - u12 = _mm_madd_epi16(s6, k__cospi_p26_p06); - u13 = _mm_madd_epi16(s7, k__cospi_p26_p06); - u14 = _mm_madd_epi16(s6, k__cospi_p06_m26); - u15 = _mm_madd_epi16(s7, k__cospi_p06_m26); - - // addition - w0 = _mm_add_epi32(u0, u8); - w1 = _mm_add_epi32(u1, u9); - w2 = _mm_add_epi32(u2, u10); - w3 = _mm_add_epi32(u3, u11); - w4 = _mm_add_epi32(u4, u12); - w5 = _mm_add_epi32(u5, u13); - w6 = _mm_add_epi32(u6, u14); - w7 = _mm_add_epi32(u7, u15); - w8 = _mm_sub_epi32(u0, u8); - w9 = _mm_sub_epi32(u1, u9); - w10 = _mm_sub_epi32(u2, u10); - w11 = _mm_sub_epi32(u3, u11); - w12 = _mm_sub_epi32(u4, u12); - w13 = _mm_sub_epi32(u5, u13); - w14 = _mm_sub_epi32(u6, u14); - w15 = _mm_sub_epi32(u7, u15); - - // shift and rounding - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING); - v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING); - v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING); - v11 = _mm_add_epi32(w11, k__DCT_CONST_ROUNDING); - v12 = _mm_add_epi32(w12, k__DCT_CONST_ROUNDING); - v13 = _mm_add_epi32(w13, k__DCT_CONST_ROUNDING); - v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING); - v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - u8 = _mm_srai_epi32(v8, DCT_CONST_BITS); - u9 = _mm_srai_epi32(v9, DCT_CONST_BITS); - u10 = _mm_srai_epi32(v10, DCT_CONST_BITS); - u11 = _mm_srai_epi32(v11, DCT_CONST_BITS); - u12 = _mm_srai_epi32(v12, DCT_CONST_BITS); - u13 = _mm_srai_epi32(v13, DCT_CONST_BITS); - u14 = _mm_srai_epi32(v14, DCT_CONST_BITS); - u15 = _mm_srai_epi32(v15, DCT_CONST_BITS); - - // back to 16-bit and pack 8 integers into __m128i - in[0] = _mm_packs_epi32(u0, u1); - in[1] = _mm_packs_epi32(u2, u3); - in[2] = _mm_packs_epi32(u4, u5); - in[3] = _mm_packs_epi32(u6, u7); - in[4] = _mm_packs_epi32(u8, u9); - in[5] = _mm_packs_epi32(u10, u11); - in[6] = _mm_packs_epi32(u12, u13); - in[7] = _mm_packs_epi32(u14, u15); - - // stage 2 - s0 = _mm_add_epi16(in[0], in[2]); - s1 = _mm_add_epi16(in[1], in[3]); - s2 = _mm_sub_epi16(in[0], in[2]); - s3 = _mm_sub_epi16(in[1], in[3]); - u0 = _mm_unpacklo_epi16(in[4], in[5]); - u1 = _mm_unpackhi_epi16(in[4], in[5]); - u2 = _mm_unpacklo_epi16(in[6], in[7]); - u3 = _mm_unpackhi_epi16(in[6], in[7]); - - v0 = _mm_madd_epi16(u0, k__cospi_p08_p24); - v1 = _mm_madd_epi16(u1, k__cospi_p08_p24); - v2 = _mm_madd_epi16(u0, k__cospi_p24_m08); - v3 = _mm_madd_epi16(u1, k__cospi_p24_m08); - v4 = _mm_madd_epi16(u2, k__cospi_m24_p08); - v5 = _mm_madd_epi16(u3, k__cospi_m24_p08); - v6 = _mm_madd_epi16(u2, k__cospi_p08_p24); - v7 = _mm_madd_epi16(u3, k__cospi_p08_p24); - - w0 = _mm_add_epi32(v0, v4); - w1 = _mm_add_epi32(v1, v5); - w2 = _mm_add_epi32(v2, v6); - w3 = _mm_add_epi32(v3, v7); - w4 = _mm_sub_epi32(v0, v4); - w5 = _mm_sub_epi32(v1, v5); - w6 = _mm_sub_epi32(v2, v6); - w7 = _mm_sub_epi32(v3, v7); - - v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING); - v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING); - v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING); - v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING); - v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING); - v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING); - v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING); - v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING); - - u0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - u1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - u2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - u3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - u4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - u5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - u6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - u7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - - // back to 16-bit intergers - s4 = _mm_packs_epi32(u0, u1); - s5 = _mm_packs_epi32(u2, u3); - s6 = _mm_packs_epi32(u4, u5); - s7 = _mm_packs_epi32(u6, u7); - - // stage 3 - u0 = _mm_unpacklo_epi16(s2, s3); - u1 = _mm_unpackhi_epi16(s2, s3); - u2 = _mm_unpacklo_epi16(s6, s7); - u3 = _mm_unpackhi_epi16(s6, s7); - - v0 = _mm_madd_epi16(u0, k__cospi_p16_p16); - v1 = _mm_madd_epi16(u1, k__cospi_p16_p16); - v2 = _mm_madd_epi16(u0, k__cospi_p16_m16); - v3 = _mm_madd_epi16(u1, k__cospi_p16_m16); - v4 = _mm_madd_epi16(u2, k__cospi_p16_p16); - v5 = _mm_madd_epi16(u3, k__cospi_p16_p16); - v6 = _mm_madd_epi16(u2, k__cospi_p16_m16); - v7 = _mm_madd_epi16(u3, k__cospi_p16_m16); - - u0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING); - u1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING); - u2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING); - u3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING); - u4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING); - u5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING); - u6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING); - u7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING); - - v0 = _mm_srai_epi32(u0, DCT_CONST_BITS); - v1 = _mm_srai_epi32(u1, DCT_CONST_BITS); - v2 = _mm_srai_epi32(u2, DCT_CONST_BITS); - v3 = _mm_srai_epi32(u3, DCT_CONST_BITS); - v4 = _mm_srai_epi32(u4, DCT_CONST_BITS); - v5 = _mm_srai_epi32(u5, DCT_CONST_BITS); - v6 = _mm_srai_epi32(u6, DCT_CONST_BITS); - v7 = _mm_srai_epi32(u7, DCT_CONST_BITS); - - s2 = _mm_packs_epi32(v0, v1); - s3 = _mm_packs_epi32(v2, v3); - s6 = _mm_packs_epi32(v4, v5); - s7 = _mm_packs_epi32(v6, v7); - - // FIXME(jingning): do subtract using bit inversion? - in[0] = s0; - in[1] = _mm_sub_epi16(k__const_0, s4); - in[2] = s6; - in[3] = _mm_sub_epi16(k__const_0, s2); - in[4] = s3; - in[5] = _mm_sub_epi16(k__const_0, s7); - in[6] = s5; - in[7] = _mm_sub_epi16(k__const_0, s1); - - // transpose - array_transpose_8x8_avx2(in, in); -} - -void vp9_fht8x8_avx2(const int16_t *input, int16_t *output, - int stride, int tx_type) { - __m128i in[8]; - - switch (tx_type) { - case DCT_DCT: - vp9_fdct8x8_avx2(input, output, stride); - break; - case ADST_DCT: - load_buffer_8x8_avx2(input, in, stride); - fadst8_avx2(in); - fdct8_avx2(in); - right_shift_8x8_avx2(in, 1); - write_buffer_8x8_avx2(output, in, 8); - break; - case DCT_ADST: - load_buffer_8x8_avx2(input, in, stride); - fdct8_avx2(in); - fadst8_avx2(in); - right_shift_8x8_avx2(in, 1); - write_buffer_8x8_avx2(output, in, 8); - break; - case ADST_ADST: - load_buffer_8x8_avx2(input, in, stride); - fadst8_avx2(in); - fadst8_avx2(in); - right_shift_8x8_avx2(in, 1); - write_buffer_8x8_avx2(output, in, 8); - break; - default: - assert(0); - break; - } -} - -void vp9_fdct16x16_avx2(const int16_t *input, int16_t *output, int stride) { - // The 2D transform is done with two passes which are actually pretty - // similar. In the first one, we transform the columns and transpose - // the results. In the second one, we transform the rows. To achieve that, - // as the first pass results are transposed, we transpose the columns (that - // is the transposed rows) and transpose the results (so that it goes back - // in normal/row positions). - int pass; - // We need an intermediate buffer between passes. - DECLARE_ALIGNED_ARRAY(16, int16_t, intermediate, 256); - const int16_t *in = input; - int16_t *out = intermediate; - // Constants - // When we use them, in one case, they are all the same. In all others - // it's a pair of them that we need to repeat four times. This is done - // by constructing the 32 bit constant corresponding to that pair. - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); - const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); - const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); - const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); - const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); - const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); - const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); - const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kOne = _mm_set1_epi16(1); - // Do the two transform/transpose passes - for (pass = 0; pass < 2; ++pass) { - // We process eight columns (transposed rows in second pass) at a time. - int column_start; - for (column_start = 0; column_start < 16; column_start += 8) { - __m128i in00, in01, in02, in03, in04, in05, in06, in07; - __m128i in08, in09, in10, in11, in12, in13, in14, in15; - __m128i input0, input1, input2, input3, input4, input5, input6, input7; - __m128i step1_0, step1_1, step1_2, step1_3; - __m128i step1_4, step1_5, step1_6, step1_7; - __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; - __m128i step3_0, step3_1, step3_2, step3_3; - __m128i step3_4, step3_5, step3_6, step3_7; - __m128i res00, res01, res02, res03, res04, res05, res06, res07; - __m128i res08, res09, res10, res11, res12, res13, res14, res15; - // Load and pre-condition input. - if (0 == pass) { - in00 = _mm_load_si128((const __m128i *)(in + 0 * stride)); - in01 = _mm_load_si128((const __m128i *)(in + 1 * stride)); - in02 = _mm_load_si128((const __m128i *)(in + 2 * stride)); - in03 = _mm_load_si128((const __m128i *)(in + 3 * stride)); - in04 = _mm_load_si128((const __m128i *)(in + 4 * stride)); - in05 = _mm_load_si128((const __m128i *)(in + 5 * stride)); - in06 = _mm_load_si128((const __m128i *)(in + 6 * stride)); - in07 = _mm_load_si128((const __m128i *)(in + 7 * stride)); - in08 = _mm_load_si128((const __m128i *)(in + 8 * stride)); - in09 = _mm_load_si128((const __m128i *)(in + 9 * stride)); - in10 = _mm_load_si128((const __m128i *)(in + 10 * stride)); - in11 = _mm_load_si128((const __m128i *)(in + 11 * stride)); - in12 = _mm_load_si128((const __m128i *)(in + 12 * stride)); - in13 = _mm_load_si128((const __m128i *)(in + 13 * stride)); - in14 = _mm_load_si128((const __m128i *)(in + 14 * stride)); - in15 = _mm_load_si128((const __m128i *)(in + 15 * stride)); - // x = x << 2 - in00 = _mm_slli_epi16(in00, 2); - in01 = _mm_slli_epi16(in01, 2); - in02 = _mm_slli_epi16(in02, 2); - in03 = _mm_slli_epi16(in03, 2); - in04 = _mm_slli_epi16(in04, 2); - in05 = _mm_slli_epi16(in05, 2); - in06 = _mm_slli_epi16(in06, 2); - in07 = _mm_slli_epi16(in07, 2); - in08 = _mm_slli_epi16(in08, 2); - in09 = _mm_slli_epi16(in09, 2); - in10 = _mm_slli_epi16(in10, 2); - in11 = _mm_slli_epi16(in11, 2); - in12 = _mm_slli_epi16(in12, 2); - in13 = _mm_slli_epi16(in13, 2); - in14 = _mm_slli_epi16(in14, 2); - in15 = _mm_slli_epi16(in15, 2); - } else { - in00 = _mm_load_si128((const __m128i *)(in + 0 * 16)); - in01 = _mm_load_si128((const __m128i *)(in + 1 * 16)); - in02 = _mm_load_si128((const __m128i *)(in + 2 * 16)); - in03 = _mm_load_si128((const __m128i *)(in + 3 * 16)); - in04 = _mm_load_si128((const __m128i *)(in + 4 * 16)); - in05 = _mm_load_si128((const __m128i *)(in + 5 * 16)); - in06 = _mm_load_si128((const __m128i *)(in + 6 * 16)); - in07 = _mm_load_si128((const __m128i *)(in + 7 * 16)); - in08 = _mm_load_si128((const __m128i *)(in + 8 * 16)); - in09 = _mm_load_si128((const __m128i *)(in + 9 * 16)); - in10 = _mm_load_si128((const __m128i *)(in + 10 * 16)); - in11 = _mm_load_si128((const __m128i *)(in + 11 * 16)); - in12 = _mm_load_si128((const __m128i *)(in + 12 * 16)); - in13 = _mm_load_si128((const __m128i *)(in + 13 * 16)); - in14 = _mm_load_si128((const __m128i *)(in + 14 * 16)); - in15 = _mm_load_si128((const __m128i *)(in + 15 * 16)); - // x = (x + 1) >> 2 - in00 = _mm_add_epi16(in00, kOne); - in01 = _mm_add_epi16(in01, kOne); - in02 = _mm_add_epi16(in02, kOne); - in03 = _mm_add_epi16(in03, kOne); - in04 = _mm_add_epi16(in04, kOne); - in05 = _mm_add_epi16(in05, kOne); - in06 = _mm_add_epi16(in06, kOne); - in07 = _mm_add_epi16(in07, kOne); - in08 = _mm_add_epi16(in08, kOne); - in09 = _mm_add_epi16(in09, kOne); - in10 = _mm_add_epi16(in10, kOne); - in11 = _mm_add_epi16(in11, kOne); - in12 = _mm_add_epi16(in12, kOne); - in13 = _mm_add_epi16(in13, kOne); - in14 = _mm_add_epi16(in14, kOne); - in15 = _mm_add_epi16(in15, kOne); - in00 = _mm_srai_epi16(in00, 2); - in01 = _mm_srai_epi16(in01, 2); - in02 = _mm_srai_epi16(in02, 2); - in03 = _mm_srai_epi16(in03, 2); - in04 = _mm_srai_epi16(in04, 2); - in05 = _mm_srai_epi16(in05, 2); - in06 = _mm_srai_epi16(in06, 2); - in07 = _mm_srai_epi16(in07, 2); - in08 = _mm_srai_epi16(in08, 2); - in09 = _mm_srai_epi16(in09, 2); - in10 = _mm_srai_epi16(in10, 2); - in11 = _mm_srai_epi16(in11, 2); - in12 = _mm_srai_epi16(in12, 2); - in13 = _mm_srai_epi16(in13, 2); - in14 = _mm_srai_epi16(in14, 2); - in15 = _mm_srai_epi16(in15, 2); - } - in += 8; - // Calculate input for the first 8 results. - { - input0 = _mm_add_epi16(in00, in15); - input1 = _mm_add_epi16(in01, in14); - input2 = _mm_add_epi16(in02, in13); - input3 = _mm_add_epi16(in03, in12); - input4 = _mm_add_epi16(in04, in11); - input5 = _mm_add_epi16(in05, in10); - input6 = _mm_add_epi16(in06, in09); - input7 = _mm_add_epi16(in07, in08); - } - // Calculate input for the next 8 results. - { - step1_0 = _mm_sub_epi16(in07, in08); - step1_1 = _mm_sub_epi16(in06, in09); - step1_2 = _mm_sub_epi16(in05, in10); - step1_3 = _mm_sub_epi16(in04, in11); - step1_4 = _mm_sub_epi16(in03, in12); - step1_5 = _mm_sub_epi16(in02, in13); - step1_6 = _mm_sub_epi16(in01, in14); - step1_7 = _mm_sub_epi16(in00, in15); - } - // Work on the first eight values; fdct8(input, even_results); - { - // Add/subtract - const __m128i q0 = _mm_add_epi16(input0, input7); - const __m128i q1 = _mm_add_epi16(input1, input6); - const __m128i q2 = _mm_add_epi16(input2, input5); - const __m128i q3 = _mm_add_epi16(input3, input4); - const __m128i q4 = _mm_sub_epi16(input3, input4); - const __m128i q5 = _mm_sub_epi16(input2, input5); - const __m128i q6 = _mm_sub_epi16(input1, input6); - const __m128i q7 = _mm_sub_epi16(input0, input7); - // Work on first four results - { - // Add/subtract - const __m128i r0 = _mm_add_epi16(q0, q3); - const __m128i r1 = _mm_add_epi16(q1, q2); - const __m128i r2 = _mm_sub_epi16(q1, q2); - const __m128i r3 = _mm_sub_epi16(q0, q3); - // Interleave to do the multiply by constants which gets us - // into 32 bits. - const __m128i t0 = _mm_unpacklo_epi16(r0, r1); - const __m128i t1 = _mm_unpackhi_epi16(r0, r1); - const __m128i t2 = _mm_unpacklo_epi16(r2, r3); - const __m128i t3 = _mm_unpackhi_epi16(r2, r3); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res00 = _mm_packs_epi32(w0, w1); - res08 = _mm_packs_epi32(w2, w3); - res04 = _mm_packs_epi32(w4, w5); - res12 = _mm_packs_epi32(w6, w7); - } - // Work on next four results - { - // Interleave to do the multiply by constants which gets us - // into 32 bits. - const __m128i d0 = _mm_unpacklo_epi16(q6, q5); - const __m128i d1 = _mm_unpackhi_epi16(q6, q5); - const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); - const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); - const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); - const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); - const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); - const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); - const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); - const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); - const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); - const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); - const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); - // Combine - const __m128i r0 = _mm_packs_epi32(s0, s1); - const __m128i r1 = _mm_packs_epi32(s2, s3); - // Add/subtract - const __m128i x0 = _mm_add_epi16(q4, r0); - const __m128i x1 = _mm_sub_epi16(q4, r0); - const __m128i x2 = _mm_sub_epi16(q7, r1); - const __m128i x3 = _mm_add_epi16(q7, r1); - // Interleave to do the multiply by constants which gets us - // into 32 bits. - const __m128i t0 = _mm_unpacklo_epi16(x0, x3); - const __m128i t1 = _mm_unpackhi_epi16(x0, x3); - const __m128i t2 = _mm_unpacklo_epi16(x1, x2); - const __m128i t3 = _mm_unpackhi_epi16(x1, x2); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); - const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); - const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); - const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); - const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); - const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); - const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); - const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); - const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); - const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); - const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); - const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); - const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); - // Combine - res02 = _mm_packs_epi32(w0, w1); - res14 = _mm_packs_epi32(w2, w3); - res10 = _mm_packs_epi32(w4, w5); - res06 = _mm_packs_epi32(w6, w7); - } - } - // Work on the next eight values; step1 -> odd_results - { - // step 2 - { - const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); - const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); - const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); - const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - // Combine - step2_2 = _mm_packs_epi32(w0, w1); - step2_3 = _mm_packs_epi32(w2, w3); - } - { - const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); - const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); - const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); - const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - // Combine - step2_5 = _mm_packs_epi32(w0, w1); - step2_4 = _mm_packs_epi32(w2, w3); - } - // step 3 - { - step3_0 = _mm_add_epi16(step1_0, step2_3); - step3_1 = _mm_add_epi16(step1_1, step2_2); - step3_2 = _mm_sub_epi16(step1_1, step2_2); - step3_3 = _mm_sub_epi16(step1_0, step2_3); - step3_4 = _mm_sub_epi16(step1_7, step2_4); - step3_5 = _mm_sub_epi16(step1_6, step2_5); - step3_6 = _mm_add_epi16(step1_6, step2_5); - step3_7 = _mm_add_epi16(step1_7, step2_4); - } - // step 4 - { - const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); - const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); - const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); - const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - // Combine - step2_1 = _mm_packs_epi32(w0, w1); - step2_2 = _mm_packs_epi32(w2, w3); - } - { - const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); - const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); - const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); - const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - // Combine - step2_6 = _mm_packs_epi32(w0, w1); - step2_5 = _mm_packs_epi32(w2, w3); - } - // step 5 - { - step1_0 = _mm_add_epi16(step3_0, step2_1); - step1_1 = _mm_sub_epi16(step3_0, step2_1); - step1_2 = _mm_sub_epi16(step3_3, step2_2); - step1_3 = _mm_add_epi16(step3_3, step2_2); - step1_4 = _mm_add_epi16(step3_4, step2_5); - step1_5 = _mm_sub_epi16(step3_4, step2_5); - step1_6 = _mm_sub_epi16(step3_7, step2_6); - step1_7 = _mm_add_epi16(step3_7, step2_6); - } - // step 6 - { - const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); - const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); - const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); - const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - // Combine - res01 = _mm_packs_epi32(w0, w1); - res09 = _mm_packs_epi32(w2, w3); - } - { - const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); - const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); - const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); - const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - // Combine - res05 = _mm_packs_epi32(w0, w1); - res13 = _mm_packs_epi32(w2, w3); - } - { - const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); - const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); - const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); - const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - // Combine - res11 = _mm_packs_epi32(w0, w1); - res03 = _mm_packs_epi32(w2, w3); - } - { - const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); - const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); - const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); - const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); - const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30); - const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30); - const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14); - const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14); - // dct_const_round_shift - const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); - const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); - const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); - const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); - const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); - const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); - const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); - const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); - // Combine - res15 = _mm_packs_epi32(w0, w1); - res07 = _mm_packs_epi32(w2, w3); - } - } - // Transpose the results, do it as two 8x8 transposes. - { - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01); - const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03); - const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01); - const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03); - const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05); - const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07); - const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05); - const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 54 54 55 55 56 56 57 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 21 36 - // 44 54 64 74 45 55 61 76 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); - const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); - const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0); - _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1); - _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2); - _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3); - _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4); - _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5); - _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6); - _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7); - } - { - // 00 01 02 03 04 05 06 07 - // 10 11 12 13 14 15 16 17 - // 20 21 22 23 24 25 26 27 - // 30 31 32 33 34 35 36 37 - // 40 41 42 43 44 45 46 47 - // 50 51 52 53 54 55 56 57 - // 60 61 62 63 64 65 66 67 - // 70 71 72 73 74 75 76 77 - const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09); - const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11); - const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09); - const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11); - const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13); - const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15); - const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13); - const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15); - // 00 10 01 11 02 12 03 13 - // 20 30 21 31 22 32 23 33 - // 04 14 05 15 06 16 07 17 - // 24 34 25 35 26 36 27 37 - // 40 50 41 51 42 52 43 53 - // 60 70 61 71 62 72 63 73 - // 54 54 55 55 56 56 57 57 - // 64 74 65 75 66 76 67 77 - const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); - const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); - const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); - const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); - const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); - const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); - const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); - const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); - // 00 10 20 30 01 11 21 31 - // 40 50 60 70 41 51 61 71 - // 02 12 22 32 03 13 23 33 - // 42 52 62 72 43 53 63 73 - // 04 14 24 34 05 15 21 36 - // 44 54 64 74 45 55 61 76 - // 06 16 26 36 07 17 27 37 - // 46 56 66 76 47 57 67 77 - const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); - const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); - const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); - const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); - const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); - const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); - const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); - const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); - // 00 10 20 30 40 50 60 70 - // 01 11 21 31 41 51 61 71 - // 02 12 22 32 42 52 62 72 - // 03 13 23 33 43 53 63 73 - // 04 14 24 34 44 54 64 74 - // 05 15 25 35 45 55 65 75 - // 06 16 26 36 46 56 66 76 - // 07 17 27 37 47 57 67 77 - // Store results - _mm_store_si128((__m128i *)(out + 8 + 0 * 16), tr2_0); - _mm_store_si128((__m128i *)(out + 8 + 1 * 16), tr2_1); - _mm_store_si128((__m128i *)(out + 8 + 2 * 16), tr2_2); - _mm_store_si128((__m128i *)(out + 8 + 3 * 16), tr2_3); - _mm_store_si128((__m128i *)(out + 8 + 4 * 16), tr2_4); - _mm_store_si128((__m128i *)(out + 8 + 5 * 16), tr2_5); - _mm_store_si128((__m128i *)(out + 8 + 6 * 16), tr2_6); - _mm_store_si128((__m128i *)(out + 8 + 7 * 16), tr2_7); - } - out += 8*16; - } - // Setup in/out for next pass. - in = intermediate; - out = output; - } -} - -static INLINE void load_buffer_16x16_avx2(const int16_t* input, __m128i *in0, - __m128i *in1, int stride) { - // load first 8 columns - load_buffer_8x8_avx2(input, in0, stride); - load_buffer_8x8_avx2(input + 8 * stride, in0 + 8, stride); - - input += 8; - // load second 8 columns - load_buffer_8x8_avx2(input, in1, stride); - load_buffer_8x8_avx2(input + 8 * stride, in1 + 8, stride); -} - -static INLINE void write_buffer_16x16_avx2(int16_t *output, __m128i *in0, - __m128i *in1, int stride) { - // write first 8 columns - write_buffer_8x8_avx2(output, in0, stride); - write_buffer_8x8_avx2(output + 8 * stride, in0 + 8, stride); - // write second 8 columns - output += 8; - write_buffer_8x8_avx2(output, in1, stride); - write_buffer_8x8_avx2(output + 8 * stride, in1 + 8, stride); -} - -static INLINE void array_transpose_16x16_avx2(__m128i *res0, __m128i *res1) { - __m128i tbuf[8]; - array_transpose_8x8_avx2(res0, res0); - array_transpose_8x8_avx2(res1, tbuf); - array_transpose_8x8_avx2(res0 + 8, res1); - array_transpose_8x8_avx2(res1 + 8, res1 + 8); - - res0[8] = tbuf[0]; - res0[9] = tbuf[1]; - res0[10] = tbuf[2]; - res0[11] = tbuf[3]; - res0[12] = tbuf[4]; - res0[13] = tbuf[5]; - res0[14] = tbuf[6]; - res0[15] = tbuf[7]; -} - -static INLINE void right_shift_16x16_avx2(__m128i *res0, __m128i *res1) { - // perform rounding operations - right_shift_8x8_avx2(res0, 2); - right_shift_8x8_avx2(res0 + 8, 2); - right_shift_8x8_avx2(res1, 2); - right_shift_8x8_avx2(res1 + 8, 2); -} - -void fdct16_8col_avx2(__m128i *in) { - // perform 16x16 1-D DCT for 8 columns - __m128i i[8], s[8], p[8], t[8], u[16], v[16]; - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); - const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); - const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); - const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); - const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); - const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); - const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); - const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); - const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); - const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); - const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); - const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64); - const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); - const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - - // stage 1 - i[0] = _mm_add_epi16(in[0], in[15]); - i[1] = _mm_add_epi16(in[1], in[14]); - i[2] = _mm_add_epi16(in[2], in[13]); - i[3] = _mm_add_epi16(in[3], in[12]); - i[4] = _mm_add_epi16(in[4], in[11]); - i[5] = _mm_add_epi16(in[5], in[10]); - i[6] = _mm_add_epi16(in[6], in[9]); - i[7] = _mm_add_epi16(in[7], in[8]); - - s[0] = _mm_sub_epi16(in[7], in[8]); - s[1] = _mm_sub_epi16(in[6], in[9]); - s[2] = _mm_sub_epi16(in[5], in[10]); - s[3] = _mm_sub_epi16(in[4], in[11]); - s[4] = _mm_sub_epi16(in[3], in[12]); - s[5] = _mm_sub_epi16(in[2], in[13]); - s[6] = _mm_sub_epi16(in[1], in[14]); - s[7] = _mm_sub_epi16(in[0], in[15]); - - p[0] = _mm_add_epi16(i[0], i[7]); - p[1] = _mm_add_epi16(i[1], i[6]); - p[2] = _mm_add_epi16(i[2], i[5]); - p[3] = _mm_add_epi16(i[3], i[4]); - p[4] = _mm_sub_epi16(i[3], i[4]); - p[5] = _mm_sub_epi16(i[2], i[5]); - p[6] = _mm_sub_epi16(i[1], i[6]); - p[7] = _mm_sub_epi16(i[0], i[7]); - - u[0] = _mm_add_epi16(p[0], p[3]); - u[1] = _mm_add_epi16(p[1], p[2]); - u[2] = _mm_sub_epi16(p[1], p[2]); - u[3] = _mm_sub_epi16(p[0], p[3]); - - v[0] = _mm_unpacklo_epi16(u[0], u[1]); - v[1] = _mm_unpackhi_epi16(u[0], u[1]); - v[2] = _mm_unpacklo_epi16(u[2], u[3]); - v[3] = _mm_unpackhi_epi16(u[2], u[3]); - - u[0] = _mm_madd_epi16(v[0], k__cospi_p16_p16); - u[1] = _mm_madd_epi16(v[1], k__cospi_p16_p16); - u[2] = _mm_madd_epi16(v[0], k__cospi_p16_m16); - u[3] = _mm_madd_epi16(v[1], k__cospi_p16_m16); - u[4] = _mm_madd_epi16(v[2], k__cospi_p24_p08); - u[5] = _mm_madd_epi16(v[3], k__cospi_p24_p08); - u[6] = _mm_madd_epi16(v[2], k__cospi_m08_p24); - u[7] = _mm_madd_epi16(v[3], k__cospi_m08_p24); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - - in[0] = _mm_packs_epi32(u[0], u[1]); - in[4] = _mm_packs_epi32(u[4], u[5]); - in[8] = _mm_packs_epi32(u[2], u[3]); - in[12] = _mm_packs_epi32(u[6], u[7]); - - u[0] = _mm_unpacklo_epi16(p[5], p[6]); - u[1] = _mm_unpackhi_epi16(p[5], p[6]); - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - - u[0] = _mm_packs_epi32(v[0], v[1]); - u[1] = _mm_packs_epi32(v[2], v[3]); - - t[0] = _mm_add_epi16(p[4], u[0]); - t[1] = _mm_sub_epi16(p[4], u[0]); - t[2] = _mm_sub_epi16(p[7], u[1]); - t[3] = _mm_add_epi16(p[7], u[1]); - - u[0] = _mm_unpacklo_epi16(t[0], t[3]); - u[1] = _mm_unpackhi_epi16(t[0], t[3]); - u[2] = _mm_unpacklo_epi16(t[1], t[2]); - u[3] = _mm_unpackhi_epi16(t[1], t[2]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p28_p04); - v[1] = _mm_madd_epi16(u[1], k__cospi_p28_p04); - v[2] = _mm_madd_epi16(u[2], k__cospi_p12_p20); - v[3] = _mm_madd_epi16(u[3], k__cospi_p12_p20); - v[4] = _mm_madd_epi16(u[2], k__cospi_m20_p12); - v[5] = _mm_madd_epi16(u[3], k__cospi_m20_p12); - v[6] = _mm_madd_epi16(u[0], k__cospi_m04_p28); - v[7] = _mm_madd_epi16(u[1], k__cospi_m04_p28); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - in[2] = _mm_packs_epi32(v[0], v[1]); - in[6] = _mm_packs_epi32(v[4], v[5]); - in[10] = _mm_packs_epi32(v[2], v[3]); - in[14] = _mm_packs_epi32(v[6], v[7]); - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[2], s[5]); - u[1] = _mm_unpackhi_epi16(s[2], s[5]); - u[2] = _mm_unpacklo_epi16(s[3], s[4]); - u[3] = _mm_unpackhi_epi16(s[3], s[4]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_p16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_p16); - v[2] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[3] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[0], k__cospi_p16_p16); - v[7] = _mm_madd_epi16(u[1], k__cospi_p16_p16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[2] = _mm_packs_epi32(v[0], v[1]); - t[3] = _mm_packs_epi32(v[2], v[3]); - t[4] = _mm_packs_epi32(v[4], v[5]); - t[5] = _mm_packs_epi32(v[6], v[7]); - - // stage 3 - p[0] = _mm_add_epi16(s[0], t[3]); - p[1] = _mm_add_epi16(s[1], t[2]); - p[2] = _mm_sub_epi16(s[1], t[2]); - p[3] = _mm_sub_epi16(s[0], t[3]); - p[4] = _mm_sub_epi16(s[7], t[4]); - p[5] = _mm_sub_epi16(s[6], t[5]); - p[6] = _mm_add_epi16(s[6], t[5]); - p[7] = _mm_add_epi16(s[7], t[4]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(p[1], p[6]); - u[1] = _mm_unpackhi_epi16(p[1], p[6]); - u[2] = _mm_unpacklo_epi16(p[2], p[5]); - u[3] = _mm_unpackhi_epi16(p[2], p[5]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24); - v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24); - v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08); - v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24); - v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24); - v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08); - v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - - t[1] = _mm_packs_epi32(v[0], v[1]); - t[2] = _mm_packs_epi32(v[2], v[3]); - t[5] = _mm_packs_epi32(v[4], v[5]); - t[6] = _mm_packs_epi32(v[6], v[7]); - - // stage 5 - s[0] = _mm_add_epi16(p[0], t[1]); - s[1] = _mm_sub_epi16(p[0], t[1]); - s[2] = _mm_sub_epi16(p[3], t[2]); - s[3] = _mm_add_epi16(p[3], t[2]); - s[4] = _mm_add_epi16(p[4], t[5]); - s[5] = _mm_sub_epi16(p[4], t[5]); - s[6] = _mm_sub_epi16(p[7], t[6]); - s[7] = _mm_add_epi16(p[7], t[6]); - - // stage 6 - u[0] = _mm_unpacklo_epi16(s[0], s[7]); - u[1] = _mm_unpackhi_epi16(s[0], s[7]); - u[2] = _mm_unpacklo_epi16(s[1], s[6]); - u[3] = _mm_unpackhi_epi16(s[1], s[6]); - u[4] = _mm_unpacklo_epi16(s[2], s[5]); - u[5] = _mm_unpackhi_epi16(s[2], s[5]); - u[6] = _mm_unpacklo_epi16(s[3], s[4]); - u[7] = _mm_unpackhi_epi16(s[3], s[4]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p30_p02); - v[1] = _mm_madd_epi16(u[1], k__cospi_p30_p02); - v[2] = _mm_madd_epi16(u[2], k__cospi_p14_p18); - v[3] = _mm_madd_epi16(u[3], k__cospi_p14_p18); - v[4] = _mm_madd_epi16(u[4], k__cospi_p22_p10); - v[5] = _mm_madd_epi16(u[5], k__cospi_p22_p10); - v[6] = _mm_madd_epi16(u[6], k__cospi_p06_p26); - v[7] = _mm_madd_epi16(u[7], k__cospi_p06_p26); - v[8] = _mm_madd_epi16(u[6], k__cospi_m26_p06); - v[9] = _mm_madd_epi16(u[7], k__cospi_m26_p06); - v[10] = _mm_madd_epi16(u[4], k__cospi_m10_p22); - v[11] = _mm_madd_epi16(u[5], k__cospi_m10_p22); - v[12] = _mm_madd_epi16(u[2], k__cospi_m18_p14); - v[13] = _mm_madd_epi16(u[3], k__cospi_m18_p14); - v[14] = _mm_madd_epi16(u[0], k__cospi_m02_p30); - v[15] = _mm_madd_epi16(u[1], k__cospi_m02_p30); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - in[1] = _mm_packs_epi32(v[0], v[1]); - in[9] = _mm_packs_epi32(v[2], v[3]); - in[5] = _mm_packs_epi32(v[4], v[5]); - in[13] = _mm_packs_epi32(v[6], v[7]); - in[3] = _mm_packs_epi32(v[8], v[9]); - in[11] = _mm_packs_epi32(v[10], v[11]); - in[7] = _mm_packs_epi32(v[12], v[13]); - in[15] = _mm_packs_epi32(v[14], v[15]); -} - -void fadst16_8col_avx2(__m128i *in) { - // perform 16x16 1-D ADST for 8 columns - __m128i s[16], x[16], u[32], v[32]; - const __m128i k__cospi_p01_p31 = pair_set_epi16(cospi_1_64, cospi_31_64); - const __m128i k__cospi_p31_m01 = pair_set_epi16(cospi_31_64, -cospi_1_64); - const __m128i k__cospi_p05_p27 = pair_set_epi16(cospi_5_64, cospi_27_64); - const __m128i k__cospi_p27_m05 = pair_set_epi16(cospi_27_64, -cospi_5_64); - const __m128i k__cospi_p09_p23 = pair_set_epi16(cospi_9_64, cospi_23_64); - const __m128i k__cospi_p23_m09 = pair_set_epi16(cospi_23_64, -cospi_9_64); - const __m128i k__cospi_p13_p19 = pair_set_epi16(cospi_13_64, cospi_19_64); - const __m128i k__cospi_p19_m13 = pair_set_epi16(cospi_19_64, -cospi_13_64); - const __m128i k__cospi_p17_p15 = pair_set_epi16(cospi_17_64, cospi_15_64); - const __m128i k__cospi_p15_m17 = pair_set_epi16(cospi_15_64, -cospi_17_64); - const __m128i k__cospi_p21_p11 = pair_set_epi16(cospi_21_64, cospi_11_64); - const __m128i k__cospi_p11_m21 = pair_set_epi16(cospi_11_64, -cospi_21_64); - const __m128i k__cospi_p25_p07 = pair_set_epi16(cospi_25_64, cospi_7_64); - const __m128i k__cospi_p07_m25 = pair_set_epi16(cospi_7_64, -cospi_25_64); - const __m128i k__cospi_p29_p03 = pair_set_epi16(cospi_29_64, cospi_3_64); - const __m128i k__cospi_p03_m29 = pair_set_epi16(cospi_3_64, -cospi_29_64); - const __m128i k__cospi_p04_p28 = pair_set_epi16(cospi_4_64, cospi_28_64); - const __m128i k__cospi_p28_m04 = pair_set_epi16(cospi_28_64, -cospi_4_64); - const __m128i k__cospi_p20_p12 = pair_set_epi16(cospi_20_64, cospi_12_64); - const __m128i k__cospi_p12_m20 = pair_set_epi16(cospi_12_64, -cospi_20_64); - const __m128i k__cospi_m28_p04 = pair_set_epi16(-cospi_28_64, cospi_4_64); - const __m128i k__cospi_m12_p20 = pair_set_epi16(-cospi_12_64, cospi_20_64); - const __m128i k__cospi_p08_p24 = pair_set_epi16(cospi_8_64, cospi_24_64); - const __m128i k__cospi_p24_m08 = pair_set_epi16(cospi_24_64, -cospi_8_64); - const __m128i k__cospi_m24_p08 = pair_set_epi16(-cospi_24_64, cospi_8_64); - const __m128i k__cospi_m16_m16 = _mm_set1_epi16(-cospi_16_64); - const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); - const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); - const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64); - const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); - const __m128i kZero = _mm_set1_epi16(0); - - u[0] = _mm_unpacklo_epi16(in[15], in[0]); - u[1] = _mm_unpackhi_epi16(in[15], in[0]); - u[2] = _mm_unpacklo_epi16(in[13], in[2]); - u[3] = _mm_unpackhi_epi16(in[13], in[2]); - u[4] = _mm_unpacklo_epi16(in[11], in[4]); - u[5] = _mm_unpackhi_epi16(in[11], in[4]); - u[6] = _mm_unpacklo_epi16(in[9], in[6]); - u[7] = _mm_unpackhi_epi16(in[9], in[6]); - u[8] = _mm_unpacklo_epi16(in[7], in[8]); - u[9] = _mm_unpackhi_epi16(in[7], in[8]); - u[10] = _mm_unpacklo_epi16(in[5], in[10]); - u[11] = _mm_unpackhi_epi16(in[5], in[10]); - u[12] = _mm_unpacklo_epi16(in[3], in[12]); - u[13] = _mm_unpackhi_epi16(in[3], in[12]); - u[14] = _mm_unpacklo_epi16(in[1], in[14]); - u[15] = _mm_unpackhi_epi16(in[1], in[14]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p01_p31); - v[1] = _mm_madd_epi16(u[1], k__cospi_p01_p31); - v[2] = _mm_madd_epi16(u[0], k__cospi_p31_m01); - v[3] = _mm_madd_epi16(u[1], k__cospi_p31_m01); - v[4] = _mm_madd_epi16(u[2], k__cospi_p05_p27); - v[5] = _mm_madd_epi16(u[3], k__cospi_p05_p27); - v[6] = _mm_madd_epi16(u[2], k__cospi_p27_m05); - v[7] = _mm_madd_epi16(u[3], k__cospi_p27_m05); - v[8] = _mm_madd_epi16(u[4], k__cospi_p09_p23); - v[9] = _mm_madd_epi16(u[5], k__cospi_p09_p23); - v[10] = _mm_madd_epi16(u[4], k__cospi_p23_m09); - v[11] = _mm_madd_epi16(u[5], k__cospi_p23_m09); - v[12] = _mm_madd_epi16(u[6], k__cospi_p13_p19); - v[13] = _mm_madd_epi16(u[7], k__cospi_p13_p19); - v[14] = _mm_madd_epi16(u[6], k__cospi_p19_m13); - v[15] = _mm_madd_epi16(u[7], k__cospi_p19_m13); - v[16] = _mm_madd_epi16(u[8], k__cospi_p17_p15); - v[17] = _mm_madd_epi16(u[9], k__cospi_p17_p15); - v[18] = _mm_madd_epi16(u[8], k__cospi_p15_m17); - v[19] = _mm_madd_epi16(u[9], k__cospi_p15_m17); - v[20] = _mm_madd_epi16(u[10], k__cospi_p21_p11); - v[21] = _mm_madd_epi16(u[11], k__cospi_p21_p11); - v[22] = _mm_madd_epi16(u[10], k__cospi_p11_m21); - v[23] = _mm_madd_epi16(u[11], k__cospi_p11_m21); - v[24] = _mm_madd_epi16(u[12], k__cospi_p25_p07); - v[25] = _mm_madd_epi16(u[13], k__cospi_p25_p07); - v[26] = _mm_madd_epi16(u[12], k__cospi_p07_m25); - v[27] = _mm_madd_epi16(u[13], k__cospi_p07_m25); - v[28] = _mm_madd_epi16(u[14], k__cospi_p29_p03); - v[29] = _mm_madd_epi16(u[15], k__cospi_p29_p03); - v[30] = _mm_madd_epi16(u[14], k__cospi_p03_m29); - v[31] = _mm_madd_epi16(u[15], k__cospi_p03_m29); - - u[0] = _mm_add_epi32(v[0], v[16]); - u[1] = _mm_add_epi32(v[1], v[17]); - u[2] = _mm_add_epi32(v[2], v[18]); - u[3] = _mm_add_epi32(v[3], v[19]); - u[4] = _mm_add_epi32(v[4], v[20]); - u[5] = _mm_add_epi32(v[5], v[21]); - u[6] = _mm_add_epi32(v[6], v[22]); - u[7] = _mm_add_epi32(v[7], v[23]); - u[8] = _mm_add_epi32(v[8], v[24]); - u[9] = _mm_add_epi32(v[9], v[25]); - u[10] = _mm_add_epi32(v[10], v[26]); - u[11] = _mm_add_epi32(v[11], v[27]); - u[12] = _mm_add_epi32(v[12], v[28]); - u[13] = _mm_add_epi32(v[13], v[29]); - u[14] = _mm_add_epi32(v[14], v[30]); - u[15] = _mm_add_epi32(v[15], v[31]); - u[16] = _mm_sub_epi32(v[0], v[16]); - u[17] = _mm_sub_epi32(v[1], v[17]); - u[18] = _mm_sub_epi32(v[2], v[18]); - u[19] = _mm_sub_epi32(v[3], v[19]); - u[20] = _mm_sub_epi32(v[4], v[20]); - u[21] = _mm_sub_epi32(v[5], v[21]); - u[22] = _mm_sub_epi32(v[6], v[22]); - u[23] = _mm_sub_epi32(v[7], v[23]); - u[24] = _mm_sub_epi32(v[8], v[24]); - u[25] = _mm_sub_epi32(v[9], v[25]); - u[26] = _mm_sub_epi32(v[10], v[26]); - u[27] = _mm_sub_epi32(v[11], v[27]); - u[28] = _mm_sub_epi32(v[12], v[28]); - u[29] = _mm_sub_epi32(v[13], v[29]); - u[30] = _mm_sub_epi32(v[14], v[30]); - u[31] = _mm_sub_epi32(v[15], v[31]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING); - v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING); - v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING); - v[19] = _mm_add_epi32(u[19], k__DCT_CONST_ROUNDING); - v[20] = _mm_add_epi32(u[20], k__DCT_CONST_ROUNDING); - v[21] = _mm_add_epi32(u[21], k__DCT_CONST_ROUNDING); - v[22] = _mm_add_epi32(u[22], k__DCT_CONST_ROUNDING); - v[23] = _mm_add_epi32(u[23], k__DCT_CONST_ROUNDING); - v[24] = _mm_add_epi32(u[24], k__DCT_CONST_ROUNDING); - v[25] = _mm_add_epi32(u[25], k__DCT_CONST_ROUNDING); - v[26] = _mm_add_epi32(u[26], k__DCT_CONST_ROUNDING); - v[27] = _mm_add_epi32(u[27], k__DCT_CONST_ROUNDING); - v[28] = _mm_add_epi32(u[28], k__DCT_CONST_ROUNDING); - v[29] = _mm_add_epi32(u[29], k__DCT_CONST_ROUNDING); - v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING); - v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS); - u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS); - u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS); - u[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS); - u[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS); - u[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS); - u[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS); - u[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS); - u[24] = _mm_srai_epi32(v[24], DCT_CONST_BITS); - u[25] = _mm_srai_epi32(v[25], DCT_CONST_BITS); - u[26] = _mm_srai_epi32(v[26], DCT_CONST_BITS); - u[27] = _mm_srai_epi32(v[27], DCT_CONST_BITS); - u[28] = _mm_srai_epi32(v[28], DCT_CONST_BITS); - u[29] = _mm_srai_epi32(v[29], DCT_CONST_BITS); - u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS); - u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS); - - s[0] = _mm_packs_epi32(u[0], u[1]); - s[1] = _mm_packs_epi32(u[2], u[3]); - s[2] = _mm_packs_epi32(u[4], u[5]); - s[3] = _mm_packs_epi32(u[6], u[7]); - s[4] = _mm_packs_epi32(u[8], u[9]); - s[5] = _mm_packs_epi32(u[10], u[11]); - s[6] = _mm_packs_epi32(u[12], u[13]); - s[7] = _mm_packs_epi32(u[14], u[15]); - s[8] = _mm_packs_epi32(u[16], u[17]); - s[9] = _mm_packs_epi32(u[18], u[19]); - s[10] = _mm_packs_epi32(u[20], u[21]); - s[11] = _mm_packs_epi32(u[22], u[23]); - s[12] = _mm_packs_epi32(u[24], u[25]); - s[13] = _mm_packs_epi32(u[26], u[27]); - s[14] = _mm_packs_epi32(u[28], u[29]); - s[15] = _mm_packs_epi32(u[30], u[31]); - - // stage 2 - u[0] = _mm_unpacklo_epi16(s[8], s[9]); - u[1] = _mm_unpackhi_epi16(s[8], s[9]); - u[2] = _mm_unpacklo_epi16(s[10], s[11]); - u[3] = _mm_unpackhi_epi16(s[10], s[11]); - u[4] = _mm_unpacklo_epi16(s[12], s[13]); - u[5] = _mm_unpackhi_epi16(s[12], s[13]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p04_p28); - v[1] = _mm_madd_epi16(u[1], k__cospi_p04_p28); - v[2] = _mm_madd_epi16(u[0], k__cospi_p28_m04); - v[3] = _mm_madd_epi16(u[1], k__cospi_p28_m04); - v[4] = _mm_madd_epi16(u[2], k__cospi_p20_p12); - v[5] = _mm_madd_epi16(u[3], k__cospi_p20_p12); - v[6] = _mm_madd_epi16(u[2], k__cospi_p12_m20); - v[7] = _mm_madd_epi16(u[3], k__cospi_p12_m20); - v[8] = _mm_madd_epi16(u[4], k__cospi_m28_p04); - v[9] = _mm_madd_epi16(u[5], k__cospi_m28_p04); - v[10] = _mm_madd_epi16(u[4], k__cospi_p04_p28); - v[11] = _mm_madd_epi16(u[5], k__cospi_p04_p28); - v[12] = _mm_madd_epi16(u[6], k__cospi_m12_p20); - v[13] = _mm_madd_epi16(u[7], k__cospi_m12_p20); - v[14] = _mm_madd_epi16(u[6], k__cospi_p20_p12); - v[15] = _mm_madd_epi16(u[7], k__cospi_p20_p12); - - u[0] = _mm_add_epi32(v[0], v[8]); - u[1] = _mm_add_epi32(v[1], v[9]); - u[2] = _mm_add_epi32(v[2], v[10]); - u[3] = _mm_add_epi32(v[3], v[11]); - u[4] = _mm_add_epi32(v[4], v[12]); - u[5] = _mm_add_epi32(v[5], v[13]); - u[6] = _mm_add_epi32(v[6], v[14]); - u[7] = _mm_add_epi32(v[7], v[15]); - u[8] = _mm_sub_epi32(v[0], v[8]); - u[9] = _mm_sub_epi32(v[1], v[9]); - u[10] = _mm_sub_epi32(v[2], v[10]); - u[11] = _mm_sub_epi32(v[3], v[11]); - u[12] = _mm_sub_epi32(v[4], v[12]); - u[13] = _mm_sub_epi32(v[5], v[13]); - u[14] = _mm_sub_epi32(v[6], v[14]); - u[15] = _mm_sub_epi32(v[7], v[15]); - - v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS); - u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS); - u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS); - u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS); - u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS); - u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS); - u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS); - u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS); - u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS); - u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS); - u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS); - u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS); - u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS); - u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS); - u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS); - u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS); - - x[0] = _mm_add_epi16(s[0], s[4]); - x[1] = _mm_add_epi16(s[1], s[5]); - x[2] = _mm_add_epi16(s[2], s[6]); - x[3] = _mm_add_epi16(s[3], s[7]); - x[4] = _mm_sub_epi16(s[0], s[4]); - x[5] = _mm_sub_epi16(s[1], s[5]); - x[6] = _mm_sub_epi16(s[2], s[6]); - x[7] = _mm_sub_epi16(s[3], s[7]); - x[8] = _mm_packs_epi32(u[0], u[1]); - x[9] = _mm_packs_epi32(u[2], u[3]); - x[10] = _mm_packs_epi32(u[4], u[5]); - x[11] = _mm_packs_epi32(u[6], u[7]); - x[12] = _mm_packs_epi32(u[8], u[9]); - x[13] = _mm_packs_epi32(u[10], u[11]); - x[14] = _mm_packs_epi32(u[12], u[13]); - x[15] = _mm_packs_epi32(u[14], u[15]); - - // stage 3 - u[0] = _mm_unpacklo_epi16(x[4], x[5]); - u[1] = _mm_unpackhi_epi16(x[4], x[5]); - u[2] = _mm_unpacklo_epi16(x[6], x[7]); - u[3] = _mm_unpackhi_epi16(x[6], x[7]); - u[4] = _mm_unpacklo_epi16(x[12], x[13]); - u[5] = _mm_unpackhi_epi16(x[12], x[13]); - u[6] = _mm_unpacklo_epi16(x[14], x[15]); - u[7] = _mm_unpackhi_epi16(x[14], x[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_p08_p24); - v[1] = _mm_madd_epi16(u[1], k__cospi_p08_p24); - v[2] = _mm_madd_epi16(u[0], k__cospi_p24_m08); - v[3] = _mm_madd_epi16(u[1], k__cospi_p24_m08); - v[4] = _mm_madd_epi16(u[2], k__cospi_m24_p08); - v[5] = _mm_madd_epi16(u[3], k__cospi_m24_p08); - v[6] = _mm_madd_epi16(u[2], k__cospi_p08_p24); - v[7] = _mm_madd_epi16(u[3], k__cospi_p08_p24); - v[8] = _mm_madd_epi16(u[4], k__cospi_p08_p24); - v[9] = _mm_madd_epi16(u[5], k__cospi_p08_p24); - v[10] = _mm_madd_epi16(u[4], k__cospi_p24_m08); - v[11] = _mm_madd_epi16(u[5], k__cospi_p24_m08); - v[12] = _mm_madd_epi16(u[6], k__cospi_m24_p08); - v[13] = _mm_madd_epi16(u[7], k__cospi_m24_p08); - v[14] = _mm_madd_epi16(u[6], k__cospi_p08_p24); - v[15] = _mm_madd_epi16(u[7], k__cospi_p08_p24); - - u[0] = _mm_add_epi32(v[0], v[4]); - u[1] = _mm_add_epi32(v[1], v[5]); - u[2] = _mm_add_epi32(v[2], v[6]); - u[3] = _mm_add_epi32(v[3], v[7]); - u[4] = _mm_sub_epi32(v[0], v[4]); - u[5] = _mm_sub_epi32(v[1], v[5]); - u[6] = _mm_sub_epi32(v[2], v[6]); - u[7] = _mm_sub_epi32(v[3], v[7]); - u[8] = _mm_add_epi32(v[8], v[12]); - u[9] = _mm_add_epi32(v[9], v[13]); - u[10] = _mm_add_epi32(v[10], v[14]); - u[11] = _mm_add_epi32(v[11], v[15]); - u[12] = _mm_sub_epi32(v[8], v[12]); - u[13] = _mm_sub_epi32(v[9], v[13]); - u[14] = _mm_sub_epi32(v[10], v[14]); - u[15] = _mm_sub_epi32(v[11], v[15]); - - u[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - s[0] = _mm_add_epi16(x[0], x[2]); - s[1] = _mm_add_epi16(x[1], x[3]); - s[2] = _mm_sub_epi16(x[0], x[2]); - s[3] = _mm_sub_epi16(x[1], x[3]); - s[4] = _mm_packs_epi32(v[0], v[1]); - s[5] = _mm_packs_epi32(v[2], v[3]); - s[6] = _mm_packs_epi32(v[4], v[5]); - s[7] = _mm_packs_epi32(v[6], v[7]); - s[8] = _mm_add_epi16(x[8], x[10]); - s[9] = _mm_add_epi16(x[9], x[11]); - s[10] = _mm_sub_epi16(x[8], x[10]); - s[11] = _mm_sub_epi16(x[9], x[11]); - s[12] = _mm_packs_epi32(v[8], v[9]); - s[13] = _mm_packs_epi32(v[10], v[11]); - s[14] = _mm_packs_epi32(v[12], v[13]); - s[15] = _mm_packs_epi32(v[14], v[15]); - - // stage 4 - u[0] = _mm_unpacklo_epi16(s[2], s[3]); - u[1] = _mm_unpackhi_epi16(s[2], s[3]); - u[2] = _mm_unpacklo_epi16(s[6], s[7]); - u[3] = _mm_unpackhi_epi16(s[6], s[7]); - u[4] = _mm_unpacklo_epi16(s[10], s[11]); - u[5] = _mm_unpackhi_epi16(s[10], s[11]); - u[6] = _mm_unpacklo_epi16(s[14], s[15]); - u[7] = _mm_unpackhi_epi16(s[14], s[15]); - - v[0] = _mm_madd_epi16(u[0], k__cospi_m16_m16); - v[1] = _mm_madd_epi16(u[1], k__cospi_m16_m16); - v[2] = _mm_madd_epi16(u[0], k__cospi_p16_m16); - v[3] = _mm_madd_epi16(u[1], k__cospi_p16_m16); - v[4] = _mm_madd_epi16(u[2], k__cospi_p16_p16); - v[5] = _mm_madd_epi16(u[3], k__cospi_p16_p16); - v[6] = _mm_madd_epi16(u[2], k__cospi_m16_p16); - v[7] = _mm_madd_epi16(u[3], k__cospi_m16_p16); - v[8] = _mm_madd_epi16(u[4], k__cospi_p16_p16); - v[9] = _mm_madd_epi16(u[5], k__cospi_p16_p16); - v[10] = _mm_madd_epi16(u[4], k__cospi_m16_p16); - v[11] = _mm_madd_epi16(u[5], k__cospi_m16_p16); - v[12] = _mm_madd_epi16(u[6], k__cospi_m16_m16); - v[13] = _mm_madd_epi16(u[7], k__cospi_m16_m16); - v[14] = _mm_madd_epi16(u[6], k__cospi_p16_m16); - v[15] = _mm_madd_epi16(u[7], k__cospi_p16_m16); - - u[0] = _mm_add_epi32(v[0], k__DCT_CONST_ROUNDING); - u[1] = _mm_add_epi32(v[1], k__DCT_CONST_ROUNDING); - u[2] = _mm_add_epi32(v[2], k__DCT_CONST_ROUNDING); - u[3] = _mm_add_epi32(v[3], k__DCT_CONST_ROUNDING); - u[4] = _mm_add_epi32(v[4], k__DCT_CONST_ROUNDING); - u[5] = _mm_add_epi32(v[5], k__DCT_CONST_ROUNDING); - u[6] = _mm_add_epi32(v[6], k__DCT_CONST_ROUNDING); - u[7] = _mm_add_epi32(v[7], k__DCT_CONST_ROUNDING); - u[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING); - u[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING); - u[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING); - u[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING); - u[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING); - u[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING); - u[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING); - u[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING); - - v[0] = _mm_srai_epi32(u[0], DCT_CONST_BITS); - v[1] = _mm_srai_epi32(u[1], DCT_CONST_BITS); - v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS); - v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS); - v[4] = _mm_srai_epi32(u[4], DCT_CONST_BITS); - v[5] = _mm_srai_epi32(u[5], DCT_CONST_BITS); - v[6] = _mm_srai_epi32(u[6], DCT_CONST_BITS); - v[7] = _mm_srai_epi32(u[7], DCT_CONST_BITS); - v[8] = _mm_srai_epi32(u[8], DCT_CONST_BITS); - v[9] = _mm_srai_epi32(u[9], DCT_CONST_BITS); - v[10] = _mm_srai_epi32(u[10], DCT_CONST_BITS); - v[11] = _mm_srai_epi32(u[11], DCT_CONST_BITS); - v[12] = _mm_srai_epi32(u[12], DCT_CONST_BITS); - v[13] = _mm_srai_epi32(u[13], DCT_CONST_BITS); - v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS); - v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS); - - in[0] = s[0]; - in[1] = _mm_sub_epi16(kZero, s[8]); - in[2] = s[12]; - in[3] = _mm_sub_epi16(kZero, s[4]); - in[4] = _mm_packs_epi32(v[4], v[5]); - in[5] = _mm_packs_epi32(v[12], v[13]); - in[6] = _mm_packs_epi32(v[8], v[9]); - in[7] = _mm_packs_epi32(v[0], v[1]); - in[8] = _mm_packs_epi32(v[2], v[3]); - in[9] = _mm_packs_epi32(v[10], v[11]); - in[10] = _mm_packs_epi32(v[14], v[15]); - in[11] = _mm_packs_epi32(v[6], v[7]); - in[12] = s[5]; - in[13] = _mm_sub_epi16(kZero, s[13]); - in[14] = s[9]; - in[15] = _mm_sub_epi16(kZero, s[1]); -} - -void fdct16_avx2(__m128i *in0, __m128i *in1) { - fdct16_8col_avx2(in0); - fdct16_8col_avx2(in1); - array_transpose_16x16_avx2(in0, in1); -} - -void fadst16_avx2(__m128i *in0, __m128i *in1) { - fadst16_8col_avx2(in0); - fadst16_8col_avx2(in1); - array_transpose_16x16_avx2(in0, in1); -} - -void vp9_fht16x16_avx2(const int16_t *input, int16_t *output, - int stride, int tx_type) { - __m128i in0[16], in1[16]; - - switch (tx_type) { - case DCT_DCT: - vp9_fdct16x16_avx2(input, output, stride); - break; - case ADST_DCT: - load_buffer_16x16_avx2(input, in0, in1, stride); - fadst16_avx2(in0, in1); - right_shift_16x16_avx2(in0, in1); - fdct16_avx2(in0, in1); - write_buffer_16x16_avx2(output, in0, in1, 16); - break; - case DCT_ADST: - load_buffer_16x16_avx2(input, in0, in1, stride); - fdct16_avx2(in0, in1); - right_shift_16x16_avx2(in0, in1); - fadst16_avx2(in0, in1); - write_buffer_16x16_avx2(output, in0, in1, 16); - break; - case ADST_ADST: - load_buffer_16x16_avx2(input, in0, in1, stride); - fadst16_avx2(in0, in1); - right_shift_16x16_avx2(in0, in1); - fadst16_avx2(in0, in1); - write_buffer_16x16_avx2(output, in0, in1, 16); - break; - default: - assert(0); - break; - } -} #define FDCT32x32_2D_AVX2 vp9_fdct32x32_rd_avx2 #define FDCT32x32_HIGH_PRECISION 0 diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index b3dc0b11a..af32eb3e3 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -305,6 +305,16 @@ static vpx_codec_err_t validate_img(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static int get_image_bps(const vpx_image_t *img) { + switch (img->fmt) { + case VPX_IMG_FMT_YV12: + case VPX_IMG_FMT_I420: return 12; + case VPX_IMG_FMT_I422: return 16; + case VPX_IMG_FMT_I444: return 24; + default: assert(0 && "Invalid image format"); + } + return 0; +} static vpx_codec_err_t set_encoder_config( VP9EncoderConfig *oxcf, @@ -672,16 +682,6 @@ static vpx_codec_err_t encoder_init(vpx_codec_ctx_t *ctx, priv->extra_cfg = extracfg_map[i].cfg; priv->extra_cfg.pkt_list = &priv->pkt_list.head; - // Maximum buffer size approximated based on having multiple ARF. - priv->cx_data_sz = priv->cfg.g_w * priv->cfg.g_h * 3 / 2 * 8; - - if (priv->cx_data_sz < 4096) - priv->cx_data_sz = 4096; - - priv->cx_data = (unsigned char *)malloc(priv->cx_data_sz); - if (priv->cx_data == NULL) - return VPX_CODEC_MEM_ERROR; - vp9_initialize_enc(); res = validate_config(priv, &priv->cfg, &priv->extra_cfg); @@ -806,8 +806,24 @@ static vpx_codec_err_t encoder_encode(vpx_codec_alg_priv_t *ctx, unsigned long deadline) { vpx_codec_err_t res = VPX_CODEC_OK; - if (img) + if (img != NULL) { res = validate_img(ctx, img); + // TODO(jzern) the checks related to cpi's validity should be treated as a + // failure condition, encoder setup is done fully in init() currently. + if (res == VPX_CODEC_OK && ctx->cpi != NULL && ctx->cx_data == NULL) { + // There's no codec control for multiple alt-refs so check the encoder + // instance for its status to determine the compressed data size. + ctx->cx_data_sz = ctx->cfg.g_w * ctx->cfg.g_h * + get_image_bps(img) / 8 * + (ctx->cpi->multi_arf_allowed ? 8 : 2); + if (ctx->cx_data_sz < 4096) ctx->cx_data_sz = 4096; + + ctx->cx_data = (unsigned char *)malloc(ctx->cx_data_sz); + if (ctx->cx_data == NULL) { + return VPX_CODEC_MEM_ERROR; + } + } + } pick_quickcompress_mode(ctx, duration, deadline); vpx_codec_pkt_list_init(&ctx->pkt_list); @@ -1301,7 +1317,6 @@ static vpx_codec_enc_cfg_map_t encoder_usage_cfg_map[] = { #endif } }, - { -1, {NOT_IMPLEMENTED}} }; #ifndef VERSION_STRING @@ -1324,6 +1339,7 @@ CODEC_INTERFACE(vpx_codec_vp9_cx) = { NOT_IMPLEMENTED // vpx_codec_set_fb_fn_t }, { // NOLINT + 1, // 1 cfg map encoder_usage_cfg_map, // vpx_codec_enc_cfg_map_t encoder_encode, // vpx_codec_encode_fn_t encoder_get_cxdata, // vpx_codec_get_cx_data_fn_t diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c index ef9d10d65..c52884084 100644 --- a/vp9/vp9_dx_iface.c +++ b/vp9/vp9_dx_iface.c @@ -707,6 +707,7 @@ CODEC_INTERFACE(vpx_codec_vp9_dx) = { decoder_set_fb_fn, // vpx_codec_set_fb_fn_t }, { // NOLINT + 0, NOT_IMPLEMENTED, // vpx_codec_enc_cfg_map_t NOT_IMPLEMENTED, // vpx_codec_encode_fn_t NOT_IMPLEMENTED, // vpx_codec_get_cx_data_fn_t diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 3381cb95a..dc46c4e35 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -130,5 +130,9 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c +VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes)) diff --git a/vpx/internal/vpx_codec_internal.h b/vpx/internal/vpx_codec_internal.h index 82d2bc3c0..cdda3406b 100644 --- a/vpx/internal/vpx_codec_internal.h +++ b/vpx/internal/vpx_codec_internal.h @@ -340,6 +340,7 @@ struct vpx_codec_iface { vpx_codec_set_fb_fn_t set_fb_fn; /**< \copydoc ::vpx_codec_set_fb_fn_t */ } dec; struct vpx_codec_enc_iface { + int cfg_map_count; vpx_codec_enc_cfg_map_t *cfg_maps; /**< \copydoc ::vpx_codec_enc_cfg_map_t */ vpx_codec_encode_fn_t encode; /**< \copydoc ::vpx_codec_encode_fn_t */ vpx_codec_get_cx_data_fn_t get_cx_data; /**< \copydoc ::vpx_codec_get_cx_data_fn_t */ diff --git a/vpx/src/vpx_encoder.c b/vpx/src/vpx_encoder.c index ece2d0b64..db019957e 100644 --- a/vpx/src/vpx_encoder.c +++ b/vpx/src/vpx_encoder.c @@ -162,6 +162,7 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, unsigned int usage) { vpx_codec_err_t res; vpx_codec_enc_cfg_map_t *map; + int i; if (!iface || !cfg || usage > INT_MAX) res = VPX_CODEC_INVALID_PARAM; @@ -170,7 +171,8 @@ vpx_codec_err_t vpx_codec_enc_config_default(vpx_codec_iface_t *iface, else { res = VPX_CODEC_INVALID_PARAM; - for (map = iface->enc.cfg_maps; map->usage >= 0; map++) { + for (i = 0; i < iface->enc.cfg_map_count; ++i) { + map = iface->enc.cfg_maps + i; if (map->usage == (int)usage) { *cfg = map->cfg; cfg->g_usage = usage; diff --git a/vpx_ports/vpx_once.h b/vpx_ports/vpx_once.h index 182892acf..bd9eebd64 100644 --- a/vpx_ports/vpx_once.h +++ b/vpx_ports/vpx_once.h @@ -73,6 +73,33 @@ static void once(void (*func)(void)) } +#elif CONFIG_MULTITHREAD && defined(__OS2__) +#define INCL_DOS +#include <os2.h> +static void once(void (*func)(void)) +{ + static int done; + + /* If the initialization is complete, return early. */ + if(done) + return; + + /* Causes all other threads in the process to block themselves + * and give up their time slice. + */ + DosEnterCritSec(); + + if (!done) + { + func(); + done = 1; + } + + /* Restores normal thread dispatching for the current process. */ + DosExitCritSec(); +} + + #elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H #include <pthread.h> static void once(void (*func)(void)) diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm deleted file mode 100644 index 696f47a7b..000000000 --- a/vpx_scale/arm/neon/vp8_vpxyv12_copyframe_func_neon.asm +++ /dev/null @@ -1,233 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_yv12_copy_frame_func_neon| - ARM - REQUIRE8 - PRESERVE8 - - INCLUDE vpx_scale_asm_offsets.asm - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -;void vp8_yv12_copy_frame_func_neon(const YV12_BUFFER_CONFIG *src_ybc, -; YV12_BUFFER_CONFIG *dst_ybc); - -|vp8_yv12_copy_frame_func_neon| PROC - push {r4 - r11, lr} - vpush {d8 - d15} - - sub sp, sp, #16 - - ;Copy Y plane - ldr r8, [r0, #yv12_buffer_config_u_buffer] ;srcptr1 - ldr r9, [r1, #yv12_buffer_config_u_buffer] ;srcptr1 - ldr r10, [r0, #yv12_buffer_config_v_buffer] ;srcptr1 - ldr r11, [r1, #yv12_buffer_config_v_buffer] ;srcptr1 - - ldr r4, [r0, #yv12_buffer_config_y_height] - ldr r5, [r0, #yv12_buffer_config_y_width] - ldr r6, [r0, #yv12_buffer_config_y_stride] - ldr r7, [r1, #yv12_buffer_config_y_stride] - ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 - ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1 - - str r8, [sp] - str r9, [sp, #4] - str r10, [sp, #8] - str r11, [sp, #12] - - ; copy two rows at one time - mov lr, r4, lsr #1 - -cp_src_to_dst_height_loop - mov r8, r2 - mov r9, r3 - add r10, r2, r6 - add r11, r3, r7 - movs r12, r5, lsr #7 - ble extra_cp_needed ; y_width < 128 - -cp_src_to_dst_width_loop - vld1.8 {q0, q1}, [r8]! - vld1.8 {q8, q9}, [r10]! - vld1.8 {q2, q3}, [r8]! - vld1.8 {q10, q11}, [r10]! - vld1.8 {q4, q5}, [r8]! - vld1.8 {q12, q13}, [r10]! - vld1.8 {q6, q7}, [r8]! - vld1.8 {q14, q15}, [r10]! - - subs r12, r12, #1 - - vst1.8 {q0, q1}, [r9]! - vst1.8 {q8, q9}, [r11]! - vst1.8 {q2, q3}, [r9]! - vst1.8 {q10, q11}, [r11]! - vst1.8 {q4, q5}, [r9]! - vst1.8 {q12, q13}, [r11]! - vst1.8 {q6, q7}, [r9]! - vst1.8 {q14, q15}, [r11]! - - bne cp_src_to_dst_width_loop - - subs lr, lr, #1 - add r2, r2, r6, lsl #1 - add r3, r3, r7, lsl #1 - - bne cp_src_to_dst_height_loop - -extra_cp_needed - ands r10, r5, #0x7f ;check to see if extra copy is needed - sub r11, r5, r10 - ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 - ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1 - bne extra_cp_src_to_dst_width -end_of_cp_src_to_dst - -;Copy U & V planes - ldr r2, [sp] ;srcptr1 - ldr r3, [sp, #4] ;dstptr1 - mov r4, r4, lsr #1 ;src uv_height - mov r5, r5, lsr #1 ;src uv_width - mov r6, r6, lsr #1 ;src uv_stride - mov r7, r7, lsr #1 ;dst uv_stride - - mov r1, #2 - -cp_uv_loop - - ;copy two rows at one time - mov lr, r4, lsr #1 - -cp_src_to_dst_height_uv_loop - mov r8, r2 - mov r9, r3 - add r10, r2, r6 - add r11, r3, r7 - movs r12, r5, lsr #6 - ble extra_uv_cp_needed - -cp_src_to_dst_width_uv_loop - vld1.8 {q0, q1}, [r8]! - vld1.8 {q8, q9}, [r10]! - vld1.8 {q2, q3}, [r8]! - vld1.8 {q10, q11}, [r10]! - - subs r12, r12, #1 - - vst1.8 {q0, q1}, [r9]! - vst1.8 {q8, q9}, [r11]! - vst1.8 {q2, q3}, [r9]! - vst1.8 {q10, q11}, [r11]! - - bne cp_src_to_dst_width_uv_loop - - subs lr, lr, #1 - add r2, r2, r6, lsl #1 - add r3, r3, r7, lsl #1 - - bne cp_src_to_dst_height_uv_loop - -extra_uv_cp_needed - ands r10, r5, #0x3f ;check to see if extra copy is needed - sub r11, r5, r10 - ldr r2, [sp] ;srcptr1 - ldr r3, [sp, #4] ;dstptr1 - bne extra_cp_src_to_dst_uv_width -end_of_cp_src_to_dst_uv - - subs r1, r1, #1 - - addne sp, sp, #8 - - ldrne r2, [sp] ;srcptr1 - ldrne r3, [sp, #4] ;dstptr1 - - bne cp_uv_loop - - add sp, sp, #8 - - vpop {d8 - d15} - pop {r4 - r11, pc} - -;============================= -extra_cp_src_to_dst_width - add r2, r2, r11 - add r3, r3, r11 - add r0, r8, r6 - add r11, r9, r7 - - mov lr, r4, lsr #1 -extra_cp_src_to_dst_height_loop - mov r8, r2 - mov r9, r3 - add r0, r8, r6 - add r11, r9, r7 - - mov r12, r10 - -extra_cp_src_to_dst_width_loop - vld1.8 {q0}, [r8]! - vld1.8 {q1}, [r0]! - - subs r12, r12, #16 - - vst1.8 {q0}, [r9]! - vst1.8 {q1}, [r11]! - bne extra_cp_src_to_dst_width_loop - - subs lr, lr, #1 - - add r2, r2, r6, lsl #1 - add r3, r3, r7, lsl #1 - - bne extra_cp_src_to_dst_height_loop - - b end_of_cp_src_to_dst - -;================================= -extra_cp_src_to_dst_uv_width - add r2, r2, r11 - add r3, r3, r11 - add r0, r8, r6 - add r11, r9, r7 - - mov lr, r4, lsr #1 -extra_cp_src_to_dst_height_uv_loop - mov r8, r2 - mov r9, r3 - add r0, r8, r6 - add r11, r9, r7 - - mov r12, r10 - -extra_cp_src_to_dst_width_uv_loop - vld1.8 {d0}, [r8]! - vld1.8 {d1}, [r0]! - - subs r12, r12, #8 - - vst1.8 {d0}, [r9]! - vst1.8 {d1}, [r11]! - bne extra_cp_src_to_dst_width_uv_loop - - subs lr, lr, #1 - - add r2, r2, r6, lsl #1 - add r3, r3, r7, lsl #1 - - bne extra_cp_src_to_dst_height_uv_loop - - b end_of_cp_src_to_dst_uv - - ENDP - END diff --git a/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm b/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm deleted file mode 100644 index d3306b669..000000000 --- a/vpx_scale/arm/neon/vp8_vpxyv12_copysrcframe_func_neon.asm +++ /dev/null @@ -1,259 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_yv12_copy_src_frame_func_neon| - ARM - REQUIRE8 - PRESERVE8 - - INCLUDE vpx_scale_asm_offsets.asm - - AREA ||.text||, CODE, READONLY, ALIGN=2 -;Note: This function is used to copy source data in src_buffer[i] at beginning -;of the encoding. The buffer has a width and height of cpi->oxcf.Width and -;cpi->oxcf.Height, which can be ANY numbers(NOT always multiples of 16 or 4). - -;void vp8_yv12_copy_src_frame_func_neon(const YV12_BUFFER_CONFIG *src_ybc, -; YV12_BUFFER_CONFIG *dst_ybc); - -|vp8_yv12_copy_src_frame_func_neon| PROC - push {r4 - r11, lr} - vpush {d8 - d15} - - ;Copy Y plane - ldr r4, [r0, #yv12_buffer_config_y_height] - ldr r5, [r0, #yv12_buffer_config_y_width] - ldr r6, [r0, #yv12_buffer_config_y_stride] - ldr r7, [r1, #yv12_buffer_config_y_stride] - ldr r2, [r0, #yv12_buffer_config_y_buffer] ;srcptr1 - ldr r3, [r1, #yv12_buffer_config_y_buffer] ;dstptr1 - - add r10, r2, r6 ;second row src - add r11, r3, r7 ;second row dst - mov r6, r6, lsl #1 - mov r7, r7, lsl #1 - sub r6, r6, r5 ;adjust stride - sub r7, r7, r5 - - ; copy two rows at one time - mov lr, r4, lsr #1 - -cp_src_to_dst_height_loop - mov r12, r5 - -cp_width_128_loop - vld1.8 {q0, q1}, [r2]! - vld1.8 {q4, q5}, [r10]! - vld1.8 {q2, q3}, [r2]! - vld1.8 {q6, q7}, [r10]! - vld1.8 {q8, q9}, [r2]! - vld1.8 {q12, q13}, [r10]! - vld1.8 {q10, q11}, [r2]! - vld1.8 {q14, q15}, [r10]! - sub r12, r12, #128 - cmp r12, #128 - vst1.8 {q0, q1}, [r3]! - vst1.8 {q4, q5}, [r11]! - vst1.8 {q2, q3}, [r3]! - vst1.8 {q6, q7}, [r11]! - vst1.8 {q8, q9}, [r3]! - vst1.8 {q12, q13}, [r11]! - vst1.8 {q10, q11}, [r3]! - vst1.8 {q14, q15}, [r11]! - bhs cp_width_128_loop - - cmp r12, #0 - beq cp_width_done - -cp_width_8_loop - vld1.8 {d0}, [r2]! - vld1.8 {d1}, [r10]! - sub r12, r12, #8 - cmp r12, #8 - vst1.8 {d0}, [r3]! - vst1.8 {d1}, [r11]! - bhs cp_width_8_loop - - cmp r12, #0 - beq cp_width_done - -cp_width_1_loop - ldrb r8, [r2], #1 - subs r12, r12, #1 - strb r8, [r3], #1 - ldrb r8, [r10], #1 - strb r8, [r11], #1 - bne cp_width_1_loop - -cp_width_done - subs lr, lr, #1 - add r2, r2, r6 - add r3, r3, r7 - add r10, r10, r6 - add r11, r11, r7 - bne cp_src_to_dst_height_loop - -;copy last line for Y if y_height is odd - tst r4, #1 - beq cp_width_done_1 - mov r12, r5 - -cp_width_128_loop_1 - vld1.8 {q0, q1}, [r2]! - vld1.8 {q2, q3}, [r2]! - vld1.8 {q8, q9}, [r2]! - vld1.8 {q10, q11}, [r2]! - sub r12, r12, #128 - cmp r12, #128 - vst1.8 {q0, q1}, [r3]! - vst1.8 {q2, q3}, [r3]! - vst1.8 {q8, q9}, [r3]! - vst1.8 {q10, q11}, [r3]! - bhs cp_width_128_loop_1 - - cmp r12, #0 - beq cp_width_done_1 - -cp_width_8_loop_1 - vld1.8 {d0}, [r2]! - sub r12, r12, #8 - cmp r12, #8 - vst1.8 {d0}, [r3]! - bhs cp_width_8_loop_1 - - cmp r12, #0 - beq cp_width_done_1 - -cp_width_1_loop_1 - ldrb r8, [r2], #1 - subs r12, r12, #1 - strb r8, [r3], #1 - bne cp_width_1_loop_1 -cp_width_done_1 - -;Copy U & V planes - ldr r4, [r0, #yv12_buffer_config_uv_height] - ldr r5, [r0, #yv12_buffer_config_uv_width] - ldr r6, [r0, #yv12_buffer_config_uv_stride] - ldr r7, [r1, #yv12_buffer_config_uv_stride] - ldr r2, [r0, #yv12_buffer_config_u_buffer] ;srcptr1 - ldr r3, [r1, #yv12_buffer_config_u_buffer] ;dstptr1 - - add r10, r2, r6 ;second row src - add r11, r3, r7 ;second row dst - mov r6, r6, lsl #1 - mov r7, r7, lsl #1 - sub r6, r6, r5 ;adjust stride - sub r7, r7, r5 - - mov r9, #2 - -cp_uv_loop - ;copy two rows at one time - mov lr, r4, lsr #1 - -cp_src_to_dst_height_uv_loop - mov r12, r5 - -cp_width_uv_64_loop - vld1.8 {q0, q1}, [r2]! - vld1.8 {q4, q5}, [r10]! - vld1.8 {q2, q3}, [r2]! - vld1.8 {q6, q7}, [r10]! - sub r12, r12, #64 - cmp r12, #64 - vst1.8 {q0, q1}, [r3]! - vst1.8 {q4, q5}, [r11]! - vst1.8 {q2, q3}, [r3]! - vst1.8 {q6, q7}, [r11]! - bhs cp_width_uv_64_loop - - cmp r12, #0 - beq cp_width_uv_done - -cp_width_uv_8_loop - vld1.8 {d0}, [r2]! - vld1.8 {d1}, [r10]! - sub r12, r12, #8 - cmp r12, #8 - vst1.8 {d0}, [r3]! - vst1.8 {d1}, [r11]! - bhs cp_width_uv_8_loop - - cmp r12, #0 - beq cp_width_uv_done - -cp_width_uv_1_loop - ldrb r8, [r2], #1 - subs r12, r12, #1 - strb r8, [r3], #1 - ldrb r8, [r10], #1 - strb r8, [r11], #1 - bne cp_width_uv_1_loop - -cp_width_uv_done - subs lr, lr, #1 - add r2, r2, r6 - add r3, r3, r7 - add r10, r10, r6 - add r11, r11, r7 - bne cp_src_to_dst_height_uv_loop - -;copy last line for U & V if uv_height is odd - tst r4, #1 - beq cp_width_uv_done_1 - mov r12, r5 - -cp_width_uv_64_loop_1 - vld1.8 {q0, q1}, [r2]! - vld1.8 {q2, q3}, [r2]! - sub r12, r12, #64 - cmp r12, #64 - vst1.8 {q0, q1}, [r3]! - vst1.8 {q2, q3}, [r3]! - bhs cp_width_uv_64_loop_1 - - cmp r12, #0 - beq cp_width_uv_done_1 - -cp_width_uv_8_loop_1 - vld1.8 {d0}, [r2]! - sub r12, r12, #8 - cmp r12, #8 - vst1.8 {d0}, [r3]! - bhs cp_width_uv_8_loop_1 - - cmp r12, #0 - beq cp_width_uv_done_1 - -cp_width_uv_1_loop_1 - ldrb r8, [r2], #1 - subs r12, r12, #1 - strb r8, [r3], #1 - bne cp_width_uv_1_loop_1 -cp_width_uv_done_1 - - subs r9, r9, #1 - ldrne r2, [r0, #yv12_buffer_config_v_buffer] ;srcptr1 - ldrne r3, [r1, #yv12_buffer_config_v_buffer] ;dstptr1 - ldrne r10, [r0, #yv12_buffer_config_uv_stride] - ldrne r11, [r1, #yv12_buffer_config_uv_stride] - - addne r10, r2, r10 ;second row src - addne r11, r3, r11 ;second row dst - - bne cp_uv_loop - - vpop {d8 - d15} - pop {r4 - r11, pc} - - ENDP - END diff --git a/vpx_scale/arm/neon/yv12extend_arm.c b/vpx_scale/arm/neon/yv12extend_arm.c deleted file mode 100644 index d408eb311..000000000 --- a/vpx_scale/arm/neon/yv12extend_arm.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_scale_rtcd.h" - -extern void vp8_yv12_copy_frame_func_neon( - const struct yv12_buffer_config *src_ybc, - struct yv12_buffer_config *dst_ybc); - -void vp8_yv12_copy_frame_neon(const struct yv12_buffer_config *src_ybc, - struct yv12_buffer_config *dst_ybc) { - vp8_yv12_copy_frame_func_neon(src_ybc, dst_ybc); - vp8_yv12_extend_frame_borders_c(dst_ybc); -} diff --git a/vpx_scale/vpx_scale.mk b/vpx_scale/vpx_scale.mk index 1fa41afba..0a1594bd8 100644 --- a/vpx_scale/vpx_scale.mk +++ b/vpx_scale/vpx_scale.mk @@ -9,11 +9,6 @@ SCALE_SRCS-yes += vpx_scale_asm_offsets.c SCALE_SRCS-yes += vpx_scale_rtcd.c SCALE_SRCS-yes += vpx_scale_rtcd.pl -#neon -SCALE_SRCS-$(HAVE_NEON_ASM) += arm/neon/vp8_vpxyv12_copyframe_func_neon$(ASM) -SCALE_SRCS-$(HAVE_NEON_ASM) += arm/neon/vp8_vpxyv12_copysrcframe_func_neon$(ASM) -SCALE_SRCS-$(HAVE_NEON_ASM) += arm/neon/yv12extend_arm.c - #mips(dspr2) SCALE_SRCS-$(HAVE_DSPR2) += mips/dspr2/yv12extend_dspr2.c diff --git a/vpx_scale/vpx_scale_rtcd.pl b/vpx_scale/vpx_scale_rtcd.pl index 5a7f973b2..d4a2b81a5 100644 --- a/vpx_scale/vpx_scale_rtcd.pl +++ b/vpx_scale/vpx_scale_rtcd.pl @@ -19,8 +19,6 @@ if (vpx_config("CONFIG_SPATIAL_RESAMPLING") eq "yes") { add_proto qw/void vp8_yv12_extend_frame_borders/, "struct yv12_buffer_config *ybf"; add_proto qw/void vp8_yv12_copy_frame/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"; -specialize qw/vp8_yv12_copy_frame neon_asm/; -$vp8_yv12_copy_frame_neon_asm=vp8_yv12_copy_frame_neon; add_proto qw/void vpx_yv12_copy_y/, "const struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc"; |