summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- build/make/Makefile 30
-rw-r--r--[-rwxr-xr-x] build/make/configure.sh 32
-rwxr-xr-x configure 22
-rw-r--r-- examples/vpx_temporal_svc_encoder.c 2
-rw-r--r-- test/fdct8x8_test.cc 2
-rw-r--r-- test/variance_test.cc 12
-rw-r--r-- vp8/common/onyx.h 4
-rw-r--r-- vp8/encoder/denoising.c 50
-rw-r--r-- vp8/encoder/denoising.h 26
-rw-r--r-- vp8/encoder/encodeframe.c 13
-rw-r--r-- vp8/encoder/ethreading.c 15
-rw-r--r-- vp8/encoder/onyx_if.c 31
-rw-r--r-- vp8/encoder/onyx_int.h 2
-rw-r--r-- vp8/encoder/pickinter.c 10
-rw-r--r-- vp9/common/vp9_rtcd_defs.pl 10
-rw-r--r-- vp9/encoder/arm/neon/vp9_dct_neon.c 223
-rw-r--r-- vp9/encoder/arm/neon/vp9_variance_neon.c 129
-rw-r--r-- vp9/encoder/vp9_firstpass.c 50
-rw-r--r-- vp9/encoder/vp9_pickmode.c 4
-rw-r--r-- vp9/vp9cx.mk 2
-rw-r--r-- vpx_ports/vpx_once.h 27
21 files changed, 611 insertions, 85 deletions
diff --git a/build/make/Makefile b/build/make/Makefile
index 9efa0ec02..ed90397f0 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -118,20 +118,26 @@ testdata::
utiltest:
# Add compiler flags for intrinsic files
+ifeq ($(TOOLCHAIN), x86-os2-gcc)
+STACKREALIGN=-mstackrealign
+else
+STACKREALIGN=
+endif
+
$(BUILD_PFX)%_mmx.c.d: CFLAGS += -mmmx
$(BUILD_PFX)%_mmx.c.o: CFLAGS += -mmmx
-$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2
-$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2
-$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3
-$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3
-$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3
-$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3
-$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1
-$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1
-$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx
-$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx
-$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2
-$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2
+$(BUILD_PFX)%_sse2.c.d: CFLAGS += -msse2 $(STACKREALIGN)
+$(BUILD_PFX)%_sse2.c.o: CFLAGS += -msse2 $(STACKREALIGN)
+$(BUILD_PFX)%_sse3.c.d: CFLAGS += -msse3 $(STACKREALIGN)
+$(BUILD_PFX)%_sse3.c.o: CFLAGS += -msse3 $(STACKREALIGN)
+$(BUILD_PFX)%_ssse3.c.d: CFLAGS += -mssse3 $(STACKREALIGN)
+$(BUILD_PFX)%_ssse3.c.o: CFLAGS += -mssse3 $(STACKREALIGN)
+$(BUILD_PFX)%_sse4.c.d: CFLAGS += -msse4.1 $(STACKREALIGN)
+$(BUILD_PFX)%_sse4.c.o: CFLAGS += -msse4.1 $(STACKREALIGN)
+$(BUILD_PFX)%_avx.c.d: CFLAGS += -mavx $(STACKREALIGN)
+$(BUILD_PFX)%_avx.c.o: CFLAGS += -mavx $(STACKREALIGN)
+$(BUILD_PFX)%_avx2.c.d: CFLAGS += -mavx2 $(STACKREALIGN)
+$(BUILD_PFX)%_avx2.c.o: CFLAGS += -mavx2 $(STACKREALIGN)
$(BUILD_PFX)%.c.d: %.c
$(if $(quiet),@echo " [DEP] $@")
diff --git a/build/make/configure.sh b/build/make/configure.sh
index d25f31333..6bc8509ab 100755..100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -381,8 +381,8 @@ EOF
# tests for -m$1 toggling the feature given in $2. If $2 is empty $1 is used.
check_gcc_machine_option() {
- local opt="$1"
- local feature="$2"
+ opt="$1"
+ feature="$2"
[ -n "$feature" ] || feature="$opt"
if enabled gcc && ! disabled "$feature" && ! check_cflags "-m$opt"; then
@@ -419,8 +419,8 @@ true
}
write_common_target_config_mk() {
- local CC="${CC}"
- local CXX="${CXX}"
+ saved_CC="${CC}"
+ saved_CXX="${CXX}"
enabled ccache && CC="ccache ${CC}"
enabled ccache && CXX="ccache ${CXX}"
print_webm_license $1 "##" ""
@@ -470,6 +470,8 @@ EOF
enabled msvs && echo "CONFIG_VS_VERSION=${vs_version}" >> "${1}"
+ CC="${saved_CC}"
+ CXX="${saved_CXX}"
}
@@ -1314,8 +1316,9 @@ process_toolchain() {
}
print_config_mk() {
- local prefix=$1
- local makefile=$2
+ saved_prefix="${prefix}"
+ prefix=$1
+ makefile=$2
shift 2
for cfg; do
if enabled $cfg; then
@@ -1323,11 +1326,13 @@ print_config_mk() {
echo "${prefix}_${upname}=yes" >> $makefile
fi
done
+ prefix="${saved_prefix}"
}
print_config_h() {
- local prefix=$1
- local header=$2
+ saved_prefix="${prefix}"
+ prefix=$1
+ header=$2
shift 2
for cfg; do
upname="`toupper $cfg`"
@@ -1337,10 +1342,11 @@ print_config_h() {
echo "#define ${prefix}_${upname} 0" >> $header
fi
done
+ prefix="${saved_prefix}"
}
print_config_vars_h() {
- local header=$1
+ header=$1
shift
while [ $# -gt 0 ]; do
upname="`toupper $1`"
@@ -1350,9 +1356,10 @@ print_config_vars_h() {
}
print_webm_license() {
- local destination=$1
- local prefix="$2"
- local suffix="$3"
+ saved_prefix="${prefix}"
+ destination=$1
+ prefix="$2"
+ suffix="$3"
shift 3
cat <<EOF > ${destination}
${prefix} Copyright (c) 2011 The WebM project authors. All Rights Reserved.${suffix}
@@ -1363,6 +1370,7 @@ ${prefix} tree. An additional intellectual property rights grant can be found${s
${prefix} in the file PATENTS. All contributing project authors may${suffix}
${prefix} be found in the AUTHORS file in the root of the source tree.${suffix}
EOF
+ prefix="${saved_prefix}"
}
process_targets() {
diff --git a/configure b/configure
index d570081d4..d650eeb70 100755
--- a/configure
+++ b/configure
@@ -67,10 +67,10 @@ Codecs:
EOF
#restore editor state '
- local family;
- local last_family;
- local c;
- local str;
+ family="";
+ last_family="";
+ c="";
+ str="";
for c in ${CODECS}; do
family=${c%_*}
if [ "${family}" != "${last_family}" ]; then
@@ -412,7 +412,7 @@ process_cmdline() {
}
post_process_cmdline() {
- local c
+ c=""
# If the codec family is disabled, disable all components of that family.
# If the codec family is enabled, enable all components of that family.
@@ -459,8 +459,8 @@ process_targets() {
enabled universal && echo "FAT_ARCHS=${fat_bin_archs}" >> config.mk
# Calculate the default distribution name, based on the enabled features
- local cf
- local DIST_DIR=vpx
+ cf=""
+ DIST_DIR=vpx
for cf in $CODEC_FAMILIES; do
if enabled ${cf}_encoder && enabled ${cf}_decoder; then
DIST_DIR="${DIST_DIR}-${cf}"
@@ -482,7 +482,7 @@ process_targets() {
;;
esac
if [ -f "${source_path}/build/make/version.sh" ]; then
- local ver=`"$source_path/build/make/version.sh" --bare "$source_path"`
+ ver=`"$source_path/build/make/version.sh" --bare "$source_path"`
DIST_DIR="${DIST_DIR}-${ver}"
VERSION_STRING=${ver}
ver=${ver%%-*}
@@ -516,7 +516,7 @@ EOF
# Write makefiles for all enabled targets
#
for tgt in libs examples docs solution; do
- local tgt_fn="$tgt-$toolchain.mk"
+ tgt_fn="$tgt-$toolchain.mk"
if enabled $tgt; then
echo "Creating makefiles for ${toolchain} ${tgt}"
@@ -555,7 +555,7 @@ process_detect() {
true;
;;
*)
- local result=false
+ result=false
for d in "$@"; do
[ -f "${d##-I}/$header" ] && result=true && break
done
@@ -604,7 +604,7 @@ process_toolchain() {
# Handle universal binaries for this architecture
case $toolchain in
universal-darwin*)
- local darwin_ver=${tgt_os##darwin}
+ darwin_ver=${tgt_os##darwin}
# Snow Leopard (10.6/darwin10) dropped support for PPC
# Include PPC support for all prior versions
diff --git a/examples/vpx_temporal_svc_encoder.c b/examples/vpx_temporal_svc_encoder.c
index a7ad9f0c6..be3e7b2f1 100644
--- a/examples/vpx_temporal_svc_encoder.c
+++ b/examples/vpx_temporal_svc_encoder.c
@@ -579,7 +579,7 @@ int main(int argc, char **argv) {
if (strncmp(encoder->name, "vp8", 3) == 0) {
vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
- vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYOnly);
+ vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kDenoiserOnYOnly);
} else if (strncmp(encoder->name, "vp9", 3) == 0) {
vpx_codec_control(&codec, VP8E_SET_CPUUSED, speed);
vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 3);
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 6a5d6bb98..567e5f698 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -337,7 +337,7 @@ INSTANTIATE_TEST_CASE_P(
INSTANTIATE_TEST_CASE_P(
NEON, FwdTrans8x8DCT,
::testing::Values(
- make_tuple(&vp9_fdct8x8_c, &vp9_idct8x8_64_add_neon, 0)));
+ make_tuple(&vp9_fdct8x8_neon, &vp9_idct8x8_64_add_neon, 0)));
INSTANTIATE_TEST_CASE_P(
DISABLED_NEON, FwdTrans8x8HT,
::testing::Values(
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 9dc7c6a45..83b7435e6 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -756,6 +756,18 @@ INSTANTIATE_TEST_CASE_P(
::testing::Values(make_tuple(5, 5, subpel_avg_variance32x32_avx2),
make_tuple(6, 6, subpel_avg_variance64x64_avx2)));
#endif // HAVE_AVX2
+#if HAVE_NEON
+const vp9_variance_fn_t variance16x16_neon = vp9_variance16x16_neon;
+INSTANTIATE_TEST_CASE_P(
+ NEON, VP9VarianceTest,
+ ::testing::Values(make_tuple(4, 4, variance16x16_neon)));
+
+const vp9_subpixvariance_fn_t subpel_variance16x16_neon =
+ vp9_sub_pixel_variance16x16_neon;
+INSTANTIATE_TEST_CASE_P(
+ NEON, VP9SubpelVarianceTest,
+ ::testing::Values(make_tuple(4, 4, subpel_variance16x16_neon)));
+#endif // HAVE_NEON
#endif // CONFIG_VP9_ENCODER
} // namespace vp9
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index 7d9441d54..ef7f61b12 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -108,8 +108,8 @@ extern "C"
* For temporal denoiser: noise_sensitivity = 0 means off,
* noise_sensitivity = 1 means temporal denoiser on for Y channel only,
* noise_sensitivity = 2 means temporal denoiser on for all channels.
- * noise_sensitivity = 3 will be used for aggressive mode in future.
- * Temporal denoiser is enabled via the build option
+ * noise_sensitivity = 3 means aggressive denoising mode.
+ * Temporal denoiser is enabled via the configuration option:
* CONFIG_TEMPORAL_DENOISING.
* For spatial denoiser: noise_sensitivity controls the amount of
* pre-processing blur: noise_sensitivity = 0 means off.
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index 1a401a4b9..c4c0de81b 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -8,6 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <limits.h>
+
#include "denoising.h"
#include "vp8/common/reconinter.h"
@@ -333,12 +335,33 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv,
return FILTER_BLOCK;
}
+void vp8_denoiser_set_parameters(VP8_DENOISER *denoiser) {
+ if (!denoiser->aggressive_mode) {
+ denoiser->denoise_pars.scale_sse_thresh = 1;
+ denoiser->denoise_pars.scale_motion_thresh = 8;
+ denoiser->denoise_pars.scale_increase_filter = 0;
+ denoiser->denoise_pars.denoise_mv_bias = 95;
+ denoiser->denoise_pars.pickmode_mv_bias = 100;
+ denoiser->denoise_pars.qp_thresh = 0;
+ denoiser->denoise_pars.consec_zerolast = UINT_MAX;
+ } else {
+ denoiser->denoise_pars.scale_sse_thresh = 2;
+ denoiser->denoise_pars.scale_motion_thresh = 16;
+ denoiser->denoise_pars.scale_increase_filter = 1;
+ denoiser->denoise_pars.denoise_mv_bias = 60;
+ denoiser->denoise_pars.pickmode_mv_bias = 60;
+ denoiser->denoise_pars.qp_thresh = 100;
+ denoiser->denoise_pars.consec_zerolast = 10;
+ }
+}
+
int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
- int num_mb_rows, int num_mb_cols)
+ int num_mb_rows, int num_mb_cols, int mode)
{
int i;
assert(denoiser);
denoiser->num_mb_cols = num_mb_cols;
+ denoiser->aggressive_mode = mode;
for (i = 0; i < MAX_REF_FRAMES; i++)
{
@@ -369,10 +392,11 @@ int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
denoiser->denoise_state = vpx_calloc((num_mb_rows * num_mb_cols), 1);
vpx_memset(denoiser->denoise_state, 0, (num_mb_rows * num_mb_cols));
-
+ vp8_denoiser_set_parameters(denoiser);
return 0;
}
+
void vp8_denoiser_free(VP8_DENOISER *denoiser)
{
int i;
@@ -401,6 +425,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
{
int mv_row;
int mv_col;
+ unsigned int motion_threshold;
unsigned int motion_magnitude2;
unsigned int sse_thresh;
int sse_diff_thresh = 0;
@@ -424,7 +449,7 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
MB_MODE_INFO *mbmi = &filter_xd->mode_info_context->mbmi;
int sse_diff = 0;
// Bias on zero motion vector sse.
- int zero_bias = 95;
+ const int zero_bias = denoiser->denoise_pars.denoise_mv_bias;
zero_mv_sse = (unsigned int)((int64_t)zero_mv_sse * zero_bias / 100);
sse_diff = zero_mv_sse - best_sse;
@@ -502,14 +527,19 @@ void vp8_denoiser_denoise_mb(VP8_DENOISER *denoiser,
mv_row = x->best_sse_mv.as_mv.row;
mv_col = x->best_sse_mv.as_mv.col;
motion_magnitude2 = mv_row * mv_row + mv_col * mv_col;
- sse_thresh = SSE_THRESHOLD;
- if (x->increase_denoising) sse_thresh = SSE_THRESHOLD_HIGH;
+ motion_threshold = denoiser->denoise_pars.scale_motion_thresh *
+ NOISE_MOTION_THRESHOLD;
- if (best_sse > sse_thresh || motion_magnitude2
- > 8 * NOISE_MOTION_THRESHOLD)
- {
- decision = COPY_BLOCK;
- }
+ if (motion_magnitude2 <
+ denoiser->denoise_pars.scale_increase_filter * NOISE_MOTION_THRESHOLD)
+ x->increase_denoising = 1;
+
+ sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD;
+ if (x->increase_denoising)
+ sse_thresh = denoiser->denoise_pars.scale_sse_thresh * SSE_THRESHOLD_HIGH;
+
+ if (best_sse > sse_thresh || motion_magnitude2 > motion_threshold)
+ decision = COPY_BLOCK;
if (decision == FILTER_BLOCK)
{
diff --git a/vp8/encoder/denoising.h b/vp8/encoder/denoising.h
index a1f195b72..1a42f86d3 100644
--- a/vp8/encoder/denoising.h
+++ b/vp8/encoder/denoising.h
@@ -39,16 +39,40 @@ enum vp8_denoiser_filter_state {
kFilterNonZeroMV
};
+typedef struct {
+ // Scale factor on sse threshold above which no denoising is done.
+ unsigned int scale_sse_thresh;
+ // Scale factor on motion magnitude threshold above which no
+ // denoising is done.
+ unsigned int scale_motion_thresh;
+ // Scale factor on motion magnitude below which we increase the strength of
+ // the temporal filter (in function vp8_denoiser_filter).
+ unsigned int scale_increase_filter;
+ // Scale factor to bias to ZEROMV for denoising.
+ unsigned int denoise_mv_bias;
+ // Scale factor to bias to ZEROMV for coding mode selection.
+ unsigned int pickmode_mv_bias;
+ // Quantizer threshold below which we use the segmentation map to switch off
+ // loop filter for blocks that have been coded as ZEROMV-LAST a certain number
+ // (consec_zerolast) of consecutive frames. Note that the delta-QP is set to
+ // 0 when segmentation map is used for shutting off loop filter.
+ unsigned int qp_thresh;
+ // Threshold for number of consecutive frames for blocks coded as ZEROMV-LAST.
+ unsigned int consec_zerolast;
+} denoise_params;
+
typedef struct vp8_denoiser
{
YV12_BUFFER_CONFIG yv12_running_avg[MAX_REF_FRAMES];
YV12_BUFFER_CONFIG yv12_mc_running_avg;
unsigned char* denoise_state;
int num_mb_cols;
+ int aggressive_mode;
+ denoise_params denoise_pars;
} VP8_DENOISER;
int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height,
- int num_mb_rows, int num_mb_cols);
+ int num_mb_rows, int num_mb_cols, int mode);
void vp8_denoiser_free(VP8_DENOISER *denoiser);
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index e6b0f9b64..aec6b9880 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -522,6 +522,19 @@ void encode_mb_row(VP8_COMP *cpi,
}
#endif
+ // Keep track of how many (consecutive) times a block is coded
+ // as ZEROMV_LASTREF, for base layer frames.
+ // Reset to 0 if it's coded as anything else.
+ if (cpi->current_layer == 0) {
+ if (xd->mode_info_context->mbmi.mode == ZEROMV &&
+ xd->mode_info_context->mbmi.ref_frame == LAST_FRAME) {
+ // Increment, check for wrap-around.
+ if (cpi->consec_zero_last[map_index+mb_col] < 255)
+ cpi->consec_zero_last[map_index+mb_col] += 1;
+ } else {
+ cpi->consec_zero_last[map_index+mb_col] = 0;
+ }
+ }
/* Special case code for cyclic refresh
* If cyclic update enabled then copy xd->mbmi.segment_id; (which
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index d4b17cef1..7b8b51f30 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -206,6 +206,21 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
}
#endif
+ // Keep track of how many (consecutive) times a block
+ // is coded as ZEROMV_LASTREF, for base layer frames.
+ // Reset to 0 if it's coded as anything else.
+ if (cpi->current_layer == 0) {
+ if (xd->mode_info_context->mbmi.mode == ZEROMV &&
+ xd->mode_info_context->mbmi.ref_frame ==
+ LAST_FRAME) {
+ // Increment, check for wrap-around.
+ if (cpi->consec_zero_last[map_index+mb_col] < 255)
+ cpi->consec_zero_last[map_index+mb_col] +=
+ 1;
+ } else {
+ cpi->consec_zero_last[map_index+mb_col] = 0;
+ }
+ }
/* Special case code for cyclic refresh
* If cyclic update enabled then copy
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 469d0d6e9..e81c05e6f 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -613,6 +613,24 @@ static void cyclic_background_refresh(VP8_COMP *cpi, int Q, int lf_adjustment)
while(block_count && i != cpi->cyclic_refresh_mode_index);
cpi->cyclic_refresh_mode_index = i;
+
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->denoiser.aggressive_mode != 0 &&
+ Q < cpi->denoiser.denoise_pars.qp_thresh) {
+ // Under aggressive denoising mode, use segmentation to turn off loop
+ // filter below some qp thresh. The loop filter is turned off for all
+ // blocks that have been encoded as ZEROMV LAST x frames in a row,
+ // where x is set by cpi->denoiser.denoise_pars.consec_zerolast.
+ // This is to avoid "dot" artifacts that can occur from repeated
+ // loop filtering on noisy input source.
+ cpi->cyclic_refresh_q = Q;
+ lf_adjustment = -MAX_LOOP_FILTER;
+ for (i = 0; i < mbs_in_frame; ++i) {
+ seg_map[i] = (cpi->consec_zero_last[i] >
+ cpi->denoiser.denoise_pars.consec_zerolast) ? 1 : 0;
+ }
+ }
+#endif
}
/* Activate segmentation. */
@@ -1752,7 +1770,8 @@ void vp8_change_config(VP8_COMP *cpi, VP8_CONFIG *oxcf)
int width = (cpi->oxcf.Width + 15) & ~15;
int height = (cpi->oxcf.Height + 15) & ~15;
vp8_denoiser_allocate(&cpi->denoiser, width, height,
- cpi->common.mb_rows, cpi->common.mb_cols);
+ cm->mb_rows, cm->mb_cols,
+ ((cpi->oxcf.noise_sensitivity == 3) ? 1 : 0));
}
}
#endif
@@ -1896,6 +1915,9 @@ struct VP8_COMP* vp8_create_compressor(VP8_CONFIG *oxcf)
else
cpi->cyclic_refresh_map = (signed char *) NULL;
+ CHECK_MEM_ERROR(cpi->consec_zero_last,
+ vpx_calloc(cpi->common.mb_rows * cpi->common.mb_cols, 1));
+
#ifdef VP8_ENTROPY_STATS
init_context_counters();
#endif
@@ -2416,6 +2438,7 @@ void vp8_remove_compressor(VP8_COMP **ptr)
vpx_free(cpi->mb.ss);
vpx_free(cpi->tok);
vpx_free(cpi->cyclic_refresh_map);
+ vpx_free(cpi->consec_zero_last);
vp8_remove_common(&cpi->common);
vpx_free(cpi);
@@ -3478,6 +3501,9 @@ static void encode_frame_to_data_rate
{
cpi->mb.rd_thresh_mult[i] = 128;
}
+
+ // Reset the zero_last counter to 0 on key frame.
+ vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
}
#if 0
@@ -3899,6 +3925,7 @@ static void encode_frame_to_data_rate
#endif
+
#ifdef OUTPUT_YUV_SRC
vp8_write_yuv_frame(yuv_file, cpi->Source);
#endif
@@ -3994,6 +4021,8 @@ static void encode_frame_to_data_rate
else
disable_segmentation(cpi);
}
+ // Reset the consec_zero_last counter on key frame.
+ vpx_memset(cpi->consec_zero_last, 0, cm->mb_rows * cm->mb_cols);
vp8_set_quantizer(cpi, Q);
}
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index df17dff34..7a8baca77 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -511,6 +511,8 @@ typedef struct VP8_COMP
int cyclic_refresh_mode_index;
int cyclic_refresh_q;
signed char *cyclic_refresh_map;
+ // Count of how many (consecutive) times a macroblock uses ZEROMV_LAST.
+ unsigned char *consec_zero_last;
// Frame counter for the temporal pattern. Counter is rest when the temporal
// layers are changed dynamically (run-time change).
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 86108b70a..ec1ea146f 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -40,7 +40,6 @@ extern const MB_PREDICTION_MODE vp8_mode_order[MAX_MODES];
extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]);
-
int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d,
int_mv *bestmv, int_mv *ref_mv,
int error_per_bit,
@@ -694,6 +693,13 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
*/
calculate_zeromv_rd_adjustment(cpi, x, &rd_adjustment);
+#if CONFIG_TEMPORAL_DENOISING
+ if (cpi->oxcf.noise_sensitivity) {
+ rd_adjustment = (int)(rd_adjustment *
+ cpi->denoiser.denoise_pars.pickmode_mv_bias / 100);
+ }
+#endif
+
/* if we encode a new mv this is important
* find the best new motion vector
*/
@@ -1168,7 +1174,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
#if CONFIG_TEMPORAL_DENOISING
if (cpi->oxcf.noise_sensitivity)
{
- int uv_denoise = (cpi->oxcf.noise_sensitivity == 2) ? 1 : 0;
+ int uv_denoise = (cpi->oxcf.noise_sensitivity >= 2) ? 1 : 0;
int block_index = mb_row * cpi->common.mb_cols + mb_col;
if (x->best_sse_inter_mode == DC_PRED)
{
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 666afa4dc..d3d874dfc 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -420,7 +420,7 @@ add_proto qw/unsigned int vp9_variance64x64/, "const uint8_t *src_ptr, int sourc
specialize qw/vp9_variance64x64 avx2/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_variance16x16 mmx avx2/, "$sse2_x86inc";
+specialize qw/vp9_variance16x16 mmx avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance16x8 mmx/, "$sse2_x86inc";
@@ -435,7 +435,7 @@ add_proto qw/void vp9_get8x8var/, "const uint8_t *src_ptr, int source_stride, co
specialize qw/vp9_get8x8var mmx/, "$sse2_x86inc";
add_proto qw/void vp9_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-specialize qw/vp9_get16x16var avx2/, "$sse2_x86inc";
+specialize qw/vp9_get16x16var avx2 neon/, "$sse2_x86inc";
add_proto qw/unsigned int vp9_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
specialize qw/vp9_variance8x4/, "$sse2_x86inc";
@@ -483,7 +483,7 @@ add_proto qw/unsigned int vp9_sub_pixel_avg_variance32x32/, "const uint8_t *src_
specialize qw/vp9_sub_pixel_avg_variance32x32/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-specialize qw/vp9_sub_pixel_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vp9_sub_pixel_variance16x16 neon/, "$sse2_x86inc", "$ssse3_x86inc";
add_proto qw/unsigned int vp9_sub_pixel_avg_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred";
specialize qw/vp9_sub_pixel_avg_variance16x16/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -757,10 +757,10 @@ add_proto qw/void vp9_fdct4x4/, "const int16_t *input, int16_t *output, int stri
specialize qw/vp9_fdct4x4 sse2/;
add_proto qw/void vp9_fdct8x8_1/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct8x8_1 sse2/;
+specialize qw/vp9_fdct8x8_1 sse2 neon/;
add_proto qw/void vp9_fdct8x8/, "const int16_t *input, int16_t *output, int stride";
-specialize qw/vp9_fdct8x8 sse2/, "$ssse3_x86_64";
+specialize qw/vp9_fdct8x8 sse2 neon/, "$ssse3_x86_64";
add_proto qw/void vp9_fdct16x16_1/, "const int16_t *input, int16_t *output, int stride";
specialize qw/vp9_fdct16x16_1 sse2/;
diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c
new file mode 100644
index 000000000..6c66f5d5b
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -0,0 +1,223 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+
+#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_idct.h"
+
+void vp9_fdct8x8_1_neon(const int16_t *input, int16_t *output, int stride) {
+ int r;
+ int16x8_t sum = vld1q_s16(&input[0]);
+ for (r = 1; r < 8; ++r) {
+ const int16x8_t input_00 = vld1q_s16(&input[r * stride]);
+ sum = vaddq_s16(sum, input_00);
+ }
+ {
+ const int32x4_t a = vpaddlq_s16(sum);
+ const int64x2_t b = vpaddlq_s32(a);
+ const int32x2_t c = vadd_s32(vreinterpret_s32_s64(vget_low_s64(b)),
+ vreinterpret_s32_s64(vget_high_s64(b)));
+ output[0] = vget_lane_s16(vreinterpret_s16_s32(c), 0);
+ output[1] = 0;
+ }
+}
+
+void vp9_fdct8x8_neon(const int16_t *input, int16_t *final_output, int stride) {
+ int i;
+ // stage 1
+ int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+ int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+ int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+ int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+ int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+ int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+ int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+ int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+ for (i = 0; i < 2; ++i) {
+ int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
+ const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
+ const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
+ const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
+ const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
+ const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
+ const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
+ const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
+ const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
+ // fdct4(step, step);
+ int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
+ int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
+ int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
+ int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
+ // fdct4(step, step);
+ int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+ int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+ int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+ int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+ int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_24_64);
+ int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_24_64);
+ int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_24_64);
+ int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_24_64);
+ v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), (int16_t)cospi_8_64);
+ v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), (int16_t)cospi_8_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), (int16_t)cospi_8_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), (int16_t)cospi_8_64);
+ v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
+ v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
+ v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
+ v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+ const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+ const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+ const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+ out_0 = vcombine_s16(a, c); // 00 01 02 03 40 41 42 43
+ out_2 = vcombine_s16(e, g); // 20 21 22 23 60 61 62 63
+ out_4 = vcombine_s16(b, d); // 04 05 06 07 44 45 46 47
+ out_6 = vcombine_s16(f, h); // 24 25 26 27 64 65 66 67
+ }
+ // Stage 2
+ v_x0 = vsubq_s16(v_s6, v_s5);
+ v_x1 = vaddq_s16(v_s6, v_s5);
+ v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), (int16_t)cospi_16_64);
+ v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), (int16_t)cospi_16_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_16_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_16_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x8_t ab = vcombine_s16(a, b);
+ const int16x8_t cd = vcombine_s16(c, d);
+ // Stage 3
+ v_x0 = vaddq_s16(v_s4, ab);
+ v_x1 = vsubq_s16(v_s4, ab);
+ v_x2 = vsubq_s16(v_s7, cd);
+ v_x3 = vaddq_s16(v_s7, cd);
+ }
+ // Stage 4
+ v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_4_64);
+ v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_4_64);
+ v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), (int16_t)cospi_28_64);
+ v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), (int16_t)cospi_28_64);
+ v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), (int16_t)cospi_12_64);
+ v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), (int16_t)cospi_12_64);
+ v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), (int16_t)cospi_20_64);
+ v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), (int16_t)cospi_20_64);
+ v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), (int16_t)cospi_12_64);
+ v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), (int16_t)cospi_12_64);
+ v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), (int16_t)cospi_20_64);
+ v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), (int16_t)cospi_20_64);
+ v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), (int16_t)cospi_28_64);
+ v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), (int16_t)cospi_28_64);
+ v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), (int16_t)cospi_4_64);
+ v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), (int16_t)cospi_4_64);
+ {
+ const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+ const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+ const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+ const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+ const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+ const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+ const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+ const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+ out_1 = vcombine_s16(a, c); // 10 11 12 13 50 51 52 53
+ out_3 = vcombine_s16(e, g); // 30 31 32 33 70 71 72 73
+ out_5 = vcombine_s16(b, d); // 14 15 16 17 54 55 56 57
+ out_7 = vcombine_s16(f, h); // 34 35 36 37 74 75 76 77
+ }
+ // transpose 8x8
+ {
+ // 00 01 02 03 40 41 42 43
+ // 10 11 12 13 50 51 52 53
+ // 20 21 22 23 60 61 62 63
+ // 30 31 32 33 70 71 72 73
+ // 04 05 06 07 44 45 46 47
+ // 14 15 16 17 54 55 56 57
+ // 24 25 26 27 64 65 66 67
+ // 34 35 36 37 74 75 76 77
+ const int32x4x2_t r02_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_0),
+ vreinterpretq_s32_s16(out_2));
+ const int32x4x2_t r13_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_1),
+ vreinterpretq_s32_s16(out_3));
+ const int32x4x2_t r46_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_4),
+ vreinterpretq_s32_s16(out_6));
+ const int32x4x2_t r57_s32 = vtrnq_s32(vreinterpretq_s32_s16(out_5),
+ vreinterpretq_s32_s16(out_7));
+ const int16x8x2_t r01_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
+ vreinterpretq_s16_s32(r13_s32.val[0]));
+ const int16x8x2_t r23_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
+ vreinterpretq_s16_s32(r13_s32.val[1]));
+ const int16x8x2_t r45_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
+ vreinterpretq_s16_s32(r57_s32.val[0]));
+ const int16x8x2_t r67_s16 =
+ vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
+ vreinterpretq_s16_s32(r57_s32.val[1]));
+ input_0 = r01_s16.val[0];
+ input_1 = r01_s16.val[1];
+ input_2 = r23_s16.val[0];
+ input_3 = r23_s16.val[1];
+ input_4 = r45_s16.val[0];
+ input_5 = r45_s16.val[1];
+ input_6 = r67_s16.val[0];
+ input_7 = r67_s16.val[1];
+ // 00 10 20 30 40 50 60 70
+ // 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72
+ // 03 13 23 33 43 53 63 73
+ // 04 14 24 34 44 54 64 74
+ // 05 15 25 35 45 55 65 75
+ // 06 16 26 36 46 56 66 76
+ // 07 17 27 37 47 57 67 77
+ }
+ } // for
+ {
+ // from vp9_dct_sse2.c
+ // Post-condition (division by two)
+ // division by two of 16-bit signed numbers using shifts
+ // n / 2 = (n - (n >> 15)) >> 1
+ const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
+ const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
+ const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
+ const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
+ const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
+ const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
+ const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
+ const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
+ input_0 = vhsubq_s16(input_0, sign_in0);
+ input_1 = vhsubq_s16(input_1, sign_in1);
+ input_2 = vhsubq_s16(input_2, sign_in2);
+ input_3 = vhsubq_s16(input_3, sign_in3);
+ input_4 = vhsubq_s16(input_4, sign_in4);
+ input_5 = vhsubq_s16(input_5, sign_in5);
+ input_6 = vhsubq_s16(input_6, sign_in6);
+ input_7 = vhsubq_s16(input_7, sign_in7);
+ // store results
+ vst1q_s16(&final_output[0 * 8], input_0);
+ vst1q_s16(&final_output[1 * 8], input_1);
+ vst1q_s16(&final_output[2 * 8], input_2);
+ vst1q_s16(&final_output[3 * 8], input_3);
+ vst1q_s16(&final_output[4 * 8], input_4);
+ vst1q_s16(&final_output[5 * 8], input_5);
+ vst1q_s16(&final_output[6 * 8], input_6);
+ vst1q_s16(&final_output[7 * 8], input_7);
+ }
+}
+
diff --git a/vp9/encoder/arm/neon/vp9_variance_neon.c b/vp9/encoder/arm/neon/vp9_variance_neon.c
new file mode 100644
index 000000000..f6871188b
--- /dev/null
+++ b/vp9/encoder/arm/neon/vp9_variance_neon.c
@@ -0,0 +1,129 @@
+/*
+ * Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+#include "./vp9_rtcd.h"
+
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_filter.h"
+
+#include "vp9/encoder/vp9_variance.h"
+
+// Compile-time geometry shared by the 16x16 NEON kernels in this file.
+enum {
+  kWidth16 = 16,          // block width in pixels
+  kHeight16 = 16,         // block height in pixels
+  kHeight16PlusOne = 17,  // rows produced by the first bilinear pass
+  kPixelStepOne = 1       // tap spacing used by the first bilinear pass
+};
+
+// Reduces the eight signed 16-bit lanes of v_16x8 to a single int total.
+static INLINE int horizontal_add_s16x8(const int16x8_t v_16x8) {
+  // Widen while adding neighbours: 8 x s16 -> 4 x s32 -> 2 x s64.
+  const int32x4_t pairs_32 = vpaddlq_s16(v_16x8);
+  const int64x2_t pairs_64 = vpaddlq_s32(pairs_32);
+  // View each 64-bit partial sum as two 32-bit lanes and add the two views;
+  // lane 0 of that addition is the value handed back to the caller.
+  const int32x2_t lo_view = vreinterpret_s32_s64(vget_low_s64(pairs_64));
+  const int32x2_t hi_view = vreinterpret_s32_s64(vget_high_s64(pairs_64));
+  return vget_lane_s32(vadd_s32(lo_view, hi_view), 0);
+}
+
+// Reduces the four signed 32-bit lanes of v_32x4 to a single int total.
+static INLINE int horizontal_add_s32x4(const int32x4_t v_32x4) {
+  // Add neighbouring lanes into two 64-bit partial sums.
+  const int64x2_t pairs_64 = vpaddlq_s32(v_32x4);
+  // View each partial sum as two 32-bit lanes and add the two views;
+  // lane 0 of that addition is the value handed back to the caller.
+  const int32x2_t lo_view = vreinterpret_s32_s64(vget_low_s64(pairs_64));
+  const int32x2_t hi_view = vreinterpret_s32_s64(vget_high_s64(pairs_64));
+  return vget_lane_s32(vadd_s32(lo_view, hi_view), 0);
+}
+
+// Accumulates the sum of differences and the sum of squared differences
+// between two pixel blocks, consuming eight pixels per inner iteration.
+//   a, a_stride: first block and the offset between its rows.
+//   b, b_stride: second block and the offset between its rows.
+//   w, h:        block dimensions in pixels. Columns are consumed in steps
+//                of eight, so w is expected to be a multiple of 8; any
+//                other width makes the last load of a row read past w.
+//   sse:         receives the sum of squared differences.
+//   sum:         receives the signed sum of differences.
+static void variance_neon_w8(const uint8_t *a, int a_stride,
+                             const uint8_t *b, int b_stride,
+                             int w, int h, unsigned int *sse, int *sum) {
+  int i, j;
+  // The running sum is kept in 32-bit lanes. A 16-bit lane would receive
+  // w * h / 8 differences of magnitude up to 255 and therefore wrap as soon
+  // as w * h exceeds 1024.
+  int32x4_t v_sum = vdupq_n_s32(0);
+  int32x4_t v_sse_lo = vdupq_n_s32(0);
+  int32x4_t v_sse_hi = vdupq_n_s32(0);
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; j += 8) {
+      const uint8x8_t v_a = vld1_u8(&a[j]);
+      const uint8x8_t v_b = vld1_u8(&b[j]);
+      // The widening subtract wraps modulo 2^16, so viewing the result as
+      // signed 16-bit gives the true difference in [-255, 255].
+      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
+      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
+      // Add adjacent differences pairwise and accumulate them widened.
+      v_sum = vpadalq_s16(v_sum, sv_diff);
+      v_sse_lo = vmlal_s16(v_sse_lo,
+                           vget_low_s16(sv_diff),
+                           vget_low_s16(sv_diff));
+      v_sse_hi = vmlal_s16(v_sse_hi,
+                           vget_high_s16(sv_diff),
+                           vget_high_s16(sv_diff));
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+
+  *sum = horizontal_add_s32x4(v_sum);
+  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
+}
+
+// Writes to *sse the sum of squared differences and to *sum the sum of
+// differences between the 16x16 block at src_ptr (row offset source_stride)
+// and the 16x16 block at ref_ptr (row offset ref_stride). All of the work is
+// delegated to variance_neon_w8 with the block size fixed to 16x16.
+void vp9_get16x16var_neon(const uint8_t *src_ptr, int source_stride,
+ const uint8_t *ref_ptr, int ref_stride,
+ unsigned int *sse, int *sum) {
+ variance_neon_w8(src_ptr, source_stride, ref_ptr, ref_stride, kWidth16,
+ kHeight16, sse, sum);
+}
+
+// Returns *sse - sum^2 / 256 for the 16x16 blocks a and b, where *sse is the
+// sum of squared differences (also stored through sse) and sum is the sum
+// of differences, both as reported by variance_neon_w8.
+unsigned int vp9_variance16x16_neon(const uint8_t *a, int a_stride,
+                                    const uint8_t *b, int b_stride,
+                                    unsigned int *sse) {
+  int sum;
+  int64_t sum_squared;
+  int64_t mean_term;
+
+  variance_neon_w8(a, a_stride, b, b_stride, kWidth16, kHeight16, sse, &sum);
+
+  // Square in 64 bits, then divide by the number of pixels in the block.
+  sum_squared = (int64_t)sum * sum;
+  mean_term = sum_squared / (kWidth16 * kHeight16);
+  return *sse - mean_term;
+}
+
+// Applies a two-tap filter to output_height rows of exactly 16 pixels.
+// Every output pixel is
+//   (src[x] * vp9_filter[0] + src[x + pixel_step] * vp9_filter[1])
+// shifted right by FILTER_BITS with rounding and narrowed to 8 bits.
+//   src_ptr, src_pixels_per_line: input and the offset between its rows.
+//   output_ptr, output_width:     output and the offset between its rows
+//                                 (output_width does not change how many
+//                                 pixels are written per row).
+//   pixel_step:                   offset from a pixel to its second tap.
+//   vp9_filter:                   the two taps; each is truncated to 8 bits.
+static void var_filter_block2d_bil_w16(const uint8_t *src_ptr,
+                                       uint8_t *output_ptr,
+                                       unsigned int src_pixels_per_line,
+                                       int pixel_step,
+                                       unsigned int output_height,
+                                       unsigned int output_width,
+                                       const int16_t *vp9_filter) {
+  const uint8x8_t tap0 = vmov_n_u8((uint8_t)vp9_filter[0]);
+  const uint8x8_t tap1 = vmov_n_u8((uint8_t)vp9_filter[1]);
+  unsigned int rows_left;
+
+  for (rows_left = output_height; rows_left != 0; --rows_left) {
+    const uint8x16_t first_px = vld1q_u8(src_ptr);
+    const uint8x16_t second_px = vld1q_u8(src_ptr + pixel_step);
+    // Weighted sums for the low and the high eight pixels of the row.
+    uint16x8_t acc_lo = vmull_u8(vget_low_u8(first_px), tap0);
+    uint16x8_t acc_hi = vmull_u8(vget_high_u8(first_px), tap0);
+    acc_lo = vmlal_u8(acc_lo, vget_low_u8(second_px), tap1);
+    acc_hi = vmlal_u8(acc_hi, vget_high_u8(second_px), tap1);
+    // Round, shift and narrow both halves, then store the 16 results.
+    vst1q_u8(output_ptr, vcombine_u8(vrshrn_n_u16(acc_lo, FILTER_BITS),
+                                     vrshrn_n_u16(acc_hi, FILTER_BITS)));
+    // Advance to the next input and output rows.
+    src_ptr += src_pixels_per_line;
+    output_ptr += output_width;
+  }
+}
+
+// Variance between a filtered 16x16 source block and the 16x16 block at dst.
+// The source is filtered twice with var_filter_block2d_bil_w16: first with
+// the taps selected by xoffset (tap spacing of one pixel, 17 rows out), then
+// with the taps selected by yoffset (tap spacing of one 16-pixel row, 16
+// rows out). The result of vp9_variance16x16_neon on the filtered block and
+// dst is returned; *sse is filled in by that call.
+unsigned int vp9_sub_pixel_variance16x16_neon(const uint8_t *src,
+                                              int src_stride,
+                                              int xoffset,
+                                              int yoffset,
+                                              const uint8_t *dst,
+                                              int dst_stride,
+                                              unsigned int *sse) {
+  // Tap pairs for the two passes.
+  // NOTE(review): assumes BILINEAR_FILTERS_2TAP is a side-effect-free table
+  // lookup, so resolving both up front is equivalent - confirm.
+  const int16_t *const first_pass_taps = BILINEAR_FILTERS_2TAP(xoffset);
+  const int16_t *const second_pass_taps = BILINEAR_FILTERS_2TAP(yoffset);
+  DECLARE_ALIGNED_ARRAY(kWidth16, uint8_t, temp2, kHeight16 * kWidth16);
+  DECLARE_ALIGNED_ARRAY(kWidth16, uint8_t, fdata3, kHeight16PlusOne * kWidth16);
+
+  // First pass: one extra row is produced because the second pass reads
+  // the row below each output row.
+  var_filter_block2d_bil_w16(src, fdata3, src_stride, kPixelStepOne,
+                             kHeight16PlusOne, kWidth16, first_pass_taps);
+  // Second pass: the second tap is one full 16-pixel row further on.
+  var_filter_block2d_bil_w16(fdata3, temp2, kWidth16, kWidth16, kHeight16,
+                             kWidth16, second_pass_taps);
+
+  return vp9_variance16x16_neon(temp2, kWidth16, dst, dst_stride, sse);
+}
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index fe15edfdc..1b574758b 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -553,6 +553,9 @@ void vp9_first_pass(VP9_COMP *cpi) {
const int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
double error_weight = 1.0;
const BLOCK_SIZE bsize = get_bsize(cm, mb_row, mb_col);
+#if CONFIG_FP_MB_STATS
+ const int mb_index = mb_row * cm->mb_cols + mb_col;
+#endif
vp9_clear_system_state();
@@ -600,7 +603,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
// initialization
- cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] = 0;
+ cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
}
#endif
@@ -704,26 +707,20 @@ void vp9_first_pass(VP9_COMP *cpi) {
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
// intra predication statistics
- cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] = 0;
- cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
- FPMB_DCINTRA_MASK;
- cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] &=
- (~FPMB_NONZERO_MOTION_MASK);
+ cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_DCINTRA_MASK;
+ cpi->twopass.frame_mb_stats_buf[mb_index] &=
+ ~FPMB_NONZERO_MOTION_MASK;
if (this_error > FPMB_ERROR_LEVEL4_TH) {
- cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
- FPMB_ERROR_LEVEL4_MASK;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL4_MASK;
} else if (this_error > FPMB_ERROR_LEVEL3_TH) {
- cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
- FPMB_ERROR_LEVEL3_MASK;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL3_MASK;
} else if (this_error > FPMB_ERROR_LEVEL2_TH) {
- cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
- FPMB_ERROR_LEVEL2_MASK;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL2_MASK;
} else if (this_error > FPMB_ERROR_LEVEL1_TH) {
- cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
- FPMB_ERROR_LEVEL1_MASK;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL1_MASK;
} else {
- cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
- FPMB_ERROR_LEVEL0_MASK;
+ cpi->twopass.frame_mb_stats_buf[mb_index] |= FPMB_ERROR_LEVEL0_MASK;
}
}
#endif
@@ -759,25 +756,24 @@ void vp9_first_pass(VP9_COMP *cpi) {
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
// inter predication statistics
- cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] = 0;
- cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] &=
- (~FPMB_DCINTRA_MASK);
- cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] &=
- (~FPMB_NONZERO_MOTION_MASK);
+ cpi->twopass.frame_mb_stats_buf[mb_index] = 0;
+ cpi->twopass.frame_mb_stats_buf[mb_index] &= ~FPMB_DCINTRA_MASK;
+ cpi->twopass.frame_mb_stats_buf[mb_index] &=
+ ~FPMB_NONZERO_MOTION_MASK;
if (this_error > FPMB_ERROR_LEVEL4_TH) {
- cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
FPMB_ERROR_LEVEL4_MASK;
} else if (this_error > FPMB_ERROR_LEVEL3_TH) {
- cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
FPMB_ERROR_LEVEL3_MASK;
} else if (this_error > FPMB_ERROR_LEVEL2_TH) {
- cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
FPMB_ERROR_LEVEL2_MASK;
} else if (this_error > FPMB_ERROR_LEVEL1_TH) {
- cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
FPMB_ERROR_LEVEL1_MASK;
} else {
- cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
FPMB_ERROR_LEVEL0_MASK;
}
}
@@ -788,7 +784,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
#if CONFIG_FP_MB_STATS
if (cpi->use_fp_mb_stats) {
- cpi->twopass.frame_mb_stats_buf[mb_row * cm->mb_cols + mb_col] |=
+ cpi->twopass.frame_mb_stats_buf[mb_index] |=
FPMB_NONZERO_MOTION_MASK;
}
#endif
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 96473f5a1..7a1600155 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -410,6 +410,10 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
PRED_BUFFER *this_mode_pred = NULL;
int i;
+ // ctx is used by the temporal denoiser, which is currently being developed.
+ // TODO(jbb): when the temporal denoiser is finished and in the default
+ // build, remove the following line.
+ (void) ctx;
if (cpi->sf.reuse_inter_pred_sby) {
for (i = 0; i < 3; i++) {
tmp[i].data = &pred_buf[pixels_in_block * i];
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 3381cb95a..77b968bda 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -130,5 +130,7 @@ VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/vpx_ports/vpx_once.h b/vpx_ports/vpx_once.h
index 182892acf..bd9eebd64 100644
--- a/vpx_ports/vpx_once.h
+++ b/vpx_ports/vpx_once.h
@@ -73,6 +73,33 @@ static void once(void (*func)(void))
}
+#elif CONFIG_MULTITHREAD && defined(__OS2__)
+#define INCL_DOS
+#include <os2.h>
+/* Calls func() the first time once() is invoked and does nothing on later
+ * invocations. The check-and-call sequence runs between DosEnterCritSec()
+ * and DosExitCritSec(), so two threads that arrive together cannot both
+ * call func().
+ */
+static void once(void (*func)(void))
+{
+    /* volatile: the flag is tested outside the critical section below, so
+     * every test must be a real read of the stored value.
+     */
+    static volatile int done;
+
+    /* If the initialization is complete, return early. */
+    if(done)
+        return;
+
+    /* Causes all other threads in the process to block themselves
+     * and give up their time slice.
+     */
+    DosEnterCritSec();
+
+    /* Test again: another thread may have finished the initialization
+     * between the unlocked test above and entering the critical section.
+     */
+    if (!done)
+    {
+        func();
+        done = 1;
+    }
+
+    /* Restores normal thread dispatching for the current process. */
+    DosExitCritSec();
+}
+
+
#elif CONFIG_MULTITHREAD && HAVE_PTHREAD_H
#include <pthread.h>
static void once(void (*func)(void))