-rw-r--r--  README                                |   1
-rw-r--r--  build/make/configure.sh               |  11
-rwxr-xr-x  configure                             |   7
-rw-r--r--  ivfdec.c                              |   8
-rw-r--r--  test/convolve_test.cc                 |   4
-rw-r--r--  test/dct_test.cc                      |  10
-rw-r--r--  test/ivf_video_source.h               |   2
-rw-r--r--  test/sum_squares_test.cc              |   7
-rw-r--r--  test/variance_test.cc                 |  40
-rwxr-xr-x  test/vp8_multi_resolution_encoder.sh  |  12
-rw-r--r--  test/vp9_quantize_test.cc             |   7
-rw-r--r--  vp9/encoder/vp9_bitstream.c           |  14
-rw-r--r--  vp9/encoder/vp9_encodeframe.c         |  59
-rw-r--r--  vp9/encoder/vp9_encodemb.c            |  40
-rw-r--r--  vp9/encoder/vp9_encoder.c             |  67
-rw-r--r--  vp9/encoder/vp9_encoder.h             |   2
-rw-r--r--  vp9/encoder/vp9_firstpass.c           |  68
-rw-r--r--  vp9/encoder/vp9_mcomp.c               |  11
-rw-r--r--  vp9/encoder/vp9_mcomp.h               |   3
-rw-r--r--  vp9/encoder/vp9_picklpf.c             |   8
-rw-r--r--  vp9/encoder/vp9_pickmode.c            |  24
-rw-r--r--  vp9/encoder/vp9_quantize.c            |   5
-rw-r--r--  vp9/encoder/vp9_ratectrl.c            |  27
-rw-r--r--  vp9/encoder/vp9_rd.c                  |  48
-rw-r--r--  vp9/encoder/vp9_rdopt.c               |  53
-rw-r--r--  vp9/encoder/vp9_speed_features.c      |   1
-rw-r--r--  vp9/encoder/vp9_speed_features.h      |   3
-rw-r--r--  vp9/encoder/vp9_svc_layercontext.c    | 159
-rw-r--r--  vp9/encoder/vp9_svc_layercontext.h    |  31
-rw-r--r--  vp9/vp9_cx_iface.c                    |  26
-rw-r--r--  vpx/vp8cx.h                           |  26
-rw-r--r--  vpx/vpx_encoder.h                     |   2
-rw-r--r--  vpx_dsp/arm/avg_pred_neon.c           |  42
-rw-r--r--  vpx_dsp/arm/mem_neon.h                |  12
-rw-r--r--  vpx_dsp/arm/sad4d_neon.c              | 380
-rw-r--r--  vpx_dsp/arm/subtract_neon.c           |  83
-rw-r--r--  vpx_dsp/arm/sum_neon.h                |   9
-rw-r--r--  vpx_dsp/arm/sum_squares_neon.c        |  85
-rw-r--r--  vpx_dsp/mips/vpx_convolve8_mmi.c      | 217
-rw-r--r--  vpx_dsp/ppc/inv_txfm_vsx.c            |  86
-rw-r--r--  vpx_dsp/ppc/quantize_vsx.c            | 124
-rw-r--r--  vpx_dsp/ppc/types_vsx.h               |  14
-rw-r--r--  vpx_dsp/sum_squares.c                 |   5
-rw-r--r--  vpx_dsp/vpx_dsp.mk                    |   2
-rw-r--r--  vpx_dsp/vpx_dsp_rtcd_defs.pl          |  10
-rw-r--r--  vpx_dsp/x86/mem_sse2.h                |   5
-rw-r--r--  vpx_dsp/x86/sum_squares_sse2.c        | 188
47 files changed, 1418 insertions(+), 630 deletions(-)
diff --git a/README b/README
index a900c8077..49407ed9f 100644
--- a/README
+++ b/README
@@ -76,7 +76,6 @@ COMPILING THE APPLICATIONS/LIBRARIES:
armv8-linux-gcc
mips32-linux-gcc
mips64-linux-gcc
- ppc64-linux-gcc
ppc64le-linux-gcc
sparc-solaris-gcc
x86-android-gcc
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 876255bfe..f1d0e34c3 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -719,11 +719,8 @@ process_common_toolchain() {
*sparc*)
tgt_isa=sparc
;;
- power*64*-*)
- tgt_isa=ppc64
- ;;
- power*)
- tgt_isa=ppc
+ power*64le*-*)
+ tgt_isa=ppc64le
;;
*mips64el*)
tgt_isa=mips64
@@ -1221,7 +1218,7 @@ EOF
check_add_asflags -march=${tgt_isa}
check_add_asflags -KPIC
;;
- ppc*)
+ ppc64le*)
link_with_cc=gcc
setup_gnu_toolchain
check_gcc_machine_option "vsx"
@@ -1491,7 +1488,7 @@ EOF
# bionic includes basic pthread functionality, obviating -lpthread.
;;
*)
- check_header pthread.h && check_lib -lpthread <<EOF && enable_feature pthread_h && add_extralibs -lpthread
+ check_header pthread.h && check_lib -lpthread <<EOF && add_extralibs -lpthread || disable_feature pthread_h
#include <pthread.h>
#include <stddef.h>
int main(void) { return pthread_create(NULL, NULL, NULL, NULL); }
diff --git a/configure b/configure
index b021c1212..3a1a318c9 100755
--- a/configure
+++ b/configure
@@ -116,7 +116,6 @@ all_platforms="${all_platforms} armv7s-darwin-gcc"
all_platforms="${all_platforms} armv8-linux-gcc"
all_platforms="${all_platforms} mips32-linux-gcc"
all_platforms="${all_platforms} mips64-linux-gcc"
-all_platforms="${all_platforms} ppc64-linux-gcc"
all_platforms="${all_platforms} ppc64le-linux-gcc"
all_platforms="${all_platforms} sparc-solaris-gcc"
all_platforms="${all_platforms} x86-android-gcc"
@@ -585,7 +584,7 @@ EOF
# Use both check_header and check_lib here, since check_lib
# could be a stub that always returns true.
- check_header pthread.h && check_lib -lpthread <<EOF && enable_feature pthread_h
+ check_header pthread.h && check_lib -lpthread <<EOF || disable_feature pthread_h
#include <pthread.h>
#include <stddef.h>
int main(void) { return pthread_create(NULL, NULL, NULL, NULL); }
@@ -593,6 +592,10 @@ EOF
check_header unistd.h # for sysconf(3) and friends.
check_header vpx/vpx_integer.h -I${source_path} && enable_feature vpx_ports
+
+ if enabled neon && ! enabled external_build; then
+ check_header arm_neon.h || die "Unable to find arm_neon.h"
+ fi
}
process_toolchain() {
diff --git a/ivfdec.c b/ivfdec.c
index f64e594ab..3e179bc6e 100644
--- a/ivfdec.c
+++ b/ivfdec.c
@@ -76,12 +76,12 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
size_t frame_size = 0;
if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) {
- if (!feof(infile)) warn("Failed to read frame size\n");
+ if (!feof(infile)) warn("Failed to read frame size");
} else {
frame_size = mem_get_le32(raw_header);
if (frame_size > 256 * 1024 * 1024) {
- warn("Read invalid frame size (%u)\n", (unsigned int)frame_size);
+ warn("Read invalid frame size (%u)", (unsigned int)frame_size);
frame_size = 0;
}
@@ -92,7 +92,7 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
*buffer = new_buffer;
*buffer_size = 2 * frame_size;
} else {
- warn("Failed to allocate compressed data buffer\n");
+ warn("Failed to allocate compressed data buffer");
frame_size = 0;
}
}
@@ -100,7 +100,7 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read,
if (!feof(infile)) {
if (fread(*buffer, 1, frame_size, infile) != frame_size) {
- warn("Failed to read full frame\n");
+ warn("Failed to read full frame");
return 1;
}
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 3e9377a0e..8f6c5cd48 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -1382,8 +1382,8 @@ INSTANTIATE_TEST_CASE_P(VSX, ConvolveTest,
#if HAVE_MMI
const ConvolveFunctions convolve8_mmi(
- vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_mmi,
- vpx_convolve8_avg_horiz_c, vpx_convolve8_vert_mmi,
+ vpx_convolve_copy_c, vpx_convolve_avg_mmi, vpx_convolve8_horiz_mmi,
+ vpx_convolve8_avg_horiz_mmi, vpx_convolve8_vert_mmi,
vpx_convolve8_avg_vert_mmi, vpx_convolve8_mmi, vpx_convolve8_avg_mmi,
vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
diff --git a/test/dct_test.cc b/test/dct_test.cc
index 10062150f..e8ad0cd5d 100644
--- a/test/dct_test.cc
+++ b/test/dct_test.cc
@@ -725,4 +725,14 @@ INSTANTIATE_TEST_CASE_P(SSE2, TransWHT,
::testing::Values(make_tuple(0, &wht_sse2_func_info, 0,
VPX_BITS_8)));
#endif // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_VSX && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH
+static const FuncInfo wht_vsx_func_info = {
+ &fdct_wrapper<vp9_fwht4x4_c>, &idct_wrapper<vpx_iwht4x4_16_add_vsx>, 4, 1
+};
+
+INSTANTIATE_TEST_CASE_P(VSX, TransWHT,
+ ::testing::Values(make_tuple(0, &wht_vsx_func_info, 0,
+ VPX_BITS_8)));
+#endif // HAVE_VSX && !CONFIG_EMULATE_HARDWARE
} // namespace
diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h
index 5862d2649..4b5d55469 100644
--- a/test/ivf_video_source.h
+++ b/test/ivf_video_source.h
@@ -16,7 +16,7 @@
#include "test/video_source.h"
namespace libvpx_test {
-const unsigned int kCodeBufferSize = 256 * 1024;
+const unsigned int kCodeBufferSize = 256 * 1024 * 1024;
const unsigned int kIvfFileHdrSize = 32;
const unsigned int kIvfFrameHdrSize = 12;
diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc
index 910718b06..e211fb4d8 100644
--- a/test/sum_squares_test.cc
+++ b/test/sum_squares_test.cc
@@ -104,6 +104,13 @@ TEST_P(SumSquaresTest, ExtremeValues) {
using ::testing::make_tuple;
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(
+ NEON, SumSquaresTest,
+ ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c,
+ &vpx_sum_squares_2d_i16_neon)));
+#endif // HAVE_NEON
+
#if HAVE_SSE2
INSTANTIATE_TEST_CASE_P(
SSE2, SumSquaresTest,
diff --git a/test/variance_test.cc b/test/variance_test.cc
index f89c98523..725821ae6 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -20,24 +20,13 @@
#include "test/register_state_check.h"
#include "vpx/vpx_codec.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/variance.h"
#include "vpx_mem/vpx_mem.h"
#include "vpx_ports/mem.h"
#include "vpx_ports/vpx_timer.h"
namespace {
-typedef unsigned int (*VarianceMxNFunc)(const uint8_t *a, int a_stride,
- const uint8_t *b, int b_stride,
- unsigned int *sse);
-typedef unsigned int (*SubpixVarMxNFunc)(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- unsigned int *sse);
-typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride,
- int xoffset, int yoffset,
- const uint8_t *b, int b_stride,
- uint32_t *sse,
- const uint8_t *second_pred);
typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride,
const uint8_t *b, int b_stride);
typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src);
@@ -692,7 +681,7 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() {
}
template <>
-void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
+void SubpelVarianceTest<vpx_subp_avg_variance_fn_t>::RefTest() {
for (int x = 0; x < 8; ++x) {
for (int y = 0; y < 8; ++y) {
if (!use_high_bit_depth()) {
@@ -728,10 +717,10 @@ void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() {
}
typedef MainTestClass<Get4x4SseFunc> VpxSseTest;
-typedef MainTestClass<VarianceMxNFunc> VpxMseTest;
-typedef MainTestClass<VarianceMxNFunc> VpxVarianceTest;
-typedef SubpelVarianceTest<SubpixVarMxNFunc> VpxSubpelVarianceTest;
-typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> VpxSubpelAvgVarianceTest;
+typedef MainTestClass<vpx_variance_fn_t> VpxMseTest;
+typedef MainTestClass<vpx_variance_fn_t> VpxVarianceTest;
+typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> VpxSubpelVarianceTest;
+typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t> VpxSubpelAvgVarianceTest;
TEST_P(VpxSseTest, RefSse) { RefTestSse(); }
TEST_P(VpxSseTest, MaxSse) { MaxTestSse(); }
@@ -756,14 +745,14 @@ INSTANTIATE_TEST_CASE_P(C, VpxSseTest,
::testing::Values(SseParams(2, 2,
&vpx_get4x4sse_cs_c)));
-typedef TestParams<VarianceMxNFunc> MseParams;
+typedef TestParams<vpx_variance_fn_t> MseParams;
INSTANTIATE_TEST_CASE_P(C, VpxMseTest,
::testing::Values(MseParams(4, 4, &vpx_mse16x16_c),
MseParams(4, 3, &vpx_mse16x8_c),
MseParams(3, 4, &vpx_mse8x16_c),
MseParams(3, 3, &vpx_mse8x8_c)));
-typedef TestParams<VarianceMxNFunc> VarianceParams;
+typedef TestParams<vpx_variance_fn_t> VarianceParams;
INSTANTIATE_TEST_CASE_P(
C, VpxVarianceTest,
::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_c),
@@ -780,7 +769,7 @@ INSTANTIATE_TEST_CASE_P(
VarianceParams(2, 3, &vpx_variance4x8_c),
VarianceParams(2, 2, &vpx_variance4x4_c)));
-typedef TestParams<SubpixVarMxNFunc> SubpelVarianceParams;
+typedef TestParams<vpx_subpixvariance_fn_t> SubpelVarianceParams;
INSTANTIATE_TEST_CASE_P(
C, VpxSubpelVarianceTest,
::testing::Values(
@@ -798,7 +787,7 @@ INSTANTIATE_TEST_CASE_P(
SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_c, 0),
SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_c, 0)));
-typedef TestParams<SubpixAvgVarMxNFunc> SubpelAvgVarianceParams;
+typedef TestParams<vpx_subp_avg_variance_fn_t> SubpelAvgVarianceParams;
INSTANTIATE_TEST_CASE_P(
C, VpxSubpelAvgVarianceTest,
::testing::Values(
@@ -817,10 +806,11 @@ INSTANTIATE_TEST_CASE_P(
SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0)));
#if CONFIG_VP9_HIGHBITDEPTH
-typedef MainTestClass<VarianceMxNFunc> VpxHBDMseTest;
-typedef MainTestClass<VarianceMxNFunc> VpxHBDVarianceTest;
-typedef SubpelVarianceTest<SubpixVarMxNFunc> VpxHBDSubpelVarianceTest;
-typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> VpxHBDSubpelAvgVarianceTest;
+typedef MainTestClass<vpx_variance_fn_t> VpxHBDMseTest;
+typedef MainTestClass<vpx_variance_fn_t> VpxHBDVarianceTest;
+typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> VpxHBDSubpelVarianceTest;
+typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t>
+ VpxHBDSubpelAvgVarianceTest;
TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); }
TEST_P(VpxHBDMseTest, MaxMse) { MaxTestMse(); }
diff --git a/test/vp8_multi_resolution_encoder.sh b/test/vp8_multi_resolution_encoder.sh
index a8b7fe78e..33fd5b0d8 100755
--- a/test/vp8_multi_resolution_encoder.sh
+++ b/test/vp8_multi_resolution_encoder.sh
@@ -46,19 +46,31 @@ vp8_multi_resolution_encoder_three_formats() {
local readonly output_files="${VPX_TEST_OUTPUT_DIR}/vp8_mre_0.ivf
${VPX_TEST_OUTPUT_DIR}/vp8_mre_1.ivf
${VPX_TEST_OUTPUT_DIR}/vp8_mre_2.ivf"
+ local readonly layer_bitrates="150 80 50"
+ local readonly keyframe_insert="200"
+ local readonly temporal_layers="3 3 3"
+ local readonly framerate="30"
if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then
if [ "$(vp8_encode_available)" = "yes" ]; then
# Param order:
# Input width
# Input height
+ # Framerate
# Input file path
# Output file names
+ # Layer bitrates
+ # Temporal layers
+ # Keyframe insert
# Output PSNR
vp8_mre "${YUV_RAW_INPUT_WIDTH}" \
"${YUV_RAW_INPUT_HEIGHT}" \
+ "${framerate}" \
"${YUV_RAW_INPUT}" \
${output_files} \
+ ${layer_bitrates} \
+ ${temporal_layers} \
+ "${keyframe_insert}" \
0
for output_file in ${output_files}; do
diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc
index 4740cadd9..f0bbedbfa 100644
--- a/test/vp9_quantize_test.cc
+++ b/test/vp9_quantize_test.cc
@@ -575,3 +575,10 @@ INSTANTIATE_TEST_CASE_P(
&QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32,
true)));
} // namespace
+
+#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(VSX, VP9QuantizeTest,
+ ::testing::Values(make_tuple(&vpx_quantize_b_vsx,
+ &vpx_quantize_b_c,
+ VPX_BITS_8, 16, false)));
+#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index abc705363..4e7d99f50 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -459,7 +459,8 @@ static void write_modes_sb(
write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs,
max_mv_magnitude, interp_filter_selected);
break;
- case PARTITION_SPLIT:
+ default:
+ assert(partition == PARTITION_SPLIT);
write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, subsize,
max_mv_magnitude, interp_filter_selected);
write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs,
@@ -469,7 +470,6 @@ static void write_modes_sb(
write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col + bs,
subsize, max_mv_magnitude, interp_filter_selected);
break;
- default: assert(0);
}
}
@@ -618,9 +618,10 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi,
return;
}
- case ONE_LOOP_REDUCED: {
+ default: {
int updates = 0;
int noupdates_before_first = 0;
+ assert(cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED);
for (i = 0; i < PLANE_TYPES; ++i) {
for (j = 0; j < REF_TYPES; ++j) {
for (k = 0; k < COEF_BANDS; ++k) {
@@ -670,7 +671,6 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi,
}
return;
}
- default: assert(0);
}
}
@@ -1149,8 +1149,10 @@ static void write_profile(BITSTREAM_PROFILE profile,
case PROFILE_0: vpx_wb_write_literal(wb, 0, 2); break;
case PROFILE_1: vpx_wb_write_literal(wb, 2, 2); break;
case PROFILE_2: vpx_wb_write_literal(wb, 1, 2); break;
- case PROFILE_3: vpx_wb_write_literal(wb, 6, 3); break;
- default: assert(0);
+ default:
+ assert(profile == PROFILE_3);
+ vpx_wb_write_literal(wb, 6, 3);
+ break;
}
}
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index a283d92a8..091992dbd 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -385,16 +385,13 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
node->split[i] = &vt->split[i].part_variances.none;
break;
}
- case BLOCK_4X4: {
+ default: {
v4x4 *vt = (v4x4 *)data;
+ assert(bsize == BLOCK_4X4);
node->part_variances = &vt->part_variances;
for (i = 0; i < 4; i++) node->split[i] = &vt->split[i];
break;
}
- default: {
- assert(0);
- break;
- }
}
}
@@ -885,13 +882,13 @@ static void copy_partitioning_helper(VP9_COMP *cpi, MACROBLOCK *x,
set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
set_block_size(cpi, x, xd, mi_row, mi_col + bs, subsize);
break;
- case PARTITION_SPLIT:
+ default:
+ assert(partition == PARTITION_SPLIT);
copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col);
copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col);
copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col + bs);
copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col + bs);
break;
- default: assert(0);
}
}
}
@@ -1004,7 +1001,8 @@ static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
set_block_size(cpi, x, xd, mi_row_high, mi_col_high + bs_high,
subsize_high);
break;
- case PARTITION_SPLIT:
+ default:
+ assert(partition_high == PARTITION_SPLIT);
if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, mi_col,
mi_row_high, mi_col_high))
return 1;
@@ -1020,7 +1018,6 @@ static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
mi_col_high + bs_high))
return 1;
break;
- default: assert(0);
}
}
@@ -1067,13 +1064,13 @@ static void update_partition_svc(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row,
prev_part[start_pos] = subsize;
if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize;
break;
- case PARTITION_SPLIT:
+ default:
+ assert(partition == PARTITION_SPLIT);
update_partition_svc(cpi, subsize, mi_row, mi_col);
update_partition_svc(cpi, subsize, mi_row + bs, mi_col);
update_partition_svc(cpi, subsize, mi_row, mi_col + bs);
update_partition_svc(cpi, subsize, mi_row + bs, mi_col + bs);
break;
- default: assert(0);
}
}
}
@@ -1108,13 +1105,13 @@ static void update_prev_partition_helper(VP9_COMP *cpi, BLOCK_SIZE bsize,
prev_part[start_pos] = subsize;
if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize;
break;
- case PARTITION_SPLIT:
+ default:
+ assert(partition == PARTITION_SPLIT);
update_prev_partition_helper(cpi, subsize, mi_row, mi_col);
update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col);
update_prev_partition_helper(cpi, subsize, mi_row, mi_col + bs);
update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col + bs);
break;
- default: assert(0);
}
}
}
@@ -1387,7 +1384,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile,
x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf,
xd->plane[0].pre[0].stride);
} else {
- y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
+ const MV dummy_mv = { 0, 0 };
+ y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col,
+ &dummy_mv);
x->sb_use_mv_part = 1;
x->sb_mvcol_part = mi->mv[0].as_mv.col;
x->sb_mvrow_part = mi->mv[0].as_mv.row;
@@ -2181,7 +2180,8 @@ static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile,
subsize, &pc_tree->horizontal[1]);
}
break;
- case PARTITION_SPLIT:
+ default:
+ assert(partition == PARTITION_SPLIT);
if (bsize == BLOCK_8X8) {
encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
pc_tree->leaf_split[0]);
@@ -2196,7 +2196,6 @@ static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile,
subsize, pc_tree->split[3]);
}
break;
- default: assert(0 && "Invalid partition type."); break;
}
if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
@@ -2522,7 +2521,8 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td,
subsize, &pc_tree->horizontal[1]);
}
break;
- case PARTITION_SPLIT:
+ default:
+ assert(partition == PARTITION_SPLIT);
subsize = get_subsize(bsize, PARTITION_SPLIT);
encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
pc_tree->split[0]);
@@ -2533,7 +2533,6 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td,
encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs,
output_enabled, subsize, pc_tree->split[3]);
break;
- default: assert(0 && "Invalid partition type."); break;
}
if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
@@ -2672,7 +2671,8 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
last_part_rdc.rdcost += tmp_rdc.rdcost;
}
break;
- case PARTITION_SPLIT:
+ default:
+ assert(partition == PARTITION_SPLIT);
if (bsize == BLOCK_8X8) {
rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
subsize, pc_tree->leaf_split[0], INT64_MAX);
@@ -2702,7 +2702,6 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td,
last_part_rdc.dist += tmp_rdc.dist;
}
break;
- default: assert(0); break;
}
pl = partition_plane_context(xd, mi_row, mi_col, bsize);
@@ -4208,7 +4207,8 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
}
}
break;
- case PARTITION_SPLIT:
+ default:
+ assert(partition == PARTITION_SPLIT);
subsize = get_subsize(bsize, PARTITION_SPLIT);
nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
subsize, output_enabled, rd_cost,
@@ -4238,7 +4238,6 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td,
rd_cost->dist += this_rdc.dist;
}
break;
- default: assert(0 && "Invalid partition type."); break;
}
}
@@ -4327,7 +4326,8 @@ static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td,
output_enabled, subsize, &pc_tree->horizontal[1]);
}
break;
- case PARTITION_SPLIT:
+ default:
+ assert(partition == PARTITION_SPLIT);
subsize = get_subsize(bsize, PARTITION_SPLIT);
if (bsize == BLOCK_8X8) {
nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost,
@@ -4348,7 +4348,6 @@ static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td,
dummy_cost, pc_tree->split[3]);
}
break;
- default: assert(0 && "Invalid partition type."); break;
}
if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
@@ -4452,7 +4451,8 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
BLOCK_64X64, 1, &dummy_rdc, td->pc_root);
break;
- case REFERENCE_PARTITION:
+ default:
+ assert(partition_search_type == REFERENCE_PARTITION);
x->sb_pickmode_part = 1;
set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
// Use nonrd_pick_partition on scene-cut for VBR mode.
@@ -4484,7 +4484,6 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td,
}
break;
- default: assert(0); break;
}
// Update ref_frame usage for inter frame if this group is ARF group.
@@ -4551,16 +4550,12 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) {
&var16->sse, &var16->sum);
var16->var = variance_highbd(var16);
break;
- case VPX_BITS_12:
+ default:
+ assert(cm->bit_depth == VPX_BITS_12);
vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride,
&var16->sse, &var16->sum);
var16->var = variance_highbd(var16);
break;
- default:
- assert(0 &&
- "cm->bit_depth should be VPX_BITS_8, VPX_BITS_10"
- " or VPX_BITS_12");
- return -1;
}
} else {
vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse,
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 970077d89..bc2765728 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -358,13 +358,13 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col,
p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
- case TX_4X4:
+ default:
+ assert(tx_size == TX_4X4);
x->fwd_txfm4x4(src_diff, coeff, diff_stride);
vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->round_fp,
p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
- default: assert(0);
}
return;
}
@@ -388,13 +388,13 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col,
p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant,
eob, scan_order->scan, scan_order->iscan);
break;
- case TX_4X4:
+ default:
+ assert(tx_size == TX_4X4);
x->fwd_txfm4x4(src_diff, coeff, diff_stride);
vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp,
qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
scan_order->iscan);
break;
- default: assert(0); break;
}
}
@@ -434,13 +434,13 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col,
p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0],
eob);
break;
- case TX_4X4:
+ default:
+ assert(tx_size == TX_4X4);
x->fwd_txfm4x4(src_diff, coeff, diff_stride);
vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round,
p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0],
eob);
break;
- default: assert(0);
}
return;
}
@@ -462,12 +462,12 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col,
vpx_quantize_dc(coeff, 64, x->skip_block, p->round, p->quant_fp[0],
qcoeff, dqcoeff, pd->dequant[0], eob);
break;
- case TX_4X4:
+ default:
+ assert(tx_size == TX_4X4);
x->fwd_txfm4x4(src_diff, coeff, diff_stride);
vpx_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0],
qcoeff, dqcoeff, pd->dequant[0], eob);
break;
- default: assert(0); break;
}
}
@@ -511,14 +511,14 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
pd->dequant, eob, scan_order->scan,
scan_order->iscan);
break;
- case TX_4X4:
+ default:
+ assert(tx_size == TX_4X4);
x->fwd_txfm4x4(src_diff, coeff, diff_stride);
vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
p->quant, p->quant_shift, qcoeff, dqcoeff,
pd->dequant, eob, scan_order->scan,
scan_order->iscan);
break;
- default: assert(0);
}
return;
}
@@ -544,13 +544,13 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col,
p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
- case TX_4X4:
+ default:
+ assert(tx_size == TX_4X4);
x->fwd_txfm4x4(src_diff, coeff, diff_stride);
vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
scan_order->scan, scan_order->iscan);
break;
- default: assert(0); break;
}
}
@@ -634,14 +634,14 @@ static void encode_block(int plane, int block, int row, int col,
vp9_highbd_idct8x8_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
xd->bd);
break;
- case TX_4X4:
+ default:
+ assert(tx_size == TX_4X4);
// this is like vp9_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
// case.
x->highbd_inv_txfm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block],
xd->bd);
break;
- default: assert(0 && "Invalid transform size");
}
return;
}
@@ -657,13 +657,13 @@ static void encode_block(int plane, int block, int row, int col,
case TX_8X8:
vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
break;
- case TX_4X4:
+ default:
+ assert(tx_size == TX_4X4);
// this is like vp9_short_idct4x4 but has a special case around eob<=1
// which is significant (not just an optimization) for the lossless
// case.
x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
break;
- default: assert(0 && "Invalid transform size"); break;
}
}
@@ -848,7 +848,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
xd->bd);
}
break;
- case TX_4X4:
+ default:
+ assert(tx_size == TX_4X4);
if (!x->skip_recode) {
vpx_highbd_subtract_block(4, 4, src_diff, diff_stride, src,
src_stride, dst, dst_stride, xd->bd);
@@ -876,7 +877,6 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
}
}
break;
- default: assert(0); return;
}
if (*eob) *(args->skip) = 0;
return;
@@ -930,7 +930,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
if (!x->skip_encode && *eob)
vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob);
break;
- case TX_4X4:
+ default:
+ assert(tx_size == TX_4X4);
if (!x->skip_recode) {
vpx_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst,
dst_stride);
@@ -955,7 +956,6 @@ void vp9_encode_block_intra(int plane, int block, int row, int col,
vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type);
}
break;
- default: assert(0); break;
}
if (*eob) *(args->skip) = 0;
}
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 2f92456f2..3384de7ea 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -483,14 +483,10 @@ static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) {
*hr = 3;
*hs = 5;
break;
- case ONETWO:
- *hr = 1;
- *hs = 2;
- break;
default:
+ assert(mode == ONETWO);
*hr = 1;
- *hs = 1;
- assert(0);
+ *hs = 2;
break;
}
}
@@ -1726,7 +1722,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_sad4x4x4d_bits10)
break;
- case VPX_BITS_12:
+ default:
+ assert(cm->bit_depth == VPX_BITS_12);
HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12,
vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16,
vpx_highbd_12_sub_pixel_variance32x16,
@@ -1805,11 +1802,6 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) {
vpx_highbd_12_sub_pixel_avg_variance4x4,
vpx_highbd_sad4x4x4d_bits12)
break;
-
- default:
- assert(0 &&
- "cm->bit_depth should be VPX_BITS_8, "
- "VPX_BITS_10 or VPX_BITS_12");
}
}
}
@@ -3031,9 +3023,17 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
// Keep track of frame index for each reference frame.
SVC *const svc = &cpi->svc;
if (cm->frame_type == KEY_FRAME) {
+ int i;
svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe;
svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe;
svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe;
+ // On key frame update all reference frame slots.
+ for (i = 0; i < REF_FRAMES; i++) {
+ // LAST/GOLDEN/ALTREF is already updated above.
+ if (i != cpi->lst_fb_idx && i != cpi->gld_fb_idx &&
+ i != cpi->alt_fb_idx)
+ ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[i], cm->new_fb_idx);
+ }
} else {
if (cpi->refresh_last_frame)
svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe;
@@ -3042,6 +3042,8 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
if (cpi->refresh_alt_ref_frame)
svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe;
}
+ // Copy flags from encoder to SVC struct.
+ vp9_copy_flags_ref_update_idx(cpi);
}
}
@@ -3284,11 +3286,9 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) {
case VPX_BITS_10:
dc_quant_devisor = 16.0;
break;
- case VPX_BITS_12:
- dc_quant_devisor = 64.0;
- break;
default:
- assert(0 && "bit_depth must be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
+ assert(cm->bit_depth == VPX_BITS_12);
+ dc_quant_devisor = 64.0;
break;
}
#else
@@ -3730,28 +3730,9 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size,
suppress_active_map(cpi);
// For SVC on non-zero spatial layer: check for disabling inter-layer
- // (spatial) prediction, if svc.disable_inter_layer_pred is set.
- // if the previous spatial layer was dropped then disable the prediction from
- // this (scaled) reference.
- if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) {
- if ((cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_OFF_NONKEY &&
- !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) ||
- cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_OFF ||
- cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id - 1]) {
- MV_REFERENCE_FRAME ref_frame;
- static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
- VP9_ALT_FLAG };
- for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
- const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
- if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) {
- const struct scale_factors *const scale_fac =
- &cm->frame_refs[ref_frame - 1].sf;
- if (vp9_is_scaled(scale_fac))
- cpi->ref_frame_flags &= (~flag_list[ref_frame]);
- }
- }
- }
- }
+ // prediction.
+ if (cpi->use_svc && cpi->svc.spatial_layer_id > 0)
+ vp9_svc_constrain_inter_layer_pred(cpi);
// Variance adaptive and in frame q adjustment experiments are mutually
// exclusive.
@@ -4612,6 +4593,16 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size,
cpi->last_frame_dropped = 0;
cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 0;
+ // Keep track of the frame buffer index updated/refreshed for the
+ // current encoded TL0 superframe.
+ if (cpi->svc.temporal_layer_id == 0) {
+ if (cpi->refresh_last_frame)
+ cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->lst_fb_idx;
+ else if (cpi->refresh_golden_frame)
+ cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->gld_fb_idx;
+ else if (cpi->refresh_alt_ref_frame)
+ cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->alt_fb_idx;
+ }
// Disable segmentation if it decrease rate/distortion ratio
if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ)
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 303c00a6f..1e0ed70fb 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -645,6 +645,8 @@ typedef struct VP9_COMP {
int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
+ // Indices are: max_tx_size-1, tx_size_ctx, tx_size
+ int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
int multi_arf_allowed;
int multi_arf_enabled;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 3302fde08..453879fb8 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -494,11 +494,10 @@ static int scale_sse_threshold(VP9_COMMON *cm, int thresh) {
switch (cm->bit_depth) {
case VPX_BITS_8: ret_val = thresh; break;
case VPX_BITS_10: ret_val = thresh << 4; break;
- case VPX_BITS_12: ret_val = thresh << 8; break;
default:
- assert(0 &&
- "cm->bit_depth should be VPX_BITS_8, "
- "VPX_BITS_10 or VPX_BITS_12");
+ assert(cm->bit_depth == VPX_BITS_12);
+ ret_val = thresh << 8;
+ break;
}
}
#else
@@ -520,11 +519,10 @@ static int get_ul_intra_threshold(VP9_COMMON *cm) {
switch (cm->bit_depth) {
case VPX_BITS_8: ret_val = UL_INTRA_THRESH; break;
case VPX_BITS_10: ret_val = UL_INTRA_THRESH << 2; break;
- case VPX_BITS_12: ret_val = UL_INTRA_THRESH << 4; break;
default:
- assert(0 &&
- "cm->bit_depth should be VPX_BITS_8, "
- "VPX_BITS_10 or VPX_BITS_12");
+ assert(cm->bit_depth == VPX_BITS_12);
+ ret_val = UL_INTRA_THRESH << 4;
+ break;
}
}
#else
@@ -541,11 +539,10 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) {
switch (cm->bit_depth) {
case VPX_BITS_8: ret_val = SMOOTH_INTRA_THRESH; break;
case VPX_BITS_10: ret_val = SMOOTH_INTRA_THRESH << 4; break;
- case VPX_BITS_12: ret_val = SMOOTH_INTRA_THRESH << 8; break;
default:
- assert(0 &&
- "cm->bit_depth should be VPX_BITS_8, "
- "VPX_BITS_10 or VPX_BITS_12");
+ assert(cm->bit_depth == VPX_BITS_12);
+ ret_val = SMOOTH_INTRA_THRESH << 8;
+ break;
}
}
#else
@@ -971,12 +968,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td,
switch (cm->bit_depth) {
case VPX_BITS_8: break;
case VPX_BITS_10: this_error >>= 4; break;
- case VPX_BITS_12: this_error >>= 8; break;
default:
- assert(0 &&
- "cm->bit_depth should be VPX_BITS_8, "
- "VPX_BITS_10 or VPX_BITS_12");
- return;
+ assert(cm->bit_depth == VPX_BITS_12);
+ this_error >>= 8;
+ break;
}
}
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -2446,8 +2441,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame);
// Monitor for static sections.
- zero_motion_accumulator = VPXMIN(
- zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame));
+ if ((rc->frames_since_key + i - 1) > 1) {
+ zero_motion_accumulator *= get_zero_motion_factor(cpi, &next_frame);
+ }
// Break clause to detect very still sections after motion. For example,
// a static image after a fade or other transition.
@@ -2469,8 +2465,17 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
// Break out conditions.
// Break at maximum of active_max_gf_interval unless almost totally static.
- if (((twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) &&
- (i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) ||
+ //
+ // Note that the addition of a test of rc->source_alt_ref_active is
+ // deliberate. The effect of this is that after a normal altref group even
+ // if the material is static there will be one normal length GF group
+ // before allowing longer GF groups. The reason for this is that in cases
+ // such as slide shows where slides are separated by a complex transition
+ // such as a fade, the arf group spanning the transition may not be coded
+ // at a very high quality and hence this frame (with its overlay) is a
+ // poor golden frame to use for an extended group.
+ if (((i >= active_max_gf_interval) &&
+ ((zero_motion_accumulator < 0.995) || (rc->source_alt_ref_active))) ||
(
// Don't break out with a very short interval.
(i >= active_min_gf_interval) &&
@@ -2490,7 +2495,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0;
// Should we use the alternate reference frame.
- if ((twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) && allow_alt_ref &&
+ if ((zero_motion_accumulator < 0.995) && allow_alt_ref &&
(i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval)) {
const int forward_frames = (rc->frames_to_key - i >= i - 1)
? i - 1
@@ -2518,11 +2523,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
rc->gfu_boost = VPXMIN((int)rc->gfu_boost, i * 200);
#endif
- // Set the interval until the next gf.
rc->baseline_gf_interval =
- (twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH)
- ? (i - (is_key_frame || rc->source_alt_ref_pending))
- : i;
+ ((twopass->kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) &&
+ (i >= rc->frames_to_key))
+ ? i
+ : (i - (is_key_frame || rc->source_alt_ref_pending));
rc->frames_till_gf_update_due = rc->baseline_gf_interval;
@@ -2769,6 +2774,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
vp9_zero(next_frame);
cpi->common.frame_type = KEY_FRAME;
+ rc->frames_since_key = 0;
// Reset the GF group data structures.
vp9_zero(*gf_group);
@@ -2913,7 +2919,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
for (i = 0; i < (rc->frames_to_key - 1); ++i) {
if (EOF == input_stats(twopass, &next_frame)) break;
- if (i <= KF_BOOST_SCAN_MAX_FRAMES) {
+ // The zero motion test here insures that if we mark a kf group as static
+ // it is static throughout not just the first KF_BOOST_SCAN_MAX_FRAMES.
+ // It also allows for a larger boost on long static groups.
+ if ((i <= KF_BOOST_SCAN_MAX_FRAMES) || (zero_motion_accumulator >= 0.99)) {
double frame_boost;
double zm_factor;
@@ -3025,12 +3034,13 @@ static void configure_buffer_updates(VP9_COMP *cpi) {
cpi->refresh_alt_ref_frame = 0;
cpi->rc.is_src_frame_alt_ref = 1;
break;
- case ARF_UPDATE:
+ default:
+ assert(twopass->gf_group.update_type[twopass->gf_group.index] ==
+ ARF_UPDATE);
cpi->refresh_last_frame = 0;
cpi->refresh_golden_frame = 0;
cpi->refresh_alt_ref_frame = 1;
break;
- default: assert(0); break;
}
}
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 1cb978667..ba72c0be5 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1793,7 +1793,7 @@ static const MV search_pos[4] = {
unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE bsize, int mi_row,
- int mi_col) {
+ int mi_col, const MV *ref_mv) {
MACROBLOCKD *xd = &x->e_mbd;
MODE_INFO *mi = xd->mi[0];
struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } };
@@ -1815,6 +1815,7 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
const int norm_factor = 3 + (bw >> 5);
const YV12_BUFFER_CONFIG *scaled_ref_frame =
vp9_get_scaled_ref_frame(cpi, mi->ref_frame[0]);
+ MvLimits subpel_mv_limits;
if (scaled_ref_frame) {
int i;
@@ -1917,6 +1918,10 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x,
tmp_mv->row *= 8;
tmp_mv->col *= 8;
+ vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv);
+ clamp_mv(tmp_mv, subpel_mv_limits.col_min, subpel_mv_limits.col_max,
+ subpel_mv_limits.row_min, subpel_mv_limits.row_max);
+
if (scaled_ref_frame) {
int i;
for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i];
@@ -2210,7 +2215,8 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
var = bigdia_search(x, mvp_full, step_param, error_per_bit, 1, cost_list,
fn_ptr, 1, ref_mv, tmp_mv);
break;
- case NSTEP:
+ default:
+ assert(method == NSTEP);
var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
MAX_MVSEARCH_STEPS - 1 - step_param, 1,
cost_list, fn_ptr, ref_mv, tmp_mv);
@@ -2236,7 +2242,6 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize,
}
}
break;
- default: assert(0 && "Invalid search method.");
}
if (method != NSTEP && rd && var < var_max)
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index b8db2c353..b4787fe1f 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -66,7 +66,8 @@ int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv,
// Perform integral projection based motion estimation.
unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi,
MACROBLOCK *x, BLOCK_SIZE bsize,
- int mi_row, int mi_col);
+ int mi_row, int mi_col,
+ const MV *ref_mv);
typedef uint32_t(fractional_mv_step_fp)(
const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp,
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index 1c2c55b9e..4e9649065 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -169,14 +169,10 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi,
case VPX_BITS_10:
filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
break;
- case VPX_BITS_12:
+ default:
+ assert(cm->bit_depth == VPX_BITS_12);
filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
break;
- default:
- assert(0 &&
- "bit_depth should be VPX_BITS_8, VPX_BITS_10 "
- "or VPX_BITS_12");
- return;
}
#else
int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 3aee46636..f9d7a6db8 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -726,13 +726,13 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc,
qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
scan_order->iscan);
break;
- case TX_4X4:
+ default:
+ assert(tx_size == TX_4X4);
x->fwd_txfm4x4(src_diff, coeff, diff_stride);
vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp,
qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
scan_order->iscan);
break;
- default: assert(0); break;
}
*skippable &= (*eob == 0);
eob_cost += 1;
@@ -1502,8 +1502,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
int flag_svc_subpel = 0;
int svc_mv_col = 0;
int svc_mv_row = 0;
+ int no_scaling = 0;
unsigned int thresh_svc_skip_golden = 500;
- if (cpi->svc.spatial_layer_id > 0 && cpi->svc.high_source_sad_superframe)
+ if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) {
+ int layer = LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id - 1,
+ cpi->svc.temporal_layer_id,
+ cpi->svc.number_temporal_layers);
+ LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer];
+ if (lc->scaling_factor_num == lc->scaling_factor_den) no_scaling = 1;
+ }
+ if (cpi->svc.spatial_layer_id > 0 &&
+ (cpi->svc.high_source_sad_superframe || no_scaling))
thresh_svc_skip_golden = 0;
// Lower the skip threshold if lower spatial layer is better quality relative
// to current layer.
@@ -1517,7 +1526,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
thresh_svc_skip_golden = 1000;
init_ref_frame_cost(cm, xd, ref_frame_cost);
-
memset(&mode_checked[0][0], 0, MB_MODE_COUNT * MAX_REF_FRAMES);
if (reuse_inter_pred) {
@@ -1764,7 +1772,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
continue;
if (sf->short_circuit_flat_blocks && x->source_variance == 0 &&
- this_mode != NEARESTMV) {
+ frame_mv[this_mode][ref_frame].as_int != 0) {
continue;
}
@@ -1875,7 +1883,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
(!cpi->sf.adaptive_rd_thresh_row_mt &&
rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh,
&rd_thresh_freq_fact[mode_index])))
- continue;
+ if (frame_mv[this_mode][ref_frame].as_int != 0) continue;
if (this_mode == NEWMV && !force_gf_mv) {
if (ref_frame > LAST_FRAME && !cpi->use_svc &&
@@ -1886,7 +1894,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data,
if (bsize < BLOCK_16X16) continue;
- tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
+ tmp_sad = vp9_int_pro_motion_estimation(
+ cpi, x, bsize, mi_row, mi_col,
+ &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv);
if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) continue;
if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad)
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 09f61ead2..276022a56 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -204,10 +204,9 @@ static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) {
switch (bit_depth) {
case VPX_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80);
case VPX_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80);
- case VPX_BITS_12: return q == 0 ? 64 : (quant < 2368 ? 84 : 80);
default:
- assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
- return -1;
+ assert(bit_depth == VPX_BITS_12);
+ return q == 0 ? 64 : (quant < 2368 ? 84 : 80);
}
#else
(void)bit_depth;
diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c
index 9fbb48817..599337f80 100644
--- a/vp9/encoder/vp9_ratectrl.c
+++ b/vp9/encoder/vp9_ratectrl.c
@@ -48,18 +48,16 @@
#define MAX_BPB_FACTOR 50
#if CONFIG_VP9_HIGHBITDEPTH
-#define ASSIGN_MINQ_TABLE(bit_depth, name) \
- do { \
- switch (bit_depth) { \
- case VPX_BITS_8: name = name##_8; break; \
- case VPX_BITS_10: name = name##_10; break; \
- case VPX_BITS_12: name = name##_12; break; \
- default: \
- assert(0 && \
- "bit_depth should be VPX_BITS_8, VPX_BITS_10" \
- " or VPX_BITS_12"); \
- name = NULL; \
- } \
+#define ASSIGN_MINQ_TABLE(bit_depth, name) \
+ do { \
+ switch (bit_depth) { \
+ case VPX_BITS_8: name = name##_8; break; \
+ case VPX_BITS_10: name = name##_10; break; \
+ default: \
+ assert(bit_depth == VPX_BITS_12); \
+ name = name##_12; \
+ break; \
+ } \
} while (0)
#else
#define ASSIGN_MINQ_TABLE(bit_depth, name) \
@@ -167,10 +165,9 @@ double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth) {
switch (bit_depth) {
case VPX_BITS_8: return vp9_ac_quant(qindex, 0, bit_depth) / 4.0;
case VPX_BITS_10: return vp9_ac_quant(qindex, 0, bit_depth) / 16.0;
- case VPX_BITS_12: return vp9_ac_quant(qindex, 0, bit_depth) / 64.0;
default:
- assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
- return -1.0;
+ assert(bit_depth == VPX_BITS_12);
+ return vp9_ac_quant(qindex, 0, bit_depth) / 64.0;
}
#else
return vp9_ac_quant(qindex, 0, bit_depth) / 4.0;
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index 6b2306ce9..3407e74c6 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -69,10 +69,12 @@ static void fill_mode_costs(VP9_COMP *cpi) {
const FRAME_CONTEXT *const fc = cpi->common.fc;
int i, j;
- for (i = 0; i < INTRA_MODES; ++i)
- for (j = 0; j < INTRA_MODES; ++j)
+ for (i = 0; i < INTRA_MODES; ++i) {
+ for (j = 0; j < INTRA_MODES; ++j) {
vp9_cost_tokens(cpi->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j],
vp9_intra_mode_tree);
+ }
+ }
vp9_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree);
for (i = 0; i < INTRA_MODES; ++i) {
@@ -82,9 +84,28 @@ static void fill_mode_costs(VP9_COMP *cpi) {
fc->uv_mode_prob[i], vp9_intra_mode_tree);
}
- for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
+ for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) {
vp9_cost_tokens(cpi->switchable_interp_costs[i],
fc->switchable_interp_prob[i], vp9_switchable_interp_tree);
+ }
+
+ for (i = TX_8X8; i < TX_SIZES; ++i) {
+ for (j = 0; j < TX_SIZE_CONTEXTS; ++j) {
+ const vpx_prob *tx_probs = get_tx_probs(i, j, &fc->tx_probs);
+ int k;
+ for (k = 0; k <= i; ++k) {
+ int cost = 0;
+ int m;
+ for (m = 0; m <= k - (k == i); ++m) {
+ if (m == k)
+ cost += vp9_cost_zero(tx_probs[m]);
+ else
+ cost += vp9_cost_one(tx_probs[m]);
+ }
+ cpi->tx_size_cost[i - 1][j][k] = cost;
+ }
+ }
+ }
}
static void fill_token_costs(vp9_coeff_cost *c,
@@ -153,10 +174,10 @@ int64_t vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) {
switch (cpi->common.bit_depth) {
case VPX_BITS_8: rdmult = 88 * q * q / 24; break;
case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4); break;
- case VPX_BITS_12: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8); break;
default:
- assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
- return -1;
+ assert(cpi->common.bit_depth == VPX_BITS_12);
+ rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8);
+ break;
}
#else
int64_t rdmult = 88 * q * q / 24;
@@ -185,10 +206,10 @@ static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) {
switch (bit_depth) {
case VPX_BITS_8: q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0; break;
case VPX_BITS_10: q = vp9_dc_quant(qindex, 0, VPX_BITS_10) / 16.0; break;
- case VPX_BITS_12: q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0; break;
default:
- assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
- return -1;
+ assert(bit_depth == VPX_BITS_12);
+ q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0;
+ break;
}
#else
(void)bit_depth;
@@ -209,12 +230,11 @@ void vp9_initialize_me_consts(VP9_COMP *cpi, MACROBLOCK *x, int qindex) {
x->sadperbit16 = sad_per_bit16lut_10[qindex];
x->sadperbit4 = sad_per_bit4lut_10[qindex];
break;
- case VPX_BITS_12:
+ default:
+ assert(cpi->common.bit_depth == VPX_BITS_12);
x->sadperbit16 = sad_per_bit16lut_12[qindex];
x->sadperbit4 = sad_per_bit4lut_12[qindex];
break;
- default:
- assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12");
}
#else
(void)cpi;
@@ -471,13 +491,13 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
for (i = 0; i < num_4x4_h; i += 4)
t_left[i] = !!*(const uint32_t *)&left[i];
break;
- case TX_32X32:
+ default:
+ assert(tx_size == TX_32X32);
for (i = 0; i < num_4x4_w; i += 8)
t_above[i] = !!*(const uint64_t *)&above[i];
for (i = 0; i < num_4x4_h; i += 8)
t_left[i] = !!*(const uint64_t *)&left[i];
break;
- default: assert(0 && "Invalid transform size."); break;
}
}
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 39cd1d41c..e39df033a 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -543,8 +543,9 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
MACROBLOCKD *const xd = &x->e_mbd;
const struct macroblock_plane *const p = &x->plane[plane];
const struct macroblockd_plane *const pd = &xd->plane[plane];
+ const int eob = p->eobs[block];
- if (x->block_tx_domain) {
+ if (x->block_tx_domain && eob) {
const int ss_txfrm_size = tx_size << 1;
int64_t this_sse;
const int shift = tx_size == TX_32X32 ? 0 : 2;
@@ -584,14 +585,13 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
const uint8_t *src = &p->src.buf[src_idx];
const uint8_t *dst = &pd->dst.buf[dst_idx];
const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
- const uint16_t *eob = &p->eobs[block];
unsigned int tmp;
tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row,
blk_col, plane_bsize, tx_bsize);
*out_sse = (int64_t)tmp * 16;
- if (*eob) {
+ if (eob) {
#if CONFIG_VP9_HIGHBITDEPTH
DECLARE_ALIGNED(16, uint16_t, recon16[1024]);
uint8_t *recon = (uint8_t *)recon16;
@@ -604,22 +604,22 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16,
32, NULL, 0, 0, 0, 0, bs, bs, xd->bd);
if (xd->lossless) {
- vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd);
+ vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, eob, xd->bd);
} else {
switch (tx_size) {
case TX_4X4:
- vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, *eob, xd->bd);
+ vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, eob, xd->bd);
break;
case TX_8X8:
- vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, *eob, xd->bd);
+ vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, eob, xd->bd);
break;
case TX_16X16:
- vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, *eob, xd->bd);
+ vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, eob, xd->bd);
break;
- case TX_32X32:
- vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, *eob, xd->bd);
+ default:
+ assert(tx_size == TX_32X32);
+ vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, eob, xd->bd);
break;
- default: assert(0 && "Invalid transform size");
}
}
recon = CONVERT_TO_BYTEPTR(recon16);
@@ -627,16 +627,16 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane,
#endif // CONFIG_VP9_HIGHBITDEPTH
vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, 0, 0, 0, bs, bs);
switch (tx_size) {
- case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, *eob); break;
- case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, *eob); break;
- case TX_8X8: vp9_idct8x8_add(dqcoeff, recon, 32, *eob); break;
- case TX_4X4:
+ case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, eob); break;
+ case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, eob); break;
+ case TX_8X8: vp9_idct8x8_add(dqcoeff, recon, 32, eob); break;
+ default:
+ assert(tx_size == TX_4X4);
// this is like vp9_short_idct4x4 but has a special case around
// eob<=1, which is significant (not just an optimization) for
// the lossless case.
- x->inv_txfm_add(dqcoeff, recon, 32, *eob);
+ x->inv_txfm_add(dqcoeff, recon, 32, eob);
break;
- default: assert(0 && "Invalid transform size"); break;
}
#if CONFIG_VP9_HIGHBITDEPTH
}
@@ -845,21 +845,20 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
{ INT64_MAX, INT64_MAX },
{ INT64_MAX, INT64_MAX },
{ INT64_MAX, INT64_MAX } };
- int n, m;
+ int n;
int s0, s1;
int64_t best_rd = INT64_MAX;
TX_SIZE best_tx = max_tx_size;
int start_tx, end_tx;
-
- const vpx_prob *tx_probs =
- get_tx_probs(max_tx_size, get_tx_size_context(xd), &cm->fc->tx_probs);
+ const int tx_size_ctx = get_tx_size_context(xd);
assert(skip_prob > 0);
s0 = vp9_cost_bit(skip_prob, 0);
s1 = vp9_cost_bit(skip_prob, 1);
if (cm->tx_mode == TX_MODE_SELECT) {
start_tx = max_tx_size;
- end_tx = 0;
+ end_tx = VPXMAX(start_tx - cpi->sf.tx_size_search_depth, 0);
+ if (bs > BLOCK_32X32) end_tx = VPXMIN(end_tx + 1, start_tx);
} else {
TX_SIZE chosen_tx_size =
VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[cm->tx_mode]);
@@ -868,13 +867,7 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
}
for (n = start_tx; n >= end_tx; n--) {
- int r_tx_size = 0;
- for (m = 0; m <= n - (n == (int)max_tx_size); m++) {
- if (m == n)
- r_tx_size += vp9_cost_zero(tx_probs[m]);
- else
- r_tx_size += vp9_cost_one(tx_probs[m]);
- }
+ const int r_tx_size = cpi->tx_size_cost[max_tx_size - 1][tx_size_ctx][n];
txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], ref_best_rd, 0,
bs, n, cpi->sf.use_fast_coef_costing);
r[n][1] = r[n][0];
@@ -1469,11 +1462,11 @@ static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
if (is_compound)
this_mv[1].as_int = frame_mv[mode][mi->ref_frame[1]].as_int;
break;
- case ZEROMV:
+ default:
+ assert(mode == ZEROMV);
this_mv[0].as_int = 0;
if (is_compound) this_mv[1].as_int = 0;
break;
- default: break;
}
mi->bmi[i].as_mv[0].as_int = this_mv[0].as_int;
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index c0f985cbd..90da68726 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -817,6 +817,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) {
// Some speed-up features even for best quality as minimal impact on quality.
sf->adaptive_rd_thresh = 1;
sf->tx_size_search_breakout = 1;
+ sf->tx_size_search_depth = 2;
sf->exhaustive_searches_thresh =
(cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20)
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index 15e8dacbd..946bf0545 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -272,6 +272,9 @@ typedef struct SPEED_FEATURES {
// for intra and model coefs for the rest.
TX_SIZE_SEARCH_METHOD tx_size_search_method;
+ // How many levels of tx size to search, starting from the largest.
+ int tx_size_search_depth;
+
// Low precision 32x32 fdct keeps everything in 16 bits and thus is less
// precise but significantly faster than the non lp version.
int use_lp32x32fdct;
diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c
index 42a197769..07d1995a8 100644
--- a/vp9/encoder/vp9_svc_layercontext.c
+++ b/vp9/encoder/vp9_svc_layercontext.c
@@ -46,9 +46,9 @@ void vp9_init_layer_context(VP9_COMP *const cpi) {
svc->last_layer_dropped[sl] = 0;
svc->drop_spatial_layer[sl] = 0;
svc->ext_frame_flags[sl] = 0;
- svc->ext_lst_fb_idx[sl] = 0;
- svc->ext_gld_fb_idx[sl] = 1;
- svc->ext_alt_fb_idx[sl] = 2;
+ svc->lst_fb_idx[sl] = 0;
+ svc->gld_fb_idx[sl] = 1;
+ svc->alt_fb_idx[sl] = 2;
svc->downsample_filter_type[sl] = BILINEAR;
svc->downsample_filter_phase[sl] = 8; // Set to 8 for averaging filter.
svc->framedrop_thresh[sl] = oxcf->drop_frames_water_mark;
@@ -407,6 +407,40 @@ void get_layer_resolution(const int width_org, const int height_org,
*height_out = h;
}
+void reset_fb_idx_unused(VP9_COMP *const cpi) {
+  // If a reference frame is neither referenced nor refreshed, point its
+  // fb_idx at the first buffer slot that is used/referenced, so that no
+  // fb_idx is left pointing at a slot that is not needed.
+ static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+ VP9_ALT_FLAG };
+ MV_REFERENCE_FRAME ref_frame;
+ MV_REFERENCE_FRAME first_ref = 0;
+ int first_fb_idx = 0;
+ int fb_idx[3] = { cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx };
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+ if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+ first_ref = ref_frame;
+ first_fb_idx = fb_idx[ref_frame - 1];
+ break;
+ }
+ }
+ if (first_ref > 0) {
+ if (first_ref != LAST_FRAME &&
+ !(cpi->ref_frame_flags & flag_list[LAST_FRAME]) &&
+ !cpi->ext_refresh_last_frame)
+ cpi->lst_fb_idx = first_fb_idx;
+ else if (first_ref != GOLDEN_FRAME &&
+ !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) &&
+ !cpi->ext_refresh_golden_frame)
+ cpi->gld_fb_idx = first_fb_idx;
+ else if (first_ref != ALTREF_FRAME &&
+ !(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]) &&
+ !cpi->ext_refresh_alt_ref_frame)
+ cpi->alt_fb_idx = first_fb_idx;
+ }
+}
+
// The function sets proper ref_frame_flags, buffer indices, and buffer update
// variables for temporal layering mode 3 - that does 0-2-1-2 temporal layering
// scheme.
@@ -510,6 +544,8 @@ static void set_flags_and_fb_idx_for_temporal_mode3(VP9_COMP *const cpi) {
cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1;
cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
}
+
+ reset_fb_idx_unused(cpi);
}
// The function sets proper ref_frame_flags, buffer indices, and buffer update
@@ -569,6 +605,8 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) {
cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1;
cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id;
}
+
+ reset_fb_idx_unused(cpi);
}
// The function sets proper ref_frame_flags, buffer indices, and buffer update
@@ -601,6 +639,28 @@ static void set_flags_and_fb_idx_for_temporal_mode_noLayering(
} else {
cpi->gld_fb_idx = 0;
}
+
+ reset_fb_idx_unused(cpi);
+}
+
+void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) {
+ SVC *const svc = &cpi->svc;
+ static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+ VP9_ALT_FLAG };
+ int sl = svc->spatial_layer_id;
+ svc->lst_fb_idx[sl] = cpi->lst_fb_idx;
+ svc->gld_fb_idx[sl] = cpi->gld_fb_idx;
+ svc->alt_fb_idx[sl] = cpi->alt_fb_idx;
+
+ svc->update_last[sl] = (uint8_t)cpi->refresh_last_frame;
+ svc->update_golden[sl] = (uint8_t)cpi->refresh_golden_frame;
+ svc->update_altref[sl] = (uint8_t)cpi->refresh_alt_ref_frame;
+ svc->reference_last[sl] =
+ (uint8_t)(cpi->ref_frame_flags & flag_list[LAST_FRAME]);
+ svc->reference_golden[sl] =
+ (uint8_t)(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]);
+ svc->reference_altref[sl] =
+ (uint8_t)(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]);
}
int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
@@ -637,18 +697,30 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode;
sl = cpi->svc.spatial_layer_id;
vp9_apply_encoding_flags(cpi, cpi->svc.ext_frame_flags[sl]);
- cpi->lst_fb_idx = cpi->svc.ext_lst_fb_idx[sl];
- cpi->gld_fb_idx = cpi->svc.ext_gld_fb_idx[sl];
- cpi->alt_fb_idx = cpi->svc.ext_alt_fb_idx[sl];
+ cpi->lst_fb_idx = cpi->svc.lst_fb_idx[sl];
+ cpi->gld_fb_idx = cpi->svc.gld_fb_idx[sl];
+ cpi->alt_fb_idx = cpi->svc.alt_fb_idx[sl];
}
}
// Reset the drop flags for all spatial layers, on the base layer.
if (cpi->svc.spatial_layer_id == 0) {
- int i;
- for (i = 0; i < cpi->svc.number_spatial_layers; i++) {
- cpi->svc.drop_spatial_layer[i] = 0;
+ vp9_zero(cpi->svc.drop_spatial_layer);
+ // TODO(jianj/marpan): Investigate why setting cpi->svc.lst/gld/alt_fb_idx
+ // causes an issue with frame dropping and temporal layers, when the frame
+ // flags are passed via the encode call (bypass mode). Issue is that we're
+ // resetting ext_refresh_frame_flags_pending to 0 on frame drops.
+ if (cpi->svc.temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+ memset(&cpi->svc.lst_fb_idx, -1, sizeof(cpi->svc.lst_fb_idx));
+      memset(&cpi->svc.gld_fb_idx, -1, sizeof(cpi->svc.gld_fb_idx));
+      memset(&cpi->svc.alt_fb_idx, -1, sizeof(cpi->svc.alt_fb_idx));
}
+ vp9_zero(cpi->svc.update_last);
+ vp9_zero(cpi->svc.update_golden);
+ vp9_zero(cpi->svc.update_altref);
+ vp9_zero(cpi->svc.reference_last);
+ vp9_zero(cpi->svc.reference_golden);
+ vp9_zero(cpi->svc.reference_altref);
}
lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id *
@@ -714,6 +786,15 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
if (cpi->svc.spatial_layer_id == 0) cpi->svc.high_source_sad_superframe = 0;
+ if (cpi->svc.temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS &&
+ cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id]) {
+ // For fixed/non-flexible mode, if the previous frame (same spatial layer
+ // from previous superframe) was dropped, make sure the lst_fb_idx
+ // for this frame corresponds to the buffer index updated on (last) encoded
+ // TL0 frame (with same spatial layer).
+ cpi->lst_fb_idx = cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id];
+ }
+
if (vp9_set_size_literal(cpi, width, height) != 0)
return VPX_CODEC_INVALID_PARAM;
@@ -799,3 +880,63 @@ void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) {
}
}
}
+
+void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) {
+ VP9_COMMON *const cm = &cpi->common;
+ // Check for disabling inter-layer (spatial) prediction, if
+ // svc.disable_inter_layer_pred is set. If the previous spatial layer was
+ // dropped then disable the prediction from this (scaled) reference.
+ if ((cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_OFF_NONKEY &&
+ !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) ||
+ cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_OFF ||
+ cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id - 1]) {
+ MV_REFERENCE_FRAME ref_frame;
+ static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+ VP9_ALT_FLAG };
+ for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+ const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
+ if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) {
+ const struct scale_factors *const scale_fac =
+ &cm->frame_refs[ref_frame - 1].sf;
+ if (vp9_is_scaled(scale_fac))
+ cpi->ref_frame_flags &= (~flag_list[ref_frame]);
+ }
+ }
+ }
+ // Check for disabling inter-layer prediction if
+ // INTER_LAYER_PRED_ON_CONSTRAINED is enabled.
+ // If the reference for inter-layer prediction (the reference that is scaled)
+ // is not the previous spatial layer from the same superframe, then we
+ // disable inter-layer prediction.
+ if (cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_ON_CONSTRAINED) {
+ // We only use LAST and GOLDEN for prediction in real-time mode, so we
+ // check both here.
+ MV_REFERENCE_FRAME ref_frame;
+ for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ref_frame++) {
+ struct scale_factors *scale_fac = &cm->frame_refs[ref_frame - 1].sf;
+ if (vp9_is_scaled(scale_fac)) {
+ // If this reference was updated on the previous spatial layer of the
+ // current superframe, then we keep this reference (don't disable).
+ // Otherwise we disable the inter-layer prediction.
+      // This is verified by checking whether the current frame buffer index
+      // matches any of the previous spatial layer's slots and, if so, whether
+      // that slot was updated/refreshed. If it was, this reference is valid
+      // for inter-layer prediction under the mode
+      // INTER_LAYER_PRED_ON_CONSTRAINED.
+ int fb_idx =
+ ref_frame == LAST_FRAME ? cpi->lst_fb_idx : cpi->gld_fb_idx;
+ int ref_flag = ref_frame == LAST_FRAME ? VP9_LAST_FLAG : VP9_GOLD_FLAG;
+ int sl = cpi->svc.spatial_layer_id;
+ int disable = 1;
+ if ((fb_idx == cpi->svc.lst_fb_idx[sl - 1] &&
+ cpi->svc.update_last[sl - 1]) ||
+ (fb_idx == cpi->svc.gld_fb_idx[sl - 1] &&
+ cpi->svc.update_golden[sl - 1]) ||
+ (fb_idx == cpi->svc.alt_fb_idx[sl - 1] &&
+ cpi->svc.update_altref[sl - 1]))
+ disable = 0;
+ if (disable) cpi->ref_frame_flags &= (~ref_flag);
+ }
+ }
+ }
+}
diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h
index 022fd00f7..617717049 100644
--- a/vp9/encoder/vp9_svc_layercontext.h
+++ b/vp9/encoder/vp9_svc_layercontext.h
@@ -20,9 +20,16 @@ extern "C" {
#endif
typedef enum {
+  // Inter-layer prediction is enabled on all frames.
INTER_LAYER_PRED_ON,
+  // Inter-layer prediction is disabled on all frames.
INTER_LAYER_PRED_OFF,
- INTER_LAYER_PRED_OFF_NONKEY
+  // Inter-layer prediction is disabled on non-key frames.
+ INTER_LAYER_PRED_OFF_NONKEY,
+  // Inter-layer prediction is enabled on all frames, but constrained such
+  // that any spatial layer S (> 0) can only predict from the previous
+  // spatial layer S-1, within the same superframe.
+ INTER_LAYER_PRED_ON_CONSTRAINED
} INTER_LAYER_PRED;
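The new INTER_LAYER_PRED_ON_CONSTRAINED mode is selected through the existing VP9E_SET_SVC_INTER_LAYER_PRED control (declared in vp8cx.h as taking an unsigned int). A minimal usage sketch, assuming an already-initialized vpx_codec_ctx_t named codec; since this enum lives in an encoder-internal header, an application passes the numeric value:

/* Sketch: constrain inter-layer prediction to the previous spatial layer of
 * the same superframe. With the enum above, INTER_LAYER_PRED_ON_CONSTRAINED
 * has the value 3. */
const vpx_codec_err_t res =
    vpx_codec_control(&codec, VP9E_SET_SVC_INTER_LAYER_PRED, 3u);
if (res != VPX_CODEC_OK) {
  /* handle the error */
}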
typedef struct {
@@ -86,9 +93,9 @@ typedef struct SVC {
// Frame flags and buffer indexes for each spatial layer, set by the
// application (external settings).
int ext_frame_flags[VPX_MAX_LAYERS];
- int ext_lst_fb_idx[VPX_MAX_LAYERS];
- int ext_gld_fb_idx[VPX_MAX_LAYERS];
- int ext_alt_fb_idx[VPX_MAX_LAYERS];
+ int lst_fb_idx[VPX_MAX_LAYERS];
+ int gld_fb_idx[VPX_MAX_LAYERS];
+ int alt_fb_idx[VPX_MAX_LAYERS];
int ref_frame_index[REF_FRAMES];
int force_zero_mode_spatial_ref;
int current_superframe;
@@ -123,6 +130,18 @@ typedef struct SVC {
// currently checked for each superframe prior to encoding, on the full
// resolution source.
int high_source_sad_superframe;
+
+ // Flags used to get SVC pattern info.
+ uint8_t update_last[VPX_SS_MAX_LAYERS];
+ uint8_t update_golden[VPX_SS_MAX_LAYERS];
+ uint8_t update_altref[VPX_SS_MAX_LAYERS];
+ uint8_t reference_last[VPX_SS_MAX_LAYERS];
+ uint8_t reference_golden[VPX_SS_MAX_LAYERS];
+ uint8_t reference_altref[VPX_SS_MAX_LAYERS];
+
+ // Keep track of the frame buffer index updated/refreshed on the base
+ // temporal superframe.
+ uint8_t fb_idx_upd_tl0[VPX_SS_MAX_LAYERS];
} SVC;
struct VP9_COMP;
@@ -170,6 +189,8 @@ struct lookahead_entry *vp9_svc_lookahead_pop(struct VP9_COMP *const cpi,
// Start a frame and initialize svc parameters
int vp9_svc_start_frame(struct VP9_COMP *const cpi);
+void vp9_copy_flags_ref_update_idx(struct VP9_COMP *const cpi);
+
int vp9_one_pass_cbr_svc_start_layer(struct VP9_COMP *const cpi);
void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi);
@@ -178,6 +199,8 @@ void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi);
void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi);
+void vp9_svc_constrain_inter_layer_pred(struct VP9_COMP *const cpi);
+
#ifdef __cplusplus
} // extern "C"
#endif
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index e006606f1..5eaa7a18a 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -1487,6 +1487,25 @@ static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx,
return VPX_CODEC_OK;
}
+static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ VP9_COMP *const cpi = ctx->cpi;
+ vpx_svc_ref_frame_config_t *data = va_arg(args, vpx_svc_ref_frame_config_t *);
+ int sl;
+ for (sl = 0; sl <= cpi->svc.spatial_layer_id; sl++) {
+ data->update_last[sl] = cpi->svc.update_last[sl];
+ data->update_golden[sl] = cpi->svc.update_golden[sl];
+ data->update_alt_ref[sl] = cpi->svc.update_altref[sl];
+ data->reference_last[sl] = cpi->svc.reference_last[sl];
+ data->reference_golden[sl] = cpi->svc.reference_golden[sl];
+ data->reference_alt_ref[sl] = cpi->svc.reference_altref[sl];
+ data->lst_fb_idx[sl] = cpi->svc.lst_fb_idx[sl];
+ data->gld_fb_idx[sl] = cpi->svc.gld_fb_idx[sl];
+ data->alt_fb_idx[sl] = cpi->svc.alt_fb_idx[sl];
+ }
+ return VPX_CODEC_OK;
+}
+
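On the application side the new getter is reached through the VP9E_GET_SVC_REF_FRAME_CONFIG control added further below; a minimal sketch, assuming an initialized vpx_codec_ctx_t named codec and that at least one superframe has been encoded:

vpx_svc_ref_frame_config_t ref_cfg;
if (vpx_codec_control(&codec, VP9E_GET_SVC_REF_FRAME_CONFIG, &ref_cfg) ==
    VPX_CODEC_OK) {
  /* For each encoded spatial layer sl, ref_cfg.update_*[sl],
     ref_cfg.reference_*[sl] and ref_cfg.*_fb_idx[sl] mirror the encoder's
     refresh flags, reference flags and buffer indices. */
}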
static vpx_codec_err_t ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx,
va_list args) {
VP9_COMP *const cpi = ctx->cpi;
@@ -1494,9 +1513,9 @@ static vpx_codec_err_t ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx,
int sl;
for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) {
cpi->svc.ext_frame_flags[sl] = data->frame_flags[sl];
- cpi->svc.ext_lst_fb_idx[sl] = data->lst_fb_idx[sl];
- cpi->svc.ext_gld_fb_idx[sl] = data->gld_fb_idx[sl];
- cpi->svc.ext_alt_fb_idx[sl] = data->alt_fb_idx[sl];
+ cpi->svc.lst_fb_idx[sl] = data->lst_fb_idx[sl];
+ cpi->svc.gld_fb_idx[sl] = data->gld_fb_idx[sl];
+ cpi->svc.alt_fb_idx[sl] = data->alt_fb_idx[sl];
}
return VPX_CODEC_OK;
}
@@ -1613,6 +1632,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
{ VP9E_GET_SVC_LAYER_ID, ctrl_get_svc_layer_id },
{ VP9E_GET_ACTIVEMAP, ctrl_get_active_map },
{ VP9E_GET_LEVEL, ctrl_get_level },
+ { VP9E_GET_SVC_REF_FRAME_CONFIG, ctrl_get_svc_ref_frame_config },
{ -1, NULL },
};
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index f409844b5..b201d96f4 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -620,6 +620,13 @@ enum vp8e_enc_control_id {
* Supported in codecs: VP9
*/
VP9E_SET_SVC_FRAME_DROP_LAYER,
+
+ /*!\brief Codec control function to get the refresh and reference flags and
+ * the buffer indices, up to the last encoded spatial layer.
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_GET_SVC_REF_FRAME_CONFIG,
};
/*!\brief vpx 1-D scaling mode
@@ -757,10 +764,18 @@ typedef struct vpx_svc_layer_id {
*
*/
typedef struct vpx_svc_ref_frame_config {
- int frame_flags[VPX_TS_MAX_LAYERS]; /**< Frame flags. */
- int lst_fb_idx[VPX_TS_MAX_LAYERS]; /**< Last buffer index. */
- int gld_fb_idx[VPX_TS_MAX_LAYERS]; /**< Golden buffer index. */
- int alt_fb_idx[VPX_TS_MAX_LAYERS]; /**< Altref buffer index. */
+ // TODO(jianj/marpan): Remove the usage of frame_flags, instead use the
+ // update and reference flags.
+ int frame_flags[VPX_SS_MAX_LAYERS]; /**< Frame flags. */
+ int lst_fb_idx[VPX_SS_MAX_LAYERS]; /**< Last buffer index. */
+ int gld_fb_idx[VPX_SS_MAX_LAYERS]; /**< Golden buffer index. */
+ int alt_fb_idx[VPX_SS_MAX_LAYERS]; /**< Altref buffer index. */
+ int update_last[VPX_SS_MAX_LAYERS]; /**< Update last. */
+ int update_golden[VPX_SS_MAX_LAYERS]; /**< Update golden. */
+ int update_alt_ref[VPX_SS_MAX_LAYERS]; /**< Update altref. */
+  int reference_last[VPX_SS_MAX_LAYERS]; /**< Last as reference. */
+ int reference_golden[VPX_SS_MAX_LAYERS]; /**< Golden as reference. */
+ int reference_alt_ref[VPX_SS_MAX_LAYERS]; /**< Altref as reference. */
} vpx_svc_ref_frame_config_t;
/*!\brief VP9 svc frame dropping mode.
@@ -927,6 +942,9 @@ VPX_CTRL_USE_TYPE(VP9E_SET_SVC_INTER_LAYER_PRED, unsigned int)
VPX_CTRL_USE_TYPE(VP9E_SET_SVC_FRAME_DROP_LAYER, vpx_svc_frame_drop_t *)
#define VPX_CTRL_VP9E_SET_SVC_FRAME_DROP_LAYER
+VPX_CTRL_USE_TYPE(VP9E_GET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *)
+#define VPX_CTRL_VP9E_GET_SVC_REF_FRAME_CONFIG
+
/*!\endcond */
/*! @} - end defgroup vp8_encoder */
#ifdef __cplusplus
diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h
index 4017e5719..8c08017b6 100644
--- a/vpx/vpx_encoder.h
+++ b/vpx/vpx_encoder.h
@@ -63,7 +63,7 @@ extern "C" {
* fields to structures
*/
#define VPX_ENCODER_ABI_VERSION \
- (11 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
+ (12 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/
/*! \brief Encoder capabilities bitfield
*
diff --git a/vpx_dsp/arm/avg_pred_neon.c b/vpx_dsp/arm/avg_pred_neon.c
index 1370ec2d2..5afdece0a 100644
--- a/vpx_dsp/arm/avg_pred_neon.c
+++ b/vpx_dsp/arm/avg_pred_neon.c
@@ -17,8 +17,8 @@
void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width,
int height, const uint8_t *ref, int ref_stride) {
if (width > 8) {
- int x, y;
- for (y = 0; y < height; ++y) {
+ int x, y = height;
+ do {
for (x = 0; x < width; x += 16) {
const uint8x16_t p = vld1q_u8(pred + x);
const uint8x16_t r = vld1q_u8(ref + x);
@@ -28,28 +28,38 @@ void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width,
comp += width;
pred += width;
ref += ref_stride;
- }
+ } while (--y);
+ } else if (width == 8) {
+ int i = width * height;
+ do {
+ const uint8x16_t p = vld1q_u8(pred);
+ uint8x16_t r;
+ const uint8x8_t r_0 = vld1_u8(ref);
+ const uint8x8_t r_1 = vld1_u8(ref + ref_stride);
+ r = vcombine_u8(r_0, r_1);
+ ref += 2 * ref_stride;
+ r = vrhaddq_u8(r, p);
+ vst1q_u8(comp, r);
+
+ pred += 16;
+ comp += 16;
+ i -= 16;
+ } while (i);
} else {
- int i;
- for (i = 0; i < width * height; i += 16) {
+ int i = width * height;
+ assert(width == 4);
+ do {
const uint8x16_t p = vld1q_u8(pred);
uint8x16_t r;
- if (width == 4) {
- r = load_unaligned_u8q(ref, ref_stride);
- ref += 4 * ref_stride;
- } else {
- const uint8x8_t r_0 = vld1_u8(ref);
- const uint8x8_t r_1 = vld1_u8(ref + ref_stride);
- assert(width == 8);
- r = vcombine_u8(r_0, r_1);
- ref += 2 * ref_stride;
- }
+ r = load_unaligned_u8q(ref, ref_stride);
+ ref += 4 * ref_stride;
r = vrhaddq_u8(r, p);
vst1q_u8(comp, r);
pred += 16;
comp += 16;
- }
+ i -= 16;
+ } while (i);
}
}
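All three width branches above compute the same rounding average via vrhaddq_u8; a scalar model of the kernel (a sketch of the semantics, not the library's C reference copied verbatim) is:

/* Scalar model of vpx_comp_avg_pred_neon(): comp = (pred + ref + 1) >> 1,
 * with comp and pred packed at the block width and ref using ref_stride. */
static void comp_avg_pred_scalar(uint8_t *comp, const uint8_t *pred, int width,
                                 int height, const uint8_t *ref,
                                 int ref_stride) {
  int x, y;
  for (y = 0; y < height; ++y) {
    for (x = 0; x < width; ++x)
      comp[x] = (uint8_t)((pred[x] + ref[x] + 1) >> 1);
    comp += width;
    pred += width;
    ref += ref_stride;
  }
}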
diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h
index 12c0a54c8..6745464d7 100644
--- a/vpx_dsp/arm/mem_neon.h
+++ b/vpx_dsp/arm/mem_neon.h
@@ -101,9 +101,9 @@ static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) {
if (stride == 4) return vld1_u8(buf);
memcpy(&a, buf, 4);
buf += stride;
- a_u32 = vld1_lane_u32(&a, a_u32, 0);
+ a_u32 = vset_lane_u32(a, a_u32, 0);
memcpy(&a, buf, 4);
- a_u32 = vld1_lane_u32(&a, a_u32, 1);
+ a_u32 = vset_lane_u32(a, a_u32, 1);
return vreinterpret_u8_u32(a_u32);
}
@@ -127,16 +127,16 @@ static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) {
if (stride == 4) return vld1q_u8(buf);
memcpy(&a, buf, 4);
buf += stride;
- a_u32 = vld1q_lane_u32(&a, a_u32, 0);
+ a_u32 = vsetq_lane_u32(a, a_u32, 0);
memcpy(&a, buf, 4);
buf += stride;
- a_u32 = vld1q_lane_u32(&a, a_u32, 1);
+ a_u32 = vsetq_lane_u32(a, a_u32, 1);
memcpy(&a, buf, 4);
buf += stride;
- a_u32 = vld1q_lane_u32(&a, a_u32, 2);
+ a_u32 = vsetq_lane_u32(a, a_u32, 2);
memcpy(&a, buf, 4);
buf += stride;
- a_u32 = vld1q_lane_u32(&a, a_u32, 3);
+ a_u32 = vsetq_lane_u32(a, a_u32, 3);
return vreinterpretq_u8_u32(a_u32);
}
diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c
index b04de3aff..535ec0f0d 100644
--- a/vpx_dsp/arm/sad4d_neon.c
+++ b/vpx_dsp/arm/sad4d_neon.c
@@ -10,64 +10,152 @@
#include <arm_neon.h>
+#include <assert.h>
#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
#include "vpx_dsp/arm/mem_neon.h"
#include "vpx_dsp/arm/sum_neon.h"
+static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0,
+ const void *const buf1) {
+ uint32_t a;
+ uint32x2_t aa = vdup_n_u32(0);
+ memcpy(&a, buf0, 4);
+ aa = vset_lane_u32(a, aa, 0);
+ memcpy(&a, buf1, 4);
+ aa = vset_lane_u32(a, aa, 1);
+ return vreinterpret_u8_u32(aa);
+}
+
+static INLINE void sad4x_4d(const uint8_t *const src, const int src_stride,
+ const uint8_t *const ref[4], const int ref_stride,
+ const int height, uint32_t *const res) {
+ int i;
+ uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
+ uint16x4_t a[2];
+ uint32x4_t r;
+
+ assert(!((intptr_t)src % sizeof(uint32_t)));
+ assert(!(src_stride % sizeof(uint32_t)));
+
+ for (i = 0; i < height; ++i) {
+ const uint8x8_t s = vreinterpret_u8_u32(
+ vld1_dup_u32((const uint32_t *)(src + i * src_stride)));
+ const uint8x8_t ref01 = load_unaligned_2_buffers(ref[0] + i * ref_stride,
+ ref[1] + i * ref_stride);
+ const uint8x8_t ref23 = load_unaligned_2_buffers(ref[2] + i * ref_stride,
+ ref[3] + i * ref_stride);
+ abs[0] = vabal_u8(abs[0], s, ref01);
+ abs[1] = vabal_u8(abs[1], s, ref23);
+ }
+
+ a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0]));
+ a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1]));
+ r = vpaddlq_u16(vcombine_u16(a[0], a[1]));
+ vst1q_u32(res, r);
+}
+
void vpx_sad4x4x4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t *res) {
- int i;
- const uint8x16_t src_u8 = load_unaligned_u8q(src, src_stride);
- for (i = 0; i < 4; ++i) {
- const uint8x16_t ref_u8 = load_unaligned_u8q(ref[i], ref_stride);
- uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8));
- abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8));
- res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0);
- }
+ sad4x_4d(src, src_stride, ref, ref_stride, 4, res);
}
void vpx_sad4x8x4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t *res) {
- int i;
- const uint8x16_t src_0 = load_unaligned_u8q(src, src_stride);
- const uint8x16_t src_1 = load_unaligned_u8q(src + 4 * src_stride, src_stride);
- for (i = 0; i < 4; ++i) {
- const uint8x16_t ref_0 = load_unaligned_u8q(ref[i], ref_stride);
- const uint8x16_t ref_1 =
- load_unaligned_u8q(ref[i] + 4 * ref_stride, ref_stride);
- uint16x8_t abs = vabdl_u8(vget_low_u8(src_0), vget_low_u8(ref_0));
- abs = vabal_u8(abs, vget_high_u8(src_0), vget_high_u8(ref_0));
- abs = vabal_u8(abs, vget_low_u8(src_1), vget_low_u8(ref_1));
- abs = vabal_u8(abs, vget_high_u8(src_1), vget_high_u8(ref_1));
- res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0);
- }
+ sad4x_4d(src, src_stride, ref, ref_stride, 8, res);
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// Can handle 512 pixels' sad sum (such as 16x32 or 32x16)
+static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/,
+ uint32_t *const res) {
+ const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
+ const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
+ const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
+ const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
+ const uint16x4_t b0 = vpadd_u16(a0, a1);
+ const uint16x4_t b1 = vpadd_u16(a2, a3);
+ const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1));
+ vst1q_u32(res, r);
}
-static INLINE void sad8x_4d(const uint8_t *a, int a_stride,
- const uint8_t *const b[4], int b_stride,
- uint32_t *result, const int height) {
+// Can handle 1024 pixels' sad sum (such as 32x32)
+static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/,
+ uint32_t *const res) {
+ const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
+ const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
+ const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2]));
+ const uint16x4_t a3 = vpadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3]));
+ const uint32x4_t b0 = vpaddlq_u16(vcombine_u16(a0, a1));
+ const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3));
+ const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0));
+ const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1));
+ vst1q_u32(res, vcombine_u32(c0, c1));
+}
+
+// Can handle 2048 pixels' sad sum (such as 32x64 or 64x32)
+static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/,
+ uint32_t *const res) {
+ const uint32x4_t a0 = vpaddlq_u16(sum[0]);
+ const uint32x4_t a1 = vpaddlq_u16(sum[1]);
+ const uint32x4_t a2 = vpaddlq_u16(sum[2]);
+ const uint32x4_t a3 = vpaddlq_u16(sum[3]);
+ const uint32x2_t b0 = vadd_u32(vget_low_u32(a0), vget_high_u32(a0));
+ const uint32x2_t b1 = vadd_u32(vget_low_u32(a1), vget_high_u32(a1));
+ const uint32x2_t b2 = vadd_u32(vget_low_u32(a2), vget_high_u32(a2));
+ const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3));
+ const uint32x2_t c0 = vpadd_u32(b0, b1);
+ const uint32x2_t c1 = vpadd_u32(b2, b3);
+ vst1q_u32(res, vcombine_u32(c0, c1));
+}
+
+// Can handle 4096 pixels' sad sum (such as 64x64)
+static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/,
+ uint32_t *const res) {
+ const uint32x4_t a0 = vpaddlq_u16(sum[0]);
+ const uint32x4_t a1 = vpaddlq_u16(sum[1]);
+ const uint32x4_t a2 = vpaddlq_u16(sum[2]);
+ const uint32x4_t a3 = vpaddlq_u16(sum[3]);
+ const uint32x4_t a4 = vpaddlq_u16(sum[4]);
+ const uint32x4_t a5 = vpaddlq_u16(sum[5]);
+ const uint32x4_t a6 = vpaddlq_u16(sum[6]);
+ const uint32x4_t a7 = vpaddlq_u16(sum[7]);
+ const uint32x4_t b0 = vaddq_u32(a0, a1);
+ const uint32x4_t b1 = vaddq_u32(a2, a3);
+ const uint32x4_t b2 = vaddq_u32(a4, a5);
+ const uint32x4_t b3 = vaddq_u32(a6, a7);
+ const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0));
+ const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1));
+ const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2));
+ const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3));
+ const uint32x2_t d0 = vpadd_u32(c0, c1);
+ const uint32x2_t d1 = vpadd_u32(c2, c3);
+ vst1q_u32(res, vcombine_u32(d0, d1));
+}
+
+static INLINE void sad8x_4d(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t *res, const int height) {
int i, j;
+ const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };
uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
- const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] };
for (i = 0; i < height; ++i) {
- const uint8x8_t a_u8 = vld1_u8(a);
- a += a_stride;
+ const uint8x8_t s = vld1_u8(src);
+ src += src_stride;
for (j = 0; j < 4; ++j) {
- const uint8x8_t b_u8 = vld1_u8(b_loop[j]);
- b_loop[j] += b_stride;
- sum[j] = vabal_u8(sum[j], a_u8, b_u8);
+ const uint8x8_t b_u8 = vld1_u8(ref_loop[j]);
+ ref_loop[j] += ref_stride;
+ sum[j] = vabal_u8(sum[j], s, b_u8);
}
}
- for (j = 0; j < 4; ++j) {
- result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0);
- }
+ sad_512_pel_final_neon(sum, res);
}
void vpx_sad8x4x4d_neon(const uint8_t *src, int src_stride,
@@ -88,28 +176,33 @@ void vpx_sad8x16x4d_neon(const uint8_t *src, int src_stride,
sad8x_4d(src, src_stride, ref, ref_stride, res, 16);
}
-static INLINE void sad16x_4d(const uint8_t *a, int a_stride,
- const uint8_t *const b[4], int b_stride,
- uint32_t *result, const int height) {
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE void sad16_neon(const uint8_t *ref, const uint8x16_t src,
+ uint16x8_t *const sum) {
+ const uint8x16_t r = vld1q_u8(ref);
+ *sum = vabal_u8(*sum, vget_low_u8(src), vget_low_u8(r));
+ *sum = vabal_u8(*sum, vget_high_u8(src), vget_high_u8(r));
+}
+
+static INLINE void sad16x_4d(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t *res, const int height) {
int i, j;
+ const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };
uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
vdupq_n_u16(0) };
- const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] };
for (i = 0; i < height; ++i) {
- const uint8x16_t a_u8 = vld1q_u8(a);
- a += a_stride;
+ const uint8x16_t s = vld1q_u8(src);
+ src += src_stride;
for (j = 0; j < 4; ++j) {
- const uint8x16_t b_u8 = vld1q_u8(b_loop[j]);
- b_loop[j] += b_stride;
- sum[j] = vabal_u8(sum[j], vget_low_u8(a_u8), vget_low_u8(b_u8));
- sum[j] = vabal_u8(sum[j], vget_high_u8(a_u8), vget_high_u8(b_u8));
+ sad16_neon(ref_loop[j], s, &sum[j]);
+ ref_loop[j] += ref_stride;
}
}
- for (j = 0; j < 4; ++j) {
- result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0);
- }
+ sad_512_pel_final_neon(sum, res);
}
void vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride,
@@ -130,113 +223,152 @@ void vpx_sad16x32x4d_neon(const uint8_t *src, int src_stride,
sad16x_4d(src, src_stride, ref, ref_stride, res, 32);
}
-static INLINE void sad32x_4d(const uint8_t *a, int a_stride,
- const uint8_t *const b[4], int b_stride,
- uint32_t *result, const int height) {
- int i, j;
- uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
- vdupq_n_u16(0) };
- const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] };
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE void sad32x_4d(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ const int height, uint16x8_t *const sum) {
+ int i;
+ const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };
+
+ sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0);
for (i = 0; i < height; ++i) {
- const uint8x16_t a_0 = vld1q_u8(a);
- const uint8x16_t a_1 = vld1q_u8(a + 16);
- a += a_stride;
- for (j = 0; j < 4; ++j) {
- const uint8x16_t b_0 = vld1q_u8(b_loop[j]);
- const uint8x16_t b_1 = vld1q_u8(b_loop[j] + 16);
- b_loop[j] += b_stride;
- sum[j] = vabal_u8(sum[j], vget_low_u8(a_0), vget_low_u8(b_0));
- sum[j] = vabal_u8(sum[j], vget_high_u8(a_0), vget_high_u8(b_0));
- sum[j] = vabal_u8(sum[j], vget_low_u8(a_1), vget_low_u8(b_1));
- sum[j] = vabal_u8(sum[j], vget_high_u8(a_1), vget_high_u8(b_1));
- }
- }
+ uint8x16_t s;
- for (j = 0; j < 4; ++j) {
- result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0);
+ s = vld1q_u8(src + 0 * 16);
+ sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
+ sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
+ sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
+ sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
+
+ s = vld1q_u8(src + 1 * 16);
+ sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
+ sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
+ sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
+ sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
+
+ src += src_stride;
+ ref_loop[0] += ref_stride;
+ ref_loop[1] += ref_stride;
+ ref_loop[2] += ref_stride;
+ ref_loop[3] += ref_stride;
}
}
void vpx_sad32x16x4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t *res) {
- sad32x_4d(src, src_stride, ref, ref_stride, res, 16);
+ uint16x8_t sum[4];
+ sad32x_4d(src, src_stride, ref, ref_stride, 16, sum);
+ sad_512_pel_final_neon(sum, res);
}
void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t *res) {
- sad32x_4d(src, src_stride, ref, ref_stride, res, 32);
+ uint16x8_t sum[4];
+ sad32x_4d(src, src_stride, ref, ref_stride, 32, sum);
+ sad_1024_pel_final_neon(sum, res);
}
void vpx_sad32x64x4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t *res) {
- sad32x_4d(src, src_stride, ref, ref_stride, res, 64);
+ uint16x8_t sum[4];
+ sad32x_4d(src, src_stride, ref, ref_stride, 64, sum);
+ sad_2048_pel_final_neon(sum, res);
}
-static INLINE void sum64x(const uint8x16_t a_0, const uint8x16_t a_1,
- const uint8x16_t b_0, const uint8x16_t b_1,
- uint16x8_t *sum) {
- *sum = vabal_u8(*sum, vget_low_u8(a_0), vget_low_u8(b_0));
- *sum = vabal_u8(*sum, vget_high_u8(a_0), vget_high_u8(b_0));
- *sum = vabal_u8(*sum, vget_low_u8(a_1), vget_low_u8(b_1));
- *sum = vabal_u8(*sum, vget_high_u8(a_1), vget_high_u8(b_1));
-}
+////////////////////////////////////////////////////////////////////////////////
-static INLINE void sad64x_4d(const uint8_t *a, int a_stride,
- const uint8_t *const b[4], int b_stride,
- uint32_t *result, const int height) {
+void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride,
+ const uint8_t *const ref[4], int ref_stride,
+ uint32_t *res) {
int i;
- uint16x8_t sum_0 = vdupq_n_u16(0);
- uint16x8_t sum_1 = vdupq_n_u16(0);
- uint16x8_t sum_2 = vdupq_n_u16(0);
- uint16x8_t sum_3 = vdupq_n_u16(0);
- uint16x8_t sum_4 = vdupq_n_u16(0);
- uint16x8_t sum_5 = vdupq_n_u16(0);
- uint16x8_t sum_6 = vdupq_n_u16(0);
- uint16x8_t sum_7 = vdupq_n_u16(0);
- const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] };
+ const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };
+ uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0) };
- for (i = 0; i < height; ++i) {
- const uint8x16_t a_0 = vld1q_u8(a);
- const uint8x16_t a_1 = vld1q_u8(a + 16);
- const uint8x16_t a_2 = vld1q_u8(a + 32);
- const uint8x16_t a_3 = vld1q_u8(a + 48);
- a += a_stride;
- sum64x(a_0, a_1, vld1q_u8(b_loop[0]), vld1q_u8(b_loop[0] + 16), &sum_0);
- sum64x(a_2, a_3, vld1q_u8(b_loop[0] + 32), vld1q_u8(b_loop[0] + 48),
- &sum_1);
- b_loop[0] += b_stride;
- sum64x(a_0, a_1, vld1q_u8(b_loop[1]), vld1q_u8(b_loop[1] + 16), &sum_2);
- sum64x(a_2, a_3, vld1q_u8(b_loop[1] + 32), vld1q_u8(b_loop[1] + 48),
- &sum_3);
- b_loop[1] += b_stride;
- sum64x(a_0, a_1, vld1q_u8(b_loop[2]), vld1q_u8(b_loop[2] + 16), &sum_4);
- sum64x(a_2, a_3, vld1q_u8(b_loop[2] + 32), vld1q_u8(b_loop[2] + 48),
- &sum_5);
- b_loop[2] += b_stride;
- sum64x(a_0, a_1, vld1q_u8(b_loop[3]), vld1q_u8(b_loop[3] + 16), &sum_6);
- sum64x(a_2, a_3, vld1q_u8(b_loop[3] + 32), vld1q_u8(b_loop[3] + 48),
- &sum_7);
- b_loop[3] += b_stride;
- }
+ for (i = 0; i < 32; ++i) {
+ uint8x16_t s;
- result[0] = vget_lane_u32(horizontal_add_long_uint16x8(sum_0, sum_1), 0);
- result[1] = vget_lane_u32(horizontal_add_long_uint16x8(sum_2, sum_3), 0);
- result[2] = vget_lane_u32(horizontal_add_long_uint16x8(sum_4, sum_5), 0);
- result[3] = vget_lane_u32(horizontal_add_long_uint16x8(sum_6, sum_7), 0);
-}
+ s = vld1q_u8(src + 0 * 16);
+ sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
+ sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]);
+ sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]);
+ sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]);
-void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride,
- const uint8_t *const ref[4], int ref_stride,
- uint32_t *res) {
- sad64x_4d(src, src_stride, ref, ref_stride, res, 32);
+ s = vld1q_u8(src + 1 * 16);
+ sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
+ sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]);
+ sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]);
+ sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]);
+
+ s = vld1q_u8(src + 2 * 16);
+ sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]);
+ sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]);
+ sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]);
+ sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]);
+
+ s = vld1q_u8(src + 3 * 16);
+ sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]);
+ sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]);
+ sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]);
+ sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]);
+
+ src += src_stride;
+ ref_loop[0] += ref_stride;
+ ref_loop[1] += ref_stride;
+ ref_loop[2] += ref_stride;
+ ref_loop[3] += ref_stride;
+ }
+
+ sad_2048_pel_final_neon(sum, res);
}
void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride,
const uint8_t *const ref[4], int ref_stride,
uint32_t *res) {
- sad64x_4d(src, src_stride, ref, ref_stride, res, 64);
+ int i;
+ const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] };
+ uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0),
+ vdupq_n_u16(0), vdupq_n_u16(0) };
+
+ for (i = 0; i < 64; ++i) {
+ uint8x16_t s;
+
+ s = vld1q_u8(src + 0 * 16);
+ sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]);
+ sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]);
+ sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]);
+ sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]);
+
+ s = vld1q_u8(src + 1 * 16);
+ sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]);
+ sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]);
+ sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]);
+ sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]);
+
+ s = vld1q_u8(src + 2 * 16);
+ sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]);
+ sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]);
+ sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]);
+ sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]);
+
+ s = vld1q_u8(src + 3 * 16);
+ sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]);
+ sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]);
+ sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]);
+ sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]);
+
+ src += src_stride;
+ ref_loop[0] += ref_stride;
+ ref_loop[1] += ref_stride;
+ ref_loop[2] += ref_stride;
+ ref_loop[3] += ref_stride;
+ }
+
+ sad_4096_pel_final_neon(sum, res);
}
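Every kernel in this file computes the same four sums with different accumulator layouts; a scalar model of the x4d SAD (a sketch, not the library's C reference copied verbatim) is:

/* Scalar model of the x4d kernels: res[j] is the sum of absolute differences
 * between the width x height source block and reference block j. */
static void sad_x4d_scalar(const uint8_t *src, int src_stride,
                           const uint8_t *const ref[4], int ref_stride,
                           int width, int height, uint32_t *res) {
  int j, r, c;
  for (j = 0; j < 4; ++j) {
    uint32_t sad = 0;
    for (r = 0; r < height; ++r) {
      for (c = 0; c < width; ++c) {
        const int d = src[r * src_stride + c] - ref[j][r * ref_stride + c];
        sad += (d < 0) ? -d : d;
      }
    }
    res[j] = sad;
  }
}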
diff --git a/vpx_dsp/arm/subtract_neon.c b/vpx_dsp/arm/subtract_neon.c
index ce81fb630..7e4610d2e 100644
--- a/vpx_dsp/arm/subtract_neon.c
+++ b/vpx_dsp/arm/subtract_neon.c
@@ -9,71 +9,72 @@
*/
#include <arm_neon.h>
+#include <assert.h>
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
+#include "vpx_dsp/arm/mem_neon.h"
void vpx_subtract_block_neon(int rows, int cols, int16_t *diff,
ptrdiff_t diff_stride, const uint8_t *src,
ptrdiff_t src_stride, const uint8_t *pred,
ptrdiff_t pred_stride) {
- int r, c;
+ int r = rows, c;
if (cols > 16) {
- for (r = 0; r < rows; ++r) {
+ do {
for (c = 0; c < cols; c += 32) {
- const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]);
- const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]);
- const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]);
- const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]);
- const uint16x8_t v_diff_lo_00 =
- vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00));
- const uint16x8_t v_diff_hi_00 =
- vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00));
- const uint16x8_t v_diff_lo_16 =
- vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16));
- const uint16x8_t v_diff_hi_16 =
- vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16));
- vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00));
- vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00));
- vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16));
- vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16));
+ const uint8x16_t s0 = vld1q_u8(&src[c + 0]);
+ const uint8x16_t s1 = vld1q_u8(&src[c + 16]);
+ const uint8x16_t p0 = vld1q_u8(&pred[c + 0]);
+ const uint8x16_t p1 = vld1q_u8(&pred[c + 16]);
+ const uint16x8_t d0 = vsubl_u8(vget_low_u8(s0), vget_low_u8(p0));
+ const uint16x8_t d1 = vsubl_u8(vget_high_u8(s0), vget_high_u8(p0));
+ const uint16x8_t d2 = vsubl_u8(vget_low_u8(s1), vget_low_u8(p1));
+ const uint16x8_t d3 = vsubl_u8(vget_high_u8(s1), vget_high_u8(p1));
+ vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(d0));
+ vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(d1));
+ vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(d2));
+ vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(d3));
}
diff += diff_stride;
pred += pred_stride;
src += src_stride;
- }
+ } while (--r);
} else if (cols > 8) {
- for (r = 0; r < rows; ++r) {
- const uint8x16_t v_src = vld1q_u8(&src[0]);
- const uint8x16_t v_pred = vld1q_u8(&pred[0]);
- const uint16x8_t v_diff_lo =
- vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred));
- const uint16x8_t v_diff_hi =
- vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred));
- vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo));
- vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi));
+ do {
+ const uint8x16_t s = vld1q_u8(&src[0]);
+ const uint8x16_t p = vld1q_u8(&pred[0]);
+ const uint16x8_t d0 = vsubl_u8(vget_low_u8(s), vget_low_u8(p));
+ const uint16x8_t d1 = vsubl_u8(vget_high_u8(s), vget_high_u8(p));
+ vst1q_s16(&diff[0], vreinterpretq_s16_u16(d0));
+ vst1q_s16(&diff[8], vreinterpretq_s16_u16(d1));
diff += diff_stride;
pred += pred_stride;
src += src_stride;
- }
+ } while (--r);
} else if (cols > 4) {
- for (r = 0; r < rows; ++r) {
- const uint8x8_t v_src = vld1_u8(&src[0]);
- const uint8x8_t v_pred = vld1_u8(&pred[0]);
- const uint16x8_t v_diff = vsubl_u8(v_src, v_pred);
+ do {
+ const uint8x8_t s = vld1_u8(&src[0]);
+ const uint8x8_t p = vld1_u8(&pred[0]);
+ const uint16x8_t v_diff = vsubl_u8(s, p);
vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff));
diff += diff_stride;
pred += pred_stride;
src += src_stride;
- }
+ } while (--r);
} else {
- for (r = 0; r < rows; ++r) {
- for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c];
-
- diff += diff_stride;
- pred += pred_stride;
- src += src_stride;
- }
+ assert(cols == 4);
+ do {
+ const uint8x8_t s = load_unaligned_u8(src, src_stride);
+ const uint8x8_t p = load_unaligned_u8(pred, pred_stride);
+ const uint16x8_t d = vsubl_u8(s, p);
+ vst1_s16(diff + 0 * diff_stride, vreinterpret_s16_u16(vget_low_u16(d)));
+ vst1_s16(diff + 1 * diff_stride, vreinterpret_s16_u16(vget_high_u16(d)));
+ diff += 2 * diff_stride;
+ pred += 2 * pred_stride;
+ src += 2 * src_stride;
+ r -= 2;
+ } while (r);
}
}
diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h
index d74fe0cde..c09841223 100644
--- a/vpx_dsp/arm/sum_neon.h
+++ b/vpx_dsp/arm/sum_neon.h
@@ -30,15 +30,6 @@ static INLINE uint32x2_t horizontal_add_uint16x8(const uint16x8_t a) {
vreinterpret_u32_u64(vget_high_u64(c)));
}
-static INLINE uint32x2_t horizontal_add_long_uint16x8(const uint16x8_t a,
- const uint16x8_t b) {
- const uint32x4_t c = vpaddlq_u16(a);
- const uint32x4_t d = vpadalq_u16(c, b);
- const uint64x2_t e = vpaddlq_u32(d);
- return vadd_u32(vreinterpret_u32_u64(vget_low_u64(e)),
- vreinterpret_u32_u64(vget_high_u64(e)));
-}
-
static INLINE uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) {
const uint64x2_t b = vpaddlq_u32(a);
return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)),
diff --git a/vpx_dsp/arm/sum_squares_neon.c b/vpx_dsp/arm/sum_squares_neon.c
new file mode 100644
index 000000000..8942ba83b
--- /dev/null
+++ b/vpx_dsp/arm/sum_squares_neon.c
@@ -0,0 +1,85 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+
+uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size) {
+ int64x1_t s2;
+
+ if (size == 4) {
+ int16x4_t s[4];
+ int32x4_t s0;
+ uint32x2_t s1;
+
+ s[0] = vld1_s16(src + 0 * stride);
+ s[1] = vld1_s16(src + 1 * stride);
+ s[2] = vld1_s16(src + 2 * stride);
+ s[3] = vld1_s16(src + 3 * stride);
+ s0 = vmull_s16(s[0], s[0]);
+ s0 = vmlal_s16(s0, s[1], s[1]);
+ s0 = vmlal_s16(s0, s[2], s[2]);
+ s0 = vmlal_s16(s0, s[3], s[3]);
+ s1 = vpadd_u32(vget_low_u32(vreinterpretq_u32_s32(s0)),
+ vget_high_u32(vreinterpretq_u32_s32(s0)));
+ s2 = vpaddl_u32(s1);
+ } else {
+ int r = size;
+ uint64x2_t s1 = vdupq_n_u64(0);
+
+ do {
+ int c = size;
+ int32x4_t s0 = vdupq_n_s32(0);
+ const int16_t *src_t = src;
+
+ do {
+ int16x8_t s[8];
+
+ s[0] = vld1q_s16(src_t + 0 * stride);
+ s[1] = vld1q_s16(src_t + 1 * stride);
+ s[2] = vld1q_s16(src_t + 2 * stride);
+ s[3] = vld1q_s16(src_t + 3 * stride);
+ s[4] = vld1q_s16(src_t + 4 * stride);
+ s[5] = vld1q_s16(src_t + 5 * stride);
+ s[6] = vld1q_s16(src_t + 6 * stride);
+ s[7] = vld1q_s16(src_t + 7 * stride);
+ s0 = vmlal_s16(s0, vget_low_s16(s[0]), vget_low_s16(s[0]));
+ s0 = vmlal_s16(s0, vget_low_s16(s[1]), vget_low_s16(s[1]));
+ s0 = vmlal_s16(s0, vget_low_s16(s[2]), vget_low_s16(s[2]));
+ s0 = vmlal_s16(s0, vget_low_s16(s[3]), vget_low_s16(s[3]));
+ s0 = vmlal_s16(s0, vget_low_s16(s[4]), vget_low_s16(s[4]));
+ s0 = vmlal_s16(s0, vget_low_s16(s[5]), vget_low_s16(s[5]));
+ s0 = vmlal_s16(s0, vget_low_s16(s[6]), vget_low_s16(s[6]));
+ s0 = vmlal_s16(s0, vget_low_s16(s[7]), vget_low_s16(s[7]));
+ s0 = vmlal_s16(s0, vget_high_s16(s[0]), vget_high_s16(s[0]));
+ s0 = vmlal_s16(s0, vget_high_s16(s[1]), vget_high_s16(s[1]));
+ s0 = vmlal_s16(s0, vget_high_s16(s[2]), vget_high_s16(s[2]));
+ s0 = vmlal_s16(s0, vget_high_s16(s[3]), vget_high_s16(s[3]));
+ s0 = vmlal_s16(s0, vget_high_s16(s[4]), vget_high_s16(s[4]));
+ s0 = vmlal_s16(s0, vget_high_s16(s[5]), vget_high_s16(s[5]));
+ s0 = vmlal_s16(s0, vget_high_s16(s[6]), vget_high_s16(s[6]));
+ s0 = vmlal_s16(s0, vget_high_s16(s[7]), vget_high_s16(s[7]));
+ src_t += 8;
+ c -= 8;
+ } while (c);
+
+ s1 = vaddw_u32(s1, vget_low_u32(vreinterpretq_u32_s32(s0)));
+ s1 = vaddw_u32(s1, vget_high_u32(vreinterpretq_u32_s32(s0)));
+ src += 8 * stride;
+ r -= 8;
+ } while (r);
+
+ s2 = vadd_u64(vget_low_u64(s1), vget_high_u64(s1));
+ }
+
+ return vget_lane_u64(s2, 0);
+}
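The routine above special-cases size == 4 and otherwise consumes rows and columns in groups of 8, so it assumes size is 4 or a multiple of 8; a scalar model of the value it returns, as a sketch:

/* Scalar model: sum of src[r * stride + c]^2 over the size x size block. */
static uint64_t sum_squares_2d_i16_scalar(const int16_t *src, int stride,
                                          int size) {
  uint64_t ss = 0;
  int r, c;
  for (r = 0; r < size; ++r)
    for (c = 0; c < size; ++c)
      ss += (uint64_t)((int32_t)src[r * stride + c] * src[r * stride + c]);
  return ss;
}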
diff --git a/vpx_dsp/mips/vpx_convolve8_mmi.c b/vpx_dsp/mips/vpx_convolve8_mmi.c
index 0cfb81e4d..ba9ceb866 100644
--- a/vpx_dsp/mips/vpx_convolve8_mmi.c
+++ b/vpx_dsp/mips/vpx_convolve8_mmi.c
@@ -254,6 +254,89 @@ static void convolve_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
);
}
+static void convolve_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int x_step_q4, int32_t w, int32_t h) {
+ const int16_t *filter_x = filter[x0_q4];
+ double ftmp[14];
+ uint32_t tmp[2];
+ uint32_t para[2];
+ para[0] = (1 << ((FILTER_BITS)-1));
+ para[1] = FILTER_BITS;
+ src -= SUBPEL_TAPS / 2 - 1;
+ src_stride -= w;
+ dst_stride -= w;
+ (void)x_step_q4;
+
+ __asm__ volatile(
+ "move %[tmp1], %[width] \n\t"
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "gsldlc1 %[filter1], 0x03(%[filter]) \n\t"
+ "gsldrc1 %[filter1], 0x00(%[filter]) \n\t"
+ "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t"
+ "gsldrc1 %[filter2], 0x08(%[filter]) \n\t"
+ "1: \n\t"
+ /* Get 8 data per row */
+ "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t"
+ "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t"
+ "gsldlc1 %[ftmp7], 0x08(%[src]) \n\t"
+ "gsldrc1 %[ftmp7], 0x01(%[src]) \n\t"
+ "gsldlc1 %[ftmp9], 0x09(%[src]) \n\t"
+ "gsldrc1 %[ftmp9], 0x02(%[src]) \n\t"
+ "gsldlc1 %[ftmp11], 0x0A(%[src]) \n\t"
+ "gsldrc1 %[ftmp11], 0x03(%[src]) \n\t"
+ "punpcklbh %[ftmp4], %[ftmp5], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp6], %[ftmp7], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp8], %[ftmp9], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp10], %[ftmp11], %[ftmp0] \n\t"
+ "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t"
+ MMI_ADDIU(%[width], %[width], -0x04)
+ /* Get raw data */
+ GET_DATA_H_MMI
+ ROUND_POWER_OF_TWO_MMI
+ CLIP_PIXEL_MMI
+ "punpcklbh %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
+ "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t"
+ "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t"
+ "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t"
+ "paddh %[ftmp12], %[ftmp12], %[ftmp4] \n\t"
+ "li %[tmp0], 0x10001 \n\t"
+ MMI_MTC1(%[tmp0], %[ftmp5])
+ "punpcklhw %[ftmp5], %[ftmp5], %[ftmp5] \n\t"
+ "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t"
+ "psrah %[ftmp12], %[ftmp12], %[ftmp5] \n\t"
+ "packushb %[ftmp12], %[ftmp12], %[ftmp0] \n\t"
+ "swc1 %[ftmp12], 0x00(%[dst]) \n\t"
+ MMI_ADDIU(%[dst], %[dst], 0x04)
+ MMI_ADDIU(%[src], %[src], 0x04)
+ /* Loop count */
+ "bnez %[width], 1b \n\t"
+ "move %[width], %[tmp1] \n\t"
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDU(%[dst], %[dst], %[dst_stride])
+ MMI_ADDIU(%[height], %[height], -0x01)
+ "bnez %[height], 1b \n\t"
+ : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]),
+ [filter1]"=&f"(ftmp[2]), [filter2]"=&f"(ftmp[3]),
+ [ftmp0]"=&f"(ftmp[4]), [ftmp4]"=&f"(ftmp[5]),
+ [ftmp5]"=&f"(ftmp[6]), [ftmp6]"=&f"(ftmp[7]),
+ [ftmp7]"=&f"(ftmp[8]), [ftmp8]"=&f"(ftmp[9]),
+ [ftmp9]"=&f"(ftmp[10]), [ftmp10]"=&f"(ftmp[11]),
+ [ftmp11]"=&f"(ftmp[12]), [ftmp12]"=&f"(ftmp[13]),
+ [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
+ [src]"+&r"(src), [width]"+&r"(w),
+ [dst]"+&r"(dst), [height]"+&r"(h)
+ : [filter]"r"(filter_x), [para]"r"(para),
+ [src_stride]"r"((mips_reg)src_stride),
+ [dst_stride]"r"((mips_reg)dst_stride)
+ : "memory"
+ );
+}
+
static void convolve_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel *filter, int y0_q4,
@@ -362,52 +445,63 @@ void vpx_convolve_avg_mmi(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel *filter, int x0_q4, int x_step_q4,
int y0_q4, int y_step_q4, int w, int h) {
- double ftmp[4];
- uint32_t tmp[2];
- src_stride -= w;
- dst_stride -= w;
+ int x, y;
+
(void)filter;
(void)x0_q4;
(void)x_step_q4;
(void)y0_q4;
(void)y_step_q4;
- __asm__ volatile(
- "move %[tmp1], %[width] \n\t"
- "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
- "li %[tmp0], 0x10001 \n\t"
- MMI_MTC1(%[tmp0], %[ftmp3])
- "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
- "1: \n\t"
- "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t"
- "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t"
- "gsldlc1 %[ftmp2], 0x07(%[dst]) \n\t"
- "gsldrc1 %[ftmp2], 0x00(%[dst]) \n\t"
- "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
- "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
- "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
- "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
- "psrah %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
- "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
- "swc1 %[ftmp1], 0x00(%[dst]) \n\t"
- MMI_ADDIU(%[width], %[width], -0x04)
- MMI_ADDIU(%[dst], %[dst], 0x04)
- MMI_ADDIU(%[src], %[src], 0x04)
- "bnez %[width], 1b \n\t"
- "move %[width], %[tmp1] \n\t"
- MMI_ADDU(%[dst], %[dst], %[dst_stride])
- MMI_ADDU(%[src], %[src], %[src_stride])
- MMI_ADDIU(%[height], %[height], -0x01)
- "bnez %[height], 1b \n\t"
- : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
- [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
- [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
- [src]"+&r"(src), [dst]"+&r"(dst),
- [width]"+&r"(w), [height]"+&r"(h)
- : [src_stride]"r"((mips_reg)src_stride),
- [dst_stride]"r"((mips_reg)dst_stride)
- : "memory"
- );
+ if (w & 0x03) {
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1);
+ src += src_stride;
+ dst += dst_stride;
+ }
+ } else {
+ double ftmp[4];
+ uint32_t tmp[2];
+ src_stride -= w;
+ dst_stride -= w;
+
+ __asm__ volatile(
+ "move %[tmp1], %[width] \n\t"
+ "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t"
+ "li %[tmp0], 0x10001 \n\t"
+ MMI_MTC1(%[tmp0], %[ftmp3])
+ "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t"
+ "1: \n\t"
+ "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t"
+ "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t"
+ "gsldlc1 %[ftmp2], 0x07(%[dst]) \n\t"
+ "gsldrc1 %[ftmp2], 0x00(%[dst]) \n\t"
+ "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t"
+ "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+ "psrah %[ftmp1], %[ftmp1], %[ftmp3] \n\t"
+ "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t"
+ "swc1 %[ftmp1], 0x00(%[dst]) \n\t"
+ MMI_ADDIU(%[width], %[width], -0x04)
+ MMI_ADDIU(%[dst], %[dst], 0x04)
+ MMI_ADDIU(%[src], %[src], 0x04)
+ "bnez %[width], 1b \n\t"
+ "move %[width], %[tmp1] \n\t"
+ MMI_ADDU(%[dst], %[dst], %[dst_stride])
+ MMI_ADDU(%[src], %[src], %[src_stride])
+ MMI_ADDIU(%[height], %[height], -0x01)
+ "bnez %[height], 1b \n\t"
+ : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]),
+ [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]),
+ [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]),
+ [src]"+&r"(src), [dst]"+&r"(dst),
+ [width]"+&r"(w), [height]"+&r"(h)
+ : [src_stride]"r"((mips_reg)src_stride),
+ [dst_stride]"r"((mips_reg)dst_stride)
+ : "memory"
+ );
+ }
}
static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride,
@@ -481,6 +575,29 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride,
}
}
+static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *x_filters, int x0_q4,
+ int x_step_q4, int w, int h) {
+ int x, y;
+ src -= SUBPEL_TAPS / 2 - 1;
+
+ for (y = 0; y < h; ++y) {
+ int x_q4 = x0_q4;
+ for (x = 0; x < w; ++x) {
+ const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+ const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+ int k, sum = 0;
+ for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+ dst[x] = ROUND_POWER_OF_TWO(
+ dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
+ x_q4 += x_step_q4;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
void vpx_convolve8_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
ptrdiff_t dst_stride, const InterpKernel *filter,
int x0_q4, int32_t x_step_q4, int y0_q4,
@@ -553,6 +670,21 @@ void vpx_convolve8_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
y_step_q4, w, h);
}
+void vpx_convolve8_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const InterpKernel *filter, int x0_q4,
+ int32_t x_step_q4, int y0_q4, int y_step_q4,
+ int w, int h) {
+ (void)y0_q4;
+ (void)y_step_q4;
+ if (w & 0x03)
+ convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, w, h);
+ else
+ convolve_avg_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4,
+ x_step_q4, w, h);
+}
+
void vpx_convolve8_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride,
uint8_t *dst, ptrdiff_t dst_stride,
const InterpKernel *filter, int x0_q4,
@@ -580,8 +712,5 @@ void vpx_convolve8_avg_mmi(const uint8_t *src, ptrdiff_t src_stride,
vpx_convolve8_mmi(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4,
y_step_q4, w, h);
- if (w & 0x03)
- vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
- else
- vpx_convolve_avg_mmi(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
+ vpx_convolve_avg_mmi(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h);
}
diff --git a/vpx_dsp/ppc/inv_txfm_vsx.c b/vpx_dsp/ppc/inv_txfm_vsx.c
index f095cb0a4..6603b85ac 100644
--- a/vpx_dsp/ppc/inv_txfm_vsx.c
+++ b/vpx_dsp/ppc/inv_txfm_vsx.c
@@ -76,6 +76,8 @@ static int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, 2404, 2404, 2404, 2404 };
static int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, 1606, 1606, 1606, 1606 };
static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 };
+static uint8x16_t mask1 = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 };
#define ROUND_SHIFT_INIT \
const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \
const uint32x4_t shift14 = vec_splat_u32(14);
@@ -107,6 +109,15 @@ static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 };
out1 = vec_sub(step0, step1); \
out1 = vec_perm(out1, out1, mask0);
+#define PACK_STORE(v0, v1) \
+ tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0); \
+ tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1); \
+ output_v = vec_packsu(tmp16_0, tmp16_1); \
+ \
+ vec_vsx_st(output_v, 0, tmp_dest); \
+ for (i = 0; i < 4; i++) \
+ for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i];
+
void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest,
int stride) {
int i, j;
@@ -114,13 +125,10 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest,
int16x8_t step0, step1, tmp16_0, tmp16_1, t_out0, t_out1;
uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF,
0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 };
- uint8x16_t mask1 = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7,
- 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 };
int16x8_t v0 = load_tran_low(0, input);
int16x8_t v1 = load_tran_low(8 * sizeof(*input), input);
int16x8_t t0 = vec_mergeh(v0, v1);
int16x8_t t1 = vec_mergel(v0, v1);
-
uint8x16_t dest0 = vec_vsx_ld(0, dest);
uint8x16_t dest1 = vec_vsx_ld(stride, dest);
uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
@@ -130,6 +138,7 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest,
int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov);
int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov);
int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov);
+
uint8x16_t output_v;
uint8_t tmp_dest[16];
ROUND_SHIFT_INIT
@@ -148,13 +157,8 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest,
PIXEL_ADD4(v0, t_out0);
PIXEL_ADD4(v1, t_out1);
- tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0);
- tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1);
- output_v = vec_packsu(tmp16_0, tmp16_1);
- vec_vsx_st(output_v, 0, tmp_dest);
- for (i = 0; i < 4; i++)
- for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i];
+ PACK_STORE(v0, v1);
}
#define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
@@ -1062,3 +1066,67 @@ void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest,
ADD_STORE_BLOCK(src2, 16);
ADD_STORE_BLOCK(src3, 24);
}
+
+#define TRANSFORM_COLS \
+ v32_a = vec_add(v32_a, v32_c); \
+ v32_d = vec_sub(v32_d, v32_b); \
+ v32_e = vec_sub(v32_a, v32_d); \
+ v32_e = vec_sra(v32_e, one); \
+ v32_b = vec_sub(v32_e, v32_b); \
+ v32_c = vec_sub(v32_e, v32_c); \
+ v32_a = vec_sub(v32_a, v32_b); \
+ v32_d = vec_add(v32_d, v32_c); \
+ v_a = vec_packs(v32_a, v32_b); \
+ v_c = vec_packs(v32_c, v32_d);
+
+#define TRANSPOSE_WHT \
+ tmp_a = vec_mergeh(v_a, v_c); \
+ tmp_c = vec_mergel(v_a, v_c); \
+ v_a = vec_mergeh(tmp_a, tmp_c); \
+ v_c = vec_mergel(tmp_a, tmp_c);
+
+void vpx_iwht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest,
+ int stride) {
+ int16x8_t v_a = load_tran_low(0, input);
+ int16x8_t v_c = load_tran_low(8 * sizeof(*input), input);
+ int16x8_t tmp_a, tmp_c;
+ uint16x8_t two = vec_splat_u16(2);
+ uint32x4_t one = vec_splat_u32(1);
+ int16x8_t tmp16_0, tmp16_1;
+ int32x4_t v32_a, v32_c, v32_d, v32_b, v32_e;
+ uint8x16_t dest0 = vec_vsx_ld(0, dest);
+ uint8x16_t dest1 = vec_vsx_ld(stride, dest);
+ uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest);
+ uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest);
+ int16x8_t d_u0 = (int16x8_t)unpack_to_u16_h(dest0);
+ int16x8_t d_u1 = (int16x8_t)unpack_to_u16_h(dest1);
+ int16x8_t d_u2 = (int16x8_t)unpack_to_u16_h(dest2);
+ int16x8_t d_u3 = (int16x8_t)unpack_to_u16_h(dest3);
+ uint8x16_t output_v;
+ uint8_t tmp_dest[16];
+ int i, j;
+
+ v_a = vec_sra(v_a, two);
+ v_c = vec_sra(v_c, two);
+
+ TRANSPOSE_WHT;
+
+ v32_a = vec_unpackh(v_a);
+ v32_c = vec_unpackl(v_a);
+
+ v32_d = vec_unpackh(v_c);
+ v32_b = vec_unpackl(v_c);
+
+ TRANSFORM_COLS;
+
+ TRANSPOSE_WHT;
+
+ v32_a = vec_unpackh(v_a);
+ v32_c = vec_unpackl(v_a);
+ v32_d = vec_unpackh(v_c);
+ v32_b = vec_unpackl(v_c);
+
+ TRANSFORM_COLS;
+
+ PACK_STORE(v_a, v_c);
+}
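As a guide to the butterflies that TRANSFORM_COLS encodes, here is a scalar sketch of one column pass of the 4x4 inverse Walsh-Hadamard transform. The variables a, b, c, d correspond to v32_a, v32_b, v32_c, v32_d after widening to 32 bits; the function name and pointer signature are illustrative.

/* One column pass of the 4x4 inverse WHT, mirroring the add/sub/shift
 * sequence in TRANSFORM_COLS above. */
static void iwht_col_sketch(int32_t *a, int32_t *b, int32_t *c, int32_t *d) {
  int32_t e;
  *a += *c;
  *d -= *b;
  e = (*a - *d) >> 1;
  *b = e - *b;
  *c = e - *c;
  *a -= *b;
  *d += *c;
}

The full function applies this pass twice, transposing between passes with TRANSPOSE_WHT, after first scaling the input down by two bits (the vec_sra with two).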
diff --git a/vpx_dsp/ppc/quantize_vsx.c b/vpx_dsp/ppc/quantize_vsx.c
new file mode 100644
index 000000000..e037f89e3
--- /dev/null
+++ b/vpx_dsp/ppc/quantize_vsx.c
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/ppc/types_vsx.h"
+
+// Negate 16-bit integers in a when the corresponding signed 16-bit
+// integer in b is negative.
+static INLINE int16x8_t vec_sign(int16x8_t a, int16x8_t b) {
+ const int16x8_t mask = vec_sra(b, vec_shift_sign_s16);
+ return vec_xor(vec_add(a, mask), mask);
+}
+
+// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
+// integers, and return the high 16 bits of the intermediate integers.
+static INLINE int16x8_t vec_mulhi(int16x8_t a, int16x8_t b) {
+  // madds computes ((A * B) >> 15) + C; we need >> 16, so we apply an extra
+  // right shift.
+ return vec_sra(vec_madds(a, b, vec_zeros_s16), vec_ones_s16);
+}
+
+static INLINE int16x8_t quantize_coeff(int16x8_t coeff, int16x8_t coeff_abs,
+ int16x8_t round, int16x8_t quant,
+ int16x8_t quant_shift, bool16x8_t mask) {
+ int16x8_t rounded, qcoeff;
+ rounded = vec_vaddshs(coeff_abs, round);
+ qcoeff = vec_mulhi(rounded, quant);
+ qcoeff = vec_add(qcoeff, rounded);
+ qcoeff = vec_mulhi(qcoeff, quant_shift);
+ qcoeff = vec_sign(qcoeff, coeff);
+ return vec_and(qcoeff, mask);
+}
+
+static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask,
+ const int16_t *iscan_ptr) {
+ bool16x8_t zero_coeff;
+ int16x8_t scan = vec_vsx_ld(0, iscan_ptr);
+ zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16);
+ scan = vec_sub(scan, mask);
+ return vec_andc(scan, zero_coeff);
+}
+
+// Reduce the eight 16-bit integers in a to their maximum and return a vector
+// with every element set to that maximum.
+static INLINE int16x8_t vec_max_across(int16x8_t a) {
+ a = vec_max(a, vec_perm(a, a, vec_perm64));
+ a = vec_max(a, vec_perm(a, a, vec_perm32));
+ return vec_max(a, vec_perm(a, a, vec_perm16));
+}
+
+void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+ int skip_block, const int16_t *zbin_ptr,
+ const int16_t *round_ptr, const int16_t *quant_ptr,
+ const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
+ tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+ uint16_t *eob_ptr, const int16_t *scan_ptr,
+ const int16_t *iscan_ptr) {
+ int16x8_t qcoeff, dqcoeff, eob;
+
+  // First set of 8 coeffs starts with DC + 7 AC
+ int16x8_t zbin = vec_vsx_ld(0, zbin_ptr);
+ int16x8_t round = vec_vsx_ld(0, round_ptr);
+ int16x8_t quant = vec_vsx_ld(0, quant_ptr);
+ int16x8_t dequant = vec_vsx_ld(0, dequant_ptr);
+ int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr);
+
+ int16x8_t coeff = vec_vsx_ld(0, coeff_ptr);
+ int16x8_t coeff_abs = vec_abs(coeff);
+ bool16x8_t zero_mask = vec_cmpge(coeff_abs, zbin);
+
+ (void)scan_ptr;
+ (void)skip_block;
+ assert(!skip_block);
+
+ qcoeff =
+ quantize_coeff(coeff, coeff_abs, round, quant, quant_shift, zero_mask);
+ vec_vsx_st(qcoeff, 0, qcoeff_ptr);
+
+ dqcoeff = vec_mladd(qcoeff, dequant, vec_zeros_s16);
+ vec_vsx_st(dqcoeff, 0, dqcoeff_ptr);
+
+ eob = nonzero_scanindex(qcoeff, zero_mask, iscan_ptr);
+
+ // All other sets of 8 coeffs will only contain AC
+ zbin = vec_splat(zbin, 1);
+ round = vec_splat(round, 1);
+ quant = vec_splat(quant, 1);
+ dequant = vec_splat(dequant, 1);
+ quant_shift = vec_splat(quant_shift, 1);
+
+ n_coeffs -= 8;
+ do {
+ coeff_ptr += 8;
+ qcoeff_ptr += 8;
+ dqcoeff_ptr += 8;
+ iscan_ptr += 8;
+
+ coeff = vec_vsx_ld(0, coeff_ptr);
+ coeff_abs = vec_abs(coeff);
+ zero_mask = vec_cmpge(coeff_abs, zbin);
+ qcoeff =
+ quantize_coeff(coeff, coeff_abs, round, quant, quant_shift, zero_mask);
+ vec_vsx_st(qcoeff, 0, qcoeff_ptr);
+
+ dqcoeff = vec_mladd(qcoeff, dequant, vec_zeros_s16);
+ vec_vsx_st(dqcoeff, 0, dqcoeff_ptr);
+
+ eob = vec_max(eob, nonzero_scanindex(qcoeff, zero_mask, iscan_ptr));
+
+ n_coeffs -= 8;
+ } while (n_coeffs > 0);
+
+ eob = vec_max_across(eob);
+ *eob_ptr = eob[0];
+}
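A scalar sketch of the per-coefficient arithmetic that quantize_coeff vectorizes may help: the two >> 16 steps are what vec_mulhi supplies, the zbin comparison is the zero_mask, and the final sign restore is the branchless (q + m) ^ m idiom with m = coeff >> 15 used by vec_sign. The eob is then the largest iscan index plus one over all nonzero quantized coefficients, which nonzero_scanindex and vec_max_across compute lane-wise before eob[0] is stored. Names below are illustrative.

#include <stdint.h>

/* Scalar sketch of one vpx_quantize_b coefficient using the same
 * round/quant/quant_shift stages applied per lane by the VSX kernel. */
static int16_t quantize_coeff_sketch(int16_t coeff, int16_t zbin, int16_t round,
                                     int16_t quant, int16_t quant_shift) {
  const int32_t abs_coeff = coeff < 0 ? -coeff : coeff;
  int64_t tmp;
  int32_t q;
  if (abs_coeff < zbin) return 0;          /* zero_mask: below zbin -> 0 */
  tmp = abs_coeff + round;                 /* rounded */
  tmp += (tmp * quant) >> 16;              /* vec_mulhi(rounded, quant) + rounded */
  q = (int32_t)((tmp * quant_shift) >> 16);/* second high-half multiply */
  return (int16_t)(coeff < 0 ? -q : q);    /* vec_sign restores the sign */
}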
diff --git a/vpx_dsp/ppc/types_vsx.h b/vpx_dsp/ppc/types_vsx.h
index f611d02d2..e2af55463 100644
--- a/vpx_dsp/ppc/types_vsx.h
+++ b/vpx_dsp/ppc/types_vsx.h
@@ -19,6 +19,7 @@ typedef vector signed short int16x8_t;
typedef vector unsigned short uint16x8_t;
typedef vector signed int int32x4_t;
typedef vector unsigned int uint32x4_t;
+typedef vector bool short bool16x8_t;
#ifdef __clang__
static const uint8x16_t xxpermdi0_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
@@ -65,4 +66,17 @@ static const uint8x16_t xxpermdi3_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
#endif
#endif
+static const int16x8_t vec_zeros_s16 = { 0, 0, 0, 0, 0, 0, 0, 0 };
+static const uint16x8_t vec_ones_s16 = { 1, 1, 1, 1, 1, 1, 1, 1 };
+static const uint16x8_t vec_shift_sign_s16 = { 15, 15, 15, 15, 15, 15, 15, 15 };
+static const uint8x16_t vec_perm64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+ 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03,
+ 0x04, 0x05, 0x06, 0x07 };
+static const uint8x16_t vec_perm32 = { 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
+ 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
+ 0x00, 0x01, 0x02, 0x03 };
+static const uint8x16_t vec_perm16 = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                                       0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D,
+                                       0x0E, 0x0F, 0x00, 0x01 };
+
#endif // VPX_DSP_PPC_TYPES_VSX_H_
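The three vec_permNN constants rotate a 128-bit vector by 64, 32 and 16 bits; paired with vec_max they fold the maximum of eight 16-bit lanes into every lane in log2(8) = 3 steps (see vec_max_across in quantize_vsx.c). A scalar sketch of the same rotate-and-max reduction, with illustrative names:

#include <stdint.h>

/* Rotate-and-max reduction: after steps of 4, 2 and 1 elements, every
 * slot of v holds the maximum of the original eight values. */
static int16_t max_across_sketch(int16_t v[8]) {
  int step, i;
  for (step = 4; step >= 1; step >>= 1) {
    int16_t rot[8];
    for (i = 0; i < 8; i++) rot[i] = v[(i + step) % 8];           /* vec_perm */
    for (i = 0; i < 8; i++) v[i] = v[i] > rot[i] ? v[i] : rot[i]; /* vec_max */
  }
  return v[0];
}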
diff --git a/vpx_dsp/sum_squares.c b/vpx_dsp/sum_squares.c
index 7c535ac2d..b80cd588e 100644
--- a/vpx_dsp/sum_squares.c
+++ b/vpx_dsp/sum_squares.c
@@ -10,8 +10,7 @@
#include "./vpx_dsp_rtcd.h"
-uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride,
- int size) {
+uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size) {
int r, c;
uint64_t ss = 0;
@@ -20,7 +19,7 @@ uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride,
const int16_t v = src[c];
ss += v * v;
}
- src += src_stride;
+ src += stride;
}
return ss;
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 167011034..cb06a476f 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -286,6 +286,7 @@ DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c
DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c
DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c
DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c
+DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c
ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c
endif
@@ -312,6 +313,7 @@ ifeq ($(CONFIG_ENCODERS),yes)
DSP_SRCS-yes += sad.c
DSP_SRCS-yes += subtract.c
DSP_SRCS-yes += sum_squares.c
+DSP_SRCS-$(HAVE_NEON) += arm/sum_squares_neon.c
DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c
DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 4dbee088b..c36ec84b0 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -363,7 +363,7 @@ add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride,
specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx/;
add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx/;
+specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi/;
add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
@@ -378,7 +378,7 @@ add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride,
specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/;
+specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
@@ -626,7 +626,7 @@ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/;
specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/;
specialize qw/vpx_idct32x32_1_add neon sse2/;
- specialize qw/vpx_iwht4x4_16_add sse2/;
+ specialize qw/vpx_iwht4x4_16_add sse2 vsx/;
if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
# Note that these specializations are appended to the above ones.
@@ -699,7 +699,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
- specialize qw/vpx_quantize_b neon sse2 ssse3 avx/;
+ specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx/;
add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vpx_quantize_b_32x32 neon ssse3 avx/;
@@ -922,7 +922,7 @@ add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const
specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/;
add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
-specialize qw/vpx_sum_squares_2d_i16 sse2 msa/;
+specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/;
#
# Structured Similarity (SSIM)
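For context on the specialize lines above: the rtcd generator turns each add_proto/specialize pair into a function pointer that setup code points at the best variant available at runtime. Below is a simplified, illustrative sketch of that shape for vpx_sum_squares_2d_i16; the actual declarations and flag names come from the generated vpx_dsp_rtcd.h and the per-arch cpu-caps headers (HAS_SSE2 is assumed here from vpx_ports/x86.h), so treat this as an approximation rather than the generated code.

#include <stdint.h>

uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size);
uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size);
extern uint64_t (*vpx_sum_squares_2d_i16)(const int16_t *src, int stride,
                                          int size);

/* Illustrative x86 setup: start from the C version, then upgrade when the
 * CPU reports the required feature. */
static void setup_sum_squares_dispatch(int flags) {
  vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_c;
  if (flags & HAS_SSE2) vpx_sum_squares_2d_i16 = vpx_sum_squares_2d_i16_sse2;
}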
diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h
index 2ce738fb7..419f17863 100644
--- a/vpx_dsp/x86/mem_sse2.h
+++ b/vpx_dsp/x86/mem_sse2.h
@@ -15,6 +15,11 @@
#include "./vpx_config.h"
+static INLINE __m128i loadh_epi64(const __m128i s, const void *const src) {
+ return _mm_castps_si128(
+ _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src));
+}
+
static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride,
__m128i *const d) {
d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride));
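The new loadh_epi64 helper fills the upper 64 bits of an XMM register from unaligned memory; in the size == 4 path of sum_squares below it lets two 4-sample rows share one register so a single _mm_madd_epi16 squares all eight values. A minimal usage sketch (function and pointer names are illustrative):

#include <emmintrin.h>
#include <stdint.h>

/* Pack two 4-element int16 rows into one __m128i: row0 in the low half,
 * row1 in the high half, matching the pattern in the size == 4 path. */
static __m128i load_two_rows_sketch(const int16_t *row0, const int16_t *row1) {
  const __m128i lo = _mm_loadl_epi64((const __m128i *)row0);
  return _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(lo), (const __m64 *)row1));
}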
diff --git a/vpx_dsp/x86/sum_squares_sse2.c b/vpx_dsp/x86/sum_squares_sse2.c
index 026d0ca2f..9eaf6ee1b 100644
--- a/vpx_dsp/x86/sum_squares_sse2.c
+++ b/vpx_dsp/x86/sum_squares_sse2.c
@@ -10,120 +10,96 @@
#include <assert.h>
#include <emmintrin.h>
-#include <stdio.h>
#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/x86/mem_sse2.h"
-static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
- int stride) {
- const __m128i v_val_0_w =
- _mm_loadl_epi64((const __m128i *)(src + 0 * stride));
- const __m128i v_val_1_w =
- _mm_loadl_epi64((const __m128i *)(src + 1 * stride));
- const __m128i v_val_2_w =
- _mm_loadl_epi64((const __m128i *)(src + 2 * stride));
- const __m128i v_val_3_w =
- _mm_loadl_epi64((const __m128i *)(src + 3 * stride));
-
- const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
- const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
- const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
- const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
-
- const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
- const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
- const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
-
- const __m128i v_sum_d =
- _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32));
-
- return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
-}
-
-// TODO(jingning): Evaluate the performance impact here.
-#ifdef __GNUC__
-// This prevents GCC/Clang from inlining this function into
-// vpx_sum_squares_2d_i16_sse2, which in turn saves some stack
-// maintenance instructions in the common case of 4x4.
-__attribute__((noinline))
-#endif
-static uint64_t
-vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int size) {
- int r, c;
- const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
- __m128i v_acc_q = _mm_setzero_si128();
-
- for (r = 0; r < size; r += 8) {
- __m128i v_acc_d = _mm_setzero_si128();
-
- for (c = 0; c < size; c += 8) {
- const int16_t *b = src + c;
- const __m128i v_val_0_w =
- _mm_load_si128((const __m128i *)(b + 0 * stride));
- const __m128i v_val_1_w =
- _mm_load_si128((const __m128i *)(b + 1 * stride));
- const __m128i v_val_2_w =
- _mm_load_si128((const __m128i *)(b + 2 * stride));
- const __m128i v_val_3_w =
- _mm_load_si128((const __m128i *)(b + 3 * stride));
- const __m128i v_val_4_w =
- _mm_load_si128((const __m128i *)(b + 4 * stride));
- const __m128i v_val_5_w =
- _mm_load_si128((const __m128i *)(b + 5 * stride));
- const __m128i v_val_6_w =
- _mm_load_si128((const __m128i *)(b + 6 * stride));
- const __m128i v_val_7_w =
- _mm_load_si128((const __m128i *)(b + 7 * stride));
-
- const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
- const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
- const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
- const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
- const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
- const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
- const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
- const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
-
- const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
- const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
- const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
- const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
-
- const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
- const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
-
- v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
- v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
- }
-
- v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
- v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
+uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) {
+ // Over 75% of all calls are with size == 4.
+ if (size == 4) {
+ __m128i s[2], sq[2], ss;
+
+ s[0] = _mm_loadl_epi64((const __m128i *)(src + 0 * stride));
+ s[0] = loadh_epi64(s[0], src + 1 * stride);
+ s[1] = _mm_loadl_epi64((const __m128i *)(src + 2 * stride));
+ s[1] = loadh_epi64(s[1], src + 3 * stride);
+ sq[0] = _mm_madd_epi16(s[0], s[0]);
+ sq[1] = _mm_madd_epi16(s[1], s[1]);
+ sq[0] = _mm_add_epi32(sq[0], sq[1]);
+ ss = _mm_add_epi32(sq[0], _mm_srli_si128(sq[0], 8));
+ ss = _mm_add_epi32(ss, _mm_srli_epi64(ss, 32));
+
+ return (uint64_t)_mm_cvtsi128_si32(ss);
+ } else {
+ // Generic case
+ int r = size;
+ const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+ __m128i v_acc_q = _mm_setzero_si128();
- src += 8 * stride;
- }
+ assert(size % 8 == 0);
- v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
+ do {
+ int c = 0;
+ __m128i v_acc_d = _mm_setzero_si128();
+
+ do {
+ const int16_t *const b = src + c;
+ const __m128i v_val_0_w =
+ _mm_load_si128((const __m128i *)(b + 0 * stride));
+ const __m128i v_val_1_w =
+ _mm_load_si128((const __m128i *)(b + 1 * stride));
+ const __m128i v_val_2_w =
+ _mm_load_si128((const __m128i *)(b + 2 * stride));
+ const __m128i v_val_3_w =
+ _mm_load_si128((const __m128i *)(b + 3 * stride));
+ const __m128i v_val_4_w =
+ _mm_load_si128((const __m128i *)(b + 4 * stride));
+ const __m128i v_val_5_w =
+ _mm_load_si128((const __m128i *)(b + 5 * stride));
+ const __m128i v_val_6_w =
+ _mm_load_si128((const __m128i *)(b + 6 * stride));
+ const __m128i v_val_7_w =
+ _mm_load_si128((const __m128i *)(b + 7 * stride));
+
+ const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+ const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+ const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+ const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+ const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
+ const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
+ const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
+ const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
+
+ const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+ const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+ const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
+ const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
+
+ const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+ const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
+
+ v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
+ v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
+ c += 8;
+ } while (c < size);
+
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
+
+ src += 8 * stride;
+ r -= 8;
+ } while (r);
+
+ v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
#if ARCH_X86_64
- return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
+ return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
#else
- {
- uint64_t tmp;
- _mm_storel_epi64((__m128i *)&tmp, v_acc_q);
- return tmp;
- }
+ {
+ uint64_t tmp;
+ _mm_storel_epi64((__m128i *)&tmp, v_acc_q);
+ return tmp;
+ }
#endif
-}
-
-uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) {
- // 4 elements per row only requires half an XMM register, so this
- // must be a special case, but also note that over 75% of all calls
- // are with size == 4, so it is also the common case.
- if (size == 4) {
- return vpx_sum_squares_2d_i16_4x4_sse2(src, stride);
- } else {
- // Generic case
- assert(size % 8 == 0);
- return vpx_sum_squares_2d_i16_nxn_sse2(src, stride, size);
}
}
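The key to the rewrite above is that _mm_madd_epi16(v, v) produces 32-bit sums of adjacent squared samples, and that those 32-bit partials are widened to 64 bits (the v_zext_mask_q / _mm_srli_epi64 pair) every eight rows before the running total can overflow. A scalar sketch of the same pairing, with illustrative names:

#include <stdint.h>

/* Scalar sketch of the SSE2 strategy: square adjacent samples in pairs
 * (one madd lane), keep each pair sum in 32 bits, and accumulate into a
 * 64-bit total. Assumes size is even, as in the SIMD paths. */
static uint64_t sum_squares_pairs_sketch(const int16_t *src, int stride,
                                         int size) {
  uint64_t ss = 0;
  int r, c;
  for (r = 0; r < size; r++) {
    for (c = 0; c < size; c += 2) {
      const uint32_t pair = (uint32_t)((int32_t)src[c] * src[c]) +
                            (uint32_t)((int32_t)src[c + 1] * src[c + 1]);
      ss += pair; /* widen to 64 bits, like the zext mask + shift */
    }
    src += stride;
  }
  return ss;
}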