diff options
47 files changed, 1418 insertions, 630 deletions
@@ -76,7 +76,6 @@ COMPILING THE APPLICATIONS/LIBRARIES: armv8-linux-gcc mips32-linux-gcc mips64-linux-gcc - ppc64-linux-gcc ppc64le-linux-gcc sparc-solaris-gcc x86-android-gcc diff --git a/build/make/configure.sh b/build/make/configure.sh index 876255bfe..f1d0e34c3 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -719,11 +719,8 @@ process_common_toolchain() { *sparc*) tgt_isa=sparc ;; - power*64*-*) - tgt_isa=ppc64 - ;; - power*) - tgt_isa=ppc + power*64le*-*) + tgt_isa=ppc64le ;; *mips64el*) tgt_isa=mips64 @@ -1221,7 +1218,7 @@ EOF check_add_asflags -march=${tgt_isa} check_add_asflags -KPIC ;; - ppc*) + ppc64le*) link_with_cc=gcc setup_gnu_toolchain check_gcc_machine_option "vsx" @@ -1491,7 +1488,7 @@ EOF # bionic includes basic pthread functionality, obviating -lpthread. ;; *) - check_header pthread.h && check_lib -lpthread <<EOF && enable_feature pthread_h && add_extralibs -lpthread + check_header pthread.h && check_lib -lpthread <<EOF && add_extralibs -lpthread || disable_feature pthread_h #include <pthread.h> #include <stddef.h> int main(void) { return pthread_create(NULL, NULL, NULL, NULL); } @@ -116,7 +116,6 @@ all_platforms="${all_platforms} armv7s-darwin-gcc" all_platforms="${all_platforms} armv8-linux-gcc" all_platforms="${all_platforms} mips32-linux-gcc" all_platforms="${all_platforms} mips64-linux-gcc" -all_platforms="${all_platforms} ppc64-linux-gcc" all_platforms="${all_platforms} ppc64le-linux-gcc" all_platforms="${all_platforms} sparc-solaris-gcc" all_platforms="${all_platforms} x86-android-gcc" @@ -585,7 +584,7 @@ EOF # Use both check_header and check_lib here, since check_lib # could be a stub that always returns true. 
- check_header pthread.h && check_lib -lpthread <<EOF && enable_feature pthread_h + check_header pthread.h && check_lib -lpthread <<EOF || disable_feature pthread_h #include <pthread.h> #include <stddef.h> int main(void) { return pthread_create(NULL, NULL, NULL, NULL); } @@ -593,6 +592,10 @@ EOF check_header unistd.h # for sysconf(3) and friends. check_header vpx/vpx_integer.h -I${source_path} && enable_feature vpx_ports + + if enabled neon && ! enabled external_build; then + check_header arm_neon.h || die "Unable to find arm_neon.h" + fi } process_toolchain() { @@ -76,12 +76,12 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, size_t frame_size = 0; if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) { - if (!feof(infile)) warn("Failed to read frame size\n"); + if (!feof(infile)) warn("Failed to read frame size"); } else { frame_size = mem_get_le32(raw_header); if (frame_size > 256 * 1024 * 1024) { - warn("Read invalid frame size (%u)\n", (unsigned int)frame_size); + warn("Read invalid frame size (%u)", (unsigned int)frame_size); frame_size = 0; } @@ -92,7 +92,7 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, *buffer = new_buffer; *buffer_size = 2 * frame_size; } else { - warn("Failed to allocate compressed data buffer\n"); + warn("Failed to allocate compressed data buffer"); frame_size = 0; } } @@ -100,7 +100,7 @@ int ivf_read_frame(FILE *infile, uint8_t **buffer, size_t *bytes_read, if (!feof(infile)) { if (fread(*buffer, 1, frame_size, infile) != frame_size) { - warn("Failed to read full frame\n"); + warn("Failed to read full frame"); return 1; } diff --git a/test/convolve_test.cc b/test/convolve_test.cc index 3e9377a0e..8f6c5cd48 100644 --- a/test/convolve_test.cc +++ b/test/convolve_test.cc @@ -1382,8 +1382,8 @@ INSTANTIATE_TEST_CASE_P(VSX, ConvolveTest, #if HAVE_MMI const ConvolveFunctions convolve8_mmi( - vpx_convolve_copy_c, vpx_convolve_avg_c, vpx_convolve8_horiz_mmi, - vpx_convolve8_avg_horiz_c, 
vpx_convolve8_vert_mmi, + vpx_convolve_copy_c, vpx_convolve_avg_mmi, vpx_convolve8_horiz_mmi, + vpx_convolve8_avg_horiz_mmi, vpx_convolve8_vert_mmi, vpx_convolve8_avg_vert_mmi, vpx_convolve8_mmi, vpx_convolve8_avg_mmi, vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c, vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0); diff --git a/test/dct_test.cc b/test/dct_test.cc index 10062150f..e8ad0cd5d 100644 --- a/test/dct_test.cc +++ b/test/dct_test.cc @@ -725,4 +725,14 @@ INSTANTIATE_TEST_CASE_P(SSE2, TransWHT, ::testing::Values(make_tuple(0, &wht_sse2_func_info, 0, VPX_BITS_8))); #endif // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE + +#if HAVE_VSX && !CONFIG_EMULATE_HARDWARE && !CONFIG_VP9_HIGHBITDEPTH +static const FuncInfo wht_vsx_func_info = { + &fdct_wrapper<vp9_fwht4x4_c>, &idct_wrapper<vpx_iwht4x4_16_add_vsx>, 4, 1 +}; + +INSTANTIATE_TEST_CASE_P(VSX, TransWHT, + ::testing::Values(make_tuple(0, &wht_vsx_func_info, 0, + VPX_BITS_8))); +#endif // HAVE_VSX && !CONFIG_EMULATE_HARDWARE } // namespace diff --git a/test/ivf_video_source.h b/test/ivf_video_source.h index 5862d2649..4b5d55469 100644 --- a/test/ivf_video_source.h +++ b/test/ivf_video_source.h @@ -16,7 +16,7 @@ #include "test/video_source.h" namespace libvpx_test { -const unsigned int kCodeBufferSize = 256 * 1024; +const unsigned int kCodeBufferSize = 256 * 1024 * 1024; const unsigned int kIvfFileHdrSize = 32; const unsigned int kIvfFrameHdrSize = 12; diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc index 910718b06..e211fb4d8 100644 --- a/test/sum_squares_test.cc +++ b/test/sum_squares_test.cc @@ -104,6 +104,13 @@ TEST_P(SumSquaresTest, ExtremeValues) { using ::testing::make_tuple; +#if HAVE_NEON +INSTANTIATE_TEST_CASE_P( + NEON, SumSquaresTest, + ::testing::Values(make_tuple(&vpx_sum_squares_2d_i16_c, + &vpx_sum_squares_2d_i16_neon))); +#endif // HAVE_NEON + #if HAVE_SSE2 INSTANTIATE_TEST_CASE_P( SSE2, SumSquaresTest, diff --git a/test/variance_test.cc 
b/test/variance_test.cc index f89c98523..725821ae6 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -20,24 +20,13 @@ #include "test/register_state_check.h" #include "vpx/vpx_codec.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/variance.h" #include "vpx_mem/vpx_mem.h" #include "vpx_ports/mem.h" #include "vpx_ports/vpx_timer.h" namespace { -typedef unsigned int (*VarianceMxNFunc)(const uint8_t *a, int a_stride, - const uint8_t *b, int b_stride, - unsigned int *sse); -typedef unsigned int (*SubpixVarMxNFunc)(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - const uint8_t *b, int b_stride, - unsigned int *sse); -typedef unsigned int (*SubpixAvgVarMxNFunc)(const uint8_t *a, int a_stride, - int xoffset, int yoffset, - const uint8_t *b, int b_stride, - uint32_t *sse, - const uint8_t *second_pred); typedef unsigned int (*Get4x4SseFunc)(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride); typedef unsigned int (*SumOfSquaresFunction)(const int16_t *src); @@ -692,7 +681,7 @@ void SubpelVarianceTest<SubpelVarianceFunctionType>::ExtremeRefTest() { } template <> -void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() { +void SubpelVarianceTest<vpx_subp_avg_variance_fn_t>::RefTest() { for (int x = 0; x < 8; ++x) { for (int y = 0; y < 8; ++y) { if (!use_high_bit_depth()) { @@ -728,10 +717,10 @@ void SubpelVarianceTest<SubpixAvgVarMxNFunc>::RefTest() { } typedef MainTestClass<Get4x4SseFunc> VpxSseTest; -typedef MainTestClass<VarianceMxNFunc> VpxMseTest; -typedef MainTestClass<VarianceMxNFunc> VpxVarianceTest; -typedef SubpelVarianceTest<SubpixVarMxNFunc> VpxSubpelVarianceTest; -typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> VpxSubpelAvgVarianceTest; +typedef MainTestClass<vpx_variance_fn_t> VpxMseTest; +typedef MainTestClass<vpx_variance_fn_t> VpxVarianceTest; +typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> VpxSubpelVarianceTest; +typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t> VpxSubpelAvgVarianceTest; TEST_P(VpxSseTest, 
RefSse) { RefTestSse(); } TEST_P(VpxSseTest, MaxSse) { MaxTestSse(); } @@ -756,14 +745,14 @@ INSTANTIATE_TEST_CASE_P(C, VpxSseTest, ::testing::Values(SseParams(2, 2, &vpx_get4x4sse_cs_c))); -typedef TestParams<VarianceMxNFunc> MseParams; +typedef TestParams<vpx_variance_fn_t> MseParams; INSTANTIATE_TEST_CASE_P(C, VpxMseTest, ::testing::Values(MseParams(4, 4, &vpx_mse16x16_c), MseParams(4, 3, &vpx_mse16x8_c), MseParams(3, 4, &vpx_mse8x16_c), MseParams(3, 3, &vpx_mse8x8_c))); -typedef TestParams<VarianceMxNFunc> VarianceParams; +typedef TestParams<vpx_variance_fn_t> VarianceParams; INSTANTIATE_TEST_CASE_P( C, VpxVarianceTest, ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_c), @@ -780,7 +769,7 @@ INSTANTIATE_TEST_CASE_P( VarianceParams(2, 3, &vpx_variance4x8_c), VarianceParams(2, 2, &vpx_variance4x4_c))); -typedef TestParams<SubpixVarMxNFunc> SubpelVarianceParams; +typedef TestParams<vpx_subpixvariance_fn_t> SubpelVarianceParams; INSTANTIATE_TEST_CASE_P( C, VpxSubpelVarianceTest, ::testing::Values( @@ -798,7 +787,7 @@ INSTANTIATE_TEST_CASE_P( SubpelVarianceParams(2, 3, &vpx_sub_pixel_variance4x8_c, 0), SubpelVarianceParams(2, 2, &vpx_sub_pixel_variance4x4_c, 0))); -typedef TestParams<SubpixAvgVarMxNFunc> SubpelAvgVarianceParams; +typedef TestParams<vpx_subp_avg_variance_fn_t> SubpelAvgVarianceParams; INSTANTIATE_TEST_CASE_P( C, VpxSubpelAvgVarianceTest, ::testing::Values( @@ -817,10 +806,11 @@ INSTANTIATE_TEST_CASE_P( SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0))); #if CONFIG_VP9_HIGHBITDEPTH -typedef MainTestClass<VarianceMxNFunc> VpxHBDMseTest; -typedef MainTestClass<VarianceMxNFunc> VpxHBDVarianceTest; -typedef SubpelVarianceTest<SubpixVarMxNFunc> VpxHBDSubpelVarianceTest; -typedef SubpelVarianceTest<SubpixAvgVarMxNFunc> VpxHBDSubpelAvgVarianceTest; +typedef MainTestClass<vpx_variance_fn_t> VpxHBDMseTest; +typedef MainTestClass<vpx_variance_fn_t> VpxHBDVarianceTest; +typedef SubpelVarianceTest<vpx_subpixvariance_fn_t> 
VpxHBDSubpelVarianceTest; +typedef SubpelVarianceTest<vpx_subp_avg_variance_fn_t> + VpxHBDSubpelAvgVarianceTest; TEST_P(VpxHBDMseTest, RefMse) { RefTestMse(); } TEST_P(VpxHBDMseTest, MaxMse) { MaxTestMse(); } diff --git a/test/vp8_multi_resolution_encoder.sh b/test/vp8_multi_resolution_encoder.sh index a8b7fe78e..33fd5b0d8 100755 --- a/test/vp8_multi_resolution_encoder.sh +++ b/test/vp8_multi_resolution_encoder.sh @@ -46,19 +46,31 @@ vp8_multi_resolution_encoder_three_formats() { local readonly output_files="${VPX_TEST_OUTPUT_DIR}/vp8_mre_0.ivf ${VPX_TEST_OUTPUT_DIR}/vp8_mre_1.ivf ${VPX_TEST_OUTPUT_DIR}/vp8_mre_2.ivf" + local readonly layer_bitrates="150 80 50" + local readonly keyframe_insert="200" + local readonly temporal_layers="3 3 3" + local readonly framerate="30" if [ "$(vpx_config_option_enabled CONFIG_MULTI_RES_ENCODING)" = "yes" ]; then if [ "$(vp8_encode_available)" = "yes" ]; then # Param order: # Input width # Input height + # Framerate # Input file path # Output file names + # Layer bitrates + # Temporal layers + # Keyframe insert # Output PSNR vp8_mre "${YUV_RAW_INPUT_WIDTH}" \ "${YUV_RAW_INPUT_HEIGHT}" \ + "${framerate}" \ "${YUV_RAW_INPUT}" \ ${output_files} \ + ${layer_bitrates} \ + ${temporal_layers} \ + "${keyframe_insert}" \ 0 for output_file in ${output_files}; do diff --git a/test/vp9_quantize_test.cc b/test/vp9_quantize_test.cc index 4740cadd9..f0bbedbfa 100644 --- a/test/vp9_quantize_test.cc +++ b/test/vp9_quantize_test.cc @@ -575,3 +575,10 @@ INSTANTIATE_TEST_CASE_P( &QuantFPWrapper<vp9_quantize_fp_32x32_c>, VPX_BITS_8, 32, true))); } // namespace + +#if HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH +INSTANTIATE_TEST_CASE_P(VSX, VP9QuantizeTest, + ::testing::Values(make_tuple(&vpx_quantize_b_vsx, + &vpx_quantize_b_c, + VPX_BITS_8, 16, false))); +#endif // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c index abc705363..4e7d99f50 100644 --- a/vp9/encoder/vp9_bitstream.c +++ 
b/vp9/encoder/vp9_bitstream.c @@ -459,7 +459,8 @@ static void write_modes_sb( write_modes_b(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs, max_mv_magnitude, interp_filter_selected); break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col, subsize, max_mv_magnitude, interp_filter_selected); write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row, mi_col + bs, @@ -469,7 +470,6 @@ static void write_modes_sb( write_modes_sb(cpi, xd, tile, w, tok, tok_end, mi_row + bs, mi_col + bs, subsize, max_mv_magnitude, interp_filter_selected); break; - default: assert(0); } } @@ -618,9 +618,10 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, return; } - case ONE_LOOP_REDUCED: { + default: { int updates = 0; int noupdates_before_first = 0; + assert(cpi->sf.use_fast_coef_updates == ONE_LOOP_REDUCED); for (i = 0; i < PLANE_TYPES; ++i) { for (j = 0; j < REF_TYPES; ++j) { for (k = 0; k < COEF_BANDS; ++k) { @@ -670,7 +671,6 @@ static void update_coef_probs_common(vpx_writer *const bc, VP9_COMP *cpi, } return; } - default: assert(0); } } @@ -1149,8 +1149,10 @@ static void write_profile(BITSTREAM_PROFILE profile, case PROFILE_0: vpx_wb_write_literal(wb, 0, 2); break; case PROFILE_1: vpx_wb_write_literal(wb, 2, 2); break; case PROFILE_2: vpx_wb_write_literal(wb, 1, 2); break; - case PROFILE_3: vpx_wb_write_literal(wb, 6, 3); break; - default: assert(0); + default: + assert(profile == PROFILE_3); + vpx_wb_write_literal(wb, 6, 3); + break; } } diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index a283d92a8..091992dbd 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -385,16 +385,13 @@ static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) { node->split[i] = &vt->split[i].part_variances.none; break; } - case BLOCK_4X4: { + default: { v4x4 *vt = (v4x4 *)data; + assert(bsize == BLOCK_4X4); 
node->part_variances = &vt->part_variances; for (i = 0; i < 4; i++) node->split[i] = &vt->split[i]; break; } - default: { - assert(0); - break; - } } } @@ -885,13 +882,13 @@ static void copy_partitioning_helper(VP9_COMP *cpi, MACROBLOCK *x, set_block_size(cpi, x, xd, mi_row, mi_col, subsize); set_block_size(cpi, x, xd, mi_row, mi_col + bs, subsize); break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col); copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col); copy_partitioning_helper(cpi, x, xd, subsize, mi_row, mi_col + bs); copy_partitioning_helper(cpi, x, xd, subsize, mi_row + bs, mi_col + bs); break; - default: assert(0); } } } @@ -1004,7 +1001,8 @@ static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, set_block_size(cpi, x, xd, mi_row_high, mi_col_high + bs_high, subsize_high); break; - case PARTITION_SPLIT: + default: + assert(partition_high == PARTITION_SPLIT); if (scale_partitioning_svc(cpi, x, xd, subsize_high, mi_row, mi_col, mi_row_high, mi_col_high)) return 1; @@ -1020,7 +1018,6 @@ static int scale_partitioning_svc(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, mi_col_high + bs_high)) return 1; break; - default: assert(0); } } @@ -1067,13 +1064,13 @@ static void update_partition_svc(VP9_COMP *cpi, BLOCK_SIZE bsize, int mi_row, prev_part[start_pos] = subsize; if (mi_col + bs < cm->mi_cols) prev_part[start_pos + bs] = subsize; break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); update_partition_svc(cpi, subsize, mi_row, mi_col); update_partition_svc(cpi, subsize, mi_row + bs, mi_col); update_partition_svc(cpi, subsize, mi_row, mi_col + bs); update_partition_svc(cpi, subsize, mi_row + bs, mi_col + bs); break; - default: assert(0); } } } @@ -1108,13 +1105,13 @@ static void update_prev_partition_helper(VP9_COMP *cpi, BLOCK_SIZE bsize, prev_part[start_pos] = subsize; if (mi_col + bs < cm->mi_cols) 
prev_part[start_pos + bs] = subsize; break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); update_prev_partition_helper(cpi, subsize, mi_row, mi_col); update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col); update_prev_partition_helper(cpi, subsize, mi_row, mi_col + bs); update_prev_partition_helper(cpi, subsize, mi_row + bs, mi_col + bs); break; - default: assert(0); } } } @@ -1387,7 +1384,9 @@ static int choose_partitioning(VP9_COMP *cpi, const TileInfo *const tile, x->plane[0].src.buf, x->plane[0].src.stride, xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride); } else { - y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); + const MV dummy_mv = { 0, 0 }; + y_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col, + &dummy_mv); x->sb_use_mv_part = 1; x->sb_mvcol_part = mi->mv[0].as_mv.col; x->sb_mvrow_part = mi->mv[0].as_mv.row; @@ -2181,7 +2180,8 @@ static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile, subsize, &pc_tree->horizontal[1]); } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); if (bsize == BLOCK_8X8) { encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize, pc_tree->leaf_split[0]); @@ -2196,7 +2196,6 @@ static void encode_sb(VP9_COMP *cpi, ThreadData *td, const TileInfo *const tile, subsize, pc_tree->split[3]); } break; - default: assert(0 && "Invalid partition type."); break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) @@ -2522,7 +2521,8 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td, subsize, &pc_tree->horizontal[1]); } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); subsize = get_subsize(bsize, PARTITION_SPLIT); encode_sb_rt(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize, pc_tree->split[0]); @@ -2533,7 +2533,6 @@ static void encode_sb_rt(VP9_COMP *cpi, ThreadData *td, encode_sb_rt(cpi, td, tile, tp, mi_row + hbs, mi_col + hbs, output_enabled, 
subsize, pc_tree->split[3]); break; - default: assert(0 && "Invalid partition type."); break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) @@ -2672,7 +2671,8 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, last_part_rdc.rdcost += tmp_rdc.rdcost; } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); if (bsize == BLOCK_8X8) { rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc, subsize, pc_tree->leaf_split[0], INT64_MAX); @@ -2702,7 +2702,6 @@ static void rd_use_partition(VP9_COMP *cpi, ThreadData *td, last_part_rdc.dist += tmp_rdc.dist; } break; - default: assert(0); break; } pl = partition_plane_context(xd, mi_row, mi_col, bsize); @@ -4208,7 +4207,8 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, } } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); subsize = get_subsize(bsize, PARTITION_SPLIT); nonrd_select_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, subsize, output_enabled, rd_cost, @@ -4238,7 +4238,6 @@ static void nonrd_select_partition(VP9_COMP *cpi, ThreadData *td, rd_cost->dist += this_rdc.dist; } break; - default: assert(0 && "Invalid partition type."); break; } } @@ -4327,7 +4326,8 @@ static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td, output_enabled, subsize, &pc_tree->horizontal[1]); } break; - case PARTITION_SPLIT: + default: + assert(partition == PARTITION_SPLIT); subsize = get_subsize(bsize, PARTITION_SPLIT); if (bsize == BLOCK_8X8) { nonrd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, dummy_cost, @@ -4348,7 +4348,6 @@ static void nonrd_use_partition(VP9_COMP *cpi, ThreadData *td, dummy_cost, pc_tree->split[3]); } break; - default: assert(0 && "Invalid partition type."); break; } if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8) @@ -4452,7 +4451,8 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, nonrd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col, BLOCK_64X64, 1, 
&dummy_rdc, td->pc_root); break; - case REFERENCE_PARTITION: + default: + assert(partition_search_type == REFERENCE_PARTITION); x->sb_pickmode_part = 1; set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64); // Use nonrd_pick_partition on scene-cut for VBR mode. @@ -4484,7 +4484,6 @@ static void encode_nonrd_sb_row(VP9_COMP *cpi, ThreadData *td, } break; - default: assert(0); break; } // Update ref_frame usage for inter frame if this group is ARF group. @@ -4551,16 +4550,12 @@ static int set_var_thresh_from_histogram(VP9_COMP *cpi) { &var16->sse, &var16->sum); var16->var = variance_highbd(var16); break; - case VPX_BITS_12: + default: + assert(cm->bit_depth == VPX_BITS_12); vpx_highbd_12_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, &var16->sum); var16->var = variance_highbd(var16); break; - default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, VPX_BITS_10" - " or VPX_BITS_12"); - return -1; } } else { vpx_get16x16var(src, src_stride, last_src, last_stride, &var16->sse, diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 970077d89..bc2765728 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -358,13 +358,13 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); } return; } @@ -388,13 +388,13 @@ void vp9_xform_quant_fp(MACROBLOCK *x, int plane, int block, int row, int col, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); 
vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); break; } } @@ -434,13 +434,13 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; - default: assert(0); } return; } @@ -462,12 +462,12 @@ void vp9_xform_quant_dc(MACROBLOCK *x, int plane, int block, int row, int col, vpx_quantize_dc(coeff, 64, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0], eob); break; - default: assert(0); break; } } @@ -511,14 +511,14 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); } return; } @@ -544,13 +544,13 @@ void vp9_xform_quant(MACROBLOCK *x, int plane, int block, int row, int col, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - 
default: assert(0); break; } } @@ -634,14 +634,14 @@ static void encode_block(int plane, int block, int row, int col, vp9_highbd_idct8x8_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. x->highbd_inv_txfm_add(dqcoeff, dst16, pd->dst.stride, p->eobs[block], xd->bd); break; - default: assert(0 && "Invalid transform size"); } return; } @@ -657,13 +657,13 @@ static void encode_block(int plane, int block, int row, int col, case TX_8X8: vp9_idct8x8_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); // this is like vp9_short_idct4x4 but has a special case around eob<=1 // which is significant (not just an optimization) for the lossless // case. x->inv_txfm_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]); break; - default: assert(0 && "Invalid transform size"); break; } } @@ -848,7 +848,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, xd->bd); } break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); if (!x->skip_recode) { vpx_highbd_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, dst_stride, xd->bd); @@ -876,7 +877,6 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, } } break; - default: assert(0); return; } if (*eob) *(args->skip) = 0; return; @@ -930,7 +930,8 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, if (!x->skip_encode && *eob) vp9_iht8x8_add(tx_type, dqcoeff, dst, dst_stride, *eob); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); if (!x->skip_recode) { vpx_subtract_block(4, 4, src_diff, diff_stride, src, src_stride, dst, dst_stride); @@ -955,7 +956,6 @@ void vp9_encode_block_intra(int plane, int block, int row, int col, vp9_iht4x4_16_add(dqcoeff, dst, dst_stride, tx_type); } 
break; - default: assert(0); break; } if (*eob) *(args->skip) = 0; } diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c index 2f92456f2..3384de7ea 100644 --- a/vp9/encoder/vp9_encoder.c +++ b/vp9/encoder/vp9_encoder.c @@ -483,14 +483,10 @@ static INLINE void Scale2Ratio(VPX_SCALING mode, int *hr, int *hs) { *hr = 3; *hs = 5; break; - case ONETWO: - *hr = 1; - *hs = 2; - break; default: + assert(mode == ONETWO); *hr = 1; - *hs = 1; - assert(0); + *hs = 2; break; } } @@ -1726,7 +1722,8 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_sad4x4x4d_bits10) break; - case VPX_BITS_12: + default: + assert(cm->bit_depth == VPX_BITS_12); HIGHBD_BFP(BLOCK_32X16, vpx_highbd_sad32x16_bits12, vpx_highbd_sad32x16_avg_bits12, vpx_highbd_12_variance32x16, vpx_highbd_12_sub_pixel_variance32x16, @@ -1805,11 +1802,6 @@ static void highbd_set_var_fns(VP9_COMP *const cpi) { vpx_highbd_12_sub_pixel_avg_variance4x4, vpx_highbd_sad4x4x4d_bits12) break; - - default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); } } } @@ -3031,9 +3023,17 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { // Keep track of frame index for each reference frame. SVC *const svc = &cpi->svc; if (cm->frame_type == KEY_FRAME) { + int i; svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe; svc->ref_frame_index[cpi->gld_fb_idx] = svc->current_superframe; svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe; + // On key frame update all reference frame slots. + for (i = 0; i < REF_FRAMES; i++) { + // LAST/GOLDEN/ALTREF is already updated above. 
+ if (i != cpi->lst_fb_idx && i != cpi->gld_fb_idx && + i != cpi->alt_fb_idx) + ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[i], cm->new_fb_idx); + } } else { if (cpi->refresh_last_frame) svc->ref_frame_index[cpi->lst_fb_idx] = svc->current_superframe; @@ -3042,6 +3042,8 @@ void vp9_update_reference_frames(VP9_COMP *cpi) { if (cpi->refresh_alt_ref_frame) svc->ref_frame_index[cpi->alt_fb_idx] = svc->current_superframe; } + // Copy flags from encoder to SVC struct. + vp9_copy_flags_ref_update_idx(cpi); } } @@ -3284,11 +3286,9 @@ static void output_frame_level_debug_stats(VP9_COMP *cpi) { case VPX_BITS_10: dc_quant_devisor = 16.0; break; - case VPX_BITS_12: - dc_quant_devisor = 64.0; - break; default: - assert(0 && "bit_depth must be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + dc_quant_devisor = 64.0; break; } #else @@ -3730,28 +3730,9 @@ static void encode_without_recode_loop(VP9_COMP *cpi, size_t *size, suppress_active_map(cpi); // For SVC on non-zero spatial layer: check for disabling inter-layer - // (spatial) prediction, if svc.disable_inter_layer_pred is set. - // if the previous spatial layer was dropped then disable the prediction from - // this (scaled) reference. 
- if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) { - if ((cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_OFF_NONKEY && - !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) || - cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_OFF || - cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id - 1]) { - MV_REFERENCE_FRAME ref_frame; - static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, - VP9_ALT_FLAG }; - for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { - const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); - if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) { - const struct scale_factors *const scale_fac = - &cm->frame_refs[ref_frame - 1].sf; - if (vp9_is_scaled(scale_fac)) - cpi->ref_frame_flags &= (~flag_list[ref_frame]); - } - } - } - } + // prediction. + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) + vp9_svc_constrain_inter_layer_pred(cpi); // Variance adaptive and in frame q adjustment experiments are mutually // exclusive. @@ -4612,6 +4593,16 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, size_t *size, cpi->last_frame_dropped = 0; cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id] = 0; + // Keep track of the frame buffer index updated/refreshed for the + // current encoded TL0 superframe. 
+ if (cpi->svc.temporal_layer_id == 0) { + if (cpi->refresh_last_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->lst_fb_idx; + else if (cpi->refresh_golden_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->gld_fb_idx; + else if (cpi->refresh_alt_ref_frame) + cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id] = cpi->alt_fb_idx; + } // Disable segmentation if it decrease rate/distortion ratio if (cpi->oxcf.aq_mode == LOOKAHEAD_AQ) diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 303c00a6f..1e0ed70fb 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -645,6 +645,8 @@ typedef struct VP9_COMP { int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES]; int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS]; int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES]; + // Indices are: max_tx_size-1, tx_size_ctx, tx_size + int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES]; int multi_arf_allowed; int multi_arf_enabled; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 3302fde08..453879fb8 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -494,11 +494,10 @@ static int scale_sse_threshold(VP9_COMMON *cm, int thresh) { switch (cm->bit_depth) { case VPX_BITS_8: ret_val = thresh; break; case VPX_BITS_10: ret_val = thresh << 4; break; - case VPX_BITS_12: ret_val = thresh << 8; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + ret_val = thresh << 8; + break; } } #else @@ -520,11 +519,10 @@ static int get_ul_intra_threshold(VP9_COMMON *cm) { switch (cm->bit_depth) { case VPX_BITS_8: ret_val = UL_INTRA_THRESH; break; case VPX_BITS_10: ret_val = UL_INTRA_THRESH << 2; break; - case VPX_BITS_12: ret_val = UL_INTRA_THRESH << 4; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); + 
assert(cm->bit_depth == VPX_BITS_12); + ret_val = UL_INTRA_THRESH << 4; + break; } } #else @@ -541,11 +539,10 @@ static int get_smooth_intra_threshold(VP9_COMMON *cm) { switch (cm->bit_depth) { case VPX_BITS_8: ret_val = SMOOTH_INTRA_THRESH; break; case VPX_BITS_10: ret_val = SMOOTH_INTRA_THRESH << 4; break; - case VPX_BITS_12: ret_val = SMOOTH_INTRA_THRESH << 8; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); + assert(cm->bit_depth == VPX_BITS_12); + ret_val = SMOOTH_INTRA_THRESH << 8; + break; } } #else @@ -971,12 +968,10 @@ void vp9_first_pass_encode_tile_mb_row(VP9_COMP *cpi, ThreadData *td, switch (cm->bit_depth) { case VPX_BITS_8: break; case VPX_BITS_10: this_error >>= 4; break; - case VPX_BITS_12: this_error >>= 8; break; default: - assert(0 && - "cm->bit_depth should be VPX_BITS_8, " - "VPX_BITS_10 or VPX_BITS_12"); - return; + assert(cm->bit_depth == VPX_BITS_12); + this_error >>= 8; + break; } } #endif // CONFIG_VP9_HIGHBITDEPTH @@ -2446,8 +2441,9 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { loop_decay_rate = get_prediction_decay_rate(cpi, &next_frame); // Monitor for static sections. - zero_motion_accumulator = VPXMIN( - zero_motion_accumulator, get_zero_motion_factor(cpi, &next_frame)); + if ((rc->frames_since_key + i - 1) > 1) { + zero_motion_accumulator *= get_zero_motion_factor(cpi, &next_frame); + } // Break clause to detect very still sections after motion. For example, // a static image after a fade or other transition. @@ -2469,8 +2465,17 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { // Break out conditions. // Break at maximum of active_max_gf_interval unless almost totally static. - if (((twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) && - (i >= active_max_gf_interval) && (zero_motion_accumulator < 0.995)) || + // + // Note that the addition of a test of rc->source_alt_ref_active is + // deliberate. 
The effect of this is that after a normal altref group even + // if the material is static there will be one normal length GF group + // before allowing longer GF groups. The reason for this is that in cases + // such as slide shows where slides are separated by a complex transition + // such as a fade, the arf group spanning the transition may not be coded + // at a very high quality and hence this frame (with its overlay) is a + // poor golden frame to use for an extended group. + if (((i >= active_max_gf_interval) && + ((zero_motion_accumulator < 0.995) || (rc->source_alt_ref_active))) || ( // Don't break out with a very short interval. (i >= active_min_gf_interval) && @@ -2490,7 +2495,7 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->constrained_gf_group = (i >= rc->frames_to_key) ? 1 : 0; // Should we use the alternate reference frame. - if ((twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) && allow_alt_ref && + if ((zero_motion_accumulator < 0.995) && allow_alt_ref && (i < cpi->oxcf.lag_in_frames) && (i >= rc->min_gf_interval)) { const int forward_frames = (rc->frames_to_key - i >= i - 1) ? i - 1 @@ -2518,11 +2523,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { rc->gfu_boost = VPXMIN((int)rc->gfu_boost, i * 200); #endif - // Set the interval until the next gf. rc->baseline_gf_interval = - (twopass->kf_zeromotion_pct < STATIC_KF_GROUP_THRESH) - ? (i - (is_key_frame || rc->source_alt_ref_pending)) - : i; + ((twopass->kf_zeromotion_pct >= STATIC_KF_GROUP_THRESH) && + (i >= rc->frames_to_key)) + ? i + : (i - (is_key_frame || rc->source_alt_ref_pending)); rc->frames_till_gf_update_due = rc->baseline_gf_interval; @@ -2769,6 +2774,7 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { vp9_zero(next_frame); cpi->common.frame_type = KEY_FRAME; + rc->frames_since_key = 0; // Reset the GF group data structures. 
vp9_zero(*gf_group); @@ -2913,7 +2919,10 @@ static void find_next_key_frame(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) { for (i = 0; i < (rc->frames_to_key - 1); ++i) { if (EOF == input_stats(twopass, &next_frame)) break; - if (i <= KF_BOOST_SCAN_MAX_FRAMES) { + // The zero motion test here insures that if we mark a kf group as static + // it is static throughout not just the first KF_BOOST_SCAN_MAX_FRAMES. + // It also allows for a larger boost on long static groups. + if ((i <= KF_BOOST_SCAN_MAX_FRAMES) || (zero_motion_accumulator >= 0.99)) { double frame_boost; double zm_factor; @@ -3025,12 +3034,13 @@ static void configure_buffer_updates(VP9_COMP *cpi) { cpi->refresh_alt_ref_frame = 0; cpi->rc.is_src_frame_alt_ref = 1; break; - case ARF_UPDATE: + default: + assert(twopass->gf_group.update_type[twopass->gf_group.index] == + ARF_UPDATE); cpi->refresh_last_frame = 0; cpi->refresh_golden_frame = 0; cpi->refresh_alt_ref_frame = 1; break; - default: assert(0); break; } } diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 1cb978667..ba72c0be5 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -1793,7 +1793,7 @@ static const MV search_pos[4] = { unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, int mi_row, - int mi_col) { + int mi_col, const MV *ref_mv) { MACROBLOCKD *xd = &x->e_mbd; MODE_INFO *mi = xd->mi[0]; struct buf_2d backup_yv12[MAX_MB_PLANE] = { { 0, 0 } }; @@ -1815,6 +1815,7 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, const int norm_factor = 3 + (bw >> 5); const YV12_BUFFER_CONFIG *scaled_ref_frame = vp9_get_scaled_ref_frame(cpi, mi->ref_frame[0]); + MvLimits subpel_mv_limits; if (scaled_ref_frame) { int i; @@ -1917,6 +1918,10 @@ unsigned int vp9_int_pro_motion_estimation(const VP9_COMP *cpi, MACROBLOCK *x, tmp_mv->row *= 8; tmp_mv->col *= 8; + vp9_set_subpel_mv_search_range(&subpel_mv_limits, &x->mv_limits, ref_mv); + clamp_mv(tmp_mv, 
subpel_mv_limits.col_min, subpel_mv_limits.col_max, + subpel_mv_limits.row_min, subpel_mv_limits.row_max); + if (scaled_ref_frame) { int i; for (i = 0; i < MAX_MB_PLANE; i++) xd->plane[i].pre[0] = backup_yv12[i]; @@ -2210,7 +2215,8 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, var = bigdia_search(x, mvp_full, step_param, error_per_bit, 1, cost_list, fn_ptr, 1, ref_mv, tmp_mv); break; - case NSTEP: + default: + assert(method == NSTEP); var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, MAX_MVSEARCH_STEPS - 1 - step_param, 1, cost_list, fn_ptr, ref_mv, tmp_mv); @@ -2236,7 +2242,6 @@ int vp9_full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, } } break; - default: assert(0 && "Invalid search method."); } if (method != NSTEP && rd && var < var_max) diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index b8db2c353..b4787fe1f 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -66,7 +66,8 @@ int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv, // Perform integral projection based motion estimation. 
unsigned int vp9_int_pro_motion_estimation(const struct VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE bsize, - int mi_row, int mi_col); + int mi_row, int mi_col, + const MV *ref_mv); typedef uint32_t(fractional_mv_step_fp)( const MACROBLOCK *x, MV *bestmv, const MV *ref_mv, int allow_hp, diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c index 1c2c55b9e..4e9649065 100644 --- a/vp9/encoder/vp9_picklpf.c +++ b/vp9/encoder/vp9_picklpf.c @@ -169,14 +169,10 @@ void vp9_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP9_COMP *cpi, case VPX_BITS_10: filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20); break; - case VPX_BITS_12: + default: + assert(cm->bit_depth == VPX_BITS_12); filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22); break; - default: - assert(0 && - "bit_depth should be VPX_BITS_8, VPX_BITS_10 " - "or VPX_BITS_12"); - return; } #else int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18); diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index 3aee46636..f9d7a6db8 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -726,13 +726,13 @@ static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, RD_COST *this_rdc, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - case TX_4X4: + default: + assert(tx_size == TX_4X4); x->fwd_txfm4x4(src_diff, coeff, diff_stride); vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan); break; - default: assert(0); break; } *skippable &= (*eob == 0); eob_cost += 1; @@ -1502,8 +1502,17 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, int flag_svc_subpel = 0; int svc_mv_col = 0; int svc_mv_row = 0; + int no_scaling = 0; unsigned int thresh_svc_skip_golden = 500; - if (cpi->svc.spatial_layer_id > 0 && cpi->svc.high_source_sad_superframe) + if (cpi->use_svc && cpi->svc.spatial_layer_id > 0) { + int layer = 
LAYER_IDS_TO_IDX(cpi->svc.spatial_layer_id - 1, + cpi->svc.temporal_layer_id, + cpi->svc.number_temporal_layers); + LAYER_CONTEXT *lc = &cpi->svc.layer_context[layer]; + if (lc->scaling_factor_num == lc->scaling_factor_den) no_scaling = 1; + } + if (cpi->svc.spatial_layer_id > 0 && + (cpi->svc.high_source_sad_superframe || no_scaling)) thresh_svc_skip_golden = 0; // Lower the skip threshold if lower spatial layer is better quality relative // to current layer. @@ -1517,7 +1526,6 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, thresh_svc_skip_golden = 1000; init_ref_frame_cost(cm, xd, ref_frame_cost); - memset(&mode_checked[0][0], 0, MB_MODE_COUNT * MAX_REF_FRAMES); if (reuse_inter_pred) { @@ -1764,7 +1772,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, continue; if (sf->short_circuit_flat_blocks && x->source_variance == 0 && - this_mode != NEARESTMV) { + frame_mv[this_mode][ref_frame].as_int != 0) { continue; } @@ -1875,7 +1883,7 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, (!cpi->sf.adaptive_rd_thresh_row_mt && rd_less_than_thresh(best_rdc.rdcost, mode_rd_thresh, &rd_thresh_freq_fact[mode_index]))) - continue; + if (frame_mv[this_mode][ref_frame].as_int != 0) continue; if (this_mode == NEWMV && !force_gf_mv) { if (ref_frame > LAST_FRAME && !cpi->use_svc && @@ -1886,7 +1894,9 @@ void vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, TileDataEnc *tile_data, if (bsize < BLOCK_16X16) continue; - tmp_sad = vp9_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col); + tmp_sad = vp9_int_pro_motion_estimation( + cpi, x, bsize, mi_row, mi_col, + &x->mbmi_ext->ref_mvs[ref_frame][0].as_mv); if (tmp_sad > x->pred_mv_sad[LAST_FRAME]) continue; if (tmp_sad + (num_pels_log2_lookup[bsize] << 4) > best_pred_sad) diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 09f61ead2..276022a56 100644 --- a/vp9/encoder/vp9_quantize.c +++ 
b/vp9/encoder/vp9_quantize.c @@ -204,10 +204,9 @@ static int get_qzbin_factor(int q, vpx_bit_depth_t bit_depth) { switch (bit_depth) { case VPX_BITS_8: return q == 0 ? 64 : (quant < 148 ? 84 : 80); case VPX_BITS_10: return q == 0 ? 64 : (quant < 592 ? 84 : 80); - case VPX_BITS_12: return q == 0 ? 64 : (quant < 2368 ? 84 : 80); default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; + assert(bit_depth == VPX_BITS_12); + return q == 0 ? 64 : (quant < 2368 ? 84 : 80); } #else (void)bit_depth; diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index 9fbb48817..599337f80 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -48,18 +48,16 @@ #define MAX_BPB_FACTOR 50 #if CONFIG_VP9_HIGHBITDEPTH -#define ASSIGN_MINQ_TABLE(bit_depth, name) \ - do { \ - switch (bit_depth) { \ - case VPX_BITS_8: name = name##_8; break; \ - case VPX_BITS_10: name = name##_10; break; \ - case VPX_BITS_12: name = name##_12; break; \ - default: \ - assert(0 && \ - "bit_depth should be VPX_BITS_8, VPX_BITS_10" \ - " or VPX_BITS_12"); \ - name = NULL; \ - } \ +#define ASSIGN_MINQ_TABLE(bit_depth, name) \ + do { \ + switch (bit_depth) { \ + case VPX_BITS_8: name = name##_8; break; \ + case VPX_BITS_10: name = name##_10; break; \ + default: \ + assert(bit_depth == VPX_BITS_12); \ + name = name##_12; \ + break; \ + } \ } while (0) #else #define ASSIGN_MINQ_TABLE(bit_depth, name) \ @@ -167,10 +165,9 @@ double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth) { switch (bit_depth) { case VPX_BITS_8: return vp9_ac_quant(qindex, 0, bit_depth) / 4.0; case VPX_BITS_10: return vp9_ac_quant(qindex, 0, bit_depth) / 16.0; - case VPX_BITS_12: return vp9_ac_quant(qindex, 0, bit_depth) / 64.0; default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1.0; + assert(bit_depth == VPX_BITS_12); + return vp9_ac_quant(qindex, 0, bit_depth) / 64.0; } #else return vp9_ac_quant(qindex, 0, 
bit_depth) / 4.0; diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c index 6b2306ce9..3407e74c6 100644 --- a/vp9/encoder/vp9_rd.c +++ b/vp9/encoder/vp9_rd.c @@ -69,10 +69,12 @@ static void fill_mode_costs(VP9_COMP *cpi) { const FRAME_CONTEXT *const fc = cpi->common.fc; int i, j; - for (i = 0; i < INTRA_MODES; ++i) - for (j = 0; j < INTRA_MODES; ++j) + for (i = 0; i < INTRA_MODES; ++i) { + for (j = 0; j < INTRA_MODES; ++j) { vp9_cost_tokens(cpi->y_mode_costs[i][j], vp9_kf_y_mode_prob[i][j], vp9_intra_mode_tree); + } + } vp9_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp9_intra_mode_tree); for (i = 0; i < INTRA_MODES; ++i) { @@ -82,9 +84,28 @@ static void fill_mode_costs(VP9_COMP *cpi) { fc->uv_mode_prob[i], vp9_intra_mode_tree); } - for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) + for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i) { vp9_cost_tokens(cpi->switchable_interp_costs[i], fc->switchable_interp_prob[i], vp9_switchable_interp_tree); + } + + for (i = TX_8X8; i < TX_SIZES; ++i) { + for (j = 0; j < TX_SIZE_CONTEXTS; ++j) { + const vpx_prob *tx_probs = get_tx_probs(i, j, &fc->tx_probs); + int k; + for (k = 0; k <= i; ++k) { + int cost = 0; + int m; + for (m = 0; m <= k - (k == i); ++m) { + if (m == k) + cost += vp9_cost_zero(tx_probs[m]); + else + cost += vp9_cost_one(tx_probs[m]); + } + cpi->tx_size_cost[i - 1][j][k] = cost; + } + } + } } static void fill_token_costs(vp9_coeff_cost *c, @@ -153,10 +174,10 @@ int64_t vp9_compute_rd_mult_based_on_qindex(const VP9_COMP *cpi, int qindex) { switch (cpi->common.bit_depth) { case VPX_BITS_8: rdmult = 88 * q * q / 24; break; case VPX_BITS_10: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 4); break; - case VPX_BITS_12: rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8); break; default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; + assert(cpi->common.bit_depth == VPX_BITS_12); + rdmult = ROUND_POWER_OF_TWO(88 * q * q / 24, 8); + break; } #else int64_t rdmult = 88 * q * 
q / 24; @@ -185,10 +206,10 @@ static int compute_rd_thresh_factor(int qindex, vpx_bit_depth_t bit_depth) { switch (bit_depth) { case VPX_BITS_8: q = vp9_dc_quant(qindex, 0, VPX_BITS_8) / 4.0; break; case VPX_BITS_10: q = vp9_dc_quant(qindex, 0, VPX_BITS_10) / 16.0; break; - case VPX_BITS_12: q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0; break; default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); - return -1; + assert(bit_depth == VPX_BITS_12); + q = vp9_dc_quant(qindex, 0, VPX_BITS_12) / 64.0; + break; } #else (void)bit_depth; @@ -209,12 +230,11 @@ void vp9_initialize_me_consts(VP9_COMP *cpi, MACROBLOCK *x, int qindex) { x->sadperbit16 = sad_per_bit16lut_10[qindex]; x->sadperbit4 = sad_per_bit4lut_10[qindex]; break; - case VPX_BITS_12: + default: + assert(cpi->common.bit_depth == VPX_BITS_12); x->sadperbit16 = sad_per_bit16lut_12[qindex]; x->sadperbit4 = sad_per_bit4lut_12[qindex]; break; - default: - assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 or VPX_BITS_12"); } #else (void)cpi; @@ -471,13 +491,13 @@ void vp9_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size, for (i = 0; i < num_4x4_h; i += 4) t_left[i] = !!*(const uint32_t *)&left[i]; break; - case TX_32X32: + default: + assert(tx_size == TX_32X32); for (i = 0; i < num_4x4_w; i += 8) t_above[i] = !!*(const uint64_t *)&above[i]; for (i = 0; i < num_4x4_h; i += 8) t_left[i] = !!*(const uint64_t *)&left[i]; break; - default: assert(0 && "Invalid transform size."); break; } } diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 39cd1d41c..e39df033a 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -543,8 +543,9 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, MACROBLOCKD *const xd = &x->e_mbd; const struct macroblock_plane *const p = &x->plane[plane]; const struct macroblockd_plane *const pd = &xd->plane[plane]; + const int eob = p->eobs[block]; - if (x->block_tx_domain) { + if (x->block_tx_domain && 
eob) { const int ss_txfrm_size = tx_size << 1; int64_t this_sse; const int shift = tx_size == TX_32X32 ? 0 : 2; @@ -584,14 +585,13 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, const uint8_t *src = &p->src.buf[src_idx]; const uint8_t *dst = &pd->dst.buf[dst_idx]; const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block); - const uint16_t *eob = &p->eobs[block]; unsigned int tmp; tmp = pixel_sse(cpi, xd, pd, src, src_stride, dst, dst_stride, blk_row, blk_col, plane_bsize, tx_bsize); *out_sse = (int64_t)tmp * 16; - if (*eob) { + if (eob) { #if CONFIG_VP9_HIGHBITDEPTH DECLARE_ALIGNED(16, uint16_t, recon16[1024]); uint8_t *recon = (uint8_t *)recon16; @@ -604,22 +604,22 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, vpx_highbd_convolve_copy(CONVERT_TO_SHORTPTR(dst), dst_stride, recon16, 32, NULL, 0, 0, 0, 0, bs, bs, xd->bd); if (xd->lossless) { - vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); + vp9_highbd_iwht4x4_add(dqcoeff, recon16, 32, eob, xd->bd); } else { switch (tx_size) { case TX_4X4: - vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, *eob, xd->bd); + vp9_highbd_idct4x4_add(dqcoeff, recon16, 32, eob, xd->bd); break; case TX_8X8: - vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, *eob, xd->bd); + vp9_highbd_idct8x8_add(dqcoeff, recon16, 32, eob, xd->bd); break; case TX_16X16: - vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, *eob, xd->bd); + vp9_highbd_idct16x16_add(dqcoeff, recon16, 32, eob, xd->bd); break; - case TX_32X32: - vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, *eob, xd->bd); + default: + assert(tx_size == TX_32X32); + vp9_highbd_idct32x32_add(dqcoeff, recon16, 32, eob, xd->bd); break; - default: assert(0 && "Invalid transform size"); } } recon = CONVERT_TO_BYTEPTR(recon16); @@ -627,16 +627,16 @@ static void dist_block(const VP9_COMP *cpi, MACROBLOCK *x, int plane, #endif // CONFIG_VP9_HIGHBITDEPTH vpx_convolve_copy(dst, dst_stride, recon, 32, NULL, 0, 0, 0, 0, bs, bs); switch (tx_size) { - 
case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, *eob); break; - case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, *eob); break; - case TX_8X8: vp9_idct8x8_add(dqcoeff, recon, 32, *eob); break; - case TX_4X4: + case TX_32X32: vp9_idct32x32_add(dqcoeff, recon, 32, eob); break; + case TX_16X16: vp9_idct16x16_add(dqcoeff, recon, 32, eob); break; + case TX_8X8: vp9_idct8x8_add(dqcoeff, recon, 32, eob); break; + default: + assert(tx_size == TX_4X4); // this is like vp9_short_idct4x4 but has a special case around // eob<=1, which is significant (not just an optimization) for // the lossless case. - x->inv_txfm_add(dqcoeff, recon, 32, *eob); + x->inv_txfm_add(dqcoeff, recon, 32, eob); break; - default: assert(0 && "Invalid transform size"); break; } #if CONFIG_VP9_HIGHBITDEPTH } @@ -845,21 +845,20 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, { INT64_MAX, INT64_MAX }, { INT64_MAX, INT64_MAX }, { INT64_MAX, INT64_MAX } }; - int n, m; + int n; int s0, s1; int64_t best_rd = INT64_MAX; TX_SIZE best_tx = max_tx_size; int start_tx, end_tx; - - const vpx_prob *tx_probs = - get_tx_probs(max_tx_size, get_tx_size_context(xd), &cm->fc->tx_probs); + const int tx_size_ctx = get_tx_size_context(xd); assert(skip_prob > 0); s0 = vp9_cost_bit(skip_prob, 0); s1 = vp9_cost_bit(skip_prob, 1); if (cm->tx_mode == TX_MODE_SELECT) { start_tx = max_tx_size; - end_tx = 0; + end_tx = VPXMAX(start_tx - cpi->sf.tx_size_search_depth, 0); + if (bs > BLOCK_32X32) end_tx = VPXMIN(end_tx + 1, start_tx); } else { TX_SIZE chosen_tx_size = VPXMIN(max_tx_size, tx_mode_to_biggest_tx_size[cm->tx_mode]); @@ -868,13 +867,7 @@ static void choose_tx_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, } for (n = start_tx; n >= end_tx; n--) { - int r_tx_size = 0; - for (m = 0; m <= n - (n == (int)max_tx_size); m++) { - if (m == n) - r_tx_size += vp9_cost_zero(tx_probs[m]); - else - r_tx_size += vp9_cost_one(tx_probs[m]); - } + const int r_tx_size = 
cpi->tx_size_cost[max_tx_size - 1][tx_size_ctx][n]; txfm_rd_in_plane(cpi, x, &r[n][0], &d[n], &s[n], &sse[n], ref_best_rd, 0, bs, n, cpi->sf.use_fast_coef_costing); r[n][1] = r[n][0]; @@ -1469,11 +1462,11 @@ static int set_and_cost_bmi_mvs(VP9_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd, if (is_compound) this_mv[1].as_int = frame_mv[mode][mi->ref_frame[1]].as_int; break; - case ZEROMV: + default: + assert(mode == ZEROMV); this_mv[0].as_int = 0; if (is_compound) this_mv[1].as_int = 0; break; - default: break; } mi->bmi[i].as_mv[0].as_int = this_mv[0].as_int; diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c index c0f985cbd..90da68726 100644 --- a/vp9/encoder/vp9_speed_features.c +++ b/vp9/encoder/vp9_speed_features.c @@ -817,6 +817,7 @@ void vp9_set_speed_features_framesize_independent(VP9_COMP *cpi) { // Some speed-up features even for best quality as minimal impact on quality. sf->adaptive_rd_thresh = 1; sf->tx_size_search_breakout = 1; + sf->tx_size_search_depth = 2; sf->exhaustive_searches_thresh = (cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ? (1 << 20) diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h index 15e8dacbd..946bf0545 100644 --- a/vp9/encoder/vp9_speed_features.h +++ b/vp9/encoder/vp9_speed_features.h @@ -272,6 +272,9 @@ typedef struct SPEED_FEATURES { // for intra and model coefs for the rest. TX_SIZE_SEARCH_METHOD tx_size_search_method; + // How many levels of tx size to search, starting from the largest. + int tx_size_search_depth; + // Low precision 32x32 fdct keeps everything in 16 bits and thus is less // precise but significantly faster than the non lp version. 
int use_lp32x32fdct; diff --git a/vp9/encoder/vp9_svc_layercontext.c b/vp9/encoder/vp9_svc_layercontext.c index 42a197769..07d1995a8 100644 --- a/vp9/encoder/vp9_svc_layercontext.c +++ b/vp9/encoder/vp9_svc_layercontext.c @@ -46,9 +46,9 @@ void vp9_init_layer_context(VP9_COMP *const cpi) { svc->last_layer_dropped[sl] = 0; svc->drop_spatial_layer[sl] = 0; svc->ext_frame_flags[sl] = 0; - svc->ext_lst_fb_idx[sl] = 0; - svc->ext_gld_fb_idx[sl] = 1; - svc->ext_alt_fb_idx[sl] = 2; + svc->lst_fb_idx[sl] = 0; + svc->gld_fb_idx[sl] = 1; + svc->alt_fb_idx[sl] = 2; svc->downsample_filter_type[sl] = BILINEAR; svc->downsample_filter_phase[sl] = 8; // Set to 8 for averaging filter. svc->framedrop_thresh[sl] = oxcf->drop_frames_water_mark; @@ -407,6 +407,40 @@ void get_layer_resolution(const int width_org, const int height_org, *height_out = h; } +void reset_fb_idx_unused(VP9_COMP *const cpi) { + // If a reference frame is not referenced or refreshed, then set the + // fb_idx for that reference to the first one used/referenced. + // This is to avoid setting fb_idx for a reference to a slot that is not + // used/needed (i.e., since that reference is not referenced or refreshed). 
+ static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + MV_REFERENCE_FRAME ref_frame; + MV_REFERENCE_FRAME first_ref = 0; + int first_fb_idx = 0; + int fb_idx[3] = { cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx }; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) { + if (cpi->ref_frame_flags & flag_list[ref_frame]) { + first_ref = ref_frame; + first_fb_idx = fb_idx[ref_frame - 1]; + break; + } + } + if (first_ref > 0) { + if (first_ref != LAST_FRAME && + !(cpi->ref_frame_flags & flag_list[LAST_FRAME]) && + !cpi->ext_refresh_last_frame) + cpi->lst_fb_idx = first_fb_idx; + else if (first_ref != GOLDEN_FRAME && + !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) && + !cpi->ext_refresh_golden_frame) + cpi->gld_fb_idx = first_fb_idx; + else if (first_ref != ALTREF_FRAME && + !(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]) && + !cpi->ext_refresh_alt_ref_frame) + cpi->alt_fb_idx = first_fb_idx; + } +} + // The function sets proper ref_frame_flags, buffer indices, and buffer update // variables for temporal layering mode 3 - that does 0-2-1-2 temporal layering // scheme. 
@@ -510,6 +544,8 @@ static void set_flags_and_fb_idx_for_temporal_mode3(VP9_COMP *const cpi) { cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1; cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id; } + + reset_fb_idx_unused(cpi); } // The function sets proper ref_frame_flags, buffer indices, and buffer update @@ -569,6 +605,8 @@ static void set_flags_and_fb_idx_for_temporal_mode2(VP9_COMP *const cpi) { cpi->gld_fb_idx = cpi->svc.number_spatial_layers + spatial_id - 1; cpi->alt_fb_idx = cpi->svc.number_spatial_layers + spatial_id; } + + reset_fb_idx_unused(cpi); } // The function sets proper ref_frame_flags, buffer indices, and buffer update @@ -601,6 +639,28 @@ static void set_flags_and_fb_idx_for_temporal_mode_noLayering( } else { cpi->gld_fb_idx = 0; } + + reset_fb_idx_unused(cpi); +} + +void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) { + SVC *const svc = &cpi->svc; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + int sl = svc->spatial_layer_id; + svc->lst_fb_idx[sl] = cpi->lst_fb_idx; + svc->gld_fb_idx[sl] = cpi->gld_fb_idx; + svc->alt_fb_idx[sl] = cpi->alt_fb_idx; + + svc->update_last[sl] = (uint8_t)cpi->refresh_last_frame; + svc->update_golden[sl] = (uint8_t)cpi->refresh_golden_frame; + svc->update_altref[sl] = (uint8_t)cpi->refresh_alt_ref_frame; + svc->reference_last[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[LAST_FRAME]); + svc->reference_golden[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]); + svc->reference_altref[sl] = + (uint8_t)(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]); } int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { @@ -637,18 +697,30 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { cpi->svc.spatial_layer_id = cpi->svc.spatial_layer_to_encode; sl = cpi->svc.spatial_layer_id; vp9_apply_encoding_flags(cpi, cpi->svc.ext_frame_flags[sl]); - cpi->lst_fb_idx = cpi->svc.ext_lst_fb_idx[sl]; - cpi->gld_fb_idx = 
cpi->svc.ext_gld_fb_idx[sl]; - cpi->alt_fb_idx = cpi->svc.ext_alt_fb_idx[sl]; + cpi->lst_fb_idx = cpi->svc.lst_fb_idx[sl]; + cpi->gld_fb_idx = cpi->svc.gld_fb_idx[sl]; + cpi->alt_fb_idx = cpi->svc.alt_fb_idx[sl]; } } // Reset the drop flags for all spatial layers, on the base layer. if (cpi->svc.spatial_layer_id == 0) { - int i; - for (i = 0; i < cpi->svc.number_spatial_layers; i++) { - cpi->svc.drop_spatial_layer[i] = 0; + vp9_zero(cpi->svc.drop_spatial_layer); + // TODO(jianj/marpan): Investigate why setting cpi->svc.lst/gld/alt_fb_idx + // causes an issue with frame dropping and temporal layers, when the frame + // flags are passed via the encode call (bypass mode). Issue is that we're + // resetting ext_refresh_frame_flags_pending to 0 on frame drops. + if (cpi->svc.temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS) { + memset(&cpi->svc.lst_fb_idx, -1, sizeof(cpi->svc.lst_fb_idx)); + memset(&cpi->svc.gld_fb_idx, -1, sizeof(cpi->svc.lst_fb_idx)); + memset(&cpi->svc.alt_fb_idx, -1, sizeof(cpi->svc.lst_fb_idx)); } + vp9_zero(cpi->svc.update_last); + vp9_zero(cpi->svc.update_golden); + vp9_zero(cpi->svc.update_altref); + vp9_zero(cpi->svc.reference_last); + vp9_zero(cpi->svc.reference_golden); + vp9_zero(cpi->svc.reference_altref); } lc = &cpi->svc.layer_context[cpi->svc.spatial_layer_id * @@ -714,6 +786,15 @@ int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) { if (cpi->svc.spatial_layer_id == 0) cpi->svc.high_source_sad_superframe = 0; + if (cpi->svc.temporal_layering_mode != VP9E_TEMPORAL_LAYERING_MODE_BYPASS && + cpi->svc.last_layer_dropped[cpi->svc.spatial_layer_id]) { + // For fixed/non-flexible mode, if the previous frame (same spatial layer + // from previous superframe) was dropped, make sure the lst_fb_idx + // for this frame corresponds to the buffer index updated on (last) encoded + // TL0 frame (with same spatial layer). 
+ cpi->lst_fb_idx = cpi->svc.fb_idx_upd_tl0[cpi->svc.spatial_layer_id]; + } + if (vp9_set_size_literal(cpi, width, height) != 0) return VPX_CODEC_INVALID_PARAM; @@ -799,3 +880,63 @@ void vp9_svc_check_reset_layer_rc_flag(VP9_COMP *const cpi) { } } } + +void vp9_svc_constrain_inter_layer_pred(VP9_COMP *const cpi) { + VP9_COMMON *const cm = &cpi->common; + // Check for disabling inter-layer (spatial) prediction, if + // svc.disable_inter_layer_pred is set. If the previous spatial layer was + // dropped then disable the prediction from this (scaled) reference. + if ((cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_OFF_NONKEY && + !cpi->svc.layer_context[cpi->svc.temporal_layer_id].is_key_frame) || + cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_OFF || + cpi->svc.drop_spatial_layer[cpi->svc.spatial_layer_id - 1]) { + MV_REFERENCE_FRAME ref_frame; + static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, + VP9_ALT_FLAG }; + for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) { + const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame); + if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) { + const struct scale_factors *const scale_fac = + &cm->frame_refs[ref_frame - 1].sf; + if (vp9_is_scaled(scale_fac)) + cpi->ref_frame_flags &= (~flag_list[ref_frame]); + } + } + } + // Check for disabling inter-layer prediction if + // INTER_LAYER_PRED_ON_CONSTRAINED is enabled. + // If the reference for inter-layer prediction (the reference that is scaled) + // is not the previous spatial layer from the same superframe, then we + // disable inter-layer prediction. + if (cpi->svc.disable_inter_layer_pred == INTER_LAYER_PRED_ON_CONSTRAINED) { + // We only use LAST and GOLDEN for prediction in real-time mode, so we + // check both here. 
+ MV_REFERENCE_FRAME ref_frame; + for (ref_frame = LAST_FRAME; ref_frame <= GOLDEN_FRAME; ref_frame++) { + struct scale_factors *scale_fac = &cm->frame_refs[ref_frame - 1].sf; + if (vp9_is_scaled(scale_fac)) { + // If this reference was updated on the previous spatial layer of the + // current superframe, then we keep this reference (don't disable). + // Otherwise we disable the inter-layer prediction. + // This condition is verified by checking if the current frame buffer + // index is equal to any of the slots for the previous spatial layer, + // and if so, check if that slot was updated/refreshed. If that is the + // case, then this reference is valid for inter-layer prediction under + // the mode INTER_LAYER_PRED_ON_CONSTRAINED. + int fb_idx = + ref_frame == LAST_FRAME ? cpi->lst_fb_idx : cpi->gld_fb_idx; + int ref_flag = ref_frame == LAST_FRAME ? VP9_LAST_FLAG : VP9_GOLD_FLAG; + int sl = cpi->svc.spatial_layer_id; + int disable = 1; + if ((fb_idx == cpi->svc.lst_fb_idx[sl - 1] && + cpi->svc.update_last[sl - 1]) || + (fb_idx == cpi->svc.gld_fb_idx[sl - 1] && + cpi->svc.update_golden[sl - 1]) || + (fb_idx == cpi->svc.alt_fb_idx[sl - 1] && + cpi->svc.update_altref[sl - 1])) + disable = 0; + if (disable) cpi->ref_frame_flags &= (~ref_flag); + } + } + } +} diff --git a/vp9/encoder/vp9_svc_layercontext.h b/vp9/encoder/vp9_svc_layercontext.h index 022fd00f7..617717049 100644 --- a/vp9/encoder/vp9_svc_layercontext.h +++ b/vp9/encoder/vp9_svc_layercontext.h @@ -20,9 +20,16 @@ extern "C" { #endif typedef enum { + // Inter-layer prediction is on on all frames. INTER_LAYER_PRED_ON, + // Inter-layer prediction is off on all frames. INTER_LAYER_PRED_OFF, - INTER_LAYER_PRED_OFF_NONKEY + // Inter-layer prediction is off on non-key frames. + INTER_LAYER_PRED_OFF_NONKEY, + // Inter-layer prediction is on on all frames, but constrained such + // that any layer S (> 0) can only predict from previous spatial + // layer S-1, from the same superframe. 
+ INTER_LAYER_PRED_ON_CONSTRAINED } INTER_LAYER_PRED; typedef struct { @@ -86,9 +93,9 @@ typedef struct SVC { // Frame flags and buffer indexes for each spatial layer, set by the // application (external settings). int ext_frame_flags[VPX_MAX_LAYERS]; - int ext_lst_fb_idx[VPX_MAX_LAYERS]; - int ext_gld_fb_idx[VPX_MAX_LAYERS]; - int ext_alt_fb_idx[VPX_MAX_LAYERS]; + int lst_fb_idx[VPX_MAX_LAYERS]; + int gld_fb_idx[VPX_MAX_LAYERS]; + int alt_fb_idx[VPX_MAX_LAYERS]; int ref_frame_index[REF_FRAMES]; int force_zero_mode_spatial_ref; int current_superframe; @@ -123,6 +130,18 @@ typedef struct SVC { // currently checked for each superframe prior to encoding, on the full // resolution source. int high_source_sad_superframe; + + // Flags used to get SVC pattern info. + uint8_t update_last[VPX_SS_MAX_LAYERS]; + uint8_t update_golden[VPX_SS_MAX_LAYERS]; + uint8_t update_altref[VPX_SS_MAX_LAYERS]; + uint8_t reference_last[VPX_SS_MAX_LAYERS]; + uint8_t reference_golden[VPX_SS_MAX_LAYERS]; + uint8_t reference_altref[VPX_SS_MAX_LAYERS]; + + // Keep track of the frame buffer index updated/refreshed on the base + // temporal superframe. 
+ uint8_t fb_idx_upd_tl0[VPX_SS_MAX_LAYERS]; } SVC; struct VP9_COMP; @@ -170,6 +189,8 @@ struct lookahead_entry *vp9_svc_lookahead_pop(struct VP9_COMP *const cpi, // Start a frame and initialize svc parameters int vp9_svc_start_frame(struct VP9_COMP *const cpi); +void vp9_copy_flags_ref_update_idx(struct VP9_COMP *const cpi); + int vp9_one_pass_cbr_svc_start_layer(struct VP9_COMP *const cpi); void vp9_free_svc_cyclic_refresh(struct VP9_COMP *const cpi); @@ -178,6 +199,8 @@ void vp9_svc_reset_key_frame(struct VP9_COMP *const cpi); void vp9_svc_check_reset_layer_rc_flag(struct VP9_COMP *const cpi); +void vp9_svc_constrain_inter_layer_pred(struct VP9_COMP *const cpi); + #ifdef __cplusplus } // extern "C" #endif diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index e006606f1..5eaa7a18a 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -1487,6 +1487,25 @@ static vpx_codec_err_t ctrl_set_svc_parameters(vpx_codec_alg_priv_t *ctx, return VPX_CODEC_OK; } +static vpx_codec_err_t ctrl_get_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, + va_list args) { + VP9_COMP *const cpi = ctx->cpi; + vpx_svc_ref_frame_config_t *data = va_arg(args, vpx_svc_ref_frame_config_t *); + int sl; + for (sl = 0; sl <= cpi->svc.spatial_layer_id; sl++) { + data->update_last[sl] = cpi->svc.update_last[sl]; + data->update_golden[sl] = cpi->svc.update_golden[sl]; + data->update_alt_ref[sl] = cpi->svc.update_altref[sl]; + data->reference_last[sl] = cpi->svc.reference_last[sl]; + data->reference_golden[sl] = cpi->svc.reference_golden[sl]; + data->reference_alt_ref[sl] = cpi->svc.reference_altref[sl]; + data->lst_fb_idx[sl] = cpi->svc.lst_fb_idx[sl]; + data->gld_fb_idx[sl] = cpi->svc.gld_fb_idx[sl]; + data->alt_fb_idx[sl] = cpi->svc.alt_fb_idx[sl]; + } + return VPX_CODEC_OK; +} + static vpx_codec_err_t ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, va_list args) { VP9_COMP *const cpi = ctx->cpi; @@ -1494,9 +1513,9 @@ static vpx_codec_err_t 
ctrl_set_svc_ref_frame_config(vpx_codec_alg_priv_t *ctx, int sl; for (sl = 0; sl < cpi->svc.number_spatial_layers; ++sl) { cpi->svc.ext_frame_flags[sl] = data->frame_flags[sl]; - cpi->svc.ext_lst_fb_idx[sl] = data->lst_fb_idx[sl]; - cpi->svc.ext_gld_fb_idx[sl] = data->gld_fb_idx[sl]; - cpi->svc.ext_alt_fb_idx[sl] = data->alt_fb_idx[sl]; + cpi->svc.lst_fb_idx[sl] = data->lst_fb_idx[sl]; + cpi->svc.gld_fb_idx[sl] = data->gld_fb_idx[sl]; + cpi->svc.alt_fb_idx[sl] = data->alt_fb_idx[sl]; } return VPX_CODEC_OK; } @@ -1613,6 +1632,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { { VP9E_GET_SVC_LAYER_ID, ctrl_get_svc_layer_id }, { VP9E_GET_ACTIVEMAP, ctrl_get_active_map }, { VP9E_GET_LEVEL, ctrl_get_level }, + { VP9E_GET_SVC_REF_FRAME_CONFIG, ctrl_get_svc_ref_frame_config }, { -1, NULL }, }; diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index f409844b5..b201d96f4 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -620,6 +620,13 @@ enum vp8e_enc_control_id { * Supported in codecs: VP9 */ VP9E_SET_SVC_FRAME_DROP_LAYER, + + /*!\brief Codec control function to get the refresh and reference flags and + * the buffer indices, up to the last encoded spatial layer. + * + * Supported in codecs: VP9 + */ + VP9E_GET_SVC_REF_FRAME_CONFIG, }; /*!\brief vpx 1-D scaling mode @@ -757,10 +764,18 @@ typedef struct vpx_svc_layer_id { * */ typedef struct vpx_svc_ref_frame_config { - int frame_flags[VPX_TS_MAX_LAYERS]; /**< Frame flags. */ - int lst_fb_idx[VPX_TS_MAX_LAYERS]; /**< Last buffer index. */ - int gld_fb_idx[VPX_TS_MAX_LAYERS]; /**< Golden buffer index. */ - int alt_fb_idx[VPX_TS_MAX_LAYERS]; /**< Altref buffer index. */ + // TODO(jianj/marpan): Remove the usage of frame_flags, instead use the + // update and reference flags. + int frame_flags[VPX_SS_MAX_LAYERS]; /**< Frame flags. */ + int lst_fb_idx[VPX_SS_MAX_LAYERS]; /**< Last buffer index. */ + int gld_fb_idx[VPX_SS_MAX_LAYERS]; /**< Golden buffer index. */ + int alt_fb_idx[VPX_SS_MAX_LAYERS]; /**< Altref buffer index. 
*/ + int update_last[VPX_SS_MAX_LAYERS]; /**< Update last. */ + int update_golden[VPX_SS_MAX_LAYERS]; /**< Update golden. */ + int update_alt_ref[VPX_SS_MAX_LAYERS]; /**< Update altref. */ + int reference_last[VPX_SS_MAX_LAYERS]; /**< Last as reference. */ + int reference_golden[VPX_SS_MAX_LAYERS]; /**< Golden as reference. */ + int reference_alt_ref[VPX_SS_MAX_LAYERS]; /**< Altref as reference. */ } vpx_svc_ref_frame_config_t; /*!\brief VP9 svc frame dropping mode. @@ -927,6 +942,9 @@ VPX_CTRL_USE_TYPE(VP9E_SET_SVC_INTER_LAYER_PRED, unsigned int) VPX_CTRL_USE_TYPE(VP9E_SET_SVC_FRAME_DROP_LAYER, vpx_svc_frame_drop_t *) #define VPX_CTRL_VP9E_SET_SVC_FRAME_DROP_LAYER +VPX_CTRL_USE_TYPE(VP9E_GET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *) +#define VPX_CTRL_VP9E_GET_SVC_REF_FRAME_CONFIG + /*!\endcond */ /*! @} - end defgroup vp8_encoder */ #ifdef __cplusplus diff --git a/vpx/vpx_encoder.h b/vpx/vpx_encoder.h index 4017e5719..8c08017b6 100644 --- a/vpx/vpx_encoder.h +++ b/vpx/vpx_encoder.h @@ -63,7 +63,7 @@ extern "C" { * fields to structures */ #define VPX_ENCODER_ABI_VERSION \ - (11 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ + (12 + VPX_CODEC_ABI_VERSION) /**<\hideinitializer*/ /*! 
\brief Encoder capabilities bitfield * diff --git a/vpx_dsp/arm/avg_pred_neon.c b/vpx_dsp/arm/avg_pred_neon.c index 1370ec2d2..5afdece0a 100644 --- a/vpx_dsp/arm/avg_pred_neon.c +++ b/vpx_dsp/arm/avg_pred_neon.c @@ -17,8 +17,8 @@ void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride) { if (width > 8) { - int x, y; - for (y = 0; y < height; ++y) { + int x, y = height; + do { for (x = 0; x < width; x += 16) { const uint8x16_t p = vld1q_u8(pred + x); const uint8x16_t r = vld1q_u8(ref + x); @@ -28,28 +28,38 @@ void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width, comp += width; pred += width; ref += ref_stride; - } + } while (--y); + } else if (width == 8) { + int i = width * height; + do { + const uint8x16_t p = vld1q_u8(pred); + uint8x16_t r; + const uint8x8_t r_0 = vld1_u8(ref); + const uint8x8_t r_1 = vld1_u8(ref + ref_stride); + r = vcombine_u8(r_0, r_1); + ref += 2 * ref_stride; + r = vrhaddq_u8(r, p); + vst1q_u8(comp, r); + + pred += 16; + comp += 16; + i -= 16; + } while (i); } else { - int i; - for (i = 0; i < width * height; i += 16) { + int i = width * height; + assert(width == 4); + do { const uint8x16_t p = vld1q_u8(pred); uint8x16_t r; - if (width == 4) { - r = load_unaligned_u8q(ref, ref_stride); - ref += 4 * ref_stride; - } else { - const uint8x8_t r_0 = vld1_u8(ref); - const uint8x8_t r_1 = vld1_u8(ref + ref_stride); - assert(width == 8); - r = vcombine_u8(r_0, r_1); - ref += 2 * ref_stride; - } + r = load_unaligned_u8q(ref, ref_stride); + ref += 4 * ref_stride; r = vrhaddq_u8(r, p); vst1q_u8(comp, r); pred += 16; comp += 16; - } + i -= 16; + } while (i); } } diff --git a/vpx_dsp/arm/mem_neon.h b/vpx_dsp/arm/mem_neon.h index 12c0a54c8..6745464d7 100644 --- a/vpx_dsp/arm/mem_neon.h +++ b/vpx_dsp/arm/mem_neon.h @@ -101,9 +101,9 @@ static INLINE uint8x8_t load_unaligned_u8(const uint8_t *buf, int stride) { if (stride == 4) return vld1_u8(buf); memcpy(&a, buf, 4); buf 
+= stride; - a_u32 = vld1_lane_u32(&a, a_u32, 0); + a_u32 = vset_lane_u32(a, a_u32, 0); memcpy(&a, buf, 4); - a_u32 = vld1_lane_u32(&a, a_u32, 1); + a_u32 = vset_lane_u32(a, a_u32, 1); return vreinterpret_u8_u32(a_u32); } @@ -127,16 +127,16 @@ static INLINE uint8x16_t load_unaligned_u8q(const uint8_t *buf, int stride) { if (stride == 4) return vld1q_u8(buf); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1q_lane_u32(&a, a_u32, 0); + a_u32 = vsetq_lane_u32(a, a_u32, 0); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1q_lane_u32(&a, a_u32, 1); + a_u32 = vsetq_lane_u32(a, a_u32, 1); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1q_lane_u32(&a, a_u32, 2); + a_u32 = vsetq_lane_u32(a, a_u32, 2); memcpy(&a, buf, 4); buf += stride; - a_u32 = vld1q_lane_u32(&a, a_u32, 3); + a_u32 = vsetq_lane_u32(a, a_u32, 3); return vreinterpretq_u8_u32(a_u32); } diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index b04de3aff..535ec0f0d 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -10,64 +10,152 @@ #include <arm_neon.h> +#include <assert.h> #include "./vpx_config.h" #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" #include "vpx_dsp/arm/mem_neon.h" #include "vpx_dsp/arm/sum_neon.h" +static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0, + const void *const buf1) { + uint32_t a; + uint32x2_t aa = vdup_n_u32(0); + memcpy(&a, buf0, 4); + aa = vset_lane_u32(a, aa, 0); + memcpy(&a, buf1, 4); + aa = vset_lane_u32(a, aa, 1); + return vreinterpret_u8_u32(aa); +} + +static INLINE void sad4x_4d(const uint8_t *const src, const int src_stride, + const uint8_t *const ref[4], const int ref_stride, + const int height, uint32_t *const res) { + int i; + uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; + uint16x4_t a[2]; + uint32x4_t r; + + assert(!((intptr_t)src % sizeof(uint32_t))); + assert(!(src_stride % sizeof(uint32_t))); + + for (i = 0; i < height; ++i) { + const uint8x8_t s = vreinterpret_u8_u32( + vld1_dup_u32((const 
uint32_t *)(src + i * src_stride))); + const uint8x8_t ref01 = load_unaligned_2_buffers(ref[0] + i * ref_stride, + ref[1] + i * ref_stride); + const uint8x8_t ref23 = load_unaligned_2_buffers(ref[2] + i * ref_stride, + ref[3] + i * ref_stride); + abs[0] = vabal_u8(abs[0], s, ref01); + abs[1] = vabal_u8(abs[1], s, ref23); + } + + a[0] = vpadd_u16(vget_low_u16(abs[0]), vget_high_u16(abs[0])); + a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1])); + r = vpaddlq_u16(vcombine_u16(a[0], a[1])); + vst1q_u32(res, r); +} + void vpx_sad4x4x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - int i; - const uint8x16_t src_u8 = load_unaligned_u8q(src, src_stride); - for (i = 0; i < 4; ++i) { - const uint8x16_t ref_u8 = load_unaligned_u8q(ref[i], ref_stride); - uint16x8_t abs = vabdl_u8(vget_low_u8(src_u8), vget_low_u8(ref_u8)); - abs = vabal_u8(abs, vget_high_u8(src_u8), vget_high_u8(ref_u8)); - res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0); - } + sad4x_4d(src, src_stride, ref, ref_stride, 4, res); } void vpx_sad4x8x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - int i; - const uint8x16_t src_0 = load_unaligned_u8q(src, src_stride); - const uint8x16_t src_1 = load_unaligned_u8q(src + 4 * src_stride, src_stride); - for (i = 0; i < 4; ++i) { - const uint8x16_t ref_0 = load_unaligned_u8q(ref[i], ref_stride); - const uint8x16_t ref_1 = - load_unaligned_u8q(ref[i] + 4 * ref_stride, ref_stride); - uint16x8_t abs = vabdl_u8(vget_low_u8(src_0), vget_low_u8(ref_0)); - abs = vabal_u8(abs, vget_high_u8(src_0), vget_high_u8(ref_0)); - abs = vabal_u8(abs, vget_low_u8(src_1), vget_low_u8(ref_1)); - abs = vabal_u8(abs, vget_high_u8(src_1), vget_high_u8(ref_1)); - res[i] = vget_lane_u32(horizontal_add_uint16x8(abs), 0); - } + sad4x_4d(src, src_stride, ref, ref_stride, 8, res); +} + 
+//////////////////////////////////////////////////////////////////////////////// + +// Can handle 512 pixels' sad sum (such as 16x32 or 32x16) +static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/, + uint32_t *const res) { + const uint16x4_t a0 = vadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); + const uint16x4_t a1 = vadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); + const uint16x4_t a2 = vadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); + const uint16x4_t a3 = vadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); + const uint16x4_t b0 = vpadd_u16(a0, a1); + const uint16x4_t b1 = vpadd_u16(a2, a3); + const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1)); + vst1q_u32(res, r); } -static INLINE void sad8x_4d(const uint8_t *a, int a_stride, - const uint8_t *const b[4], int b_stride, - uint32_t *result, const int height) { +// Can handle 1024 pixels' sad sum (such as 32x32) +static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/, + uint32_t *const res) { + const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); + const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); + const uint16x4_t a2 = vpadd_u16(vget_low_u16(sum[2]), vget_high_u16(sum[2])); + const uint16x4_t a3 = vpadd_u16(vget_low_u16(sum[3]), vget_high_u16(sum[3])); + const uint32x4_t b0 = vpaddlq_u16(vcombine_u16(a0, a1)); + const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3)); + const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0)); + const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1)); + vst1q_u32(res, vcombine_u32(c0, c1)); +} + +// Can handle 2048 pixels' sad sum (such as 32x64 or 64x32) +static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/, + uint32_t *const res) { + const uint32x4_t a0 = vpaddlq_u16(sum[0]); + const uint32x4_t a1 = vpaddlq_u16(sum[1]); + const uint32x4_t a2 = vpaddlq_u16(sum[2]); + const uint32x4_t a3 = vpaddlq_u16(sum[3]); + const 
uint32x2_t b0 = vadd_u32(vget_low_u32(a0), vget_high_u32(a0)); + const uint32x2_t b1 = vadd_u32(vget_low_u32(a1), vget_high_u32(a1)); + const uint32x2_t b2 = vadd_u32(vget_low_u32(a2), vget_high_u32(a2)); + const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3)); + const uint32x2_t c0 = vpadd_u32(b0, b1); + const uint32x2_t c1 = vpadd_u32(b2, b3); + vst1q_u32(res, vcombine_u32(c0, c1)); +} + +// Can handle 4096 pixels' sad sum (such as 64x64) +static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, + uint32_t *const res) { + const uint32x4_t a0 = vpaddlq_u16(sum[0]); + const uint32x4_t a1 = vpaddlq_u16(sum[1]); + const uint32x4_t a2 = vpaddlq_u16(sum[2]); + const uint32x4_t a3 = vpaddlq_u16(sum[3]); + const uint32x4_t a4 = vpaddlq_u16(sum[4]); + const uint32x4_t a5 = vpaddlq_u16(sum[5]); + const uint32x4_t a6 = vpaddlq_u16(sum[6]); + const uint32x4_t a7 = vpaddlq_u16(sum[7]); + const uint32x4_t b0 = vaddq_u32(a0, a1); + const uint32x4_t b1 = vaddq_u32(a2, a3); + const uint32x4_t b2 = vaddq_u32(a4, a5); + const uint32x4_t b3 = vaddq_u32(a6, a7); + const uint32x2_t c0 = vadd_u32(vget_low_u32(b0), vget_high_u32(b0)); + const uint32x2_t c1 = vadd_u32(vget_low_u32(b1), vget_high_u32(b1)); + const uint32x2_t c2 = vadd_u32(vget_low_u32(b2), vget_high_u32(b2)); + const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); + const uint32x2_t d0 = vpadd_u32(c0, c1); + const uint32x2_t d1 = vpadd_u32(c2, c3); + vst1q_u32(res, vcombine_u32(d0, d1)); +} + +static INLINE void sad8x_4d(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res, const int height) { int i, j; + const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] }; uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; - const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; for (i = 0; i < height; ++i) { - const uint8x8_t a_u8 = vld1_u8(a); - a += a_stride; + const uint8x8_t s = vld1_u8(src); + 
src += src_stride; for (j = 0; j < 4; ++j) { - const uint8x8_t b_u8 = vld1_u8(b_loop[j]); - b_loop[j] += b_stride; - sum[j] = vabal_u8(sum[j], a_u8, b_u8); + const uint8x8_t b_u8 = vld1_u8(ref_loop[j]); + ref_loop[j] += ref_stride; + sum[j] = vabal_u8(sum[j], s, b_u8); } } - for (j = 0; j < 4; ++j) { - result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); - } + sad_512_pel_final_neon(sum, res); } void vpx_sad8x4x4d_neon(const uint8_t *src, int src_stride, @@ -88,28 +176,33 @@ void vpx_sad8x16x4d_neon(const uint8_t *src, int src_stride, sad8x_4d(src, src_stride, ref, ref_stride, res, 16); } -static INLINE void sad16x_4d(const uint8_t *a, int a_stride, - const uint8_t *const b[4], int b_stride, - uint32_t *result, const int height) { +//////////////////////////////////////////////////////////////////////////////// + +static INLINE void sad16_neon(const uint8_t *ref, const uint8x16_t src, + uint16x8_t *const sum) { + const uint8x16_t r = vld1q_u8(ref); + *sum = vabal_u8(*sum, vget_low_u8(src), vget_low_u8(r)); + *sum = vabal_u8(*sum, vget_high_u8(src), vget_high_u8(r)); +} + +static INLINE void sad16x_4d(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res, const int height) { int i, j; + const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] }; uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0) }; - const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; for (i = 0; i < height; ++i) { - const uint8x16_t a_u8 = vld1q_u8(a); - a += a_stride; + const uint8x16_t s = vld1q_u8(src); + src += src_stride; for (j = 0; j < 4; ++j) { - const uint8x16_t b_u8 = vld1q_u8(b_loop[j]); - b_loop[j] += b_stride; - sum[j] = vabal_u8(sum[j], vget_low_u8(a_u8), vget_low_u8(b_u8)); - sum[j] = vabal_u8(sum[j], vget_high_u8(a_u8), vget_high_u8(b_u8)); + sad16_neon(ref_loop[j], s, &sum[j]); + ref_loop[j] += ref_stride; } } - for (j = 0; j < 4; ++j) { - result[j] = 
vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); - } + sad_512_pel_final_neon(sum, res); } void vpx_sad16x8x4d_neon(const uint8_t *src, int src_stride, @@ -130,113 +223,152 @@ void vpx_sad16x32x4d_neon(const uint8_t *src, int src_stride, sad16x_4d(src, src_stride, ref, ref_stride, res, 32); } -static INLINE void sad32x_4d(const uint8_t *a, int a_stride, - const uint8_t *const b[4], int b_stride, - uint32_t *result, const int height) { - int i, j; - uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), - vdupq_n_u16(0) }; - const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; +//////////////////////////////////////////////////////////////////////////////// + +static INLINE void sad32x_4d(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + const int height, uint16x8_t *const sum) { + int i; + const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] }; + + sum[0] = sum[1] = sum[2] = sum[3] = vdupq_n_u16(0); for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(a); - const uint8x16_t a_1 = vld1q_u8(a + 16); - a += a_stride; - for (j = 0; j < 4; ++j) { - const uint8x16_t b_0 = vld1q_u8(b_loop[j]); - const uint8x16_t b_1 = vld1q_u8(b_loop[j] + 16); - b_loop[j] += b_stride; - sum[j] = vabal_u8(sum[j], vget_low_u8(a_0), vget_low_u8(b_0)); - sum[j] = vabal_u8(sum[j], vget_high_u8(a_0), vget_high_u8(b_0)); - sum[j] = vabal_u8(sum[j], vget_low_u8(a_1), vget_low_u8(b_1)); - sum[j] = vabal_u8(sum[j], vget_high_u8(a_1), vget_high_u8(b_1)); - } - } + uint8x16_t s; - for (j = 0; j < 4; ++j) { - result[j] = vget_lane_u32(horizontal_add_uint16x8(sum[j]), 0); + s = vld1q_u8(src + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); + + s = vld1q_u8(src + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); + 
sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); + + src += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; } } void vpx_sad32x16x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - sad32x_4d(src, src_stride, ref, ref_stride, res, 16); + uint16x8_t sum[4]; + sad32x_4d(src, src_stride, ref, ref_stride, 16, sum); + sad_512_pel_final_neon(sum, res); } void vpx_sad32x32x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - sad32x_4d(src, src_stride, ref, ref_stride, res, 32); + uint16x8_t sum[4]; + sad32x_4d(src, src_stride, ref, ref_stride, 32, sum); + sad_1024_pel_final_neon(sum, res); } void vpx_sad32x64x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - sad32x_4d(src, src_stride, ref, ref_stride, res, 64); + uint16x8_t sum[4]; + sad32x_4d(src, src_stride, ref, ref_stride, 64, sum); + sad_2048_pel_final_neon(sum, res); } -static INLINE void sum64x(const uint8x16_t a_0, const uint8x16_t a_1, - const uint8x16_t b_0, const uint8x16_t b_1, - uint16x8_t *sum) { - *sum = vabal_u8(*sum, vget_low_u8(a_0), vget_low_u8(b_0)); - *sum = vabal_u8(*sum, vget_high_u8(a_0), vget_high_u8(b_0)); - *sum = vabal_u8(*sum, vget_low_u8(a_1), vget_low_u8(b_1)); - *sum = vabal_u8(*sum, vget_high_u8(a_1), vget_high_u8(b_1)); -} +//////////////////////////////////////////////////////////////////////////////// -static INLINE void sad64x_4d(const uint8_t *a, int a_stride, - const uint8_t *const b[4], int b_stride, - uint32_t *result, const int height) { +void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride, + const uint8_t *const ref[4], int ref_stride, + uint32_t *res) { int i; - uint16x8_t sum_0 = vdupq_n_u16(0); - uint16x8_t sum_1 = vdupq_n_u16(0); - uint16x8_t sum_2 = vdupq_n_u16(0); - uint16x8_t 
sum_3 = vdupq_n_u16(0); - uint16x8_t sum_4 = vdupq_n_u16(0); - uint16x8_t sum_5 = vdupq_n_u16(0); - uint16x8_t sum_6 = vdupq_n_u16(0); - uint16x8_t sum_7 = vdupq_n_u16(0); - const uint8_t *b_loop[4] = { b[0], b[1], b[2], b[3] }; + const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] }; + uint16x8_t sum[4] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0) }; - for (i = 0; i < height; ++i) { - const uint8x16_t a_0 = vld1q_u8(a); - const uint8x16_t a_1 = vld1q_u8(a + 16); - const uint8x16_t a_2 = vld1q_u8(a + 32); - const uint8x16_t a_3 = vld1q_u8(a + 48); - a += a_stride; - sum64x(a_0, a_1, vld1q_u8(b_loop[0]), vld1q_u8(b_loop[0] + 16), &sum_0); - sum64x(a_2, a_3, vld1q_u8(b_loop[0] + 32), vld1q_u8(b_loop[0] + 48), - &sum_1); - b_loop[0] += b_stride; - sum64x(a_0, a_1, vld1q_u8(b_loop[1]), vld1q_u8(b_loop[1] + 16), &sum_2); - sum64x(a_2, a_3, vld1q_u8(b_loop[1] + 32), vld1q_u8(b_loop[1] + 48), - &sum_3); - b_loop[1] += b_stride; - sum64x(a_0, a_1, vld1q_u8(b_loop[2]), vld1q_u8(b_loop[2] + 16), &sum_4); - sum64x(a_2, a_3, vld1q_u8(b_loop[2] + 32), vld1q_u8(b_loop[2] + 48), - &sum_5); - b_loop[2] += b_stride; - sum64x(a_0, a_1, vld1q_u8(b_loop[3]), vld1q_u8(b_loop[3] + 16), &sum_6); - sum64x(a_2, a_3, vld1q_u8(b_loop[3] + 32), vld1q_u8(b_loop[3] + 48), - &sum_7); - b_loop[3] += b_stride; - } + for (i = 0; i < 32; ++i) { + uint8x16_t s; - result[0] = vget_lane_u32(horizontal_add_long_uint16x8(sum_0, sum_1), 0); - result[1] = vget_lane_u32(horizontal_add_long_uint16x8(sum_2, sum_3), 0); - result[2] = vget_lane_u32(horizontal_add_long_uint16x8(sum_4, sum_5), 0); - result[3] = vget_lane_u32(horizontal_add_long_uint16x8(sum_6, sum_7), 0); -} + s = vld1q_u8(src + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 0 * 16, s, &sum[3]); -void vpx_sad64x32x4d_neon(const uint8_t *src, int src_stride, - const uint8_t *const 
ref[4], int ref_stride, - uint32_t *res) { - sad64x_4d(src, src_stride, ref, ref_stride, res, 32); + s = vld1q_u8(src + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[3]); + + s = vld1q_u8(src + 2 * 16); + sad16_neon(ref_loop[0] + 2 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 2 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 2 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 2 * 16, s, &sum[3]); + + s = vld1q_u8(src + 3 * 16); + sad16_neon(ref_loop[0] + 3 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 3 * 16, s, &sum[1]); + sad16_neon(ref_loop[2] + 3 * 16, s, &sum[2]); + sad16_neon(ref_loop[3] + 3 * 16, s, &sum[3]); + + src += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; + } + + sad_2048_pel_final_neon(sum, res); } void vpx_sad64x64x4d_neon(const uint8_t *src, int src_stride, const uint8_t *const ref[4], int ref_stride, uint32_t *res) { - sad64x_4d(src, src_stride, ref, ref_stride, res, 64); + int i; + const uint8_t *ref_loop[4] = { ref[0], ref[1], ref[2], ref[3] }; + uint16x8_t sum[8] = { vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0), vdupq_n_u16(0), vdupq_n_u16(0), + vdupq_n_u16(0), vdupq_n_u16(0) }; + + for (i = 0; i < 64; ++i) { + uint8x16_t s; + + s = vld1q_u8(src + 0 * 16); + sad16_neon(ref_loop[0] + 0 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 0 * 16, s, &sum[2]); + sad16_neon(ref_loop[2] + 0 * 16, s, &sum[4]); + sad16_neon(ref_loop[3] + 0 * 16, s, &sum[6]); + + s = vld1q_u8(src + 1 * 16); + sad16_neon(ref_loop[0] + 1 * 16, s, &sum[0]); + sad16_neon(ref_loop[1] + 1 * 16, s, &sum[2]); + sad16_neon(ref_loop[2] + 1 * 16, s, &sum[4]); + sad16_neon(ref_loop[3] + 1 * 16, s, &sum[6]); + + s = vld1q_u8(src + 2 * 16); + sad16_neon(ref_loop[0] + 2 * 16, s, &sum[1]); + sad16_neon(ref_loop[1] + 2 * 16, s, &sum[3]); + 
sad16_neon(ref_loop[2] + 2 * 16, s, &sum[5]); + sad16_neon(ref_loop[3] + 2 * 16, s, &sum[7]); + + s = vld1q_u8(src + 3 * 16); + sad16_neon(ref_loop[0] + 3 * 16, s, &sum[1]); + sad16_neon(ref_loop[1] + 3 * 16, s, &sum[3]); + sad16_neon(ref_loop[2] + 3 * 16, s, &sum[5]); + sad16_neon(ref_loop[3] + 3 * 16, s, &sum[7]); + + src += src_stride; + ref_loop[0] += ref_stride; + ref_loop[1] += ref_stride; + ref_loop[2] += ref_stride; + ref_loop[3] += ref_stride; + } + + sad_4096_pel_final_neon(sum, res); } diff --git a/vpx_dsp/arm/subtract_neon.c b/vpx_dsp/arm/subtract_neon.c index ce81fb630..7e4610d2e 100644 --- a/vpx_dsp/arm/subtract_neon.c +++ b/vpx_dsp/arm/subtract_neon.c @@ -9,71 +9,72 @@ */ #include <arm_neon.h> +#include <assert.h> #include "./vpx_config.h" #include "vpx/vpx_integer.h" +#include "vpx_dsp/arm/mem_neon.h" void vpx_subtract_block_neon(int rows, int cols, int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src, ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) { - int r, c; + int r = rows, c; if (cols > 16) { - for (r = 0; r < rows; ++r) { + do { for (c = 0; c < cols; c += 32) { - const uint8x16_t v_src_00 = vld1q_u8(&src[c + 0]); - const uint8x16_t v_src_16 = vld1q_u8(&src[c + 16]); - const uint8x16_t v_pred_00 = vld1q_u8(&pred[c + 0]); - const uint8x16_t v_pred_16 = vld1q_u8(&pred[c + 16]); - const uint16x8_t v_diff_lo_00 = - vsubl_u8(vget_low_u8(v_src_00), vget_low_u8(v_pred_00)); - const uint16x8_t v_diff_hi_00 = - vsubl_u8(vget_high_u8(v_src_00), vget_high_u8(v_pred_00)); - const uint16x8_t v_diff_lo_16 = - vsubl_u8(vget_low_u8(v_src_16), vget_low_u8(v_pred_16)); - const uint16x8_t v_diff_hi_16 = - vsubl_u8(vget_high_u8(v_src_16), vget_high_u8(v_pred_16)); - vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(v_diff_lo_00)); - vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(v_diff_hi_00)); - vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(v_diff_lo_16)); - vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(v_diff_hi_16)); + const uint8x16_t s0 = 
vld1q_u8(&src[c + 0]); + const uint8x16_t s1 = vld1q_u8(&src[c + 16]); + const uint8x16_t p0 = vld1q_u8(&pred[c + 0]); + const uint8x16_t p1 = vld1q_u8(&pred[c + 16]); + const uint16x8_t d0 = vsubl_u8(vget_low_u8(s0), vget_low_u8(p0)); + const uint16x8_t d1 = vsubl_u8(vget_high_u8(s0), vget_high_u8(p0)); + const uint16x8_t d2 = vsubl_u8(vget_low_u8(s1), vget_low_u8(p1)); + const uint16x8_t d3 = vsubl_u8(vget_high_u8(s1), vget_high_u8(p1)); + vst1q_s16(&diff[c + 0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff[c + 8], vreinterpretq_s16_u16(d1)); + vst1q_s16(&diff[c + 16], vreinterpretq_s16_u16(d2)); + vst1q_s16(&diff[c + 24], vreinterpretq_s16_u16(d3)); } diff += diff_stride; pred += pred_stride; src += src_stride; - } + } while (--r); } else if (cols > 8) { - for (r = 0; r < rows; ++r) { - const uint8x16_t v_src = vld1q_u8(&src[0]); - const uint8x16_t v_pred = vld1q_u8(&pred[0]); - const uint16x8_t v_diff_lo = - vsubl_u8(vget_low_u8(v_src), vget_low_u8(v_pred)); - const uint16x8_t v_diff_hi = - vsubl_u8(vget_high_u8(v_src), vget_high_u8(v_pred)); - vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff_lo)); - vst1q_s16(&diff[8], vreinterpretq_s16_u16(v_diff_hi)); + do { + const uint8x16_t s = vld1q_u8(&src[0]); + const uint8x16_t p = vld1q_u8(&pred[0]); + const uint16x8_t d0 = vsubl_u8(vget_low_u8(s), vget_low_u8(p)); + const uint16x8_t d1 = vsubl_u8(vget_high_u8(s), vget_high_u8(p)); + vst1q_s16(&diff[0], vreinterpretq_s16_u16(d0)); + vst1q_s16(&diff[8], vreinterpretq_s16_u16(d1)); diff += diff_stride; pred += pred_stride; src += src_stride; - } + } while (--r); } else if (cols > 4) { - for (r = 0; r < rows; ++r) { - const uint8x8_t v_src = vld1_u8(&src[0]); - const uint8x8_t v_pred = vld1_u8(&pred[0]); - const uint16x8_t v_diff = vsubl_u8(v_src, v_pred); + do { + const uint8x8_t s = vld1_u8(&src[0]); + const uint8x8_t p = vld1_u8(&pred[0]); + const uint16x8_t v_diff = vsubl_u8(s, p); vst1q_s16(&diff[0], vreinterpretq_s16_u16(v_diff)); diff += diff_stride; pred += 
pred_stride; src += src_stride; - } + } while (--r); } else { - for (r = 0; r < rows; ++r) { - for (c = 0; c < cols; ++c) diff[c] = src[c] - pred[c]; - - diff += diff_stride; - pred += pred_stride; - src += src_stride; - } + assert(cols == 4); + do { + const uint8x8_t s = load_unaligned_u8(src, src_stride); + const uint8x8_t p = load_unaligned_u8(pred, pred_stride); + const uint16x8_t d = vsubl_u8(s, p); + vst1_s16(diff + 0 * diff_stride, vreinterpret_s16_u16(vget_low_u16(d))); + vst1_s16(diff + 1 * diff_stride, vreinterpret_s16_u16(vget_high_u16(d))); + diff += 2 * diff_stride; + pred += 2 * pred_stride; + src += 2 * src_stride; + r -= 2; + } while (r); } } diff --git a/vpx_dsp/arm/sum_neon.h b/vpx_dsp/arm/sum_neon.h index d74fe0cde..c09841223 100644 --- a/vpx_dsp/arm/sum_neon.h +++ b/vpx_dsp/arm/sum_neon.h @@ -30,15 +30,6 @@ static INLINE uint32x2_t horizontal_add_uint16x8(const uint16x8_t a) { vreinterpret_u32_u64(vget_high_u64(c))); } -static INLINE uint32x2_t horizontal_add_long_uint16x8(const uint16x8_t a, - const uint16x8_t b) { - const uint32x4_t c = vpaddlq_u16(a); - const uint32x4_t d = vpadalq_u16(c, b); - const uint64x2_t e = vpaddlq_u32(d); - return vadd_u32(vreinterpret_u32_u64(vget_low_u64(e)), - vreinterpret_u32_u64(vget_high_u64(e))); -} - static INLINE uint32x2_t horizontal_add_uint32x4(const uint32x4_t a) { const uint64x2_t b = vpaddlq_u32(a); return vadd_u32(vreinterpret_u32_u64(vget_low_u64(b)), diff --git a/vpx_dsp/arm/sum_squares_neon.c b/vpx_dsp/arm/sum_squares_neon.c new file mode 100644 index 000000000..8942ba83b --- /dev/null +++ b/vpx_dsp/arm/sum_squares_neon.c @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. 
All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +#include <assert.h> +#include "./vpx_dsp_rtcd.h" + +uint64_t vpx_sum_squares_2d_i16_neon(const int16_t *src, int stride, int size) { + int64x1_t s2; + + if (size == 4) { + int16x4_t s[4]; + int32x4_t s0; + uint32x2_t s1; + + s[0] = vld1_s16(src + 0 * stride); + s[1] = vld1_s16(src + 1 * stride); + s[2] = vld1_s16(src + 2 * stride); + s[3] = vld1_s16(src + 3 * stride); + s0 = vmull_s16(s[0], s[0]); + s0 = vmlal_s16(s0, s[1], s[1]); + s0 = vmlal_s16(s0, s[2], s[2]); + s0 = vmlal_s16(s0, s[3], s[3]); + s1 = vpadd_u32(vget_low_u32(vreinterpretq_u32_s32(s0)), + vget_high_u32(vreinterpretq_u32_s32(s0))); + s2 = vpaddl_u32(s1); + } else { + int r = size; + uint64x2_t s1 = vdupq_n_u64(0); + + do { + int c = size; + int32x4_t s0 = vdupq_n_s32(0); + const int16_t *src_t = src; + + do { + int16x8_t s[8]; + + s[0] = vld1q_s16(src_t + 0 * stride); + s[1] = vld1q_s16(src_t + 1 * stride); + s[2] = vld1q_s16(src_t + 2 * stride); + s[3] = vld1q_s16(src_t + 3 * stride); + s[4] = vld1q_s16(src_t + 4 * stride); + s[5] = vld1q_s16(src_t + 5 * stride); + s[6] = vld1q_s16(src_t + 6 * stride); + s[7] = vld1q_s16(src_t + 7 * stride); + s0 = vmlal_s16(s0, vget_low_s16(s[0]), vget_low_s16(s[0])); + s0 = vmlal_s16(s0, vget_low_s16(s[1]), vget_low_s16(s[1])); + s0 = vmlal_s16(s0, vget_low_s16(s[2]), vget_low_s16(s[2])); + s0 = vmlal_s16(s0, vget_low_s16(s[3]), vget_low_s16(s[3])); + s0 = vmlal_s16(s0, vget_low_s16(s[4]), vget_low_s16(s[4])); + s0 = vmlal_s16(s0, vget_low_s16(s[5]), vget_low_s16(s[5])); + s0 = vmlal_s16(s0, vget_low_s16(s[6]), vget_low_s16(s[6])); + s0 = vmlal_s16(s0, vget_low_s16(s[7]), vget_low_s16(s[7])); + s0 = vmlal_s16(s0, vget_high_s16(s[0]), vget_high_s16(s[0])); + s0 = vmlal_s16(s0, vget_high_s16(s[1]), vget_high_s16(s[1])); + s0 = vmlal_s16(s0, vget_high_s16(s[2]), vget_high_s16(s[2])); + s0 = vmlal_s16(s0, vget_high_s16(s[3]), 
vget_high_s16(s[3])); + s0 = vmlal_s16(s0, vget_high_s16(s[4]), vget_high_s16(s[4])); + s0 = vmlal_s16(s0, vget_high_s16(s[5]), vget_high_s16(s[5])); + s0 = vmlal_s16(s0, vget_high_s16(s[6]), vget_high_s16(s[6])); + s0 = vmlal_s16(s0, vget_high_s16(s[7]), vget_high_s16(s[7])); + src_t += 8; + c -= 8; + } while (c); + + s1 = vaddw_u32(s1, vget_low_u32(vreinterpretq_u32_s32(s0))); + s1 = vaddw_u32(s1, vget_high_u32(vreinterpretq_u32_s32(s0))); + src += 8 * stride; + r -= 8; + } while (r); + + s2 = vadd_u64(vget_low_u64(s1), vget_high_u64(s1)); + } + + return vget_lane_u64(s2, 0); +} diff --git a/vpx_dsp/mips/vpx_convolve8_mmi.c b/vpx_dsp/mips/vpx_convolve8_mmi.c index 0cfb81e4d..ba9ceb866 100644 --- a/vpx_dsp/mips/vpx_convolve8_mmi.c +++ b/vpx_dsp/mips/vpx_convolve8_mmi.c @@ -254,6 +254,89 @@ static void convolve_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, ); } +static void convolve_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int x_step_q4, int32_t w, int32_t h) { + const int16_t *filter_x = filter[x0_q4]; + double ftmp[14]; + uint32_t tmp[2]; + uint32_t para[2]; + para[0] = (1 << ((FILTER_BITS)-1)); + para[1] = FILTER_BITS; + src -= SUBPEL_TAPS / 2 - 1; + src_stride -= w; + dst_stride -= w; + (void)x_step_q4; + + __asm__ volatile( + "move %[tmp1], %[width] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "gsldlc1 %[filter1], 0x03(%[filter]) \n\t" + "gsldrc1 %[filter1], 0x00(%[filter]) \n\t" + "gsldlc1 %[filter2], 0x0b(%[filter]) \n\t" + "gsldrc1 %[filter2], 0x08(%[filter]) \n\t" + "1: \n\t" + /* Get 8 data per row */ + "gsldlc1 %[ftmp5], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp5], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp7], 0x08(%[src]) \n\t" + "gsldrc1 %[ftmp7], 0x01(%[src]) \n\t" + "gsldlc1 %[ftmp9], 0x09(%[src]) \n\t" + "gsldrc1 %[ftmp9], 0x02(%[src]) \n\t" + "gsldlc1 %[ftmp11], 0x0A(%[src]) \n\t" + "gsldrc1 %[ftmp11], 0x03(%[src]) \n\t" + "punpcklbh %[ftmp4], %[ftmp5], 
%[ftmp0] \n\t" + "punpckhbh %[ftmp5], %[ftmp5], %[ftmp0] \n\t" + "punpcklbh %[ftmp6], %[ftmp7], %[ftmp0] \n\t" + "punpckhbh %[ftmp7], %[ftmp7], %[ftmp0] \n\t" + "punpcklbh %[ftmp8], %[ftmp9], %[ftmp0] \n\t" + "punpckhbh %[ftmp9], %[ftmp9], %[ftmp0] \n\t" + "punpcklbh %[ftmp10], %[ftmp11], %[ftmp0] \n\t" + "punpckhbh %[ftmp11], %[ftmp11], %[ftmp0] \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + /* Get raw data */ + GET_DATA_H_MMI + ROUND_POWER_OF_TWO_MMI + CLIP_PIXEL_MMI + "punpcklbh %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "gsldlc1 %[ftmp4], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp4], 0x00(%[dst]) \n\t" + "punpcklbh %[ftmp4], %[ftmp4], %[ftmp0] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp4] \n\t" + "li %[tmp0], 0x10001 \n\t" + MMI_MTC1(%[tmp0], %[ftmp5]) + "punpcklhw %[ftmp5], %[ftmp5], %[ftmp5] \n\t" + "paddh %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "psrah %[ftmp12], %[ftmp12], %[ftmp5] \n\t" + "packushb %[ftmp12], %[ftmp12], %[ftmp0] \n\t" + "swc1 %[ftmp12], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + /* Loop count */ + "bnez %[width], 1b \n\t" + "move %[width], %[tmp1] \n\t" + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [srcl]"=&f"(ftmp[0]), [srch]"=&f"(ftmp[1]), + [filter1]"=&f"(ftmp[2]), [filter2]"=&f"(ftmp[3]), + [ftmp0]"=&f"(ftmp[4]), [ftmp4]"=&f"(ftmp[5]), + [ftmp5]"=&f"(ftmp[6]), [ftmp6]"=&f"(ftmp[7]), + [ftmp7]"=&f"(ftmp[8]), [ftmp8]"=&f"(ftmp[9]), + [ftmp9]"=&f"(ftmp[10]), [ftmp10]"=&f"(ftmp[11]), + [ftmp11]"=&f"(ftmp[12]), [ftmp12]"=&f"(ftmp[13]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [src]"+&r"(src), [width]"+&r"(w), + [dst]"+&r"(dst), [height]"+&r"(h) + : [filter]"r"(filter_x), [para]"r"(para), + [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); +} + static void convolve_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t 
dst_stride, const InterpKernel *filter, int y0_q4, @@ -362,52 +445,63 @@ void vpx_convolve_avg_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h) { - double ftmp[4]; - uint32_t tmp[2]; - src_stride -= w; - dst_stride -= w; + int x, y; + (void)filter; (void)x0_q4; (void)x_step_q4; (void)y0_q4; (void)y_step_q4; - __asm__ volatile( - "move %[tmp1], %[width] \n\t" - "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" - "li %[tmp0], 0x10001 \n\t" - MMI_MTC1(%[tmp0], %[ftmp3]) - "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" - "1: \n\t" - "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" - "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" - "gsldlc1 %[ftmp2], 0x07(%[dst]) \n\t" - "gsldrc1 %[ftmp2], 0x00(%[dst]) \n\t" - "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" - "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" - "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" - "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" - "psrah %[ftmp1], %[ftmp1], %[ftmp3] \n\t" - "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" - "swc1 %[ftmp1], 0x00(%[dst]) \n\t" - MMI_ADDIU(%[width], %[width], -0x04) - MMI_ADDIU(%[dst], %[dst], 0x04) - MMI_ADDIU(%[src], %[src], 0x04) - "bnez %[width], 1b \n\t" - "move %[width], %[tmp1] \n\t" - MMI_ADDU(%[dst], %[dst], %[dst_stride]) - MMI_ADDU(%[src], %[src], %[src_stride]) - MMI_ADDIU(%[height], %[height], -0x01) - "bnez %[height], 1b \n\t" - : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), - [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), - [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), - [src]"+&r"(src), [dst]"+&r"(dst), - [width]"+&r"(w), [height]"+&r"(h) - : [src_stride]"r"((mips_reg)src_stride), - [dst_stride]"r"((mips_reg)dst_stride) - : "memory" - ); + if (w & 0x03) { + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) dst[x] = ROUND_POWER_OF_TWO(dst[x] + src[x], 1); + src += src_stride; + dst += dst_stride; + } + } else { + double ftmp[4]; + uint32_t tmp[2]; + src_stride -= w; + 
dst_stride -= w; + + __asm__ volatile( + "move %[tmp1], %[width] \n\t" + "xor %[ftmp0], %[ftmp0], %[ftmp0] \n\t" + "li %[tmp0], 0x10001 \n\t" + MMI_MTC1(%[tmp0], %[ftmp3]) + "punpcklhw %[ftmp3], %[ftmp3], %[ftmp3] \n\t" + "1: \n\t" + "gsldlc1 %[ftmp1], 0x07(%[src]) \n\t" + "gsldrc1 %[ftmp1], 0x00(%[src]) \n\t" + "gsldlc1 %[ftmp2], 0x07(%[dst]) \n\t" + "gsldrc1 %[ftmp2], 0x00(%[dst]) \n\t" + "punpcklbh %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "punpcklbh %[ftmp2], %[ftmp2], %[ftmp0] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp2] \n\t" + "paddh %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "psrah %[ftmp1], %[ftmp1], %[ftmp3] \n\t" + "packushb %[ftmp1], %[ftmp1], %[ftmp0] \n\t" + "swc1 %[ftmp1], 0x00(%[dst]) \n\t" + MMI_ADDIU(%[width], %[width], -0x04) + MMI_ADDIU(%[dst], %[dst], 0x04) + MMI_ADDIU(%[src], %[src], 0x04) + "bnez %[width], 1b \n\t" + "move %[width], %[tmp1] \n\t" + MMI_ADDU(%[dst], %[dst], %[dst_stride]) + MMI_ADDU(%[src], %[src], %[src_stride]) + MMI_ADDIU(%[height], %[height], -0x01) + "bnez %[height], 1b \n\t" + : [ftmp0]"=&f"(ftmp[0]), [ftmp1]"=&f"(ftmp[1]), + [ftmp2]"=&f"(ftmp[2]), [ftmp3]"=&f"(ftmp[3]), + [tmp0]"=&r"(tmp[0]), [tmp1]"=&r"(tmp[1]), + [src]"+&r"(src), [dst]"+&r"(dst), + [width]"+&r"(w), [height]"+&r"(h) + : [src_stride]"r"((mips_reg)src_stride), + [dst_stride]"r"((mips_reg)dst_stride) + : "memory" + ); + } } static void convolve_horiz(const uint8_t *src, ptrdiff_t src_stride, @@ -481,6 +575,29 @@ static void convolve_avg_vert(const uint8_t *src, ptrdiff_t src_stride, } } +static void convolve_avg_horiz(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *x_filters, int x0_q4, + int x_step_q4, int w, int h) { + int x, y; + src -= SUBPEL_TAPS / 2 - 1; + + for (y = 0; y < h; ++y) { + int x_q4 = x0_q4; + for (x = 0; x < w; ++x) { + const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS]; + const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK]; + int k, sum = 0; + for (k = 0; k < SUBPEL_TAPS; ++k) sum 
+= src_x[k] * x_filter[k]; + dst[x] = ROUND_POWER_OF_TWO( + dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1); + x_q4 += x_step_q4; + } + src += src_stride; + dst += dst_stride; + } +} + void vpx_convolve8_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int32_t x_step_q4, int y0_q4, @@ -553,6 +670,21 @@ void vpx_convolve8_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, y_step_q4, w, h); } +void vpx_convolve8_avg_horiz_mmi(const uint8_t *src, ptrdiff_t src_stride, + uint8_t *dst, ptrdiff_t dst_stride, + const InterpKernel *filter, int x0_q4, + int32_t x_step_q4, int y0_q4, int y_step_q4, + int w, int h) { + (void)y0_q4; + (void)y_step_q4; + if (w & 0x03) + convolve_avg_horiz(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h); + else + convolve_avg_horiz_mmi(src, src_stride, dst, dst_stride, filter, x0_q4, + x_step_q4, w, h); +} + void vpx_convolve8_avg_vert_mmi(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, @@ -580,8 +712,5 @@ void vpx_convolve8_avg_mmi(const uint8_t *src, ptrdiff_t src_stride, vpx_convolve8_mmi(src, src_stride, temp, 64, filter, x0_q4, x_step_q4, y0_q4, y_step_q4, w, h); - if (w & 0x03) - vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); - else - vpx_convolve_avg_mmi(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); + vpx_convolve_avg_mmi(temp, 64, dst, dst_stride, NULL, 0, 0, 0, 0, w, h); } diff --git a/vpx_dsp/ppc/inv_txfm_vsx.c b/vpx_dsp/ppc/inv_txfm_vsx.c index f095cb0a4..6603b85ac 100644 --- a/vpx_dsp/ppc/inv_txfm_vsx.c +++ b/vpx_dsp/ppc/inv_txfm_vsx.c @@ -76,6 +76,8 @@ static int16x8_t cospi29_v = { 2404, 2404, 2404, 2404, 2404, 2404, 2404, 2404 }; static int16x8_t cospi30_v = { 1606, 1606, 1606, 1606, 1606, 1606, 1606, 1606 }; static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; +static uint8x16_t mask1 = { 0x0, 0x1, 0x2, 0x3, 
0x4, 0x5, 0x6, 0x7, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }; #define ROUND_SHIFT_INIT \ const int32x4_t shift = vec_sl(vec_splat_s32(1), vec_splat_u32(13)); \ const uint32x4_t shift14 = vec_splat_u32(14); @@ -107,6 +109,15 @@ static int16x8_t cospi31_v = { 804, 804, 804, 804, 804, 804, 804, 804 }; out1 = vec_sub(step0, step1); \ out1 = vec_perm(out1, out1, mask0); +#define PACK_STORE(v0, v1) \ + tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0); \ + tmp16_1 = vec_add(vec_perm(d_u2, d_u3, mask1), v1); \ + output_v = vec_packsu(tmp16_0, tmp16_1); \ + \ + vec_vsx_st(output_v, 0, tmp_dest); \ + for (i = 0; i < 4; i++) \ + for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; + void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride) { int i, j; @@ -114,13 +125,10 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int16x8_t step0, step1, tmp16_0, tmp16_1, t_out0, t_out1; uint8x16_t mask0 = { 0x8, 0x9, 0xA, 0xB, 0xC, 0xD, 0xE, 0xF, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 }; - uint8x16_t mask1 = { 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }; int16x8_t v0 = load_tran_low(0, input); int16x8_t v1 = load_tran_low(8 * sizeof(*input), input); int16x8_t t0 = vec_mergeh(v0, v1); int16x8_t t1 = vec_mergel(v0, v1); - uint8x16_t dest0 = vec_vsx_ld(0, dest); uint8x16_t dest1 = vec_vsx_ld(stride, dest); uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); @@ -130,6 +138,7 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int16x8_t d_u1 = (int16x8_t)vec_mergeh(dest1, zerov); int16x8_t d_u2 = (int16x8_t)vec_mergeh(dest2, zerov); int16x8_t d_u3 = (int16x8_t)vec_mergeh(dest3, zerov); + uint8x16_t output_v; uint8_t tmp_dest[16]; ROUND_SHIFT_INIT @@ -148,13 +157,8 @@ void vpx_idct4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, PIXEL_ADD4(v0, t_out0); PIXEL_ADD4(v1, t_out1); - tmp16_0 = vec_add(vec_perm(d_u0, d_u1, mask1), v0); - tmp16_1 = 
vec_add(vec_perm(d_u2, d_u3, mask1), v1); - output_v = vec_packsu(tmp16_0, tmp16_1); - vec_vsx_st(output_v, 0, tmp_dest); - for (i = 0; i < 4; i++) - for (j = 0; j < 4; j++) dest[j * stride + i] = tmp_dest[j * 4 + i]; + PACK_STORE(v0, v1); } #define TRANSPOSE8x8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \ @@ -1062,3 +1066,67 @@ void vpx_idct32x32_1024_add_vsx(const tran_low_t *input, uint8_t *dest, ADD_STORE_BLOCK(src2, 16); ADD_STORE_BLOCK(src3, 24); } + +#define TRANSFORM_COLS \ + v32_a = vec_add(v32_a, v32_c); \ + v32_d = vec_sub(v32_d, v32_b); \ + v32_e = vec_sub(v32_a, v32_d); \ + v32_e = vec_sra(v32_e, one); \ + v32_b = vec_sub(v32_e, v32_b); \ + v32_c = vec_sub(v32_e, v32_c); \ + v32_a = vec_sub(v32_a, v32_b); \ + v32_d = vec_add(v32_d, v32_c); \ + v_a = vec_packs(v32_a, v32_b); \ + v_c = vec_packs(v32_c, v32_d); + +#define TRANSPOSE_WHT \ + tmp_a = vec_mergeh(v_a, v_c); \ + tmp_c = vec_mergel(v_a, v_c); \ + v_a = vec_mergeh(tmp_a, tmp_c); \ + v_c = vec_mergel(tmp_a, tmp_c); + +void vpx_iwht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, + int stride) { + int16x8_t v_a = load_tran_low(0, input); + int16x8_t v_c = load_tran_low(8 * sizeof(*input), input); + int16x8_t tmp_a, tmp_c; + uint16x8_t two = vec_splat_u16(2); + uint32x4_t one = vec_splat_u32(1); + int16x8_t tmp16_0, tmp16_1; + int32x4_t v32_a, v32_c, v32_d, v32_b, v32_e; + uint8x16_t dest0 = vec_vsx_ld(0, dest); + uint8x16_t dest1 = vec_vsx_ld(stride, dest); + uint8x16_t dest2 = vec_vsx_ld(2 * stride, dest); + uint8x16_t dest3 = vec_vsx_ld(3 * stride, dest); + int16x8_t d_u0 = (int16x8_t)unpack_to_u16_h(dest0); + int16x8_t d_u1 = (int16x8_t)unpack_to_u16_h(dest1); + int16x8_t d_u2 = (int16x8_t)unpack_to_u16_h(dest2); + int16x8_t d_u3 = (int16x8_t)unpack_to_u16_h(dest3); + uint8x16_t output_v; + uint8_t tmp_dest[16]; + int i, j; + + v_a = vec_sra(v_a, two); + v_c = vec_sra(v_c, two); + + TRANSPOSE_WHT; + + v32_a = vec_unpackh(v_a); + v32_c = vec_unpackl(v_a); + + v32_d = 
vec_unpackh(v_c); + v32_b = vec_unpackl(v_c); + + TRANSFORM_COLS; + + TRANSPOSE_WHT; + + v32_a = vec_unpackh(v_a); + v32_c = vec_unpackl(v_a); + v32_d = vec_unpackh(v_c); + v32_b = vec_unpackl(v_c); + + TRANSFORM_COLS; + + PACK_STORE(v_a, v_c); +} diff --git a/vpx_dsp/ppc/quantize_vsx.c b/vpx_dsp/ppc/quantize_vsx.c new file mode 100644 index 000000000..e037f89e3 --- /dev/null +++ b/vpx_dsp/ppc/quantize_vsx.c @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/ppc/types_vsx.h" + +// Negate 16-bit integers in a when the corresponding signed 16-bit +// integer in b is negative. +static INLINE int16x8_t vec_sign(int16x8_t a, int16x8_t b) { + const int16x8_t mask = vec_sra(b, vec_shift_sign_s16); + return vec_xor(vec_add(a, mask), mask); +} + +// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit +// integers, and return the high 16 bits of the intermediate integers. +static INLINE int16x8_t vec_mulhi(int16x8_t a, int16x8_t b) { + // madds does ((A * B) >>15) + C, we need >> 16, so we perform an extra right + // shift. 
+ return vec_sra(vec_madds(a, b, vec_zeros_s16), vec_ones_s16); +} + +static INLINE int16x8_t quantize_coeff(int16x8_t coeff, int16x8_t coeff_abs, + int16x8_t round, int16x8_t quant, + int16x8_t quant_shift, bool16x8_t mask) { + int16x8_t rounded, qcoeff; + rounded = vec_vaddshs(coeff_abs, round); + qcoeff = vec_mulhi(rounded, quant); + qcoeff = vec_add(qcoeff, rounded); + qcoeff = vec_mulhi(qcoeff, quant_shift); + qcoeff = vec_sign(qcoeff, coeff); + return vec_and(qcoeff, mask); +} + +static INLINE int16x8_t nonzero_scanindex(int16x8_t qcoeff, bool16x8_t mask, + const int16_t *iscan_ptr) { + bool16x8_t zero_coeff; + int16x8_t scan = vec_vsx_ld(0, iscan_ptr); + zero_coeff = vec_cmpeq(qcoeff, vec_zeros_s16); + scan = vec_sub(scan, mask); + return vec_andc(scan, zero_coeff); +} + +// Compare packed 16-bit integers across a, and return the maximum value in +// every element. Returns a vector containing the biggest value across vector a. +static INLINE int16x8_t vec_max_across(int16x8_t a) { + a = vec_max(a, vec_perm(a, a, vec_perm64)); + a = vec_max(a, vec_perm(a, a, vec_perm32)); + return vec_max(a, vec_perm(a, a, vec_perm16)); +} + +void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *zbin_ptr, + const int16_t *round_ptr, const int16_t *quant_ptr, + const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, + uint16_t *eob_ptr, const int16_t *scan_ptr, + const int16_t *iscan_ptr) { + int16x8_t qcoeff, dqcoeff, eob; + + // First set of 8 coeff starts with DC + 7 AC + int16x8_t zbin = vec_vsx_ld(0, zbin_ptr); + int16x8_t round = vec_vsx_ld(0, round_ptr); + int16x8_t quant = vec_vsx_ld(0, quant_ptr); + int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); + int16x8_t quant_shift = vec_vsx_ld(0, quant_shift_ptr); + + int16x8_t coeff = vec_vsx_ld(0, coeff_ptr); + int16x8_t coeff_abs = vec_abs(coeff); + bool16x8_t zero_mask = vec_cmpge(coeff_abs, zbin); + + (void)scan_ptr; + 
(void)skip_block; + assert(!skip_block); + + qcoeff = + quantize_coeff(coeff, coeff_abs, round, quant, quant_shift, zero_mask); + vec_vsx_st(qcoeff, 0, qcoeff_ptr); + + dqcoeff = vec_mladd(qcoeff, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff, 0, dqcoeff_ptr); + + eob = nonzero_scanindex(qcoeff, zero_mask, iscan_ptr); + + // All other sets of 8 coeffs will only contain AC + zbin = vec_splat(zbin, 1); + round = vec_splat(round, 1); + quant = vec_splat(quant, 1); + dequant = vec_splat(dequant, 1); + quant_shift = vec_splat(quant_shift, 1); + + n_coeffs -= 8; + do { + coeff_ptr += 8; + qcoeff_ptr += 8; + dqcoeff_ptr += 8; + iscan_ptr += 8; + + coeff = vec_vsx_ld(0, coeff_ptr); + coeff_abs = vec_abs(coeff); + zero_mask = vec_cmpge(coeff_abs, zbin); + qcoeff = + quantize_coeff(coeff, coeff_abs, round, quant, quant_shift, zero_mask); + vec_vsx_st(qcoeff, 0, qcoeff_ptr); + + dqcoeff = vec_mladd(qcoeff, dequant, vec_zeros_s16); + vec_vsx_st(dqcoeff, 0, dqcoeff_ptr); + + eob = vec_max(eob, nonzero_scanindex(qcoeff, zero_mask, iscan_ptr)); + + n_coeffs -= 8; + } while (n_coeffs > 0); + + eob = vec_max_across(eob); + *eob_ptr = eob[0]; +} diff --git a/vpx_dsp/ppc/types_vsx.h b/vpx_dsp/ppc/types_vsx.h index f611d02d2..e2af55463 100644 --- a/vpx_dsp/ppc/types_vsx.h +++ b/vpx_dsp/ppc/types_vsx.h @@ -19,6 +19,7 @@ typedef vector signed short int16x8_t; typedef vector unsigned short uint16x8_t; typedef vector signed int int32x4_t; typedef vector unsigned int uint32x4_t; +typedef vector bool short bool16x8_t; #ifdef __clang__ static const uint8x16_t xxpermdi0_perm = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, @@ -65,4 +66,17 @@ static const uint8x16_t xxpermdi3_perm = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, #endif #endif +static const int16x8_t vec_zeros_s16 = { 0, 0, 0, 0, 0, 0, 0, 0 }; +static const uint16x8_t vec_ones_s16 = { 1, 1, 1, 1, 1, 1, 1, 1 }; +static const uint16x8_t vec_shift_sign_s16 = { 15, 15, 15, 15, 15, 15, 15, 15 }; +static const uint8x16_t vec_perm64 = { 0x08, 0x09, 
0x0A, 0x0B, 0x0C, 0x0D, + 0x0E, 0x0F, 0x00, 0x01, 0x02, 0x03, + 0x04, 0x05, 0x06, 0x07 }; +static const uint8x16_t vec_perm32 = { 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, + 0x00, 0x01, 0x02, 0x03 }; +static const uint8x16_t vec_perm16 = { 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0A, 0x0B, 0x0E, 0x0D, + 0x0E, 0x0F, 0x00, 0x01 }; + #endif // VPX_DSP_PPC_TYPES_VSX_H_ diff --git a/vpx_dsp/sum_squares.c b/vpx_dsp/sum_squares.c index 7c535ac2d..b80cd588e 100644 --- a/vpx_dsp/sum_squares.c +++ b/vpx_dsp/sum_squares.c @@ -10,8 +10,7 @@ #include "./vpx_dsp_rtcd.h" -uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride, - int size) { +uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int stride, int size) { int r, c; uint64_t ss = 0; @@ -20,7 +19,7 @@ uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride, const int16_t v = src[c]; ss += v * v; } - src += src_stride; + src += stride; } return ss; diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index 167011034..cb06a476f 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -286,6 +286,7 @@ DSP_SRCS-$(HAVE_SSE2) += x86/quantize_sse2.c DSP_SRCS-$(HAVE_SSSE3) += x86/quantize_ssse3.c DSP_SRCS-$(HAVE_AVX) += x86/quantize_avx.c DSP_SRCS-$(HAVE_NEON) += arm/quantize_neon.c +DSP_SRCS-$(HAVE_VSX) += ppc/quantize_vsx.c ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes) DSP_SRCS-$(HAVE_SSE2) += x86/highbd_quantize_intrin_sse2.c endif @@ -312,6 +313,7 @@ ifeq ($(CONFIG_ENCODERS),yes) DSP_SRCS-yes += sad.c DSP_SRCS-yes += subtract.c DSP_SRCS-yes += sum_squares.c +DSP_SRCS-$(HAVE_NEON) += arm/sum_squares_neon.c DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c DSP_SRCS-$(HAVE_MSA) += mips/sum_squares_msa.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 4dbee088b..c36ec84b0 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -363,7 +363,7 @@ add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t 
src_stride, specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx/; add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx/; +specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi/; add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; @@ -378,7 +378,7 @@ add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; -specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx/; +specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h"; specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi/; @@ -626,7 +626,7 @@ if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { specialize qw/vpx_idct32x32_135_add neon sse2 ssse3/; specialize qw/vpx_idct32x32_34_add neon sse2 ssse3/; specialize qw/vpx_idct32x32_1_add neon sse2/; - specialize qw/vpx_iwht4x4_16_add sse2/; + specialize qw/vpx_iwht4x4_16_add sse2 vsx/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") { # Note that these specializations are appended to the above ones. 
@@ -699,7 +699,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") { add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; - specialize qw/vpx_quantize_b neon sse2 ssse3 avx/; + specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx/; add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan"; specialize qw/vpx_quantize_b_32x32 neon ssse3 avx/; @@ -922,7 +922,7 @@ add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/; add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; -specialize qw/vpx_sum_squares_2d_i16 sse2 msa/; +specialize qw/vpx_sum_squares_2d_i16 neon sse2 msa/; # # Structured Similarity (SSIM) diff --git a/vpx_dsp/x86/mem_sse2.h b/vpx_dsp/x86/mem_sse2.h index 2ce738fb7..419f17863 100644 --- a/vpx_dsp/x86/mem_sse2.h +++ b/vpx_dsp/x86/mem_sse2.h @@ -15,6 +15,11 @@ #include "./vpx_config.h" +static INLINE __m128i loadh_epi64(const __m128i s, const void *const src) { + return _mm_castps_si128( + _mm_loadh_pi(_mm_castsi128_ps(s), (const __m64 *)src)); +} + static INLINE void load_8bit_4x4(const uint8_t *const s, const ptrdiff_t stride, __m128i *const d) { d[0] = _mm_cvtsi32_si128(*(const int *)(s + 0 * stride)); diff --git a/vpx_dsp/x86/sum_squares_sse2.c b/vpx_dsp/x86/sum_squares_sse2.c index 026d0ca2f..9eaf6ee1b 
100644 --- a/vpx_dsp/x86/sum_squares_sse2.c +++ b/vpx_dsp/x86/sum_squares_sse2.c @@ -10,120 +10,96 @@ #include <assert.h> #include <emmintrin.h> -#include <stdio.h> #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/x86/mem_sse2.h" -static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t *src, - int stride) { - const __m128i v_val_0_w = - _mm_loadl_epi64((const __m128i *)(src + 0 * stride)); - const __m128i v_val_1_w = - _mm_loadl_epi64((const __m128i *)(src + 1 * stride)); - const __m128i v_val_2_w = - _mm_loadl_epi64((const __m128i *)(src + 2 * stride)); - const __m128i v_val_3_w = - _mm_loadl_epi64((const __m128i *)(src + 3 * stride)); - - const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); - const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); - const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); - const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); - - const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); - const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); - const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); - - const __m128i v_sum_d = - _mm_add_epi32(v_sum_0123_d, _mm_srli_epi64(v_sum_0123_d, 32)); - - return (uint64_t)_mm_cvtsi128_si32(v_sum_d); -} - -// TODO(jingning): Evaluate the performance impact here. -#ifdef __GNUC__ -// This prevents GCC/Clang from inlining this function into -// vpx_sum_squares_2d_i16_sse2, which in turn saves some stack -// maintenance instructions in the common case of 4x4. 
-__attribute__((noinline)) -#endif -static uint64_t -vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src, int stride, int size) { - int r, c; - const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); - __m128i v_acc_q = _mm_setzero_si128(); - - for (r = 0; r < size; r += 8) { - __m128i v_acc_d = _mm_setzero_si128(); - - for (c = 0; c < size; c += 8) { - const int16_t *b = src + c; - const __m128i v_val_0_w = - _mm_load_si128((const __m128i *)(b + 0 * stride)); - const __m128i v_val_1_w = - _mm_load_si128((const __m128i *)(b + 1 * stride)); - const __m128i v_val_2_w = - _mm_load_si128((const __m128i *)(b + 2 * stride)); - const __m128i v_val_3_w = - _mm_load_si128((const __m128i *)(b + 3 * stride)); - const __m128i v_val_4_w = - _mm_load_si128((const __m128i *)(b + 4 * stride)); - const __m128i v_val_5_w = - _mm_load_si128((const __m128i *)(b + 5 * stride)); - const __m128i v_val_6_w = - _mm_load_si128((const __m128i *)(b + 6 * stride)); - const __m128i v_val_7_w = - _mm_load_si128((const __m128i *)(b + 7 * stride)); - - const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); - const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); - const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); - const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); - const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); - const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); - const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); - const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); - - const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); - const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); - const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); - const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); - - const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); - const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); - - v_acc_d = 
_mm_add_epi32(v_acc_d, v_sum_0123_d); - v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d); - } - - v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); - v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); +uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) { + // Over 75% of all calls are with size == 4. + if (size == 4) { + __m128i s[2], sq[2], ss; + + s[0] = _mm_loadl_epi64((const __m128i *)(src + 0 * stride)); + s[0] = loadh_epi64(s[0], src + 1 * stride); + s[1] = _mm_loadl_epi64((const __m128i *)(src + 2 * stride)); + s[1] = loadh_epi64(s[1], src + 3 * stride); + sq[0] = _mm_madd_epi16(s[0], s[0]); + sq[1] = _mm_madd_epi16(s[1], s[1]); + sq[0] = _mm_add_epi32(sq[0], sq[1]); + ss = _mm_add_epi32(sq[0], _mm_srli_si128(sq[0], 8)); + ss = _mm_add_epi32(ss, _mm_srli_epi64(ss, 32)); + + return (uint64_t)_mm_cvtsi128_si32(ss); + } else { + // Generic case + int r = size; + const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); + __m128i v_acc_q = _mm_setzero_si128(); - src += 8 * stride; - } + assert(size % 8 == 0); - v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); + do { + int c = 0; + __m128i v_acc_d = _mm_setzero_si128(); + + do { + const int16_t *const b = src + c; + const __m128i v_val_0_w = + _mm_load_si128((const __m128i *)(b + 0 * stride)); + const __m128i v_val_1_w = + _mm_load_si128((const __m128i *)(b + 1 * stride)); + const __m128i v_val_2_w = + _mm_load_si128((const __m128i *)(b + 2 * stride)); + const __m128i v_val_3_w = + _mm_load_si128((const __m128i *)(b + 3 * stride)); + const __m128i v_val_4_w = + _mm_load_si128((const __m128i *)(b + 4 * stride)); + const __m128i v_val_5_w = + _mm_load_si128((const __m128i *)(b + 5 * stride)); + const __m128i v_val_6_w = + _mm_load_si128((const __m128i *)(b + 6 * stride)); + const __m128i v_val_7_w = + _mm_load_si128((const __m128i *)(b + 7 * stride)); + + const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w); + const 
__m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w); + const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w); + const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w); + const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w); + const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w); + const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w); + const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w); + + const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d); + const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d); + const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d); + const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d); + + const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d); + const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d); + + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d); + v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d); + c += 8; + } while (c < size); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q)); + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32)); + + src += 8 * stride; + r -= 8; + } while (r); + + v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); #if ARCH_X86_64 - return (uint64_t)_mm_cvtsi128_si64(v_acc_q); + return (uint64_t)_mm_cvtsi128_si64(v_acc_q); #else - { - uint64_t tmp; - _mm_storel_epi64((__m128i *)&tmp, v_acc_q); - return tmp; - } + { + uint64_t tmp; + _mm_storel_epi64((__m128i *)&tmp, v_acc_q); + return tmp; + } #endif -} - -uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride, int size) { - // 4 elements per row only requires half an XMM register, so this - // must be a special case, but also note that over 75% of all calls - // are with size == 4, so it is also the common case. 
- if (size == 4) { - return vpx_sum_squares_2d_i16_4x4_sse2(src, stride); - } else { - // Generic case - assert(size % 8 == 0); - return vpx_sum_squares_2d_i16_nxn_sse2(src, stride, size); } } |