diff options
39 files changed, 1157 insertions, 287 deletions
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index ebdabc9b2..05c72df3f 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -247,35 +247,35 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf); void vp8_remove_compressor(struct VP8_COMP **comp); void vp8_init_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf); -void vp8_change_config(struct VP8_COMP *onyx, VP8_CONFIG *oxcf); +void vp8_change_config(struct VP8_COMP *cpi, VP8_CONFIG *oxcf); -int vp8_receive_raw_frame(struct VP8_COMP *comp, unsigned int frame_flags, +int vp8_receive_raw_frame(struct VP8_COMP *cpi, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, - int64_t end_time_stamp); -int vp8_get_compressed_data(struct VP8_COMP *comp, unsigned int *frame_flags, + int64_t end_time); +int vp8_get_compressed_data(struct VP8_COMP *cpi, unsigned int *frame_flags, size_t *size, unsigned char *dest, unsigned char *dest_end, int64_t *time_stamp, int64_t *time_end, int flush); -int vp8_get_preview_raw_frame(struct VP8_COMP *comp, YV12_BUFFER_CONFIG *dest, +int vp8_get_preview_raw_frame(struct VP8_COMP *cpi, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *flags); -int vp8_use_as_reference(struct VP8_COMP *comp, int ref_frame_flags); -int vp8_update_reference(struct VP8_COMP *comp, int ref_frame_flags); -int vp8_get_reference(struct VP8_COMP *comp, +int vp8_use_as_reference(struct VP8_COMP *cpi, int ref_frame_flags); +int vp8_update_reference(struct VP8_COMP *cpi, int ref_frame_flags); +int vp8_get_reference(struct VP8_COMP *cpi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); -int vp8_set_reference(struct VP8_COMP *comp, +int vp8_set_reference(struct VP8_COMP *cpi, enum vpx_ref_frame_type ref_frame_flag, YV12_BUFFER_CONFIG *sd); -int vp8_update_entropy(struct VP8_COMP *comp, int update); -int vp8_set_roimap(struct VP8_COMP *comp, unsigned char *map, unsigned int rows, +int vp8_update_entropy(struct VP8_COMP *cpi, int update); +int vp8_set_roimap(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols, int delta_q[4], int delta_lf[4], unsigned int threshold[4]); -int vp8_set_active_map(struct VP8_COMP *comp, unsigned char *map, +int vp8_set_active_map(struct VP8_COMP *cpi, unsigned char *map, unsigned int rows, unsigned int cols); -int vp8_set_internal_size(struct VP8_COMP *comp, VPX_SCALING horiz_mode, +int vp8_set_internal_size(struct VP8_COMP *cpi, VPX_SCALING horiz_mode, VPX_SCALING vert_mode); -int vp8_get_quantizer(struct VP8_COMP *c); +int vp8_get_quantizer(struct VP8_COMP *cpi); #ifdef __cplusplus } diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index 24636b91c..235c77e38 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -223,7 +223,7 @@ specialize qw/vp8_full_search_sad sse3 sse4_1/; $vp8_full_search_sad_sse3=vp8_full_search_sadx3; $vp8_full_search_sad_sse4_1=vp8_full_search_sadx8; -add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"; +add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv"; specialize qw/vp8_refining_search_sad sse2 msa/; $vp8_refining_search_sad_sse2=vp8_refining_search_sadx4; $vp8_refining_search_sad_msa=vp8_refining_search_sadx4; diff --git a/vp8/encoder/boolhuff.h b/vp8/encoder/boolhuff.h index 2cf62def1..ba37cc01d 100644 --- a/vp8/encoder/boolhuff.h +++ b/vp8/encoder/boolhuff.h @@ -35,11 +35,11 @@ typedef struct { struct vpx_internal_error_info *error; } BOOL_CODER; -extern void vp8_start_encode(BOOL_CODER *bc, unsigned char *buffer, - unsigned char *buffer_end); +void vp8_start_encode(BOOL_CODER *br, unsigned char *source, + unsigned char *source_end); -extern void vp8_encode_value(BOOL_CODER *br, int data, int bits); -extern void vp8_stop_encode(BOOL_CODER *bc); +void vp8_encode_value(BOOL_CODER *br, int data, int bits); +void vp8_stop_encode(BOOL_CODER *br); extern const unsigned int vp8_prob_cost[256]; DECLARE_ALIGNED(16, extern const unsigned char, vp8_norm[256]); diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c index eb963b97e..e54d1e9f4 100644 --- a/vp8/encoder/denoising.c +++ b/vp8/encoder/denoising.c @@ -213,13 +213,12 @@ int vp8_denoiser_filter_c(unsigned char *mc_running_avg_y, int mc_avg_y_stride, return FILTER_BLOCK; } -int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, - int mc_avg_uv_stride, - unsigned char *running_avg_uv, int avg_uv_stride, +int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg, int mc_avg_stride, + unsigned char *running_avg, int avg_stride, unsigned char *sig, int sig_stride, unsigned int motion_magnitude, int increase_denoising) { - unsigned char *running_avg_uv_start = running_avg_uv; + unsigned char *running_avg_start = running_avg; unsigned char *sig_start = sig; int sum_diff_thresh; int r, c; @@ -259,13 +258,13 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, int adjustment = 0; int absdiff = 0; - diff = mc_running_avg_uv[c] - sig[c]; + diff = mc_running_avg[c] - sig[c]; absdiff = abs(diff); // When |diff| <= |3 + shift_inc1|, use pixel value from // last denoised raw. if (absdiff <= 3 + shift_inc1) { - running_avg_uv[c] = mc_running_avg_uv[c]; + running_avg[c] = mc_running_avg[c]; sum_diff += diff; } else { if (absdiff >= 4 && absdiff <= 7) { @@ -277,16 +276,16 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, } if (diff > 0) { if ((sig[c] + adjustment) > 255) { - running_avg_uv[c] = 255; + running_avg[c] = 255; } else { - running_avg_uv[c] = sig[c] + adjustment; + running_avg[c] = sig[c] + adjustment; } sum_diff += adjustment; } else { if ((sig[c] - adjustment) < 0) { - running_avg_uv[c] = 0; + running_avg[c] = 0; } else { - running_avg_uv[c] = sig[c] - adjustment; + running_avg[c] = sig[c] - adjustment; } sum_diff -= adjustment; } @@ -294,8 +293,8 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, } /* Update pointers for next iteration. */ sig += sig_stride; - mc_running_avg_uv += mc_avg_uv_stride; - running_avg_uv += avg_uv_stride; + mc_running_avg += mc_avg_stride; + running_avg += avg_stride; } sum_diff_thresh = SUM_DIFF_THRESHOLD_UV; @@ -314,27 +313,27 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, // Only apply the adjustment for max delta up to 3. if (delta < 4) { sig -= sig_stride * 8; - mc_running_avg_uv -= mc_avg_uv_stride * 8; - running_avg_uv -= avg_uv_stride * 8; + mc_running_avg -= mc_avg_stride * 8; + running_avg -= avg_stride * 8; for (r = 0; r < 8; ++r) { for (c = 0; c < 8; ++c) { - int diff = mc_running_avg_uv[c] - sig[c]; + int diff = mc_running_avg[c] - sig[c]; int adjustment = abs(diff); if (adjustment > delta) adjustment = delta; if (diff > 0) { // Bring denoised signal down. - if (running_avg_uv[c] - adjustment < 0) { - running_avg_uv[c] = 0; + if (running_avg[c] - adjustment < 0) { + running_avg[c] = 0; } else { - running_avg_uv[c] = running_avg_uv[c] - adjustment; + running_avg[c] = running_avg[c] - adjustment; } sum_diff -= adjustment; } else if (diff < 0) { // Bring denoised signal up. - if (running_avg_uv[c] + adjustment > 255) { - running_avg_uv[c] = 255; + if (running_avg[c] + adjustment > 255) { + running_avg[c] = 255; } else { - running_avg_uv[c] = running_avg_uv[c] + adjustment; + running_avg[c] = running_avg[c] + adjustment; } sum_diff += adjustment; } @@ -342,8 +341,8 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, // TODO(marpan): Check here if abs(sum_diff) has gone below the // threshold sum_diff_thresh, and if so, we can exit the row loop. sig += sig_stride; - mc_running_avg_uv += mc_avg_uv_stride; - running_avg_uv += avg_uv_stride; + mc_running_avg += mc_avg_stride; + running_avg += avg_stride; } if (abs(sum_diff) > sum_diff_thresh) return COPY_BLOCK; } else { @@ -351,7 +350,7 @@ int vp8_denoiser_filter_uv_c(unsigned char *mc_running_avg_uv, } } - vp8_copy_mem8x8(running_avg_uv_start, avg_uv_stride, sig_start, sig_stride); + vp8_copy_mem8x8(running_avg_start, avg_stride, sig_start, sig_stride); return FILTER_BLOCK; } diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h index 490b0b872..397f872e2 100644 --- a/vp8/encoder/mcomp.h +++ b/vp8/encoder/mcomp.h @@ -19,8 +19,8 @@ extern "C" { #endif #ifdef VP8_ENTROPY_STATS -extern void init_mv_ref_counts(); -extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]); +void init_mv_ref_counts(); +void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]); #endif /* The maximum number of steps in a step search given the largest allowed @@ -34,15 +34,15 @@ extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]); /* Maximum size of the first step in full pel units */ #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS - 1)) -extern void print_mode_context(void); -extern int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight); -extern void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride); -extern void vp8_init3smotion_compensation(MACROBLOCK *x, int stride); +void print_mode_context(void); +int vp8_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvcost[2], int Weight); +void vp8_init_dsmotion_compensation(MACROBLOCK *x, int stride); +void vp8_init3smotion_compensation(MACROBLOCK *x, int stride); -extern int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, - int_mv *best_mv, int search_param, int error_per_bit, - const vp8_variance_fn_ptr_t *vf, int *mvsadcost[2], - int *mvcost[2], int_mv *center_mv); +int vp8_hex_search(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, + int_mv *best_mv, int search_param, int sad_per_bit, + const vp8_variance_fn_ptr_t *vfp, int *mvsadcost[2], + int *mvcost[2], int_mv *center_mv); typedef int(fractional_mv_step_fp)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *bestmv, int_mv *ref_mv, @@ -51,10 +51,10 @@ typedef int(fractional_mv_step_fp)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int *mvcost[2], int *distortion, unsigned int *sse); -extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively; -extern fractional_mv_step_fp vp8_find_best_sub_pixel_step; -extern fractional_mv_step_fp vp8_find_best_half_pixel_step; -extern fractional_mv_step_fp vp8_skip_fractional_mv_step; +fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively; +fractional_mv_step_fp vp8_find_best_sub_pixel_step; +fractional_mv_step_fp vp8_find_best_half_pixel_step; +fractional_mv_step_fp vp8_skip_fractional_mv_step; typedef int (*vp8_full_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv, int sad_per_bit, diff --git a/vp8/encoder/modecosts.h b/vp8/encoder/modecosts.h index 422a79b36..09ee2b552 100644 --- a/vp8/encoder/modecosts.h +++ b/vp8/encoder/modecosts.h @@ -17,7 +17,7 @@ extern "C" { struct VP8_COMP; -void vp8_init_mode_costs(struct VP8_COMP *x); +void vp8_init_mode_costs(struct VP8_COMP *c); #ifdef __cplusplus } // extern "C" diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index 2da940199..ddd588294 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -2106,8 +2106,8 @@ struct VP8_COMP *vp8_create_compressor(VP8_CONFIG *oxcf) { return cpi; } -void vp8_remove_compressor(VP8_COMP **ptr) { - VP8_COMP *cpi = *ptr; +void vp8_remove_compressor(VP8_COMP **comp) { + VP8_COMP *cpi = *comp; if (!cpi) return; @@ -2326,7 +2326,7 @@ void vp8_remove_compressor(VP8_COMP **ptr) { vp8_remove_common(&cpi->common); vpx_free(cpi); - *ptr = 0; + *comp = 0; #ifdef OUTPUT_YUV_SRC fclose(yuv_file); diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index 1bb54fc2b..6bb3cacc5 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -173,9 +173,8 @@ static int get_prediction_error(BLOCK *be, BLOCKD *b) { static int pick_intra4x4block(MACROBLOCK *x, int ib, B_PREDICTION_MODE *best_mode, - const int *mode_costs, - - int *bestrate, int *bestdistortion) { + const int *mode_costs, int *bestrate, + int *bestdistortion) { BLOCKD *b = &x->e_mbd.block[ib]; BLOCK *be = &x->block[ib]; int dst_stride = x->e_mbd.dst.y_stride; @@ -1303,9 +1302,9 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, update_mvcount(x, &best_ref_mv); } -void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_) { +void vp8_pick_intra_mode(MACROBLOCK *x, int *rate) { int error4x4, error16x16 = INT_MAX; - int rate, best_rate = 0, distortion, best_sse; + int rate_, best_rate = 0, distortion, best_sse; MB_PREDICTION_MODE mode, best_mode = DC_PRED; int this_rd; unsigned int sse; @@ -1323,23 +1322,23 @@ void vp8_pick_intra_mode(MACROBLOCK *x, int *rate_) { xd->predictor, 16); distortion = vpx_variance16x16(*(b->base_src), b->src_stride, xd->predictor, 16, &sse); - rate = x->mbmode_cost[xd->frame_type][mode]; - this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); + rate_ = x->mbmode_cost[xd->frame_type][mode]; + this_rd = RDCOST(x->rdmult, x->rddiv, rate_, distortion); if (error16x16 > this_rd) { error16x16 = this_rd; best_mode = mode; best_sse = sse; - best_rate = rate; + best_rate = rate_; } } xd->mode_info_context->mbmi.mode = best_mode; - error4x4 = pick_intra4x4mby_modes(x, &rate, &best_sse); + error4x4 = pick_intra4x4mby_modes(x, &rate_, &best_sse); if (error4x4 < error16x16) { xd->mode_info_context->mbmi.mode = B_PRED; - best_rate = rate; + best_rate = rate_; } - *rate_ = best_rate; + *rate = best_rate; } diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index b4182c5cd..679d66bbf 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -2358,11 +2358,11 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, rd_update_mvcount(x, &best_ref_mv); } -void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_) { +void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate) { int error4x4, error16x16; int rate4x4, rate16x16 = 0, rateuv; int dist4x4, dist16x16, distuv; - int rate; + int rate_; int rate4x4_tokenonly = 0; int rate16x16_tokenonly = 0; int rateuv_tokenonly = 0; @@ -2370,7 +2370,7 @@ void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_) { x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME; rd_pick_intra_mbuv_mode(x, &rateuv, &rateuv_tokenonly, &distuv); - rate = rateuv; + rate_ = rateuv; error16x16 = rd_pick_intra16x16mby_mode(x, &rate16x16, &rate16x16_tokenonly, &dist16x16); @@ -2380,10 +2380,10 @@ void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate_) { if (error4x4 < error16x16) { x->e_mbd.mode_info_context->mbmi.mode = B_PRED; - rate += rate4x4; + rate_ += rate4x4; } else { - rate += rate16x16; + rate_ += rate16x16; } - *rate_ = rate; + *rate = rate_; } diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h index e22b58b8a..cc3db8197 100644 --- a/vp8/encoder/rdopt.h +++ b/vp8/encoder/rdopt.h @@ -63,12 +63,12 @@ static INLINE void insertsortsad(int arr[], int idx[], int len) { } } -extern void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue); -extern void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, - int recon_yoffset, int recon_uvoffset, - int *returnrate, int *returndistortion, - int *returnintra, int mb_row, int mb_col); -extern void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate); +void vp8_initialize_rd_consts(VP8_COMP *cpi, MACROBLOCK *x, int Qvalue); +void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, + int recon_uvoffset, int *returnrate, + int *returndistortion, int *returnintra, int mb_row, + int mb_col); +void vp8_rd_pick_intra_mode(MACROBLOCK *x, int *rate); static INLINE void get_plane_pointers(const YV12_BUFFER_CONFIG *fb, unsigned char *plane[3], @@ -110,9 +110,9 @@ static INLINE void get_reference_search_order(const VP8_COMP *cpi, for (; i < 4; ++i) ref_frame_map[i] = -1; } -extern void vp8_mv_pred(VP8_COMP *cpi, MACROBLOCKD *xd, const MODE_INFO *here, - int_mv *mvp, int refframe, int *ref_frame_sign_bias, - int *sr, int near_sadidx[]); +void vp8_mv_pred(VP8_COMP *cpi, MACROBLOCKD *xd, const MODE_INFO *here, + int_mv *mvp, int refframe, int *ref_frame_sign_bias, int *sr, + int near_sadidx[]); void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffset, int near_sadidx[]); int VP8_UVSSE(MACROBLOCK *x); diff --git a/vp8/encoder/treewriter.h b/vp8/encoder/treewriter.h index 0d7b06e56..c02683a58 100644 --- a/vp8/encoder/treewriter.h +++ b/vp8/encoder/treewriter.h @@ -91,9 +91,9 @@ static INLINE int vp8_cost_token(vp8_tree t, const vp8_prob *const p, /* Fill array of costs for all possible token values. */ -void vp8_cost_tokens(int *Costs, const vp8_prob *, vp8_tree); +void vp8_cost_tokens(int *c, const vp8_prob *, vp8_tree); -void vp8_cost_tokens2(int *Costs, const vp8_prob *, vp8_tree, int); +void vp8_cost_tokens2(int *c, const vp8_prob *, vp8_tree, int); #ifdef __cplusplus } // extern "C" diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h index dcc8e2998..ee9d37973 100644 --- a/vp9/common/vp9_entropymv.h +++ b/vp9/common/vp9_entropymv.h @@ -25,7 +25,7 @@ struct VP9Common; void vp9_init_mv_probs(struct VP9Common *cm); -void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp); +void vp9_adapt_mv_probs(struct VP9Common *cm, int allow_hp); static INLINE int use_mv_hp(const MV *ref) { const int kMvRefThresh = 64; // threshold for use of high-precision 1/8 mv @@ -127,7 +127,7 @@ typedef struct { nmv_component_counts comps[2]; } nmv_context_counts; -void vp9_inc_mv(const MV *mv, nmv_context_counts *mvctx); +void vp9_inc_mv(const MV *mv, nmv_context_counts *counts); #ifdef __cplusplus } // extern "C" diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c index da9180b71..04a93f3b9 100644 --- a/vp9/common/vp9_loopfilter.c +++ b/vp9/common/vp9_loopfilter.c @@ -880,12 +880,12 @@ void vp9_adjust_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, // This function sets up the bit masks for the entire 64x64 region represented // by mi_row, mi_col. void vp9_setup_mask(VP9_COMMON *const cm, const int mi_row, const int mi_col, - MODE_INFO **mi, const int mode_info_stride, + MODE_INFO **mi8x8, const int mode_info_stride, LOOP_FILTER_MASK *lfm) { int idx_32, idx_16, idx_8; const loop_filter_info_n *const lfi_n = &cm->lf_info; - MODE_INFO **mip = mi; - MODE_INFO **mip2 = mi; + MODE_INFO **mip = mi8x8; + MODE_INFO **mip2 = mi8x8; // These are offsets to the next mi in the 64x64 block. It is what gets // added to the mi ptr as we go through each loop. It helps us to avoid diff --git a/vp9/common/vp9_loopfilter.h b/vp9/common/vp9_loopfilter.h index daf3b9131..39648a72c 100644 --- a/vp9/common/vp9_loopfilter.h +++ b/vp9/common/vp9_loopfilter.h @@ -97,7 +97,7 @@ struct VP9LfSyncData; // This function sets up the bit masks for the entire 64x64 region represented // by mi_row, mi_col. void vp9_setup_mask(struct VP9Common *const cm, const int mi_row, - const int mi_col, MODE_INFO **mi_8x8, + const int mi_col, MODE_INFO **mi8x8, const int mode_info_stride, LOOP_FILTER_MASK *lfm); void vp9_filter_block_plane_ss00(struct VP9Common *const cm, @@ -120,7 +120,7 @@ void vp9_loop_filter_init(struct VP9Common *cm); void vp9_loop_filter_frame_init(struct VP9Common *cm, int default_filt_lvl); void vp9_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm, - struct macroblockd *mbd, int filter_level, + struct macroblockd *xd, int frame_filter_level, int y_only, int partial_frame); // Get the superblock lfm for a given mi_row, mi_col. diff --git a/vp9/common/vp9_postproc.h b/vp9/common/vp9_postproc.h index 0aafa72ca..67efc1b4e 100644 --- a/vp9/common/vp9_postproc.h +++ b/vp9/common/vp9_postproc.h @@ -38,7 +38,7 @@ struct VP9Common; #define MFQE_PRECISION 4 int vp9_post_proc_frame(struct VP9Common *cm, YV12_BUFFER_CONFIG *dest, - vp9_ppflags_t *flags, int unscaled_width); + vp9_ppflags_t *ppflags, int unscaled_width); void vp9_denoise(const YV12_BUFFER_CONFIG *src, YV12_BUFFER_CONFIG *dst, int q, uint8_t *limits); diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h index 2c6d6695a..992e30c34 100644 --- a/vp9/common/vp9_reconinter.h +++ b/vp9/common/vp9_reconinter.h @@ -61,15 +61,15 @@ void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col, BLOCK_SIZE bsize); void vp9_build_inter_predictor(const uint8_t *src, int src_stride, uint8_t *dst, - int dst_stride, const MV *mv_q3, + int dst_stride, const MV *src_mv, const struct scale_factors *sf, int w, int h, - int do_avg, const InterpKernel *kernel, + int ref, const InterpKernel *kernel, enum mv_precision precision, int x, int y); #if CONFIG_VP9_HIGHBITDEPTH void vp9_highbd_build_inter_predictor( const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, - const MV *mv_q3, const struct scale_factors *sf, int w, int h, int do_avg, + const MV *src_mv, const struct scale_factors *sf, int w, int h, int ref, const InterpKernel *kernel, enum mv_precision precision, int x, int y, int bd); #endif diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl index 6d7f95260..d7ad2b693 100644 --- a/vp9/common/vp9_rtcd_defs.pl +++ b/vp9/common/vp9_rtcd_defs.pl @@ -62,7 +62,7 @@ add_proto qw/void vp9_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, i add_proto qw/void vp9_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; -add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type"; +add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int stride, int tx_type"; if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { # Note that there are more specializations appended when @@ -100,7 +100,7 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/void vp9_highbd_iht8x8_64_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; - add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *output, int pitch, int tx_type, int bd"; + add_proto qw/void vp9_highbd_iht16x16_256_add/, "const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd"; if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") { specialize qw/vp9_highbd_iht4x4_16_add neon sse4_1/; diff --git a/vp9/common/vp9_scale.h b/vp9/common/vp9_scale.h index 53c6eef72..aaafdf867 100644 --- a/vp9/common/vp9_scale.h +++ b/vp9/common/vp9_scale.h @@ -42,7 +42,7 @@ MV32 vp9_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf); #if CONFIG_VP9_HIGHBITDEPTH void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h, - int use_high); + int use_highbd); #else void vp9_setup_scale_factors_for_frame(struct scale_factors *sf, int other_w, int other_h, int this_w, int this_h); diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h index 5354105f8..9a582fffb 100644 --- a/vp9/decoder/vp9_decoder.h +++ b/vp9/decoder/vp9_decoder.h @@ -93,7 +93,7 @@ typedef struct VP9Decoder { } VP9Decoder; int vp9_receive_compressed_data(struct VP9Decoder *pbi, size_t size, - const uint8_t **dest); + const uint8_t **psource); int vp9_get_raw_frame(struct VP9Decoder *pbi, YV12_BUFFER_CONFIG *sd, vp9_ppflags_t *flags); diff --git a/vp9/encoder/arm/neon/vp9_dct_neon.c b/vp9/encoder/arm/neon/vp9_dct_neon.c index 513718e7c..f8dd0a6f7 100644 --- a/vp9/encoder/arm/neon/vp9_dct_neon.c +++ b/vp9/encoder/arm/neon/vp9_dct_neon.c @@ -23,13 +23,13 @@ void vp9_fdct8x8_quant_neon(const int16_t *input, int stride, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { tran_low_t temp_buffer[64]; (void)coeff_ptr; vpx_fdct8x8_neon(input, temp_buffer, stride); vp9_quantize_fp_neon(temp_buffer, n_coeffs, skip_block, round_ptr, quant_ptr, - qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan_ptr, - iscan_ptr); + qcoeff_ptr, dqcoeff_ptr, dequant_ptr, eob_ptr, scan, + iscan); } diff --git a/vp9/encoder/arm/neon/vp9_quantize_neon.c b/vp9/encoder/arm/neon/vp9_quantize_neon.c index 97a09bdff..2cec8bd03 100644 --- a/vp9/encoder/arm/neon/vp9_quantize_neon.c +++ b/vp9/encoder/arm/neon/vp9_quantize_neon.c @@ -122,7 +122,7 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan, const int16_t *iscan_ptr) { + const int16_t *scan, const int16_t *iscan) { const int16x8_t one = vdupq_n_s16(1); const int16x8_t neg_one = vdupq_n_s16(-1); @@ -134,8 +134,8 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, const int16x8_t dequant_thresh = vshrq_n_s16(vld1q_s16(dequant_ptr), 2); // Process dc and the first seven ac coeffs. - const uint16x8_t iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); const int16x8_t coeff_abs = vabsq_s16(coeff); @@ -169,12 +169,12 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, dqcoeff = vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); - eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), iscan); + eob_max = vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan); store_s16q_to_tran_low(qcoeff_ptr, qcoeff); store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); - iscan_ptr += 8; + iscan += 8; coeff_ptr += 8; qcoeff_ptr += 8; dqcoeff_ptr += 8; @@ -188,8 +188,8 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, // Process the rest of the ac coeffs. for (i = 8; i < 32 * 32; i += 8) { - const uint16x8_t iscan = - vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan_ptr), one)); + const uint16x8_t v_iscan = + vreinterpretq_u16_s16(vaddq_s16(vld1q_s16(iscan), one)); const int16x8_t coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t coeff_sign = vshrq_n_s16(coeff, 15); const int16x8_t coeff_abs = vabsq_s16(coeff); @@ -215,12 +215,12 @@ void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count, vcombine_s16(vshrn_n_s32(dqcoeff_0, 1), vshrn_n_s32(dqcoeff_1, 1)); eob_max = - vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), iscan)); + vmaxq_u16(eob_max, vandq_u16(vtstq_s16(qcoeff, neg_one), v_iscan)); store_s16q_to_tran_low(qcoeff_ptr, qcoeff); store_s16q_to_tran_low(dqcoeff_ptr, dqcoeff); - iscan_ptr += 8; + iscan += 8; coeff_ptr += 8; qcoeff_ptr += 8; dqcoeff_ptr += 8; diff --git a/vp9/encoder/ppc/vp9_quantize_vsx.c b/vp9/encoder/ppc/vp9_quantize_vsx.c index 3720b0876..4f88b8fff 100644 --- a/vp9/encoder/ppc/vp9_quantize_vsx.c +++ b/vp9/encoder/ppc/vp9_quantize_vsx.c @@ -42,8 +42,8 @@ void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob; bool16x8_t zero_coeff0, zero_coeff1; @@ -52,10 +52,10 @@ void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); - int16x8_t scan0 = vec_vsx_ld(0, iscan_ptr); - int16x8_t scan1 = vec_vsx_ld(16, iscan_ptr); + int16x8_t scan0 = vec_vsx_ld(0, iscan); + int16x8_t scan1 = vec_vsx_ld(16, iscan); - (void)scan_ptr; + (void)scan; (void)skip_block; assert(!skip_block); @@ -103,9 +103,9 @@ void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, coeff0 = vec_vsx_ld(off0, coeff_ptr); coeff1 = vec_vsx_ld(off1, coeff_ptr); coeff2 = vec_vsx_ld(off2, coeff_ptr); - scan0 = vec_vsx_ld(off0, iscan_ptr); - scan1 = vec_vsx_ld(off1, iscan_ptr); - scan2 = vec_vsx_ld(off2, iscan_ptr); + scan0 = vec_vsx_ld(off0, iscan); + scan1 = vec_vsx_ld(off1, iscan); + scan2 = vec_vsx_ld(off2, iscan); qcoeff0 = vec_mulhi(vec_vaddshs(vec_abs(coeff0), round), quant); zero_coeff0 = vec_cmpeq(qcoeff0, vec_zeros_s16); @@ -169,8 +169,7 @@ void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, - const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + const int16_t *scan, const int16_t *iscan) { // In stage 1, we quantize 16 coeffs (DC + 15 AC) // In stage 2, we loop 42 times and quantize 24 coeffs per iteration // (32 * 32 - 16) / 24 = 42 @@ -188,13 +187,13 @@ void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int16x8_t dequant = vec_vsx_ld(0, dequant_ptr); int16x8_t coeff0 = vec_vsx_ld(0, coeff_ptr); int16x8_t coeff1 = vec_vsx_ld(16, coeff_ptr); - int16x8_t scan0 = vec_vsx_ld(0, iscan_ptr); - int16x8_t scan1 = vec_vsx_ld(16, iscan_ptr); + int16x8_t scan0 = vec_vsx_ld(0, iscan); + int16x8_t scan1 = vec_vsx_ld(16, iscan); int16x8_t thres = vec_sra(dequant, vec_splats((uint16_t)2)); int16x8_t abs_coeff0 = vec_abs(coeff0); int16x8_t abs_coeff1 = vec_abs(coeff1); - (void)scan_ptr; + (void)scan; (void)skip_block; (void)n_coeffs; assert(!skip_block); @@ -238,9 +237,9 @@ void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs, coeff0 = vec_vsx_ld(off0, coeff_ptr); coeff1 = vec_vsx_ld(off1, coeff_ptr); coeff2 = vec_vsx_ld(off2, coeff_ptr); - scan0 = vec_vsx_ld(off0, iscan_ptr); - scan1 = vec_vsx_ld(off1, iscan_ptr); - scan2 = vec_vsx_ld(off2, iscan_ptr); + scan0 = vec_vsx_ld(off0, iscan); + scan1 = vec_vsx_ld(off1, iscan); + scan2 = vec_vsx_ld(off2, iscan); abs_coeff0 = vec_abs(coeff0); abs_coeff1 = vec_abs(coeff1); diff --git a/vp9/encoder/vp9_encodemv.h b/vp9/encoder/vp9_encodemv.h index 8bbf85787..2f1be4b23 100644 --- a/vp9/encoder/vp9_encodemv.h +++ b/vp9/encoder/vp9_encodemv.h @@ -27,7 +27,7 @@ void vp9_encode_mv(VP9_COMP *cpi, vpx_writer *w, const MV *mv, const MV *ref, unsigned int *const max_mv_magnitude); void vp9_build_nmv_cost_table(int *mvjoint, int *mvcost[2], - const nmv_context *mvctx, int usehp); + const nmv_context *ctx, int usehp); void vp9_update_mv_count(ThreadData *td); diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 07289588c..02814599d 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -811,7 +811,7 @@ void vp9_change_config(VP9_COMP *cpi, const VP9EncoderConfig *oxcf); // frame is made and not just a copy of the pointer.. int vp9_receive_raw_frame(VP9_COMP *cpi, vpx_enc_frame_flags_t frame_flags, YV12_BUFFER_CONFIG *sd, int64_t time_stamp, - int64_t end_time_stamp); + int64_t end_time); int vp9_get_compressed_data(VP9_COMP *cpi, unsigned int *frame_flags, size_t *size, uint8_t *dest, int64_t *time_stamp, @@ -832,9 +832,11 @@ int vp9_set_reference_enc(VP9_COMP *cpi, VP9_REFFRAME ref_frame_flag, int vp9_update_entropy(VP9_COMP *cpi, int update); -int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols); +int vp9_set_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, + int cols); -int vp9_get_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols); +int vp9_get_active_map(VP9_COMP *cpi, unsigned char *new_map_16x16, int rows, + int cols); int vp9_set_internal_size(VP9_COMP *cpi, VPX_SCALING horiz_mode, VPX_SCALING vert_mode); diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index 6bd85a152..558ecbcd8 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -59,7 +59,7 @@ struct SPEED_FEATURES; int vp9_init_search_range(int size); int vp9_refining_search_sad(const struct macroblock *x, struct mv *ref_mv, - int sad_per_bit, int distance, + int error_per_bit, int search_range, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv); diff --git a/vp9/encoder/vp9_ratectrl.h b/vp9/encoder/vp9_ratectrl.h index 3b441bf1f..a9f75555e 100644 --- a/vp9/encoder/vp9_ratectrl.h +++ b/vp9/encoder/vp9_ratectrl.h @@ -194,7 +194,7 @@ struct VP9EncoderConfig; void vp9_rc_init(const struct VP9EncoderConfig *oxcf, int pass, RATE_CONTROL *rc); -int vp9_estimate_bits_at_q(FRAME_TYPE frame_kind, int q, int mbs, +int vp9_estimate_bits_at_q(FRAME_TYPE frame_type, int q, int mbs, double correction_factor, vpx_bit_depth_t bit_depth); double vp9_convert_qindex_to_q(int qindex, vpx_bit_depth_t bit_depth); @@ -205,9 +205,9 @@ void vp9_rc_init_minq_luts(void); int vp9_rc_get_default_min_gf_interval(int width, int height, double framerate); // Note vp9_rc_get_default_max_gf_interval() requires the min_gf_interval to -// be passed in to ensure that the max_gf_interval returned is at least as bis +// be passed in to ensure that the max_gf_interval returned is at least as big // as that. -int vp9_rc_get_default_max_gf_interval(double framerate, int min_frame_rate); +int vp9_rc_get_default_max_gf_interval(double framerate, int min_gf_interval); // Generally at the high level, the following flow is expected // to be enforced for rate control: @@ -253,7 +253,7 @@ int vp9_rc_drop_frame(struct VP9_COMP *cpi); // Computes frame size bounds. void vp9_rc_compute_frame_size_bounds(const struct VP9_COMP *cpi, - int this_frame_target, + int frame_target, int *frame_under_shoot_limit, int *frame_over_shoot_limit); diff --git a/vp9/encoder/vp9_rd.h b/vp9/encoder/vp9_rd.h index f2fc776a4..a1a98bd91 100644 --- a/vp9/encoder/vp9_rd.h +++ b/vp9/encoder/vp9_rd.h @@ -145,7 +145,7 @@ void vp9_initialize_rd_consts(struct VP9_COMP *cpi); void vp9_initialize_me_consts(struct VP9_COMP *cpi, MACROBLOCK *x, int qindex); -void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n, +void vp9_model_rd_from_var_lapndz(unsigned int var, unsigned int n_log2, unsigned int qstep, int *rate, int64_t *dist); void vp9_model_rd_from_var_lapndz_vec(unsigned int var[MAX_MB_PLANE], @@ -176,8 +176,8 @@ void vp9_set_rd_speed_thresholds(struct VP9_COMP *cpi); void vp9_set_rd_speed_thresholds_sub8x8(struct VP9_COMP *cpi); -void vp9_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh, int bsize, - int best_mode_index); +void vp9_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh, + int bsize, int best_mode_index); static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh, const int *const thresh_fact) { diff --git a/vp9/encoder/x86/vp9_dct_intrin_sse2.c b/vp9/encoder/x86/vp9_dct_intrin_sse2.c index 293cdcd67..0cecd6540 100644 --- a/vp9/encoder/x86/vp9_dct_intrin_sse2.c +++ b/vp9/encoder/x86/vp9_dct_intrin_sse2.c @@ -185,8 +185,8 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { __m128i zero; int pass; @@ -215,7 +215,7 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, __m128i *in[8]; int index = 0; - (void)scan_ptr; + (void)scan; (void)coeff_ptr; // Pre-condition input (shift by two) @@ -449,7 +449,7 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, in7 = _mm_srai_epi16(in7, 1); } - iscan_ptr += n_coeffs; + iscan += n_coeffs; qcoeff_ptr += n_coeffs; dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; @@ -518,8 +518,8 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); @@ -582,8 +582,8 @@ void vp9_fdct8x8_quant_sse2(const int16_t *input, int stride, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c index bf874a09e..99c193894 100644 --- a/vp9/encoder/x86/vp9_dct_ssse3.c +++ b/vp9/encoder/x86/vp9_dct_ssse3.c @@ -18,11 +18,13 @@ #include "vpx_dsp/x86/inv_txfm_sse2.h" #include "vpx_dsp/x86/txfm_common_sse2.h" -void vp9_fdct8x8_quant_ssse3( - const int16_t *input, int stride, tran_low_t *coeff_ptr, intptr_t n_coeffs, - int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, - tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, const int16_t *iscan_ptr) { +void vp9_fdct8x8_quant_ssse3(const int16_t *input, int stride, + tran_low_t *coeff_ptr, intptr_t n_coeffs, + int skip_block, const int16_t *round_ptr, + const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, + tran_low_t *dqcoeff_ptr, + const int16_t *dequant_ptr, uint16_t *eob_ptr, + const int16_t *scan, const int16_t *iscan) { __m128i zero; int pass; @@ -52,7 +54,7 @@ void vp9_fdct8x8_quant_ssse3( __m128i *in[8]; int index = 0; - (void)scan_ptr; + (void)scan; (void)coeff_ptr; // Pre-condition input (shift by two) @@ -280,7 +282,7 @@ void vp9_fdct8x8_quant_ssse3( in7 = _mm_srai_epi16(in7, 1); } - iscan_ptr += n_coeffs; + iscan += n_coeffs; qcoeff_ptr += n_coeffs; dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; @@ -350,8 +352,8 @@ void vp9_fdct8x8_quant_ssse3( zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); @@ -427,8 +429,8 @@ void vp9_fdct8x8_quant_ssse3( zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); diff --git a/vp9/encoder/x86/vp9_quantize_avx2.c b/vp9/encoder/x86/vp9_quantize_avx2.c index 4bebc34d6..556a9fbaa 100644 --- a/vp9/encoder/x86/vp9_quantize_avx2.c +++ b/vp9/encoder/x86/vp9_quantize_avx2.c @@ -50,18 +50,18 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { __m128i eob; __m256i round256, quant256, dequant256; __m256i eob256, thr256; - (void)scan_ptr; + (void)scan; (void)skip_block; assert(!skip_block); coeff_ptr += n_coeffs; - iscan_ptr += n_coeffs; + iscan += n_coeffs; qcoeff_ptr += n_coeffs; dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; @@ -97,7 +97,7 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, store_tran_low(coeff256, dqcoeff_ptr + n_coeffs); } - eob256 = scan_eob_256((const __m256i *)(iscan_ptr + n_coeffs), &coeff256); + eob256 = scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256); n_coeffs += 8 * 2; } @@ -124,8 +124,7 @@ void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, coeff256 = _mm256_mullo_epi16(qcoeff256, dequant256); store_tran_low(coeff256, dqcoeff_ptr + n_coeffs); eob256 = _mm256_max_epi16( - eob256, - scan_eob_256((const __m256i *)(iscan_ptr + n_coeffs), &coeff256)); + eob256, scan_eob_256((const __m256i *)(iscan + n_coeffs), &coeff256)); } else { store_zero_tran_low(qcoeff_ptr + n_coeffs); store_zero_tran_low(dqcoeff_ptr + n_coeffs); diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c index ca0ad4407..885220a71 100644 --- a/vp9/encoder/x86/vp9_quantize_sse2.c +++ b/vp9/encoder/x86/vp9_quantize_sse2.c @@ -21,20 +21,20 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, - uint16_t *eob_ptr, const int16_t *scan_ptr, - const int16_t *iscan_ptr) { + uint16_t *eob_ptr, const int16_t *scan, + const int16_t *iscan) { __m128i zero; __m128i thr; int16_t nzflag; __m128i eob; __m128i round, quant, dequant; - (void)scan_ptr; + (void)scan; (void)skip_block; assert(!skip_block); coeff_ptr += n_coeffs; - iscan_ptr += n_coeffs; + iscan += n_coeffs; qcoeff_ptr += n_coeffs; dqcoeff_ptr += n_coeffs; n_coeffs = -n_coeffs; @@ -100,8 +100,8 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); @@ -175,8 +175,8 @@ void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, zero_coeff1 = _mm_cmpeq_epi16(coeff1, zero); nzero_coeff0 = _mm_cmpeq_epi16(zero_coeff0, zero); nzero_coeff1 = _mm_cmpeq_epi16(zero_coeff1, zero); - iscan0 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs)); - iscan1 = _mm_load_si128((const __m128i *)(iscan_ptr + n_coeffs) + 1); + iscan0 = _mm_load_si128((const __m128i *)(iscan + n_coeffs)); + iscan1 = _mm_load_si128((const __m128i *)(iscan + n_coeffs) + 1); // Add one to convert from indices to counts iscan0 = _mm_sub_epi16(iscan0, nzero_coeff0); iscan1 = _mm_sub_epi16(iscan1, nzero_coeff1); diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h index f47cce4d2..aa60f44f7 100644 --- a/vpx_dsp/x86/convolve.h +++ b/vpx_dsp/x86/convolve.h @@ -128,7 +128,7 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, int x_step_q4, int y0_q4, int y_step_q4, int w, int h, int bd) { \ const int16_t *filter = filter_kernel[offset]; \ if (step_q4 == 16 && filter[3] != 128) { \ - if (filter[0] | filter[1] | filter[2]) { \ + if (filter[0] | filter[1] | filter[6] | filter[7]) { \ while (w >= 16) { \ vpx_highbd_filter_block1d16_##dir##8_##avg##opt( \ src_start, src_stride, dst, dst_stride, h, filter, bd); \ @@ -150,6 +150,28 @@ typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr, dst += 4; \ w -= 4; \ } \ + } else if (filter[2] | filter[5]) { \ + while (w >= 16) { \ + vpx_highbd_filter_block1d16_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 16; \ + dst += 16; \ + w -= 16; \ + } \ + while (w >= 8) { \ + vpx_highbd_filter_block1d8_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 8; \ + dst += 8; \ + w -= 8; \ + } \ + while (w >= 4) { \ + vpx_highbd_filter_block1d4_##dir##4_##avg##opt( \ + src_start, src_stride, dst, dst_stride, h, filter, bd); \ + src += 4; \ + dst += 4; \ + w -= 4; \ + } \ } else { \ while (w >= 16) { \ vpx_highbd_filter_block1d16_##dir##2_##avg##opt( \ diff --git a/vpx_dsp/x86/convolve_avx2.h b/vpx_dsp/x86/convolve_avx2.h index e9fc9c06a..99bc9637f 100644 --- a/vpx_dsp/x86/convolve_avx2.h +++ b/vpx_dsp/x86/convolve_avx2.h @@ -134,6 +134,13 @@ static INLINE void mm256_storeu2_epi32(__m128i *const dst_ptr_1, _mm_cvtsi128_si32(_mm256_extractf128_si256(*src, 1)); } +static INLINE __m256i mm256_round_epi32(const __m256i *const src, + const __m256i *const half_depth, + const int depth) { + const __m256i nearest_src = _mm256_add_epi32(*src, *half_depth); + return _mm256_srai_epi32(nearest_src, depth); +} + static INLINE __m256i mm256_round_epi16(const __m256i *const src, const __m256i *const half_depth, const int depth) { @@ -141,6 +148,15 @@ static INLINE __m256i mm256_round_epi16(const __m256i *const src, return _mm256_srai_epi16(nearest_src, depth); } +static INLINE __m256i mm256_madd_add_epi32(const __m256i *const src_0, + const __m256i *const src_1, + const __m256i *const ker_0, + const __m256i *const ker_1) { + const __m256i tmp_0 = _mm256_madd_epi16(*src_0, *ker_0); + const __m256i tmp_1 = _mm256_madd_epi16(*src_1, *ker_1); + return _mm256_add_epi32(tmp_0, tmp_1); +} + #undef MM256_BROADCASTSI128_SI256 #endif // VPX_VPX_DSP_X86_CONVOLVE_AVX2_H_ diff --git a/vpx_dsp/x86/convolve_sse2.h b/vpx_dsp/x86/convolve_sse2.h index 81fae2951..844354639 100644 --- a/vpx_dsp/x86/convolve_sse2.h +++ b/vpx_dsp/x86/convolve_sse2.h @@ -32,10 +32,10 @@ static INLINE __m128i extract_quarter_3_epi16_sse2(const __m128i *const reg) { // Interprets src as 8-bit words, zero extends to form 16-bit words, then // multiplies with ker and add the adjacent results to form 32-bit words. // Finally adds the result from 1 and 2 together. -static INLINE __m128i multiply_add_epi8_sse2(const __m128i *const src_1, - const __m128i *const src_2, - const __m128i *const ker_1, - const __m128i *const ker_2) { +static INLINE __m128i mm_madd_add_epi8_sse2(const __m128i *const src_1, + const __m128i *const src_2, + const __m128i *const ker_1, + const __m128i *const ker_2) { const __m128i src_1_half = _mm_unpacklo_epi8(*src_1, _mm_setzero_si128()); const __m128i src_2_half = _mm_unpacklo_epi8(*src_2, _mm_setzero_si128()); const __m128i madd_1 = _mm_madd_epi16(src_1_half, *ker_1); @@ -43,25 +43,44 @@ static INLINE __m128i multiply_add_epi8_sse2(const __m128i *const src_1, return _mm_add_epi32(madd_1, madd_2); } -static INLINE __m128i multiply_add_packs_epi16_sse2(const __m128i *const src_0, - const __m128i *const src_1, - const __m128i *const ker) { +// Interprets src as 16-bit words, then multiplies with ker and add the +// adjacent results to form 32-bit words. Finally adds the result from 1 and 2 +// together. +static INLINE __m128i mm_madd_add_epi16_sse2(const __m128i *const src_1, + const __m128i *const src_2, + const __m128i *const ker_1, + const __m128i *const ker_2) { + const __m128i madd_1 = _mm_madd_epi16(*src_1, *ker_1); + const __m128i madd_2 = _mm_madd_epi16(*src_2, *ker_2); + return _mm_add_epi32(madd_1, madd_2); +} + +static INLINE __m128i mm_madd_packs_epi16_sse2(const __m128i *const src_0, + const __m128i *const src_1, + const __m128i *const ker) { const __m128i madd_1 = _mm_madd_epi16(*src_0, *ker); const __m128i madd_2 = _mm_madd_epi16(*src_1, *ker); return _mm_packs_epi32(madd_1, madd_2); } // Interleaves src_1 and src_2 -static INLINE __m128i combine_epi32_sse2(const __m128i *const src_1, - const __m128i *const src_2) { +static INLINE __m128i mm_zip_epi32_sse2(const __m128i *const src_1, + const __m128i *const src_2) { const __m128i tmp_1 = _mm_unpacklo_epi32(*src_1, *src_2); const __m128i tmp_2 = _mm_unpackhi_epi32(*src_1, *src_2); return _mm_packs_epi32(tmp_1, tmp_2); } -static INLINE __m128i round_epi16_sse2(const __m128i *const src, - const __m128i *const half_depth, - const int depth) { +static INLINE __m128i mm_round_epi32_sse2(const __m128i *const src, + const __m128i *const half_depth, + const int depth) { + const __m128i nearest_src = _mm_add_epi32(*src, *half_depth); + return _mm_srai_epi32(nearest_src, depth); +} + +static INLINE __m128i mm_round_epi16_sse2(const __m128i *const src, + const __m128i *const half_depth, + const int depth) { const __m128i nearest_src = _mm_adds_epi16(*src, *half_depth); return _mm_srai_epi16(nearest_src, depth); } diff --git a/vpx_dsp/x86/highbd_convolve_avx2.c b/vpx_dsp/x86/highbd_convolve_avx2.c index ef94522a3..ff5ef5f85 100644 --- a/vpx_dsp/x86/highbd_convolve_avx2.c +++ b/vpx_dsp/x86/highbd_convolve_avx2.c @@ -9,9 +9,9 @@ */ #include <immintrin.h> - #include "./vpx_dsp_rtcd.h" #include "vpx_dsp/x86/convolve.h" +#include "vpx_dsp/x86/convolve_avx2.h" // ----------------------------------------------------------------------------- // Copy and average @@ -209,6 +209,7 @@ static const uint8_t signal_pattern_2[32] = { 6, 7, 8, 9, 8, 9, 10, 11, static const uint32_t signal_index[8] = { 2, 3, 4, 5, 2, 3, 4, 5 }; #define CONV8_ROUNDING_BITS (7) +#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1)) // ----------------------------------------------------------------------------- // Horizontal Filtering @@ -923,6 +924,200 @@ static void vpx_highbd_filter_block1d16_h8_avg_avx2( } while (height > 0); } +static void vpx_highbd_filter_block1d4_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + // We extract the middle four elements of the kernel into two registers in + // the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum. Calling add on the two + // halves gives us the output. Since avx2 allows us to use 256-bit buffer, we + // can do this two rows at a time. + + __m256i src_reg, src_reg_shift_0, src_reg_shift_2; + __m256i res_reg; + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, + 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); + __m256i idx_shift_2 = + _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, + 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + for (h = height; h >= 2; h -= 2) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Get the output + res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Round the result + res_reg = mm256_round_epi32(&res_reg, ®_round, CONV8_ROUNDING_BITS); + + // Finally combine to get the final dst + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_min_epi16(res_reg, reg_max); + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + 4); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Get the output + res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Round the result + res_reg = mm256_round_epi32(&res_reg, ®_round, CONV8_ROUNDING_BITS); + + // Finally combine to get the final dst + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_min_epi16(res_reg, reg_max); + _mm_storel_epi64((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg)); + } +} + +void vpx_highbd_filter_block1d8_h4_avx2(const uint16_t *src_ptr, + ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel, int bd) { + // We will extract the middle four elements of the kernel into two registers + // in the form + // ... k[3] k[2] k[3] k[2] + // ... k[5] k[4] k[5] k[4] + // Then we shuffle the source into + // ... s[1] s[0] s[0] s[-1] + // ... s[3] s[2] s[2] s[1] + // Calling multiply and add gives us half of the sum of the first half. + // Calling add gives us first half of the output. Repat again to get the whole + // output. Since avx2 allows us to use 256-bit buffer, we can do this two rows + // at a time. + + __m256i src_reg, src_reg_shift_0, src_reg_shift_2; + __m256i res_reg, res_first, res_last; + __m256i idx_shift_0 = + _mm256_setr_epi8(0, 1, 2, 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9, 0, 1, 2, + 3, 2, 3, 4, 5, 4, 5, 6, 7, 6, 7, 8, 9); + __m256i idx_shift_2 = + _mm256_setr_epi8(4, 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13, 4, + 5, 6, 7, 6, 7, 8, 9, 8, 9, 10, 11, 10, 11, 12, 13); + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, + kernel_reg_45; // Segments of the kernel used + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t unrolled_src_stride = src_stride << 1; + const ptrdiff_t unrolled_dst_stride = dst_stride << 1; + int h; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + for (h = height; h >= 2; h -= 2) { + // Load the source + src_reg = mm256_loadu2_si128(src_ptr, src_ptr + src_stride); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Result for first half + res_first = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Do again to get the second half of dst + // Load the source + src_reg = mm256_loadu2_si128(src_ptr + 4, src_ptr + src_stride + 4); + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + // Result for second half + res_last = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + // Round each result + res_first = mm256_round_epi32(&res_first, ®_round, CONV8_ROUNDING_BITS); + res_last = mm256_round_epi32(&res_last, ®_round, CONV8_ROUNDING_BITS); + + // Finally combine to get the final dst + res_reg = _mm256_packus_epi32(res_first, res_last); + res_reg = _mm256_min_epi16(res_reg, reg_max); + mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + src_ptr += unrolled_src_stride; + dst_ptr += unrolled_dst_stride; + } + + // Repeat for the last row if needed + if (h > 0) { + src_reg = _mm256_loadu_si256((const __m256i *)src_ptr); + // Reorder into 2 1 1 2 + src_reg = _mm256_permute4x64_epi64(src_reg, 0x94); + + src_reg_shift_0 = _mm256_shuffle_epi8(src_reg, idx_shift_0); + src_reg_shift_2 = _mm256_shuffle_epi8(src_reg, idx_shift_2); + + res_reg = mm256_madd_add_epi32(&src_reg_shift_0, &src_reg_shift_2, + &kernel_reg_23, &kernel_reg_45); + + res_reg = mm256_round_epi32(&res_first, ®_round, CONV8_ROUNDING_BITS); + + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_permute4x64_epi64(res_reg, 0x8); + + _mm_store_si128((__m128i *)dst_ptr, _mm256_castsi256_si128(res_reg)); + } +} + +static void vpx_highbd_filter_block1d16_h4_avx2( + const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) { + vpx_highbd_filter_block1d8_h4_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, kernel, bd); + vpx_highbd_filter_block1d8_h4_avx2(src_ptr + 8, src_stride, dst_ptr + 8, + dst_stride, height, kernel, bd); +} + static void vpx_highbd_filter_block1d8_v8_avg_avx2( const uint16_t *src_ptr, ptrdiff_t src_pitch, uint16_t *dst_ptr, ptrdiff_t dst_pitch, uint32_t height, const int16_t *filter, int bd) { @@ -1058,39 +1253,239 @@ static void vpx_highbd_filter_block1d8_v2_avg_avx2( } while (height > 0); } -void vpx_highbd_filter_block1d4_h8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, - ptrdiff_t, uint32_t, const int16_t *, - int); -void vpx_highbd_filter_block1d4_h2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, - ptrdiff_t, uint32_t, const int16_t *, - int); -void vpx_highbd_filter_block1d4_v8_sse2(const uint16_t *, ptrdiff_t, uint16_t *, - ptrdiff_t, uint32_t, const int16_t *, - int); -void vpx_highbd_filter_block1d4_v2_sse2(const uint16_t *, ptrdiff_t, uint16_t *, - ptrdiff_t, uint32_t, const int16_t *, - int); +void vpx_highbd_filter_block1d4_v4_avx2(const uint16_t *src_ptr, + ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel, int bd) { + // We will load two rows of pixels and rearrange them into the form + // ... s[1,0] s[0,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel partial output. Then + // we can call add with another row to get the output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001, src_reg_1223; + + // Result after multiply and add + __m256i res_reg; + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, kernel_reg_45; // Segments of kernel used + + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // We only need to go num_taps/2 - 1 row above the souce, so we move + // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down + src_ptr += src_stride_unrolled; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_epi64((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001 = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223 = _mm256_unpacklo_epi16(src_reg_12, src_reg_23); + + // Output + res_reg = mm256_madd_add_epi32(&src_reg_m1001, &src_reg_1223, + &kernel_reg_23, &kernel_reg_45); + + // Round the words + res_reg = mm256_round_epi32(&res_reg, ®_round, CONV8_ROUNDING_BITS); + + // Combine to get the result + res_reg = _mm256_packus_epi32(res_reg, res_reg); + res_reg = _mm256_min_epi16(res_reg, reg_max); + + // Save the result + mm256_storeu2_epi64((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001 = src_reg_1223; + src_reg_1 = src_reg_3; + } +} + +void vpx_highbd_filter_block1d8_v4_avx2(const uint16_t *src_ptr, + ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel, int bd) { + // We will load two rows of pixels and rearrange them into the form + // ... s[1,0] s[0,0] s[0,0] s[-1,0] + // so that we can call multiply and add with the kernel partial output. Then + // we can call add with another row to get the output. + + // Register for source s[-1:3, :] + __m256i src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m256i src_reg_m10, src_reg_01, src_reg_12, src_reg_23; + __m256i src_reg_m1001_lo, src_reg_m1001_hi, src_reg_1223_lo, src_reg_1223_hi; + + __m128i kernel_reg_128; // Kernel + __m256i kernel_reg, kernel_reg_23, kernel_reg_45; // Segments of kernel + + // Result after multiply and add + __m256i res_reg, res_reg_lo, res_reg_hi; + + const __m256i reg_round = + _mm256_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m256i reg_max = _mm256_set1_epi16((1 << bd) - 1); + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // We only need to go num_taps/2 - 1 row above the souce, so we move + // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down + src_ptr += src_stride_unrolled; + + // Load Kernel + kernel_reg_128 = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg = _mm256_broadcastsi128_si256(kernel_reg_128); + kernel_reg_23 = _mm256_shuffle_epi32(kernel_reg, 0x55); + kernel_reg_45 = _mm256_shuffle_epi32(kernel_reg, 0xaa); + + // Row -1 to row 0 + src_reg_m10 = mm256_loadu2_si128((const __m128i *)src_ptr, + (const __m128i *)(src_ptr + src_stride)); + + // Row 0 to row 1 + src_reg_1 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2))); + src_reg_01 = _mm256_permute2x128_si256(src_reg_m10, src_reg_1, 0x21); + + // First three rows + src_reg_m1001_lo = _mm256_unpacklo_epi16(src_reg_m10, src_reg_01); + src_reg_m1001_hi = _mm256_unpackhi_epi16(src_reg_m10, src_reg_01); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3))); + + src_reg_12 = _mm256_inserti128_si256(src_reg_1, + _mm256_castsi256_si128(src_reg_2), 1); + + src_reg_3 = _mm256_castsi128_si256( + _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4))); + + src_reg_23 = _mm256_inserti128_si256(src_reg_2, + _mm256_castsi256_si128(src_reg_3), 1); + + // Last three rows + src_reg_1223_lo = _mm256_unpacklo_epi16(src_reg_12, src_reg_23); + src_reg_1223_hi = _mm256_unpackhi_epi16(src_reg_12, src_reg_23); + + // Output from first half + res_reg_lo = mm256_madd_add_epi32(&src_reg_m1001_lo, &src_reg_1223_lo, + &kernel_reg_23, &kernel_reg_45); + + // Output from second half + res_reg_hi = mm256_madd_add_epi32(&src_reg_m1001_hi, &src_reg_1223_hi, + &kernel_reg_23, &kernel_reg_45); + + // Round the words + res_reg_lo = + mm256_round_epi32(&res_reg_lo, ®_round, CONV8_ROUNDING_BITS); + res_reg_hi = + mm256_round_epi32(&res_reg_hi, ®_round, CONV8_ROUNDING_BITS); + + // Combine to get the result + res_reg = _mm256_packus_epi32(res_reg_lo, res_reg_hi); + res_reg = _mm256_min_epi16(res_reg, reg_max); + + // Save the result + mm256_store2_si128((__m128i *)dst_ptr, (__m128i *)(dst_ptr + dst_stride), + &res_reg); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m1001_lo = src_reg_1223_lo; + src_reg_m1001_hi = src_reg_1223_hi; + src_reg_1 = src_reg_3; + } +} + +void vpx_highbd_filter_block1d16_v4_avx2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel, int bd) { + vpx_highbd_filter_block1d8_v4_avx2(src_ptr, src_stride, dst_ptr, dst_stride, + height, kernel, bd); + vpx_highbd_filter_block1d8_v4_avx2(src_ptr + 8, src_stride, dst_ptr + 8, + dst_stride, height, kernel, bd); +} + +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_sse2; + #define vpx_highbd_filter_block1d4_h8_avx2 vpx_highbd_filter_block1d4_h8_sse2 #define vpx_highbd_filter_block1d4_h2_avx2 vpx_highbd_filter_block1d4_h2_sse2 #define vpx_highbd_filter_block1d4_v8_avx2 vpx_highbd_filter_block1d4_v8_sse2 #define vpx_highbd_filter_block1d4_v2_avx2 vpx_highbd_filter_block1d4_v2_sse2 +#define vpx_highbd_filter_block1d16_v4_avg_avx2 \ + vpx_highbd_filter_block1d16_v8_avg_avx2 +#define vpx_highbd_filter_block1d16_h4_avg_avx2 \ + vpx_highbd_filter_block1d16_h8_avg_avx2 +#define vpx_highbd_filter_block1d8_v4_avg_avx2 \ + vpx_highbd_filter_block1d8_v8_avg_avx2 +#define vpx_highbd_filter_block1d8_h4_avg_avx2 \ + vpx_highbd_filter_block1d8_h8_avg_avx2 +#define vpx_highbd_filter_block1d4_v4_avg_avx2 \ + vpx_highbd_filter_block1d4_v8_avg_avx2 +#define vpx_highbd_filter_block1d4_h4_avg_avx2 \ + vpx_highbd_filter_block1d4_h8_avg_avx2 + HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2); HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * 3, , avx2); HIGH_FUN_CONV_2D(, avx2); -void vpx_highbd_filter_block1d4_h8_avg_sse2(const uint16_t *, ptrdiff_t, - uint16_t *, ptrdiff_t, uint32_t, - const int16_t *, int); -void vpx_highbd_filter_block1d4_h2_avg_sse2(const uint16_t *, ptrdiff_t, - uint16_t *, ptrdiff_t, uint32_t, - const int16_t *, int); -void vpx_highbd_filter_block1d4_v8_avg_sse2(const uint16_t *, ptrdiff_t, - uint16_t *, ptrdiff_t, uint32_t, - const int16_t *, int); -void vpx_highbd_filter_block1d4_v2_avg_sse2(const uint16_t *, ptrdiff_t, - uint16_t *, ptrdiff_t, uint32_t, - const int16_t *, int); +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h2_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v2_avg_sse2; + #define vpx_highbd_filter_block1d4_h8_avg_avx2 \ vpx_highbd_filter_block1d4_h8_avg_sse2 #define vpx_highbd_filter_block1d4_h2_avg_avx2 \ diff --git a/vpx_dsp/x86/vpx_asm_stubs.c b/vpx_dsp/x86/vpx_asm_stubs.c index 80c7654d5..9d6f83787 100644 --- a/vpx_dsp/x86/vpx_asm_stubs.c +++ b/vpx_dsp/x86/vpx_asm_stubs.c @@ -104,6 +104,25 @@ highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h8_avg_sse2; highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v8_avg_sse2; highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v4_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h4_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v4_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d8_h4_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_v4_sse2; +highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h4_sse2; +#define vpx_highbd_filter_block1d16_v4_avg_sse2 \ + vpx_highbd_filter_block1d16_v8_avg_sse2 +#define vpx_highbd_filter_block1d16_h4_avg_sse2 \ + vpx_highbd_filter_block1d16_h8_avg_sse2 +#define vpx_highbd_filter_block1d8_v4_avg_sse2 \ + vpx_highbd_filter_block1d8_v8_avg_sse2 +#define vpx_highbd_filter_block1d8_h4_avg_sse2 \ + vpx_highbd_filter_block1d8_h8_avg_sse2 +#define vpx_highbd_filter_block1d4_v4_avg_sse2 \ + vpx_highbd_filter_block1d4_v8_avg_sse2 +#define vpx_highbd_filter_block1d4_h4_avg_sse2 \ + vpx_highbd_filter_block1d4_h8_avg_sse2 + highbd_filter8_1dfunction vpx_highbd_filter_block1d16_v2_sse2; highbd_filter8_1dfunction vpx_highbd_filter_block1d16_h2_sse2; highbd_filter8_1dfunction vpx_highbd_filter_block1d8_v2_sse2; diff --git a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c index fa223aed0..0be2c0fef 100644 --- a/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c +++ b/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c @@ -16,6 +16,9 @@ #include "vpx_dsp/x86/convolve_sse2.h" #include "vpx_ports/mem.h" +#define CONV8_ROUNDING_BITS (7) +#define CONV8_ROUNDING_NUM (1 << (CONV8_ROUNDING_BITS - 1)) + void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, uint8_t *dst_ptr, ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel) { @@ -54,15 +57,15 @@ void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, src_reg_shift_3 = _mm_srli_si128(src_reg, 3); // Output 6 4 2 0 - even = multiply_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, - &kernel_reg_45); + even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); // Output 7 5 3 1 - odd = multiply_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, - &kernel_reg_23, &kernel_reg_45); + odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); // Combine to get the first half of the dst - dst_first = combine_epi32_sse2(&even, &odd); + dst_first = mm_zip_epi32_sse2(&even, &odd); // Do again to get the second half of dst src_reg = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); @@ -71,19 +74,19 @@ void vpx_filter_block1d16_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, src_reg_shift_3 = _mm_srli_si128(src_reg, 3); // Output 14 12 10 8 - even = multiply_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, - &kernel_reg_45); + even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); // Output 15 13 11 9 - odd = multiply_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, - &kernel_reg_23, &kernel_reg_45); + odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); // Combine to get the second half of the dst - dst_second = combine_epi32_sse2(&even, &odd); + dst_second = mm_zip_epi32_sse2(&even, &odd); // Round each result - dst_first = round_epi16_sse2(&dst_first, ®_32, 6); - dst_second = round_epi16_sse2(&dst_second, ®_32, 6); + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); + dst_second = mm_round_epi16_sse2(&dst_second, ®_32, 6); // Finally combine to get the final dst dst_first = _mm_packus_epi16(dst_first, dst_second); @@ -181,21 +184,21 @@ void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, src_reg_23_hi = _mm_unpackhi_epi8(src_reg_2, src_reg_3); // Partial output from first half - res_reg_m10_lo = multiply_add_packs_epi16_sse2( + res_reg_m10_lo = mm_madd_packs_epi16_sse2( &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23); - res_reg_01_lo = multiply_add_packs_epi16_sse2( - &src_reg_01_lo_1, &src_reg_01_lo_2, &kernel_reg_23); + res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2, + &kernel_reg_23); src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128()); src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128()); - res_reg_12_lo = multiply_add_packs_epi16_sse2( - &src_reg_12_lo_1, &src_reg_12_lo_2, &kernel_reg_45); + res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2, + &kernel_reg_45); src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128()); src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128()); - res_reg_23_lo = multiply_add_packs_epi16_sse2( - &src_reg_23_lo_1, &src_reg_23_lo_2, &kernel_reg_45); + res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2, + &kernel_reg_45); // Add to get first half of the results res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); @@ -203,31 +206,31 @@ void vpx_filter_block1d16_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, // Now repeat everything again for the second half // Partial output for second half - res_reg_m10_hi = multiply_add_packs_epi16_sse2( + res_reg_m10_hi = mm_madd_packs_epi16_sse2( &src_reg_m10_hi_1, &src_reg_m10_hi_2, &kernel_reg_23); - res_reg_01_hi = multiply_add_packs_epi16_sse2( - &src_reg_01_hi_1, &src_reg_01_hi_2, &kernel_reg_23); + res_reg_01_hi = mm_madd_packs_epi16_sse2(&src_reg_01_hi_1, &src_reg_01_hi_2, + &kernel_reg_23); src_reg_12_hi_1 = _mm_unpacklo_epi8(src_reg_12_hi, _mm_setzero_si128()); src_reg_12_hi_2 = _mm_unpackhi_epi8(src_reg_12_hi, _mm_setzero_si128()); - res_reg_12_hi = multiply_add_packs_epi16_sse2( - &src_reg_12_hi_1, &src_reg_12_hi_2, &kernel_reg_45); + res_reg_12_hi = mm_madd_packs_epi16_sse2(&src_reg_12_hi_1, &src_reg_12_hi_2, + &kernel_reg_45); src_reg_23_hi_1 = _mm_unpacklo_epi8(src_reg_23_hi, _mm_setzero_si128()); src_reg_23_hi_2 = _mm_unpackhi_epi8(src_reg_23_hi, _mm_setzero_si128()); - res_reg_23_hi = multiply_add_packs_epi16_sse2( - &src_reg_23_hi_1, &src_reg_23_hi_2, &kernel_reg_45); + res_reg_23_hi = mm_madd_packs_epi16_sse2(&src_reg_23_hi_1, &src_reg_23_hi_2, + &kernel_reg_45); // Second half of the results res_reg_m1012_hi = _mm_adds_epi16(res_reg_m10_hi, res_reg_12_hi); res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi); // Round the words - res_reg_m1012_lo = round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); - res_reg_0123_lo = round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); - res_reg_m1012_hi = round_epi16_sse2(&res_reg_m1012_hi, ®_32, 6); - res_reg_0123_hi = round_epi16_sse2(&res_reg_0123_hi, ®_32, 6); + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); + res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, ®_32, 6); + res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, ®_32, 6); // Combine to get the result res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi); @@ -288,16 +291,16 @@ void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, src_reg_shift_3 = _mm_srli_si128(src_reg, 3); // Output 6 4 2 0 - even = multiply_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, - &kernel_reg_45); + even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); // Output 7 5 3 1 - odd = multiply_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, - &kernel_reg_23, &kernel_reg_45); + odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); // Combine to get the first half of the dst - dst_first = combine_epi32_sse2(&even, &odd); - dst_first = round_epi16_sse2(&dst_first, ®_32, 6); + dst_first = mm_zip_epi32_sse2(&even, &odd); + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); // Saturate and convert to 8-bit words dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); @@ -383,29 +386,29 @@ void vpx_filter_block1d8_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3); // Partial output - res_reg_m10_lo = multiply_add_packs_epi16_sse2( + res_reg_m10_lo = mm_madd_packs_epi16_sse2( &src_reg_m10_lo_1, &src_reg_m10_lo_2, &kernel_reg_23); - res_reg_01_lo = multiply_add_packs_epi16_sse2( - &src_reg_01_lo_1, &src_reg_01_lo_2, &kernel_reg_23); + res_reg_01_lo = mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, &src_reg_01_lo_2, + &kernel_reg_23); src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128()); src_reg_12_lo_2 = _mm_unpackhi_epi8(src_reg_12_lo, _mm_setzero_si128()); - res_reg_12_lo = multiply_add_packs_epi16_sse2( - &src_reg_12_lo_1, &src_reg_12_lo_2, &kernel_reg_45); + res_reg_12_lo = mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, &src_reg_12_lo_2, + &kernel_reg_45); src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128()); src_reg_23_lo_2 = _mm_unpackhi_epi8(src_reg_23_lo, _mm_setzero_si128()); - res_reg_23_lo = multiply_add_packs_epi16_sse2( - &src_reg_23_lo_1, &src_reg_23_lo_2, &kernel_reg_45); + res_reg_23_lo = mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, &src_reg_23_lo_2, + &kernel_reg_45); // Add to get results res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo); // Round the words - res_reg_m1012_lo = round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); - res_reg_0123_lo = round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); // Convert to 8-bit words res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, _mm_setzero_si128()); @@ -480,7 +483,7 @@ void vpx_filter_block1d4_h4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, dst_first = _mm_add_epi32(tmp_0, tmp_1); dst_first = _mm_packs_epi32(dst_first, _mm_setzero_si128()); - dst_first = round_epi16_sse2(&dst_first, ®_32, 6); + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); // Saturate and convert to 8-bit words dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); @@ -565,27 +568,27 @@ void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, src_reg_23_lo = _mm_unpacklo_epi8(src_reg_2, src_reg_3); // Partial output - res_reg_m10_lo = multiply_add_packs_epi16_sse2(&src_reg_m10_lo_1, ®_zero, - &kernel_reg_23); + res_reg_m10_lo = + mm_madd_packs_epi16_sse2(&src_reg_m10_lo_1, ®_zero, &kernel_reg_23); - res_reg_01_lo = multiply_add_packs_epi16_sse2(&src_reg_01_lo_1, ®_zero, - &kernel_reg_23); + res_reg_01_lo = + mm_madd_packs_epi16_sse2(&src_reg_01_lo_1, ®_zero, &kernel_reg_23); src_reg_12_lo_1 = _mm_unpacklo_epi8(src_reg_12_lo, _mm_setzero_si128()); - res_reg_12_lo = multiply_add_packs_epi16_sse2(&src_reg_12_lo_1, ®_zero, - &kernel_reg_45); + res_reg_12_lo = + mm_madd_packs_epi16_sse2(&src_reg_12_lo_1, ®_zero, &kernel_reg_45); src_reg_23_lo_1 = _mm_unpacklo_epi8(src_reg_23_lo, _mm_setzero_si128()); - res_reg_23_lo = multiply_add_packs_epi16_sse2(&src_reg_23_lo_1, ®_zero, - &kernel_reg_45); + res_reg_23_lo = + mm_madd_packs_epi16_sse2(&src_reg_23_lo_1, ®_zero, &kernel_reg_45); // Add to get results res_reg_m1012_lo = _mm_adds_epi16(res_reg_m10_lo, res_reg_12_lo); res_reg_0123_lo = _mm_adds_epi16(res_reg_01_lo, res_reg_23_lo); // Round the words - res_reg_m1012_lo = round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); - res_reg_0123_lo = round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); // Convert to 8-bit words res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, reg_zero); @@ -604,3 +607,399 @@ void vpx_filter_block1d4_v4_sse2(const uint8_t *src_ptr, ptrdiff_t src_stride, src_reg_1 = src_reg_3; } } + +void vpx_highbd_filter_block1d4_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel, int bd) { + // We will load multiple shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[2] s[1] s[0] s[-1] + // ... s[4] s[3] s[2] s[1] + // Then we call multiply and add to get partial results + // s[2]k[3]+s[1]k[2] s[0]k[3]s[-1]k[2] + // s[4]k[5]+s[3]k[4] s[2]k[5]s[1]k[4] + // The two results are then added together to get the even output + + __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3; + __m128i res_reg; + __m128i even, odd; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_round = + _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1); + const __m128i reg_zero = _mm_setzero_si128(); + int h; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + for (h = height; h > 0; --h) { + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_shift_1 = _mm_srli_si128(src_reg, 2); + src_reg_shift_2 = _mm_srli_si128(src_reg, 4); + src_reg_shift_3 = _mm_srli_si128(src_reg, 6); + + // Output 2 0 + even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); + + // Output 3 1 + odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); + + // Combine to get the first half of the dst + res_reg = _mm_unpacklo_epi32(even, odd); + res_reg = mm_round_epi32_sse2(&res_reg, ®_round, CONV8_ROUNDING_BITS); + res_reg = _mm_packs_epi32(res_reg, reg_zero); + + // Saturate the result and save + res_reg = _mm_min_epi16(res_reg, reg_max); + res_reg = _mm_max_epi16(res_reg, reg_zero); + _mm_storel_epi64((__m128i *)dst_ptr, res_reg); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void vpx_highbd_filter_block1d4_v4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel, int bd) { + // We will load two rows of pixels as 16-bit words, and shuffle them into the + // form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,7] s[-1,7] s[0,6] s[-1,6] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // ... s[0,13] s[-1,13] s[0,12] s[-1,12] + // so that we can call multiply and add with the kernel to get 32-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10, src_reg_01; + __m128i src_reg_12, src_reg_23; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + // Result after multiply and add + __m128i res_reg_m10, res_reg_01, res_reg_12, res_reg_23; + __m128i res_reg_m1012, res_reg_0123; + + const __m128i reg_round = + _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1); + const __m128i reg_zero = _mm_setzero_si128(); + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // We only need to go num_taps/2 - 1 row above the source, so we move + // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down + src_ptr += src_stride_unrolled; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + // First shuffle the data + src_reg_m1 = _mm_loadl_epi64((const __m128i *)src_ptr); + src_reg_0 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)); + src_reg_m10 = _mm_unpacklo_epi16(src_reg_m1, src_reg_0); + + // More shuffling + src_reg_1 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01 = _mm_unpacklo_epi16(src_reg_0, src_reg_1); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12 = _mm_unpacklo_epi16(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23 = _mm_unpacklo_epi16(src_reg_2, src_reg_3); + + // Partial output + res_reg_m10 = _mm_madd_epi16(src_reg_m10, kernel_reg_23); + res_reg_01 = _mm_madd_epi16(src_reg_01, kernel_reg_23); + res_reg_12 = _mm_madd_epi16(src_reg_12, kernel_reg_45); + res_reg_23 = _mm_madd_epi16(src_reg_23, kernel_reg_45); + + // Add to get results + res_reg_m1012 = _mm_add_epi32(res_reg_m10, res_reg_12); + res_reg_0123 = _mm_add_epi32(res_reg_01, res_reg_23); + + // Round the words + res_reg_m1012 = + mm_round_epi32_sse2(&res_reg_m1012, ®_round, CONV8_ROUNDING_BITS); + res_reg_0123 = + mm_round_epi32_sse2(&res_reg_0123, ®_round, CONV8_ROUNDING_BITS); + + res_reg_m1012 = _mm_packs_epi32(res_reg_m1012, reg_zero); + res_reg_0123 = _mm_packs_epi32(res_reg_0123, reg_zero); + + // Saturate according to bit depth + res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max); + res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max); + res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero); + res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero); + + // Save only half of the register (8 words) + _mm_storel_epi64((__m128i *)dst_ptr, res_reg_m1012); + _mm_storel_epi64((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10 = src_reg_12; + src_reg_01 = src_reg_23; + src_reg_1 = src_reg_3; + } +} + +void vpx_highbd_filter_block1d8_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel, int bd) { + // We will load multiple shifted versions of the row and shuffle them into + // 16-bit words of the form + // ... s[2] s[1] s[0] s[-1] + // ... s[4] s[3] s[2] s[1] + // Then we call multiply and add to get partial results + // s[2]k[3]+s[1]k[2] s[0]k[3]s[-1]k[2] + // s[4]k[5]+s[3]k[4] s[2]k[5]s[1]k[4] + // The two results are then added together for the first half of even + // output. + // Repeat multiple times to get the whole outoput + + __m128i src_reg, src_reg_next, src_reg_shift_1, src_reg_shift_2, + src_reg_shift_3; + __m128i res_reg; + __m128i even, odd; + __m128i tmp_0, tmp_1; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + const __m128i reg_round = + _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1); + const __m128i reg_zero = _mm_setzero_si128(); + int h; + + // Start one pixel before as we need tap/2 - 1 = 1 sample from the past + src_ptr -= 1; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + for (h = height; h > 0; --h) { + // We will put first half in the first half of the reg, and second half in + // second half + src_reg = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_next = _mm_loadu_si128((const __m128i *)(src_ptr + 5)); + + // Output 6 4 2 0 + tmp_0 = _mm_srli_si128(src_reg, 4); + tmp_1 = _mm_srli_si128(src_reg_next, 2); + src_reg_shift_2 = _mm_unpacklo_epi64(tmp_0, tmp_1); + even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23, + &kernel_reg_45); + + // Output 7 5 3 1 + tmp_0 = _mm_srli_si128(src_reg, 2); + tmp_1 = src_reg_next; + src_reg_shift_1 = _mm_unpacklo_epi64(tmp_0, tmp_1); + + tmp_0 = _mm_srli_si128(src_reg, 6); + tmp_1 = _mm_srli_si128(src_reg_next, 4); + src_reg_shift_3 = _mm_unpacklo_epi64(tmp_0, tmp_1); + + odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3, + &kernel_reg_23, &kernel_reg_45); + + // Combine to get the first half of the dst + even = mm_round_epi32_sse2(&even, ®_round, CONV8_ROUNDING_BITS); + odd = mm_round_epi32_sse2(&odd, ®_round, CONV8_ROUNDING_BITS); + res_reg = mm_zip_epi32_sse2(&even, &odd); + + // Saturate the result and save + res_reg = _mm_min_epi16(res_reg, reg_max); + res_reg = _mm_max_epi16(res_reg, reg_zero); + + _mm_store_si128((__m128i *)dst_ptr, res_reg); + + src_ptr += src_stride; + dst_ptr += dst_stride; + } +} + +void vpx_highbd_filter_block1d8_v4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_stride, uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel, int bd) { + // We will load two rows of pixels as 16-bit words, and shuffle them into the + // form + // ... s[0,1] s[-1,1] s[0,0] s[-1,0] + // ... s[0,7] s[-1,7] s[0,6] s[-1,6] + // ... s[0,9] s[-1,9] s[0,8] s[-1,8] + // ... s[0,13] s[-1,13] s[0,12] s[-1,12] + // so that we can call multiply and add with the kernel to get 32-bit words of + // the form + // ... s[0,1]k[3]+s[-1,1]k[2] s[0,0]k[3]+s[-1,0]k[2] + // Finally, we can add multiple rows together to get the desired output. + + // Register for source s[-1:3, :] + __m128i src_reg_m1, src_reg_0, src_reg_1, src_reg_2, src_reg_3; + // Interleaved rows of the source. lo is first half, hi second + __m128i src_reg_m10_lo, src_reg_01_lo, src_reg_m10_hi, src_reg_01_hi; + __m128i src_reg_12_lo, src_reg_23_lo, src_reg_12_hi, src_reg_23_hi; + + // Result after multiply and add + __m128i res_reg_m10_lo, res_reg_01_lo, res_reg_12_lo, res_reg_23_lo; + __m128i res_reg_m10_hi, res_reg_01_hi, res_reg_12_hi, res_reg_23_hi; + __m128i res_reg_m1012, res_reg_0123; + __m128i res_reg_m1012_lo, res_reg_0123_lo; + __m128i res_reg_m1012_hi, res_reg_0123_hi; + + __m128i kernel_reg; // Kernel + __m128i kernel_reg_23, kernel_reg_45; // Segments of the kernel used + + const __m128i reg_round = + _mm_set1_epi32(CONV8_ROUNDING_NUM); // Used for rounding + const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1); + const __m128i reg_zero = _mm_setzero_si128(); + + // We will compute the result two rows at a time + const ptrdiff_t src_stride_unrolled = src_stride << 1; + const ptrdiff_t dst_stride_unrolled = dst_stride << 1; + int h; + + // We only need to go num_taps/2 - 1 row above the source, so we move + // 3 - (num_taps/2 - 1) = 4 - num_taps/2 = 2 back down + src_ptr += src_stride_unrolled; + + // Load Kernel + kernel_reg = _mm_loadu_si128((const __m128i *)kernel); + kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg); + kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg); + + // First shuffle the data + src_reg_m1 = _mm_loadu_si128((const __m128i *)src_ptr); + src_reg_0 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride)); + src_reg_m10_lo = _mm_unpacklo_epi16(src_reg_m1, src_reg_0); + src_reg_m10_hi = _mm_unpackhi_epi16(src_reg_m1, src_reg_0); + + // More shuffling + src_reg_1 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 2)); + src_reg_01_lo = _mm_unpacklo_epi16(src_reg_0, src_reg_1); + src_reg_01_hi = _mm_unpackhi_epi16(src_reg_0, src_reg_1); + + for (h = height; h > 1; h -= 2) { + src_reg_2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 3)); + + src_reg_12_lo = _mm_unpacklo_epi16(src_reg_1, src_reg_2); + src_reg_12_hi = _mm_unpackhi_epi16(src_reg_1, src_reg_2); + + src_reg_3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_stride * 4)); + + src_reg_23_lo = _mm_unpacklo_epi16(src_reg_2, src_reg_3); + src_reg_23_hi = _mm_unpackhi_epi16(src_reg_2, src_reg_3); + + // Partial output for first half + res_reg_m10_lo = _mm_madd_epi16(src_reg_m10_lo, kernel_reg_23); + res_reg_01_lo = _mm_madd_epi16(src_reg_01_lo, kernel_reg_23); + res_reg_12_lo = _mm_madd_epi16(src_reg_12_lo, kernel_reg_45); + res_reg_23_lo = _mm_madd_epi16(src_reg_23_lo, kernel_reg_45); + + // Add to get results + res_reg_m1012_lo = _mm_add_epi32(res_reg_m10_lo, res_reg_12_lo); + res_reg_0123_lo = _mm_add_epi32(res_reg_01_lo, res_reg_23_lo); + + // Round the words + res_reg_m1012_lo = + mm_round_epi32_sse2(&res_reg_m1012_lo, ®_round, CONV8_ROUNDING_BITS); + res_reg_0123_lo = + mm_round_epi32_sse2(&res_reg_0123_lo, ®_round, CONV8_ROUNDING_BITS); + + // Partial output for first half + res_reg_m10_hi = _mm_madd_epi16(src_reg_m10_hi, kernel_reg_23); + res_reg_01_hi = _mm_madd_epi16(src_reg_01_hi, kernel_reg_23); + res_reg_12_hi = _mm_madd_epi16(src_reg_12_hi, kernel_reg_45); + res_reg_23_hi = _mm_madd_epi16(src_reg_23_hi, kernel_reg_45); + + // Add to get results + res_reg_m1012_hi = _mm_add_epi32(res_reg_m10_hi, res_reg_12_hi); + res_reg_0123_hi = _mm_add_epi32(res_reg_01_hi, res_reg_23_hi); + + // Round the words + res_reg_m1012_hi = + mm_round_epi32_sse2(&res_reg_m1012_hi, ®_round, CONV8_ROUNDING_BITS); + res_reg_0123_hi = + mm_round_epi32_sse2(&res_reg_0123_hi, ®_round, CONV8_ROUNDING_BITS); + + // Combine the two halfs + res_reg_m1012 = _mm_packs_epi32(res_reg_m1012_lo, res_reg_m1012_hi); + res_reg_0123 = _mm_packs_epi32(res_reg_0123_lo, res_reg_0123_hi); + + // Saturate according to bit depth + res_reg_m1012 = _mm_min_epi16(res_reg_m1012, reg_max); + res_reg_0123 = _mm_min_epi16(res_reg_0123, reg_max); + res_reg_m1012 = _mm_max_epi16(res_reg_m1012, reg_zero); + res_reg_0123 = _mm_max_epi16(res_reg_0123, reg_zero); + + // Save only half of the register (8 words) + _mm_store_si128((__m128i *)dst_ptr, res_reg_m1012); + _mm_store_si128((__m128i *)(dst_ptr + dst_stride), res_reg_0123); + + // Update the source by two rows + src_ptr += src_stride_unrolled; + dst_ptr += dst_stride_unrolled; + + src_reg_m10_lo = src_reg_12_lo; + src_reg_m10_hi = src_reg_12_hi; + src_reg_01_lo = src_reg_23_lo; + src_reg_01_hi = src_reg_23_hi; + src_reg_1 = src_reg_3; + } +} + +void vpx_highbd_filter_block1d16_h4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel, int bd) { + vpx_highbd_filter_block1d8_h4_sse2(src_ptr, src_stride, dst_ptr, dst_stride, + height, kernel, bd); + vpx_highbd_filter_block1d8_h4_sse2(src_ptr + 8, src_stride, dst_ptr + 8, + dst_stride, height, kernel, bd); +} + +void vpx_highbd_filter_block1d16_v4_sse2(const uint16_t *src_ptr, + ptrdiff_t src_stride, + uint16_t *dst_ptr, + ptrdiff_t dst_stride, uint32_t height, + const int16_t *kernel, int bd) { + vpx_highbd_filter_block1d8_v4_sse2(src_ptr, src_stride, dst_ptr, dst_stride, + height, kernel, bd); + vpx_highbd_filter_block1d8_v4_sse2(src_ptr + 8, src_stride, dst_ptr + 8, + dst_stride, height, kernel, bd); +} diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c index 0ccf89694..b55b7e57a 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c @@ -623,7 +623,7 @@ void vpx_filter_block1d8_h4_avx2(const uint8_t *src_ptr, ptrdiff_t src_stride, _mm256_castsi256_si128(kernel_reg_45)); dst_reg = _mm_adds_epi16(tmp_0, tmp_1); - dst_reg = round_epi16_sse2(&dst_reg, ®_32, 6); + dst_reg = mm_round_epi16_sse2(&dst_reg, ®_32, 6); dst_reg = _mm_packus_epi16(dst_reg, _mm_setzero_si128()); @@ -797,7 +797,7 @@ void vpx_filter_block1d4_h4_avx2(const uint8_t *src_ptr, ptrdiff_t src_stride, dst = _mm_hadds_epi16(dst, _mm_setzero_si128()); // Round result - dst = round_epi16_sse2(&dst, ®_32, 6); + dst = mm_round_epi16_sse2(&dst, ®_32, 6); // Pack to 8-bits dst = _mm_packus_epi16(dst, _mm_setzero_si128()); diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c index 9e5b73047..b5f6ca57d 100644 --- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c +++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c @@ -246,8 +246,8 @@ void vpx_filter_block1d16_h4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride, dst_second = _mm_adds_epi16(tmp_0, tmp_1); // Round each result - dst_first = round_epi16_sse2(&dst_first, ®_32, 6); - dst_second = round_epi16_sse2(&dst_second, ®_32, 6); + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); + dst_second = mm_round_epi16_sse2(&dst_second, ®_32, 6); // Finally combine to get the final dst dst_first = _mm_packus_epi16(dst_first, dst_second); @@ -348,10 +348,10 @@ void vpx_filter_block1d16_v4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride, res_reg_0123_hi = _mm_adds_epi16(res_reg_01_hi, res_reg_23_hi); // Round the words - res_reg_m1012_lo = round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); - res_reg_0123_lo = round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); - res_reg_m1012_hi = round_epi16_sse2(&res_reg_m1012_hi, ®_32, 6); - res_reg_0123_hi = round_epi16_sse2(&res_reg_0123_hi, ®_32, 6); + res_reg_m1012_lo = mm_round_epi16_sse2(&res_reg_m1012_lo, ®_32, 6); + res_reg_0123_lo = mm_round_epi16_sse2(&res_reg_0123_lo, ®_32, 6); + res_reg_m1012_hi = mm_round_epi16_sse2(&res_reg_m1012_hi, ®_32, 6); + res_reg_0123_hi = mm_round_epi16_sse2(&res_reg_0123_hi, ®_32, 6); // Combine to get the result res_reg_m1012 = _mm_packus_epi16(res_reg_m1012_lo, res_reg_m1012_hi); @@ -421,7 +421,7 @@ void vpx_filter_block1d8_h4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride, dst_first = _mm_adds_epi16(tmp_0, tmp_1); // Round round result - dst_first = round_epi16_sse2(&dst_first, ®_32, 6); + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); // Pack to 8-bits dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); @@ -504,8 +504,8 @@ void vpx_filter_block1d8_v4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride, res_reg_0123 = _mm_adds_epi16(res_reg_01, res_reg_23); // Round the words - res_reg_m1012 = round_epi16_sse2(&res_reg_m1012, ®_32, 6); - res_reg_0123 = round_epi16_sse2(&res_reg_0123, ®_32, 6); + res_reg_m1012 = mm_round_epi16_sse2(&res_reg_m1012, ®_32, 6); + res_reg_0123 = mm_round_epi16_sse2(&res_reg_0123, ®_32, 6); // Pack from 16-bit to 8-bit res_reg_m1012 = _mm_packus_epi16(res_reg_m1012, _mm_setzero_si128()); @@ -563,7 +563,7 @@ void vpx_filter_block1d4_h4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride, dst_first = _mm_hadds_epi16(dst_first, _mm_setzero_si128()); // Round result - dst_first = round_epi16_sse2(&dst_first, ®_32, 6); + dst_first = mm_round_epi16_sse2(&dst_first, ®_32, 6); // Pack to 8-bits dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128()); @@ -648,8 +648,8 @@ void vpx_filter_block1d4_v4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_stride, reg_1 = _mm_hadds_epi16(reg_1, _mm_setzero_si128()); // Round the words - reg_0 = round_epi16_sse2(®_0, ®_32, 6); - reg_1 = round_epi16_sse2(®_1, ®_32, 6); + reg_0 = mm_round_epi16_sse2(®_0, ®_32, 6); + reg_1 = mm_round_epi16_sse2(®_1, ®_32, 6); // Pack from 16-bit to 8-bit and put them in the right order reg_0 = _mm_packus_epi16(reg_0, reg_0); |