diff options
Diffstat (limited to 'vp8/common')
-rw-r--r-- | vp8/common/blockd.h | 23 | ||||
-rw-r--r-- | vp8/common/findnearmv.c | 137 | ||||
-rw-r--r-- | vp8/common/mvref_common.c | 303 | ||||
-rw-r--r-- | vp8/common/mvref_common.h | 37 | ||||
-rw-r--r-- | vp8/common/recon.h | 8 | ||||
-rw-r--r-- | vp8/common/reconinter.h | 9 | ||||
-rw-r--r-- | vp8/common/reconintra.c | 246 | ||||
-rw-r--r-- | vp8/common/rtcd_defs.sh | 8 | ||||
-rw-r--r-- | vp8/common/x86/filter_sse2.c | 289 | ||||
-rw-r--r-- | vp8/common/x86/filter_sse4.c | 3 |
10 files changed, 848 insertions, 215 deletions
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index 4e5d9e813..46d002af9 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -44,6 +44,9 @@ void vpx_log(const char *format, ...); /* Segment Feature Masks */ #define SEGMENT_DELTADATA 0 #define SEGMENT_ABSDATA 1 +#if CONFIG_NEW_MVREF +#define MAX_MV_REFS 10 +#endif typedef struct { int r, c; @@ -179,6 +182,14 @@ typedef enum { B_MODE_COUNT } B_PREDICTION_MODE; +#if CONFIG_NEW_MVREF +// Segment level features. +typedef enum { + FIRST_REF = 0, + SECOND_REF = 1 +} MV_REF_TYPE; +#endif + #if CONFIG_HYBRIDTRANSFORM8X8 // convert MB_PREDICTION_MODE to B_PREDICTION_MODE static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) { @@ -268,9 +279,14 @@ typedef struct { MV_REFERENCE_FRAME ref_frame, second_ref_frame; TX_SIZE txfm_size; int_mv mv[2]; // for each reference frame used -#if CONFIG_NEWBESTREFMV +#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF int_mv ref_mv, second_ref_mv; #endif +#if CONFIG_NEW_MVREF + int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REFS]; + int mv_ref_index[MAX_REF_FRAMES]; +#endif + unsigned char partitioning; unsigned char mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */ unsigned char need_to_clamp_mvs; @@ -432,9 +448,14 @@ typedef struct MacroBlockD { #endif int mb_index; // Index of the MB in the SB (0..3) + #if CONFIG_NEWBESTREFMV +#if CONFIG_NEW_MVREF + int_mv ref_mv[MAX_MV_REFS]; +#else int_mv ref_mv[4]; #endif +#endif #if CONFIG_HYBRIDTRANSFORM int q_index; diff --git a/vp8/common/findnearmv.c b/vp8/common/findnearmv.c index 6f7361dd0..694f4cc32 100644 --- a/vp8/common/findnearmv.c +++ b/vp8/common/findnearmv.c @@ -200,6 +200,139 @@ vp8_prob *vp8_mv_ref_probs(VP8_COMMON *pc, * above and a number cols of pixels in the left to select the one with best * score to use as ref motion vector */ + +#if CONFIG_NEW_MVREF + +void vp8_find_best_ref_mvs(MACROBLOCKD *xd, + unsigned char *ref_y_buffer, + int ref_y_stride, + int_mv *best_mv, + 
int_mv *nearest, + int_mv *near) { + int_mv *ref_mv = xd->ref_mv; + int i, j; + unsigned char *above_src; + unsigned char *left_src; + unsigned char *above_ref; + unsigned char *left_ref; + int sad; + int sad_scores[MAX_MV_REFS]; + int_mv sorted_mvs[MAX_MV_REFS]; + int zero_seen = FALSE; + + // Default all to 0,0 if nothing else available + best_mv->as_int = nearest->as_int = near->as_int = 0; + vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs)); + + above_src = xd->dst.y_buffer - xd->dst.y_stride * 2; + left_src = xd->dst.y_buffer - 2; + above_ref = ref_y_buffer - ref_y_stride * 2; + left_ref = ref_y_buffer - 2; + + for(i = 0; i < MAX_MV_REFS; ++i) { + int_mv this_mv; + int offset=0; + int row_offset, col_offset; + + this_mv.as_int = ref_mv[i].as_int; + + // If we see a 0,0 vector for a second time we have reached the end of + // the list of valid candidate vectors. + if (!this_mv.as_int) + if (zero_seen) + break; + else + zero_seen = TRUE; + + vp8_clamp_mv(&this_mv, + xd->mb_to_left_edge - LEFT_TOP_MARGIN + 16, + xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN, + xd->mb_to_top_edge - LEFT_TOP_MARGIN + 16, + xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN); + + row_offset = (this_mv.as_mv.row > 0) ? + ((this_mv.as_mv.row + 3) >> 3):((this_mv.as_mv.row + 4) >> 3); + col_offset = (this_mv.as_mv.col > 0) ? + ((this_mv.as_mv.col + 3) >> 3):((this_mv.as_mv.col + 4) >> 3); + offset = ref_y_stride * row_offset + col_offset; + + sad = vp8_sad16x2_c(above_src, xd->dst.y_stride, + above_ref + offset, ref_y_stride, INT_MAX); + + sad += vp8_sad2x16_c(left_src, xd->dst.y_stride, + left_ref + offset, ref_y_stride, INT_MAX); + + // Add the entry to our list and then resort the list on score. 
+ sad_scores[i] = sad; + sorted_mvs[i].as_int = this_mv.as_int; + j = i; + while (j > 0) { + if (sad_scores[j] < sad_scores[j-1]) { + sad_scores[j] = sad_scores[j-1]; + sorted_mvs[j].as_int = sorted_mvs[j-1].as_int; + sad_scores[j-1] = sad; + sorted_mvs[j-1].as_int = this_mv.as_int; + j--; + } else + break; + } + } + + // If not see add 0,0 as a possibility + /*if ( (i < MAX_MV_REFS) && !zero_seen ) { + + sad = vp8_sad16x2_c(above_src, xd->dst.y_stride, + above_ref, ref_y_stride, + INT_MAX); + sad += vp8_sad2x16_c(left_src, xd->dst.y_stride, + left_ref, ref_y_stride, + INT_MAX); + this_mv.as_int = 0; + + // Add the entry to our list and then resort the list on score. + sad_scores[i] = sad; + sorted_mvs[i].as_int = this_mv.as_int; + j = i; + while (j > 0) { + if (sad_scores[j] < sad_scores[j-1]) { + sad_scores[j] = sad_scores[j-1]; + sorted_mvs[j].as_int = sorted_mvs[j-1].as_int; + sad_scores[j-1] = sad; + sorted_mvs[j-1].as_int = this_mv.as_int; + j--; + } else + break; + } + }*/ + + // Set the best mv to the first entry in the sorted list + best_mv->as_int = sorted_mvs[0].as_int; + + // Provided that there are non zero vectors available there will not + // be more than one 0,0 entry in the sorted list. + // The best ref mv is always set to the first entry (which gave the best + // results. The nearest is set to the first non zero vector if available and + // near to the second non zero vector if avaialable. + // We do not use 0,0 as a nearest or near as 0,0 has its own mode. 
+ if ( sorted_mvs[0].as_int ) { + nearest->as_int = sorted_mvs[0].as_int; + if ( sorted_mvs[1].as_int ) + near->as_int = sorted_mvs[1].as_int; + else + near->as_int = sorted_mvs[2].as_int; + } else { + nearest->as_int = sorted_mvs[1].as_int; + near->as_int = sorted_mvs[2].as_int; + } + + if (!xd->allow_high_precision_mv) + lower_mv_precision(best_mv); + + vp8_clamp_mv2(best_mv, xd); +} + +#else // !CONFIG_NEW_MVREF + void vp8_find_best_ref_mvs(MACROBLOCKD *xd, unsigned char *ref_y_buffer, int ref_y_stride, @@ -270,5 +403,5 @@ void vp8_find_best_ref_mvs(MACROBLOCKD *xd, nearest->as_int = best_mv->as_int; } } - -#endif +#endif // CONFIG_NEW_MVREF +#endif // CONFIG_NEWBESTREFMV diff --git a/vp8/common/mvref_common.c b/vp8/common/mvref_common.c new file mode 100644 index 000000000..1c345dba5 --- /dev/null +++ b/vp8/common/mvref_common.c @@ -0,0 +1,303 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. 
+ */ + +#include "mvref_common.h" + +#if CONFIG_NEW_MVREF + +#define MVREF_NEIGHBOURS 8 +static int mv_ref_search[MVREF_NEIGHBOURS][2] = + { {0,-1},{-1,0},{-1,-1},{0,-2},{-2,0},{-1,-2},{-2,-1},{-2,-2} }; +static int ref_distance_weight[MVREF_NEIGHBOURS] = + { 3,3,2,1,1,1,1,1 }; + //{ 4,4,2,1,1,1,1,1 }; + +// clamp_mv +#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units +static void clamp_mv(const MACROBLOCKD *xd, int_mv *mv) { + + if (mv->as_mv.col < (xd->mb_to_left_edge - MV_BORDER)) + mv->as_mv.col = xd->mb_to_left_edge - MV_BORDER; + else if (mv->as_mv.col > xd->mb_to_right_edge + MV_BORDER) + mv->as_mv.col = xd->mb_to_right_edge + MV_BORDER; + + if (mv->as_mv.row < (xd->mb_to_top_edge - MV_BORDER)) + mv->as_mv.row = xd->mb_to_top_edge - MV_BORDER; + else if (mv->as_mv.row > xd->mb_to_bottom_edge + MV_BORDER) + mv->as_mv.row = xd->mb_to_bottom_edge + MV_BORDER; +} + +// Code for selecting / building and entropy coding a motion vector reference +// Returns a seperation value for two vectors. +// This is taken as the sum of the abs x and y difference. 
+unsigned int mv_distance(int_mv *mv1, int_mv *mv2) { + return (abs(mv1->as_mv.row - mv2->as_mv.row) + + abs(mv1->as_mv.col - mv2->as_mv.col)); +} + +// Gets a best matching candidate refenence motion vector +// from the given mode info structure (if available) +int get_candidate_mvref( + const MODE_INFO *candidate_mi, + MV_REFERENCE_FRAME ref_frame, + MV_REFERENCE_FRAME *candidate_ref_frame, + int_mv *candidate_mv +) { + + int ret_val = FALSE; + + if (ref_frame == candidate_mi->mbmi.ref_frame) { + candidate_mv->as_int = candidate_mi->mbmi.mv[FIRST_REF].as_int; + *candidate_ref_frame = ref_frame; + ret_val = TRUE; + + } else if (ref_frame == candidate_mi->mbmi.second_ref_frame) { + candidate_mv->as_int = candidate_mi->mbmi.mv[SECOND_REF].as_int; + *candidate_ref_frame = ref_frame; + ret_val = TRUE; + + } else if (candidate_mi->mbmi.ref_frame != INTRA_FRAME) { + candidate_mv->as_int = candidate_mi->mbmi.mv[FIRST_REF].as_int; + *candidate_ref_frame = candidate_mi->mbmi.ref_frame; + ret_val = TRUE; + + } else if (candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) { + candidate_mv->as_int = candidate_mi->mbmi.mv[SECOND_REF].as_int; + *candidate_ref_frame = candidate_mi->mbmi.second_ref_frame; + ret_val = TRUE; + } + + return ret_val; +} + +// Performs mv adjustment based on reference frame and clamps the MV +// if it goes off the edge of the buffer. +void scale_mv( + MACROBLOCKD *xd, + MV_REFERENCE_FRAME this_ref_frame, + MV_REFERENCE_FRAME candidate_ref_frame, + int_mv *candidate_mv, + int *ref_sign_bias +) { + + if (candidate_ref_frame != this_ref_frame) { + + //int frame_distances[MAX_REF_FRAMES]; + //int last_distance = 1; + //int gf_distance = xd->frames_since_golden; + //int arf_distance = xd->frames_till_alt_ref_frame; + + // Sign inversion where appropriate. 
+ if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) { + candidate_mv->as_mv.row = -candidate_mv->as_mv.row; + candidate_mv->as_mv.col = -candidate_mv->as_mv.col; + } + + // Scale based on frame distance if the reference frames not the same. + /*frame_distances[INTRA_FRAME] = 1; // should never be used + frame_distances[LAST_FRAME] = 1; + frame_distances[GOLDEN_FRAME] = + (xd->frames_since_golden) ? xd->frames_since_golden : 1; + frame_distances[ALTREF_FRAME] = + (xd->frames_till_alt_ref_frame) ? xd->frames_till_alt_ref_frame : 1; + + if (frame_distances[this_ref_frame] && + frame_distances[candidate_ref_frame]) { + candidate_mv->as_mv.row = + (short)(((int)(candidate_mv->as_mv.row) * + frame_distances[this_ref_frame]) / + frame_distances[candidate_ref_frame]); + + candidate_mv->as_mv.col = + (short)(((int)(candidate_mv->as_mv.col) * + frame_distances[this_ref_frame]) / + frame_distances[candidate_ref_frame]); + } + */ + } + + // Clamp the MV so it does not point out of the frame buffer + clamp_mv(xd, candidate_mv); +} + +// Adds a new candidate reference vector to the list if indeed it is new. +// If it is not new then the score of the existing candidate that it matches +// is increased and the list is resorted. +void addmv_and_shuffle( + int_mv *mv_list, + int *mv_scores, + int *index, + int_mv candidate_mv, + int weight +) { + + int i = *index; + int duplicate_found = FALSE; + + // Check for duplicates. If there is one increment its score. + while (i > 0) { + i--; + if (candidate_mv.as_int == mv_list[i].as_int) { + duplicate_found = TRUE; + mv_scores[i] += weight; + break; + } + } + + // If no duplicate was found add the new vector and give it a weight + if (!duplicate_found) { + mv_list[*index].as_int = candidate_mv.as_int; + mv_scores[*index] = weight; + i = *index; + (*index)++; + } + + // Reshuffle the list so that highest scoring mvs at the top. 
+ while (i > 0) { + if (mv_scores[i] > mv_scores[i-1]) { + int tmp_score = mv_scores[i-1]; + int_mv tmp_mv = mv_list[i-1]; + + mv_scores[i-1] = mv_scores[i]; + mv_list[i-1] = mv_list[i]; + mv_scores[i] = tmp_score; + mv_list[i] = tmp_mv; + i--; + } else + break; + } +} + + +// Measure the distance of each reference candidate from the actual +// residual vector and return the nearest +unsigned int pick_best_mv_ref( int_mv target_mv, + int_mv * mv_ref_list, + int_mv * best_ref ) { + + int i; + int best_index = 0; + unsigned int distance, distance2; + + distance = mv_distance(&target_mv, &mv_ref_list[0]); + + for (i = 1; i < MAX_MV_REFS; ++i ) { + distance2 = + mv_distance(&target_mv, &mv_ref_list[i]); + if (distance2 < distance) { + distance = distance2; + best_index = i; + } + } + + (*best_ref).as_int = mv_ref_list[best_index].as_int; + + return best_index; +} + +// This function searches the neighbourhood of a given MB/SB and populates a +// list of candidate reference vectors. +// +void find_mv_refs( + MACROBLOCKD *xd, + MODE_INFO *here, + MODE_INFO *lf_here, + MV_REFERENCE_FRAME ref_frame, + int_mv *mv_ref_list, + int *ref_sign_bias +) { + + int i; + MODE_INFO *candidate_mi; + int_mv candidate_mvs[MAX_MV_REFS]; + int_mv c_refmv; + MV_REFERENCE_FRAME c_ref_frame; + int candidate_scores[MAX_MV_REFS]; + int index = 0; + int ref_weight = 0; + int valid_mv_ref; + + // Blank the reference vector lists and other local structures. + vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REFS); + vpx_memset(candidate_mvs, 0, sizeof(int_mv) * MAX_MV_REFS); + vpx_memset(candidate_scores, 0, sizeof(candidate_scores)); + + // Populate a list with candidate reference vectors from the + // spatial neighbours. 
+ for (i = 0; i < 2; ++i) { + if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) && + ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) { + + candidate_mi = here + mv_ref_search[i][0] + + (mv_ref_search[i][1] * xd->mode_info_stride); + + valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame, + &c_ref_frame, &c_refmv); + + if (valid_mv_ref) { + scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias ); + ref_weight = ref_distance_weight[i] + + ((c_ref_frame == ref_frame) << 3); + + addmv_and_shuffle(candidate_mvs, candidate_scores, + &index, c_refmv, ref_weight); + } + } + } + + // Look at the corresponding vector in the last frame + candidate_mi = lf_here; + valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame, + &c_ref_frame, &c_refmv); + if (valid_mv_ref) { + scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias ); + ref_weight = 2 + ((c_ref_frame == ref_frame) << 3); + addmv_and_shuffle(candidate_mvs, candidate_scores, + &index, c_refmv, ref_weight); + } + + // Populate a list with candidate reference vectors from the + // spatial neighbours. + for (i = 2; i < MVREF_NEIGHBOURS; ++i) { + if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) && + ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) { + + candidate_mi = here + mv_ref_search[i][0] + + (mv_ref_search[i][1] * xd->mode_info_stride); + + valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame, + &c_ref_frame, &c_refmv); + + if (valid_mv_ref) { + scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias ); + ref_weight = ref_distance_weight[i] + + ((c_ref_frame == ref_frame) << 3); + + addmv_and_shuffle(candidate_mvs, candidate_scores, + &index, c_refmv, ref_weight); + } + } + } + + // 0,0 is always a valid reference. + for (i = 0; i < index; ++i) + if (candidate_mvs[i].as_int == 0) + break; + if (i == index) { + c_refmv.as_int = 0; + addmv_and_shuffle(candidate_mvs, candidate_scores, + &index, c_refmv, 1); + } + + // Copy over the candidate list. 
+ vpx_memcpy(mv_ref_list, candidate_mvs, sizeof(candidate_mvs)); +} + +#endif diff --git a/vp8/common/mvref_common.h b/vp8/common/mvref_common.h new file mode 100644 index 000000000..9be408894 --- /dev/null +++ b/vp8/common/mvref_common.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "onyxc_int.h" +#include "blockd.h" + +// MR reference entropy header file. +#if CONFIG_NEW_MVREF + +#ifndef __INC_MVREF_COMMON_H +#define __INC_MVREF_COMMON_H + +unsigned int mv_distance(int_mv *mv1, int_mv *mv2); + +unsigned int pick_best_mv_ref( int_mv target_mv, + int_mv * mv_ref_list, + int_mv * best_ref ); + +void find_mv_refs( + MACROBLOCKD *xd, + MODE_INFO *here, + MODE_INFO *lf_here, + MV_REFERENCE_FRAME ref_frame, + int_mv * mv_ref_list, + int *ref_sign_bias +); + +#endif + +#endif diff --git a/vp8/common/recon.h b/vp8/common/recon.h index 3527fc14d..0bb5c8863 100644 --- a/vp8/common/recon.h +++ b/vp8/common/recon.h @@ -262,4 +262,12 @@ typedef struct vp8_recon_rtcd_vtable { void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, MACROBLOCKD *xd); + +#if CONFIG_SUPERBLOCKS +extern void vp8_recon_mby_s_c(const vp8_recon_rtcd_vtable_t *rtcd, + MACROBLOCKD *xd, uint8_t *dst); +extern void vp8_recon_mbuv_s_c(const vp8_recon_rtcd_vtable_t *rtcd, + MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst); +#endif + #endif diff --git a/vp8/common/reconinter.h b/vp8/common/reconinter.h index 7ad0adbd4..37e34b5e1 100644 --- a/vp8/common/reconinter.h +++ b/vp8/common/reconinter.h @@ -45,6 +45,15 @@ extern void vp8_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd, int dst_ystride, int dst_uvstride); 
+#if CONFIG_SUPERBLOCKS +extern void vp8_build_inter32x32_predictors_sb(MACROBLOCKD *x, + unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_ystride, + int dst_uvstride); +#endif + extern void vp8_build_inter_predictors_mb(MACROBLOCKD *xd); extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, diff --git a/vp8/common/reconintra.c b/vp8/common/reconintra.c index e391fa9be..cad9652b7 100644 --- a/vp8/common/reconintra.c +++ b/vp8/common/reconintra.c @@ -207,10 +207,10 @@ void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd, } } -void vp8_build_intra_predictors_internal(MACROBLOCKD *xd, - unsigned char *src, int src_stride, +void vp8_build_intra_predictors_internal(unsigned char *src, int src_stride, unsigned char *ypred_ptr, - int y_stride, int mode, int bsize) { + int y_stride, int mode, int bsize, + int up_available, int left_available) { unsigned char *yabove_row = src - src_stride; unsigned char yleft_col[32]; @@ -218,7 +218,7 @@ void vp8_build_intra_predictors_internal(MACROBLOCKD *xd, int r, c, i; for (i = 0; i < bsize; i++) { - yleft_col[i] = xd->dst.y_buffer [i * src_stride - 1]; + yleft_col[i] = src[i * src_stride - 1]; } /* for Y */ @@ -230,8 +230,10 @@ void vp8_build_intra_predictors_internal(MACROBLOCKD *xd, int average = 0; int log2_bsize_minus_1; - assert(bsize == 8 || bsize == 16 || bsize == 32); - if (bsize == 8) { + assert(bsize == 4 || bsize == 8 || bsize == 16 || bsize == 32); + if (bsize == 4) { + log2_bsize_minus_1 = 1; + } else if (bsize == 8) { log2_bsize_minus_1 = 2; } else if (bsize == 16) { log2_bsize_minus_1 = 3; @@ -239,19 +241,19 @@ void vp8_build_intra_predictors_internal(MACROBLOCKD *xd, log2_bsize_minus_1 = 4; } - if (xd->up_available || xd->left_available) { - if (xd->up_available) { + if (up_available || left_available) { + if (up_available) { for (i = 0; i < bsize; i++) { average += yabove_row[i]; } } - if (xd->left_available) { + if (left_available) { for (i = 0; i < bsize; i++) { 
average += yleft_col[i]; } } - shift = log2_bsize_minus_1 + xd->up_available + xd->left_available; + shift = log2_bsize_minus_1 + up_available + left_available; expected_dc = (average + (1 << (shift - 1))) >> shift; } else { expected_dc = 128; @@ -332,22 +334,25 @@ void vp8_build_intra_predictors_internal(MACROBLOCKD *xd, } void vp8_build_intra_predictors_mby(MACROBLOCKD *xd) { - vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride, + vp8_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, xd->predictor, 16, - xd->mode_info_context->mbmi.mode, 16); + xd->mode_info_context->mbmi.mode, 16, + xd->up_available, xd->left_available); } void vp8_build_intra_predictors_mby_s(MACROBLOCKD *xd) { - vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride, + vp8_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, xd->dst.y_buffer, xd->dst.y_stride, - xd->mode_info_context->mbmi.mode, 16); + xd->mode_info_context->mbmi.mode, 16, + xd->up_available, xd->left_available); } #if CONFIG_SUPERBLOCKS -void vp8_build_intra_predictors_sby_s(MACROBLOCKD *x) { - vp8_build_intra_predictors_internal(x, x->dst.y_buffer, x->dst.y_stride, - x->dst.y_buffer, x->dst.y_stride, - x->mode_info_context->mbmi.mode, 32); +void vp8_build_intra_predictors_sby_s(MACROBLOCKD *xd) { + vp8_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, + xd->dst.y_buffer, xd->dst.y_stride, + xd->mode_info_context->mbmi.mode, 32, + xd->up_available, xd->left_available); } #endif @@ -356,14 +361,16 @@ void vp8_build_comp_intra_predictors_mby(MACROBLOCKD *xd) { unsigned char predictor[2][256]; int i; - vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride, + vp8_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, predictor[0], 16, xd->mode_info_context->mbmi.mode, - 16); - vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride, + 16, xd->up_available, + xd->left_available); + 
vp8_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride, predictor[1], 16, xd->mode_info_context->mbmi.second_mode, - 16); + 16, xd->up_available, + xd->left_available); for (i = 0; i < 256; i++) { xd->predictor[i] = (predictor[0][i] + predictor[1][i] + 1) >> 1; @@ -376,10 +383,12 @@ void vp8_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd, unsigned char *vpred_ptr, int uv_stride, int mode, int bsize) { - vp8_build_intra_predictors_internal(xd, xd->dst.u_buffer, xd->dst.uv_stride, - upred_ptr, uv_stride, mode, bsize); - vp8_build_intra_predictors_internal(xd, xd->dst.v_buffer, xd->dst.uv_stride, - vpred_ptr, uv_stride, mode, bsize); + vp8_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride, + upred_ptr, uv_stride, mode, bsize, + xd->up_available, xd->left_available); + vp8_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride, + vpred_ptr, uv_stride, mode, bsize, + xd->up_available, xd->left_available); } void vp8_build_intra_predictors_mbuv(MACROBLOCKD *xd) { @@ -428,95 +437,9 @@ void vp8_build_comp_intra_predictors_mbuv(MACROBLOCKD *xd) { void vp8_intra8x8_predict(BLOCKD *xd, int mode, unsigned char *predictor) { - - unsigned char *yabove_row = *(xd->base_dst) + xd->dst - xd->dst_stride; - unsigned char yleft_col[8]; - unsigned char ytop_left = yabove_row[-1]; - int r, c, i; - - for (i = 0; i < 8; i++) { - yleft_col[i] = (*(xd->base_dst))[xd->dst - 1 + i * xd->dst_stride]; - } - switch (mode) { - case DC_PRED: { - int expected_dc = 0; - - for (i = 0; i < 8; i++) { - expected_dc += yabove_row[i]; - expected_dc += yleft_col[i]; - } - expected_dc = (expected_dc + 8) >> 4; - - for (r = 0; r < 8; r++) { - for (c = 0; c < 8; c++) { - predictor[c] = expected_dc; - } - predictor += 16; - } - } - break; - case V_PRED: { - for (r = 0; r < 8; r++) { - for (c = 0; c < 8; c++) { - predictor[c] = yabove_row[c]; - } - predictor += 16; - } - - } - break; - case H_PRED: { - - for (r = 0; r < 8; r++) { - for (c = 0; c < 8; c++) { - 
predictor[c] = yleft_col[r]; - } - predictor += 16; - } - } - break; - case TM_PRED: { - /* prediction similar to true_motion prediction */ - for (r = 0; r < 8; r++) { - for (c = 0; c < 8; c++) { - int pred = yabove_row[c] - ytop_left + yleft_col[r]; - if (pred < 0) - pred = 0; - - if (pred > 255) - pred = 255; - predictor[c] = pred; - } - - predictor += 16; - } - } - break; - case D45_PRED: { - d45_predictor(predictor, 16, 8, yabove_row, yleft_col); - } - break; - case D135_PRED: { - d135_predictor(predictor, 16, 8, yabove_row, yleft_col); - } - break; - case D117_PRED: { - d117_predictor(predictor, 16, 8, yabove_row, yleft_col); - } - break; - case D153_PRED: { - d153_predictor(predictor, 16, 8, yabove_row, yleft_col); - } - break; - case D27_PRED: { - d27_predictor(predictor, 16, 8, yabove_row, yleft_col); - } - break; - case D63_PRED: { - d63_predictor(predictor, 16, 8, yabove_row, yleft_col); - } - break; - } + vp8_build_intra_predictors_internal(*(xd->base_dst) + xd->dst, + xd->dst_stride, predictor, 16, + mode, 8, 1, 1); } #if CONFIG_COMP_INTRA_PRED @@ -540,96 +463,9 @@ void vp8_comp_intra8x8_predict(BLOCKD *xd, void vp8_intra_uv4x4_predict(BLOCKD *xd, int mode, unsigned char *predictor) { - - unsigned char *above_row = *(xd->base_dst) + xd->dst - xd->dst_stride; - unsigned char left_col[4]; - unsigned char top_left = above_row[-1]; - int r, c, i; - - for (i = 0; i < 4; i++) { - left_col[i] = (*(xd->base_dst))[xd->dst - 1 + i * xd->dst_stride]; - } - switch (mode) { - case DC_PRED: { - int expected_dc = 0; - - for (i = 0; i < 4; i++) { - expected_dc += above_row[i]; - expected_dc += left_col[i]; - } - expected_dc = (expected_dc + 4) >> 3; - - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - predictor[c] = expected_dc; - } - predictor += 8; - } - } - break; - case V_PRED: { - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - - predictor[c] = above_row[c]; - } - predictor += 8; - } - - } - break; - case H_PRED: { - - for (r = 0; r < 4; r++) { - 
for (c = 0; c < 4; c++) { - predictor[c] = left_col[r]; - } - predictor += 8; - } - } - break; - case TM_PRED: { - /* prediction similar to true_motion prediction */ - for (r = 0; r < 4; r++) { - for (c = 0; c < 4; c++) { - int pred = above_row[c] - top_left + left_col[r]; - if (pred < 0) - pred = 0; - - if (pred > 255) - pred = 255; - predictor[c] = pred; - } - - predictor += 8; - } - } - break; - case D45_PRED: { - d45_predictor(predictor, 8, 4, above_row, left_col); - } - break; - case D135_PRED: { - d135_predictor(predictor, 8, 4, above_row, left_col); - } - break; - case D117_PRED: { - d117_predictor(predictor, 8, 4, above_row, left_col); - } - break; - case D153_PRED: { - d153_predictor(predictor, 8, 4, above_row, left_col); - } - break; - case D27_PRED: { - d27_predictor(predictor, 8, 4, above_row, left_col); - } - break; - case D63_PRED: { - d63_predictor(predictor, 8, 4, above_row, left_col); - } - break; - } + vp8_build_intra_predictors_internal(*(xd->base_dst) + xd->dst, + xd->dst_stride, predictor, 8, + mode, 4, 1, 1); } #if CONFIG_COMP_INTRA_PRED diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh index 1cb5de311..66029f88e 100644 --- a/vp8/common/rtcd_defs.sh +++ b/vp8/common/rtcd_defs.sh @@ -14,8 +14,8 @@ prototype void vp8_filter_block2d_16x16_8 "const unsigned char *src_ptr, const u # compiles warning free but a dissassembly of generated code show bugs. To be # on the safe side, only enabled when compiled with 'gcc'. 
if [ "$CONFIG_GCC" = "yes" ]; then - specialize vp8_filter_block2d_4x4_8 sse4_1 - specialize vp8_filter_block2d_8x4_8 sse4_1 - specialize vp8_filter_block2d_8x8_8 sse4_1 - specialize vp8_filter_block2d_16x16_8 sse4_1 + specialize vp8_filter_block2d_4x4_8 sse4_1 sse2 + specialize vp8_filter_block2d_8x4_8 sse4_1 sse2 + specialize vp8_filter_block2d_8x8_8 sse4_1 sse2 + specialize vp8_filter_block2d_16x16_8 sse4_1 sse2 fi diff --git a/vp8/common/x86/filter_sse2.c b/vp8/common/x86/filter_sse2.c new file mode 100644 index 000000000..fe57b4e0b --- /dev/null +++ b/vp8/common/x86/filter_sse2.c @@ -0,0 +1,289 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> // for alignment checks +#include <emmintrin.h> // SSE2 +#include "vp8/common/filter.h" +#include "vpx_ports/mem.h" // for DECLARE_ALIGNED +#include "vpx_rtcd.h" + +// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is +// just a quick partial snapshot so that other can already use some +// speedup. +// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap +// filtering. +// TODO(cd): Add some comments, better variable naming. +// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum +// of positive above 128), or have higher precision filter +// coefficients. 
// Rounding bias: VP8_FILTER_WEIGHT / 2 in each 32-bit lane.  It is added to
// the filter accumulator before the arithmetic shift by VP8_FILTER_SHIFT so
// the result is rounded to nearest rather than truncated.
DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
  VP8_FILTER_WEIGHT >> 1,
  VP8_FILTER_WEIGHT >> 1,
  VP8_FILTER_WEIGHT >> 1,
  VP8_FILTER_WEIGHT >> 1,
};

// Creating a macro to do more than four pixels at once to hide instruction
// latency is actually slower :-(
//
// Filters four consecutive output pixels of one row with an 8-tap filter.
// `result` receives four rounded, shifted 32-bit filter sums.  The macro
// relies on `zero`, `rounding` and the replicated coefficient pairs
// fil01/fil23/fil45/fil67 being in scope at the expansion site.
#define DO_FOUR_PIXELS(result, src_ptr, offset) \
  { \
  /* Do shifted loads to achieve the required shuffles through unpacking */ \
  const __m128i src0 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \
  const __m128i src1 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \
  const __m128i src2 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \
  const __m128i src3 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \
  const __m128i src01 = _mm_unpacklo_epi8(src0, src1); \
  const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero); \
  const __m128i src23 = _mm_unpacklo_epi8(src2, src3); \
  const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero); \
  /* Shift by 4 bytes through shuffle to get additional shifted loads */ \
  const __m128i src4 = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1)); \
  const __m128i src5 = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1)); \
  const __m128i src6 = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1)); \
  const __m128i src7 = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1)); \
  const __m128i src45 = _mm_unpacklo_epi8(src4, src5); \
  const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero); \
  const __m128i src67 = _mm_unpacklo_epi8(src6, src7); \
  const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero); \
  /* multiply accumulate them */ \
  const __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \
  const __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \
  const __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \
  const __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \
  const __m128i mad0123 = _mm_add_epi32(mad01, mad23); \
  const __m128i mad4567 = _mm_add_epi32(mad45, mad67); \
  __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \
  /* round to nearest, then scale down by VP8_FILTER_SHIFT */ \
  mad_all = _mm_add_epi32(mad_all, rounding); \
  result = _mm_srai_epi32(mad_all, VP8_FILTER_SHIFT); \
  }

// Apply a separable 2-D 8-tap interpolation filter to produce one 4x4 block
// of output pixels.
//
// src_ptr            points at the source pixel corresponding to the output
//                    origin; the filter footprint extends (kInterp_Extend - 1)
//                    rows/columns before it and kInterp_Extend after, i.e. an
//                    11x11 source region (4 + 8 - 1 in each dimension).
// src_stride         source pitch in bytes.
// HFilter_aligned16  8 horizontal filter taps as 16-bit values; each pass
//                    replicates consecutive tap pairs per 32-bit lane for
//                    _mm_madd_epi16.  Must be 16-byte aligned.
// VFilter_aligned16  8 vertical filter taps, same layout and alignment.
// dst_ptr/dst_stride destination 4x4 block and its pitch.
//
// The horizontal pass filters all 11 needed source rows into byte-packed
// intermediates, which are then transposed so the vertical pass can reuse
// the same row-oriented DO_FOUR_PIXELS kernel.
void vp8_filter_block2d_4x4_8_sse2
(
 const unsigned char *src_ptr, const unsigned int src_stride,
 const short *HFilter_aligned16, const short *VFilter_aligned16,
 unsigned char *dst_ptr, unsigned int dst_stride
) {
  __m128i intermediateA, intermediateB, intermediateC;

  const int kInterp_Extend = 4;

  const __m128i zero = _mm_set1_epi16(0);
  const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);

  // check alignment
  // NOTE(review): casting a pointer to (long) truncates on LLP64 targets
  // (e.g. 64-bit Windows); (uintptr_t) would be the portable spelling.
  // Code left unchanged here.
  assert(0 == ((long)HFilter_aligned16)%16);
  assert(0 == ((long)VFilter_aligned16)%16);

  {
    __m128i transpose3_0;
    __m128i transpose3_1;
    __m128i transpose3_2;
    __m128i transpose3_3;

    // Horizontal pass (src -> intermediate).
    {
      const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
      // Broadcast each pair of adjacent filter coefficients across all four
      // 32-bit lanes, matching the interleaved pixel layout fed to
      // _mm_madd_epi16 in DO_FOUR_PIXELS.
      __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
      __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
      __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
      __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
      // Back up to the top-left corner of the filter footprint.
      src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);

      {
        __m128i mad_all0;
        __m128i mad_all1;
        __m128i mad_all2;
        __m128i mad_all3;
        // Rows 0-3 of the 11 source rows needed for 4 output rows.
        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
        intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
        // --
        src_ptr += src_stride*4;
        // -- rows 4-7
        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
        DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
        mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
        intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
        // --
        src_ptr += src_stride*4;
        // -- rows 8-10: only 3 more rows are needed (4 + 8 - 1 = 11 total),
        //    so the last row is duplicated by the pack below and ignored.
        DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
        DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
        DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
        mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
        mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
        intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
      }
    }

    // Transpose result (intermediate -> transpose3_x)
    {
      // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
      // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
      // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
      const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB);
      const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB);
      const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC);
      const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC);
      // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53
      // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73
      // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx
      // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx
      const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
      const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
      const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3);
      const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3);
      // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63
      // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73
      // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx
      // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx
      const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1);
      const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1);
      const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3);
      const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3);
      // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
      // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx
      // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx
      // _mm_shuffle_ps on bit-cast integer data selects 32-bit lanes; used
      // here as a two-source 4-lane shuffle that SSE2 integer ops lack.
      transpose3_0 = _mm_castps_si128(
        _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
                       _mm_castsi128_ps(transpose2_2),
                       _MM_SHUFFLE(1, 0, 1, 0)));
      transpose3_1 = _mm_castps_si128(
        _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
                       _mm_castsi128_ps(transpose2_2),
                       _MM_SHUFFLE(3, 2, 3, 2)));
      transpose3_2 = _mm_castps_si128(
        _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
                       _mm_castsi128_ps(transpose2_3),
                       _MM_SHUFFLE(1, 0, 1, 0)));
      transpose3_3 = _mm_castps_si128(
        _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
                       _mm_castsi128_ps(transpose2_3),
                       _MM_SHUFFLE(3, 2, 3, 2)));
      // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
      // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
      // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
      // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
    }

    // Vertical pass (transpose3_x -> dst).
    {
      const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
      // Broadcast vertical tap pairs, same layout as the horizontal pass.
      __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
      __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
      __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
      __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
      __m128i col0, col1, col2, col3;
      // Spill each transposed column to memory so DO_FOUR_PIXELS can do its
      // byte-offset loads from it.
      DECLARE_ALIGNED(16, unsigned char, temp[32]);
      {
        _mm_store_si128((__m128i *)temp, transpose3_0);
        DO_FOUR_PIXELS(col0, temp, 0);
      }
      {
        _mm_store_si128((__m128i *)temp, transpose3_1);
        DO_FOUR_PIXELS(col1, temp, 0);
      }
      {
        _mm_store_si128((__m128i *)temp, transpose3_2);
        DO_FOUR_PIXELS(col2, temp, 0);
      }
      {
        _mm_store_si128((__m128i *)temp, transpose3_3);
        DO_FOUR_PIXELS(col3, temp, 0);
      }
      // transpose back to row order for the stores below
      {
        __m128i T0 = _mm_unpacklo_epi32(col0, col1);
        __m128i T1 = _mm_unpacklo_epi32(col2, col3);
        __m128i T2 = _mm_unpackhi_epi32(col0, col1);
        __m128i T3 = _mm_unpackhi_epi32(col2, col3);
        col0 = _mm_unpacklo_epi64(T0, T1);
        col1 = _mm_unpackhi_epi64(T0, T1);
        col2 = _mm_unpacklo_epi64(T2, T3);
        col3 = _mm_unpackhi_epi64(T2, T3);
      }
      // saturate to 8 bit
      {
        col0 = _mm_packs_epi32(col0, col0);
        col0 = _mm_packus_epi16(col0, col0);
        col1 = _mm_packs_epi32(col1, col1);
        col1 = _mm_packus_epi16(col1, col1);
        col2 = _mm_packs_epi32 (col2, col2);
        col2 = _mm_packus_epi16(col2, col2);
        col3 = _mm_packs_epi32 (col3, col3);
        col3 = _mm_packus_epi16(col3, col3);
      }
      // store: one 4-byte row per output line.
      // NOTE(review): writing through (unsigned int *) into a byte buffer is
      // a potentially unaligned, type-punned store; it works on x86 but is
      // formally strict-aliasing UB — confirm against project conventions.
      {
        *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0);
        *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1);
        *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2);
        *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3);
      }
    }
  }
}

// 8-wide x 4-tall filter: tiles the 4x4 kernel.  (Signature continues on the
// following line of this file.)
void vp8_filter_block2d_8x4_8_sse2
(
 const unsigned char *src_ptr, const unsigned int src_stride,
 const short
 *HFilter_aligned16, const short *VFilter_aligned16,
 unsigned char *dst_ptr, unsigned int dst_stride
) {
  int j;
  // Cover the 8x4 output with two 4x4 tiles laid out horizontally; all the
  // real filtering work happens in vp8_filter_block2d_4x4_8_sse2.
  for (j=0; j<8; j+=4) {
    vp8_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride,
                                  HFilter_aligned16, VFilter_aligned16,
                                  dst_ptr + j, dst_stride);
  }
}

// Apply the 8-tap separable filter to an 8x8 block by tiling the 4x4 kernel
// in a 2x2 grid.  Parameters match vp8_filter_block2d_4x4_8_sse2.
void vp8_filter_block2d_8x8_8_sse2
(
 const unsigned char *src_ptr, const unsigned int src_stride,
 const short *HFilter_aligned16, const short *VFilter_aligned16,
 unsigned char *dst_ptr, unsigned int dst_stride
) {
  int i, j;
  for (i=0; i<8; i+=4) {
    for (j=0; j<8; j+=4) {
      vp8_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
                                    HFilter_aligned16, VFilter_aligned16,
                                    dst_ptr + j + i*dst_stride, dst_stride);
    }
  }
}

// Apply the 8-tap separable filter to a 16x16 block by tiling the 4x4 kernel
// in a 4x4 grid.  Parameters match vp8_filter_block2d_4x4_8_sse2.
void vp8_filter_block2d_16x16_8_sse2
(
 const unsigned char *src_ptr, const unsigned int src_stride,
 const short *HFilter_aligned16, const short *VFilter_aligned16,
 unsigned char *dst_ptr, unsigned int dst_stride
) {
  int i, j;
  for (i=0; i<16; i+=4) {
    for (j=0; j<16; j+=4) {
      vp8_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
                                    HFilter_aligned16, VFilter_aligned16,
                                    dst_ptr + j + i*dst_stride, dst_stride);
    }
  }
}
diff --git a/vp8/common/x86/filter_sse4.c b/vp8/common/x86/filter_sse4.c
index a037622e1..c461db173 100644
--- a/vp8/common/x86/filter_sse4.c
+++ b/vp8/common/x86/filter_sse4.c
@@ -25,9 +25,6 @@
 // TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum
 //           of positive above 128), or have higher precision filter
 //           coefficients.
-// TODO(cd): Remove use of _mm_extract_epi32 and _mm_extract_epi64, to not
-//           require SSE4.1
-// TODO(cd): Remove use of _mm_shuffle_epi8 to not require SSSE3
 DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = {
   0x00, 0x01,