diff options
Diffstat (limited to 'vp9')
32 files changed, 3057 insertions, 3814 deletions
diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c index 5841f8091..370ebe8f8 100644 --- a/vp9/common/vp9_debugmodes.c +++ b/vp9/common/vp9_debugmodes.c @@ -11,126 +11,68 @@ #include <stdio.h> #include "vp9/common/vp9_blockd.h" +#include "vp9/common/vp9_onyxc_int.h" -void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols, - int frame, char *file) { +static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) { + fprintf(f, "%s", str); + fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_video_frame, + cm->show_frame, cm->base_qindex); +} +/* This function dereferences a pointer to the mbmi structure + * and uses the passed in member offset to print out the value of an integer + * for each mbmi member value in the mi structure. + */ +static void print_mi_data(VP9_COMMON *common, FILE *file, char *descriptor, + size_t member_offset) { int mi_row; int mi_col; int mi_index = 0; - FILE *mvs = fopen(file, "a"); - - // Print out the macroblock Y modes - fprintf(mvs, "SB Types for Frame %d\n", frame); - - for (mi_row = 0; mi_row < rows; mi_row++) { - for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%2d ", mi[mi_index].mbmi.sb_type); - - mi_index++; - } - - fprintf(mvs, "\n"); - mi_index += 8; - } + MODE_INFO *mi = common->mi; + int rows = common->mi_rows; + int cols = common->mi_cols; + char prefix = descriptor[0]; - // Print out the macroblock Y modes - fprintf(mvs, "Mb Modes for Frame %d\n", frame); + log_frame_info(common, descriptor, file); mi_index = 0; for (mi_row = 0; mi_row < rows; mi_row++) { + fprintf(file, "%c ", prefix); for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%2d ", mi[mi_index].mbmi.mode); - + fprintf(file, "%2d ", + *((int*) ((char *) (&mi[mi_index].mbmi) + member_offset))); mi_index++; } - - fprintf(mvs, "\n"); + fprintf(file, "\n"); mi_index += 8; } - - fprintf(mvs, "\n"); - - mi_index = 0; - fprintf(mvs, "Mb mv ref for Frame %d\n", frame); - - for (mi_row = 0; mi_row < rows; mi_row++) { - for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%2d ", mi[mi_index].mbmi.ref_frame[0]); - - mi_index++; - } - - fprintf(mvs, "\n"); - mi_index += 8; - } - fprintf(mvs, "\n"); - - mi_index = 0; - fprintf(mvs, "Mb mv ref for Frame %d\n", frame); - + fprintf(file, "\n"); +} +void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) { + int mi_row; + int mi_col; + int mi_index = 0; + FILE *mvs = fopen(file, "a"); + MODE_INFO *mi = cm->mi; + int rows = cm->mi_rows; + int cols = cm->mi_cols; + + print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type)); + print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode)); + print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, mb_skip_coeff)); + print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0])); + print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, txfm_size)); + print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode)); + + log_frame_info(cm, "Vectors ",mvs); for (mi_row = 0; mi_row < rows; mi_row++) { + fprintf(mvs,"V "); for (mi_col = 0; mi_col < cols; mi_col++) { fprintf(mvs, "%4d:%4d ", mi[mi_index].mbmi.mv[0].as_mv.row, mi[mi_index].mbmi.mv[0].as_mv.col); - mi_index++; } - fprintf(mvs, "\n"); mi_index += 8; } - - fprintf(mvs, "\n"); - - /* print out the macroblock txform sizes */ - mi_index = 0; - fprintf(mvs, "TXFM size for Frame %d\n", frame); - - for (mi_row = 0; mi_row < rows; mi_row++) { - for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%2d ", mi[mi_index].mbmi.txfm_size); - - mi_index++; - } - - mi_index += 8; - fprintf(mvs, "\n"); - } - - fprintf(mvs, "\n"); - - /* print out the macroblock UV modes */ - mi_index = 0; - fprintf(mvs, "UV Modes for Frame %d\n", frame); - - for (mi_row = 0; mi_row < rows; mi_row++) { - for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%2d ", mi[mi_index].mbmi.uv_mode); - - mi_index++; - } - - mi_index += 8; - fprintf(mvs, "\n"); - } - - fprintf(mvs, "\n"); - - /* print out the macroblock mvs */ - mi_index = 0; - fprintf(mvs, "MVs for Frame %d\n", frame); - - for (mi_row = 0; mi_row < rows; mi_row++) { - for (mi_col = 0; mi_col < cols; mi_col++) { - fprintf(mvs, "%5d:%-5d", mi[mi_index].mbmi.mv[0].as_mv.row / 2, - mi[mi_index].mbmi.mv[0].as_mv.col / 2); - - mi_index++; - } - - mi_index += 8; - fprintf(mvs, "\n"); - } - fprintf(mvs, "\n"); fclose(mvs); diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c index e07e43c8b..000e284ee 100644 --- a/vp9/common/vp9_entropymv.c +++ b/vp9/common/vp9_entropymv.c @@ -114,7 +114,7 @@ MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { return c; } -int vp9_use_nmv_hp(const MV *ref) { +int vp9_use_mv_hp(const MV *ref) { return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH && (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH; } @@ -123,54 +123,50 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) { return mv_class_base(c) + offset; } -static void increment_nmv_component_count(int v, - nmv_component_counts *mvcomp, - int incr, - int usehp) { - assert (v != 0); /* should not be zero */ - mvcomp->mvcount[MV_MAX + v] += incr; +static void inc_mv_component_count(int v, nmv_component_counts *comp_counts, + int incr) { + assert (v != 0); + comp_counts->mvcount[MV_MAX + v] += incr; } -static void increment_nmv_component(int v, - nmv_component_counts *mvcomp, - int incr, - int usehp) { +static void inc_mv_component(int v, nmv_component_counts *comp_counts, + int incr, int usehp) { int s, z, c, o, d, e, f; if (!incr) return; assert (v != 0); /* should not be zero */ s = v < 0; - mvcomp->sign[s] += incr; + comp_counts->sign[s] += incr; z = (s ? -v : v) - 1; /* magnitude - 1 */ c = vp9_get_mv_class(z, &o); - mvcomp->classes[c] += incr; + comp_counts->classes[c] += incr; d = (o >> 3); /* int mv data */ f = (o >> 1) & 3; /* fractional pel mv data */ e = (o & 1); /* high precision mv data */ if (c == MV_CLASS_0) { - mvcomp->class0[d] += incr; + comp_counts->class0[d] += incr; } else { int i; int b = c + CLASS0_BITS - 1; // number of bits for (i = 0; i < b; ++i) - mvcomp->bits[i][((d >> i) & 1)] += incr; + comp_counts->bits[i][((d >> i) & 1)] += incr; } /* Code the fractional pel bits */ if (c == MV_CLASS_0) { - mvcomp->class0_fp[d][f] += incr; + comp_counts->class0_fp[d][f] += incr; } else { - mvcomp->fp[f] += incr; + comp_counts->fp[f] += incr; } /* Code the high precision bit */ if (usehp) { if (c == MV_CLASS_0) { - mvcomp->class0_hp[e] += incr; + comp_counts->class0_hp[e] += incr; } else { - mvcomp->hp[e] += incr; + comp_counts->hp[e] += incr; } } } @@ -197,8 +193,8 @@ static void counts_to_context(nmv_component_counts *mvcomp, int usehp) { int v; vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount)); for (v = 1; v <= MV_MAX; v++) { - increment_nmv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp); - increment_nmv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp); + inc_mv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp); + inc_mv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp); } } @@ -206,12 +202,12 @@ void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx, int usehp) { const MV_JOINT_TYPE j = vp9_get_mv_joint(mv); mvctx->joints[j]++; - usehp = usehp && vp9_use_nmv_hp(ref); + usehp = usehp && vp9_use_mv_hp(ref); if (mv_joint_vertical(j)) - increment_nmv_component_count(mv->row, &mvctx->comps[0], 1, usehp); + inc_mv_component_count(mv->row, &mvctx->comps[0], 1); if (mv_joint_horizontal(j)) - increment_nmv_component_count(mv->col, &mvctx->comps[1], 1, usehp); + inc_mv_component_count(mv->col, &mvctx->comps[1], 1); } static void adapt_prob(vp9_prob *dest, vp9_prob prep, unsigned int ct[2]) { @@ -332,7 +328,7 @@ static unsigned int adapt_probs(unsigned int i, } -void vp9_adapt_nmv_probs(VP9_COMMON *cm, int usehp) { +void vp9_adapt_mv_probs(VP9_COMMON *cm, int usehp) { int i, j; #ifdef MV_COUNT_TESTING printf("joints count: "); diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h index 15994a6ae..0fc20dbfc 100644 --- a/vp9/common/vp9_entropymv.h +++ b/vp9/common/vp9_entropymv.h @@ -21,8 +21,8 @@ struct VP9Common; void vp9_entropy_mv_init(); void vp9_init_mv_probs(struct VP9Common *cm); -void vp9_adapt_nmv_probs(struct VP9Common *cm, int usehp); -int vp9_use_nmv_hp(const MV *ref); +void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp); +int vp9_use_mv_hp(const MV *ref); #define VP9_NMV_UPDATE_PROB 252 diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c index a6922715e..643b229a6 100644 --- a/vp9/common/vp9_findnearmv.c +++ b/vp9/common/vp9_findnearmv.c @@ -15,7 +15,7 @@ #include "vp9/common/vp9_sadmxn.h" static void lower_mv_precision(int_mv *mv, int usehp) { - if (!usehp || !vp9_use_nmv_hp(&mv->as_mv)) { + if (!usehp || !vp9_use_mv_hp(&mv->as_mv)) { if (mv->as_mv.row & 1) mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1); if (mv->as_mv.col & 1) diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index a405aab8d..892ad5615 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -266,88 +266,84 @@ prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_strid specialize vp9_variance4x4 mmx sse2 prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x64 sse2 +specialize vp9_sub_pixel_variance64x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance64x64 +specialize vp9_sub_pixel_avg_variance64x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x64 +specialize vp9_sub_pixel_variance32x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x64 +specialize vp9_sub_pixel_avg_variance32x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x32 +specialize vp9_sub_pixel_variance64x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance64x32 +specialize vp9_sub_pixel_avg_variance64x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x16 +specialize vp9_sub_pixel_variance32x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x16 +specialize vp9_sub_pixel_avg_variance32x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x32 +specialize vp9_sub_pixel_variance16x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x32 +specialize vp9_sub_pixel_avg_variance16x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x32 sse2 +specialize vp9_sub_pixel_variance32x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance32x32 +specialize vp9_sub_pixel_avg_variance32x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3 +specialize vp9_sub_pixel_variance16x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x16 +specialize vp9_sub_pixel_avg_variance16x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x16 sse2 mmx -vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt +specialize vp9_sub_pixel_variance8x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x16 +specialize vp9_sub_pixel_avg_variance8x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3 -vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3; -vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt +specialize vp9_sub_pixel_variance16x8 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance16x8 +specialize vp9_sub_pixel_avg_variance16x8 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x8 sse2 mmx -vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt +specialize vp9_sub_pixel_variance8x8 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x8 +specialize vp9_sub_pixel_avg_variance8x8 sse2 ssse3 # TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x4 +specialize vp9_sub_pixel_variance8x4 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance8x4 +specialize vp9_sub_pixel_avg_variance8x4 sse2 ssse3 prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance4x8 +specialize vp9_sub_pixel_variance4x8 sse ssse3 prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance4x8 +specialize vp9_sub_pixel_avg_variance4x8 sse ssse3 prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance4x4 sse2 mmx -vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt +specialize vp9_sub_pixel_variance4x4 sse ssse3 +#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" -specialize vp9_sub_pixel_avg_variance4x4 +specialize vp9_sub_pixel_avg_variance4x4 sse ssse3 prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" specialize vp9_sad64x64 sse2 @@ -390,15 +386,15 @@ prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, co specialize vp9_sad4x4 mmx sse prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_h mmx sse2 +specialize vp9_variance_halfpixvar16x16_h sse2 vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_v mmx sse2 +specialize vp9_variance_halfpixvar16x16_v sse2 vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_hv mmx sse2 +specialize vp9_variance_halfpixvar16x16_hv sse2 vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" @@ -507,8 +503,8 @@ specialize vp9_sad4x8x4d sse prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" specialize vp9_sad4x4x4d sse -prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse" -specialize vp9_sub_pixel_mse16x16 sse2 mmx +#prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse" +#specialize vp9_sub_pixel_mse16x16 sse2 mmx prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" specialize vp9_mse16x16 mmx sse2 @@ -533,9 +529,11 @@ prototype unsigned int vp9_get_mb_ss "const int16_t *" specialize vp9_get_mb_ss mmx sse2 # ENCODEMB INVOKE -prototype int vp9_block_error "int16_t *coeff, int16_t *dqcoeff, int block_size" -specialize vp9_block_error mmx sse2 -vp9_block_error_sse2=vp9_block_error_xmm +prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size" +specialize vp9_block_error sse2 + +prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride" +specialize vp9_subtract_block sse2 # # Structured Similarity (SSIM) diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c index b3d41bed7..d8836c962 100644 --- a/vp9/decoder/vp9_decodemv.c +++ b/vp9/decoder/vp9_decodemv.c @@ -8,20 +8,21 @@ * be found in the AUTHORS file in the root of the source tree. */ - -#include "vp9/decoder/vp9_treereader.h" -#include "vp9/common/vp9_entropymv.h" +#include "vp9/common/vp9_common.h" +#include "vp9/common/vp9_entropy.h" #include "vp9/common/vp9_entropymode.h" -#include "vp9/common/vp9_reconinter.h" -#include "vp9/decoder/vp9_onyxd_int.h" +#include "vp9/common/vp9_entropymv.h" #include "vp9/common/vp9_findnearmv.h" -#include "vp9/common/vp9_common.h" -#include "vp9/common/vp9_seg_common.h" +#include "vp9/common/vp9_mvref_common.h" #include "vp9/common/vp9_pred_common.h" -#include "vp9/common/vp9_entropy.h" +#include "vp9/common/vp9_reconinter.h" +#include "vp9/common/vp9_seg_common.h" + #include "vp9/decoder/vp9_decodemv.h" #include "vp9/decoder/vp9_decodframe.h" -#include "vp9/common/vp9_mvref_common.h" +#include "vp9/decoder/vp9_onyxd_int.h" +#include "vp9/decoder/vp9_treereader.h" + #if CONFIG_DEBUG #include <assert.h> #endif @@ -37,14 +38,52 @@ extern int dec_debug; #endif static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) { - MB_PREDICTION_MODE m = treed_read(r, vp9_intra_mode_tree, p); - return m; + return treed_read(r, vp9_intra_mode_tree, p); } static int read_mb_segid(vp9_reader *r, MACROBLOCKD *xd) { return treed_read(r, vp9_segment_tree, xd->mb_segment_tree_probs); } +static TX_SIZE select_txfm_size(VP9_COMMON *cm, MACROBLOCKD *xd, + vp9_reader *r, BLOCK_SIZE_TYPE bsize) { + const int context = vp9_get_pred_context(cm, xd, PRED_TX_SIZE); + const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE); + TX_SIZE txfm_size = vp9_read(r, tx_probs[0]); + if (txfm_size != TX_4X4 && bsize >= BLOCK_SIZE_MB16X16) { + txfm_size += vp9_read(r, tx_probs[1]); + if (txfm_size != TX_8X8 && bsize >= BLOCK_SIZE_SB32X32) + txfm_size += vp9_read(r, tx_probs[2]); + } + + if (bsize >= BLOCK_SIZE_SB32X32) + cm->fc.tx_count_32x32p[context][txfm_size]++; + else if (bsize >= BLOCK_SIZE_MB16X16) + cm->fc.tx_count_16x16p[context][txfm_size]++; + else + cm->fc.tx_count_8x8p[context][txfm_size]++; + + return txfm_size; +} + +static TX_SIZE get_txfm_size(VP9D_COMP *pbi, TXFM_MODE txfm_mode, + BLOCK_SIZE_TYPE bsize, int select_cond, + vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + + if (txfm_mode == TX_MODE_SELECT && bsize >= BLOCK_SIZE_SB8X8 && select_cond) + return select_txfm_size(cm, xd, r, bsize); + else if (txfm_mode >= ALLOW_32X32 && bsize >= BLOCK_SIZE_SB32X32) + return TX_32X32; + else if (txfm_mode >= ALLOW_16X16 && bsize >= BLOCK_SIZE_MB16X16) + return TX_16X16; + else if (txfm_mode >= ALLOW_8X8 && bsize >= BLOCK_SIZE_SB8X8) + return TX_8X8; + else + return TX_4X4; +} + static void set_segment_id(VP9_COMMON *cm, MB_MODE_INFO *mbmi, int mi_row, int mi_col, int segment_id) { const int mi_index = mi_row * cm->mi_cols + mi_col; @@ -63,27 +102,6 @@ static void set_segment_id(VP9_COMMON *cm, MB_MODE_INFO *mbmi, } } -static TX_SIZE select_txfm_size(VP9_COMMON *cm, MACROBLOCKD *xd, - vp9_reader *r, BLOCK_SIZE_TYPE bsize) { - const int context = vp9_get_pred_context(cm, xd, PRED_TX_SIZE); - const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE); - TX_SIZE txfm_size = vp9_read(r, tx_probs[0]); - if (txfm_size != TX_4X4 && bsize >= BLOCK_SIZE_MB16X16) { - txfm_size += vp9_read(r, tx_probs[1]); - if (txfm_size != TX_8X8 && bsize >= BLOCK_SIZE_SB32X32) - txfm_size += vp9_read(r, tx_probs[2]); - } - if (bsize >= BLOCK_SIZE_SB32X32) { - cm->fc.tx_count_32x32p[context][txfm_size]++; - } else if (bsize >= BLOCK_SIZE_MB16X16) { - cm->fc.tx_count_16x16p[context][txfm_size]++; - } else { - cm->fc.tx_count_8x8p[context][txfm_size]++; - } - return txfm_size; -} - - static void kfread_modes(VP9D_COMP *pbi, MODE_INFO *m, int mi_row, int mi_col, vp9_reader *r) { @@ -106,21 +124,8 @@ static void kfread_modes(VP9D_COMP *pbi, MODE_INFO *m, [m->mbmi.mb_skip_coeff]++; } - if (cm->txfm_mode == TX_MODE_SELECT && - m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) { - m->mbmi.txfm_size = select_txfm_size(cm, xd, r, m->mbmi.sb_type); - } else if (cm->txfm_mode >= ALLOW_32X32 && - m->mbmi.sb_type >= BLOCK_SIZE_SB32X32) { - m->mbmi.txfm_size = TX_32X32; - } else if (cm->txfm_mode >= ALLOW_16X16 && - m->mbmi.sb_type >= BLOCK_SIZE_MB16X16) { - m->mbmi.txfm_size = TX_16X16; - } else if (cm->txfm_mode >= ALLOW_8X8 && - m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) { - m->mbmi.txfm_size = TX_8X8; - } else { - m->mbmi.txfm_size = TX_4X4; - } + m->mbmi.txfm_size = get_txfm_size(pbi, cm->txfm_mode, m->mbmi.sb_type, + 1, r); // luma mode m->mbmi.ref_frame[0] = INTRA_FRAME; @@ -303,28 +308,22 @@ unsigned int vp9_mv_cont_count[5][4] = { }; #endif -static void read_switchable_interp_probs(VP9_COMMON* const cm, vp9_reader *r) { +static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) { int i, j; - for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j) - for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) { - cm->fc.switchable_interp_prob[j][i] = - // vp9_read_prob(r); - vp9_read_prob_diff_update(r, cm->fc.switchable_interp_prob[j][i]); - } - } + for (j = 0; j < VP9_SWITCHABLE_FILTERS + 1; ++j) + for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + fc->switchable_interp_prob[j][i] = vp9_read_prob_diff_update(r, + fc->switchable_interp_prob[j][i]); } -static void read_inter_mode_probs(VP9_COMMON *const cm, vp9_reader *r) { +static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) { int i, j; for (i = 0; i < INTER_MODE_CONTEXTS; ++i) - for (j = 0; j < VP9_INTER_MODES - 1; ++j) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) { - // cm->fc.inter_mode_probs[i][j] = vp9_read_prob(r); - cm->fc.inter_mode_probs[i][j] = - vp9_read_prob_diff_update(r, cm->fc.inter_mode_probs[i][j]); - } - } + for (j = 0; j < VP9_INTER_MODES - 1; ++j) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) + fc->inter_mode_probs[i][j] = vp9_read_prob_diff_update(r, + fc->inter_mode_probs[i][j]); } static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) { @@ -337,21 +336,20 @@ static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) { static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; - if ((cm->frame_type != KEY_FRAME) && (!cm->intra_only)) { + if (cm->frame_type != KEY_FRAME && !cm->intra_only) { nmv_context *const nmvc = &pbi->common.fc.nmvc; MACROBLOCKD *const xd = &pbi->mb; int i, j; - read_inter_mode_probs(cm, r); + read_inter_mode_probs(&cm->fc, r); if (cm->mcomp_filter_type == SWITCHABLE) - read_switchable_interp_probs(cm, r); + read_switchable_interp_probs(&cm->fc, r); - for (i = 0; i < INTRA_INTER_CONTEXTS; i++) { + for (i = 0; i < INTRA_INTER_CONTEXTS; i++) if (vp9_read(r, VP9_MODE_UPDATE_PROB)) cm->fc.intra_inter_prob[i] = vp9_read_prob_diff_update(r, cm->fc.intra_inter_prob[i]); - } if (cm->allow_comp_inter_inter) { cm->comp_pred_mode = read_comp_pred_mode(r); @@ -461,7 +459,7 @@ static INLINE void decode_mv(vp9_reader *r, MV *mv, const MV *ref, const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, ctx->joints); MV diff = {0, 0}; - usehp = usehp && vp9_use_nmv_hp(ref); + usehp = usehp && vp9_use_mv_hp(ref); if (mv_joint_vertical(j)) diff.row = read_mv_component(r, &ctx->comps[0], usehp); @@ -476,27 +474,80 @@ static INLINE void decode_mv(vp9_reader *r, MV *mv, const MV *ref, static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type( VP9D_COMP *pbi, vp9_reader *r) { - const int index = treed_read(r, vp9_switchable_interp_tree, - vp9_get_pred_probs(&pbi->common, &pbi->mb, - PRED_SWITCHABLE_INTERP)); - ++pbi->common.fc.switchable_interp_count - [vp9_get_pred_context( - &pbi->common, &pbi->mb, PRED_SWITCHABLE_INTERP)][index]; + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + const vp9_prob *probs = vp9_get_pred_probs(cm, xd, PRED_SWITCHABLE_INTERP); + const int index = treed_read(r, vp9_switchable_interp_tree, probs); + const int ctx = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP); + ++cm->fc.switchable_interp_count[ctx][index]; return vp9_switchable_interp[index]; } +static void read_intra_block_modes(VP9D_COMP *pbi, MODE_INFO *mi, + MB_MODE_INFO *mbmi, vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; + const int bw = 1 << b_width_log2(bsize); + const int bh = 1 << b_height_log2(bsize); + + if (bsize >= BLOCK_SIZE_SB8X8) { + const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; + const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); + const int bsl = MIN(bwl, bhl); + mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[MIN(3, bsl)]); + cm->fc.y_mode_counts[MIN(3, bsl)][mbmi->mode]++; + } else { + int idx, idy; + for (idy = 0; idy < 2; idy += bh) { + for (idx = 0; idx < 2; idx += bw) { + int ib = idy * 2 + idx, k; + int m = read_intra_mode(r, cm->fc.y_mode_prob[0]); + mi->bmi[ib].as_mode.first = m; + cm->fc.y_mode_counts[0][m]++; + for (k = 1; k < bh; ++k) + mi->bmi[ib + k * 2].as_mode.first = m; + for (k = 1; k < bw; ++k) + mi->bmi[ib + k].as_mode.first = m; + } + } + mbmi->mode = mi->bmi[3].as_mode.first; + } + + mbmi->uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[mbmi->mode]); + cm->fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++; +} + +static MV_REFERENCE_FRAME read_reference_frame(VP9D_COMP *pbi, int segment_id, + vp9_reader *r) { + VP9_COMMON *const cm = &pbi->common; + MACROBLOCKD *const xd = &pbi->mb; + + MV_REFERENCE_FRAME ref; + if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) { + const int ctx = vp9_get_pred_context(cm, xd, PRED_INTRA_INTER); + ref = (MV_REFERENCE_FRAME) + vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_INTRA_INTER)); + cm->fc.intra_inter_count[ctx][ref != INTRA_FRAME]++; + } else { + ref = (MV_REFERENCE_FRAME) + vp9_get_segdata(xd, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME; + } + return ref; +} + static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, int mi_row, int mi_col, vp9_reader *r) { VP9_COMMON *const cm = &pbi->common; - nmv_context *const nmvc = &cm->fc.nmvc; MACROBLOCKD *const xd = &pbi->mb; + nmv_context *const nmvc = &cm->fc.nmvc; int_mv *const mv0 = &mbmi->mv[0]; int_mv *const mv1 = &mbmi->mv[1]; - BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; - int bw = 1 << b_width_log2(bsize); - int bh = 1 << b_height_log2(bsize); + const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type; + const int bw = 1 << b_width_log2(bsize); + const int bh = 1 << b_height_log2(bsize); int mb_to_left_edge, mb_to_right_edge, mb_to_top_edge, mb_to_bottom_edge; int j, idx, idy; @@ -529,32 +580,9 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, [mbmi->mb_skip_coeff]++; } - // Read the reference frame - if (!vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_REF_FRAME)) { - mbmi->ref_frame[0] = - vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_INTRA_INTER)); - cm->fc.intra_inter_count[vp9_get_pred_context(cm, xd, PRED_INTRA_INTER)] - [mbmi->ref_frame[0] != INTRA_FRAME]++; - } else { - mbmi->ref_frame[0] = - vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME; - } - - if (cm->txfm_mode == TX_MODE_SELECT && - (mbmi->mb_skip_coeff == 0 || mbmi->ref_frame[0] == INTRA_FRAME) && - bsize >= BLOCK_SIZE_SB8X8) { - mbmi->txfm_size = select_txfm_size(cm, xd, r, bsize); - } else if (bsize >= BLOCK_SIZE_SB32X32 && - cm->txfm_mode >= ALLOW_32X32) { - mbmi->txfm_size = TX_32X32; - } else if (cm->txfm_mode >= ALLOW_16X16 && - bsize >= BLOCK_SIZE_MB16X16) { - mbmi->txfm_size = TX_16X16; - } else if (cm->txfm_mode >= ALLOW_8X8 && (bsize >= BLOCK_SIZE_SB8X8)) { - mbmi->txfm_size = TX_8X8; - } else { - mbmi->txfm_size = TX_4X4; - } + mbmi->ref_frame[0] = read_reference_frame(pbi, mbmi->segment_id, r); + mbmi->txfm_size = get_txfm_size(pbi, cm->txfm_mode, bsize, + (mbmi->mb_skip_coeff == 0 || mbmi->ref_frame[0] == INTRA_FRAME), r); // If reference frame is an Inter frame if (mbmi->ref_frame[0] != INTRA_FRAME) { @@ -745,34 +773,8 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi, } } } else { - // required for left and above block mv - mv0->as_int = 0; - - if (bsize >= BLOCK_SIZE_SB8X8) { - const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; - const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); - const int bsl = MIN(bwl, bhl); - mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[MIN(3, bsl)]); - cm->fc.y_mode_counts[MIN(3, bsl)][mbmi->mode]++; - } else { - int idx, idy; - for (idy = 0; idy < 2; idy += bh) { - for (idx = 0; idx < 2; idx += bw) { - int ib = idy * 2 + idx, k; - int m = read_intra_mode(r, cm->fc.y_mode_prob[0]); - mi->bmi[ib].as_mode.first = m; - cm->fc.y_mode_counts[0][m]++; - for (k = 1; k < bh; ++k) - mi->bmi[ib + k * 2].as_mode.first = m; - for (k = 1; k < bw; ++k) - mi->bmi[ib + k].as_mode.first = m; - } - } - mbmi->mode = mi->bmi[3].as_mode.first; - } - - mbmi->uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[mbmi->mode]); - cm->fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++; + mv0->as_int = 0; // required for left and above block mv + read_intra_block_modes(pbi, mi, mbmi, r); } } @@ -782,13 +784,10 @@ void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, vp9_reader *r) { // TODO(jkoleszar): does this clear more than MBSKIP_CONTEXTS? Maybe remove. // vpx_memset(cm->fc.mbskip_probs, 0, sizeof(cm->fc.mbskip_probs)); - for (k = 0; k < MBSKIP_CONTEXTS; ++k) { - if (vp9_read(r, VP9_MODE_UPDATE_PROB)) { + for (k = 0; k < MBSKIP_CONTEXTS; ++k) + if (vp9_read(r, VP9_MODE_UPDATE_PROB)) cm->fc.mbskip_probs[k] = vp9_read_prob_diff_update(r, cm->fc.mbskip_probs[k]); - } - // cm->fc.mbskip_probs[k] = vp9_read_prob(r); - } mb_mode_mv_init(pbi, r); } @@ -802,7 +801,7 @@ void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi, MODE_INFO *mi = xd->mode_info_context; MB_MODE_INFO *const mbmi = &mi->mbmi; - if ((cm->frame_type == KEY_FRAME) || cm->intra_only) { + if (cm->frame_type == KEY_FRAME || cm->intra_only) { kfread_modes(pbi, mi, mi_row, mi_col, r); } else { read_mb_modes_mv(pbi, mi, &mi->mbmi, mi_row, mi_col, r); diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index 49b181d69..078d09b0d 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -276,9 +276,7 @@ static void decode_atom(VP9D_COMP *pbi, MACROBLOCKD *xd, MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; assert(mbmi->ref_frame[0] != INTRA_FRAME); - - if ((pbi->common.frame_type != KEY_FRAME) && (!pbi->common.intra_only)) - vp9_setup_interp_filters(xd, mbmi->interp_filter, &pbi->common); + vp9_setup_interp_filters(xd, mbmi->interp_filter, &pbi->common); // prediction vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); @@ -327,8 +325,7 @@ static void decode_sb(VP9D_COMP *pbi, MACROBLOCKD *xd, int mi_row, int mi_col, assert(mbmi->sb_type == bsize); assert(mbmi->ref_frame[0] != INTRA_FRAME); - if (pbi->common.frame_type != KEY_FRAME) - vp9_setup_interp_filters(xd, mbmi->interp_filter, pc); + vp9_setup_interp_filters(xd, mbmi->interp_filter, pc); // generate prediction vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); @@ -392,26 +389,24 @@ static void set_refs(VP9D_COMP *pbi, int mi_row, int mi_col) { MACROBLOCKD *const xd = &pbi->mb; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; - if (mbmi->ref_frame[0] > INTRA_FRAME) { + // Select the appropriate reference frame for this MB + const int fb_idx = cm->active_ref_idx[mbmi->ref_frame[0] - 1]; + const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[fb_idx]; + xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1]; + xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1]; + setup_pre_planes(xd, cfg, NULL, mi_row, mi_col, xd->scale_factor, + xd->scale_factor_uv); + xd->corrupted |= cfg->corrupted; + + if (mbmi->ref_frame[1] > INTRA_FRAME) { // Select the appropriate reference frame for this MB - const int fb_idx = cm->active_ref_idx[mbmi->ref_frame[0] - 1]; - const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[fb_idx]; - xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1]; - xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1]; - setup_pre_planes(xd, cfg, NULL, mi_row, mi_col, - xd->scale_factor, xd->scale_factor_uv); - xd->corrupted |= cfg->corrupted; - - if (mbmi->ref_frame[1] > INTRA_FRAME) { - // Select the appropriate reference frame for this MB - const int second_fb_idx = cm->active_ref_idx[mbmi->ref_frame[1] - 1]; - const YV12_BUFFER_CONFIG *second_cfg = &cm->yv12_fb[second_fb_idx]; - xd->scale_factor[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1]; - xd->scale_factor_uv[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1]; - setup_pre_planes(xd, NULL, second_cfg, mi_row, mi_col, - xd->scale_factor, xd->scale_factor_uv); - xd->corrupted |= second_cfg->corrupted; - } + const int second_fb_idx = cm->active_ref_idx[mbmi->ref_frame[1] - 1]; + const YV12_BUFFER_CONFIG *second_cfg = &cm->yv12_fb[second_fb_idx]; + xd->scale_factor[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1]; + xd->scale_factor_uv[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1]; + setup_pre_planes(xd, NULL, second_cfg, mi_row, mi_col, xd->scale_factor, + xd->scale_factor_uv); + xd->corrupted |= second_cfg->corrupted; } } @@ -424,16 +419,17 @@ static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col, return; set_offsets(pbi, bsize, mi_row, mi_col); vp9_decode_mb_mode_mv(pbi, xd, mi_row, mi_col, r); - set_refs(pbi, mi_row, mi_col); - if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) + if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) { decode_sb_intra(pbi, xd, mi_row, mi_col, r, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); - else if (bsize < BLOCK_SIZE_SB8X8) - decode_atom(pbi, xd, mi_row, mi_col, r, BLOCK_SIZE_SB8X8); - else - decode_sb(pbi, xd, mi_row, mi_col, r, bsize); - + } else { + set_refs(pbi, mi_row, mi_col); + if (bsize < BLOCK_SIZE_SB8X8) + decode_atom(pbi, xd, mi_row, mi_col, r, BLOCK_SIZE_SB8X8); + else + decode_sb(pbi, xd, mi_row, mi_col, r, bsize); + } xd->corrupted |= vp9_reader_has_error(r); } @@ -1187,7 +1183,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) { if ((!keyframe) && (!pc->intra_only)) { vp9_adapt_mode_probs(pc); vp9_adapt_mode_context(pc); - vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv); + vp9_adapt_mv_probs(pc, xd->allow_high_precision_mv); } } diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 54b6e2440..f655d456b 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -8,7 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - #include "./vpx_config.h" #include "./vp9_rtcd.h" #include "vp9/encoder/vp9_encodeframe.h" @@ -46,9 +45,8 @@ int enc_debug = 0; void vp9_select_interp_filter_type(VP9_COMP *cpi); -static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, - int output_enabled, int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize); +static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, + int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize); static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x); @@ -64,10 +62,8 @@ static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x); * Eventually this should be replaced by custom no-reference routines, * which will be faster. */ -static const uint8_t VP9_VAR_OFFS[16] = { - 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 -}; - +static const uint8_t VP9_VAR_OFFS[16] = {128, 128, 128, 128, 128, 128, 128, 128, + 128, 128, 128, 128, 128, 128, 128, 128}; // Original activity measure from Tim T's code. static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) { @@ -92,13 +88,11 @@ static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) { } // Stub for alternative experimental activity measures. -static unsigned int alt_activity_measure(VP9_COMP *cpi, - MACROBLOCK *x, int use_dc_pred) { +static unsigned int alt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x, + int use_dc_pred) { return vp9_encode_intra(cpi, x, use_dc_pred); } - -DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = { 0 }; - +DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = {0}; // Measure the activity of the current macroblock // What we measure here is TBD so abstracted to this function @@ -136,13 +130,12 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) { // Create a list to sort to CHECK_MEM_ERROR(sortlist, - vpx_calloc(sizeof(unsigned int), - cpi->common.MBs)); + vpx_calloc(sizeof(unsigned int), + cpi->common.MBs)); // Copy map to sort list vpx_memcpy(sortlist, cpi->mb_activity_map, - sizeof(unsigned int) * cpi->common.MBs); - + sizeof(unsigned int) * cpi->common.MBs); // Ripple each value down to its correct position for (i = 1; i < cpi->common.MBs; i ++) { @@ -153,13 +146,13 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) { sortlist[j - 1] = sortlist[j]; sortlist[j] = tmp; } else - break; + break; } } // Even number MBs so estimate median as mean of two either side. median = (1 + sortlist[cpi->common.MBs >> 1] + - sortlist[(cpi->common.MBs >> 1) + 1]) >> 1; + sortlist[(cpi->common.MBs >> 1) + 1]) >> 1; cpi->activity_avg = median; @@ -167,7 +160,7 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) { } #else // Simple mean for now - cpi->activity_avg = (unsigned int)(activity_sum / cpi->common.MBs); + cpi->activity_avg = (unsigned int) (activity_sum / cpi->common.MBs); #endif if (cpi->activity_avg < VP9_ACTIVITY_AVG_MIN) @@ -211,9 +204,9 @@ static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) { b = 4 * act + cpi->activity_avg; if (b >= a) - *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1; + *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1; else - *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b); + *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b); #if OUTPUT_NORM_ACT_STATS fprintf(f, " %6d", *(x->mb_activity_ptr)); @@ -238,9 +231,9 @@ static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) { // Loop through all MBs. Note activity of each, average activity and // calculate a normalized activity for each static void build_activity_map(VP9_COMP *cpi) { - MACROBLOCK *const x = &cpi->mb; + MACROBLOCK * const x = &cpi->mb; MACROBLOCKD *xd = &x->e_mbd; - VP9_COMMON *const cm = &cpi->common; + VP9_COMMON * const cm = &cpi->common; #if ALT_ACT_MEASURE YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx]; @@ -285,7 +278,6 @@ static void build_activity_map(VP9_COMP *cpi) { x->plane[0].src.buf += 16; } - // adjust to the next row of mbs x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols; } @@ -315,7 +307,7 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) { a = act + (2 * cpi->activity_avg); b = (2 * act) + cpi->activity_avg; - x->rdmult = (unsigned int)(((int64_t)x->rdmult * b + (a >> 1)) / a); + x->rdmult = (unsigned int) (((int64_t) x->rdmult * b + (a >> 1)) / a); x->errorperbit = x->rdmult * 100 / (110 * x->rddiv); x->errorperbit += (x->errorperbit == 0); #endif @@ -324,15 +316,13 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) { adjust_act_zbin(cpi, x); } -static void update_state(VP9_COMP *cpi, - PICK_MODE_CONTEXT *ctx, - BLOCK_SIZE_TYPE bsize, - int output_enabled) { +static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx, + BLOCK_SIZE_TYPE bsize, int output_enabled) { int i, x_idx, y; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; MODE_INFO *mi = &ctx->mic; - MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; + MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi; #if CONFIG_DEBUG || CONFIG_INTERNAL_STATS MB_PREDICTION_MODE mb_mode = mi->mbmi.mode; #endif @@ -352,8 +342,8 @@ static void update_state(VP9_COMP *cpi, // when the mode was picked for it for (y = 0; y < bh; y++) { for (x_idx = 0; x_idx < bw; x_idx++) { - if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > x_idx && - (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > y) { + if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > x_idx + && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > y) { MODE_INFO *mi_addr = xd->mode_info_context + x_idx + y * mis; *mi_addr = *mi; } @@ -408,27 +398,27 @@ static void update_state(VP9_COMP *cpi, #endif } else { /* - // Reduce the activation RD thresholds for the best choice mode - if ((cpi->rd_baseline_thresh[mb_mode_index] > 0) && - (cpi->rd_baseline_thresh[mb_mode_index] < (INT_MAX >> 2))) - { - int best_adjustment = (cpi->rd_thresh_mult[mb_mode_index] >> 2); - - cpi->rd_thresh_mult[mb_mode_index] = - (cpi->rd_thresh_mult[mb_mode_index] - >= (MIN_THRESHMULT + best_adjustment)) ? - cpi->rd_thresh_mult[mb_mode_index] - best_adjustment : - MIN_THRESHMULT; - cpi->rd_threshes[mb_mode_index] = - (cpi->rd_baseline_thresh[mb_mode_index] >> 7) - * cpi->rd_thresh_mult[mb_mode_index]; - - } - */ + // Reduce the activation RD thresholds for the best choice mode + if ((cpi->rd_baseline_thresh[mb_mode_index] > 0) && + (cpi->rd_baseline_thresh[mb_mode_index] < (INT_MAX >> 2))) + { + int best_adjustment = (cpi->rd_thresh_mult[mb_mode_index] >> 2); + + cpi->rd_thresh_mult[mb_mode_index] = + (cpi->rd_thresh_mult[mb_mode_index] + >= (MIN_THRESHMULT + best_adjustment)) ? + cpi->rd_thresh_mult[mb_mode_index] - best_adjustment : + MIN_THRESHMULT; + cpi->rd_threshes[mb_mode_index] = + (cpi->rd_baseline_thresh[mb_mode_index] >> 7) + * cpi->rd_thresh_mult[mb_mode_index]; + + } + */ // Note how often each mode chosen as best cpi->mode_chosen_counts[mb_mode_index]++; - if (mbmi->ref_frame[0] != INTRA_FRAME && - (mbmi->sb_type < BLOCK_SIZE_SB8X8 || mbmi->mode == NEWMV)) { + if (mbmi->ref_frame[0] != INTRA_FRAME + && (mbmi->sb_type < BLOCK_SIZE_SB8X8 || mbmi->mode == NEWMV)) { int_mv best_mv, best_second_mv; const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0]; const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1]; @@ -447,21 +437,21 @@ static void update_state(VP9_COMP *cpi, int i, j; for (j = 0; j < bh; ++j) for (i = 0; i < bw; ++i) - if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > i && - (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > j) + if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > i + && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > j) xd->mode_info_context[mis * j + i].mbmi = *mbmi; } - if (cpi->common.mcomp_filter_type == SWITCHABLE && - is_inter_mode(mbmi->mode)) { - ++cpi->common.fc.switchable_interp_count - [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)] - [vp9_switchable_interp_map[mbmi->interp_filter]]; + if (cpi->common.mcomp_filter_type == SWITCHABLE + && is_inter_mode(mbmi->mode)) { + ++cpi->common.fc.switchable_interp_count[vp9_get_pred_context( + &cpi->common, xd, PRED_SWITCHABLE_INTERP)][vp9_switchable_interp_map[mbmi + ->interp_filter]]; } cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff; - cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff; - cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff; + cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff; + cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff; } } @@ -484,29 +474,26 @@ static unsigned find_seg_id(VP9_COMMON *cm, uint8_t *buf, BLOCK_SIZE_TYPE bsize, return seg_id; } -void vp9_setup_src_planes(MACROBLOCK *x, - const YV12_BUFFER_CONFIG *src, +void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src, int mb_row, int mb_col) { - uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, - src->alpha_buffer}; - int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, - src->alpha_stride}; + uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, src + ->alpha_buffer}; + int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, src + ->alpha_stride}; int i; for (i = 0; i < MAX_MB_PLANE; i++) { - setup_pred_plane(&x->plane[i].src, - buffers[i], strides[i], - mb_row, mb_col, NULL, - x->e_mbd.plane[i].subsampling_x, + setup_pred_plane(&x->plane[i].src, buffers[i], strides[i], mb_row, mb_col, + NULL, x->e_mbd.plane[i].subsampling_x, x->e_mbd.plane[i].subsampling_y); } } -static void set_offsets(VP9_COMP *cpi, - int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) { - MACROBLOCK *const x = &cpi->mb; - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; +static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col, + BLOCK_SIZE_TYPE bsize) { + MACROBLOCK * const x = &cpi->mb; + VP9_COMMON * const cm = &cpi->common; + MACROBLOCKD * const xd = &x->e_mbd; MB_MODE_INFO *mbmi; const int dst_fb_idx = cm->new_fb_idx; const int idx_str = xd->mode_info_stride * mi_row + mi_col; @@ -518,10 +505,10 @@ static void set_offsets(VP9_COMP *cpi, // entropy context structures for (i = 0; i < MAX_MB_PLANE; i++) { - xd->plane[i].above_context = cm->above_context[i] + - (mi_col * 2 >> xd->plane[i].subsampling_x); - xd->plane[i].left_context = cm->left_context[i] + - (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y); + xd->plane[i].above_context = cm->above_context[i] + + (mi_col * 2 >> xd->plane[i].subsampling_x); + xd->plane[i].left_context = cm->left_context[i] + + (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y); } // partition contexts @@ -532,25 +519,24 @@ static void set_offsets(VP9_COMP *cpi, x->active_ptr = cpi->active_map + idx_map; /* pointers to mode info contexts */ - x->partition_info = x->pi + idx_str; - xd->mode_info_context = cm->mi + idx_str; + x->partition_info = x->pi + idx_str; + xd->mode_info_context = cm->mi + idx_str; mbmi = &xd->mode_info_context->mbmi; // Special case: if prev_mi is NULL, the previous mode info context // cannot be used. - xd->prev_mode_info_context = cm->prev_mi ? - cm->prev_mi + idx_str : NULL; + xd->prev_mode_info_context = cm->prev_mi ? cm->prev_mi + idx_str : NULL; // Set up destination pointers setup_dst_planes(xd, &cm->yv12_fb[dst_fb_idx], mi_row, mi_col); /* Set up limit values for MV components to prevent them from * extending beyond the UMV borders assuming 16x16 block size */ - x->mv_row_min = -((mi_row * MI_SIZE) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND); - x->mv_col_min = -((mi_col * MI_SIZE) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND); - x->mv_row_max = ((cm->mi_rows - mi_row) * MI_SIZE + - (VP9BORDERINPIXELS - MI_SIZE * bh - VP9_INTERP_EXTEND)); - x->mv_col_max = ((cm->mi_cols - mi_col) * MI_SIZE + - (VP9BORDERINPIXELS - MI_SIZE * bw - VP9_INTERP_EXTEND)); + x->mv_row_min = -((mi_row * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND); + x->mv_col_min = -((mi_col * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND); + x->mv_row_max = ((cm->mi_rows - mi_row) * MI_SIZE + + (VP9BORDERINPIXELS - MI_SIZE * bh - VP9_INTERP_EXTEND)); + x->mv_col_max = ((cm->mi_cols - mi_col) * MI_SIZE + + (VP9BORDERINPIXELS - MI_SIZE * bw - VP9_INTERP_EXTEND)); // Set up distance of MB to edge of frame in 1/8th pel units assert(!(mi_col & (bw - 1)) && !(mi_row & (bh - 1))); @@ -565,30 +551,30 @@ static void set_offsets(VP9_COMP *cpi, /* segment ID */ if (xd->segmentation_enabled) { - uint8_t *map = xd->update_mb_segmentation_map ? cpi->segmentation_map - : cm->last_frame_seg_map; - mbmi->segment_id = find_seg_id(cm, map, bsize, mi_row, - cm->mi_rows, mi_col, cm->mi_cols); + uint8_t *map = + xd->update_mb_segmentation_map ? + cpi->segmentation_map : cm->last_frame_seg_map; + mbmi->segment_id = find_seg_id(cm, map, bsize, mi_row, cm->mi_rows, mi_col, + cm->mi_cols); assert(mbmi->segment_id <= (MAX_MB_SEGMENTS-1)); vp9_mb_init_quantizer(cpi, x); - if (xd->segmentation_enabled && cpi->seg0_cnt > 0 && - !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) && - vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) { + if (xd->segmentation_enabled && cpi->seg0_cnt > 0 + && !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) + && vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) { cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt; } else { const int y = mb_row & ~3; const int x = mb_col & ~3; - const int p16 = ((mb_row & 1) << 1) + (mb_col & 1); + const int p16 = ((mb_row & 1) << 1) + (mb_col & 1); const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1); - const int tile_progress = - cm->cur_tile_mi_col_start * cm->mb_rows >> 1; - const int mb_cols = - (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start) >> 1; + const int tile_progress = cm->cur_tile_mi_col_start * cm->mb_rows >> 1; + const int mb_cols = (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start) + >> 1; - cpi->seg0_progress = - ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) << 16) / cm->MBs; + cpi->seg0_progress = ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) + << 16) / cm->MBs; } } else { mbmi->segment_id = 0; @@ -596,11 +582,11 @@ static void set_offsets(VP9_COMP *cpi, } static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, - TOKENEXTRA **tp, int *totalrate, int *totaldist, + TOKENEXTRA **tp, int *totalrate, int64_t *totaldist, BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; x->rd_search = 1; @@ -624,22 +610,21 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col, } static void update_stats(VP9_COMP *cpi, int mi_row, int mi_col) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; MODE_INFO *mi = xd->mode_info_context; - MB_MODE_INFO *const mbmi = &mi->mbmi; + MB_MODE_INFO * const mbmi = &mi->mbmi; if (cm->frame_type != KEY_FRAME) { int segment_id, seg_ref_active; segment_id = mbmi->segment_id; - seg_ref_active = vp9_segfeature_active(xd, segment_id, - SEG_LVL_REF_FRAME); + seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME); if (!seg_ref_active) - cpi->intra_inter_count[vp9_get_pred_context(cm, xd, PRED_INTRA_INTER)] - [mbmi->ref_frame[0] > INTRA_FRAME]++; + cpi->intra_inter_count[vp9_get_pred_context(cm, xd, PRED_INTRA_INTER)][mbmi + ->ref_frame[0] > INTRA_FRAME]++; // If the segment reference feature is enabled we have only a single // reference frame allowed for the segment so exclude it from @@ -647,19 +632,18 @@ static void update_stats(VP9_COMP *cpi, int mi_row, int mi_col) { if ((mbmi->ref_frame[0] > INTRA_FRAME) && !seg_ref_active) { if (cm->comp_pred_mode == HYBRID_PREDICTION) cpi->comp_inter_count[vp9_get_pred_context(cm, xd, - PRED_COMP_INTER_INTER)] - [mbmi->ref_frame[1] > INTRA_FRAME]++; + PRED_COMP_INTER_INTER)][mbmi + ->ref_frame[1] > INTRA_FRAME]++; if (mbmi->ref_frame[1] > INTRA_FRAME) { - cpi->comp_ref_count[vp9_get_pred_context(cm, xd, PRED_COMP_REF_P)] - [mbmi->ref_frame[0] == GOLDEN_FRAME]++; + cpi->comp_ref_count[vp9_get_pred_context(cm, xd, PRED_COMP_REF_P)][mbmi + ->ref_frame[0] == GOLDEN_FRAME]++; } else { - cpi->single_ref_count[vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P1)] - [0][mbmi->ref_frame[0] != LAST_FRAME]++; + cpi->single_ref_count[vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P1)][0][mbmi + ->ref_frame[0] != LAST_FRAME]++; if (mbmi->ref_frame[0] != LAST_FRAME) - cpi->single_ref_count[vp9_get_pred_context(cm, xd, - PRED_SINGLE_REF_P2)] - [1][mbmi->ref_frame[0] != GOLDEN_FRAME]++; + cpi->single_ref_count[vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P2)][1][mbmi + ->ref_frame[0] != GOLDEN_FRAME]++; } } // Count of last ref frame 0,0 usage @@ -673,7 +657,7 @@ static void update_stats(VP9_COMP *cpi, int mi_row, int mi_col) { // partition down to 4x4 block size is enabled. static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { - MACROBLOCKD *const xd = &x->e_mbd; + MACROBLOCKD * const xd = &x->e_mbd; switch (bsize) { case BLOCK_SIZE_SB64X64: @@ -704,7 +688,7 @@ static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index]; default: assert(0); - return NULL; + return NULL ; } } @@ -722,48 +706,45 @@ static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x, return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index]; default: assert(0); - return NULL; + return NULL ; } } static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col, ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], - PARTITION_CONTEXT sa[8], - PARTITION_CONTEXT sl[8], + PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8], BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; int p; int bwl = b_width_log2(bsize), bw = 1 << bwl; int bhl = b_height_log2(bsize), bh = 1 << bhl; int mwl = mi_width_log2(bsize), mw = 1 << mwl; int mhl = mi_height_log2(bsize), mh = 1 << mhl; for (p = 0; p < MAX_MB_PLANE; p++) { - vpx_memcpy(cm->above_context[p] + - ((mi_col * 2) >> xd->plane[p].subsampling_x), - a + bw * p, - sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x); - vpx_memcpy(cm->left_context[p] + - ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), - l + bh * p, - sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y); - } + vpx_memcpy( + cm->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x), + a + bw * p, sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x); + vpx_memcpy( + cm->left_context[p] + + ((mi_row & MI_MASK)* 2 >> xd->plane[p].subsampling_y),l + bh * p, + sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y); + } vpx_memcpy(cm->above_seg_context + mi_col, sa, sizeof(PARTITION_CONTEXT) * mw); vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl, - sizeof(PARTITION_CONTEXT) * mh); -} + sizeof(PARTITION_CONTEXT) * mh) + ;} static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, - ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], - ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], - PARTITION_CONTEXT sa[8], - PARTITION_CONTEXT sl[8], - BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; + ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], + ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], + PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8], + BLOCK_SIZE_TYPE bsize) { + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; int p; int bwl = b_width_log2(bsize), bw = 1 << bwl; int bhl = b_height_log2(bsize), bh = 1 << bhl; @@ -772,25 +753,26 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col, // buffer the above/left context information of the block in search. for (p = 0; p < MAX_MB_PLANE; ++p) { - vpx_memcpy(a + bw * p, cm->above_context[p] + - (mi_col * 2 >> xd->plane[p].subsampling_x), - sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x); - vpx_memcpy(l + bh * p, cm->left_context[p] + - ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), - sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y); - } + vpx_memcpy( + a + bw * p, + cm->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x), + sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x); + vpx_memcpy( + l + bh * p, + cm->left_context[p] + + ((mi_row & MI_MASK)* 2 >> xd->plane[p].subsampling_y),sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y); + } vpx_memcpy(sa, cm->above_seg_context + mi_col, sizeof(PARTITION_CONTEXT) * mw); vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK), - sizeof(PARTITION_CONTEXT) * mh); -} + sizeof(PARTITION_CONTEXT) * mh) + ;} -static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, - int mi_row, int mi_col, int output_enabled, - BLOCK_SIZE_TYPE bsize, int sub_index) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; +static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, + int output_enabled, BLOCK_SIZE_TYPE bsize, int sub_index) { + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; @@ -813,12 +795,11 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, } } -static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, - int mi_row, int mi_col, int output_enabled, - BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; +static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, + int output_enabled, BLOCK_SIZE_TYPE bsize) { + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; BLOCK_SIZE_TYPE c1 = BLOCK_SIZE_SB8X8; const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4; int bwl, bhl; @@ -838,17 +819,17 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, if (bsl == bwl && bsl == bhl) { if (output_enabled && bsize >= BLOCK_SIZE_SB8X8) - cpi->partition_count[pl][PARTITION_NONE]++; + cpi->partition_count[pl][PARTITION_NONE]++; encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1); } else if (bsl == bhl && bsl > bwl) { if (output_enabled) cpi->partition_count[pl][PARTITION_VERT]++; - encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); + encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1); } else if (bsl == bwl && bsl > bhl) { if (output_enabled) cpi->partition_count[pl][PARTITION_HORZ]++; - encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); + encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0); encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1); } else { BLOCK_SIZE_TYPE subsize; @@ -869,8 +850,8 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, } } - if (bsize >= BLOCK_SIZE_SB8X8 && - (bsize == BLOCK_SIZE_SB8X8 || bsl == bwl || bsl == bhl)) { + if (bsize >= BLOCK_SIZE_SB8X8 + && (bsize == BLOCK_SIZE_SB8X8 || bsl == bwl || bsl == bhl)) { set_partition_seg_context(cm, xd, mi_row, mi_col); update_partition_context(xd, c1, bsize); } @@ -880,26 +861,28 @@ static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m, BLOCK_SIZE_TYPE bsize) { VP9_COMMON *const cm = &cpi->common; const int mis = cm->mode_info_stride; - int bsl = b_width_log2(bsize); - int bs = (1 << bsl) / 2; // int block_row, block_col; - int row, col; - - // this test function sets the entire macroblock to the same bsize - for (block_row = 0; block_row < 8; block_row += bs) { - for (block_col = 0; block_col < 8; block_col += bs) { - for (row = 0; row < bs; row++) { - for (col = 0; col < bs; col++) { - m[(block_row+row)*mis + block_col+col].mbmi.sb_type = bsize; - } - } + for (block_row = 0; block_row < 8; ++block_row) { + for (block_col = 0; block_col < 8; ++block_col) { + m[block_row * mis + block_col].mbmi.sb_type = bsize; + } + } +} +static void copy_partitioning(VP9_COMP *cpi, MODE_INFO *m, MODE_INFO *p) { + VP9_COMMON *const cm = &cpi->common; + const int mis = cm->mode_info_stride; + int block_row, block_col; + for (block_row = 0; block_row < 8; ++block_row) { + for (block_col = 0; block_col < 8; ++block_col) { + m[block_row * mis + block_col].mbmi.sb_type = + p[block_row * mis + block_col].mbmi.sb_type; } } } -static void set_block_size(VP9_COMMON *const cm, - MODE_INFO *m, BLOCK_SIZE_TYPE bsize, int mis, - int mi_row, int mi_col) { +static void set_block_size(VP9_COMMON * const cm, MODE_INFO *m, + BLOCK_SIZE_TYPE bsize, int mis, int mi_row, + int mi_col) { int row, col; int bwl = b_width_log2(bsize); int bhl = b_height_log2(bsize); @@ -911,10 +894,11 @@ static void set_block_size(VP9_COMMON *const cm, for (col = 0; col < bs; col++) { if (mi_row + row >= cm->mi_rows || mi_col + col >= cm->mi_cols) continue; - m2[row*mis+col].mbmi.sb_type = bsize; + m2[row * mis + col].mbmi.sb_type = bsize; } } } + typedef struct { int64_t sum_square_error; int64_t sum_error; @@ -922,11 +906,15 @@ typedef struct { int variance; } var; +typedef struct { + var none; + var horz[2]; + var vert[2]; +} partition_variance; + #define VT(TYPE, BLOCKSIZE) \ typedef struct { \ - var none; \ - var horz[2]; \ - var vert[2]; \ + partition_variance vt; \ BLOCKSIZE split[4]; } TYPE; VT(v8x8, var) @@ -934,20 +922,67 @@ VT(v16x16, v8x8) VT(v32x32, v16x16) VT(v64x64, v32x32) +typedef struct { + partition_variance *vt; + var *split[4]; +} vt_node; + typedef enum { V16X16, V32X32, V64X64, } TREE_LEVEL; +static void tree_to_node(void *data, BLOCK_SIZE_TYPE block_size, vt_node *node) { + int i; + switch (block_size) { + case BLOCK_SIZE_SB64X64: { + v64x64 *vt = (v64x64 *) data; + node->vt = &vt->vt; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].vt.none; + break; + } + case BLOCK_SIZE_SB32X32: { + v32x32 *vt = (v32x32 *) data; + node->vt = &vt->vt; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].vt.none; + break; + } + case BLOCK_SIZE_MB16X16: { + v16x16 *vt = (v16x16 *) data; + node->vt = &vt->vt; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i].vt.none; + break; + } + case BLOCK_SIZE_SB8X8: { + v8x8 *vt = (v8x8 *) data; + node->vt = &vt->vt; + for (i = 0; i < 4; i++) + node->split[i] = &vt->split[i]; + break; + } + default: + node->vt = 0; + for (i = 0; i < 4; i++) + node->split[i] = 0; + assert(-1); + } +} + // Set variance values given sum square error, sum error, count. static void fill_variance(var *v, int64_t s2, int64_t s, int c) { v->sum_square_error = s2; v->sum_error = s; v->count = c; - v->variance = 256 - * (v->sum_square_error - v->sum_error * v->sum_error / v->count) - / v->count; + if (c > 0) + v->variance = 256 + * (v->sum_square_error - v->sum_error * v->sum_error / v->count) + / v->count; + else + v->variance = 0; } // Combine 2 variance structures by summing the sum_error, sum_square_error, @@ -956,31 +991,95 @@ void sum_2_variances(var *r, var *a, var*b) { fill_variance(r, a->sum_square_error + b->sum_square_error, a->sum_error + b->sum_error, a->count + b->count); } -// Fill one level of our variance tree, by summing the split sums into each of -// the horizontal, vertical and none from split and recalculating variance. -#define fill_variance_tree(VT) \ - sum_2_variances(VT.horz[0], VT.split[0].none, VT.split[1].none); \ - sum_2_variances(VT.horz[1], VT.split[2].none, VT.split[3].none); \ - sum_2_variances(VT.vert[0], VT.split[0].none, VT.split[2].none); \ - sum_2_variances(VT.vert[1], VT.split[1].none, VT.split[3].none); \ - sum_2_variances(VT.none, VT.vert[0], VT.vert[1]); - -// Set the blocksize in the macroblock info structure if the variance is less -// than our threshold to one of none, horz, vert. -#define set_vt_size(VT, BLOCKSIZE, R, C, ACTION) \ - if (VT.none.variance < threshold) { \ - set_block_size(cm, m, BLOCKSIZE, mis, R, C); \ - ACTION; \ - } \ - if (VT.horz[0].variance < threshold && VT.horz[1].variance < threshold ) { \ - set_block_size(cm, m, get_subsize(BLOCKSIZE, PARTITION_HORZ), mis, R, C); \ - ACTION; \ - } \ - if (VT.vert[0].variance < threshold && VT.vert[1].variance < threshold ) { \ - set_block_size(cm, m, get_subsize(BLOCKSIZE, PARTITION_VERT), mis, R, C); \ - ACTION; \ + +static void fill_variance_tree(void *data, BLOCK_SIZE_TYPE block_size) { + vt_node node; + tree_to_node(data, block_size, &node); + sum_2_variances(&node.vt->horz[0], node.split[0], node.split[1]); + sum_2_variances(&node.vt->horz[1], node.split[2], node.split[3]); + sum_2_variances(&node.vt->vert[0], node.split[0], node.split[2]); + sum_2_variances(&node.vt->vert[1], node.split[1], node.split[3]); + sum_2_variances(&node.vt->none, &node.vt->vert[0], &node.vt->vert[1]); +} + +#if PERFORM_RANDOM_PARTITIONING +static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m, + BLOCK_SIZE_TYPE block_size, int mi_row, + int mi_col, int mi_size) { + VP9_COMMON * const cm = &cpi->common; + vt_node vt; + const int mis = cm->mode_info_stride; + int64_t threshold = 4 * cpi->common.base_qindex * cpi->common.base_qindex; + + tree_to_node(data, block_size, &vt); + + // split none is available only if we have more than half a block size + // in width and height inside the visible image + if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows && + (rand() & 3) < 1) { + set_block_size(cm, m, block_size, mis, mi_row, mi_col); + return 1; + } + + // vertical split is available on all but the bottom border + if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold + && (rand() & 3) < 1) { + set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row, + mi_col); + return 1; } + // horizontal split is available on all but the right border + if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold + && (rand() & 3) < 1) { + set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), mis, mi_row, + mi_col); + return 1; + } + + return 0; +} + +#else + +static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m, + BLOCK_SIZE_TYPE block_size, int mi_row, + int mi_col, int mi_size) { + VP9_COMMON * const cm = &cpi->common; + vt_node vt; + const int mis = cm->mode_info_stride; + int64_t threshold = 50 * cpi->common.base_qindex; + + tree_to_node(data, block_size, &vt); + + // split none is available only if we have more than half a block size + // in width and height inside the visible image + if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows + && vt.vt->none.variance < threshold) { + set_block_size(cm, m, block_size, mis, mi_row, mi_col); + return 1; + } + + // vertical split is available on all but the bottom border + if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold + && vt.vt->vert[1].variance < threshold) { + set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row, + mi_col); + return 1; + } + + // horizontal split is available on all but the right border + if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold + && vt.vt->horz[1].variance < threshold) { + set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), mis, mi_row, + mi_col); + return 1; + } + + return 0; +} +#endif + static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, int mi_col) { VP9_COMMON * const cm = &cpi->common; @@ -993,8 +1092,8 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, v64x64 vt; unsigned char * s; int sp; - const unsigned char * d = xd->plane[0].pre->buf; - int dp = xd->plane[0].pre->stride; + const unsigned char * d; + int dp; int pixels_wide = 64, pixels_high = 64; vpx_memset(&vt, 0, sizeof(vt)); @@ -1014,81 +1113,89 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row, // but this needs more experimentation. threshold = threshold * cpi->common.base_qindex * cpi->common.base_qindex; - // if ( cm->frame_type == KEY_FRAME ) { d = vp9_64x64_zeros; dp = 64; - // } + if (cm->frame_type != KEY_FRAME) { + int_mv nearest_mv, near_mv; + YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[0]; + YV12_BUFFER_CONFIG *second_ref_fb = NULL; + + setup_pre_planes(xd, ref_fb, second_ref_fb, mi_row, mi_col, + xd->scale_factor, xd->scale_factor_uv); + xd->mode_info_context->mbmi.ref_frame[0] = LAST_FRAME; + xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB64X64; + vp9_find_best_ref_mvs(xd, m->mbmi.ref_mvs[m->mbmi.ref_frame[0]], + &nearest_mv, &near_mv); + + xd->mode_info_context->mbmi.mv[0] = nearest_mv; + vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_SIZE_SB64X64); + d = xd->plane[0].dst.buf; + dp = xd->plane[0].dst.stride; + + } // Fill in the entire tree of 8x8 variances for splits. for (i = 0; i < 4; i++) { const int x32_idx = ((i & 1) << 5); const int y32_idx = ((i >> 1) << 5); for (j = 0; j < 4; j++) { - const int x_idx = x32_idx + ((j & 1) << 4); - const int y_idx = y32_idx + ((j >> 1) << 4); - const uint8_t *st = s + y_idx * sp + x_idx; - const uint8_t *dt = d + y_idx * dp + x_idx; - unsigned int sse = 0; - int sum = 0; + const int x16_idx = x32_idx + ((j & 1) << 4); + const int y16_idx = y32_idx + ((j >> 1) << 4); v16x16 *vst = &vt.split[i].split[j]; - sse = sum = 0; - if (x_idx < pixels_wide && y_idx < pixels_high) - vp9_get_sse_sum_8x8(st, sp, dt, dp, &sse, &sum); - fill_variance(&vst->split[0].none, sse, sum, 64); - sse = sum = 0; - if (x_idx + 8 < pixels_wide && y_idx < pixels_high) - vp9_get_sse_sum_8x8(st + 8, sp, dt + 8, dp, &sse, &sum); - fill_variance(&vst->split[1].none, sse, sum, 64); - sse = sum = 0; - if (x_idx < pixels_wide && y_idx + 8 < pixels_high) - vp9_get_sse_sum_8x8(st + 8 * sp, sp, dt + 8 * dp, dp, &sse, &sum); - fill_variance(&vst->split[2].none, sse, sum, 64); - sse = sum = 0; - if (x_idx + 8 < pixels_wide && y_idx + 8 < pixels_high) - vp9_get_sse_sum_8x8(st + 8 * sp + 8, sp, dt + 8 + 8 * dp, dp, &sse, - &sum); - fill_variance(&vst->split[3].none, sse, sum, 64); + for (k = 0; k < 4; k++) { + int x_idx = x16_idx + ((k & 1) << 3); + int y_idx = y16_idx + ((k >> 1) << 3); + unsigned int sse = 0; + int sum = 0; + if (x_idx < pixels_wide && y_idx < pixels_high) + vp9_get_sse_sum_8x8(s + y_idx * sp + x_idx, sp, + d + y_idx * dp + x_idx, dp, &sse, &sum); + fill_variance(&vst->split[k].vt.none, sse, sum, 64); + } } } // Fill the rest of the variance tree by summing the split partition // values. for (i = 0; i < 4; i++) { for (j = 0; j < 4; j++) { - fill_variance_tree(&vt.split[i].split[j]) + fill_variance_tree(&vt.split[i].split[j], BLOCK_SIZE_MB16X16); } - fill_variance_tree(&vt.split[i]) + fill_variance_tree(&vt.split[i], BLOCK_SIZE_SB32X32); } - fill_variance_tree(&vt) - - // Now go through the entire structure, splitting every blocksize until + fill_variance_tree(&vt, BLOCK_SIZE_SB64X64); + // Now go through the entire structure, splitting every block size until // we get to one that's got a variance lower than our threshold, or we // hit 8x8. - set_vt_size( vt, BLOCK_SIZE_SB64X64, mi_row, mi_col, return); - for (i = 0; i < 4; ++i) { - const int x32_idx = ((i & 1) << 2); - const int y32_idx = ((i >> 1) << 2); - set_vt_size(vt, BLOCK_SIZE_SB32X32, mi_row + y32_idx, mi_col + x32_idx, - continue); - - for (j = 0; j < 4; ++j) { - const int x16_idx = ((j & 1) << 1); - const int y16_idx = ((j >> 1) << 1); - set_vt_size(vt, BLOCK_SIZE_MB16X16, mi_row + y32_idx + y16_idx, - mi_col+x32_idx+x16_idx, continue); - - for (k = 0; k < 4; ++k) { - const int x8_idx = (k & 1); - const int y8_idx = (k >> 1); - set_block_size(cm, m, BLOCK_SIZE_SB8X8, mis, - mi_row + y32_idx + y16_idx + y8_idx, - mi_col + x32_idx + x16_idx + x8_idx); + if (!set_vt_partitioning(cpi, &vt, m, BLOCK_SIZE_SB64X64, mi_row, mi_col, + 4)) { + for (i = 0; i < 4; ++i) { + const int x32_idx = ((i & 1) << 2); + const int y32_idx = ((i >> 1) << 2); + if (!set_vt_partitioning(cpi, &vt.split[i], m, BLOCK_SIZE_SB32X32, + (mi_row + y32_idx), (mi_col + x32_idx), 2)) { + for (j = 0; j < 4; ++j) { + const int x16_idx = ((j & 1) << 1); + const int y16_idx = ((j >> 1) << 1); + if (!set_vt_partitioning(cpi, &vt.split[i].split[j], m, + BLOCK_SIZE_MB16X16, + (mi_row + y32_idx + y16_idx), + (mi_col + x32_idx + x16_idx), 1)) { + for (k = 0; k < 4; ++k) { + const int x8_idx = (k & 1); + const int y8_idx = (k >> 1); + set_block_size(cm, m, BLOCK_SIZE_SB8X8, mis, + (mi_row + y32_idx + y16_idx + y8_idx), + (mi_col + x32_idx + x16_idx + x8_idx)); + } + } + } } } } } static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize, - int *rate, int *dist) { + int *rate, int64_t *dist) { VP9_COMMON * const cm = &cpi->common; MACROBLOCK * const x = &cpi->mb; MACROBLOCKD *xd = &cpi->mb.e_mbd; @@ -1098,18 +1205,18 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, int bsl = b_width_log2(bsize); int bh = (1 << bhl); int bs = (1 << bsl); - int bss = (1 << bsl)/4; + int bss = (1 << bsl) / 4; int i, pl; PARTITION_TYPE partition; BLOCK_SIZE_TYPE subsize; ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; - int r = 0, d = 0; + int r = 0; + int64_t d = 0; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - // parse the partition type if ((bwl == bsl) && (bhl == bsl)) partition = PARTITION_NONE; @@ -1124,18 +1231,15 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, subsize = get_subsize(bsize, partition); - // TODO(JBB): this restriction is here because pick_sb_modes can return - // r's that are INT_MAX meaning we can't select a mode / mv for this block. - // when the code is made to work for less than sb8x8 we need to come up with - // a solution to this problem. - assert(subsize >= BLOCK_SIZE_SB8X8); - - if (bsize >= BLOCK_SIZE_SB8X8) { - xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK); - xd->above_seg_context = cm->above_seg_context + mi_col; + if (bsize < BLOCK_SIZE_SB8X8) { + if (xd->ab_index != 0) { + *rate = 0; + *dist = 0; + return; + } + } else { *(get_sb_partitioning(x, bsize)) = subsize; } - pl = partition_plane_context(xd, bsize); save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); switch (partition) { @@ -1149,7 +1253,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize, get_block_context(x, subsize)); if (mi_row + (bh >> 1) <= cm->mi_rows) { - int rt, dt; + int rt; + int64_t dt; update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); *(get_sb_index(xd, subsize)) = 1; @@ -1167,7 +1272,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize, get_block_context(x, subsize)); if (mi_col + (bs >> 1) <= cm->mi_cols) { - int rt, dt; + int rt; + int64_t dt; update_state(cpi, get_block_context(x, subsize), subsize, 0); encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); *(get_sb_index(xd, subsize)) = 1; @@ -1186,7 +1292,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, int x_idx = (i & 1) * (bs >> 2); int y_idx = (i >> 1) * (bs >> 2); int jj = i >> 1, ii = i & 0x01; - int rt, dt; + int rt; + int64_t dt; if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; @@ -1206,17 +1313,6 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, assert(0); } - // update partition context -#if CONFIG_AB4X4 - if (bsize >= BLOCK_SIZE_SB8X8 && - (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) { -#else - if (bsize > BLOCK_SIZE_SB8X8 - && (bsize == BLOCK_SIZE_MB16X16 || partition != PARTITION_SPLIT)) { -#endif - set_partition_seg_context(cm, xd, mi_row, mi_col); - update_partition_context(xd, subsize, bsize); - } restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); if (r < INT_MAX && d < INT_MAX) @@ -1229,21 +1325,21 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp, // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are // unlikely to be selected depending on previously rate-distortion optimization // results, for encoding speed-up. -static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, - int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize, - int *rate, int *dist) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; +static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, + int mi_col, BLOCK_SIZE_TYPE bsize, int *rate, + int64_t *dist) { + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; int bsl = b_width_log2(bsize), bs = 1 << bsl; int ms = bs / 2; - ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; + ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; PARTITION_CONTEXT sl[8], sa[8]; TOKENEXTRA *tp_orig = *tp; int i, pl; BLOCK_SIZE_TYPE subsize; - int srate = INT_MAX, sdist = INT_MAX; + int srate = INT_MAX; + int64_t sdist = INT_MAX; if (bsize < BLOCK_SIZE_SB8X8) if (xd->ab_index != 0) { @@ -1256,121 +1352,132 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); // PARTITION_SPLIT - if (bsize >= BLOCK_SIZE_SB8X8) { - int r4 = 0, d4 = 0; - subsize = get_subsize(bsize, PARTITION_SPLIT); - *(get_sb_partitioning(x, bsize)) = subsize; + if (!cpi->sf.use_partitions_greater_than + || (cpi->sf.use_partitions_greater_than + && bsize > cpi->sf.greater_than_block_size)) { + if (bsize >= BLOCK_SIZE_SB8X8) { + int r4 = 0; + int64_t d4 = 0; + subsize = get_subsize(bsize, PARTITION_SPLIT); + *(get_sb_partitioning(x, bsize)) = subsize; - for (i = 0; i < 4; ++i) { - int x_idx = (i & 1) * (ms >> 1); - int y_idx = (i >> 1) * (ms >> 1); - int r = 0, d = 0; + for (i = 0; i < 4; ++i) { + int x_idx = (i & 1) * (ms >> 1); + int y_idx = (i >> 1) * (ms >> 1); + int r = 0; + int64_t d = 0; - if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) - continue; + if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) + continue; - *(get_sb_index(xd, subsize)) = i; - rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, - &r, &d); + *(get_sb_index(xd, subsize)) = i; + rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, &r, + &d); - r4 += r; - d4 += d; + r4 += r; + d4 += d; + } + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + if (r4 < INT_MAX) + r4 += x->partition_cost[pl][PARTITION_SPLIT]; + assert(r4 >= 0); + assert(d4 >= 0); + srate = r4; + sdist = d4; + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - if (r4 < INT_MAX) - r4 += x->partition_cost[pl][PARTITION_SPLIT]; - assert(r4 >= 0); - assert(d4 >= 0); - srate = r4; - sdist = d4; - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - - // PARTITION_HORZ - if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) { - int r2, d2; - int r = 0, d = 0; - subsize = get_subsize(bsize, PARTITION_HORZ); - *(get_sb_index(xd, subsize)) = 0; - pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize, - get_block_context(x, subsize)); - - if (mi_row + (ms >> 1) < cm->mi_rows) { - update_state(cpi, get_block_context(x, subsize), subsize, 0); - encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - - *(get_sb_index(xd, subsize)) = 1; - pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &r, &d, subsize, + if (!cpi->sf.use_partitions_less_than + || (cpi->sf.use_partitions_less_than + && bsize <= cpi->sf.less_than_block_size)) { + // PARTITION_HORZ + if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) { + int r2, r = 0; + int64_t d2, d = 0; + subsize = get_subsize(bsize, PARTITION_HORZ); + *(get_sb_index(xd, subsize)) = 0; + pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize, get_block_context(x, subsize)); - r2 += r; - d2 += d; - } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - if (r2 < INT_MAX) - r2 += x->partition_cost[pl][PARTITION_HORZ]; - if (RDCOST(x->rdmult, x->rddiv, r2, d2) < - RDCOST(x->rdmult, x->rddiv, srate, sdist)) { - srate = r2; - sdist = d2; - *(get_sb_partitioning(x, bsize)) = subsize; + + if (mi_row + (ms >> 1) < cm->mi_rows) { + update_state(cpi, get_block_context(x, subsize), subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); + + *(get_sb_index(xd, subsize)) = 1; + pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &r, &d, subsize, + get_block_context(x, subsize)); + r2 += r; + d2 += d; + } + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + if (r2 < INT_MAX) + r2 += x->partition_cost[pl][PARTITION_HORZ]; + if (RDCOST(x->rdmult, x->rddiv, r2, d2) + < RDCOST(x->rdmult, x->rddiv, srate, sdist)) { + srate = r2; + sdist = d2; + *(get_sb_partitioning(x, bsize)) = subsize; + } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - } - // PARTITION_VERT - if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) { - int r2, d2; - subsize = get_subsize(bsize, PARTITION_VERT); - *(get_sb_index(xd, subsize)) = 0; - pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize, - get_block_context(x, subsize)); - if (mi_col + (ms >> 1) < cm->mi_cols) { - int r = 0, d = 0; - update_state(cpi, get_block_context(x, subsize), subsize, 0); - encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - - *(get_sb_index(xd, subsize)) = 1; - pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &r, &d, subsize, + // PARTITION_VERT + if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) { + int r2; + int64_t d2; + subsize = get_subsize(bsize, PARTITION_VERT); + *(get_sb_index(xd, subsize)) = 0; + pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize, get_block_context(x, subsize)); - r2 += r; - d2 += d; - } - set_partition_seg_context(cm, xd, mi_row, mi_col); - pl = partition_plane_context(xd, bsize); - if (r2 < INT_MAX) - r2 += x->partition_cost[pl][PARTITION_VERT]; - if (RDCOST(x->rdmult, x->rddiv, r2, d2) < - RDCOST(x->rdmult, x->rddiv, srate, sdist)) { - srate = r2; - sdist = d2; - *(get_sb_partitioning(x, bsize)) = subsize; - } - restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); - } + if (mi_col + (ms >> 1) < cm->mi_cols) { + int r = 0; + int64_t d = 0; + update_state(cpi, get_block_context(x, subsize), subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); - // PARTITION_NONE - if ((mi_row + (ms >> 1) < cm->mi_rows) && - (mi_col + (ms >> 1) < cm->mi_cols)) { - int r, d; - pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize, - get_block_context(x, bsize)); - if (bsize >= BLOCK_SIZE_SB8X8) { + *(get_sb_index(xd, subsize)) = 1; + pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &r, &d, subsize, + get_block_context(x, subsize)); + r2 += r; + d2 += d; + } set_partition_seg_context(cm, xd, mi_row, mi_col); pl = partition_plane_context(xd, bsize); - r += x->partition_cost[pl][PARTITION_NONE]; + if (r2 < INT_MAX) + r2 += x->partition_cost[pl][PARTITION_VERT]; + if (RDCOST(x->rdmult, x->rddiv, r2, d2) + < RDCOST(x->rdmult, x->rddiv, srate, sdist)) { + srate = r2; + sdist = d2; + *(get_sb_partitioning(x, bsize)) = subsize; + } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); } - if (RDCOST(x->rdmult, x->rddiv, r, d) < - RDCOST(x->rdmult, x->rddiv, srate, sdist)) { - srate = r; - sdist = d; - if (bsize >= BLOCK_SIZE_SB8X8) - *(get_sb_partitioning(x, bsize)) = bsize; + // PARTITION_NONE + if ((mi_row + (ms >> 1) < cm->mi_rows) && + (mi_col + (ms >> 1) < cm->mi_cols)) { + int r; + int64_t d; + pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize, + get_block_context(x, bsize)); + if (bsize >= BLOCK_SIZE_SB8X8) { + set_partition_seg_context(cm, xd, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + r += x->partition_cost[pl][PARTITION_NONE]; + } + + if (RDCOST(x->rdmult, x->rddiv, r, d) + < RDCOST(x->rdmult, x->rddiv, srate, sdist)) { + srate = r; + sdist = d; + if (bsize >= BLOCK_SIZE_SB8X8) + *(get_sb_partitioning(x, bsize)) = bsize; + } } } - *rate = srate; *dist = sdist; @@ -1388,9 +1495,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, } } -static void encode_sb_row(VP9_COMP *cpi, int mi_row, - TOKENEXTRA **tp, int *totalrate) { - VP9_COMMON *const cm = &cpi->common; +static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp, + int *totalrate) { + VP9_COMMON * const cm = &cpi->common; int mi_col; // Initialize the left context for the new SB row @@ -1398,27 +1505,49 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row, vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context)); // Code each SB in the row - for (mi_col = cm->cur_tile_mi_col_start; - mi_col < cm->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE) { - int dummy_rate, dummy_dist; - if (cpi->speed < 5) { - rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, - &dummy_rate, &dummy_dist); - } else { + for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end; + mi_col += 64 / MI_SIZE) { + int dummy_rate; + int64_t dummy_dist; + if (cpi->sf.partition_by_variance || cpi->sf.use_lastframe_partitioning || + cpi->sf.use_one_partition_size_always ) { const int idx_str = cm->mode_info_stride * mi_row + mi_col; MODE_INFO *m = cm->mi + idx_str; - // set_partitioning(cpi, m, BLOCK_SIZE_SB64X64); - choose_partitioning(cpi, cm->mi, mi_row, mi_col); - rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, - &dummy_rate, &dummy_dist); + MODE_INFO *p = cm->prev_mi + idx_str; + + if (cpi->sf.use_one_partition_size_always) { + set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_SB64X64); + set_partitioning(cpi, m, cpi->sf.always_this_block_size); + rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + &dummy_rate, &dummy_dist); + } else if (cpi->sf.partition_by_variance) { + choose_partitioning(cpi, cm->mi, mi_row, mi_col); + rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + &dummy_rate, &dummy_dist); + } else { + if ((cpi->common.current_video_frame & 1) == 0 || cm->prev_mi == 0 + || cpi->common.show_frame == 0 + || cpi->common.frame_type == KEY_FRAME + || cpi->is_src_frame_alt_ref) { + rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + &dummy_rate, &dummy_dist); + } else { + copy_partitioning(cpi, m, p); + rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + &dummy_rate, &dummy_dist); + } + } + } else { + rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + &dummy_rate, &dummy_dist); } } } static void init_encode_frame_mb_context(VP9_COMP *cpi) { - MACROBLOCK *const x = &cpi->mb; - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; + MACROBLOCK * const x = &cpi->mb; + VP9_COMMON * const cm = &cpi->common; + MACROBLOCKD * const xd = &x->e_mbd; x->act_zbin_adj = 0; cpi->seg0_idx = 0; @@ -1438,7 +1567,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { // TODO(jkoleszar): are these initializations required? setup_pre_planes(xd, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]], NULL, - 0, 0, NULL, NULL); + 0, 0, NULL, NULL ); setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0); vp9_build_block_offsets(x); @@ -1463,36 +1592,36 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { // Note: this memset assumes above_context[0], [1] and [2] // are allocated as part of the same buffer. - vpx_memset(cm->above_context[0], 0, sizeof(ENTROPY_CONTEXT) * 2 * - MAX_MB_PLANE * mi_cols_aligned_to_sb(cm)); - vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) * - mi_cols_aligned_to_sb(cm)); + vpx_memset( + cm->above_context[0], 0, + sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE * mi_cols_aligned_to_sb(cm)); + vpx_memset(cm->above_seg_context, 0, + sizeof(PARTITION_CONTEXT) * mi_cols_aligned_to_sb(cm)); } static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { if (lossless) { - cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4; - cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4; - cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add; - cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add; - cpi->mb.optimize = 0; - cpi->common.filter_level = 0; - cpi->zbin_mode_boost_enabled = 0; - cpi->common.txfm_mode = ONLY_4X4; + cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4; + cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4; + cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add; + cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add; + cpi->mb.optimize = 0; + cpi->common.filter_level = 0; + cpi->zbin_mode_boost_enabled = 0; + cpi->common.txfm_mode = ONLY_4X4; } else { - cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4; - cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; - cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add; - cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add; + cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4; + cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; + cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add; + cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add; } } - static void encode_frame_internal(VP9_COMP *cpi) { int mi_row; - MACROBLOCK *const x = &cpi->mb; - VP9_COMMON *const cm = &cpi->common; - MACROBLOCKD *const xd = &x->e_mbd; + MACROBLOCK * const x = &cpi->mb; + VP9_COMMON * const cm = &cpi->common; + MACROBLOCKD * const xd = &x->e_mbd; int totalrate; // fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n", @@ -1524,10 +1653,8 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_zero(cpi->coef_counts); vp9_zero(cm->fc.eob_branch_counts); - cpi->mb.e_mbd.lossless = cm->base_qindex == 0 && - cm->y_dc_delta_q == 0 && - cm->uv_dc_delta_q == 0 && - cm->uv_ac_delta_q == 0; + cpi->mb.e_mbd.lossless = cm->base_qindex == 0 && cm->y_dc_delta_q == 0 + && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0; switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless); vp9_frame_init_quantizer(cpi); @@ -1553,7 +1680,7 @@ static void encode_frame_internal(VP9_COMP *cpi) { set_prev_mi(cm); { - struct vpx_usec_timer emr_timer; + struct vpx_usec_timer emr_timer; vpx_usec_timer_start(&emr_timer); { @@ -1570,9 +1697,9 @@ static void encode_frame_internal(VP9_COMP *cpi) { // For each row of SBs in the frame vp9_get_tile_col_offsets(cm, tile_col); for (mi_row = cm->cur_tile_mi_row_start; - mi_row < cm->cur_tile_mi_row_end; - mi_row += 8) + mi_row < cm->cur_tile_mi_row_end; mi_row += 8) encode_sb_row(cpi, mi_row, &tp, &totalrate); + cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old); assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols)); @@ -1602,9 +1729,8 @@ static int check_dual_ref_flags(VP9_COMP *cpi) { if (vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) { return 0; } else { - return (!!(ref_flags & VP9_GOLD_FLAG) + - !!(ref_flags & VP9_LAST_FLAG) + - !!(ref_flags & VP9_ALT_FLAG)) >= 2; + return (!!(ref_flags & VP9_GOLD_FLAG) + !!(ref_flags & VP9_LAST_FLAG) + + !!(ref_flags & VP9_ALT_FLAG)) >= 2; } } @@ -1631,35 +1757,33 @@ static void set_txfm_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs, } } -static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi, - int mis, TX_SIZE txfm_max, - int bw, int bh, int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const cm = &cpi->common; - MB_MODE_INFO *const mbmi = &mi->mbmi; +static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi, int mis, + TX_SIZE txfm_max, int bw, int bh, int mi_row, + int mi_col, BLOCK_SIZE_TYPE bsize) { + VP9_COMMON * const cm = &cpi->common; + MB_MODE_INFO * const mbmi = &mi->mbmi; if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; if (mbmi->txfm_size > txfm_max) { - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; const int segment_id = mbmi->segment_id; const int ymbs = MIN(bh, cm->mi_rows - mi_row); const int xmbs = MIN(bw, cm->mi_cols - mi_col); xd->mode_info_context = mi; - assert(vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) || - get_skip_flag(mi, mis, ymbs, xmbs)); + assert( + vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) || get_skip_flag(mi, mis, ymbs, xmbs)); set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max); } } static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi, - TX_SIZE txfm_max, - int mi_row, int mi_col, + TX_SIZE txfm_max, int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const cm = &cpi->common; + VP9_COMMON * const cm = &cpi->common; const int mis = cm->mode_info_stride; int bwl, bhl; const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1); @@ -1671,18 +1795,18 @@ static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi, bhl = mi_height_log2(mi->mbmi.sb_type); if (bwl == bsl && bhl == bsl) { - reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl, - mi_row, mi_col, bsize); + reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl, mi_row, + mi_col, bsize); } else if (bwl == bsl && bhl < bsl) { - reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs, - mi_row, mi_col, bsize); + reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs, mi_row, mi_col, + bsize); reset_skip_txfm_size_b(cpi, mi + bs * mis, mis, txfm_max, 1 << bsl, bs, mi_row + bs, mi_col, bsize); } else if (bwl < bsl && bhl == bsl) { - reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl, - mi_row, mi_col, bsize); - reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl, - mi_row, mi_col + bs, bsize); + reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl, mi_row, mi_col, + bsize); + reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl, mi_row, + mi_col + bs, bsize); } else { BLOCK_SIZE_TYPE subsize; int n; @@ -1700,32 +1824,30 @@ static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi, for (n = 0; n < 4; n++) { const int y_idx = n >> 1, x_idx = n & 0x01; - reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs, - txfm_max, mi_row + y_idx * bs, - mi_col + x_idx * bs, subsize); + reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs, txfm_max, + mi_row + y_idx * bs, mi_col + x_idx * bs, + subsize); } } } static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) { - VP9_COMMON *const cm = &cpi->common; + VP9_COMMON * const cm = &cpi->common; int mi_row, mi_col; const int mis = cm->mode_info_stride; MODE_INFO *mi, *mi_ptr = cm->mi; - for (mi_row = 0; mi_row < cm->mi_rows; - mi_row += 8, mi_ptr += 8 * mis) { + for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8, mi_ptr += 8 * mis) { mi = mi_ptr; - for (mi_col = 0; mi_col < cm->mi_cols; - mi_col += 8, mi += 8) { - reset_skip_txfm_size_sb(cpi, mi, txfm_max, - mi_row, mi_col, BLOCK_SIZE_SB64X64); + for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8, mi += 8) { + reset_skip_txfm_size_sb(cpi, mi, txfm_max, mi_row, mi_col, + BLOCK_SIZE_SB64X64); } } } void vp9_encode_frame(VP9_COMP *cpi) { - VP9_COMMON *const cm = &cpi->common; + VP9_COMMON * const cm = &cpi->common; // In the longer term the encoder should be generalized to match the // decoder such that we allow compound where one of the 3 buffers has a @@ -1733,10 +1855,10 @@ void vp9_encode_frame(VP9_COMP *cpi) { // requires further work in the rd loop. For now the only supported encoder // side behaviour is where the ALT ref buffer has oppositie sign bias to // the other two. - if ((cm->ref_frame_sign_bias[ALTREF_FRAME] == - cm->ref_frame_sign_bias[GOLDEN_FRAME]) || - (cm->ref_frame_sign_bias[ALTREF_FRAME] == - cm->ref_frame_sign_bias[LAST_FRAME])) { + if ((cm->ref_frame_sign_bias[ALTREF_FRAME] + == cm->ref_frame_sign_bias[GOLDEN_FRAME]) + || (cm->ref_frame_sign_bias[ALTREF_FRAME] + == cm->ref_frame_sign_bias[LAST_FRAME])) { cm->allow_comp_inter_inter = 0; } else { cm->allow_comp_inter_inter = 1; @@ -1770,14 +1892,14 @@ void vp9_encode_frame(VP9_COMP *cpi) { /* prediction (compound, single or hybrid) mode selection */ if (frame_type == 3 || !cm->allow_comp_inter_inter) pred_type = SINGLE_PREDICTION_ONLY; - else if (cpi->rd_prediction_type_threshes[frame_type][1] > - cpi->rd_prediction_type_threshes[frame_type][0] && - cpi->rd_prediction_type_threshes[frame_type][1] > - cpi->rd_prediction_type_threshes[frame_type][2] && - check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100) + else if (cpi->rd_prediction_type_threshes[frame_type][1] + > cpi->rd_prediction_type_threshes[frame_type][0] + && cpi->rd_prediction_type_threshes[frame_type][1] + > cpi->rd_prediction_type_threshes[frame_type][2] + && check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100) pred_type = COMP_PREDICTION_ONLY; - else if (cpi->rd_prediction_type_threshes[frame_type][0] > - cpi->rd_prediction_type_threshes[frame_type][2]) + else if (cpi->rd_prediction_type_threshes[frame_type][0] + > cpi->rd_prediction_type_threshes[frame_type][2]) pred_type = SINGLE_PREDICTION_ONLY; else pred_type = HYBRID_PREDICTION; @@ -1790,43 +1912,44 @@ void vp9_encode_frame(VP9_COMP *cpi) { cpi->mb.e_mbd.lossless = 1; } else #if 0 - /* FIXME (rbultje): this code is disabled until we support cost updates - * while a frame is being encoded; the problem is that each time we - * "revert" to 4x4 only (or even 8x8 only), the coefficient probabilities - * for 16x16 (and 8x8) start lagging behind, thus leading to them lagging - * further behind and not being chosen for subsequent frames either. This - * is essentially a local minimum problem that we can probably fix by - * estimating real costs more closely within a frame, perhaps by re- - * calculating costs on-the-fly as frame encoding progresses. */ - if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] > - cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] && - cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] > - cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] && - cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] > - cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) { - txfm_type = TX_MODE_SELECT; - } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] > - cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8] - && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] > - cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] - ) { - txfm_type = ONLY_4X4; - } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >= - cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) { - txfm_type = ALLOW_16X16; - } else + /* FIXME (rbultje): this code is disabled until we support cost updates + * while a frame is being encoded; the problem is that each time we + * "revert" to 4x4 only (or even 8x8 only), the coefficient probabilities + * for 16x16 (and 8x8) start lagging behind, thus leading to them lagging + * further behind and not being chosen for subsequent frames either. This + * is essentially a local minimum problem that we can probably fix by + * estimating real costs more closely within a frame, perhaps by re- + * calculating costs on-the-fly as frame encoding progresses. */ + if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] > + cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] && + cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] > + cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] && + cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] > + cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) { + txfm_type = TX_MODE_SELECT; + } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] > + cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8] + && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] > + cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] + ) { + txfm_type = ONLY_4X4; + } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >= + cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) { + txfm_type = ALLOW_16X16; + } else txfm_type = ALLOW_8X8; #else - txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] > - cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ? - ALLOW_32X32 : TX_MODE_SELECT; + txfm_type = + cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] + > cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ? + ALLOW_32X32 : TX_MODE_SELECT; #endif cpi->common.txfm_mode = txfm_type; cpi->common.comp_pred_mode = pred_type; encode_frame_internal(cpi); for (i = 0; i < NB_PREDICTION_TYPES; ++i) { - const int diff = (int)(cpi->rd_comp_pred_diff[i] / cpi->common.MBs); + const int diff = (int) (cpi->rd_comp_pred_diff[i] / cpi->common.MBs); cpi->rd_prediction_type_threshes[frame_type][i] += diff; cpi->rd_prediction_type_threshes[frame_type][i] >>= 1; } @@ -1836,8 +1959,8 @@ void vp9_encode_frame(VP9_COMP *cpi) { int diff; if (i == TX_MODE_SELECT) pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv, - 2048 * (TX_SIZE_MAX_SB - 1), 0); - diff = (int)(pd / cpi->common.MBs); + 2048 * (TX_SIZE_MAX_SB - 1), 0); + diff = (int) (pd / cpi->common.MBs); cpi->rd_tx_select_threshes[frame_type][i] += diff; cpi->rd_tx_select_threshes[frame_type][i] /= 2; } @@ -1890,12 +2013,12 @@ void vp9_encode_frame(VP9_COMP *cpi) { for (i = 0; i < TX_SIZE_CONTEXTS; i++) count32x32 += cm->fc.tx_count_32x32p[i][TX_32X32]; - if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 && - count32x32 == 0) { + if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 + && count32x32 == 0) { cpi->common.txfm_mode = ALLOW_8X8; reset_skip_txfm_size(cpi, TX_8X8); - } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 && - count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) { + } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 + && count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) { cpi->common.txfm_mode = ONLY_4X4; reset_skip_txfm_size(cpi, TX_4X4); } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) { @@ -1957,18 +2080,17 @@ static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) { b = 4 * act + cpi->activity_avg; if (act > cpi->activity_avg) - x->act_zbin_adj = (int)(((int64_t)b + (a >> 1)) / a) - 1; + x->act_zbin_adj = (int) (((int64_t) b + (a >> 1)) / a) - 1; else - x->act_zbin_adj = 1 - (int)(((int64_t)a + (b >> 1)) / b); + x->act_zbin_adj = 1 - (int) (((int64_t) a + (b >> 1)) / b); #endif } -static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, - int output_enabled, int mi_row, int mi_col, - BLOCK_SIZE_TYPE bsize) { - VP9_COMMON *const cm = &cpi->common; - MACROBLOCK *const x = &cpi->mb; - MACROBLOCKD *const xd = &x->e_mbd; +static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled, + int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) { + VP9_COMMON * const cm = &cpi->common; + MACROBLOCK * const x = &cpi->mb; + MACROBLOCKD * const xd = &x->e_mbd; int n; MODE_INFO *mi = xd->mode_info_context; MB_MODE_INFO *mbmi = &mi->mbmi; @@ -2015,10 +2137,10 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, } if (mbmi->ref_frame[0] == INTRA_FRAME) { - vp9_encode_intra_block_y(cm, x, (bsize < BLOCK_SIZE_SB8X8) ? - BLOCK_SIZE_SB8X8 : bsize); - vp9_encode_intra_block_uv(cm, x, (bsize < BLOCK_SIZE_SB8X8) ? - BLOCK_SIZE_SB8X8 : bsize); + vp9_encode_intra_block_y( + cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); + vp9_encode_intra_block_uv( + cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); if (output_enabled) sum_intra_stats(cpi, x); } else { @@ -2032,12 +2154,12 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, assert(cm->frame_type != KEY_FRAME); - setup_pre_planes(xd, ref_fb, second_ref_fb, - mi_row, mi_col, xd->scale_factor, xd->scale_factor_uv); + setup_pre_planes(xd, ref_fb, second_ref_fb, mi_row, mi_col, + xd->scale_factor, xd->scale_factor_uv); - vp9_build_inter_predictors_sb(xd, mi_row, mi_col, - bsize < BLOCK_SIZE_SB8X8 ? BLOCK_SIZE_SB8X8 - : bsize); + vp9_build_inter_predictors_sb( + xd, mi_row, mi_col, + bsize < BLOCK_SIZE_SB8X8 ? BLOCK_SIZE_SB8X8 : bsize); } if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) { @@ -2049,14 +2171,14 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); } else { // FIXME(rbultje): not tile-aware (mi - 1) - int mb_skip_context = - (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff; + int mb_skip_context = (mi - 1)->mbmi.mb_skip_coeff + + (mi - mis)->mbmi.mb_skip_coeff; mbmi->mb_skip_coeff = 1; if (output_enabled) cm->fc.mbskip_count[mb_skip_context][1]++; - vp9_reset_sb_tokens_context(xd, - (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); + vp9_reset_sb_tokens_context( + xd, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize); } // copy skip flag on all mb_mode_info contexts in this SB @@ -2068,10 +2190,10 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, } if (output_enabled) { - if (cm->txfm_mode == TX_MODE_SELECT && - mbmi->sb_type >= BLOCK_SIZE_SB8X8 && - !(mbmi->ref_frame[0] != INTRA_FRAME && (mbmi->mb_skip_coeff || - vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) { + if (cm->txfm_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_SIZE_SB8X8 + && !(mbmi->ref_frame[0] != INTRA_FRAME + && (mbmi->mb_skip_coeff + || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) { const int context = vp9_get_pred_context(cm, xd, PRED_TX_SIZE); if (bsize >= BLOCK_SIZE_SB32X32) { cm->fc.tx_count_32x32p[context][mbmi->txfm_size]++; @@ -2083,7 +2205,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, } else { int x, y; TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_32X32 : cm->txfm_mode; - // The new intra coding scheme requires no change of transform size + // The new intra coding scheme requires no change of transform size if (mi->mbmi.ref_frame[0] != INTRA_FRAME) { if (sz == TX_32X32 && bsize < BLOCK_SIZE_SB32X32) sz = TX_16X16; diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 4f45496df..2f133ccbc 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -22,10 +22,10 @@ DECLARE_ALIGNED(16, extern const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]); -void vp9_subtract_block(int rows, int cols, - int16_t *diff_ptr, int diff_stride, - const uint8_t *src_ptr, int src_stride, - const uint8_t *pred_ptr, int pred_stride) { +void vp9_subtract_block_c(int rows, int cols, + int16_t *diff_ptr, ptrdiff_t diff_stride, + const uint8_t *src_ptr, ptrdiff_t src_stride, + const uint8_t *pred_ptr, ptrdiff_t pred_stride) { int r, c; for (r = 0; r < rows; r++) { diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h index 579690346..3042c9f7f 100644 --- a/vp9/encoder/vp9_encodemb.h +++ b/vp9/encoder/vp9_encodemb.h @@ -42,10 +42,6 @@ void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); void vp9_xform_quant_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); void vp9_xform_quant_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); -void vp9_subtract_block(int rows, int cols, - int16_t *diff_ptr, int diff_stride, - const uint8_t *src_ptr, int src_stride, - const uint8_t *pred_ptr, int pred_stride); void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize); void vp9_subtract_sb(MACROBLOCK *xd, BLOCK_SIZE_TYPE bsize); diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c index a582d183d..ea6aa296a 100644 --- a/vp9/encoder/vp9_encodemv.c +++ b/vp9/encoder/vp9_encodemv.c @@ -541,7 +541,7 @@ void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref, const MV diff = {mv->row - ref->row, mv->col - ref->col}; const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff); - usehp = usehp && vp9_use_nmv_hp(ref); + usehp = usehp && vp9_use_mv_hp(ref); write_token(w, vp9_mv_joint_tree, mvctx->joints, &vp9_mv_joint_encodings[j]); if (mv_joint_vertical(j)) diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 5e26cd82a..522f89982 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -986,9 +986,11 @@ static int estimate_max_q(VP9_COMP *cpi, // Corrections for higher compression speed settings // (reduced compression expected) + // FIXME(jimbankoski): Once we settle on vp9 speed features we need to + // change this code. if (cpi->compressor_speed == 1) speed_correction = cpi->oxcf.cpu_used <= 5 ? - 1.04 + (cpi->oxcf.cpu_used * 0.04) : + 1.04 + (/*cpi->oxcf.cpu_used*/0 * 0.04) : 1.25; // Try and pick a max Q that will be high enough to encode the @@ -1051,7 +1053,7 @@ static int estimate_cq(VP9_COMP *cpi, // (reduced compression expected) if (cpi->compressor_speed == 1) { if (cpi->oxcf.cpu_used <= 5) - speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04); + speed_correction = 1.04 + (/*cpi->oxcf.cpu_used*/ 0 * 0.04); else speed_correction = 1.25; } diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 2e99736ce..0f1062313 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -366,7 +366,7 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, } if (xd->allow_high_precision_mv) { - usehp = vp9_use_nmv_hp(&ref_mv->as_mv); + usehp = vp9_use_mv_hp(&ref_mv->as_mv); } else { usehp = 0; } @@ -556,7 +556,7 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, } if (xd->allow_high_precision_mv) { - usehp = vp9_use_nmv_hp(&ref_mv->as_mv); + usehp = vp9_use_mv_hp(&ref_mv->as_mv); } else { usehp = 0; } @@ -930,7 +930,7 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x, } if (x->e_mbd.allow_high_precision_mv) { - usehp = vp9_use_nmv_hp(&ref_mv->as_mv); + usehp = vp9_use_mv_hp(&ref_mv->as_mv); } else { usehp = 0; } diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index 6a14df471..e02e73232 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -591,22 +591,25 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) { sf->thresh_mult[THR_COMP_SPLITLA ] += speed_multiplier * 4500; sf->thresh_mult[THR_COMP_SPLITGA ] += speed_multiplier * 4500; - if (speed > 4) { + if (cpi->sf.skip_lots_of_modes) { for (i = 0; i < MAX_MODES; ++i) sf->thresh_mult[i] = INT_MAX; - sf->thresh_mult[THR_DC ] = 0; - sf->thresh_mult[THR_TM ] = 0; - sf->thresh_mult[THR_NEWMV ] = 4000; - sf->thresh_mult[THR_NEWG ] = 4000; - sf->thresh_mult[THR_NEWA ] = 4000; + sf->thresh_mult[THR_DC] = 0; + sf->thresh_mult[THR_TM] = 0; + sf->thresh_mult[THR_NEWMV] = 4000; + sf->thresh_mult[THR_NEWG] = 4000; + sf->thresh_mult[THR_NEWA] = 4000; sf->thresh_mult[THR_NEARESTMV] = 0; - sf->thresh_mult[THR_NEARESTG ] = 0; - sf->thresh_mult[THR_NEARESTA ] = 0; - sf->thresh_mult[THR_NEARMV ] = 2000; - sf->thresh_mult[THR_NEARG ] = 2000; - sf->thresh_mult[THR_NEARA ] = 2000; + sf->thresh_mult[THR_NEARESTG] = 0; + sf->thresh_mult[THR_NEARESTA] = 0; + sf->thresh_mult[THR_NEARMV] = 2000; + sf->thresh_mult[THR_NEARG] = 2000; + sf->thresh_mult[THR_NEARA] = 2000; sf->thresh_mult[THR_COMP_NEARESTLA] = 2000; + sf->thresh_mult[THR_SPLITMV] = 2500; + sf->thresh_mult[THR_SPLITG] = 2500; + sf->thresh_mult[THR_SPLITA] = 2500; sf->recode_loop = 0; } @@ -681,6 +684,18 @@ void vp9_set_speed_features(VP9_COMP *cpi) { sf->max_step_search_steps = MAX_MVSEARCH_STEPS; sf->comp_inter_joint_search_thresh = BLOCK_SIZE_AB4X4; sf->adpative_rd_thresh = 0; + sf->use_lastframe_partitioning = 0; + sf->use_largest_txform = 0; + sf->use_8tap_always = 0; + sf->use_avoid_tested_higherror = 0; + sf->skip_lots_of_modes = 0; + sf->adjust_thresholds_by_speed = 0; + sf->partition_by_variance = 0; + sf->use_one_partition_size_always = 0; + sf->use_partitions_less_than = 0; + sf->less_than_block_size = BLOCK_SIZE_MB16X16; + sf->use_partitions_greater_than = 0; + sf->greater_than_block_size = BLOCK_SIZE_SB8X8; #if CONFIG_MULTIPLE_ARF // Switch segmentation off. @@ -703,17 +718,51 @@ void vp9_set_speed_features(VP9_COMP *cpi) { #endif sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8; sf->adpative_rd_thresh = 1; - if (speed > 0) { + if (speed == 1) { sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES; sf->optimize_coefficients = 0; sf->first_step = 1; + sf->use_avoid_tested_higherror = 1; + sf->adjust_thresholds_by_speed = 1; } - break; + if (speed == 2) { + sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8; + sf->use_lastframe_partitioning = 1; + sf->first_step = 0; + } + if (speed == 3) { + sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8; + sf->partition_by_variance = 1; + sf->first_step = 0; + } + if (speed == 4) { + sf->first_step = 0; + sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8; + sf->use_one_partition_size_always = 1; + sf->always_this_block_size = BLOCK_SIZE_MB16X16; + } + if (speed == 2) { + sf->first_step = 0; + sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8; + sf->use_partitions_less_than = 1; + sf->less_than_block_size = BLOCK_SIZE_MB16X16; + } + if (speed == 3) { + sf->first_step = 0; + sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8; + sf->use_partitions_greater_than = 1; + sf->greater_than_block_size = BLOCK_SIZE_SB8X8; + } + + break; }; /* switch */ // Set rd thresholds based on mode and speed setting - set_rd_speed_thresholds(cpi, mode, speed); + if(cpi->sf.adjust_thresholds_by_speed) + set_rd_speed_thresholds(cpi, mode, speed); + else + set_rd_speed_thresholds(cpi, mode, 0); // Slow quant, dct and trellis not worthwhile for first pass // so make sure they are always turned off. @@ -2993,7 +3042,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi, !cpi->common.frame_parallel_decoding_mode) { vp9_adapt_mode_probs(&cpi->common); vp9_adapt_mode_context(&cpi->common); - vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv); + vp9_adapt_mv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv); } } @@ -3327,7 +3376,7 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size, vp9_second_pass(cpi); encode_frame_to_data_rate(cpi, size, dest, frame_flags); - + //vp9_print_modes_and_motion_vectors(&cpi->common, "encode.stt"); #ifdef DISABLE_RC_LONG_TERM_MEM cpi->twopass.bits_left -= cpi->this_frame_target; #else diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index f5f1c0772..0811976d0 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -216,6 +216,19 @@ typedef struct { int static_segmentation; int comp_inter_joint_search_thresh; int adpative_rd_thresh; + int use_lastframe_partitioning; + int use_largest_txform; + int use_8tap_always; + int use_avoid_tested_higherror; + int skip_lots_of_modes; + int adjust_thresholds_by_speed; + int partition_by_variance; + int use_one_partition_size_always; + BLOCK_SIZE_TYPE always_this_block_size; + int use_partitions_greater_than; + BLOCK_SIZE_TYPE greater_than_block_size; + int use_partitions_less_than; + BLOCK_SIZE_TYPE less_than_block_size; } SPEED_FEATURES; enum BlockSize { diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 53d8be775..ccbb624b0 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -35,6 +35,153 @@ static void quantize(int16_t *zbin_boost_orig_ptr, uint16_t *eob_ptr, const int *scan, int mul) { int i, rc, eob; + int zbins[2], nzbins[2], zbin; + int x, y, z, sz; + int zero_run = 0; + int16_t *zbin_boost_ptr = zbin_boost_orig_ptr; + int zero_flag = n_coeffs; + + vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); + + eob = -1; + + // Base ZBIN + zbins[0] = zbin_ptr[0] + zbin_oq_value; + zbins[1] = zbin_ptr[1] + zbin_oq_value; + nzbins[0] = zbins[0] * -1; + nzbins[1] = zbins[1] * -1; + + if (!skip_block) { + // Pre-scan pass + for (i = n_coeffs - 1; i >= 0; i--) { + rc = scan[i]; + z = coeff_ptr[rc] * mul; + + if (z < zbins[rc != 0] && z > nzbins[rc != 0]) { + zero_flag--; + } else { + break; + } + } + + // Quantization pass: All coefficients with index >= zero_flag are + // skippable. Note: zero_flag can be zero. + for (i = 0; i < zero_flag; i++) { + rc = scan[i]; + z = coeff_ptr[rc] * mul; + + zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]); + zero_run += (zero_run < 15); + + sz = (z >> 31); // sign of z + x = (z ^ sz) - sz; + + if (x >= zbin) { + x += (round_ptr[rc != 0]); + y = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) + >> quant_shift_ptr[rc != 0]; // quantize (x) + x = (y ^ sz) - sz; // get the sign back + qcoeff_ptr[rc] = x; // write to destination + dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul; // dequantized value + + if (y) { + eob = i; // last nonzero coeffs + zero_run = 0; // set zero_run + } + } + } + } + *eob_ptr = eob + 1; +} + +// This function works well for large transform size. +static void quantize_sparse(int16_t *zbin_boost_orig_ptr, + int16_t *coeff_ptr, int n_coeffs, int skip_block, + int16_t *zbin_ptr, int16_t *round_ptr, + int16_t *quant_ptr, uint8_t *quant_shift_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + int16_t *dequant_ptr, int zbin_oq_value, + uint16_t *eob_ptr, const int *scan, int mul, + int *idx_arr) { + int i, rc, eob; + int zbins[2], pzbins[2], nzbins[2], zbin; + int x, y, z, sz; + int zero_run = 0; + int16_t *zbin_boost_ptr = zbin_boost_orig_ptr; + int idx = 0; + int pre_idx = 0; + + vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); + vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t)); + + eob = -1; + + // Base ZBIN + zbins[0] = zbin_ptr[0] + zbin_oq_value; + zbins[1] = zbin_ptr[1] + zbin_oq_value; + // Positive and negative ZBIN + pzbins[0] = zbins[0]/mul; + pzbins[1] = zbins[1]/mul; + nzbins[0] = pzbins[0] * -1; + nzbins[1] = pzbins[1] * -1; + + if (!skip_block) { + // Pre-scan pass + for (i = 0; i < n_coeffs; i++) { + rc = scan[i]; + z = coeff_ptr[rc]; + + // If the coefficient is out of the base ZBIN range, keep it for + // quantization. + if (z >= pzbins[rc != 0] || z <= nzbins[rc != 0]) + idx_arr[idx++] = i; + } + + // Quantization pass: only process the coefficients selected in + // pre-scan pass. Note: idx can be zero. + for (i = 0; i < idx; i++) { + rc = scan[idx_arr[i]]; + + // Calculate ZBIN + zero_run += idx_arr[i] - pre_idx; + if(zero_run > 15) zero_run = 15; + zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]); + + pre_idx = idx_arr[i]; + z = coeff_ptr[rc] * mul; + sz = (z >> 31); // sign of z + x = (z ^ sz) - sz; // x = abs(z) + + if (x >= zbin) { + x += (round_ptr[rc != 0]); + y = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x)) + >> quant_shift_ptr[rc != 0]; // quantize (x) + + x = (y ^ sz) - sz; // get the sign back + qcoeff_ptr[rc] = x; // write to destination + dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul; // dequantized value + + if (y) { + eob = idx_arr[i]; // last nonzero coeffs + zero_run = -1; // set zero_run + } + } + } + } + *eob_ptr = eob + 1; +} +#if 0 +// Original quantize function +static void quantize(int16_t *zbin_boost_orig_ptr, + int16_t *coeff_ptr, int n_coeffs, int skip_block, + int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr, + uint8_t *quant_shift_ptr, + int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr, + int16_t *dequant_ptr, int zbin_oq_value, + uint16_t *eob_ptr, + const int *scan, int mul) { + int i, rc, eob; int zbin; int x, y, z, sz; int zero_run = 0; @@ -74,6 +221,7 @@ static void quantize(int16_t *zbin_boost_orig_ptr, *eob_ptr = eob + 1; } +#endif void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs, TX_TYPE tx_type) { @@ -97,19 +245,40 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs, break; } - quantize(mb->plane[plane].zrun_zbin_boost, - BLOCK_OFFSET(mb->plane[plane].coeff, block, 16), - n_coeffs, mb->skip_block, - mb->plane[plane].zbin, - mb->plane[plane].round, - mb->plane[plane].quant, - mb->plane[plane].quant_shift, - BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16), - BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), - xd->plane[plane].dequant, - mb->plane[plane].zbin_extra, - &xd->plane[plane].eobs[block], - scan, mul); + // Call different quantization for different transform size. + if (n_coeffs >= 1024) { + // Save index of picked coefficient in pre-scan pass. + int idx_arr[1024]; + + quantize_sparse(mb->plane[plane].zrun_zbin_boost, + BLOCK_OFFSET(mb->plane[plane].coeff, block, 16), + n_coeffs, mb->skip_block, + mb->plane[plane].zbin, + mb->plane[plane].round, + mb->plane[plane].quant, + mb->plane[plane].quant_shift, + BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16), + BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), + xd->plane[plane].dequant, + mb->plane[plane].zbin_extra, + &xd->plane[plane].eobs[block], + scan, mul, idx_arr); + } + else { + quantize(mb->plane[plane].zrun_zbin_boost, + BLOCK_OFFSET(mb->plane[plane].coeff, block, 16), + n_coeffs, mb->skip_block, + mb->plane[plane].zbin, + mb->plane[plane].round, + mb->plane[plane].quant, + mb->plane[plane].quant_shift, + BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16), + BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), + xd->plane[plane].dequant, + mb->plane[plane].zbin_extra, + &xd->plane[plane].eobs[block], + scan, mul); + } } void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type, diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 9cb7ab0e1..a48e7dbb3 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -274,12 +274,14 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) { } } -int vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, int block_size) { - int i, error = 0; +int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, + intptr_t block_size) { + int i; + int64_t error = 0; for (i = 0; i < block_size; i++) { int this_diff = coeff[i] - dqcoeff[i]; - error += this_diff * this_diff; + error += (unsigned)this_diff * this_diff; } return error; @@ -417,7 +419,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, int (*r)[2], int *rate, - int *d, int *distortion, + int64_t *d, int64_t *distortion, int *s, int *skip, int64_t txfm_cache[NB_TXFM_MODES], TX_SIZE max_txfm_size) { @@ -496,27 +498,15 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x, rd[TX_4X4][1] : rd[TX_8X8][1]; } -static int block_error(int16_t *coeff, int16_t *dqcoeff, - int block_size, int shift) { - int i; - int64_t error = 0; - - for (i = 0; i < block_size; i++) { - int this_diff = coeff[i] - dqcoeff[i]; - error += (unsigned)this_diff * this_diff; - } - error >>= shift; - - return error > INT_MAX ? INT_MAX : (int)error; -} - -static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) { +static int64_t block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, + int shift) { const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); - return block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff, - 16 << (bwl + bhl), shift); + return vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff, + 16 << (bwl + bhl)) >> shift; } -static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) { +static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, + int shift) { const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize); int64_t sum = 0; int plane; @@ -524,11 +514,10 @@ static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) { for (plane = 1; plane < MAX_MB_PLANE; plane++) { const int subsampling = x->e_mbd.plane[plane].subsampling_x + x->e_mbd.plane[plane].subsampling_y; - sum += block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff, - 16 << (bwl + bhl - subsampling), 0); + sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff, + 16 << (bwl + bhl - subsampling)); } - sum >>= shift; - return sum > INT_MAX ? INT_MAX : (int)sum; + return sum >> shift; } struct rdcost_block_args { @@ -586,7 +575,8 @@ static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x, } static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, - int *rate, int *distortion, int *skippable, + int *rate, int64_t *distortion, + int *skippable, BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) { MACROBLOCKD *const xd = &x->e_mbd; xd->mode_info_context->mbmi.txfm_size = tx_size; @@ -602,11 +592,12 @@ static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, } static void super_block_yrd(VP9_COMP *cpi, - MACROBLOCK *x, int *rate, int *distortion, + MACROBLOCK *x, int *rate, int64_t *distortion, int *skip, BLOCK_SIZE_TYPE bs, int64_t txfm_cache[NB_TXFM_MODES]) { VP9_COMMON *const cm = &cpi->common; - int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB]; + int r[TX_SIZE_MAX_SB][2], s[TX_SIZE_MAX_SB]; + int64_t d[TX_SIZE_MAX_SB]; MACROBLOCKD *xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; @@ -614,7 +605,7 @@ static void super_block_yrd(VP9_COMP *cpi, if (mbmi->ref_frame[0] > INTRA_FRAME) vp9_subtract_sby(x, bs); - if (cpi->speed > 4) { + if (cpi->sf.use_largest_txform) { if (bs >= BLOCK_SIZE_SB32X32) { mbmi->txfm_size = TX_32X32; } else if (bs >= BLOCK_SIZE_MB16X16) { @@ -651,13 +642,13 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, int *bmode_costs, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l, int *bestrate, int *bestratey, - int *bestdistortion, + int64_t *bestdistortion, BLOCK_SIZE_TYPE bsize) { MB_PREDICTION_MODE mode; MACROBLOCKD *xd = &x->e_mbd; int64_t best_rd = INT64_MAX; int rate = 0; - int distortion; + int64_t distortion; VP9_COMMON *const cm = &cpi->common; const int src_stride = x->plane[0].src.stride; uint8_t *src, *dst; @@ -777,7 +768,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib, static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, int *Rate, int *rate_y, - int *Distortion, int64_t best_rd) { + int64_t *Distortion, int64_t best_rd) { int i, j; MACROBLOCKD *const xd = &mb->e_mbd; BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type; @@ -785,7 +776,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, int bh = 1 << b_height_log2(bsize); int idx, idy; int cost = 0; - int distortion = 0; + int64_t distortion = 0; int tot_rate_y = 0; int64_t total_rd = 0; ENTROPY_CONTEXT t_above[4], t_left[4]; @@ -802,7 +793,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, const int mis = xd->mode_info_stride; MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode); int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry); - int UNINITIALIZED_IS_SAFE(d); + int64_t UNINITIALIZED_IS_SAFE(d); i = idy * 2 + idx; if (xd->frame_type == KEY_FRAME) { @@ -844,14 +835,14 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, - int *distortion, int *skippable, + int64_t *distortion, int *skippable, BLOCK_SIZE_TYPE bsize, int64_t txfm_cache[NB_TXFM_MODES]) { MB_PREDICTION_MODE mode; MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); MACROBLOCKD *const xd = &x->e_mbd; - int this_rate, this_rate_tokenonly; - int this_distortion, s; + int this_rate, this_rate_tokenonly, s; + int64_t this_distortion; int64_t best_rd = INT64_MAX, this_rd; TX_SIZE UNINITIALIZED_IS_SAFE(best_tx); int i; @@ -912,7 +903,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x, } static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, - int *rate, int *distortion, + int *rate, int64_t *distortion, int *skippable, BLOCK_SIZE_TYPE bsize, TX_SIZE uv_tx_size) { MACROBLOCKD *const xd = &x->e_mbd; @@ -927,7 +918,7 @@ static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x, } static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, - int *rate, int *distortion, int *skippable, + int *rate, int64_t *distortion, int *skippable, BLOCK_SIZE_TYPE bsize) { MACROBLOCKD *const xd = &x->e_mbd; MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; @@ -952,13 +943,13 @@ static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x, static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, - int *distortion, int *skippable, + int64_t *distortion, int *skippable, BLOCK_SIZE_TYPE bsize) { MB_PREDICTION_MODE mode; MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected); int64_t best_rd = INT64_MAX, this_rd; - int this_rate_tokenonly, this_rate; - int this_distortion, s; + int this_rate_tokenonly, this_rate, s; + int64_t this_distortion; for (mode = DC_PRED; mode <= TM_PRED; mode++) { x->e_mbd.mode_info_context->mbmi.uv_mode = mode; @@ -1101,7 +1092,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm, MACROBLOCK *x, int i, int *labelyrate, - int *distortion, + int64_t *distortion, ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl) { int k; @@ -1126,7 +1117,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm, raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i, xd->plane[0].dst.buf, xd->plane[0].dst.stride); - int thisdistortion = 0; + int64_t thisdistortion = 0; int thisrate = 0; *labelyrate = 0; @@ -1189,7 +1180,7 @@ typedef struct { int64_t segment_rd; int r; - int d; + int64_t d; int segment_yrate; MB_PREDICTION_MODE modes[4]; int_mv mvs[4], second_mvs[4]; @@ -1281,21 +1272,18 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi, int_mv seg_mvs[4][MAX_REF_FRAMES], int mi_row, int mi_col) { - int i, j; - int br = 0, bd = 0; + int i, j, br = 0, rate = 0, sbr = 0, idx, idy; + int64_t bd = 0, sbd = 0; MB_PREDICTION_MODE this_mode; MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi; const int label_count = 4; int64_t this_segment_rd = 0, other_segment_rd; int label_mv_thresh; - int rate = 0; - int sbr = 0, sbd = 0; int segmentyrate = 0; int best_eobs[4] = { 0 }; BLOCK_SIZE_TYPE bsize = mbmi->sb_type; int bwl = b_width_log2(bsize), bw = 1 << bwl; int bhl = b_height_log2(bsize), bh = 1 << bhl; - int idx, idy; vp9_variance_fn_ptr_t *v_fn_ptr; ENTROPY_CONTEXT t_above[4], t_left[4]; ENTROPY_CONTEXT t_above_b[4], t_left_b[4]; @@ -1340,7 +1328,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x, // search for the best motion vector on this segment for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) { int64_t this_rd; - int distortion; + int64_t distortion; int labelyrate; ENTROPY_CONTEXT t_above_s[4], t_left_s[4]; const struct buf_2d orig_src = x->plane[0].src; @@ -1527,7 +1515,7 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, int64_t best_rd, int *returntotrate, int *returnyrate, - int *returndistortion, + int64_t *returndistortion, int *skippable, int mvthresh, int_mv seg_mvs[4][MAX_REF_FRAMES], int mi_row, int mi_col) { @@ -1800,18 +1788,133 @@ static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) { return scaled_ref_frame; } -static void model_rd_from_var_lapndz(int var, int n, int qstep, - int *rate, int *dist) { - // This function models the rate and distortion for a Laplacian +static double linear_interpolate(double x, int ntab, double step, + const double *tab) { + double y = x / step; + int d = (int) y; + double a = y - d; + if (d >= ntab - 1) + return tab[ntab - 1]; + else + return tab[d] * (1 - a) + tab[d + 1] * a; +} + +static double model_rate_norm(double x) { + // Normalized rate + // This function models the rate for a Laplacian source // source with given variance when quantized with a uniform quantizer // with given stepsize. The closed form expressions are in: // Hang and Chen, "Source Model for transform video coder and its // application - Part I: Fundamental Theory", IEEE Trans. Circ. // Sys. for Video Tech., April 1997. - // The function is implemented as piecewise approximation to the - // exact computation. - // TODO(debargha): Implement the functions by interpolating from a - // look-up table + static const double rate_tab_step = 0.125; + static const double rate_tab[] = { + 256.0000, 4.944453, 3.949276, 3.371593, + 2.965771, 2.654550, 2.403348, 2.193612, + 2.014208, 1.857921, 1.719813, 1.596364, + 1.484979, 1.383702, 1.291025, 1.205767, + 1.126990, 1.053937, 0.985991, 0.922644, + 0.863472, 0.808114, 0.756265, 0.707661, + 0.662070, 0.619287, 0.579129, 0.541431, + 0.506043, 0.472828, 0.441656, 0.412411, + 0.384980, 0.359260, 0.335152, 0.312563, + 0.291407, 0.271600, 0.253064, 0.235723, + 0.219508, 0.204351, 0.190189, 0.176961, + 0.164611, 0.153083, 0.142329, 0.132298, + 0.122945, 0.114228, 0.106106, 0.098541, + 0.091496, 0.084937, 0.078833, 0.073154, + 0.067872, 0.062959, 0.058392, 0.054147, + 0.050202, 0.046537, 0.043133, 0.039971, + 0.037036, 0.034312, 0.031783, 0.029436, + 0.027259, 0.025240, 0.023367, 0.021631, + 0.020021, 0.018528, 0.017145, 0.015863, + 0.014676, 0.013575, 0.012556, 0.011612, + 0.010738, 0.009929, 0.009180, 0.008487, + 0.007845, 0.007251, 0.006701, 0.006193, + 0.005722, 0.005287, 0.004884, 0.004512, + 0.004168, 0.003850, 0.003556, 0.003284, + 0.003032, 0.002800, 0.002585, 0.002386, + 0.002203, 0.002034, 0.001877, 0.001732, + 0.001599, 0.001476, 0.001362, 0.001256, + 0.001159, 0.001069, 0.000987, 0.000910, + 0.000840, 0.000774, 0.000714, 0.000659, + 0.000608, 0.000560, 0.000517, 0.000476, + 0.000439, 0.000405, 0.000373, 0.000344, + 0.000317, 0.000292, 0.000270, 0.000248, + 0.000229, 0.000211, 0.000195, 0.000179, + 0.000165, 0.000152, 0.000140, 0.000129, + 0.000119, 0.000110, 0.000101, 0.000093, + 0.000086, 0.000079, 0.000073, 0.000067, + 0.000062, 0.000057, 0.000052, 0.000048, + 0.000044, 0.000041, 0.000038, 0.000035, + 0.000032, 0.000029, 0.000027, 0.000025, + 0.000023, 0.000021, 0.000019, 0.000018, + 0.000016, 0.000015, 0.000014, 0.000013, + 0.000012, 0.000011, 0.000010, 0.000009, + 0.000008, 0.000008, 0.000007, 0.000007, + 0.000006, 0.000006, 0.000005, 0.000005, + 0.000004, 0.000004, 0.000004, 0.000003, + 0.000003, 0.000003, 0.000003, 0.000002, + 0.000002, 0.000002, 0.000002, 0.000002, + 0.000002, 0.000001, 0.000001, 0.000001, + 0.000001, 0.000001, 0.000001, 0.000001, + 0.000001, 0.000001, 0.000001, 0.000001, + 0.000001, 0.000001, 0.000000, 0.000000, + }; + const int rate_tab_num = sizeof(rate_tab)/sizeof(rate_tab[0]); + assert(x >= 0.0); + return linear_interpolate(x, rate_tab_num, rate_tab_step, rate_tab); +} + +static double model_dist_norm(double x) { + // Normalized distortion + // This function models the normalized distortion for a Laplacian source + // source with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expression is: + // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2)) + // where x = qpstep / sqrt(variance) + // Note the actual distortion is Dn * variance. + static const double dist_tab_step = 0.25; + static const double dist_tab[] = { + 0.000000, 0.005189, 0.020533, 0.045381, + 0.078716, 0.119246, 0.165508, 0.215979, + 0.269166, 0.323686, 0.378318, 0.432034, + 0.484006, 0.533607, 0.580389, 0.624063, + 0.664475, 0.701581, 0.735418, 0.766092, + 0.793751, 0.818575, 0.840761, 0.860515, + 0.878045, 0.893554, 0.907238, 0.919281, + 0.929857, 0.939124, 0.947229, 0.954306, + 0.960475, 0.965845, 0.970512, 0.974563, + 0.978076, 0.981118, 0.983750, 0.986024, + 0.987989, 0.989683, 0.991144, 0.992402, + 0.993485, 0.994417, 0.995218, 0.995905, + 0.996496, 0.997002, 0.997437, 0.997809, + 0.998128, 0.998401, 0.998635, 0.998835, + 0.999006, 0.999152, 0.999277, 0.999384, + 0.999475, 0.999553, 0.999619, 0.999676, + 0.999724, 0.999765, 0.999800, 0.999830, + 0.999855, 0.999877, 0.999895, 0.999911, + 0.999924, 0.999936, 0.999945, 0.999954, + 0.999961, 0.999967, 0.999972, 0.999976, + 0.999980, 0.999983, 0.999985, 0.999988, + 0.999989, 0.999991, 0.999992, 0.999994, + 0.999995, 0.999995, 0.999996, 0.999997, + 0.999997, 0.999998, 0.999998, 0.999998, + 0.999999, 0.999999, 0.999999, 0.999999, + 0.999999, 0.999999, 0.999999, 1.000000, + }; + const int dist_tab_num = sizeof(dist_tab)/sizeof(dist_tab[0]); + assert(x >= 0.0); + return linear_interpolate(x, dist_tab_num, dist_tab_step, dist_tab); +} + +static void model_rd_from_var_lapndz(int var, int n, int qstep, + int *rate, int64_t *dist) { + // This function models the rate and distortion for a Laplacian + // source with given variance when quantized with a uniform quantizer + // with given stepsize. The closed form expression is: + // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)], + // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance) vp9_clear_system_state(); if (var == 0 || n == 0) { *rate = 0; @@ -1819,29 +1922,18 @@ static void model_rd_from_var_lapndz(int var, int n, int qstep, } else { double D, R; double s2 = (double) var / n; - double s = sqrt(s2); - double x = qstep / s; - if (x > 1.0) { - double y = exp(-x / 2); - double y2 = y * y; - D = 2.069981728764738 * y2 - 2.764286806516079 * y + 1.003956960819275; - R = 0.924056758535089 * y2 + 2.738636469814024 * y - 0.005169662030017; - } else { - double x2 = x * x; - D = 0.075303187668830 * x2 + 0.004296954321112 * x - 0.000413209252807; - if (x > 0.125) - R = 1 / (-0.03459733614226 * x2 + 0.36561675733603 * x + - 0.1626989668625); - else - R = -1.442252874826093 * log(x) + 1.944647760719664; - } + double x = qstep / sqrt(s2); + // TODO(debargha): Make the modeling functions take (qstep^2 / s2) + // as argument rather than qstep / sqrt(s2) to obviate the need for + // the sqrt() operation. + D = model_dist_norm(x); + R = model_rate_norm(x); if (R < 0) { - *rate = 0; - *dist = var; - } else { - *rate = (n * R * 256 + 0.5); - *dist = (n * D * s2 + 0.5); + R = 0; + D = var; } + *rate = (n * R * 256 + 0.5); + *dist = (n * D * s2 + 0.5); } vp9_clear_system_state(); } @@ -1854,12 +1946,13 @@ static enum BlockSize get_plane_block_size(BLOCK_SIZE_TYPE bsize, static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, MACROBLOCK *x, MACROBLOCKD *xd, - int *out_rate_sum, int *out_dist_sum) { + int *out_rate_sum, int64_t *out_dist_sum) { // Note our transform coeffs are 8 times an orthogonal transform. // Hence quantizer step is also 8 times. To get effective quantizer // we need to divide by 8 before sending to modeling function. - unsigned int sse, var; - int i, rate_sum = 0, dist_sum = 0; + unsigned int sse; + int i, rate_sum = 0; + int64_t dist_sum = 0; for (i = 0; i < MAX_MB_PLANE; ++i) { struct macroblock_plane *const p = &x->plane[i]; @@ -1869,17 +1962,18 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize, const int bw = plane_block_width(bsize, pd); const int bh = plane_block_height(bsize, pd); const enum BlockSize bs = get_block_size(bw, bh); - int rate, dist; - var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, - pd->dst.buf, pd->dst.stride, &sse); - model_rd_from_var_lapndz(var, bw * bh, pd->dequant[1] >> 3, &rate, &dist); + int rate; + int64_t dist; + cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride, + pd->dst.buf, pd->dst.stride, &sse); + model_rd_from_var_lapndz(sse, bw * bh, pd->dequant[1] >> 3, &rate, &dist); rate_sum += rate; dist_sum += dist; } *out_rate_sum = rate_sum; - *out_dist_sum = dist_sum; + *out_dist_sum = dist_sum << 4; } static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) { @@ -2134,9 +2228,10 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x, static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int64_t txfm_cache[], - int *rate2, int *distortion, int *skippable, - int *rate_y, int *distortion_y, - int *rate_uv, int *distortion_uv, + int *rate2, int64_t *distortion, + int *skippable, + int *rate_y, int64_t *distortion_y, + int *rate_uv, int64_t *distortion_uv, int *mode_excluded, int *disable_skip, INTERPOLATIONFILTERTYPE *best_filter, int_mv *frame_mv, @@ -2236,11 +2331,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, (mbmi->mv[1].as_mv.col & 15) == 0; // Search for best switchable filter by checking the variance of // pred error irrespective of whether the filter will be used - if (cpi->speed > 4) { + if (cpi->sf.use_8tap_always) { *best_filter = EIGHTTAP; } else { int i, newbest; - int tmp_rate_sum = 0, tmp_dist_sum = 0; + int tmp_rate_sum = 0; + int64_t tmp_dist_sum = 0; for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) { int rs = 0; const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i]; @@ -2255,7 +2351,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, if (interpolating_intpel_seen && is_intpel_interp) { rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_sum, tmp_dist_sum); } else { - int rate_sum = 0, dist_sum = 0; + int rate_sum = 0; + int64_t dist_sum = 0; vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize); model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum); rd = RDCOST(x->rdmult, x->rddiv, rs + rate_sum, dist_sum); @@ -2399,19 +2496,20 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, - int *returnrate, int *returndist, + int *returnrate, int64_t *returndist, BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx) { VP9_COMMON *cm = &cpi->common; MACROBLOCKD *xd = &x->e_mbd; - int rate_y = 0, rate_uv; - int rate_y_tokenonly = 0, rate_uv_tokenonly; - int dist_y = 0, dist_uv; - int y_skip = 0, uv_skip; + int rate_y = 0, rate_uv = 0; + int rate_y_tokenonly = 0, rate_uv_tokenonly = 0; + int64_t dist_y = 0, dist_uv = 0; + int y_skip = 0, uv_skip = 0; int64_t txfm_cache[NB_TXFM_MODES], err; MB_PREDICTION_MODE mode; TX_SIZE txfm_size; - int rate4x4_y, rate4x4_y_tokenonly, dist4x4_y; + int rate4x4_y, rate4x4_y_tokenonly; + int64_t dist4x4_y; int64_t err4x4 = INT64_MAX; int i; @@ -2462,7 +2560,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, int *returnrate, - int *returndistortion, + int64_t *returndistortion, BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx) { VP9_COMMON *cm = &cpi->common; @@ -2497,7 +2595,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE; INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE; int rate_uv_intra[TX_SIZE_MAX_SB], rate_uv_tokenonly[TX_SIZE_MAX_SB]; - int dist_uv[TX_SIZE_MAX_SB], skip_uv[TX_SIZE_MAX_SB]; + int64_t dist_uv[TX_SIZE_MAX_SB]; + int skip_uv[TX_SIZE_MAX_SB]; MB_PREDICTION_MODE mode_uv[TX_SIZE_MAX_SB]; struct scale_factors scale_factor[4]; unsigned int ref_frame_mask = 0; @@ -2536,7 +2635,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, best_txfm_rd[i] = INT64_MAX; // Create a mask set to 1 for each frame used by a smaller resolution. - if (cpi->speed > 0) { + if (cpi->sf.use_avoid_tested_higherror) { switch (block_size) { case BLOCK_64X64: for (i = 0; i < 4; i++) { @@ -2576,8 +2675,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; } - if (cpi->speed == 0 - || (cpi->speed > 0 && (ref_frame_mask & (1 << INTRA_FRAME)))) { + if (!cpi->sf.use_avoid_tested_higherror + || (cpi->sf.use_avoid_tested_higherror + && (ref_frame_mask & (1 << INTRA_FRAME)))) { mbmi->mode = DC_PRED; mbmi->ref_frame[0] = INTRA_FRAME; for (i = 0; i <= (bsize < BLOCK_SIZE_MB16X16 ? TX_4X4 : @@ -2599,7 +2699,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int disable_skip = 0; int compmode_cost = 0; int rate2 = 0, rate_y = 0, rate_uv = 0; - int distortion2 = 0, distortion_y = 0, distortion_uv = 0; + int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0; int skippable; int64_t txfm_cache[NB_TXFM_MODES]; int i; @@ -2623,7 +2723,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, this_mode = vp9_mode_order[mode_index].mode; ref_frame = vp9_mode_order[mode_index].ref_frame; - if (cpi->speed > 0 && bsize >= BLOCK_SIZE_SB8X8) { + if (cpi->sf.use_avoid_tested_higherror && bsize >= BLOCK_SIZE_SB8X8) { if (!(ref_frame_mask & (1 << ref_frame))) { continue; } @@ -2786,11 +2886,13 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, distortion2 = distortion_y + distortion_uv; } else if (this_mode == SPLITMV) { const int is_comp_pred = mbmi->ref_frame[1] > 0; - int rate, distortion; + int rate; + int64_t distortion; int64_t this_rd_thresh; int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX; int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX; - int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0; + int64_t tmp_best_distortion = INT_MAX; + int tmp_best_skippable = 0; int switchable_filter_index; int_mv *second_ref = is_comp_pred ? &mbmi->ref_mvs[mbmi->ref_frame[1]][0] : NULL; diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h index dcf5d00e9..67ef73db7 100644 --- a/vp9/encoder/vp9_rdopt.h +++ b/vp9/encoder/vp9_rdopt.h @@ -20,12 +20,12 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex); void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex); void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, - int *r, int *d, BLOCK_SIZE_TYPE bsize, + int *r, int64_t *d, BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx); int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, - int *r, int *d, BLOCK_SIZE_TYPE bsize, + int *r, int64_t *d, BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx); void vp9_init_me_luts(); diff --git a/vp9/encoder/x86/vp9_encodeopt.asm b/vp9/encoder/x86/vp9_encodeopt.asm deleted file mode 100644 index 734cb61ca..000000000 --- a/vp9/encoder/x86/vp9_encodeopt.asm +++ /dev/null @@ -1,125 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;int vp9_block_error_xmm(short *coeff_ptr, short *dcoef_ptr) -global sym(vp9_block_error_xmm) PRIVATE -sym(vp9_block_error_xmm): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prologue - - mov rsi, arg(0) ;coeff_ptr - mov rdi, arg(1) ;dcoef_ptr - - movdqa xmm0, [rsi] - movdqa xmm1, [rdi] - - movdqa xmm2, [rsi+16] - movdqa xmm3, [rdi+16] - - psubw xmm0, xmm1 - psubw xmm2, xmm3 - - pmaddwd xmm0, xmm0 - pmaddwd xmm2, xmm2 - - paddd xmm0, xmm2 - - pxor xmm5, xmm5 - movdqa xmm1, xmm0 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - psrldq xmm0, 8 - paddd xmm0, xmm1 - - movq rax, xmm0 - - pop rdi - pop rsi - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - -;int vp9_block_error_mmx(short *coeff_ptr, short *dcoef_ptr) -global sym(vp9_block_error_mmx) PRIVATE -sym(vp9_block_error_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 2 - push rsi - push rdi - ; end prolog - - - mov rsi, arg(0) ;coeff_ptr - pxor mm7, mm7 - - mov rdi, arg(1) ;dcoef_ptr - movq mm3, [rsi] - - movq mm4, [rdi] - movq mm5, [rsi+8] - - movq mm6, [rdi+8] - pxor mm1, mm1 ; from movd mm1, dc ; dc =0 - - movq mm2, mm7 - psubw mm5, mm6 - - por mm1, mm2 - pmaddwd mm5, mm5 - - pcmpeqw mm1, mm7 - psubw mm3, mm4 - - pand mm1, mm3 - pmaddwd mm1, mm1 - - paddd mm1, mm5 - movq mm3, [rsi+16] - - movq mm4, [rdi+16] - movq mm5, [rsi+24] - - movq mm6, [rdi+24] - psubw mm5, mm6 - - pmaddwd mm5, mm5 - psubw mm3, mm4 - - pmaddwd mm3, mm3 - paddd mm3, mm5 - - paddd mm1, mm3 - movq mm0, mm1 - - psrlq mm1, 32 - paddd mm0, mm1 - - movq rax, mm0 - - pop rdi - pop rsi - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret diff --git a/vp9/encoder/x86/vp9_error_sse2.asm b/vp9/encoder/x86/vp9_error_sse2.asm new file mode 100644 index 000000000..bb1ea71b9 --- /dev/null +++ b/vp9/encoder/x86/vp9_error_sse2.asm @@ -0,0 +1,57 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; void vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size) + +INIT_XMM sse2 +cglobal block_error, 3, 3, 6, uqc, dqc, size + pxor m4, m4 ; accumulator + pxor m5, m5 ; dedicated zero register + lea uqcq, [uqcq+sizeq*2] + lea dqcq, [dqcq+sizeq*2] + neg sizeq +.loop: + mova m0, [uqcq+sizeq*2] + mova m2, [dqcq+sizeq*2] + mova m1, [uqcq+sizeq*2+mmsize] + mova m3, [dqcq+sizeq*2+mmsize] + psubw m0, m2 + psubw m1, m3 + ; individual errors are max. 15bit+sign, so squares are 30bit, and + ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit) + pmaddwd m0, m0 + pmaddwd m1, m1 + ; accumulate in 64bit + punpckldq m2, m0, m5 + punpckhdq m0, m5 + punpckldq m3, m1, m5 + punpckhdq m1, m5 + paddq m4, m2 + paddq m4, m0 + paddq m4, m3 + paddq m4, m1 + add sizeq, mmsize + jl .loop + + ; accumulate horizontally and store in return value + movhlps m5, m4 + paddq m4, m5 +%if ARCH_X86_64 + movq rax, m4 +%else + pshufd m5, m4, 0x1 + movd eax, m4 + movd edx, m5 +%endif + RET diff --git a/vp9/encoder/x86/vp9_subpel_variance.asm b/vp9/encoder/x86/vp9_subpel_variance.asm new file mode 100644 index 000000000..19e2feb57 --- /dev/null +++ b/vp9/encoder/x86/vp9_subpel_variance.asm @@ -0,0 +1,1288 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "third_party/x86inc/x86inc.asm" + +SECTION_RODATA +pw_8: times 8 dw 8 +bilin_filter_m_sse2: times 8 dw 16 + times 8 dw 0 + times 8 dw 15 + times 8 dw 1 + times 8 dw 14 + times 8 dw 2 + times 8 dw 13 + times 8 dw 3 + times 8 dw 12 + times 8 dw 4 + times 8 dw 11 + times 8 dw 5 + times 8 dw 10 + times 8 dw 6 + times 8 dw 9 + times 8 dw 7 + times 16 dw 8 + times 8 dw 7 + times 8 dw 9 + times 8 dw 6 + times 8 dw 10 + times 8 dw 5 + times 8 dw 11 + times 8 dw 4 + times 8 dw 12 + times 8 dw 3 + times 8 dw 13 + times 8 dw 2 + times 8 dw 14 + times 8 dw 1 + times 8 dw 15 + +bilin_filter_m_ssse3: times 8 db 16, 0 + times 8 db 15, 1 + times 8 db 14, 2 + times 8 db 13, 3 + times 8 db 12, 4 + times 8 db 11, 5 + times 8 db 10, 6 + times 8 db 9, 7 + times 16 db 8 + times 8 db 7, 9 + times 8 db 6, 10 + times 8 db 5, 11 + times 8 db 4, 12 + times 8 db 3, 13 + times 8 db 2, 14 + times 8 db 1, 15 + +SECTION .text + +; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride, +; int x_offset, int y_offset, +; const uint8_t *dst, ptrdiff_t dst_stride, +; int height, unsigned int *sse); +; +; This function returns the SE and stores SSE in the given pointer. + +%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse + psubw %3, %4 + psubw %1, %2 + paddw %5, %3 + pmaddwd %3, %3 + paddw %5, %1 + pmaddwd %1, %1 + paddd %6, %3 + paddd %6, %1 +%endmacro + +%macro STORE_AND_RET 0 +%if mmsize == 16 + ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit + ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg. + ; We have to sign-extend it before adding the words within the register + ; and outputing to a dword. + pcmpgtw m5, m6 ; mask for 0 > x + movhlps m3, m7 + punpcklwd m4, m6, m5 + punpckhwd m6, m5 ; sign-extend m6 word->dword + paddd m7, m3 + paddd m6, m4 + pshufd m3, m7, 0x1 + movhlps m4, m6 + paddd m7, m3 + paddd m6, m4 + mov r1, ssem ; r1 = unsigned int *sse + pshufd m4, m6, 0x1 + movd [r1], m7 ; store sse + paddd m6, m4 + movd rax, m6 ; store sum as return value +%else ; mmsize == 8 + pshufw m4, m6, 0xe + pshufw m3, m7, 0xe + paddw m6, m4 + paddd m7, m3 + pcmpgtw m5, m6 ; mask for 0 > x + mov r1, ssem ; r1 = unsigned int *sse + punpcklwd m6, m5 ; sign-extend m6 word->dword + movd [r1], m7 ; store sse + pshufw m4, m6, 0xe + paddd m6, m4 + movd rax, m6 ; store sum as return value +%endif + RET +%endmacro + +%macro SUBPEL_VARIANCE 1-2 0 ; W +%if cpuflag(ssse3) +%define bilin_filter_m bilin_filter_m_ssse3 +%define filter_idx_shift 4 +%else +%define bilin_filter_m bilin_filter_m_sse2 +%define filter_idx_shift 5 +%endif +; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses +; 11, not 13, if the registers are ordered correctly. May make a minor speed +; difference on Win64 +%ifdef PIC +%if %2 == 1 ; avg +cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, height, sse +%define sec_str sec_strideq +%else +cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \ + dst, dst_stride, height, sse +%endif +%define h heightd +%define bilin_filter sseq +%else +%if %2 == 1 ; avg +cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \ + 7 + 2 * ARCH_X86_64, 13, src, src_stride, \ + x_offset, y_offset, \ + dst, dst_stride, \ + sec, sec_stride, \ + height, sse +%if ARCH_X86_64 +%define h heightd +%define sec_str sec_strideq +%else +%define h dword heightm +%define sec_str sec_stridemp +%endif +%else +cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \ + dst, dst_stride, height, sse +%define h heightd +%endif +%define bilin_filter bilin_filter_m +%endif + ASSERT %1 <= 16 ; m6 overflows if w > 16 + pxor m6, m6 ; sum + pxor m7, m7 ; sse + ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we + ; could perhaps use it for something more productive then + pxor m5, m5 ; dedicated zero register +%if %1 < 16 + sar h, 1 +%if %2 == 1 ; avg + shl sec_str, 1 +%endif +%endif + + ; FIXME(rbultje) replace by jumptable? + test x_offsetd, x_offsetd + jnz .x_nonzero + ; x_offset == 0 + test y_offsetd, y_offsetd + jnz .x_zero_y_nonzero + + ; x_offset == 0 && y_offset == 0 +.x_zero_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + mova m1, [dstq] +%if %2 == 1 ; avg + pavgb m0, [secq] + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%if %2 == 0 ; !avg + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] +%if %2 == 1 ; avg +%if mmsize == 16 + movhps m0, [srcq+src_strideq] +%else ; mmsize == 8 + punpckldq m0, [srcq+src_strideq] +%endif +%else ; !avg + movh m2, [srcq+src_strideq] +%endif + movh m1, [dstq] + movh m3, [dstq+dst_strideq] +%if %2 == 1 ; avg + pavgb m0, [secq] + punpcklbw m3, m5 + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; !avg + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_zero_y_zero_loop + STORE_AND_RET + +.x_zero_y_nonzero: + cmp y_offsetd, 8 + jne .x_zero_y_nonhalf + + ; x_offset == 0 && y_offset == 0.5 +.x_zero_y_half_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m2, [srcq+src_strideq] +%if %2 == 1 ; avg +%if mmsize == 16 + movhps m2, [srcq+src_strideq*2] +%else ; mmsize == 8 + punpckldq m2, [srcq+src_strideq*2] +%endif + movh m1, [dstq] +%if mmsize == 16 + movlhps m0, m2 +%else ; mmsize == 8 + punpckldq m0, m2 +%endif + movh m3, [dstq+dst_strideq] + pavgb m0, m2 + punpcklbw m1, m5 + pavgb m0, [secq] + punpcklbw m3, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; !avg + movh m4, [srcq+src_strideq*2] + movh m1, [dstq] + pavgb m0, m2 + movh m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_zero_y_half_loop + STORE_AND_RET + +.x_zero_y_nonhalf: + ; x_offset == 0 && y_offset == bilin interpolation +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else ; x86-32 or mmx + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +.x_zero_y_other_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+src_strideq] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can + ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of + ; instructions is the same (5), but it is 1 mul instead of 2, so might be + ; slightly faster because of pmullw latency. It would also cut our rodata + ; tables in half for this function, and save 1-2 registers on x86-64. + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m4, filter_y_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m2, [srcq+src_strideq] + movh m4, [srcq+src_strideq*2] + movh m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movh m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m4, filter_y_b + paddw m0, m1 + paddw m2, filter_rnd + movh m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_zero_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonzero: + cmp x_offsetd, 8 + jne .x_nonhalf + ; x_offset == 0.5 + test y_offsetd, y_offsetd + jnz .x_half_y_nonzero + + ; x_offset == 0.5 && y_offset == 0 +.x_half_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] + pavgb m0, m4 + punpckhbw m3, m1, m5 +%if %2 == 1 ; avg + pavgb m0, [secq] +%endif + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m4, [srcq+1] +%if %2 == 1 ; avg +%if mmsize == 16 + movhps m0, [srcq+src_strideq] + movhps m4, [srcq+src_strideq+1] +%else ; mmsize == 8 + punpckldq m0, [srcq+src_strideq] + punpckldq m4, [srcq+src_strideq+1] +%endif + movh m1, [dstq] + movh m3, [dstq+dst_strideq] + pavgb m0, m4 + punpcklbw m3, m5 + pavgb m0, [secq] + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; !avg + movh m2, [srcq+src_strideq] + movh m1, [dstq] + pavgb m0, m4 + movh m4, [srcq+src_strideq+1] + movh m3, [dstq+dst_strideq] + pavgb m2, m4 + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_half_y_zero_loop + STORE_AND_RET + +.x_half_y_nonzero: + cmp y_offsetd, 8 + jne .x_half_y_nonhalf + + ; x_offset == 0.5 && y_offset == 0.5 +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + pavgb m4, m3 + punpckhbw m3, m1, m5 + pavgb m0, m4 +%if %2 == 1 ; avg + punpcklbw m1, m5 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_half_loop: + movh m2, [srcq] + movh m3, [srcq+1] +%if %2 == 1 ; avg +%if mmsize == 16 + movhps m2, [srcq+src_strideq] + movhps m3, [srcq+src_strideq+1] +%else + punpckldq m2, [srcq+src_strideq] + punpckldq m3, [srcq+src_strideq+1] +%endif + pavgb m2, m3 +%if mmsize == 16 + movlhps m0, m2 + movhlps m4, m2 +%else ; mmsize == 8 + punpckldq m0, m2 + pshufw m4, m2, 0xe +%endif + movh m1, [dstq] + pavgb m0, m2 + movh m3, [dstq+dst_strideq] + pavgb m0, [secq] + punpcklbw m3, m5 + punpcklbw m1, m5 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%else ; !avg + movh m4, [srcq+src_strideq] + movh m1, [srcq+src_strideq+1] + pavgb m2, m3 + pavgb m4, m1 + pavgb m0, m2 + pavgb m2, m4 + movh m1, [dstq] + movh m3, [dstq+dst_strideq] + punpcklbw m0, m5 + punpcklbw m2, m5 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_half_y_half_loop + STORE_AND_RET + +.x_half_y_nonhalf: + ; x_offset == 0.5 && y_offset == bilin interpolation +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+y_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_y_a m8 +%define filter_y_b m9 +%define filter_rnd m10 +%else + add y_offsetq, bilin_filter +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif +%if %1 == 16 + movu m0, [srcq] + movu m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +.x_half_y_other_loop: + movu m4, [srcq] + movu m2, [srcq+1] + mova m1, [dstq] + pavgb m4, m2 +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + pmullw m2, filter_y_a + pmullw m3, filter_y_b + paddw m2, filter_rnd + punpcklbw m0, m5 + paddw m2, m3 + punpcklbw m3, m4, m5 + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 +%endif + punpckhbw m3, m1, m5 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m3, [srcq+1] + add srcq, src_strideq + pavgb m0, m3 +%if notcpuflag(ssse3) + punpcklbw m0, m5 +%endif +.x_half_y_other_loop: + movh m2, [srcq] + movh m1, [srcq+1] + movh m4, [srcq+src_strideq] + movh m3, [srcq+src_strideq+1] + pavgb m2, m1 + pavgb m4, m3 + movh m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + movh m1, [dstq] + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_y_a + pmullw m1, m2, filter_y_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_y_a + paddw m0, m1 + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m2, m1 + movh m1, [dstq] +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_half_y_other_loop +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf: + test y_offsetd, y_offsetd + jnz .x_nonhalf_y_nonzero + + ; x_offset == bilin interpolation && y_offset == 0 +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [pw_8] +%endif +.x_other_y_zero_loop: +%if %1 == 16 + movu m0, [srcq] + movu m4, [srcq+1] + mova m1, [dstq] +%if cpuflag(ssse3) + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m4, m5 + punpcklbw m0, m5 + punpcklbw m4, m5 + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + pmullw m0, filter_x_a + pmullw m4, filter_x_b + paddw m0, filter_rnd + paddw m2, m3 + paddw m0, m4 +%endif + psraw m2, 4 + psraw m0, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpckhbw m3, m1, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m1, [srcq+1] + movh m2, [srcq+src_strideq] + movh m4, [srcq+src_strideq+1] + movh m3, [dstq+dst_strideq] +%if cpuflag(ssse3) + punpcklbw m0, m1 + movh m1, [dstq] + punpcklbw m2, m4 + pmaddubsw m0, filter_x_a + pmaddubsw m2, filter_x_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + punpcklbw m2, m5 + punpcklbw m4, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + punpcklbw m3, m5 + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m4, filter_x_b + paddw m0, m1 + paddw m2, filter_rnd + movh m1, [dstq] + paddw m2, m4 +%endif + psraw m0, 4 + psraw m2, 4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_other_y_zero_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonzero: + cmp y_offsetd, 8 + jne .x_nonhalf_y_nonhalf + + ; x_offset == bilin interpolation && y_offset == 0.5 +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_rnd m10 +%else + add x_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_rnd [pw_8] +%endif +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + add srcq, src_strideq + packuswb m0, m2 +.x_other_y_half_loop: + movu m4, [srcq] + movu m3, [srcq+1] +%if cpuflag(ssse3) + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + pavgb m0, m4 + punpckhbw m3, m1, m5 + punpcklbw m1, m5 +%else + punpckhbw m2, m4, m5 + punpckhbw m1, m3, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + paddw m4, m3 + paddw m2, m1 + mova m1, [dstq] + psraw m4, 4 + psraw m2, 4 + punpckhbw m3, m1, m5 + ; FIXME(rbultje) the repeated pack/unpack here around m0/m2 is because we + ; have a 1-register shortage to be able to store the backup of the bilin + ; filtered second line as words as cache for the next line. Packing into + ; a byte costs 1 pack and 2 unpacks, but saves a register. + packuswb m4, m2 + punpcklbw m1, m5 + pavgb m0, m4 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + pavgb m0, [secq] +%endif + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + add srcq, src_strideq + psraw m0, 4 +.x_other_y_half_loop: + movh m2, [srcq] + movh m1, [srcq+1] + movh m4, [srcq+src_strideq] + movh m3, [srcq+src_strideq+1] +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movh m1, [dstq] + movh m3, [dstq+dst_strideq] + paddw m2, filter_rnd + paddw m4, filter_rnd +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + movh m1, [dstq] + paddw m4, m3 + movh m3, [dstq+dst_strideq] +%endif + psraw m2, 4 + psraw m4, 4 + pavgw m0, m2 + pavgw m2, m4 +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline - also consider going to bytes here + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + punpcklbw m3, m5 + punpcklbw m1, m5 + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_other_y_half_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_rnd + STORE_AND_RET + +.x_nonhalf_y_nonhalf: +%ifdef PIC + lea bilin_filter, [bilin_filter_m] +%endif + shl x_offsetd, filter_idx_shift + shl y_offsetd, filter_idx_shift +%if ARCH_X86_64 && mmsize == 16 + mova m8, [bilin_filter+x_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m9, [bilin_filter+x_offsetq+16] +%endif + mova m10, [bilin_filter+y_offsetq] +%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64 + mova m11, [bilin_filter+y_offsetq+16] +%endif + mova m12, [pw_8] +%define filter_x_a m8 +%define filter_x_b m9 +%define filter_y_a m10 +%define filter_y_b m11 +%define filter_rnd m12 +%else + add x_offsetq, bilin_filter + add y_offsetq, bilin_filter +%define filter_x_a [x_offsetq] +%define filter_x_b [x_offsetq+16] +%define filter_y_a [y_offsetq] +%define filter_y_b [y_offsetq+16] +%define filter_rnd [pw_8] +%endif + ; x_offset == bilin interpolation && y_offset == bilin interpolation +%if %1 == 16 + movu m0, [srcq] + movu m1, [srcq+1] +%if cpuflag(ssse3) + punpckhbw m2, m0, m1 + punpcklbw m0, m1 + pmaddubsw m2, filter_x_a + pmaddubsw m0, filter_x_a + paddw m2, filter_rnd + paddw m0, filter_rnd +%else + punpckhbw m2, m0, m5 + punpckhbw m3, m1, m5 + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + pmullw m2, filter_x_a + pmullw m3, filter_x_b + paddw m2, filter_rnd + paddw m0, m1 + paddw m2, m3 +%endif + psraw m0, 4 + psraw m2, 4 + add srcq, src_strideq + packuswb m0, m2 +.x_other_y_other_loop: +%if cpuflag(ssse3) + movu m4, [srcq] + movu m3, [srcq+1] + mova m1, [dstq] + punpckhbw m2, m4, m3 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + punpckhbw m3, m1, m5 + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m4, m2 + punpckhbw m2, m0, m4 + punpcklbw m0, m4 + pmaddubsw m2, filter_y_a + pmaddubsw m0, filter_y_a + punpcklbw m1, m5 + paddw m2, filter_rnd + paddw m0, filter_rnd + psraw m2, 4 + psraw m0, 4 +%else + movu m3, [srcq] + movu m4, [srcq+1] + punpckhbw m1, m3, m5 + punpckhbw m2, m4, m5 + punpcklbw m3, m5 + punpcklbw m4, m5 + pmullw m3, filter_x_a + pmullw m4, filter_x_b + paddw m3, filter_rnd + pmullw m1, filter_x_a + pmullw m2, filter_x_b + paddw m1, filter_rnd + paddw m3, m4 + paddw m1, m2 + psraw m3, 4 + psraw m1, 4 + packuswb m4, m3, m1 + punpckhbw m2, m0, m5 + punpcklbw m0, m5 + pmullw m2, filter_y_a + pmullw m1, filter_y_b + paddw m2, filter_rnd + pmullw m0, filter_y_a + pmullw m3, filter_y_b + paddw m2, m1 + mova m1, [dstq] + paddw m0, filter_rnd + psraw m2, 4 + paddw m0, m3 + punpckhbw m3, m1, m5 + psraw m0, 4 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + add srcq, src_strideq + add dstq, dst_strideq +%else ; %1 < 16 + movh m0, [srcq] + movh m1, [srcq+1] +%if cpuflag(ssse3) + punpcklbw m0, m1 + pmaddubsw m0, filter_x_a + paddw m0, filter_rnd +%else + punpcklbw m0, m5 + punpcklbw m1, m5 + pmullw m0, filter_x_a + pmullw m1, filter_x_b + paddw m0, filter_rnd + paddw m0, m1 +%endif + psraw m0, 4 +%if cpuflag(ssse3) + packuswb m0, m0 +%endif + add srcq, src_strideq +.x_other_y_other_loop: + movh m2, [srcq] + movh m1, [srcq+1] + movh m4, [srcq+src_strideq] + movh m3, [srcq+src_strideq+1] +%if cpuflag(ssse3) + punpcklbw m2, m1 + punpcklbw m4, m3 + pmaddubsw m2, filter_x_a + pmaddubsw m4, filter_x_a + movh m3, [dstq+dst_strideq] + movh m1, [dstq] + paddw m2, filter_rnd + paddw m4, filter_rnd + psraw m2, 4 + psraw m4, 4 + packuswb m2, m2 + packuswb m4, m4 + punpcklbw m0, m2 + punpcklbw m2, m4 + pmaddubsw m0, filter_y_a + pmaddubsw m2, filter_y_a + punpcklbw m3, m5 + paddw m0, filter_rnd + paddw m2, filter_rnd + psraw m0, 4 + psraw m2, 4 + punpcklbw m1, m5 +%else + punpcklbw m2, m5 + punpcklbw m1, m5 + punpcklbw m4, m5 + punpcklbw m3, m5 + pmullw m2, filter_x_a + pmullw m1, filter_x_b + paddw m2, filter_rnd + pmullw m4, filter_x_a + pmullw m3, filter_x_b + paddw m4, filter_rnd + paddw m2, m1 + paddw m4, m3 + psraw m2, 4 + psraw m4, 4 + pmullw m0, filter_y_a + pmullw m3, m2, filter_y_b + paddw m0, filter_rnd + pmullw m2, filter_y_a + pmullw m1, m4, filter_y_b + paddw m2, filter_rnd + paddw m0, m3 + movh m3, [dstq+dst_strideq] + paddw m2, m1 + movh m1, [dstq] + psraw m0, 4 + psraw m2, 4 + punpcklbw m3, m5 + punpcklbw m1, m5 +%endif +%if %2 == 1 ; avg + ; FIXME(rbultje) pipeline + packuswb m0, m2 + pavgb m0, [secq] + punpckhbw m2, m0, m5 + punpcklbw m0, m5 +%endif + SUM_SSE m0, m1, m2, m3, m6, m7 + mova m0, m4 + + lea srcq, [srcq+src_strideq*2] + lea dstq, [dstq+dst_strideq*2] +%endif +%if %2 == 1 ; avg + add secq, sec_str +%endif + dec h + jg .x_other_y_other_loop +%undef filter_x_a +%undef filter_x_b +%undef filter_y_a +%undef filter_y_b +%undef filter_rnd + STORE_AND_RET +%endmacro + +; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8&&y=0,8) are identical +; between the ssse3 and non-ssse3 version. It may make sense to merge their +; code in the sense that the ssse3 version would jump to the appropriate +; location in the sse/2 version, rather than duplicating that code in the +; binary. + +INIT_MMX sse +SUBPEL_VARIANCE 4 +INIT_XMM sse2 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_MMX ssse3 +SUBPEL_VARIANCE 4 +INIT_XMM ssse3 +SUBPEL_VARIANCE 8 +SUBPEL_VARIANCE 16 + +INIT_MMX sse +SUBPEL_VARIANCE 4, 1 +INIT_XMM sse2 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 + +INIT_MMX ssse3 +SUBPEL_VARIANCE 4, 1 +INIT_XMM ssse3 +SUBPEL_VARIANCE 8, 1 +SUBPEL_VARIANCE 16, 1 diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm b/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm index 8a2a471f5..2ecc23e55 100644 --- a/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm +++ b/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm @@ -8,292 +8,8 @@ ; be found in the AUTHORS file in the root of the source tree. ; - %include "vpx_ports/x86_abi_support.asm" -%define xmm_filter_shift 7 - -;void vp9_filter_block2d_bil_var_sse2 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int xoffset, -; int yoffset, -; int *sum, -; unsigned int *sumsquared;; -; -;) -global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE -sym(vp9_filter_block2d_bil_var_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - push rbx - ; end prolog - - pxor xmm6, xmm6 ; - pxor xmm7, xmm7 ; - - lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding - movdqa xmm4, XMMWORD PTR [rsi] - - lea rcx, [GLOBAL(bilinear_filters_sse2)] - movsxd rax, dword ptr arg(5) ; xoffset - - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je filter_block2d_bil_var_sse2_sp_only - - shl rax, 5 ; point to filter coeff with xoffset - lea rax, [rax + rcx] ; HFilter - - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; skip second_pass filter if yoffset=0 - je filter_block2d_bil_var_sse2_fp_only - - shl rdx, 5 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - - pxor xmm0, xmm0 ; - movq xmm1, QWORD PTR [rsi] ; - movq xmm3, QWORD PTR [rsi+1] ; - - punpcklbw xmm1, xmm0 ; - pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 - pmullw xmm3, [rax+16] ; - - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - movdqa xmm5, xmm1 - - movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line - lea rsi, [rsi + rbx] -%if ABI_IS_32BIT=0 - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - -filter_block2d_bil_var_sse2_loop: - movq xmm1, QWORD PTR [rsi] ; - movq xmm3, QWORD PTR [rsi+1] ; - - punpcklbw xmm1, xmm0 ; - pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 ; - pmullw xmm3, [rax+16] ; - - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movdqa xmm3, xmm5 ; - movdqa xmm5, xmm1 ; - - pmullw xmm3, [rdx] ; - pmullw xmm1, [rdx+16] ; - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movq xmm3, QWORD PTR [rdi] ; - punpcklbw xmm3, xmm0 ; - - psubw xmm1, xmm3 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - - lea rsi, [rsi + rbx] ;ref_pixels_per_line -%if ABI_IS_32BIT - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 ; - jnz filter_block2d_bil_var_sse2_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_var_sse2_sp_only: - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0 - je filter_block2d_bil_var_sse2_full_pixel - - shl rdx, 5 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 ; - movq xmm1, QWORD PTR [rsi] ; - punpcklbw xmm1, xmm0 ; - - movsxd rbx, dword ptr arg(3) ;src_pixels_per_line - lea rsi, [rsi + rax] - -filter_block2d_bil_sp_only_loop: - movq xmm3, QWORD PTR [rsi] ; - punpcklbw xmm3, xmm0 ; - movdqa xmm5, xmm3 - - pmullw xmm1, [rdx] ; - pmullw xmm3, [rdx+16] ; - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movq xmm3, QWORD PTR [rdi] ; - punpcklbw xmm3, xmm0 ; - - psubw xmm1, xmm3 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - - movdqa xmm1, xmm5 ; - lea rsi, [rsi + rax] ;ref_pixels_per_line - lea rdi, [rdi + rbx] ;src_pixels_per_line - - sub rcx, 1 ; - jnz filter_block2d_bil_sp_only_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_var_sse2_full_pixel: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rbx, dword ptr arg(3) ;src_pixels_per_line - pxor xmm0, xmm0 ; - -filter_block2d_bil_full_pixel_loop: - movq xmm1, QWORD PTR [rsi] ; - punpcklbw xmm1, xmm0 ; - - movq xmm2, QWORD PTR [rdi] ; - punpcklbw xmm2, xmm0 ; - - psubw xmm1, xmm2 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - - lea rsi, [rsi + rax] ;ref_pixels_per_line - lea rdi, [rdi + rbx] ;src_pixels_per_line - - sub rcx, 1 ; - jnz filter_block2d_bil_full_pixel_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_var_sse2_fp_only: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 ; - movsxd rbx, dword ptr arg(3) ;src_pixels_per_line - -filter_block2d_bil_fp_only_loop: - movq xmm1, QWORD PTR [rsi] ; - movq xmm3, QWORD PTR [rsi+1] ; - - punpcklbw xmm1, xmm0 ; - pmullw xmm1, [rax] ; - punpcklbw xmm3, xmm0 ; - pmullw xmm3, [rax+16] ; - - paddw xmm1, xmm3 ; - paddw xmm1, xmm4 ; - psraw xmm1, xmm_filter_shift ; - - movq xmm3, QWORD PTR [rdi] ; - punpcklbw xmm3, xmm0 ; - - psubw xmm1, xmm3 ; - paddw xmm6, xmm1 ; - - pmaddwd xmm1, xmm1 ; - paddd xmm7, xmm1 ; - lea rsi, [rsi + rdx] - lea rdi, [rdi + rbx] ;src_pixels_per_line - - sub rcx, 1 ; - jnz filter_block2d_bil_fp_only_loop ; - - jmp filter_block2d_bil_variance - -filter_block2d_bil_variance: - movdq2q mm6, xmm6 ; - movdq2q mm7, xmm7 ; - - psrldq xmm6, 8 - psrldq xmm7, 8 - - movdq2q mm2, xmm6 - movdq2q mm3, xmm7 - - paddw mm6, mm2 - paddd mm7, mm3 - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rsi, arg(7) ; sum - mov rdi, arg(8) ; sumsquared - - movd [rsi], mm2 ; xsum - movd [rdi], mm4 ; xxsum - - ; begin epilog - pop rbx - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - - ;void vp9_half_horiz_vert_variance16x_h_sse2 ;( ; unsigned char *ref_ptr, @@ -619,27 +335,3 @@ sym(vp9_half_horiz_variance16x_h_sse2): UNSHADOW_ARGS pop rbp ret - -SECTION_RODATA -; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; -align 16 -xmm_bi_rd: - times 8 dw 64 -align 16 -bilinear_filters_sse2: - dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 - dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8 - dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 - dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24 - dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 - dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40 - dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 - dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56 - dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72 - dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 - dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88 - dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 - dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104 - dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 - dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120 diff --git a/vp9/encoder/x86/vp9_subtract_mmx.asm b/vp9/encoder/x86/vp9_subtract_mmx.asm deleted file mode 100644 index e9eda4fed..000000000 --- a/vp9/encoder/x86/vp9_subtract_mmx.asm +++ /dev/null @@ -1,432 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_subtract_b_mmx_impl(unsigned char *z, int src_stride, -; short *diff, unsigned char *Predictor, -; int pitch); -global sym(vp9_subtract_b_mmx_impl) PRIVATE -sym(vp9_subtract_b_mmx_impl): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - - mov rdi, arg(2) ;diff - mov rax, arg(3) ;Predictor - mov rsi, arg(0) ;z - movsxd rdx, dword ptr arg(1);src_stride; - movsxd rcx, dword ptr arg(4);pitch - pxor mm7, mm7 - - movd mm0, [rsi] - movd mm1, [rax] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq [rdi], mm0 - - - movd mm0, [rsi+rdx] - movd mm1, [rax+rcx] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq [rdi+rcx*2],mm0 - - - movd mm0, [rsi+rdx*2] - movd mm1, [rax+rcx*2] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq [rdi+rcx*4], mm0 - - lea rsi, [rsi+rdx*2] - lea rcx, [rcx+rcx*2] - - - - movd mm0, [rsi+rdx] - movd mm1, [rax+rcx] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq [rdi+rcx*2], mm0 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride) -global sym(vp9_subtract_mby_mmx) PRIVATE -sym(vp9_subtract_mby_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - push rsi - push rdi - ; end prolog - - - mov rsi, arg(1) ;src - mov rdi, arg(0) ;diff - - mov rax, arg(2) ;pred - movsxd rdx, dword ptr arg(3) ;stride - - mov rcx, 16 - pxor mm0, mm0 - -.submby_loop: - - movq mm1, [rsi] - movq mm3, [rax] - - movq mm2, mm1 - movq mm4, mm3 - - punpcklbw mm1, mm0 - punpcklbw mm3, mm0 - - punpckhbw mm2, mm0 - punpckhbw mm4, mm0 - - psubw mm1, mm3 - psubw mm2, mm4 - - movq [rdi], mm1 - movq [rdi+8], mm2 - - - movq mm1, [rsi+8] - movq mm3, [rax+8] - - movq mm2, mm1 - movq mm4, mm3 - - punpcklbw mm1, mm0 - punpcklbw mm3, mm0 - - punpckhbw mm2, mm0 - punpckhbw mm4, mm0 - - psubw mm1, mm3 - psubw mm2, mm4 - - movq [rdi+16], mm1 - movq [rdi+24], mm2 - - - add rdi, 32 - add rax, 16 - - lea rsi, [rsi+rdx] - - sub rcx, 1 - jnz .submby_loop - - pop rdi - pop rsi - ; begin epilog - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) -global sym(vp9_subtract_mbuv_mmx) PRIVATE -sym(vp9_subtract_mbuv_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - push rsi - push rdi - ; end prolog - - ;short *udiff = diff + 256; - ;short *vdiff = diff + 320; - ;unsigned char *upred = pred + 256; - ;unsigned char *vpred = pred + 320; - - ;unsigned char *z = usrc; - ;unsigned short *diff = udiff; - ;unsigned char *Predictor= upred; - - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, arg(1) ;z = usrc - add rdi, 256*2 ;diff = diff + 256 (shorts) - add rax, 256 ;Predictor = pred + 256 - movsxd rdx, dword ptr arg(4) ;stride; - pxor mm7, mm7 - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - ;unsigned char *z = vsrc; - ;unsigned short *diff = vdiff; - ;unsigned char *Predictor= vpred; - - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, arg(2) ;z = usrc - add rdi, 320*2 ;diff = diff + 320 (shorts) - add rax, 320 ;Predictor = pred + 320 - movsxd rdx, dword ptr arg(4) ;stride; - pxor mm7, mm7 - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi] - movq mm1, [rax] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi], mm0 - movq [rdi+8], mm3 - - - movq mm0, [rsi+rdx] - movq mm1, [rax+8] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+16], mm0 - movq [rdi+24], mm3 - - movq mm0, [rsi+rdx*2] - movq mm1, [rax+16] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - movq [rdi+32], mm0 - movq [rdi+40], mm3 - lea rsi, [rsi+rdx*2] - - - movq mm0, [rsi+rdx] - movq mm1, [rax+24] - movq mm3, mm0 - movq mm4, mm1 - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - punpckhbw mm3, mm7 - punpckhbw mm4, mm7 - psubw mm0, mm1 - psubw mm3, mm4 - - movq [rdi+48], mm0 - movq [rdi+56], mm3 - - ; begin epilog - pop rdi - pop rsi - UNSHADOW_ARGS - pop rbp - ret diff --git a/vp9/encoder/x86/vp9_subtract_sse2.asm b/vp9/encoder/x86/vp9_subtract_sse2.asm index 739d9487e..982408083 100644 --- a/vp9/encoder/x86/vp9_subtract_sse2.asm +++ b/vp9/encoder/x86/vp9_subtract_sse2.asm @@ -8,349 +8,120 @@ ; be found in the AUTHORS file in the root of the source tree. ; - -%include "vpx_ports/x86_abi_support.asm" - -;void vp9_subtract_b_sse2_impl(unsigned char *z, int src_stride, -; short *diff, unsigned char *Predictor, -; int pitch); -global sym(vp9_subtract_b_sse2_impl) PRIVATE -sym(vp9_subtract_b_sse2_impl): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdi, arg(2) ;diff - mov rax, arg(3) ;Predictor - mov rsi, arg(0) ;z - movsxd rdx, dword ptr arg(1);src_stride; - movsxd rcx, dword ptr arg(4);pitch - pxor mm7, mm7 - - movd mm0, [rsi] - movd mm1, [rax] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq MMWORD PTR [rdi], mm0 - - movd mm0, [rsi+rdx] - movd mm1, [rax+rcx] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq MMWORD PTR [rdi+rcx*2], mm0 - - movd mm0, [rsi+rdx*2] - movd mm1, [rax+rcx*2] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq MMWORD PTR [rdi+rcx*4], mm0 - - lea rsi, [rsi+rdx*2] - lea rcx, [rcx+rcx*2] - - movd mm0, [rsi+rdx] - movd mm1, [rax+rcx] - punpcklbw mm0, mm7 - punpcklbw mm1, mm7 - psubw mm0, mm1 - movq MMWORD PTR [rdi+rcx*2], mm0 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride) -global sym(vp9_subtract_mby_sse2) PRIVATE -sym(vp9_subtract_mby_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 4 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(1) ;src - mov rdi, arg(0) ;diff - - mov rax, arg(2) ;pred - movsxd rdx, dword ptr arg(3) ;stride - - mov rcx, 8 ; do two lines at one time - -.submby_loop: - movdqa xmm0, XMMWORD PTR [rsi] ; src - movdqa xmm1, XMMWORD PTR [rax] ; pred - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 - - movdqa xmm4, XMMWORD PTR [rsi + rdx] - movdqa xmm5, XMMWORD PTR [rax + 16] - - movdqa xmm6, xmm4 - psubb xmm4, xmm5 - - pxor xmm5, [GLOBAL(t80)] ;convert to signed values - pxor xmm6, [GLOBAL(t80)] - pcmpgtb xmm5, xmm6 ; obtain sign information - - movdqa xmm6, xmm4 - movdqa xmm7, xmm5 - punpcklbw xmm4, xmm5 ; put sign back to subtraction - punpckhbw xmm6, xmm7 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi +32], xmm4 - movdqa XMMWORD PTR [rdi +48], xmm6 - - add rdi, 64 - add rax, 32 - lea rsi, [rsi+rdx*2] - - sub rcx, 1 - jnz .submby_loop - - pop rdi - pop rsi - ; begin epilog - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) -global sym(vp9_subtract_mbuv_sse2) PRIVATE -sym(vp9_subtract_mbuv_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 5 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rdi, arg(0) ;diff - mov rax, arg(3) ;pred - mov rsi, arg(1) ;z = usrc - add rdi, 256*2 ;diff = diff + 256 (shorts) - add rax, 256 ;Predictor = pred + 256 - movsxd rdx, dword ptr arg(4) ;stride; - lea rcx, [rdx + rdx*2] - - ;u - ;line 0 1 - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 - - ;line 2 3 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+16] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 32], xmm0 - movdqa XMMWORD PTR [rdi + 48], xmm2 - - ;line 4 5 - lea rsi, [rsi + rdx*4] - - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax + 32] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 64], xmm0 - movdqa XMMWORD PTR [rdi + 80], xmm2 - - ;line 6 7 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 96], xmm0 - movdqa XMMWORD PTR [rdi + 112], xmm2 - - ;v - mov rsi, arg(2) ;z = vsrc - add rdi, 64*2 ;diff = diff + 320 (shorts) - add rax, 64 ;Predictor = pred + 320 - - ;line 0 1 - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi], xmm0 - movdqa XMMWORD PTR [rdi +16], xmm2 - - ;line 2 3 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+16] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 32], xmm0 - movdqa XMMWORD PTR [rdi + 48], xmm2 - - ;line 4 5 - lea rsi, [rsi + rdx*4] - - movq xmm0, MMWORD PTR [rsi] ; src - movq xmm2, MMWORD PTR [rsi+rdx] - movdqa xmm1, XMMWORD PTR [rax + 32] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 64], xmm0 - movdqa XMMWORD PTR [rdi + 80], xmm2 - - ;line 6 7 - movq xmm0, MMWORD PTR [rsi+rdx*2] ; src - movq xmm2, MMWORD PTR [rsi+rcx] - movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred - punpcklqdq xmm0, xmm2 - - movdqa xmm2, xmm0 - psubb xmm0, xmm1 ; subtraction with sign missed - - pxor xmm1, [GLOBAL(t80)] ;convert to signed values - pxor xmm2, [GLOBAL(t80)] - pcmpgtb xmm1, xmm2 ; obtain sign information - - movdqa xmm2, xmm0 - movdqa xmm3, xmm1 - punpcklbw xmm0, xmm1 ; put sign back to subtraction - punpckhbw xmm2, xmm3 ; put sign back to subtraction - - movdqa XMMWORD PTR [rdi + 96], xmm0 - movdqa XMMWORD PTR [rdi + 112], xmm2 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -SECTION_RODATA -align 16 -t80: - times 16 db 0x80 +%include "third_party/x86inc/x86inc.asm" + +SECTION .text + +; void vp9_subtract_block(int rows, int cols, +; int16_t *diff, ptrdiff_t diff_stride, +; const uint8_t *src, ptrdiff_t src_stride, +; const uint8_t *pred, ptrdiff_t pred_stride) + +INIT_XMM sse2 +cglobal subtract_block, 7, 7, 8, \ + rows, cols, diff, diff_stride, src, src_stride, \ + pred, pred_stride +%define pred_str colsq + pxor m7, m7 ; dedicated zero register + cmp colsd, 4 + je .case_4 + cmp colsd, 8 + je .case_8 + cmp colsd, 16 + je .case_16 + cmp colsd, 32 + je .case_32 + +%macro loop16 6 + mova m0, [srcq+%1] + mova m4, [srcq+%2] + mova m1, [predq+%3] + mova m5, [predq+%4] + punpckhbw m2, m0, m7 + punpckhbw m3, m1, m7 + punpcklbw m0, m7 + punpcklbw m1, m7 + psubw m2, m3 + psubw m0, m1 + punpckhbw m1, m4, m7 + punpckhbw m3, m5, m7 + punpcklbw m4, m7 + punpcklbw m5, m7 + psubw m1, m3 + psubw m4, m5 + mova [diffq+mmsize*0+%5], m0 + mova [diffq+mmsize*1+%5], m2 + mova [diffq+mmsize*0+%6], m4 + mova [diffq+mmsize*1+%6], m1 +%endmacro + + mov pred_str, pred_stridemp +.loop_64: + loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize + loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + dec rowsd + jg .loop_64 + RET + +.case_32: + mov pred_str, pred_stridemp +.loop_32: + loop16 0, mmsize, 0, mmsize, 0, 2*mmsize + lea diffq, [diffq+diff_strideq*2] + add predq, pred_str + add srcq, src_strideq + dec rowsd + jg .loop_32 + RET + +.case_16: + mov pred_str, pred_stridemp +.loop_16: + loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2 + lea diffq, [diffq+diff_strideq*4] + lea predq, [predq+pred_str*2] + lea srcq, [srcq+src_strideq*2] + sub rowsd, 2 + jg .loop_16 + RET + +%macro loop_h 0 + movh m0, [srcq] + movh m2, [srcq+src_strideq] + movh m1, [predq] + movh m3, [predq+pred_str] + punpcklbw m0, m7 + punpcklbw m1, m7 + punpcklbw m2, m7 + punpcklbw m3, m7 + psubw m0, m1 + psubw m2, m3 + mova [diffq], m0 + mova [diffq+diff_strideq*2], m2 +%endmacro + +.case_8: + mov pred_str, pred_stridemp +.loop_8: + loop_h + lea diffq, [diffq+diff_strideq*4] + lea srcq, [srcq+src_strideq*2] + lea predq, [predq+pred_str*2] + sub rowsd, 2 + jg .loop_8 + RET + +INIT_MMX +.case_4: + mov pred_str, pred_stridemp +.loop_4: + loop_h + lea diffq, [diffq+diff_strideq*4] + lea srcq, [srcq+src_strideq*2] + lea predq, [predq+pred_str*2] + sub rowsd, 2 + jg .loop_4 + RET diff --git a/vp9/encoder/x86/vp9_variance_impl_mmx.asm b/vp9/encoder/x86/vp9_variance_impl_mmx.asm index 9f140c96b..d3dbefed8 100644 --- a/vp9/encoder/x86/vp9_variance_impl_mmx.asm +++ b/vp9/encoder/x86/vp9_variance_impl_mmx.asm @@ -508,344 +508,3 @@ sym(vp9_get4x4sse_cs_mmx): UNSHADOW_ARGS pop rbp ret - -%define mmx_filter_shift 7 - -;void vp9_filter_block2d_bil4x4_var_mmx -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned short *HFilter, -; unsigned short *VFilter, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_filter_block2d_bil4x4_var_mmx) PRIVATE -sym(vp9_filter_block2d_bil4x4_var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 8 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - - pxor mm6, mm6 ; - pxor mm7, mm7 ; - - mov rax, arg(4) ;HFilter ; - mov rdx, arg(5) ;VFilter ; - - mov rsi, arg(0) ;ref_ptr ; - mov rdi, arg(2) ;src_ptr ; - - mov rcx, 4 ; - pxor mm0, mm0 ; - - movd mm1, [rsi] ; - movd mm3, [rsi+1] ; - - punpcklbw mm1, mm0 ; - pmullw mm1, [rax] ; - - punpcklbw mm3, mm0 ; - pmullw mm3, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - movq mm5, mm1 - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; - add rsi, r8 -%endif - -.filter_block2d_bil4x4_var_mmx_loop: - - movd mm1, [rsi] ; - movd mm3, [rsi+1] ; - - punpcklbw mm1, mm0 ; - pmullw mm1, [rax] ; - - punpcklbw mm3, mm0 ; - pmullw mm3, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - movq mm3, mm5 ; - - movq mm5, mm1 ; - pmullw mm3, [rdx] ; - - pmullw mm1, [rdx+8] ; - paddw mm1, mm3 ; - - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - psraw mm1, mmx_filter_shift ; - - movd mm3, [rdi] ; - punpcklbw mm3, mm0 ; - - psubw mm1, mm3 ; - paddw mm6, mm1 ; - - pmaddwd mm1, mm1 ; - paddd mm7, mm1 ; - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; - add rdi, dword ptr arg(3) ;src_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - movsxd r9, dword ptr arg(3) ;src_pixels_per_line - add rsi, r8 - add rdi, r9 -%endif - sub rcx, 1 ; - jnz .filter_block2d_bil4x4_var_mmx_loop ; - - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rdi, arg(6) ;sum - mov rsi, arg(7) ;sumsquared - - movd dword ptr [rdi], mm2 ; - movd dword ptr [rsi], mm4 ; - - - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - - - -;void vp9_filter_block2d_bil_var_mmx -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; unsigned short *HFilter, -; unsigned short *VFilter, -; int *sum, -; unsigned int *sumsquared -;) -global sym(vp9_filter_block2d_bil_var_mmx) PRIVATE -sym(vp9_filter_block2d_bil_var_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - GET_GOT rbx - push rsi - push rdi - sub rsp, 16 - ; end prolog - - pxor mm6, mm6 ; - pxor mm7, mm7 ; - mov rax, arg(5) ;HFilter ; - - mov rdx, arg(6) ;VFilter ; - mov rsi, arg(0) ;ref_ptr ; - - mov rdi, arg(2) ;src_ptr ; - movsxd rcx, dword ptr arg(4) ;Height ; - - pxor mm0, mm0 ; - movq mm1, [rsi] ; - - movq mm3, [rsi+1] ; - movq mm2, mm1 ; - - movq mm4, mm3 ; - punpcklbw mm1, mm0 ; - - punpckhbw mm2, mm0 ; - pmullw mm1, [rax] ; - - pmullw mm2, [rax] ; - punpcklbw mm3, mm0 ; - - punpckhbw mm4, mm0 ; - pmullw mm3, [rax+8] ; - - pmullw mm4, [rax+8] ; - paddw mm1, mm3 ; - - paddw mm2, mm4 ; - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - - psraw mm2, mmx_filter_shift ; - movq mm5, mm1 - - packuswb mm5, mm2 ; -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - add rsi, r8 -%endif - -.filter_block2d_bil_var_mmx_loop: - - movq mm1, [rsi] ; - movq mm3, [rsi+1] ; - - movq mm2, mm1 ; - movq mm4, mm3 ; - - punpcklbw mm1, mm0 ; - punpckhbw mm2, mm0 ; - - pmullw mm1, [rax] ; - pmullw mm2, [rax] ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - pmullw mm3, [rax+8] ; - pmullw mm4, [rax+8] ; - - paddw mm1, mm3 ; - paddw mm2, mm4 ; - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - psraw mm1, mmx_filter_shift ; - - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - psraw mm2, mmx_filter_shift ; - - movq mm3, mm5 ; - movq mm4, mm5 ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - movq mm5, mm1 ; - packuswb mm5, mm2 ; - - pmullw mm3, [rdx] ; - pmullw mm4, [rdx] ; - - pmullw mm1, [rdx+8] ; - pmullw mm2, [rdx+8] ; - - paddw mm1, mm3 ; - paddw mm2, mm4 ; - - paddw mm1, [GLOBAL(mmx_bi_rd)] ; - paddw mm2, [GLOBAL(mmx_bi_rd)] ; - - psraw mm1, mmx_filter_shift ; - psraw mm2, mmx_filter_shift ; - - movq mm3, [rdi] ; - movq mm4, mm3 ; - - punpcklbw mm3, mm0 ; - punpckhbw mm4, mm0 ; - - psubw mm1, mm3 ; - psubw mm2, mm4 ; - - paddw mm6, mm1 ; - pmaddwd mm1, mm1 ; - - paddw mm6, mm2 ; - pmaddwd mm2, mm2 ; - - paddd mm7, mm1 ; - paddd mm7, mm2 ; - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line ; - add rdi, dword ptr arg(3) ;src_pixels_per_line ; -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; - movsxd r9, dword ptr arg(3) ;src_pixels_per_line ; - add rsi, r8 - add rdi, r9 -%endif - sub rcx, 1 ; - jnz .filter_block2d_bil_var_mmx_loop ; - - - pxor mm3, mm3 ; - pxor mm2, mm2 ; - - punpcklwd mm2, mm6 ; - punpckhwd mm3, mm6 ; - - paddd mm2, mm3 ; - movq mm6, mm2 ; - - psrlq mm6, 32 ; - paddd mm2, mm6 ; - - psrad mm2, 16 ; - movq mm4, mm7 ; - - psrlq mm4, 32 ; - paddd mm4, mm7 ; - - mov rdi, arg(7) ;sum - mov rsi, arg(8) ;sumsquared - - movd dword ptr [rdi], mm2 ; - movd dword ptr [rsi], mm4 ; - - ; begin epilog - add rsp, 16 - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -;short mmx_bi_rd[4] = { 64, 64, 64, 64}; -align 16 -mmx_bi_rd: - times 4 dw 64 diff --git a/vp9/encoder/x86/vp9_variance_impl_sse2.asm b/vp9/encoder/x86/vp9_variance_impl_sse2.asm index 896dd185d..2c5088134 100644 --- a/vp9/encoder/x86/vp9_variance_impl_sse2.asm +++ b/vp9/encoder/x86/vp9_variance_impl_sse2.asm @@ -11,8 +11,6 @@ %include "vpx_ports/x86_abi_support.asm" -%define xmm_filter_shift 7 - ;unsigned int vp9_get_mb_ss_sse2 ;( ; short *src_ptr @@ -734,28 +732,3 @@ sym(vp9_half_horiz_variance8x_h_sse2): UNSHADOW_ARGS pop rbp ret - - -SECTION_RODATA -; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64}; -align 16 -xmm_bi_rd: - times 8 dw 64 -align 16 -bilinear_filters_sse2: - dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0 - dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8 - dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16 - dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24 - dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32 - dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40 - dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48 - dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56 - dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64 - dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72 - dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80 - dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88 - dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96 - dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104 - dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112 - dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120 diff --git a/vp9/encoder/x86/vp9_variance_impl_ssse3.asm b/vp9/encoder/x86/vp9_variance_impl_ssse3.asm deleted file mode 100644 index 98a4a16f6..000000000 --- a/vp9/encoder/x86/vp9_variance_impl_ssse3.asm +++ /dev/null @@ -1,372 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" - -%define xmm_filter_shift 7 - - -;void vp9_filter_block2d_bil_var_ssse3 -;( -; unsigned char *ref_ptr, -; int ref_pixels_per_line, -; unsigned char *src_ptr, -; int src_pixels_per_line, -; unsigned int Height, -; int xoffset, -; int yoffset, -; int *sum, -; unsigned int *sumsquared;; -; -;) -;Note: The filter coefficient at offset=0 is 128. Since the second register -;for Pmaddubsw is signed bytes, we must calculate zero offset seperately. -global sym(vp9_filter_block2d_bil_var_ssse3) PRIVATE -sym(vp9_filter_block2d_bil_var_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 9 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - pxor xmm6, xmm6 - pxor xmm7, xmm7 - - lea rcx, [GLOBAL(bilinear_filters_ssse3)] - movsxd rax, dword ptr arg(5) ; xoffset - - cmp rax, 0 ; skip first_pass filter if xoffset=0 - je .filter_block2d_bil_var_ssse3_sp_only - - shl rax, 4 ; point to filter coeff with xoffset - lea rax, [rax + rcx] ; HFilter - - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; skip second_pass filter if yoffset=0 - je .filter_block2d_bil_var_ssse3_fp_only - - shl rdx, 4 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - - movdqu xmm0, XMMWORD PTR [rsi] - movdqu xmm1, XMMWORD PTR [rsi+1] - movdqa xmm2, xmm0 - - punpcklbw xmm0, xmm1 - punpckhbw xmm2, xmm1 - pmaddubsw xmm0, [rax] - pmaddubsw xmm2, [rax] - - paddw xmm0, [GLOBAL(xmm_bi_rd)] - paddw xmm2, [GLOBAL(xmm_bi_rd)] - psraw xmm0, xmm_filter_shift - psraw xmm2, xmm_filter_shift - - packuswb xmm0, xmm2 - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line -%else - movsxd r8, dword ptr arg(1) ;ref_pixels_per_line - movsxd r9, dword ptr arg(3) ;src_pixels_per_line - lea rsi, [rsi + r8] -%endif - -.filter_block2d_bil_var_ssse3_loop: - movdqu xmm1, XMMWORD PTR [rsi] - movdqu xmm2, XMMWORD PTR [rsi+1] - movdqa xmm3, xmm1 - - punpcklbw xmm1, xmm2 - punpckhbw xmm3, xmm2 - pmaddubsw xmm1, [rax] - pmaddubsw xmm3, [rax] - - paddw xmm1, [GLOBAL(xmm_bi_rd)] - paddw xmm3, [GLOBAL(xmm_bi_rd)] - psraw xmm1, xmm_filter_shift - psraw xmm3, xmm_filter_shift - packuswb xmm1, xmm3 - - movdqa xmm2, xmm0 - movdqa xmm0, xmm1 - movdqa xmm3, xmm2 - - punpcklbw xmm2, xmm1 - punpckhbw xmm3, xmm1 - pmaddubsw xmm2, [rdx] - pmaddubsw xmm3, [rdx] - - paddw xmm2, [GLOBAL(xmm_bi_rd)] - paddw xmm3, [GLOBAL(xmm_bi_rd)] - psraw xmm2, xmm_filter_shift - psraw xmm3, xmm_filter_shift - - movq xmm1, QWORD PTR [rdi] - pxor xmm4, xmm4 - punpcklbw xmm1, xmm4 - movq xmm5, QWORD PTR [rdi+8] - punpcklbw xmm5, xmm4 - - psubw xmm2, xmm1 - psubw xmm3, xmm5 - paddw xmm6, xmm2 - paddw xmm6, xmm3 - pmaddwd xmm2, xmm2 - pmaddwd xmm3, xmm3 - paddd xmm7, xmm2 - paddd xmm7, xmm3 - -%if ABI_IS_32BIT - add rsi, dword ptr arg(1) ;ref_pixels_per_line - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rsi, [rsi + r8] - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 - jnz .filter_block2d_bil_var_ssse3_loop - - jmp .filter_block2d_bil_variance - -.filter_block2d_bil_var_ssse3_sp_only: - movsxd rdx, dword ptr arg(6) ; yoffset - - cmp rdx, 0 ; Both xoffset =0 and yoffset=0 - je .filter_block2d_bil_var_ssse3_full_pixel - - shl rdx, 4 - lea rdx, [rdx + rcx] ; VFilter - - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - - movdqu xmm1, XMMWORD PTR [rsi] - movdqa xmm0, xmm1 - -%if ABI_IS_32BIT=0 - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - - lea rsi, [rsi + rax] - -.filter_block2d_bil_sp_only_loop: - movdqu xmm3, XMMWORD PTR [rsi] - movdqa xmm2, xmm1 - movdqa xmm0, xmm3 - - punpcklbw xmm1, xmm3 - punpckhbw xmm2, xmm3 - pmaddubsw xmm1, [rdx] - pmaddubsw xmm2, [rdx] - - paddw xmm1, [GLOBAL(xmm_bi_rd)] - paddw xmm2, [GLOBAL(xmm_bi_rd)] - psraw xmm1, xmm_filter_shift - psraw xmm2, xmm_filter_shift - - movq xmm3, QWORD PTR [rdi] - pxor xmm4, xmm4 - punpcklbw xmm3, xmm4 - movq xmm5, QWORD PTR [rdi+8] - punpcklbw xmm5, xmm4 - - psubw xmm1, xmm3 - psubw xmm2, xmm5 - paddw xmm6, xmm1 - paddw xmm6, xmm2 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm7, xmm1 - paddd xmm7, xmm2 - - movdqa xmm1, xmm0 - lea rsi, [rsi + rax] ;ref_pixels_per_line - -%if ABI_IS_32BIT - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 - jnz .filter_block2d_bil_sp_only_loop - - jmp .filter_block2d_bil_variance - -.filter_block2d_bil_var_ssse3_full_pixel: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rax, dword ptr arg(1) ;ref_pixels_per_line - movsxd rdx, dword ptr arg(3) ;src_pixels_per_line - pxor xmm0, xmm0 - -.filter_block2d_bil_full_pixel_loop: - movq xmm1, QWORD PTR [rsi] - punpcklbw xmm1, xmm0 - movq xmm2, QWORD PTR [rsi+8] - punpcklbw xmm2, xmm0 - - movq xmm3, QWORD PTR [rdi] - punpcklbw xmm3, xmm0 - movq xmm4, QWORD PTR [rdi+8] - punpcklbw xmm4, xmm0 - - psubw xmm1, xmm3 - psubw xmm2, xmm4 - paddw xmm6, xmm1 - paddw xmm6, xmm2 - pmaddwd xmm1, xmm1 - pmaddwd xmm2, xmm2 - paddd xmm7, xmm1 - paddd xmm7, xmm2 - - lea rsi, [rsi + rax] ;ref_pixels_per_line - lea rdi, [rdi + rdx] ;src_pixels_per_line - sub rcx, 1 - jnz .filter_block2d_bil_full_pixel_loop - - jmp .filter_block2d_bil_variance - -.filter_block2d_bil_var_ssse3_fp_only: - mov rsi, arg(0) ;ref_ptr - mov rdi, arg(2) ;src_ptr - movsxd rcx, dword ptr arg(4) ;Height - movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line - - pxor xmm0, xmm0 - -%if ABI_IS_32BIT=0 - movsxd r9, dword ptr arg(3) ;src_pixels_per_line -%endif - -.filter_block2d_bil_fp_only_loop: - movdqu xmm1, XMMWORD PTR [rsi] - movdqu xmm2, XMMWORD PTR [rsi+1] - movdqa xmm3, xmm1 - - punpcklbw xmm1, xmm2 - punpckhbw xmm3, xmm2 - pmaddubsw xmm1, [rax] - pmaddubsw xmm3, [rax] - - paddw xmm1, [GLOBAL(xmm_bi_rd)] - paddw xmm3, [GLOBAL(xmm_bi_rd)] - psraw xmm1, xmm_filter_shift - psraw xmm3, xmm_filter_shift - - movq xmm2, XMMWORD PTR [rdi] - pxor xmm4, xmm4 - punpcklbw xmm2, xmm4 - movq xmm5, QWORD PTR [rdi+8] - punpcklbw xmm5, xmm4 - - psubw xmm1, xmm2 - psubw xmm3, xmm5 - paddw xmm6, xmm1 - paddw xmm6, xmm3 - pmaddwd xmm1, xmm1 - pmaddwd xmm3, xmm3 - paddd xmm7, xmm1 - paddd xmm7, xmm3 - - lea rsi, [rsi + rdx] -%if ABI_IS_32BIT - add rdi, dword ptr arg(3) ;src_pixels_per_line -%else - lea rdi, [rdi + r9] -%endif - - sub rcx, 1 - jnz .filter_block2d_bil_fp_only_loop - - jmp .filter_block2d_bil_variance - -.filter_block2d_bil_variance: - pxor xmm0, xmm0 - pxor xmm1, xmm1 - pxor xmm5, xmm5 - - punpcklwd xmm0, xmm6 - punpckhwd xmm1, xmm6 - psrad xmm0, 16 - psrad xmm1, 16 - paddd xmm0, xmm1 - movdqa xmm1, xmm0 - - movdqa xmm6, xmm7 - punpckldq xmm6, xmm5 - punpckhdq xmm7, xmm5 - paddd xmm6, xmm7 - - punpckldq xmm0, xmm5 - punpckhdq xmm1, xmm5 - paddd xmm0, xmm1 - - movdqa xmm7, xmm6 - movdqa xmm1, xmm0 - - psrldq xmm7, 8 - psrldq xmm1, 8 - - paddd xmm6, xmm7 - paddd xmm0, xmm1 - - mov rsi, arg(7) ;[Sum] - mov rdi, arg(8) ;[SSE] - - movd [rsi], xmm0 - movd [rdi], xmm6 - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -SECTION_RODATA -align 16 -xmm_bi_rd: - times 8 dw 64 -align 16 -bilinear_filters_ssse3: - times 8 db 128, 0 - times 8 db 120, 8 - times 8 db 112, 16 - times 8 db 104, 24 - times 8 db 96, 32 - times 8 db 88, 40 - times 8 db 80, 48 - times 8 db 72, 56 - times 8 db 64, 64 - times 8 db 56, 72 - times 8 db 48, 80 - times 8 db 40, 88 - times 8 db 32, 96 - times 8 db 24, 104 - times 8 db 16, 112 - times 8 db 8, 120 diff --git a/vp9/encoder/x86/vp9_variance_mmx.c b/vp9/encoder/x86/vp9_variance_mmx.c index bad1cfa74..d1415606e 100644 --- a/vp9/encoder/x86/vp9_variance_mmx.c +++ b/vp9/encoder/x86/vp9_variance_mmx.c @@ -13,27 +13,6 @@ #include "vp9/common/vp9_pragmas.h" #include "vpx_ports/mem.h" -extern void filter_block1d_h6_mmx -( - const unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - short *vp7_filter -); -extern void filter_block1d_v6_mmx -( - const short *src_ptr, - unsigned char *output_ptr, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - short *vp7_filter -); - extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr); extern unsigned int vp9_get8x8var_mmx ( @@ -53,30 +32,6 @@ extern unsigned int vp9_get4x4var_mmx unsigned int *SSE, int *Sum ); -extern void vp9_filter_block2d_bil4x4_var_mmx -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - const short *HFilter, - const short *VFilter, - int *sum, - unsigned int *sumsquared -); -extern void vp9_filter_block2d_bil_var_mmx -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - const short *HFilter, - const short *VFilter, - int *sum, - unsigned int *sumsquared -); - unsigned int vp9_variance4x4_mmx( const unsigned char *src_ptr, @@ -190,193 +145,3 @@ unsigned int vp9_variance8x16_mmx( return (var - (((unsigned int)avg * avg) >> 7)); } - -DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]); - -unsigned int vp9_sub_pixel_variance4x4_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) - -{ - int xsum; - unsigned int xxsum; - vp9_filter_block2d_bil4x4_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 4)); -} - - -unsigned int vp9_sub_pixel_variance8x8_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - - int xsum; - unsigned int xxsum; - vp9_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 6)); -} - -unsigned int vp9_sub_pixel_variance16x16_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - vp9_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum0, &xxsum0 - ); - - vp9_filter_block2d_bil_var_mmx( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum1, &xxsum1 - ); - - xsum0 += xsum1; - xxsum0 += xxsum1; - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); - - -} - -unsigned int vp9_sub_pixel_mse16x16_mmx( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - vp9_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); - return *sse; -} - -unsigned int vp9_sub_pixel_variance16x8_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - - vp9_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum0, &xxsum0 - ); - - - vp9_filter_block2d_bil_var_mmx( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum1, &xxsum1 - ); - - xsum0 += xsum1; - xxsum0 += xxsum1; - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); -} - -unsigned int vp9_sub_pixel_variance8x16_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum; - unsigned int xxsum; - vp9_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 7)); -} - - -unsigned int vp9_variance_halfpixvar16x16_h_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 0, - ref_ptr, recon_stride, sse); -} - - -unsigned int vp9_variance_halfpixvar16x16_v_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 8, - ref_ptr, recon_stride, sse); -} - - -unsigned int vp9_variance_halfpixvar16x16_hv_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 8, - ref_ptr, recon_stride, sse); -} diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c index 67ca9257c..b4ff8509c 100644 --- a/vp9/encoder/x86/vp9_variance_sse2.c +++ b/vp9/encoder/x86/vp9_variance_sse2.c @@ -9,29 +9,11 @@ */ #include "vpx_config.h" + #include "vp9/encoder/vp9_variance.h" #include "vp9/common/vp9_pragmas.h" #include "vpx_ports/mem.h" -#define HALFNDX 8 - -extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); -extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter); - -extern void vp9_filter_block2d_bil4x4_var_mmx -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - const short *HFilter, - const short *VFilter, - int *sum, - unsigned int *sumsquared -); - extern unsigned int vp9_get4x4var_mmx ( const unsigned char *src_ptr, @@ -64,18 +46,6 @@ unsigned int vp9_get8x8var_sse2 unsigned int *SSE, int *Sum ); -void vp9_filter_block2d_bil_var_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int xoffset, - int yoffset, - int *sum, - unsigned int *sumsquared -); void vp9_half_horiz_vert_variance8x_h_sse2 ( const unsigned char *ref_ptr, @@ -137,8 +107,6 @@ void vp9_half_vert_variance16x_h_sse2 unsigned int *sumsquared ); -DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]); - typedef unsigned int (*get_var_sse2) ( const unsigned char *src_ptr, int source_stride, @@ -375,347 +343,162 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr, return (var - (((int64_t)avg * avg) >> 11)); } -unsigned int vp9_sub_pixel_variance4x4_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum; - unsigned int xxsum; - vp9_filter_block2d_bil4x4_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 4)); +#define DECL(w, opt) \ +int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \ + ptrdiff_t src_stride, \ + int x_offset, int y_offset, \ + const uint8_t *dst, \ + ptrdiff_t dst_stride, \ + int height, unsigned int *sse) +#define DECLS(opt1, opt2) \ +DECL(4, opt2); \ +DECL(8, opt1); \ +DECL(16, opt1) + +DECLS(sse2, sse); +DECLS(ssse3, ssse3); +#undef DECLS +#undef DECL + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ +unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \ + int src_stride, \ + int x_offset, \ + int y_offset, \ + const uint8_t *dst, \ + int dst_stride, \ + unsigned int *sse_ptr) { \ + unsigned int sse; \ + int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \ + y_offset, dst, dst_stride, \ + h, &sse); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \ + x_offset, y_offset, \ + dst + 16, dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \ + x_offset, y_offset, \ + dst + 32, dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \ + x_offset, y_offset, \ + dst + 48, dst_stride, \ + h, &sse2); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sse_ptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ } - -unsigned int vp9_sub_pixel_variance8x8_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum; - unsigned int xxsum; - - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum, &xxsum); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum, &xxsum); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum, &xxsum); - } else { - vp9_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - xoffset, yoffset, - &xsum, &xxsum); - } - - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 6)); -} - -static void sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse, int *avg) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - // note we could avoid these if statements if the calling function - // just called the appropriate functions inside. - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else { - vp9_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum0, &xxsum0 - ); - - vp9_filter_block2d_bil_var_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum1, &xxsum1 - ); - xsum0 += xsum1; - xxsum0 += xxsum1; - } - - *sse = xxsum0; - *avg = xsum0; -} - -unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse_ptr) { - int avg; - unsigned int sse; - - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse, &avg); - *sse_ptr = sse; - - return (sse - (((unsigned int) avg * avg) >> 8)); -} - -unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse_ptr) { - int avg0, avg1, avg2, avg3; - unsigned int sse0, sse1, sse2, sse3; - - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse0, &avg0); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse1, &avg1); - src_ptr += 16 * src_pixels_per_line; - dst_ptr += 16 * dst_pixels_per_line; - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse2, &avg2); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse3, &avg3); - sse0 += sse1 + sse2 + sse3; - avg0 += avg1 + avg2 + avg3; - *sse_ptr = sse0; - - return (sse0 - (((unsigned int) avg0 * avg0) >> 10)); -} - -unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const uint8_t *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse_ptr) { - int avg0, avg1, avg2, avg3, avg4; - unsigned int sse0, sse1, sse2, sse3, sse4; - - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse0, &avg0); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse1, &avg1); - sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 32, dst_pixels_per_line, - &sse2, &avg2); - sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 48, dst_pixels_per_line, - &sse3, &avg3); - src_ptr += 16 * src_pixels_per_line; - dst_ptr += 16 * dst_pixels_per_line; - avg0 += avg1 + avg2 + avg3; - sse0 += sse1 + sse2 + sse3; - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse1, &avg1); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse2, &avg2); - sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 32, dst_pixels_per_line, - &sse3, &avg3); - sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 48, dst_pixels_per_line, - &sse4, &avg4); - src_ptr += 16 * src_pixels_per_line; - dst_ptr += 16 * dst_pixels_per_line; - avg0 += avg1 + avg2 + avg3 + avg4; - sse0 += sse1 + sse2 + sse3 + sse4; - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse1, &avg1); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse2, &avg2); - sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 32, dst_pixels_per_line, - &sse3, &avg3); - sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 48, dst_pixels_per_line, - &sse4, &avg4); - src_ptr += 16 * src_pixels_per_line; - dst_ptr += 16 * dst_pixels_per_line; - avg0 += avg1 + avg2 + avg3 + avg4; - sse0 += sse1 + sse2 + sse3 + sse4; - sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, - &sse1, &avg1); - sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 16, dst_pixels_per_line, - &sse2, &avg2); - sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 32, dst_pixels_per_line, - &sse3, &avg3); - sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset, - yoffset, dst_ptr + 48, dst_pixels_per_line, - &sse4, &avg4); - avg0 += avg1 + avg2 + avg3 + avg4; - sse0 += sse1 + sse2 + sse3 + sse4; - *sse_ptr = sse0; - - return (sse0 - (((unsigned int) avg0 * avg0) >> 12)); -} - -unsigned int vp9_sub_pixel_mse16x16_sse2( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - vp9_sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset, - yoffset, dst_ptr, dst_pixels_per_line, sse); - return *sse; -} - -unsigned int vp9_sub_pixel_variance16x8_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse - -) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else { - vp9_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - xoffset, yoffset, - &xsum0, &xxsum0); - - vp9_filter_block2d_bil_var_sse2( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - xoffset, yoffset, - &xsum1, &xxsum1); - xsum0 += xsum1; - xxsum0 += xxsum1; - } - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); -} - -unsigned int vp9_sub_pixel_variance8x16_wmt -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum; - unsigned int xxsum; - - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum, &xxsum); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum, &xxsum); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance8x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum, &xxsum); - } else { - vp9_filter_block2d_bil_var_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum, &xxsum); - } - - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 7)); +#define FNS(opt1, opt2) \ +FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ +FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ +FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ +FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ +FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ +FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ +FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \ +FN(16, 8, 16, 4, 3, opt1,); \ +FN(8, 16, 8, 3, 4, opt1,); \ +FN(8, 8, 8, 3, 3, opt1,); \ +FN(8, 4, 8, 3, 2, opt1,); \ +FN(4, 8, 4, 2, 3, opt2,); \ +FN(4, 4, 4, 2, 2, opt2,) + +FNS(sse2, sse); +FNS(ssse3, ssse3); + +#undef FNS +#undef FN + +#define DECL(w, opt) \ +int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \ + ptrdiff_t src_stride, \ + int x_offset, int y_offset, \ + const uint8_t *dst, \ + ptrdiff_t dst_stride, \ + const uint8_t *sec, \ + ptrdiff_t sec_stride, \ + int height, unsigned int *sse) +#define DECLS(opt1, opt2) \ +DECL(4, opt2); \ +DECL(8, opt1); \ +DECL(16, opt1) + +DECLS(sse2, sse); +DECLS(ssse3, ssse3); +#undef DECL +#undef DECLS + +#define FN(w, h, wf, wlog2, hlog2, opt, cast) \ +unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \ + int src_stride, \ + int x_offset, \ + int y_offset, \ + const uint8_t *dst, \ + int dst_stride, \ + unsigned int *sseptr, \ + const uint8_t *sec) { \ + unsigned int sse; \ + int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \ + y_offset, dst, dst_stride, \ + sec, w, h, &sse); \ + if (w > wf) { \ + unsigned int sse2; \ + int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \ + x_offset, y_offset, \ + dst + 16, dst_stride, \ + sec + 16, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + if (w > wf * 2) { \ + se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \ + x_offset, y_offset, \ + dst + 32, dst_stride, \ + sec + 32, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \ + x_offset, y_offset, \ + dst + 48, dst_stride, \ + sec + 48, w, h, &sse2); \ + se += se2; \ + sse += sse2; \ + } \ + } \ + *sseptr = sse; \ + return sse - ((cast se * se) >> (wlog2 + hlog2)); \ } +#define FNS(opt1, opt2) \ +FN(64, 64, 16, 6, 6, opt1, (int64_t)); \ +FN(64, 32, 16, 6, 5, opt1, (int64_t)); \ +FN(32, 64, 16, 5, 6, opt1, (int64_t)); \ +FN(32, 32, 16, 5, 5, opt1, (int64_t)); \ +FN(32, 16, 16, 5, 4, opt1, (int64_t)); \ +FN(16, 32, 16, 4, 5, opt1, (int64_t)); \ +FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \ +FN(16, 8, 16, 4, 3, opt1,); \ +FN(8, 16, 8, 3, 4, opt1,); \ +FN(8, 8, 8, 3, 3, opt1,); \ +FN(8, 4, 8, 3, 2, opt1,); \ +FN(4, 8, 4, 2, 3, opt2,); \ +FN(4, 4, 4, 2, 2, opt2,) + +FNS(sse2, sse); +FNS(ssse3, ssse3); + +#undef FNS +#undef FN unsigned int vp9_variance_halfpixvar16x16_h_wmt( const unsigned char *src_ptr, diff --git a/vp9/encoder/x86/vp9_variance_ssse3.c b/vp9/encoder/x86/vp9_variance_ssse3.c deleted file mode 100644 index 882acad78..000000000 --- a/vp9/encoder/x86/vp9_variance_ssse3.c +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_config.h" -#include "vp9/encoder/vp9_variance.h" -#include "vp9/common/vp9_pragmas.h" -#include "vpx_ports/mem.h" - -#define HALFNDX 8 - -extern void vp9_half_horiz_vert_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -extern void vp9_half_horiz_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -extern void vp9_half_vert_variance16x_h_sse2 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int *sum, - unsigned int *sumsquared -); -extern void vp9_filter_block2d_bil_var_ssse3 -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - int xoffset, - int yoffset, - int *sum, - unsigned int *sumsquared -); - -unsigned int vp9_sub_pixel_variance16x16_ssse3 -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum0; - unsigned int xxsum0; - - // note we could avoid these if statements if the calling function - // just called the appropriate functions inside. - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - &xsum0, &xxsum0); - } else { - vp9_filter_block2d_bil_var_ssse3( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - xoffset, yoffset, - &xsum0, &xxsum0); - } - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); -} - -unsigned int vp9_sub_pixel_variance16x8_ssse3 -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse - -) { - int xsum0; - unsigned int xxsum0; - - if (xoffset == HALFNDX && yoffset == 0) { - vp9_half_horiz_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else if (xoffset == 0 && yoffset == HALFNDX) { - vp9_half_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else if (xoffset == HALFNDX && yoffset == HALFNDX) { - vp9_half_horiz_vert_variance16x_h_sse2( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - &xsum0, &xxsum0); - } else { - vp9_filter_block2d_bil_var_ssse3( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - xoffset, yoffset, - &xsum0, &xxsum0); - } - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); -} diff --git a/vp9/encoder/x86/vp9_x86_csystemdependent.c b/vp9/encoder/x86/vp9_x86_csystemdependent.c deleted file mode 100644 index 6016e14eb..000000000 --- a/vp9/encoder/x86/vp9_x86_csystemdependent.c +++ /dev/null @@ -1,55 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#include "./vpx_config.h" -#include "vpx_ports/x86.h" -#include "vp9/encoder/vp9_variance.h" -#include "vp9/encoder/vp9_onyx_int.h" -#include "vp9/encoder/x86/vp9_dct_mmx.h" - -// TODO(jimbankoski) Consider rewriting the c to take the same values rather -// than going through these pointer conversions -#if 0 && HAVE_MMX -void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) { - vp9_short_fdct4x4_mmx(input, output, pitch); - vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch); -} - -void vp9_subtract_b_mmx_impl(unsigned char *z, int src_stride, - short *diff, unsigned char *predictor, - int pitch); -void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) { - unsigned char *z = *(be->base_src) + be->src; - unsigned int src_stride = be->src_stride; - short *diff = &be->src_diff[0]; - unsigned char *predictor = *(bd->base_dst) + bd->dst; - // TODO(jingning): The prototype function in c has been changed. Need to - // modify the mmx and sse versions. - vp9_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch); -} - -#endif - -#if 0 && HAVE_SSE2 -void vp9_subtract_b_sse2_impl(unsigned char *z, int src_stride, - short *diff, unsigned char *predictor, - int pitch); -void vp9_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) { - unsigned char *z = *(be->base_src) + be->src; - unsigned int src_stride = be->src_stride; - short *diff = &be->src_diff[0]; - unsigned char *predictor = *(bd->base_dst) + bd->dst; - // TODO(jingning): The prototype function in c has been changed. Need to - // modify the mmx and sse versions. - vp9_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch); -} - -#endif diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk index 4bed6c0d7..a1e93753d 100644 --- a/vp9/vp9cx.mk +++ b/vp9/vp9cx.mk @@ -73,27 +73,24 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_mcomp_x86.h -VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_x86_csystemdependent.c VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.h -VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_subtract_mmx.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm +VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm -VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_ssse3.c -VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_impl_ssse3.asm VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm -VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c |