diff options
author | John Koleszar <jkoleszar@google.com> | 2011-04-19 00:05:09 -0400 |
---|---|---|
committer | John Koleszar <jkoleszar@google.com> | 2011-04-19 00:05:09 -0400 |
commit | a5d3febc1377275518a3c8cf6595ff4b7ffa7021 (patch) | |
tree | 91b48040fc317526fc2876aa673551e01c698f91 | |
parent | 0ba3fffc3a3361c4ff37fee13b6c8d88ff80ea06 (diff) | |
parent | 48438d60162cdf0ff16706b243724bc698febf83 (diff) | |
download | libvpx-a5d3febc1377275518a3c8cf6595ff4b7ffa7021.tar libvpx-a5d3febc1377275518a3c8cf6595ff4b7ffa7021.tar.gz libvpx-a5d3febc1377275518a3c8cf6595ff4b7ffa7021.tar.bz2 libvpx-a5d3febc1377275518a3c8cf6595ff4b7ffa7021.zip |
Merge remote branch 'origin/master' into experimental
Change-Id: I920c3ed6af244ef9032b744675d9f664e5878d0e
-rwxr-xr-x | build/make/configure.sh | 2 | ||||
-rw-r--r-- | vp8/common/threading.h | 6 | ||||
-rw-r--r-- | vp8/decoder/onyxd_if.c | 15 | ||||
-rw-r--r-- | vp8/decoder/onyxd_int.h | 6 | ||||
-rw-r--r-- | vp8/encoder/block.h | 2 | ||||
-rw-r--r-- | vp8/encoder/encodeframe.c | 2 | ||||
-rw-r--r-- | vp8/encoder/mcomp.c | 32 | ||||
-rw-r--r-- | vp8/encoder/mcomp.h | 2 | ||||
-rw-r--r-- | vp8/encoder/onyx_int.h | 6 | ||||
-rw-r--r-- | vp8/encoder/pickinter.c | 9 | ||||
-rw-r--r-- | vp8/encoder/quantize.c | 6 | ||||
-rw-r--r-- | vp8/encoder/rdopt.c | 20 | ||||
-rw-r--r-- | vp8/encoder/ssim.c | 66 | ||||
-rw-r--r-- | vp8/encoder/temporal_filter.c | 3 | ||||
-rw-r--r-- | vp8/encoder/x86/quantize_sse2.asm | 2 | ||||
-rw-r--r-- | vp8/encoder/x86/ssim_opt.asm | 37 | ||||
-rw-r--r-- | vpx_ports/x86.h | 4 |
17 files changed, 124 insertions, 96 deletions
diff --git a/build/make/configure.sh b/build/make/configure.sh index 085170d65..3324be36e 100755 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -884,6 +884,8 @@ process_common_toolchain() { link_with_cc=gcc tune_cflags="-march=" setup_gnu_toolchain + #for 32 bit x86 builds, -O3 did not turn on this flag + enabled optimizations && check_add_cflags -fomit-frame-pointer ;; esac diff --git a/vp8/common/threading.h b/vp8/common/threading.h index 44eaf0800..b7542b306 100644 --- a/vp8/common/threading.h +++ b/vp8/common/threading.h @@ -12,8 +12,6 @@ #ifndef _PTHREAD_EMULATION #define _PTHREAD_EMULATION -#define VPXINFINITE 10000 /* 10second. */ - #if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD /* Thread management macros */ @@ -28,7 +26,7 @@ #define pthread_t HANDLE #define pthread_attr_t DWORD #define pthread_create(thhandle,attr,thfunc,tharg) (int)((*thhandle=(HANDLE)_beginthreadex(NULL,0,(unsigned int (__stdcall *)(void *))thfunc,tharg,0,NULL))==NULL) -#define pthread_join(thread, result) ((WaitForSingleObject((thread),VPXINFINITE)!=WAIT_OBJECT_0) || !CloseHandle(thread)) +#define pthread_join(thread, result) ((WaitForSingleObject((thread),INFINITE)!=WAIT_OBJECT_0) || !CloseHandle(thread)) #define pthread_detach(thread) if(thread!=NULL)CloseHandle(thread) #define thread_sleep(nms) Sleep(nms) #define pthread_cancel(thread) terminate_thread(thread,0) @@ -62,7 +60,7 @@ #define sem_t HANDLE #define pause(voidpara) __asm PAUSE #define sem_init(sem, sem_attr1, sem_init_value) (int)((*sem = CreateEvent(NULL,FALSE,FALSE,NULL))==NULL) -#define sem_wait(sem) (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem,VPXINFINITE)) +#define sem_wait(sem) (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem,INFINITE)) #define sem_post(sem) SetEvent(*sem) #define sem_destroy(sem) if(*sem)((int)(CloseHandle(*sem))==TRUE) #define thread_sleep(nms) Sleep(nms) diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c index ef2e00d61..1e83ab542 100644 --- a/vp8/decoder/onyxd_if.c +++ b/vp8/decoder/onyxd_if.c @@ -76,7 +76,6 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf) pbi->common.current_video_frame = 0; pbi->ready_for_new_data = 1; - pbi->CPUFreq = 0; /*vp8_get_processor_freq();*/ #if CONFIG_MULTITHREAD pbi->max_threads = oxcf->max_threads; vp8_decoder_create_threads(pbi); @@ -252,7 +251,6 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign VP8D_COMP *pbi = (VP8D_COMP *) ptr; VP8_COMMON *cm = &pbi->common; int retcode = 0; - struct vpx_usec_timer timer; /*if(pbi->ready_for_new_data == 0) return -1;*/ @@ -317,8 +315,6 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign pbi->common.error.setjmp = 1; - vpx_usec_timer_start(&timer); - /*cm->current_video_frame++;*/ pbi->Source = source; pbi->source_sz = size; @@ -379,15 +375,9 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign if(pbi->common.filter_level) { - struct vpx_usec_timer lpftimer; - vpx_usec_timer_start(&lpftimer); /* Apply the loop filter if appropriate. */ - vp8_loop_filter_frame(cm, &pbi->mb, cm->filter_level); - vpx_usec_timer_mark(&lpftimer); - pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer); - cm->last_frame_type = cm->frame_type; cm->last_filter_type = cm->filter_type; cm->last_sharpness_level = cm->sharpness_level; @@ -398,11 +388,6 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign vp8_clear_system_state(); - vpx_usec_timer_mark(&timer); - pbi->decode_microseconds = vpx_usec_timer_elapsed(&timer); - - pbi->time_decoding += pbi->decode_microseconds; - /*vp8_print_modes_and_motion_vectors( cm->mi, cm->mb_rows,cm->mb_cols, cm->current_video_frame);*/ if (cm->show_frame) diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h index 512f1fc0f..240061eb2 100644 --- a/vp8/decoder/onyxd_int.h +++ b/vp8/decoder/onyxd_int.h @@ -81,12 +81,6 @@ typedef struct VP8Decompressor const unsigned char *Source; unsigned int source_sz; - - unsigned int CPUFreq; - unsigned int decode_microseconds; - unsigned int time_decoding; - unsigned int time_loop_filtering; - #if CONFIG_MULTITHREAD /* variable for threading */ diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h index 5a2568dde..6de4c8517 100644 --- a/vp8/encoder/block.h +++ b/vp8/encoder/block.h @@ -34,7 +34,7 @@ typedef struct // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries short *quant; short *quant_fast; - short *quant_shift; + unsigned char *quant_shift; short *zbin; short *zrun_zbin_boost; short *round; diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index 5f6322bc7..e5d7c674d 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -147,7 +147,7 @@ static const int qzbin_factors_y2[129] = #define EXACT_QUANT #ifdef EXACT_QUANT static void vp8cx_invert_quant(int improved_quant, short *quant, - short *shift, short d) + unsigned char *shift, short d) { if(improved_quant) { diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index eb840d70b..651890d81 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -194,13 +194,13 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) #define DIST(r,c) vfp->svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function. #define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e; #define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost -#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse;}}, v=INT_MAX;)// checks if (r,c) has better score than previous best +#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse; *sse1 = sse; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best #define MIN(x,y) (((x)<(y))?(x):(y)) #define MAX(x,y) (((x)>(y))?(x):(y)) //#define CHECK_BETTER(v,r,c) if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; } -int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion) +int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse1) { unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col; unsigned char *z = (*(b->base_src) + b->src); @@ -226,7 +226,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, bestmv->col <<= 3; // calculate central point error - besterr = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse); + besterr = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1); *distortion = besterr; besterr += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit); @@ -316,7 +316,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, #undef CHECK_BETTER #undef MIN #undef MAX -int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion) +int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse1) { int bestmse = INT_MAX; MV startmv; @@ -345,7 +345,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, startmv = *bestmv; // calculate central point error - bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse); + bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1); *distortion = bestmse; bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit); @@ -360,6 +360,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, *bestmv = this_mv; bestmse = left; *distortion = thismse; + *sse1 = sse; } this_mv.col += 8; @@ -371,6 +372,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, *bestmv = this_mv; bestmse = right; *distortion = thismse; + *sse1 = sse; } // go up then down and check error @@ -384,6 +386,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, *bestmv = this_mv; bestmse = up; *distortion = thismse; + *sse1 = sse; } this_mv.row += 8; @@ -395,6 +398,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, *bestmv = this_mv; bestmse = down; *distortion = thismse; + *sse1 = sse; } @@ -436,6 +440,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, *bestmv = this_mv; bestmse = diag; *distortion = thismse; + *sse1 = sse; } // } @@ -473,6 +478,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, *bestmv = this_mv; bestmse = left; *distortion = thismse; + *sse1 = sse; } this_mv.col += 4; @@ -484,6 +490,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, *bestmv = this_mv; bestmse = right; *distortion = thismse; + *sse1 = sse; } // go up then down and check error @@ -507,6 +514,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, *bestmv = this_mv; bestmse = up; *distortion = thismse; + *sse1 = sse; } this_mv.row += 4; @@ -518,6 +526,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, *bestmv = this_mv; bestmse = down; *distortion = thismse; + *sse1 = sse; } @@ -608,12 +617,13 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, *bestmv = this_mv; bestmse = diag; *distortion = thismse; + *sse1 = sse; } return bestmse; } -int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion) +int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse1) { int bestmse = INT_MAX; MV startmv; @@ -640,7 +650,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm startmv = *bestmv; // calculate central point error - bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse); + bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1); *distortion = bestmse; bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit); @@ -655,6 +665,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm *bestmv = this_mv; bestmse = left; *distortion = thismse; + *sse1 = sse; } this_mv.col += 8; @@ -666,6 +677,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm *bestmv = this_mv; bestmse = right; *distortion = thismse; + *sse1 = sse; } // go up then down and check error @@ -679,6 +691,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm *bestmv = this_mv; bestmse = up; *distortion = thismse; + *sse1 = sse; } this_mv.row += 8; @@ -690,6 +703,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm *bestmv = this_mv; bestmse = down; *distortion = thismse; + *sse1 = sse; } // somewhat strangely not doing all the diagonals for half pel is slower than doing them. @@ -741,6 +755,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm *bestmv = this_mv; bestmse = diag; *distortion = thismse; + *sse1 = sse; } this_mv.col += 8; @@ -752,6 +767,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm *bestmv = this_mv; bestmse = diag; *distortion = thismse; + *sse1 = sse; } this_mv.col = (this_mv.col - 8) | 4; @@ -764,6 +780,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm *bestmv = this_mv; bestmse = diag; *distortion = thismse; + *sse1 = sse; } this_mv.col += 8; @@ -775,6 +792,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm *bestmv = this_mv; bestmse = diag; *distortion = thismse; + *sse1 = sse; } #endif diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h index 72faf8ea6..b14cbcbc8 100644 --- a/vp8/encoder/mcomp.h +++ b/vp8/encoder/mcomp.h @@ -49,7 +49,7 @@ extern int vp8_hex_search typedef int (fractional_mv_step_fp) (MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, - int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion); + int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse); extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively; extern fractional_mv_step_fp vp8_find_best_sub_pixel_step; extern fractional_mv_step_fp vp8_find_best_half_pixel_step; diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 0ab528e27..4c1854244 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -244,17 +244,17 @@ typedef struct { DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, Y1quant_shift[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, Y2quant_shift[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, UVquant_shift[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]); diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index 111cd74ba..c56593e0b 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -50,7 +50,7 @@ extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]); extern void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, MV *mv); -int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion) +int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse) { (void) b; (void) d; @@ -59,6 +59,7 @@ int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, (void) vfp; (void) mvcost; (void) distortion; + (void) sse; bestmv->row <<= 3; bestmv->col <<= 3; return 0; @@ -443,7 +444,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re int bestsme; //int all_rds[MAX_MODES]; // Experimental debug code. int best_mode_index = 0; - int sse = INT_MAX; + unsigned int sse = INT_MAX; MV mvp; int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7}; @@ -796,7 +797,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re } if (bestsme < INT_MAX) - cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost, &distortion2); + cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost, &distortion2, &sse); mode_mv[NEWMV].row = d->bmi.mv.as_mv.row; mode_mv[NEWMV].col = d->bmi.mv.as_mv.col; @@ -827,7 +828,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re x->e_mbd.block[0].bmi.mv.as_int = x->e_mbd.mode_info_context->mbmi.mv.as_int; if((this_mode != NEWMV) || !(have_subp_search)) - distortion2 = get_inter_mbpred_error(x, &cpi->fn_ptr[BLOCK_16X16], (unsigned int *)(&sse)); + distortion2 = get_inter_mbpred_error(x, &cpi->fn_ptr[BLOCK_16X16], &sse); this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c index 803e3a51d..86ed267fb 100644 --- a/vp8/encoder/quantize.c +++ b/vp8/encoder/quantize.c @@ -27,7 +27,7 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d) short *zbin_ptr = b->zbin; short *round_ptr = b->round; short *quant_ptr = b->quant_fast; - short *quant_shift_ptr = b->quant_shift; + unsigned char *quant_shift_ptr = b->quant_shift; short *qcoeff_ptr = d->qcoeff; short *dqcoeff_ptr = d->dqcoeff; short *dequant_ptr = d->dequant; @@ -112,7 +112,7 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d) short *zbin_ptr = b->zbin; short *round_ptr = b->round; short *quant_ptr = b->quant; - short *quant_shift_ptr = b->quant_shift; + unsigned char *quant_shift_ptr = b->quant_shift; short *qcoeff_ptr = d->qcoeff; short *dqcoeff_ptr = d->dqcoeff; short *dequant_ptr = d->dequant; @@ -166,7 +166,7 @@ void vp8_strict_quantize_b(BLOCK *b, BLOCKD *d) int sz; short *coeff_ptr; short *quant_ptr; - short *quant_shift_ptr; + unsigned char *quant_shift_ptr; short *qcoeff_ptr; short *dqcoeff_ptr; short *dequant_ptr; diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index f775fadd9..9ff00c9a6 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -1271,13 +1271,14 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, if (bestsme < INT_MAX) { int distortion; + unsigned int sse; if (!cpi->common.full_pixel) cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4], - bsi->ref_mv, x->errorperbit / 2, v_fn_ptr, x->mvcost, &distortion); + bsi->ref_mv, x->errorperbit / 2, v_fn_ptr, x->mvcost, &distortion, &sse); else vp8_skip_fractional_mv_step(x, c, e, &mode_mv[NEW4X4], - bsi->ref_mv, x->errorperbit, v_fn_ptr, x->mvcost, &distortion); + bsi->ref_mv, x->errorperbit, v_fn_ptr, x->mvcost, &distortion, &sse); } } /* NEW4X4 */ @@ -2255,9 +2256,10 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int x->mv_row_max = tmp_row_max; if (bestsme < INT_MAX) - { - int dis; /* TODO: use dis in distortion calculation later. */ - cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit / 4, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &dis); + { + int dis; /* TODO: use dis in distortion calculation later. */ + unsigned int sse; + cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit / 4, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &dis, &sse); } mode_mv[NEWMV].row = d->bmi.mv.as_mv.row; @@ -2304,7 +2306,8 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int } else if (x->encode_breakout) { - int sum, sse; + int sum; + unsigned int sse; int threshold = (xd->block[0].dequant[1] * xd->block[0].dequant[1] >>4); @@ -2313,7 +2316,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var) (x->src.y_buffer, x->src.y_stride, - x->e_mbd.predictor, 16, (unsigned int *)(&sse), &sum); + x->e_mbd.predictor, 16, &sse, &sum); if (sse < threshold) { @@ -2337,8 +2340,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int distortion_uv = sse2; disable_skip = 1; - this_rd = RDCOST(x->rdmult, x->rddiv, rate2, - distortion2); + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); break; } diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c index 64d67c6dd..c78be37a3 100644 --- a/vp8/encoder/ssim.c +++ b/vp8/encoder/ssim.c @@ -290,8 +290,8 @@ void ssim_parms_8x8_c } } -const static long long c1 = 426148; // (256^2*(.01*255)^2 -const static long long c2 = 3835331; //(256^2*(.03*255)^2 +const static long long cc1 = 26634; // (64^2*(.01*255)^2 +const static long long cc2 = 239708; // (64^2*(.03*255)^2 static double similarity ( @@ -303,10 +303,19 @@ static double similarity int count ) { - long long ssim_n = (2*sum_s*sum_r+ c1)*(2*count*sum_sxr-2*sum_s*sum_r+c2); + long long ssim_n, ssim_d; + long long c1, c2; - long long ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)* - (count*sum_sq_s-sum_s*sum_s + count*sum_sq_r-sum_r*sum_r +c2) ; + //scale the constants by number of pixels + c1 = (cc1*count*count)>>12; + c2 = (cc2*count*count)>>12; + + ssim_n = (2*sum_s*sum_r+ c1)*((long long) 2*count*sum_sxr- + (long long) 2*sum_s*sum_r+c2); + + ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)* + ((long long)count*sum_sq_s-(long long)sum_s*sum_s + + (long long)count*sum_sq_r-(long long) sum_r*sum_r +c2) ; return ssim_n * 1.0 / ssim_d; } @@ -332,18 +341,33 @@ long dssim(unsigned char *s,int sp, unsigned char *r,int rp, const vp8_variance_rtcd_vtable_t *rtcd) { unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0; - double ssim3; - long long ssim_n; - long long ssim_d; + long long ssim3; + long long ssim_n,ssim_n1,ssim_n2; + long long ssim_d,ssim_d1,ssim_d2; + long long ssim_t1,ssim_t2; + long long c1, c2; + + // normalize by 256/64 + c1 = cc1*16; + c2 = cc2*16; rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); - ssim_n = (2*sum_s*sum_r+ c1)*(2*256*sum_sxr-2*sum_s*sum_r+c2); + ssim_n1 = (2*sum_s*sum_r+ c1); - ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)* - (256*sum_sq_s-sum_s*sum_s + 256*sum_sq_r-sum_r*sum_r +c2) ; + ssim_n2 =((long long) 2*256*sum_sxr-(long long) 2*sum_s*sum_r+c2); - ssim3 = 256 * (ssim_d-ssim_n) / ssim_d; - return (long)( 256*ssim3 * ssim3 ); + ssim_d1 =((long long)sum_s*sum_s +(long long)sum_r*sum_r+c1); + + ssim_d2 = (256 * (long long) sum_sq_s-(long long) sum_s*sum_s + + (long long) 256*sum_sq_r-(long long) sum_r*sum_r +c2) ; + + ssim_t1 = 256 - 256 * ssim_n1 / ssim_d1; + ssim_t2 = 256 - 256 * ssim_n2 / ssim_d2; + + ssim3 = 256 *ssim_t1 * ssim_t2; + if(ssim3 <0 ) + ssim3=0; + return (long)( ssim3 ); } // TODO: (jbb) this 8x8 window might be too big + we may want to pick pixels // such that the window regions overlap block boundaries to penalize blocking @@ -361,18 +385,20 @@ double vp8_ssim2 ) { int i,j; - + int samples =0; double ssim_total=0; - // we can sample points as frequently as we like start with 1 per 8x8 - for(i=0; i < height; i+=8, img1 += stride_img1*8, img2 += stride_img2*8) + // we can sample points as frequently as we like start with 1 per 4x4 + for(i=0; i < height-8; i+=4, img1 += stride_img1*4, img2 += stride_img2*4) { - for(j=0; j < width; j+=8 ) + for(j=0; j < width-8; j+=4 ) { - ssim_total += ssim_8x8(img1, stride_img1, img2, stride_img2, rtcd); + double v = ssim_8x8(img1+j, stride_img1, img2+j, stride_img2, rtcd); + ssim_total += v; + samples++; } } - ssim_total /= (width/8 * height /8); + ssim_total /= samples; return ssim_total; } @@ -405,4 +431,4 @@ double vp8_calc_ssim *weight = 1; return ssimv; -} +}
\ No newline at end of file diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c index 5ede33f4a..b77195511 100644 --- a/vp8/encoder/temporal_filter.c +++ b/vp8/encoder/temporal_filter.c @@ -209,10 +209,11 @@ static int vp8_temporal_filter_find_matching_mb_c //if (bestsme > error_thresh && bestsme < INT_MAX) { int distortion; + unsigned int sse; bestsme = cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv1, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], - mvcost, &distortion); + mvcost, &distortion, &sse); } #endif diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm index 5e40dc7de..7b7ae706a 100644 --- a/vp8/encoder/x86/quantize_sse2.asm +++ b/vp8/encoder/x86/quantize_sse2.asm @@ -142,7 +142,7 @@ sym(vp8_regular_quantize_b_sse2): movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2] ; downshift by quant_shift[rc] - movsx ecx, WORD PTR[rax + %1 * 2] ; quant_shift_ptr[rc] + movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc] sar edi, cl ; also sets Z bit je rq_zigzag_loop_%1 ; !y mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc] diff --git a/vp8/encoder/x86/ssim_opt.asm b/vp8/encoder/x86/ssim_opt.asm index c267cdb54..d6cebf33d 100644 --- a/vp8/encoder/x86/ssim_opt.asm +++ b/vp8/encoder/x86/ssim_opt.asm @@ -16,12 +16,12 @@ paddusw xmm14, xmm4 ; sum_r movdqa xmm1, xmm3 pmaddwd xmm1, xmm1 - paddq xmm13, xmm1 ; sum_sq_s + paddd xmm13, xmm1 ; sum_sq_s movdqa xmm2, xmm4 pmaddwd xmm2, xmm2 - paddq xmm12, xmm2 ; sum_sq_r + paddd xmm12, xmm2 ; sum_sq_r pmaddwd xmm3, xmm4 - paddq xmm11, xmm3 ; sum_sxr + paddd xmm11, xmm3 ; sum_sxr %endmacro ; Sum across the register %1 starting with q words @@ -66,6 +66,7 @@ sym(vp8_ssim_parms_16x16_sse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 + SAVE_XMM push rsi push rdi ; end prolog @@ -115,19 +116,20 @@ NextRow: SUM_ACROSS_Q xmm11 mov rdi,arg(4) - movq [rdi], xmm15; + movd [rdi], xmm15; mov rdi,arg(5) - movq [rdi], xmm14; + movd [rdi], xmm14; mov rdi,arg(6) - movq [rdi], xmm13; + movd [rdi], xmm13; mov rdi,arg(7) - movq [rdi], xmm12; + movd [rdi], xmm12; mov rdi,arg(8) - movq [rdi], xmm11; + movd [rdi], xmm11; ; begin epilog pop rdi pop rsi + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -154,6 +156,7 @@ sym(vp8_ssim_parms_8x8_sse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 + SAVE_XMM push rsi push rdi ; end prolog @@ -174,11 +177,8 @@ sym(vp8_ssim_parms_8x8_sse3): NextRow2: ;grab source and reference pixels - movq xmm5, [rsi] - movq xmm6, [rdi] - - movdqa xmm3, xmm5 - movdqa xmm4, xmm6 + movq xmm3, [rsi] + movq xmm4, [rdi] punpcklbw xmm3, xmm0 ; low_s punpcklbw xmm4, xmm0 ; low_r @@ -197,19 +197,20 @@ NextRow2: SUM_ACROSS_Q xmm11 mov rdi,arg(4) - movq [rdi], xmm15; + movd [rdi], xmm15; mov rdi,arg(5) - movq [rdi], xmm14; + movd [rdi], xmm14; mov rdi,arg(6) - movq [rdi], xmm13; + movd [rdi], xmm13; mov rdi,arg(7) - movq [rdi], xmm12; + movd [rdi], xmm12; mov rdi,arg(8) - movq [rdi], xmm11; + movd [rdi], xmm11; ; begin epilog pop rdi pop rsi + RESTORE_XMM UNSHADOW_ARGS pop rbp ret diff --git a/vpx_ports/x86.h b/vpx_ports/x86.h index 190c8643a..b0130fbfe 100644 --- a/vpx_ports/x86.h +++ b/vpx_ports/x86.h @@ -151,8 +151,8 @@ x86_readtsc(void) __asm__ __volatile__ ("pause \n\t") #else #if ARCH_X86_64 -/* No pause intrinsic for windows x64 */ -#define x86_pause_hint() +#define x86_pause_hint()\ + _mm_pause(); #else #define x86_pause_hint()\ __asm pause |