diff options
Diffstat (limited to 'vp8')
84 files changed, 2303 insertions, 2728 deletions
diff --git a/vp8/common/alloccommon.c b/vp8/common/alloccommon.c index edef36094..b0c7363a7 100644 --- a/vp8/common/alloccommon.c +++ b/vp8/common/alloccommon.c @@ -129,32 +129,32 @@ void vp8_setup_version(VP8_COMMON *cm) { case 0: cm->no_lpf = 0; - cm->simpler_lpf = 0; + cm->filter_type = NORMAL_LOOPFILTER; cm->use_bilinear_mc_filter = 0; cm->full_pixel = 0; break; case 1: cm->no_lpf = 0; - cm->simpler_lpf = 1; + cm->filter_type = SIMPLE_LOOPFILTER; cm->use_bilinear_mc_filter = 1; cm->full_pixel = 0; break; case 2: cm->no_lpf = 1; - cm->simpler_lpf = 0; + cm->filter_type = NORMAL_LOOPFILTER; cm->use_bilinear_mc_filter = 1; cm->full_pixel = 0; break; case 3: cm->no_lpf = 1; - cm->simpler_lpf = 1; + cm->filter_type = SIMPLE_LOOPFILTER; cm->use_bilinear_mc_filter = 1; cm->full_pixel = 1; break; default: /*4,5,6,7 are reserved for future use*/ cm->no_lpf = 0; - cm->simpler_lpf = 0; + cm->filter_type = NORMAL_LOOPFILTER; cm->use_bilinear_mc_filter = 0; cm->full_pixel = 0; break; @@ -169,7 +169,7 @@ void vp8_create_common(VP8_COMMON *oci) oci->mb_no_coeff_skip = 1; oci->no_lpf = 0; - oci->simpler_lpf = 0; + oci->filter_type = NORMAL_LOOPFILTER; oci->use_bilinear_mc_filter = 0; oci->full_pixel = 0; oci->multi_token_partition = ONE_PARTITION; diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c index 3532a0356..6d1caa485 100644 --- a/vp8/common/arm/loopfilter_arm.c +++ b/vp8/common/arm/loopfilter_arm.c @@ -38,9 +38,8 @@ extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_neon; /*ARMV6 loopfilter functions*/ /* Horizontal MB filtering */ void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void) simpler_lpf; vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); if (u_ptr) @@ -51,20 +50,18 @@ void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig } void vp8_loop_filter_mbhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { (void) u_ptr; (void) v_ptr; (void) uv_stride; - (void) simpler_lpf; vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); } /* Vertical MB Filtering */ void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void) simpler_lpf; vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); if (u_ptr) @@ -75,20 +72,18 @@ void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig } void vp8_loop_filter_mbvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { (void) u_ptr; (void) v_ptr; (void) uv_stride; - (void) simpler_lpf; vp8_loop_filter_simple_vertical_edge_armv6(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); } /* Horizontal B Filtering */ void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void) simpler_lpf; vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); @@ -101,12 +96,11 @@ void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsign } void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { (void) u_ptr; (void) v_ptr; (void) uv_stride; - (void) simpler_lpf; vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); @@ -114,9 +108,8 @@ void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig /* Vertical B Filtering */ void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void) simpler_lpf; vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); @@ -129,12 +122,11 @@ void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsign } void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { (void) u_ptr; (void) v_ptr; (void) uv_stride; - (void) simpler_lpf; vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); @@ -145,9 +137,8 @@ void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, unsigned char *u_ptr, unsig /* NEON loopfilter functions */ /* Horizontal MB filtering */ void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void) simpler_lpf; vp8_mbloop_filter_horizontal_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); if (u_ptr) @@ -155,20 +146,18 @@ void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsign } void vp8_loop_filter_mbhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { (void) u_ptr; (void) v_ptr; (void) uv_stride; - (void) simpler_lpf; vp8_loop_filter_simple_horizontal_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); } /* Vertical MB Filtering */ void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void) simpler_lpf; vp8_mbloop_filter_vertical_edge_y_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); if (u_ptr) @@ -176,20 +165,18 @@ void vp8_loop_filter_mbv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsign } void vp8_loop_filter_mbvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { (void) u_ptr; (void) v_ptr; (void) uv_stride; - (void) simpler_lpf; vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); } /* Horizontal B Filtering */ void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void) simpler_lpf; vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_horizontal_edge_y_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); @@ -199,12 +186,11 @@ void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigne } void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { (void) u_ptr; (void) v_ptr; (void) uv_stride; - (void) simpler_lpf; vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_simple_horizontal_edge_neon(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); @@ -212,9 +198,8 @@ void vp8_loop_filter_bhs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsign /* Vertical B Filtering */ void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void) simpler_lpf; vp8_loop_filter_vertical_edge_y_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_vertical_edge_y_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_vertical_edge_y_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); @@ -224,12 +209,11 @@ void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigne } void vp8_loop_filter_bvs_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { (void) u_ptr; (void) v_ptr; (void) uv_stride; - (void) simpler_lpf; vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_simple_vertical_edge_neon(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); diff --git a/vp8/common/arm/neon/loopfilter_neon.asm b/vp8/common/arm/neon/loopfilter_neon.asm index d3a79f640..e73dd6401 100644 --- a/vp8/common/arm/neon/loopfilter_neon.asm +++ b/vp8/common/arm/neon/loopfilter_neon.asm @@ -308,7 +308,6 @@ ; q9 q2 ; q10 q3 |vp8_loop_filter_neon| PROC - ldr r12, _lf_coeff_ ; vp8_filter_mask vabd.u8 q11, q3, q4 ; abs(p3 - p2) @@ -339,7 +338,7 @@ vqadd.u8 q9, q9, q2 ; a = b + a vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1 - vld1.u8 {q0}, [r12]! + vmov.u8 q0, #0x80 ; 0x80 ; vp8_filter() function ; convert to signed @@ -348,7 +347,7 @@ veor q5, q5, q0 ; ps1 veor q8, q8, q0 ; qs1 - vld1.u8 {q10}, [r12]! + vmov.u8 q10, #3 ; #3 vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) vsubl.s8 q11, d15, d13 @@ -367,7 +366,7 @@ vaddw.s8 q2, q2, d2 vaddw.s8 q11, q11, d3 - vld1.u8 {q9}, [r12]! + vmov.u8 q9, #4 ; #4 ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) vqmovn.s16 d2, q2 @@ -399,12 +398,4 @@ ;----------------- -_lf_coeff_ - DCD lf_coeff -lf_coeff - DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 - DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 - DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 - DCD 0x01010101, 0x01010101, 0x01010101, 0x01010101 - END diff --git a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm index 5fe7e7e6d..7c5ea3644 100644 --- a/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm +++ b/vp8/common/arm/neon/loopfiltersimplehorizontaledge_neon.asm @@ -22,20 +22,19 @@ ; r1 int p, //pitch ; r2 const signed char *flimit, ; r3 const signed char *limit, -; stack(r4) const signed char *thresh, +; stack(r4) const signed char *thresh (unused) ; //stack(r5) int count --unused |vp8_loop_filter_simple_horizontal_edge_neon| PROC sub r0, r0, r1, lsl #1 ; move src pointer down by 2 lines - ldr r12, _lfhy_coeff_ vld1.u8 {q5}, [r0], r1 ; p1 vld1.s8 {d2[], d3[]}, [r2] ; flimit vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13 vld1.u8 {q6}, [r0], r1 ; p0 - vld1.u8 {q0}, [r12]! ; 0x80 + vmov.u8 q0, #0x80 ; 0x80 vld1.u8 {q7}, [r0], r1 ; q0 - vld1.u8 {q10}, [r12]! ; 0x03 + vmov.u8 q10, #0x03 ; 0x03 vld1.u8 {q8}, [r0] ; q1 ;vp8_filter_mask() function @@ -66,7 +65,7 @@ vadd.s16 q11, q2, q2 ; 3 * ( qs0 - ps0) vadd.s16 q12, q3, q3 - vld1.u8 {q9}, [r12]! ; 0x04 + vmov.u8 q9, #0x04 ; 0x04 vadd.s16 q2, q2, q11 vadd.s16 q3, q3, q12 @@ -105,11 +104,4 @@ ;----------------- -_lfhy_coeff_ - DCD lfhy_coeff -lfhy_coeff - DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 - DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 - DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 - END diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm index c30378b9c..a7f7b690e 100644 --- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm +++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm @@ -22,7 +22,7 @@ ; r1 int p, //pitch ; r2 const signed char *flimit, ; r3 const signed char *limit, -; stack(r4) const signed char *thresh, +; stack(r4) const signed char *thresh (unused) ; //stack(r5) int count --unused |vp8_loop_filter_simple_vertical_edge_neon| PROC @@ -32,7 +32,6 @@ vld1.s8 {d2[], d3[]}, [r2] ; flimit vld1.s8 {d26[], d27[]}, [r3] ; limit -> q13 vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r0], r1 - ldr r12, _vlfy_coeff_ vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r1 vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r0], r1 vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r1 @@ -41,11 +40,11 @@ vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r0], r1 vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 - vld1.u8 {q0}, [r12]! ; 0x80 + vmov.u8 q0, #0x80 ; 0x80 vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 - vld1.u8 {q11}, [r12]! ; 0x03 + vmov.u8 q11, #0x03 ; 0x03 vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 - vld1.u8 {q12}, [r12]! ; 0x04 + vmov.u8 q12, #0x04 ; 0x04 vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 @@ -146,11 +145,4 @@ ;----------------- -_vlfy_coeff_ - DCD vlfy_coeff -vlfy_coeff - DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 - DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 - DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 - END diff --git a/vp8/common/arm/neon/mbloopfilter_neon.asm b/vp8/common/arm/neon/mbloopfilter_neon.asm index 981adffd1..72f0f9271 100644 --- a/vp8/common/arm/neon/mbloopfilter_neon.asm +++ b/vp8/common/arm/neon/mbloopfilter_neon.asm @@ -372,7 +372,6 @@ ; q10 q3 |vp8_mbloop_filter_neon| PROC - ldr r12, _mblf_coeff_ ; vp8_filter_mask vabd.u8 q11, q3, q4 ; abs(p3 - p2) @@ -396,7 +395,7 @@ vld1.s8 {d4[], d5[]}, [r2] ; flimit - vld1.u8 {q0}, [r12]! + vmov.u8 q0, #0x80 ; 0x80 vadd.u8 q2, q2, q2 ; flimit * 2 vadd.u8 q2, q2, q1 ; flimit * 2 + limit @@ -431,12 +430,12 @@ vadd.s16 q2, q2, q10 vadd.s16 q13, q13, q11 - vld1.u8 {q12}, [r12]! ; #3 + vmov.u8 q12, #3 ; #3 vaddw.s8 q2, q2, d2 ; vp8_filter + 3 * ( qs0 - ps0) vaddw.s8 q13, q13, d3 - vld1.u8 {q11}, [r12]! ; #4 + vmov.u8 q11, #4 ; #4 ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) vqmovn.s16 d2, q2 @@ -444,16 +443,16 @@ vand q1, q1, q15 ; vp8_filter &= mask - vld1.u8 {q15}, [r12]! ; #63 - ; + vmov.u16 q15, #63 ; #63 + vand q13, q1, q14 ; Filter2 &= hev - vld1.u8 {d7}, [r12]! ; #9 + vmov.u8 d7, #9 ; #9 vqadd.s8 q2, q13, q11 ; Filter1 = clamp(Filter2+4) vqadd.s8 q13, q13, q12 ; Filter2 = clamp(Filter2+3) - vld1.u8 {d6}, [r12]! ; #18 + vmov.u8 d6, #18 ; #18 vshr.s8 q2, q2, #3 ; Filter1 >>= 3 vshr.s8 q13, q13, #3 ; Filter2 >>= 3 @@ -463,7 +462,7 @@ vqsub.s8 q7, q7, q2 ; qs0 = clamp(qs0 - Filter1) - vld1.u8 {d5}, [r12]! ; #27 + vmov.u8 d5, #27 ; #27 vqadd.s8 q6, q6, q13 ; ps0 = clamp(ps0 + Filter2) @@ -507,14 +506,4 @@ ;----------------- -_mblf_coeff_ - DCD mblf_coeff -mblf_coeff - DCD 0x80808080, 0x80808080, 0x80808080, 0x80808080 - DCD 0x03030303, 0x03030303, 0x03030303, 0x03030303 - DCD 0x04040404, 0x04040404, 0x04040404, 0x04040404 - DCD 0x003f003f, 0x003f003f, 0x003f003f, 0x003f003f - DCD 0x09090909, 0x09090909, 0x12121212, 0x12121212 - DCD 0x1b1b1b1b, 0x1b1b1b1b - END diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index fc8e0722c..aef692744 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -169,12 +169,8 @@ typedef struct unsigned char partitioning; unsigned char mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */ - unsigned char dc_diff; unsigned char need_to_clamp_mvs; - unsigned char segment_id; /* Which set of segmentation parameters should be used for this MB */ - - unsigned char force_no_skip; /* encoder only */ } MB_MODE_INFO; diff --git a/vp8/common/extend.c b/vp8/common/extend.c index 47207fa79..036bafc5d 100644 --- a/vp8/common/extend.c +++ b/vp8/common/extend.c @@ -13,10 +13,12 @@ #include "vpx_mem/vpx_mem.h" -static void extend_plane_borders +static void copy_and_extend_plane ( unsigned char *s, /* source */ - int sp, /* pitch */ + int sp, /* source pitch */ + unsigned char *d, /* destination */ + int dp, /* destination pitch */ int h, /* height */ int w, /* width */ int et, /* extend top border */ @@ -25,7 +27,6 @@ static void extend_plane_borders int er /* extend right border */ ) { - int i; unsigned char *src_ptr1, *src_ptr2; unsigned char *dest_ptr1, *dest_ptr2; @@ -34,68 +35,73 @@ static void extend_plane_borders /* copy the left and right most columns out */ src_ptr1 = s; src_ptr2 = s + w - 1; - dest_ptr1 = s - el; - dest_ptr2 = s + w; + dest_ptr1 = d - el; + dest_ptr2 = d + w; - for (i = 0; i < h - 0 + 1; i++) + for (i = 0; i < h; i++) { - /* Some linkers will complain if we call vpx_memset with el set to a - * constant 0. - */ - if (el) - vpx_memset(dest_ptr1, src_ptr1[0], el); + vpx_memset(dest_ptr1, src_ptr1[0], el); + vpx_memcpy(dest_ptr1 + el, src_ptr1, w); vpx_memset(dest_ptr2, src_ptr2[0], er); src_ptr1 += sp; src_ptr2 += sp; - dest_ptr1 += sp; - dest_ptr2 += sp; + dest_ptr1 += dp; + dest_ptr2 += dp; } - /* Now copy the top and bottom source lines into each line of the respective borders */ - src_ptr1 = s - el; - src_ptr2 = s + sp * (h - 1) - el; - dest_ptr1 = s + sp * (-et) - el; - dest_ptr2 = s + sp * (h) - el; - linesize = el + er + w + 1; + /* Now copy the top and bottom lines into each line of the respective + * borders + */ + src_ptr1 = d - el; + src_ptr2 = d + dp * (h - 1) - el; + dest_ptr1 = d + dp * (-et) - el; + dest_ptr2 = d + dp * (h) - el; + linesize = el + er + w; - for (i = 0; i < (int)et; i++) + for (i = 0; i < et; i++) { vpx_memcpy(dest_ptr1, src_ptr1, linesize); - dest_ptr1 += sp; + dest_ptr1 += dp; } - for (i = 0; i < (int)eb; i++) + for (i = 0; i < eb; i++) { vpx_memcpy(dest_ptr2, src_ptr2, linesize); - dest_ptr2 += sp; + dest_ptr2 += dp; } } -void vp8_extend_to_multiple_of16(YV12_BUFFER_CONFIG *ybf, int width, int height) +void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst) { - int er = 0xf & (16 - (width & 0xf)); - int eb = 0xf & (16 - (height & 0xf)); - - /* check for non multiples of 16 */ - if (er != 0 || eb != 0) - { - extend_plane_borders(ybf->y_buffer, ybf->y_stride, height, width, 0, 0, eb, er); - - /* adjust for uv */ - height = (height + 1) >> 1; - width = (width + 1) >> 1; - er = 0x7 & (8 - (width & 0x7)); - eb = 0x7 & (8 - (height & 0x7)); - - if (er || eb) - { - extend_plane_borders(ybf->u_buffer, ybf->uv_stride, height, width, 0, 0, eb, er); - extend_plane_borders(ybf->v_buffer, ybf->uv_stride, height, width, 0, 0, eb, er); - } - } + int et = dst->border; + int el = dst->border; + int eb = dst->border + dst->y_height - src->y_height; + int er = dst->border + dst->y_width - src->y_width; + + copy_and_extend_plane(src->y_buffer, src->y_stride, + dst->y_buffer, dst->y_stride, + src->y_height, src->y_width, + et, el, eb, er); + + et = (et + 1) >> 1; + el = (el + 1) >> 1; + eb = (eb + 1) >> 1; + er = (er + 1) >> 1; + + copy_and_extend_plane(src->u_buffer, src->uv_stride, + dst->u_buffer, dst->uv_stride, + src->uv_height, src->uv_width, + et, el, eb, er); + + copy_and_extend_plane(src->v_buffer, src->uv_stride, + dst->v_buffer, dst->uv_stride, + src->uv_height, src->uv_width, + et, el, eb, er); } + /* note the extension is only for the last row, for intra prediction purpose */ void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr) { diff --git a/vp8/common/extend.h b/vp8/common/extend.h index fd0a608e5..9e0be4e06 100644 --- a/vp8/common/extend.h +++ b/vp8/common/extend.h @@ -14,8 +14,8 @@ #include "vpx_scale/yv12config.h" -void Extend(YV12_BUFFER_CONFIG *ybf); void vp8_extend_mb_row(YV12_BUFFER_CONFIG *ybf, unsigned char *YPtr, unsigned char *UPtr, unsigned char *VPtr); -void vp8_extend_to_multiple_of16(YV12_BUFFER_CONFIG *ybf, int width, int height); +void vp8_copy_and_extend_frame(YV12_BUFFER_CONFIG *src, + YV12_BUFFER_CONFIG *dst); #endif diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c index 5c6464772..d981f3496 100644 --- a/vp8/common/generic/systemdependent.c +++ b/vp8/common/generic/systemdependent.c @@ -17,9 +17,54 @@ #include "vp8/common/idct.h" #include "vp8/common/onyxc_int.h" +#if CONFIG_MULTITHREAD +#if HAVE_UNISTD_H +#include <unistd.h> +#elif defined(_WIN32) +#include <windows.h> +typedef void (WINAPI *PGNSI)(LPSYSTEM_INFO); +#endif +#endif + extern void vp8_arch_x86_common_init(VP8_COMMON *ctx); extern void vp8_arch_arm_common_init(VP8_COMMON *ctx); +#if CONFIG_MULTITHREAD +static int get_cpu_count() +{ + int core_count = 16; + +#if HAVE_UNISTD_H +#if defined(_SC_NPROCESSORS_ONLN) + core_count = sysconf(_SC_NPROCESSORS_ONLN); +#elif defined(_SC_NPROC_ONLN) + core_count = sysconf(_SC_NPROC_ONLN); +#endif +#elif defined(_WIN32) + { + PGNSI pGNSI; + SYSTEM_INFO sysinfo; + + /* Call GetNativeSystemInfo if supported or + * GetSystemInfo otherwise. */ + + pGNSI = (PGNSI) GetProcAddress( + GetModuleHandle(TEXT("kernel32.dll")), "GetNativeSystemInfo"); + if (pGNSI != NULL) + pGNSI(&sysinfo); + else + GetSystemInfo(&sysinfo); + + core_count = sysinfo.dwNumberOfProcessors; + } +#else + /* other platforms */ +#endif + + return core_count > 0 ? core_count : 1; +} +#endif + void vp8_machine_specific_config(VP8_COMMON *ctx) { #if CONFIG_RUNTIME_CPU_DETECT @@ -43,6 +88,12 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) vp8_build_intra_predictors_mby; rtcd->recon.build_intra_predictors_mby_s = vp8_build_intra_predictors_mby_s; + rtcd->recon.build_intra_predictors_mbuv = + vp8_build_intra_predictors_mbuv; + rtcd->recon.build_intra_predictors_mbuv_s = + vp8_build_intra_predictors_mbuv_s; + rtcd->recon.intra4x4_predict = + vp8_intra4x4_predict; rtcd->subpix.sixtap16x16 = vp8_sixtap_predict16x16_c; rtcd->subpix.sixtap8x8 = vp8_sixtap_predict8x8_c; @@ -82,4 +133,7 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) vp8_arch_arm_common_init(ctx); #endif +#if CONFIG_MULTITHREAD + ctx->processor_core_count = get_cpu_count(); +#endif /* CONFIG_MULTITHREAD */ } diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c index 37c5b7740..a3242716f 100644 --- a/vp8/common/loopfilter.c +++ b/vp8/common/loopfilter.c @@ -25,9 +25,8 @@ prototype_loopfilter(vp8_loop_filter_simple_vertical_edge_c); /* Horizontal MB filtering */ void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void) simpler_lpf; vp8_mbloop_filter_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); if (u_ptr) @@ -38,20 +37,18 @@ void vp8_loop_filter_mbh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned } void vp8_loop_filter_mbhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { (void) u_ptr; (void) v_ptr; (void) uv_stride; - (void) simpler_lpf; vp8_loop_filter_simple_horizontal_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); } /* Vertical MB Filtering */ void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void) simpler_lpf; vp8_mbloop_filter_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); if (u_ptr) @@ -62,20 +59,18 @@ void vp8_loop_filter_mbv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned } void vp8_loop_filter_mbvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { (void) u_ptr; (void) v_ptr; (void) uv_stride; - (void) simpler_lpf; vp8_loop_filter_simple_vertical_edge_c(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); } /* Horizontal B Filtering */ void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void) simpler_lpf; vp8_loop_filter_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); @@ -88,12 +83,11 @@ void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned c } void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { (void) u_ptr; (void) v_ptr; (void) uv_stride; - (void) simpler_lpf; vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_simple_horizontal_edge_c(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); @@ -101,9 +95,8 @@ void vp8_loop_filter_bhs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned /* Vertical B Filtering */ void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void) simpler_lpf; vp8_loop_filter_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); @@ -116,12 +109,11 @@ void vp8_loop_filter_bv_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned c } void vp8_loop_filter_bvs_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { (void) u_ptr; (void) v_ptr; (void) uv_stride; - (void) simpler_lpf; vp8_loop_filter_simple_vertical_edge_c(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_simple_vertical_edge_c(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_simple_vertical_edge_c(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); @@ -353,6 +345,9 @@ void vp8_loop_filter_frame for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0; + int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED && + mbd->mode_info_context->mbmi.mode != SPLITMV && + mbd->mode_info_context->mbmi.mb_skip_coeff); filter_level = baseline_filter_level[Segment]; @@ -365,17 +360,17 @@ void vp8_loop_filter_frame if (filter_level) { if (mb_col > 0) - cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf); + cm->lf_mbv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]); - if (mbd->mode_info_context->mbmi.dc_diff > 0) - cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf); + if (!skip_lf) + cm->lf_bv(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]); /* don't apply across umv border */ if (mb_row > 0) - cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf); + cm->lf_mbh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]); - if (mbd->mode_info_context->mbmi.dc_diff > 0) - cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level], cm->simpler_lpf); + if (!skip_lf) + cm->lf_bh(y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi[filter_level]); } y_ptr += 16; @@ -457,6 +452,10 @@ void vp8_loop_filter_frame_yonly for (mb_col = 0; mb_col < cm->mb_cols; mb_col++) { int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0; + int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED && + mbd->mode_info_context->mbmi.mode != SPLITMV && + mbd->mode_info_context->mbmi.mb_skip_coeff); + filter_level = baseline_filter_level[Segment]; /* Apply any context driven MB level adjustment */ @@ -465,17 +464,17 @@ void vp8_loop_filter_frame_yonly if (filter_level) { if (mb_col > 0) - cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0); + cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); - if (mbd->mode_info_context->mbmi.dc_diff > 0) - cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0); + if (!skip_lf) + cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); /* don't apply across umv border */ if (mb_row > 0) - cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0); + cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); - if (mbd->mode_info_context->mbmi.dc_diff > 0) - cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0); + if (!skip_lf) + cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); } y_ptr += 16; @@ -565,20 +564,24 @@ void vp8_loop_filter_partial_frame for (mb_col = 0; mb_col < mb_cols; mb_col++) { int Segment = (alt_flt_enabled) ? mbd->mode_info_context->mbmi.segment_id : 0; + int skip_lf = (mbd->mode_info_context->mbmi.mode != B_PRED && + mbd->mode_info_context->mbmi.mode != SPLITMV && + mbd->mode_info_context->mbmi.mb_skip_coeff); + filter_level = baseline_filter_level[Segment]; if (filter_level) { if (mb_col > 0) - cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0); + cm->lf_mbv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); - if (mbd->mode_info_context->mbmi.dc_diff > 0) - cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0); + if (!skip_lf) + cm->lf_bv(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); - cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0); + cm->lf_mbh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); - if (mbd->mode_info_context->mbmi.dc_diff > 0) - cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level], 0); + if (!skip_lf) + cm->lf_bh(y_ptr, 0, 0, post->y_stride, 0, &lfi[filter_level]); } y_ptr += 16; diff --git a/vp8/common/loopfilter.h b/vp8/common/loopfilter.h index 2e5997c73..ca136b3a4 100644 --- a/vp8/common/loopfilter.h +++ b/vp8/common/loopfilter.h @@ -41,7 +41,7 @@ typedef struct #define prototype_loopfilter_block(sym) \ void sym(unsigned char *y, unsigned char *u, unsigned char *v,\ - int ystride, int uv_stride, loop_filter_info *lfi, int simpler) + int ystride, int uv_stride, loop_filter_info *lfi) #if ARCH_X86 || ARCH_X86_64 #include "x86/loopfilter_x86.h" diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h index 426b8fc2b..a05951933 100644 --- a/vp8/common/onyx.h +++ b/vp8/common/onyx.h @@ -109,6 +109,7 @@ extern "C" int noise_sensitivity; // parameter used for applying pre processing blur: recommendation 0 int Sharpness; // parameter used for sharpening output: recommendation 0: int cpu_used; + unsigned int rc_max_intra_bitrate_pct; // mode -> //(0)=Realtime/Live Encoding. This mode is optimized for realtim encoding (for example, capturing @@ -139,8 +140,9 @@ extern "C" int end_usage; // vbr or cbr - // shoot to keep buffer full at all times by undershooting a bit 95 recommended + // buffer targeting aggressiveness int under_shoot_pct; + int over_shoot_pct; // buffering parameters int starting_buffer_level; // in seconds diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h index c8c227787..cf29d03df 100644 --- a/vp8/common/onyxc_int.h +++ b/vp8/common/onyxc_int.h @@ -119,7 +119,6 @@ typedef struct VP8Common /* profile settings */ int mb_no_coeff_skip; int no_lpf; - int simpler_lpf; int use_bilinear_mc_filter; int full_pixel; @@ -196,6 +195,9 @@ typedef struct VP8Common #if CONFIG_RUNTIME_CPU_DETECT VP8_COMMON_RTCD rtcd; #endif +#if CONFIG_MULTITHREAD + int processor_core_count; +#endif struct postproc_state postproc_state; } VP8_COMMON; diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c index 5bfc7d6fb..7dbe96649 100644 --- a/vp8/common/postproc.c +++ b/vp8/common/postproc.c @@ -804,11 +804,14 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t for (j = 0; j < mb_cols; j++) { char zz[4]; + int dc_diff = !(mi[mb_index].mbmi.mode != B_PRED && + mi[mb_index].mbmi.mode != SPLITMV && + mi[mb_index].mbmi.mb_skip_coeff); if (oci->frame_type == KEY_FRAME) sprintf(zz, "a"); else - sprintf(zz, "%c", mi[mb_index].mbmi.dc_diff + '0'); + sprintf(zz, "%c", dc_diff + '0'); vp8_blit_text(zz, y_ptr, post->y_stride); mb_index ++; diff --git a/vp8/common/ppc/loopfilter_altivec.c b/vp8/common/ppc/loopfilter_altivec.c index e602feedc..71bf6e2d7 100644 --- a/vp8/common/ppc/loopfilter_altivec.c +++ b/vp8/common/ppc/loopfilter_altivec.c @@ -53,9 +53,8 @@ loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc; // Horizontal MB filtering void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void)simpler_lpf; mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr); if (u_ptr) @@ -63,9 +62,8 @@ void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned ch } void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void)simpler_lpf; (void)u_ptr; (void)v_ptr; (void)uv_stride; @@ -74,9 +72,8 @@ void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned c // Vertical MB Filtering void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void)simpler_lpf; mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr); if (u_ptr) @@ -84,9 +81,8 @@ void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned ch } void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void)simpler_lpf; (void)u_ptr; (void)v_ptr; (void)uv_stride; @@ -95,9 +91,8 @@ void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned c // Horizontal B Filtering void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void)simpler_lpf; // These should all be done at once with one call, instead of 3 loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr); @@ -108,9 +103,8 @@ void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned cha } void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void)simpler_lpf; (void)u_ptr; (void)v_ptr; (void)uv_stride; @@ -121,9 +115,8 @@ void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned ch // Vertical B Filtering void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void)simpler_lpf; loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr); if (u_ptr) @@ -131,9 +124,8 @@ void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned cha } void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void)simpler_lpf; (void)u_ptr; (void)v_ptr; (void)uv_stride; diff --git a/vp8/common/recon.h b/vp8/common/recon.h index e608f218c..7cfc779cd 100644 --- a/vp8/common/recon.h +++ b/vp8/common/recon.h @@ -26,6 +26,9 @@ #define prototype_build_intra_predictors(sym) \ void sym(MACROBLOCKD *x) +#define prototype_intra4x4_predict(sym) \ + void sym(BLOCKD *x, int b_mode, unsigned char *predictor) + struct vp8_recon_rtcd_vtable; #if ARCH_X86 || ARCH_X86_64 @@ -88,11 +91,30 @@ extern prototype_build_intra_predictors\ extern prototype_build_intra_predictors\ (vp8_recon_build_intra_predictors_mby_s); +#ifndef vp8_recon_build_intra_predictors_mbuv +#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv +#endif +extern prototype_build_intra_predictors\ + (vp8_recon_build_intra_predictors_mbuv); + +#ifndef vp8_recon_build_intra_predictors_mbuv_s +#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s +#endif +extern prototype_build_intra_predictors\ + (vp8_recon_build_intra_predictors_mbuv_s); + +#ifndef vp8_recon_intra4x4_predict +#define vp8_recon_intra4x4_predict vp8_intra4x4_predict +#endif +extern prototype_intra4x4_predict\ + (vp8_recon_intra4x4_predict); + typedef prototype_copy_block((*vp8_copy_block_fn_t)); typedef prototype_recon_block((*vp8_recon_fn_t)); typedef prototype_recon_macroblock((*vp8_recon_mb_fn_t)); typedef prototype_build_intra_predictors((*vp8_build_intra_pred_fn_t)); +typedef prototype_intra4x4_predict((*vp8_intra4x4_pred_fn_t)); typedef struct vp8_recon_rtcd_vtable { vp8_copy_block_fn_t copy16x16; @@ -105,6 +127,9 @@ typedef struct vp8_recon_rtcd_vtable vp8_recon_mb_fn_t recon_mby; vp8_build_intra_pred_fn_t build_intra_predictors_mby_s; vp8_build_intra_pred_fn_t build_intra_predictors_mby; + vp8_build_intra_pred_fn_t build_intra_predictors_mbuv_s; + vp8_build_intra_pred_fn_t build_intra_predictors_mbuv; + vp8_intra4x4_pred_fn_t intra4x4_predict; } vp8_recon_rtcd_vtable_t; #if CONFIG_RUNTIME_CPU_DETECT diff --git a/vp8/common/reconinter.c b/vp8/common/reconinter.c index 7cfab4140..3b0405ca1 100644 --- a/vp8/common/reconinter.c +++ b/vp8/common/reconinter.c @@ -207,12 +207,12 @@ static void build_inter_predictors2b(MACROBLOCKD *x, BLOCKD *d, int pitch) } +/*encoder only*/ void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x) { int i; - if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME && - x->mode_info_context->mbmi.mode != SPLITMV) + if (x->mode_info_context->mbmi.mode != SPLITMV) { unsigned char *uptr, *vptr; unsigned char *upred_ptr = &x->predictor[256]; @@ -257,159 +257,133 @@ void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x) } /*encoder only*/ -void vp8_build_inter_predictors_mby(MACROBLOCKD *x) +void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x) { + unsigned char *ptr_base; + unsigned char *ptr; + unsigned char *pred_ptr = x->predictor; + int mv_row = x->mode_info_context->mbmi.mv.as_mv.row; + int mv_col = x->mode_info_context->mbmi.mv.as_mv.col; + int pre_stride = x->block[0].pre_stride; - if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME && - x->mode_info_context->mbmi.mode != SPLITMV) - { - unsigned char *ptr_base; - unsigned char *ptr; - unsigned char *pred_ptr = x->predictor; - int mv_row = x->mode_info_context->mbmi.mv.as_mv.row; - int mv_col = x->mode_info_context->mbmi.mv.as_mv.col; - int pre_stride = x->block[0].pre_stride; - - ptr_base = x->pre.y_buffer; - ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3); + ptr_base = x->pre.y_buffer; + ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3); - if ((mv_row | mv_col) & 7) - { - x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16); - } - else - { - RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16); - } + if ((mv_row | mv_col) & 7) + { + x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16); } else { - int i; - - if (x->mode_info_context->mbmi.partitioning < 3) - { - for (i = 0; i < 4; i++) - { - BLOCKD *d = &x->block[bbb[i]]; - build_inter_predictors4b(x, d, 16); - } - - } - else - { - for (i = 0; i < 16; i += 2) - { - BLOCKD *d0 = &x->block[i]; - BLOCKD *d1 = &x->block[i+1]; - - if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) - build_inter_predictors2b(x, d0, 16); - else - { - vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict); - vp8_build_inter_predictors_b(d1, 16, x->subpixel_predict); - } - - } - } + RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16); } } -void vp8_build_inter_predictors_mb(MACROBLOCKD *x) +void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, + unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_ystride, + int dst_uvstride) { + int offset; + unsigned char *ptr; + unsigned char *uptr, *vptr; - if (x->mode_info_context->mbmi.ref_frame != INTRA_FRAME && - x->mode_info_context->mbmi.mode != SPLITMV) - { - int offset; - unsigned char *ptr_base; - unsigned char *ptr; - unsigned char *uptr, *vptr; - unsigned char *pred_ptr = x->predictor; - unsigned char *upred_ptr = &x->predictor[256]; - unsigned char *vpred_ptr = &x->predictor[320]; + int mv_row = x->mode_info_context->mbmi.mv.as_mv.row; + int mv_col = x->mode_info_context->mbmi.mv.as_mv.col; - int mv_row = x->mode_info_context->mbmi.mv.as_mv.row; - int mv_col = x->mode_info_context->mbmi.mv.as_mv.col; - int pre_stride = x->block[0].pre_stride; + unsigned char *ptr_base = x->pre.y_buffer; + int pre_stride = x->block[0].pre_stride; - ptr_base = x->pre.y_buffer; - ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3); + ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3); - if ((mv_row | mv_col) & 7) - { - x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, pred_ptr, 16); - } - else - { - RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, pred_ptr, 16); - } + if ((mv_row | mv_col) & 7) + { + x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_y, dst_ystride); + } + else + { + RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_y, dst_ystride); + } - mv_row = x->block[16].bmi.mv.as_mv.row; - mv_col = x->block[16].bmi.mv.as_mv.col; - pre_stride >>= 1; - offset = (mv_row >> 3) * pre_stride + (mv_col >> 3); - uptr = x->pre.u_buffer + offset; - vptr = x->pre.v_buffer + offset; + mv_row = x->block[16].bmi.mv.as_mv.row; + mv_col = x->block[16].bmi.mv.as_mv.col; + pre_stride >>= 1; + offset = (mv_row >> 3) * pre_stride + (mv_col >> 3); + uptr = x->pre.u_buffer + offset; + vptr = x->pre.v_buffer + offset; - if ((mv_row | mv_col) & 7) - { - x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, upred_ptr, 8); - x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vpred_ptr, 8); - } - else - { - RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, upred_ptr, 8); - RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vpred_ptr, 8); - } + if ((mv_row | mv_col) & 7) + { + x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, dst_u, dst_uvstride); + x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, dst_v, dst_uvstride); } else { - int i; - - if (x->mode_info_context->mbmi.partitioning < 3) - { - for (i = 0; i < 4; i++) - { - BLOCKD *d = &x->block[bbb[i]]; - build_inter_predictors4b(x, d, 16); - } - } - else - { - for (i = 0; i < 16; i += 2) - { - BLOCKD *d0 = &x->block[i]; - BLOCKD *d1 = &x->block[i+1]; + RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, dst_u, dst_uvstride); + RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, dst_v, dst_uvstride); + } - if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) - build_inter_predictors2b(x, d0, 16); - else - { - vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict); - vp8_build_inter_predictors_b(d1, 16, x->subpixel_predict); - } +} - } +void vp8_build_inter4x4_predictors_mb(MACROBLOCKD *x) +{ + int i; + if (x->mode_info_context->mbmi.partitioning < 3) + { + for (i = 0; i < 4; i++) + { + BLOCKD *d = &x->block[bbb[i]]; + build_inter_predictors4b(x, d, 16); } - - for (i = 16; i < 24; i += 2) + } + else + { + for (i = 0; i < 16; i += 2) { BLOCKD *d0 = &x->block[i]; BLOCKD *d1 = &x->block[i+1]; if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) - build_inter_predictors2b(x, d0, 8); + build_inter_predictors2b(x, d0, 16); else { - vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict); - vp8_build_inter_predictors_b(d1, 8, x->subpixel_predict); + vp8_build_inter_predictors_b(d0, 16, x->subpixel_predict); + vp8_build_inter_predictors_b(d1, 16, x->subpixel_predict); } } } + + for (i = 16; i < 24; i += 2) + { + BLOCKD *d0 = &x->block[i]; + BLOCKD *d1 = &x->block[i+1]; + + if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) + build_inter_predictors2b(x, d0, 8); + else + { + vp8_build_inter_predictors_b(d0, 8, x->subpixel_predict); + vp8_build_inter_predictors_b(d1, 8, x->subpixel_predict); + } + } +} + +void vp8_build_inter_predictors_mb(MACROBLOCKD *x) +{ + if (x->mode_info_context->mbmi.mode != SPLITMV) + { + vp8_build_inter16x16_predictors_mb(x, x->predictor, &x->predictor[256], + &x->predictor[320], 16, 8); + } + else + { + vp8_build_inter4x4_predictors_mb(x); + } } void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel) @@ -492,202 +466,5 @@ void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel) } -/* The following functions are wriiten for skip_recon_mb() to call. Since there is no recon in this - * situation, we can write the result directly to dst buffer instead of writing it to predictor - * buffer and then copying it to dst buffer. - */ -static void vp8_build_inter_predictors_b_s(BLOCKD *d, unsigned char *dst_ptr, vp8_subpix_fn_t sppf) -{ - int r; - unsigned char *ptr_base; - unsigned char *ptr; - /*unsigned char *pred_ptr = d->predictor;*/ - int dst_stride = d->dst_stride; - int pre_stride = d->pre_stride; - - ptr_base = *(d->base_pre); - - if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) - { - ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); - sppf(ptr, pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, dst_stride); - } - else - { - ptr_base += d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); - ptr = ptr_base; - - for (r = 0; r < 4; r++) - { -#ifdef MUST_BE_ALIGNED - dst_ptr[0] = ptr[0]; - dst_ptr[1] = ptr[1]; - dst_ptr[2] = ptr[2]; - dst_ptr[3] = ptr[3]; -#else - *(int *)dst_ptr = *(int *)ptr ; -#endif - dst_ptr += dst_stride; - ptr += pre_stride; - } - } -} - - - -void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x) -{ - /*unsigned char *pred_ptr = x->block[0].predictor; - unsigned char *dst_ptr = *(x->block[0].base_dst) + x->block[0].dst;*/ - unsigned char *pred_ptr = x->predictor; - unsigned char *dst_ptr = x->dst.y_buffer; - - if (x->mode_info_context->mbmi.mode != SPLITMV) - { - int offset; - unsigned char *ptr_base; - unsigned char *ptr; - unsigned char *uptr, *vptr; - /*unsigned char *pred_ptr = x->predictor; - unsigned char *upred_ptr = &x->predictor[256]; - unsigned char *vpred_ptr = &x->predictor[320];*/ - unsigned char *udst_ptr = x->dst.u_buffer; - unsigned char *vdst_ptr = x->dst.v_buffer; - - int mv_row = x->mode_info_context->mbmi.mv.as_mv.row; - int mv_col = x->mode_info_context->mbmi.mv.as_mv.col; - int pre_stride = x->dst.y_stride; /*x->block[0].pre_stride;*/ - - ptr_base = x->pre.y_buffer; - ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3); - - if ((mv_row | mv_col) & 7) - { - x->subpixel_predict16x16(ptr, pre_stride, mv_col & 7, mv_row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/ - } - else - { - RECON_INVOKE(&x->rtcd->recon, copy16x16)(ptr, pre_stride, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/ - } - mv_row = x->block[16].bmi.mv.as_mv.row; - mv_col = x->block[16].bmi.mv.as_mv.col; - pre_stride >>= 1; - offset = (mv_row >> 3) * pre_stride + (mv_col >> 3); - uptr = x->pre.u_buffer + offset; - vptr = x->pre.v_buffer + offset; - if ((mv_row | mv_col) & 7) - { - x->subpixel_predict8x8(uptr, pre_stride, mv_col & 7, mv_row & 7, udst_ptr, x->dst.uv_stride); - x->subpixel_predict8x8(vptr, pre_stride, mv_col & 7, mv_row & 7, vdst_ptr, x->dst.uv_stride); - } - else - { - RECON_INVOKE(&x->rtcd->recon, copy8x8)(uptr, pre_stride, udst_ptr, x->dst.uv_stride); - RECON_INVOKE(&x->rtcd->recon, copy8x8)(vptr, pre_stride, vdst_ptr, x->dst.uv_stride); - } - } - else - { - /* note: this whole ELSE part is not executed at all. So, no way to test the correctness of my modification. Later, - * if sth is wrong, go back to what it is in build_inter_predictors_mb. - */ - int i; - - if (x->mode_info_context->mbmi.partitioning < 3) - { - for (i = 0; i < 4; i++) - { - BLOCKD *d = &x->block[bbb[i]]; - /*build_inter_predictors4b(x, d, 16);*/ - - { - unsigned char *ptr_base; - unsigned char *ptr; - unsigned char *pred_ptr = d->predictor; - - ptr_base = *(d->base_pre); - ptr = ptr_base + d->pre + (d->bmi.mv.as_mv.row >> 3) * d->pre_stride + (d->bmi.mv.as_mv.col >> 3); - - if (d->bmi.mv.as_mv.row & 7 || d->bmi.mv.as_mv.col & 7) - { - x->subpixel_predict8x8(ptr, d->pre_stride, d->bmi.mv.as_mv.col & 7, d->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/ - } - else - { - RECON_INVOKE(&x->rtcd->recon, copy8x8)(ptr, d->pre_stride, dst_ptr, x->dst.y_stride); /*x->block[0].dst_stride);*/ - } - } - } - } - else - { - for (i = 0; i < 16; i += 2) - { - BLOCKD *d0 = &x->block[i]; - BLOCKD *d1 = &x->block[i+1]; - - if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) - { - /*build_inter_predictors2b(x, d0, 16);*/ - unsigned char *ptr_base; - unsigned char *ptr; - unsigned char *pred_ptr = d0->predictor; - - ptr_base = *(d0->base_pre); - ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3); - - if (d0->bmi.mv.as_mv.row & 7 || d0->bmi.mv.as_mv.col & 7) - { - x->subpixel_predict8x4(ptr, d0->pre_stride, d0->bmi.mv.as_mv.col & 7, d0->bmi.mv.as_mv.row & 7, dst_ptr, x->dst.y_stride); - } - else - { - RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, d0->pre_stride, dst_ptr, x->dst.y_stride); - } - } - else - { - vp8_build_inter_predictors_b_s(d0, dst_ptr, x->subpixel_predict); - vp8_build_inter_predictors_b_s(d1, dst_ptr, x->subpixel_predict); - } - } - } - - for (i = 16; i < 24; i += 2) - { - BLOCKD *d0 = &x->block[i]; - BLOCKD *d1 = &x->block[i+1]; - - if (d0->bmi.mv.as_int == d1->bmi.mv.as_int) - { - /*build_inter_predictors2b(x, d0, 8);*/ - unsigned char *ptr_base; - unsigned char *ptr; - unsigned char *pred_ptr = d0->predictor; - - ptr_base = *(d0->base_pre); - ptr = ptr_base + d0->pre + (d0->bmi.mv.as_mv.row >> 3) * d0->pre_stride + (d0->bmi.mv.as_mv.col >> 3); - - if (d0->bmi.mv.as_mv.row & 7 || d0->bmi.mv.as_mv.col & 7) - { - x->subpixel_predict8x4(ptr, d0->pre_stride, - d0->bmi.mv.as_mv.col & 7, - d0->bmi.mv.as_mv.row & 7, - dst_ptr, x->dst.uv_stride); - } - else - { - RECON_INVOKE(&x->rtcd->recon, copy8x4)(ptr, - d0->pre_stride, dst_ptr, x->dst.uv_stride); - } - } - else - { - vp8_build_inter_predictors_b_s(d0, dst_ptr, x->subpixel_predict); - vp8_build_inter_predictors_b_s(d1, dst_ptr, x->subpixel_predict); - } - } - } -} diff --git a/vp8/common/reconinter.h b/vp8/common/reconinter.h index 7c1dee431..a68e4aaba 100644 --- a/vp8/common/reconinter.h +++ b/vp8/common/reconinter.h @@ -13,9 +13,15 @@ #define __INC_RECONINTER_H extern void vp8_build_inter_predictors_mb(MACROBLOCKD *x); -extern void vp8_build_inter_predictors_mb_s(MACROBLOCKD *x); +extern void vp8_build_inter16x16_predictors_mb(MACROBLOCKD *x, + unsigned char *dst_y, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_ystride, + int dst_uvstride); -extern void vp8_build_inter_predictors_mby(MACROBLOCKD *x); + +extern void vp8_build_inter16x16_predictors_mby(MACROBLOCKD *x); extern void vp8_build_uvmvs(MACROBLOCKD *x, int fullpixel); extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch, vp8_subpix_fn_t sppf); extern void vp8_build_inter_predictors_mbuv(MACROBLOCKD *x); diff --git a/vp8/common/reconintra.h b/vp8/common/reconintra.h index 4025a5307..47e479285 100644 --- a/vp8/common/reconintra.h +++ b/vp8/common/reconintra.h @@ -14,9 +14,4 @@ extern void init_intra_left_above_pixels(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mbuv(MACROBLOCKD *x); -extern void vp8_build_intra_predictors_mbuv_s(MACROBLOCKD *x); - -extern void vp8_predict_intra4x4(BLOCKD *x, int b_mode, unsigned char *Predictor); - #endif diff --git a/vp8/common/reconintra4x4.c b/vp8/common/reconintra4x4.c index cd70dca73..18c514541 100644 --- a/vp8/common/reconintra4x4.c +++ b/vp8/common/reconintra4x4.c @@ -14,7 +14,7 @@ #include "vpx_mem/vpx_mem.h" #include "reconintra.h" -void vp8_predict_intra4x4(BLOCKD *x, +void vp8_intra4x4_predict(BLOCKD *x, int b_mode, unsigned char *predictor) { diff --git a/vp8/common/threading.h b/vp8/common/threading.h index 44eaf0800..5927cb165 100644 --- a/vp8/common/threading.h +++ b/vp8/common/threading.h @@ -12,8 +12,6 @@ #ifndef _PTHREAD_EMULATION #define _PTHREAD_EMULATION -#define VPXINFINITE 10000 /* 10second. */ - #if CONFIG_OS_SUPPORT && CONFIG_MULTITHREAD /* Thread management macros */ @@ -28,7 +26,7 @@ #define pthread_t HANDLE #define pthread_attr_t DWORD #define pthread_create(thhandle,attr,thfunc,tharg) (int)((*thhandle=(HANDLE)_beginthreadex(NULL,0,(unsigned int (__stdcall *)(void *))thfunc,tharg,0,NULL))==NULL) -#define pthread_join(thread, result) ((WaitForSingleObject((thread),VPXINFINITE)!=WAIT_OBJECT_0) || !CloseHandle(thread)) +#define pthread_join(thread, result) ((WaitForSingleObject((thread),INFINITE)!=WAIT_OBJECT_0) || !CloseHandle(thread)) #define pthread_detach(thread) if(thread!=NULL)CloseHandle(thread) #define thread_sleep(nms) Sleep(nms) #define pthread_cancel(thread) terminate_thread(thread,0) @@ -61,9 +59,9 @@ #ifdef _WIN32 #define sem_t HANDLE #define pause(voidpara) __asm PAUSE -#define sem_init(sem, sem_attr1, sem_init_value) (int)((*sem = CreateEvent(NULL,FALSE,FALSE,NULL))==NULL) -#define sem_wait(sem) (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem,VPXINFINITE)) -#define sem_post(sem) SetEvent(*sem) +#define sem_init(sem, sem_attr1, sem_init_value) (int)((*sem = CreateSemaphore(NULL,0,32768,NULL))==NULL) +#define sem_wait(sem) (int)(WAIT_OBJECT_0 != WaitForSingleObject(*sem,INFINITE)) +#define sem_post(sem) ReleaseSemaphore(*sem,1,NULL) #define sem_destroy(sem) if(*sem)((int)(CloseHandle(*sem))==TRUE) #define thread_sleep(nms) Sleep(nms) diff --git a/vp8/common/x86/boolcoder.cxx b/vp8/common/x86/boolcoder.cxx deleted file mode 100644 index faddf1f42..000000000 --- a/vp8/common/x86/boolcoder.cxx +++ /dev/null @@ -1,494 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - - -/* Arithmetic bool coder with largish probability range. - Timothy S Murphy 6 August 2004 */ - -#include <assert.h> -#include <math.h> - -#include "bool_coder.h" - -#if tim_vp8 - extern "C" { -# include "VP8cx/treewriter.h" - } -#endif - -int_types::~int_types() {} - -void bool_coder_spec::check_prec() const { - assert( w && (r==Up || w > 1) && w < 24 && (ebias || w < 17)); -} - -bool bool_coder_spec::float_init( uint Ebits, uint Mbits) { - uint b = (ebits = Ebits) + (mbits = Mbits); - if( b) { - assert( ebits < 6 && w + mbits < 31); - assert( ebits + mbits < sizeof(Index) * 8); - ebias = (1 << ebits) + 1 + mbits; - mmask = (1 << mbits) - 1; - max_index = ( ( half_index = 1 << b ) << 1) - 1; - } else { - ebias = 0; - max_index = 255; - half_index = 128; - } - check_prec(); - return b? 1:0; -} - -void bool_coder_spec::cost_init() -{ - static cdouble c = -(1 << 20)/log( 2.); - - FILE *f = fopen( "costs.txt", "w"); - assert( f); - - assert( sizeof(int) >= 4); /* for C interface */ - assert( max_index <= 255); /* size of Ctbl */ - uint i = 0; do { - cdouble p = ( *this)( (Index) i); - Ctbl[i] = (uint32) ( log( p) * c); - fprintf( - f, "cost( %d -> %10.7f) = %10d = %12.5f bits\n", - i, p, Ctbl[i], (double) Ctbl[i] / (1<<20) - ); - } while( ++i <= max_index); - fclose( f); -} - -bool_coder_spec_explicit_table::bool_coder_spec_explicit_table( - cuint16 tbl[256], Rounding rr, uint prec -) - : bool_coder_spec( prec, rr) -{ - check_prec(); - uint i = 0; - if( tbl) - do { Ptbl[i] = tbl[i];} while( ++i < 256); - else - do { Ptbl[i] = i << 8;} while( ++i < 256); - cost_init(); -} - - -bool_coder_spec_exponential_table::bool_coder_spec_exponential_table( - uint x, Rounding rr, uint prec -) - : bool_coder_spec( prec, rr) -{ - assert( x > 1 && x <= 16); - check_prec(); - Ptbl[128] = 32768u; - Ptbl[0] = (uint16) pow( 2., 16. - x); - --x; - int i=1; do { - cdouble d = pow( .5, 1. + (1. - i/128.)*x) * 65536.; - uint16 v = (uint16) d; - if( v < i) - v = i; - Ptbl[256-i] = (uint16) ( 65536U - (Ptbl[i] = v)); - } while( ++i < 128); - cost_init(); -} - -bool_coder_spec::bool_coder_spec( FILE *fp) { - fscanf( fp, "%d", &w); - int v; - fscanf( fp, "%d", &v); - assert( 0 <= v && v <= 2); - r = (Rounding) v; - fscanf( fp, "%d", &ebits); - fscanf( fp, "%d", &mbits); - if( float_init( ebits, mbits)) - return; - int i=0; do { - uint v; - fscanf( fp, "%d", &v); - assert( 0 <=v && v <= 65535U); - Ptbl[i] = v; - } while( ++i < 256); - cost_init(); -} - -void bool_coder_spec::dump( FILE *fp) const { - fprintf( fp, "%d %d %d %d\n", w, (int) r, ebits, mbits); - if( ebits || mbits) - return; - int i=0; do { fprintf( fp, "%d\n", Ptbl[i]);} while( ++i < 256); -} - -vp8bc_index_t bool_coder_spec::operator()( double p) const -{ - if( p <= 0.) - return 0; - if( p >= 1.) - return max_index; - if( ebias) { - if( p > .5) - return max_index - ( *this)( 1. - p); - int e; - uint m = (uint) ldexp( frexp( p, &e), mbits + 2); - uint x = 1 << (mbits + 1); - assert( x <= m && m < x<<1); - if( (m = (m >> 1) + (m & 1)) >= x) { - m = x >> 1; - ++e; - } - int y = 1 << ebits; - if( (e += y) >= y) - return half_index - 1; - if( e < 0) - return 0; - return (Index) ( (e << mbits) + (m & mmask)); - } - - cuint16 v = (uint16) (p * 65536.); - int i = 128; - int j = 128; - uint16 w; - while( w = Ptbl[i], j >>= 1) { - if( w < v) - i += j; - else if( w == v) - return (uchar) i; - else - i -= j; - } - if( w > v) { - cuint16 x = Ptbl[i-1]; - if( v <= x || w - v > v - x) - --i; - } else if( w < v && i < 255) { - cuint16 x = Ptbl[i+1]; - if( x <= v || x - v < v - w) - ++i; - } - return (Index) i; -} - -double bool_coder_spec::operator()( Index i) const { - if( !ebias) - return Ptbl[i]/65536.; - if( i >= half_index) - return 1. - ( *this)( (Index) (max_index - i)); - return ldexp( (double)mantissa( i), - (int) exponent( i)); -} - - - -void bool_writer::carry() { - uchar *p = B; - assert( p > Bstart); - while( *--p == 255) { assert( p > Bstart); *p = 0;} - ++*p; -} - - -bool_writer::bool_writer( c_spec& s, uchar *Dest, size_t Len) - : bool_coder( s), - Bstart( Dest), - Bend( Len? Dest+Len : 0), - B( Dest) -{ - assert( Dest); - reset(); -} - -bool_writer::~bool_writer() { flush();} - -#if 1 - extern "C" { int bc_v = 0;} -#else -# define bc_v 0 -#endif - - -void bool_writer::raw( bool value, uint32 s) { - uint32 L = Low; - - assert( Range >= min_range && Range <= spec.max_range()); - assert( !is_toast && s && s < Range); - - if( bc_v) printf( - "Writing a %d, B %x Low %x Range %x s %x blag %d ...\n", - value? 1:0, B-Bstart, Low, Range, s, bit_lag - ); - if( value) { - L += s; - s = Range - s; - } else - s -= rinc; - if( s < min_range) { - int ct = bit_lag; do { - if( !--ct) { - ct = 8; - if( L & (1 << 31)) - carry(); - assert( !Bend || B < Bend); - *B++ = (uchar) (L >> 23); - L &= (1<<23) - 1; - } - } while( L += L, (s += s + rinc) < min_range); - bit_lag = ct; - } - Low = L; - Range = s; - if( bc_v) - printf( - "...done, B %x Low %x Range %x blag %d \n", - B-Bstart, Low, Range, bit_lag - ); -} - -bool_writer& bool_writer::flush() { - if( is_toast) - return *this; - int b = bit_lag; - uint32 L = Low; - assert( b); - if( L & (1 << (32 - b))) - carry(); - L <<= b & 7; - b >>= 3; - while( --b >= 0) - L <<= 8; - b = 4; - assert( !Bend || B + 4 <= Bend); - do { - *B++ = (uchar) (L >> 24); - L <<= 8; - } while( --b); - is_toast = 1; - return *this; -} - - -bool_reader::bool_reader( c_spec& s, cuchar *src, size_t Len) - : bool_coder( s), - Bstart( src), - B( src), - Bend( Len? src+Len : 0), - shf( 32 - s.w), - bct( 8) -{ - int i = 4; do { Low <<= 8; Low |= *B++;} while( --i); -} - - -bool bool_reader::raw( uint32 s) { - - bool val = 0; - uint32 L = Low; - cuint32 S = s << shf; - - assert( Range >= min_range && Range <= spec.max_range()); - assert( s && s < Range && (L >> shf) < Range); - - if( bc_v) - printf( - "Reading, B %x Low %x Range %x s %x bct %d ...\n", - B-Bstart, Low, Range, s, bct - ); - - if( L >= S) { - L -= S; - s = Range - s; - assert( L < (s << shf)); - val = 1; - } else - s -= rinc; - if( s < min_range) { - int ct = bct; - do { - assert( ~L & (1 << 31)); - L += L; - if( !--ct) { - ct = 8; - if( !Bend || B < Bend) - L |= *B++; - } - } while( (s += s + rinc) < min_range); - bct = ct; - } - Low = L; - Range = s; - if( bc_v) - printf( - "...done, val %d B %x Low %x Range %x bct %d\n", - val? 1:0, B-Bstart, Low, Range, bct - ); - return val; -} - - -/* C interfaces */ - -// spec interface - -struct NS : bool_coder_namespace { - static Rounding r( vp8bc_c_prec *p, Rounding rr =down_full) { - return p? (Rounding) p->r : rr; - } -}; - -bool_coder_spec *vp8bc_vp6spec() { - return new bool_coder_spec_explicit_table( 0, bool_coder_namespace::Down, 8); -} -bool_coder_spec *vp8bc_float_spec( - unsigned int Ebits, unsigned int Mbits, vp8bc_c_prec *p -) { - return new bool_coder_spec_float( Ebits, Mbits, NS::r( p), p? p->prec : 12); -} -bool_coder_spec *vp8bc_literal_spec( - const unsigned short m[256], vp8bc_c_prec *p -) { - return new bool_coder_spec_explicit_table( m, NS::r( p), p? p->prec : 16); -} -bool_coder_spec *vp8bc_exponential_spec( unsigned int x, vp8bc_c_prec *p) -{ - return new bool_coder_spec_exponential_table( x, NS::r( p), p? p->prec : 16); -} -bool_coder_spec *vp8bc_spec_from_file( FILE *fp) { - return new bool_coder_spec( fp); -} -void vp8bc_destroy_spec( c_bool_coder_spec *p) { delete p;} - -void vp8bc_spec_to_file( c_bool_coder_spec *p, FILE *fp) { p->dump( fp);} - -vp8bc_index_t vp8bc_index( c_bool_coder_spec *p, double x) { - return ( *p)( x); -} - -vp8bc_index_t vp8bc_index_from_counts( - c_bool_coder_spec *p, unsigned int L, unsigned int R -) { - return ( *p)( (R += L)? (double) L/R : .5); -} - -double vp8bc_probability( c_bool_coder_spec *p, vp8bc_index_t i) { - return ( *p)( i); -} - -vp8bc_index_t vp8bc_complement( c_bool_coder_spec *p, vp8bc_index_t i) { - return p->complement( i); -} -unsigned int vp8bc_cost_zero( c_bool_coder_spec *p, vp8bc_index_t i) { - return p->cost_zero( i); -} -unsigned int vp8bc_cost_one( c_bool_coder_spec *p, vp8bc_index_t i) { - return p->cost_one( i); -} -unsigned int vp8bc_cost_bit( c_bool_coder_spec *p, vp8bc_index_t i, int v) { - return p->cost_bit( i, v); -} - -#if tim_vp8 - extern "C" int tok_verbose; - -# define dbg_l 1000000 - - static vp8bc_index_t dbg_i [dbg_l]; - static char dbg_v [dbg_l]; - static size_t dbg_w = 0, dbg_r = 0; -#endif - -// writer interface - -bool_writer *vp8bc_create_writer( - c_bool_coder_spec *p, unsigned char *D, size_t L -) { - return new bool_writer( *p, D, L); -} - -size_t vp8bc_destroy_writer( bool_writer *p) { - const size_t s = p->flush().bytes_written(); - delete p; - return s; -} - -void vp8bc_write_bool( bool_writer *p, int v, vp8bc_index_t i) -{ -# if tim_vp8 - // bc_v = dbg_w < 10; - if( bc_v = tok_verbose) - printf( " writing %d at prob %d\n", v? 1:0, i); - accum_entropy_bc( &p->Spec(), i, v); - - ( *p)( i, (bool) v); - - if( dbg_w < dbg_l) { - dbg_i [dbg_w] = i; - dbg_v [dbg_w++] = v? 1:0; - } -# else - ( *p)( i, (bool) v); -# endif -} - -void vp8bc_write_bits( bool_writer *p, unsigned int v, int n) -{ -# if tim_vp8 - { - c_bool_coder_spec * const s = & p->Spec(); - const vp8bc_index_t i = s->half_index(); - int m = n; - while( --m >= 0) - accum_entropy_bc( s, i, (v>>m) & 1); - } -# endif - - p->write_bits( n, v); -} - -c_bool_coder_spec *vp8bc_writer_spec( c_bool_writer *w) { return & w->Spec();} - -// reader interface - -bool_reader *vp8bc_create_reader( - c_bool_coder_spec *p, const unsigned char *S, size_t L -) { - return new bool_reader( *p, S, L); -} - -void vp8bc_destroy_reader( bool_reader * p) { delete p;} - -int vp8bc_read_bool( bool_reader *p, vp8bc_index_t i) -{ -# if tim_vp8 - // bc_v = dbg_r < 10; - bc_v = tok_verbose; - const int v = ( *p)( i)? 1:0; - if( tok_verbose) - printf( " reading %d at prob %d\n", v, i); - if( dbg_r < dbg_l) { - assert( dbg_r <= dbg_w); - if( i != dbg_i[dbg_r] || v != dbg_v[dbg_r]) { - printf( - "Position %d: INCORRECTLY READING %d prob %d, wrote %d prob %d\n", - dbg_r, v, i, dbg_v[dbg_r], dbg_i[dbg_r] - ); - } - ++dbg_r; - } - return v; -# else - return ( *p)( i)? 1:0; -# endif -} - -unsigned int vp8bc_read_bits( bool_reader *p, int n) { return p->read_bits( n);} - -c_bool_coder_spec *vp8bc_reader_spec( c_bool_reader *r) { return & r->Spec();} - -#undef bc_v diff --git a/vp8/common/x86/idctllm_mmx.asm b/vp8/common/x86/idctllm_mmx.asm index 43735bc4b..465626b8f 100644 --- a/vp8/common/x86/idctllm_mmx.asm +++ b/vp8/common/x86/idctllm_mmx.asm @@ -14,18 +14,18 @@ ; /**************************************************************************** ; * Notes: ; * -; * This implementation makes use of 16 bit fixed point verio of two multiply +; * This implementation makes use of 16 bit fixed point version of two multiply ; * constants: ; * 1. sqrt(2) * cos (pi/8) -; * 2. sqrt(2) * sin (pi/8) -; * Becuase the first constant is bigger than 1, to maintain the same 16 bit -; * fixed point prrcision as the second one, we use a trick of +; * 2. sqrt(2) * sin (pi/8) +; * Because the first constant is bigger than 1, to maintain the same 16 bit +; * fixed point precision as the second one, we use a trick of ; * x * a = x + x*(a-1) ; * so ; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). ; * -; * For the second constant, becuase of the 16bit version is 35468, which -; * is bigger than 32768, in signed 16 bit multiply, it become a negative +; * For the second constant, because of the 16bit version is 35468, which +; * is bigger than 32768, in signed 16 bit multiply, it becomes a negative ; * number. ; * (x * (unsigned)35468 >> 16) = x * (signed)35468 >> 16 + x ; * diff --git a/vp8/common/x86/idctllm_sse2.asm b/vp8/common/x86/idctllm_sse2.asm index edee1578e..34a7e18ae 100644 --- a/vp8/common/x86/idctllm_sse2.asm +++ b/vp8/common/x86/idctllm_sse2.asm @@ -32,9 +32,6 @@ sym(idct_dequant_0_2x_sse2): mov rdx, arg(1) ; dequant mov rax, arg(0) ; qcoeff - ; Zero out xmm7, for use unpacking - pxor xmm7, xmm7 - movd xmm4, [rax] movd xmm5, [rdx] @@ -43,9 +40,12 @@ sym(idct_dequant_0_2x_sse2): pmullw xmm4, xmm5 + ; Zero out xmm5, for use unpacking + pxor xmm5, xmm5 + ; clear coeffs - movd [rax], xmm7 - movd [rax+32], xmm7 + movd [rax], xmm5 + movd [rax+32], xmm5 ;pshufb pshuflw xmm4, xmm4, 00000000b pshufhw xmm4, xmm4, 00000000b @@ -62,10 +62,10 @@ sym(idct_dequant_0_2x_sse2): lea rcx, [3*rcx] movq xmm3, [rax+rcx] - punpcklbw xmm0, xmm7 - punpcklbw xmm1, xmm7 - punpcklbw xmm2, xmm7 - punpcklbw xmm3, xmm7 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 mov rax, arg(3) ; dst movsxd rdx, dword ptr arg(4) ; dst_stride @@ -77,10 +77,10 @@ sym(idct_dequant_0_2x_sse2): paddw xmm3, xmm4 ; pack up before storing - packuswb xmm0, xmm7 - packuswb xmm1, xmm7 - packuswb xmm2, xmm7 - packuswb xmm3, xmm7 + packuswb xmm0, xmm5 + packuswb xmm1, xmm5 + packuswb xmm2, xmm5 + packuswb xmm3, xmm5 ; store blocks back out movq [rax], xmm0 @@ -102,6 +102,7 @@ sym(idct_dequant_full_2x_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -347,6 +348,7 @@ sym(idct_dequant_full_2x_sse2): pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -377,8 +379,8 @@ sym(idct_dequant_dc_0_2x_sse2): mov rdi, arg(3) ; dst mov rdx, arg(5) ; dc - ; Zero out xmm7, for use unpacking - pxor xmm7, xmm7 + ; Zero out xmm5, for use unpacking + pxor xmm5, xmm5 ; load up 2 dc words here == 2*16 = doubleword movd xmm4, [rdx] @@ -398,10 +400,10 @@ sym(idct_dequant_dc_0_2x_sse2): psraw xmm4, 3 ; Predict buffer needs to be expanded from bytes to words - punpcklbw xmm0, xmm7 - punpcklbw xmm1, xmm7 - punpcklbw xmm2, xmm7 - punpcklbw xmm3, xmm7 + punpcklbw xmm0, xmm5 + punpcklbw xmm1, xmm5 + punpcklbw xmm2, xmm5 + punpcklbw xmm3, xmm5 ; Add to predict buffer paddw xmm0, xmm4 @@ -410,10 +412,10 @@ sym(idct_dequant_dc_0_2x_sse2): paddw xmm3, xmm4 ; pack up before storing - packuswb xmm0, xmm7 - packuswb xmm1, xmm7 - packuswb xmm2, xmm7 - packuswb xmm3, xmm7 + packuswb xmm0, xmm5 + packuswb xmm1, xmm5 + packuswb xmm2, xmm5 + packuswb xmm3, xmm5 ; Load destination stride before writing out, ; doesn't need to persist @@ -441,6 +443,7 @@ sym(idct_dequant_dc_full_2x_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -692,6 +695,7 @@ sym(idct_dequant_dc_full_2x_sse2): pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret diff --git a/vp8/common/x86/iwalsh_sse2.asm b/vp8/common/x86/iwalsh_sse2.asm index 83c97df7d..1da4fd8da 100644 --- a/vp8/common/x86/iwalsh_sse2.asm +++ b/vp8/common/x86/iwalsh_sse2.asm @@ -17,7 +17,7 @@ sym(vp8_short_inv_walsh4x4_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 2 - SAVE_XMM + SAVE_XMM 6 push rsi push rdi ; end prolog @@ -41,7 +41,7 @@ sym(vp8_short_inv_walsh4x4_sse2): movdqa xmm4, xmm0 punpcklqdq xmm0, xmm3 ;d1 a1 punpckhqdq xmm4, xmm3 ;c1 b1 - movd xmm7, eax + movd xmm6, eax movdqa xmm1, xmm4 ;c1 b1 paddw xmm4, xmm0 ;dl+cl a1+b1 aka op[4] op[0] @@ -66,7 +66,7 @@ sym(vp8_short_inv_walsh4x4_sse2): pshufd xmm2, xmm1, 4eh ;ip[8] ip[12] movdqa xmm3, xmm4 ;ip[4] ip[0] - pshufd xmm7, xmm7, 0 ;03 03 03 03 03 03 03 03 + pshufd xmm6, xmm6, 0 ;03 03 03 03 03 03 03 03 paddw xmm4, xmm2 ;ip[4]+ip[8] ip[0]+ip[12] aka b1 a1 psubw xmm3, xmm2 ;ip[4]-ip[8] ip[0]-ip[12] aka c1 d1 @@ -90,8 +90,8 @@ sym(vp8_short_inv_walsh4x4_sse2): punpcklwd xmm5, xmm0 ; 31 21 11 01 30 20 10 00 punpckhwd xmm1, xmm0 ; 33 23 13 03 32 22 12 02 ;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - paddw xmm5, xmm7 - paddw xmm1, xmm7 + paddw xmm5, xmm6 + paddw xmm1, xmm6 psraw xmm5, 3 psraw xmm1, 3 diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm index 849133dc4..c2ce1a106 100644 --- a/vp8/common/x86/loopfilter_sse2.asm +++ b/vp8/common/x86/loopfilter_sse2.asm @@ -288,7 +288,7 @@ sym(vp8_loop_filter_horizontal_edge_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -338,7 +338,7 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -584,7 +584,7 @@ sym(vp8_mbloop_filter_horizontal_edge_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -634,7 +634,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1024,7 +1024,7 @@ sym(vp8_loop_filter_vertical_edge_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1091,7 +1091,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1249,7 +1249,7 @@ sym(vp8_mbloop_filter_vertical_edge_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1318,7 +1318,7 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1386,7 +1386,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1503,7 +1503,7 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2): push rbp ; save old base pointer value. mov rbp, rsp ; set new base pointer value. SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx ; save callee-saved reg push rsi push rdi diff --git a/vp8/common/x86/loopfilter_x86.c b/vp8/common/x86/loopfilter_x86.c index 5837bc0dc..a52420c98 100644 --- a/vp8/common/x86/loopfilter_x86.c +++ b/vp8/common/x86/loopfilter_x86.c @@ -42,9 +42,8 @@ extern loop_filter_uvfunction vp8_mbloop_filter_vertical_edge_uv_sse2; #if HAVE_MMX /* Horizontal MB filtering */ void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void) simpler_lpf; vp8_mbloop_filter_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); if (u_ptr) @@ -56,21 +55,19 @@ void vp8_loop_filter_mbh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne void vp8_loop_filter_mbhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { (void) u_ptr; (void) v_ptr; (void) uv_stride; - (void) simpler_lpf; vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); } /* Vertical MB Filtering */ void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void) simpler_lpf; vp8_mbloop_filter_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); if (u_ptr) @@ -82,21 +79,19 @@ void vp8_loop_filter_mbv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne void vp8_loop_filter_mbvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { (void) u_ptr; (void) v_ptr; (void) uv_stride; - (void) simpler_lpf; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); } /* Horizontal B Filtering */ void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void) simpler_lpf; vp8_loop_filter_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); @@ -110,12 +105,11 @@ void vp8_loop_filter_bh_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { (void) u_ptr; (void) v_ptr; (void) uv_stride; - (void) simpler_lpf; vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); @@ -124,9 +118,8 @@ void vp8_loop_filter_bhs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne /* Vertical B Filtering */ void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void) simpler_lpf; vp8_loop_filter_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); @@ -140,12 +133,11 @@ void vp8_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { (void) u_ptr; (void) v_ptr; (void) uv_stride; - (void) simpler_lpf; vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); @@ -156,9 +148,8 @@ void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigne /* Horizontal MB filtering */ #if HAVE_SSE2 void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void) simpler_lpf; vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); if (u_ptr) @@ -167,21 +158,19 @@ void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign void vp8_loop_filter_mbhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { (void) u_ptr; (void) v_ptr; (void) uv_stride; - (void) simpler_lpf; vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); } /* Vertical MB Filtering */ void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void) simpler_lpf; vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); if (u_ptr) @@ -190,21 +179,19 @@ void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign void vp8_loop_filter_mbvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { (void) u_ptr; (void) v_ptr; (void) uv_stride; - (void) simpler_lpf; vp8_loop_filter_simple_vertical_edge_sse2(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr, 2); } /* Horizontal B Filtering */ void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void) simpler_lpf; vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); @@ -215,12 +202,11 @@ void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigne void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { (void) u_ptr; (void) v_ptr; (void) uv_stride; - (void) simpler_lpf; vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); @@ -229,9 +215,8 @@ void vp8_loop_filter_bhs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign /* Vertical B Filtering */ void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { - (void) simpler_lpf; vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); @@ -242,12 +227,11 @@ void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigne void vp8_loop_filter_bvs_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, - int y_stride, int uv_stride, loop_filter_info *lfi, int simpler_lpf) + int y_stride, int uv_stride, loop_filter_info *lfi) { (void) u_ptr; (void) v_ptr; (void) uv_stride; - (void) simpler_lpf; vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); vp8_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->flim, lfi->lim, lfi->thr, 2); diff --git a/vp8/common/x86/postproc_sse2.asm b/vp8/common/x86/postproc_sse2.asm index 30b4bf53a..06d51ec6f 100644 --- a/vp8/common/x86/postproc_sse2.asm +++ b/vp8/common/x86/postproc_sse2.asm @@ -26,7 +26,7 @@ sym(vp8_post_proc_down_and_across_xmm): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -256,7 +256,7 @@ sym(vp8_mbpost_proc_down_xmm): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -456,7 +456,7 @@ sym(vp8_mbpost_proc_across_ip_xmm): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi diff --git a/vp8/common/x86/recon_sse2.asm b/vp8/common/x86/recon_sse2.asm index 4ad3973ec..97dc4f686 100644 --- a/vp8/common/x86/recon_sse2.asm +++ b/vp8/common/x86/recon_sse2.asm @@ -67,7 +67,7 @@ sym(vp8_recon4b_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 - SAVE_XMM + SAVE_XMM 7 push rsi push rdi ; end prolog @@ -229,3 +229,397 @@ sym(vp8_copy_mem16x16_sse2): UNSHADOW_ARGS pop rbp ret + + +;void vp8_intra_pred_uv_dc_mmx2( +; unsigned char *dst, +; int dst_stride +; unsigned char *src, +; int src_stride, +; ) +global sym(vp8_intra_pred_uv_dc_mmx2) +sym(vp8_intra_pred_uv_dc_mmx2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + ; from top + mov rsi, arg(2) ;src; + movsxd rax, dword ptr arg(3) ;src_stride; + sub rsi, rax + pxor mm0, mm0 + movq mm1, [rsi] + psadbw mm1, mm0 + + ; from left + dec rsi + lea rdi, [rax*3] + movzx ecx, byte [rsi+rax] + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + lea rsi, [rsi+rax*4] + movzx edx, byte [rsi] + add ecx, edx + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + movzx edx, byte [rsi+rax*4] + add ecx, edx + + ; add up + pextrw edx, mm1, 0x0 + lea edx, [edx+ecx+8] + sar edx, 4 + movd mm1, edx + pshufw mm1, mm1, 0x0 + packuswb mm1, mm1 + + ; write out + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + lea rax, [rcx*3] + + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + lea rdi, [rdi+rcx*4] + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_uv_dctop_mmx2( +; unsigned char *dst, +; int dst_stride +; unsigned char *src, +; int src_stride, +; ) +global sym(vp8_intra_pred_uv_dctop_mmx2) +sym(vp8_intra_pred_uv_dctop_mmx2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + ; from top + mov rsi, arg(2) ;src; + movsxd rax, dword ptr arg(3) ;src_stride; + sub rsi, rax + pxor mm0, mm0 + movq mm1, [rsi] + psadbw mm1, mm0 + + ; add up + paddw mm1, [GLOBAL(dc_4)] + psraw mm1, 3 + pshufw mm1, mm1, 0x0 + packuswb mm1, mm1 + + ; write out + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + lea rax, [rcx*3] + + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + lea rdi, [rdi+rcx*4] + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_uv_dcleft_mmx2( +; unsigned char *dst, +; int dst_stride +; unsigned char *src, +; int src_stride, +; ) +global sym(vp8_intra_pred_uv_dcleft_mmx2) +sym(vp8_intra_pred_uv_dcleft_mmx2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + ; from left + mov rsi, arg(2) ;src; + movsxd rax, dword ptr arg(3) ;src_stride; + dec rsi + lea rdi, [rax*3] + movzx ecx, byte [rsi] + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + add ecx, edx + lea rsi, [rsi+rax*4] + movzx edx, byte [rsi] + add ecx, edx + movzx edx, byte [rsi+rax] + add ecx, edx + movzx edx, byte [rsi+rax*2] + add ecx, edx + movzx edx, byte [rsi+rdi] + lea edx, [ecx+edx+4] + + ; add up + shr edx, 3 + movd mm1, edx + pshufw mm1, mm1, 0x0 + packuswb mm1, mm1 + + ; write out + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + lea rax, [rcx*3] + + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + lea rdi, [rdi+rcx*4] + movq [rdi ], mm1 + movq [rdi+rcx ], mm1 + movq [rdi+rcx*2], mm1 + movq [rdi+rax ], mm1 + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_uv_dc128_mmx( +; unsigned char *dst, +; int dst_stride +; unsigned char *src, +; int src_stride, +; ) +global sym(vp8_intra_pred_uv_dc128_mmx) +sym(vp8_intra_pred_uv_dc128_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + ; end prolog + + ; write out + movq mm1, [GLOBAL(dc_128)] + mov rax, arg(0) ;dst; + movsxd rdx, dword ptr arg(1) ;dst_stride + lea rcx, [rdx*3] + + movq [rax ], mm1 + movq [rax+rdx ], mm1 + movq [rax+rdx*2], mm1 + movq [rax+rcx ], mm1 + lea rax, [rax+rdx*4] + movq [rax ], mm1 + movq [rax+rdx ], mm1 + movq [rax+rdx*2], mm1 + movq [rax+rcx ], mm1 + + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_uv_tm_sse2( +; unsigned char *dst, +; int dst_stride +; unsigned char *src, +; int src_stride, +; ) +%macro vp8_intra_pred_uv_tm 1 +global sym(vp8_intra_pred_uv_tm_%1) +sym(vp8_intra_pred_uv_tm_%1): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + ; read top row + mov edx, 4 + mov rsi, arg(2) ;src; + movsxd rax, dword ptr arg(3) ;src_stride; + sub rsi, rax + pxor xmm0, xmm0 +%ifidn %1, ssse3 + movdqa xmm2, [GLOBAL(dc_1024)] +%endif + movq xmm1, [rsi] + punpcklbw xmm1, xmm0 + + ; set up left ptrs ans subtract topleft + movd xmm3, [rsi-1] + lea rsi, [rsi+rax-1] +%ifidn %1, sse2 + punpcklbw xmm3, xmm0 + pshuflw xmm3, xmm3, 0x0 + punpcklqdq xmm3, xmm3 +%else + pshufb xmm3, xmm2 +%endif + psubw xmm1, xmm3 + + ; set up dest ptrs + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + +vp8_intra_pred_uv_tm_%1_loop: + movd xmm3, [rsi] + movd xmm5, [rsi+rax] +%ifidn %1, sse2 + punpcklbw xmm3, xmm0 + punpcklbw xmm5, xmm0 + pshuflw xmm3, xmm3, 0x0 + pshuflw xmm5, xmm5, 0x0 + punpcklqdq xmm3, xmm3 + punpcklqdq xmm5, xmm5 +%else + pshufb xmm3, xmm2 + pshufb xmm5, xmm2 +%endif + paddw xmm3, xmm1 + paddw xmm5, xmm1 + packuswb xmm3, xmm5 + movq [rdi ], xmm3 + movhps[rdi+rcx], xmm3 + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rcx*2] + dec edx + jnz vp8_intra_pred_uv_tm_%1_loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret +%endmacro + +vp8_intra_pred_uv_tm sse2 +vp8_intra_pred_uv_tm ssse3 + +;void vp8_intra_pred_uv_ve_mmx( +; unsigned char *dst, +; int dst_stride +; unsigned char *src, +; int src_stride, +; ) +global sym(vp8_intra_pred_uv_ve_mmx) +sym(vp8_intra_pred_uv_ve_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + ; end prolog + + ; read from top + mov rax, arg(2) ;src; + movsxd rdx, dword ptr arg(3) ;src_stride; + sub rax, rdx + movq mm1, [rax] + + ; write out + mov rax, arg(0) ;dst; + movsxd rdx, dword ptr arg(1) ;dst_stride + lea rcx, [rdx*3] + + movq [rax ], mm1 + movq [rax+rdx ], mm1 + movq [rax+rdx*2], mm1 + movq [rax+rcx ], mm1 + lea rax, [rax+rdx*4] + movq [rax ], mm1 + movq [rax+rdx ], mm1 + movq [rax+rdx*2], mm1 + movq [rax+rcx ], mm1 + + ; begin epilog + UNSHADOW_ARGS + pop rbp + ret + +;void vp8_intra_pred_uv_ho_mmx2( +; unsigned char *dst, +; int dst_stride +; unsigned char *src, +; int src_stride, +; ) +global sym(vp8_intra_pred_uv_ho_mmx2) +sym(vp8_intra_pred_uv_ho_mmx2): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 4 + push rsi + push rdi + ; end prolog + + ; read from left and write out + mov edx, 4 + mov rsi, arg(2) ;src; + movsxd rax, dword ptr arg(3) ;src_stride; + mov rdi, arg(0) ;dst; + movsxd rcx, dword ptr arg(1) ;dst_stride + dec rsi +vp8_intra_pred_uv_ho_mmx2_loop: + movd mm0, [rsi] + movd mm1, [rsi+rax] + punpcklbw mm0, mm0 + punpcklbw mm1, mm1 + pshufw mm0, mm0, 0x0 + pshufw mm1, mm1, 0x0 + movq [rdi ], mm0 + movq [rdi+rcx], mm1 + lea rsi, [rsi+rax*2] + lea rdi, [rdi+rcx*2] + dec edx + jnz vp8_intra_pred_uv_ho_mmx2_loop + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +SECTION_RODATA +dc_128: + times 8 db 128 +dc_4: + times 4 dw 4 +align 16 +dc_1024: + times 8 dw 0x400 diff --git a/vp8/common/x86/recon_wrapper_sse2.c b/vp8/common/x86/recon_wrapper_sse2.c new file mode 100644 index 000000000..86b4da2c2 --- /dev/null +++ b/vp8/common/x86/recon_wrapper_sse2.c @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2010 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include "vpx_ports/config.h" +#include "vp8/common/recon.h" +#include "recon_x86.h" +#include "vpx_mem/vpx_mem.h" + +#define build_intra_predictors_mbuv_prototype(sym) \ + void sym(unsigned char *dst, int dst_stride, \ + const unsigned char *src, int src_stride) +typedef build_intra_predictors_mbuv_prototype((*build_intra_predictors_mbuv_fn_t)); + +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc_mmx2); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dctop_mmx2); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dcleft_mmx2); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_dc128_mmx); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ho_mmx2); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_ve_mmx); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_sse2); +extern build_intra_predictors_mbuv_prototype(vp8_intra_pred_uv_tm_ssse3); + +static void vp8_build_intra_predictors_mbuv_x86(MACROBLOCKD *x, + unsigned char *dst_u, + unsigned char *dst_v, + int dst_stride, + build_intra_predictors_mbuv_fn_t tm_func) +{ + int mode = x->mode_info_context->mbmi.uv_mode; + build_intra_predictors_mbuv_fn_t fn; + int src_stride = x->dst.uv_stride; + + switch (mode) { + case V_PRED: fn = vp8_intra_pred_uv_ve_mmx; break; + case H_PRED: fn = vp8_intra_pred_uv_ho_mmx2; break; + case TM_PRED: fn = tm_func; break; + case DC_PRED: + if (x->up_available) { + if (x->left_available) { + fn = vp8_intra_pred_uv_dc_mmx2; break; + } else { + fn = vp8_intra_pred_uv_dctop_mmx2; break; + } + } else if (x->left_available) { + fn = vp8_intra_pred_uv_dcleft_mmx2; break; + } else { + fn = vp8_intra_pred_uv_dc128_mmx; break; + } + break; + default: return; + } + + fn(dst_u, dst_stride, x->dst.u_buffer, src_stride); + fn(dst_v, dst_stride, x->dst.v_buffer, src_stride); +} + +void vp8_build_intra_predictors_mbuv_sse2(MACROBLOCKD *x) +{ + vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256], + &x->predictor[320], 8, + vp8_intra_pred_uv_tm_sse2); +} + +void vp8_build_intra_predictors_mbuv_ssse3(MACROBLOCKD *x) +{ + vp8_build_intra_predictors_mbuv_x86(x, &x->predictor[256], + &x->predictor[320], 8, + vp8_intra_pred_uv_tm_ssse3); +} + +void vp8_build_intra_predictors_mbuv_s_sse2(MACROBLOCKD *x) +{ + vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer, + x->dst.v_buffer, x->dst.uv_stride, + vp8_intra_pred_uv_tm_sse2); +} + +void vp8_build_intra_predictors_mbuv_s_ssse3(MACROBLOCKD *x) +{ + vp8_build_intra_predictors_mbuv_x86(x, x->dst.u_buffer, + x->dst.v_buffer, x->dst.uv_stride, + vp8_intra_pred_uv_tm_ssse3); +} diff --git a/vp8/common/x86/recon_x86.h b/vp8/common/x86/recon_x86.h index 40ee65a12..fe0f8f0bc 100644 --- a/vp8/common/x86/recon_x86.h +++ b/vp8/common/x86/recon_x86.h @@ -46,6 +46,8 @@ extern prototype_copy_block(vp8_copy_mem16x16_mmx); extern prototype_recon_block(vp8_recon2b_sse2); extern prototype_recon_block(vp8_recon4b_sse2); extern prototype_copy_block(vp8_copy_mem16x16_sse2); +extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_sse2); +extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_sse2); #if !CONFIG_RUNTIME_CPU_DETECT #undef vp8_recon_recon2 @@ -57,6 +59,26 @@ extern prototype_copy_block(vp8_copy_mem16x16_sse2); #undef vp8_recon_copy16x16 #define vp8_recon_copy16x16 vp8_copy_mem16x16_sse2 +#undef vp8_recon_build_intra_predictors_mbuv +#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv_sse2 + +#undef vp8_recon_build_intra_predictors_mbuv_s +#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_sse2 + +#endif +#endif + +#if HAVE_SSSE3 +extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_ssse3); +extern prototype_build_intra_predictors(vp8_build_intra_predictors_mbuv_s_ssse3); + +#if !CONFIG_RUNTIME_CPU_DETECT +#undef vp8_recon_build_intra_predictors_mbuv +#define vp8_recon_build_intra_predictors_mbuv vp8_build_intra_predictors_mbuv_ssse3 + +#undef vp8_recon_build_intra_predictors_mbuv_s +#define vp8_recon_build_intra_predictors_mbuv_s vp8_build_intra_predictors_mbuv_s_ssse3 + #endif #endif #endif diff --git a/vp8/common/x86/subpixel_sse2.asm b/vp8/common/x86/subpixel_sse2.asm index b87cad259..83e3b1479 100644 --- a/vp8/common/x86/subpixel_sse2.asm +++ b/vp8/common/x86/subpixel_sse2.asm @@ -37,7 +37,7 @@ sym(vp8_filter_block1d8_h6_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -157,7 +157,7 @@ sym(vp8_filter_block1d16_h6_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -333,7 +333,7 @@ sym(vp8_filter_block1d8_v6_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 8 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -428,7 +428,7 @@ sym(vp8_filter_block1d16_v6_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 8 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -538,7 +538,7 @@ sym(vp8_filter_block1d8_h6_only_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -651,7 +651,7 @@ sym(vp8_filter_block1d16_h6_only_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -816,7 +816,7 @@ sym(vp8_filter_block1d8_v6_only_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -908,7 +908,6 @@ sym(vp8_unpack_block1d16_h6_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 - ;SAVE_XMM ;xmm6, xmm7 are not used here. GET_GOT rbx push rsi push rdi @@ -948,7 +947,6 @@ unpack_block1d16_h6_sse2_rowloop: pop rdi pop rsi RESTORE_GOT - ;RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -969,7 +967,7 @@ sym(vp8_bilinear_predict16x16_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1238,7 +1236,7 @@ sym(vp8_bilinear_predict8x8_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi diff --git a/vp8/common/x86/subpixel_ssse3.asm b/vp8/common/x86/subpixel_ssse3.asm index 7f6fd93e4..1ddbc54bd 100644 --- a/vp8/common/x86/subpixel_ssse3.asm +++ b/vp8/common/x86/subpixel_ssse3.asm @@ -39,6 +39,7 @@ sym(vp8_filter_block1d8_h6_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -107,6 +108,7 @@ filter_block1d8_h6_rowloop_ssse3: pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -162,6 +164,7 @@ filter_block1d8_h4_rowloop_ssse3: pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -179,7 +182,7 @@ sym(vp8_filter_block1d16_h6_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -194,10 +197,6 @@ sym(vp8_filter_block1d16_h6_ssse3): mov rdi, arg(2) ;output_ptr -;; -;; cmp esi, DWORD PTR [rax] -;; je vp8_filter_block1d16_h4_ssse3 - mov rsi, arg(0) ;src_ptr movdqa xmm4, XMMWORD PTR [rax] ;k0_k5 @@ -271,61 +270,7 @@ filter_block1d16_h6_rowloop_ssse3: pop rdi pop rsi RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - -vp8_filter_block1d16_h4_ssse3: - movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4 - movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3 - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixels_per_line - movsxd rcx, dword ptr arg(4) ;output_height - movsxd rdx, dword ptr arg(3) ;output_pitch - -filter_block1d16_h4_rowloop_ssse3: - movdqu xmm1, XMMWORD PTR [rsi - 2] - - movdqa xmm2, xmm1 - pshufb xmm1, [GLOBAL(shuf2b)] - pshufb xmm2, [GLOBAL(shuf3b)] - pmaddubsw xmm1, xmm5 - - movdqu xmm3, XMMWORD PTR [rsi + 6] - - pmaddubsw xmm2, xmm6 - movdqa xmm0, xmm3 - pshufb xmm3, [GLOBAL(shuf3b)] - pshufb xmm0, [GLOBAL(shuf2b)] - - paddsw xmm1, [GLOBAL(rd)] - paddsw xmm1, xmm2 - - pmaddubsw xmm0, xmm5 - pmaddubsw xmm3, xmm6 - - psraw xmm1, 7 - packuswb xmm1, xmm1 - lea rsi, [rsi + rax] - paddsw xmm3, xmm0 - paddsw xmm3, [GLOBAL(rd)] - psraw xmm3, 7 - packuswb xmm3, xmm3 - - punpcklqdq xmm1, xmm3 - - movdqa XMMWORD Ptr [rdi], xmm1 - - add rdi, rdx - dec rcx - jnz filter_block1d16_h4_rowloop_ssse3 - - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -344,6 +289,7 @@ sym(vp8_filter_block1d4_h6_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -451,6 +397,7 @@ filter_block1d4_h4_rowloop_ssse3: pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -471,6 +418,7 @@ sym(vp8_filter_block1d16_v6_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -566,6 +514,7 @@ vp8_filter_block1d16_v6_ssse3_loop: pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -638,6 +587,7 @@ vp8_filter_block1d16_v4_ssse3_loop: pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -656,6 +606,7 @@ sym(vp8_filter_block1d8_v6_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -728,6 +679,7 @@ vp8_filter_block1d8_v6_ssse3_loop: pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -776,6 +728,7 @@ vp8_filter_block1d8_v4_ssse3_loop: pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -932,7 +885,7 @@ sym(vp8_bilinear_predict16x16_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1195,7 +1148,7 @@ sym(vp8_bilinear_predict8x8_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi diff --git a/vp8/common/x86/x86_systemdependent.c b/vp8/common/x86/x86_systemdependent.c index e89c07a4f..17667330a 100644 --- a/vp8/common/x86/x86_systemdependent.c +++ b/vp8/common/x86/x86_systemdependent.c @@ -88,6 +88,10 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) rtcd->recon.recon2 = vp8_recon2b_sse2; rtcd->recon.recon4 = vp8_recon4b_sse2; rtcd->recon.copy16x16 = vp8_copy_mem16x16_sse2; + rtcd->recon.build_intra_predictors_mbuv = + vp8_build_intra_predictors_mbuv_sse2; + rtcd->recon.build_intra_predictors_mbuv_s = + vp8_build_intra_predictors_mbuv_s_sse2; rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_sse2; @@ -126,6 +130,11 @@ void vp8_arch_x86_common_init(VP8_COMMON *ctx) rtcd->subpix.sixtap4x4 = vp8_sixtap_predict4x4_ssse3; rtcd->subpix.bilinear16x16 = vp8_bilinear_predict16x16_ssse3; rtcd->subpix.bilinear8x8 = vp8_bilinear_predict8x8_ssse3; + + rtcd->recon.build_intra_predictors_mbuv = + vp8_build_intra_predictors_mbuv_ssse3; + rtcd->recon.build_intra_predictors_mbuv_s = + vp8_build_intra_predictors_mbuv_s_ssse3; } #endif diff --git a/vp8/decoder/dboolhuff.h b/vp8/decoder/dboolhuff.h index a83e3f012..5f6b211ea 100644 --- a/vp8/decoder/dboolhuff.h +++ b/vp8/decoder/dboolhuff.h @@ -51,19 +51,26 @@ void vp8dx_bool_decoder_fill(BOOL_DECODER *br); #define VP8DX_BOOL_DECODER_FILL(_count,_value,_bufptr,_bufend) \ do \ { \ - int shift; \ - for(shift = VP8_BD_VALUE_SIZE - 8 - ((_count) + 8); shift >= 0; ) \ + int shift = VP8_BD_VALUE_SIZE - 8 - ((_count) + 8); \ + int loop_end, x; \ + size_t bits_left = ((_bufend)-(_bufptr))*CHAR_BIT; \ + \ + x = shift + CHAR_BIT - bits_left; \ + loop_end = 0; \ + if(x >= 0) \ { \ - if((_bufptr) >= (_bufend)) { \ - (_count) = VP8_LOTS_OF_BITS; \ - break; \ - } \ - (_count) += 8; \ + (_count) += VP8_LOTS_OF_BITS; \ + loop_end = x; \ + if(!bits_left) break; \ + } \ + while(shift >= loop_end) \ + { \ + (_count) += CHAR_BIT; \ (_value) |= (VP8_BD_VALUE)*(_bufptr)++ << shift; \ - shift -= 8; \ + shift -= CHAR_BIT; \ } \ } \ - while(0) + while(0) \ static int vp8dx_decode_bool(BOOL_DECODER *br, int probability) { @@ -119,18 +126,19 @@ static int vp8_decode_value(BOOL_DECODER *br, int bits) static int vp8dx_bool_error(BOOL_DECODER *br) { - /* Check if we have reached the end of the buffer. - * - * Variable 'count' stores the number of bits in the 'value' buffer, - * minus 8. So if count == 8, there are 16 bits available to be read. - * Normally, count is filled with 8 and one byte is filled into the - * value buffer. When we reach the end of the buffer, count is instead - * filled with VP8_LOTS_OF_BITS, 8 of which represent the last 8 real - * bits from the bitstream. So the last bit in the bitstream will be - * represented by count == VP8_LOTS_OF_BITS - 16. - */ - if ((br->count > VP8_BD_VALUE_SIZE) - && (br->count <= VP8_LOTS_OF_BITS - 16)) + /* Check if we have reached the end of the buffer. + * + * Variable 'count' stores the number of bits in the 'value' buffer, minus + * 8. The top byte is part of the algorithm, and the remainder is buffered + * to be shifted into it. So if count == 8, the top 16 bits of 'value' are + * occupied, 8 for the algorithm and 8 in the buffer. + * + * When reading a byte from the user's buffer, count is filled with 8 and + * one byte is filled into the value buffer. When we reach the end of the + * data, count is additionally filled with VP8_LOTS_OF_BITS. So when + * count == VP8_LOTS_OF_BITS - 1, the user's data has been exhausted. + */ + if ((br->count > VP8_BD_VALUE_SIZE) && (br->count < VP8_LOTS_OF_BITS)) { /* We have tried to decode bits after the end of * stream was encountered. diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c index 82841e8b8..a585f774c 100644 --- a/vp8/decoder/decodframe.c +++ b/vp8/decoder/decodframe.c @@ -111,16 +111,17 @@ void mb_init_dequantizer(VP8D_COMP *pbi, MACROBLOCKD *xd) */ static void skip_recon_mb(VP8D_COMP *pbi, MACROBLOCKD *xd) { - if (xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) + if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) { - - vp8_build_intra_predictors_mbuv_s(xd); + RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv_s)(xd); RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mby_s)(xd); } else { - vp8_build_inter_predictors_mb_s(xd); + vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.y_stride, xd->dst.uv_stride); } } @@ -195,11 +196,15 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd) clamp_mvs(xd); } - xd->mode_info_context->mbmi.dc_diff = 1; - - if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV && eobtotal == 0) + eobtotal |= (xd->mode_info_context->mbmi.mode == B_PRED || + xd->mode_info_context->mbmi.mode == SPLITMV); + if (!eobtotal) { - xd->mode_info_context->mbmi.dc_diff = 0; + /* Special case: Force the loopfilter to skip when eobtotal and + * mb_skip_coeff are zero. + * */ + xd->mode_info_context->mbmi.mb_skip_coeff = 1; + skip_recon_mb(pbi, xd); return; } @@ -208,9 +213,9 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd) mb_init_dequantizer(pbi, xd); /* do prediction */ - if (xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) + if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) { - vp8_build_intra_predictors_mbuv(xd); + RECON_INVOKE(&pbi->common.rtcd.recon, build_intra_predictors_mbuv)(xd); if (xd->mode_info_context->mbmi.mode != B_PRED) { @@ -218,6 +223,9 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd) build_intra_predictors_mby)(xd); } else { vp8_intra_prediction_down_copy(xd); + + + } } else @@ -229,6 +237,7 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd) if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV) { BLOCKD *b = &xd->block[24]; + DEQUANT_INVOKE(&pbi->dequant, block)(b); /* do 2nd order transform on the dc block */ @@ -255,13 +264,14 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd) xd->predictor, xd->dst.y_buffer, xd->dst.y_stride, xd->eobs, xd->block[24].diff); } - else if ((xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) && xd->mode_info_context->mbmi.mode == B_PRED) + else if (xd->mode_info_context->mbmi.mode == B_PRED) { for (i = 0; i < 16; i++) { BLOCKD *b = &xd->block[i]; - vp8_predict_intra4x4(b, b->bmi.mode, b->predictor); + RECON_INVOKE(RTCD_VTABLE(recon), intra4x4_predict) + (b, b->bmi.mode, b->predictor); if (xd->eobs[i] > 1) { diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c index c22e0f28c..c22e0f28c 100755..100644 --- a/vp8/decoder/detokenize.c +++ b/vp8/decoder/detokenize.c diff --git a/vp8/decoder/onyxd_if.c b/vp8/decoder/onyxd_if.c index ef2e00d61..1e83ab542 100644 --- a/vp8/decoder/onyxd_if.c +++ b/vp8/decoder/onyxd_if.c @@ -76,7 +76,6 @@ VP8D_PTR vp8dx_create_decompressor(VP8D_CONFIG *oxcf) pbi->common.current_video_frame = 0; pbi->ready_for_new_data = 1; - pbi->CPUFreq = 0; /*vp8_get_processor_freq();*/ #if CONFIG_MULTITHREAD pbi->max_threads = oxcf->max_threads; vp8_decoder_create_threads(pbi); @@ -252,7 +251,6 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign VP8D_COMP *pbi = (VP8D_COMP *) ptr; VP8_COMMON *cm = &pbi->common; int retcode = 0; - struct vpx_usec_timer timer; /*if(pbi->ready_for_new_data == 0) return -1;*/ @@ -317,8 +315,6 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign pbi->common.error.setjmp = 1; - vpx_usec_timer_start(&timer); - /*cm->current_video_frame++;*/ pbi->Source = source; pbi->source_sz = size; @@ -379,15 +375,9 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign if(pbi->common.filter_level) { - struct vpx_usec_timer lpftimer; - vpx_usec_timer_start(&lpftimer); /* Apply the loop filter if appropriate. */ - vp8_loop_filter_frame(cm, &pbi->mb, cm->filter_level); - vpx_usec_timer_mark(&lpftimer); - pbi->time_loop_filtering += vpx_usec_timer_elapsed(&lpftimer); - cm->last_frame_type = cm->frame_type; cm->last_filter_type = cm->filter_type; cm->last_sharpness_level = cm->sharpness_level; @@ -398,11 +388,6 @@ int vp8dx_receive_compressed_data(VP8D_PTR ptr, unsigned long size, const unsign vp8_clear_system_state(); - vpx_usec_timer_mark(&timer); - pbi->decode_microseconds = vpx_usec_timer_elapsed(&timer); - - pbi->time_decoding += pbi->decode_microseconds; - /*vp8_print_modes_and_motion_vectors( cm->mi, cm->mb_rows,cm->mb_cols, cm->current_video_frame);*/ if (cm->show_frame) diff --git a/vp8/decoder/onyxd_int.h b/vp8/decoder/onyxd_int.h index ac1e332e8..9b9175628 100644 --- a/vp8/decoder/onyxd_int.h +++ b/vp8/decoder/onyxd_int.h @@ -81,12 +81,6 @@ typedef struct VP8Decompressor const unsigned char *Source; unsigned int source_sz; - - unsigned int CPUFreq; - unsigned int decode_microseconds; - unsigned int time_decoding; - unsigned int time_loop_filtering; - #if CONFIG_MULTITHREAD /* variable for threading */ diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c index 3d9d428ef..9ef85e9cd 100644 --- a/vp8/decoder/threading.c +++ b/vp8/decoder/threading.c @@ -108,21 +108,26 @@ static void decode_macroblock(VP8D_COMP *pbi, MACROBLOCKD *xd, int mb_row, int m clamp_mvs(xd); } - xd->mode_info_context->mbmi.dc_diff = 1; - - if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV && eobtotal == 0) + eobtotal |= (xd->mode_info_context->mbmi.mode == B_PRED || + xd->mode_info_context->mbmi.mode == SPLITMV); + if (!eobtotal) { - xd->mode_info_context->mbmi.dc_diff = 0; + /* Special case: Force the loopfilter to skip when eobtotal and + * mb_skip_coeff are zero. + * */ + xd->mode_info_context->mbmi.mb_skip_coeff = 1; /*mt_skip_recon_mb(pbi, xd, mb_row, mb_col);*/ - if (xd->frame_type == KEY_FRAME || xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) + if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) { vp8mt_build_intra_predictors_mbuv_s(pbi, xd, mb_row, mb_col); vp8mt_build_intra_predictors_mby_s(pbi, xd, mb_row, mb_col); } else { - vp8_build_inter_predictors_mb_s(xd); + vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.y_stride, xd->dst.uv_stride); } return; } @@ -322,6 +327,7 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) if (pbi->common.filter_level) { + int skip_lf; if( mb_row != pc->mb_rows-1 ) { /* Save decoded MB last row data for next-row decoding */ @@ -349,6 +355,10 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) /* update loopfilter info */ Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0; + skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED && + xd->mode_info_context->mbmi.mode != SPLITMV && + xd->mode_info_context->mbmi.mb_skip_coeff); + filter_level = pbi->mt_baseline_filter_level[Segment]; /* Distance of Mb to the various image edges. * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units @@ -360,17 +370,17 @@ static THREAD_FUNCTION thread_decoding_proc(void *p_data) if (filter_level) { if (mb_col > 0) - pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf); + pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); - if (xd->mode_info_context->mbmi.dc_diff > 0) - pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf); + if (!skip_lf) + pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); /* don't apply across umv border */ if (mb_row > 0) - pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf); + pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); - if (xd->mode_info_context->mbmi.dc_diff > 0) - pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf); + if (!skip_lf) + pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); } } @@ -429,12 +439,18 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi) pbi->b_multithreaded_rd = 0; pbi->allocated_decoding_thread_count = 0; - core_count = (pbi->max_threads > 16) ? 16 : pbi->max_threads; + + /* limit decoding threads to the max number of token partitions */ + core_count = (pbi->max_threads > 8) ? 8 : pbi->max_threads; + + /* limit decoding threads to the available cores */ + if (core_count > pbi->common.processor_core_count) + core_count = pbi->common.processor_core_count; if (core_count > 1) { pbi->b_multithreaded_rd = 1; - pbi->decoding_thread_count = core_count -1; + pbi->decoding_thread_count = core_count - 1; CHECK_MEM_ERROR(pbi->h_decoding_thread, vpx_malloc(sizeof(pthread_t) * pbi->decoding_thread_count)); CHECK_MEM_ERROR(pbi->h_event_start_decoding, vpx_malloc(sizeof(sem_t) * pbi->decoding_thread_count)); @@ -810,6 +826,7 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd) if (pbi->common.filter_level) { + int skip_lf; /* Save decoded MB last row data for next-row decoding */ if(mb_row != pc->mb_rows-1) { @@ -837,6 +854,9 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd) /* update loopfilter info */ Segment = (alt_flt_enabled) ? xd->mode_info_context->mbmi.segment_id : 0; + skip_lf = (xd->mode_info_context->mbmi.mode != B_PRED && + xd->mode_info_context->mbmi.mode != SPLITMV && + xd->mode_info_context->mbmi.mb_skip_coeff); filter_level = pbi->mt_baseline_filter_level[Segment]; /* Distance of Mb to the various image edges. * These are specified to 8th pel as they are always compared to values that are in 1/8th pel units @@ -848,17 +868,17 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd) if (filter_level) { if (mb_col > 0) - pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf); + pc->lf_mbv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); - if (xd->mode_info_context->mbmi.dc_diff > 0) - pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf); + if (!skip_lf) + pc->lf_bv(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); /* don't apply across umv border */ if (mb_row > 0) - pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf); + pc->lf_mbh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); - if (xd->mode_info_context->mbmi.dc_diff > 0) - pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level], pc->simpler_lpf); + if (!skip_lf) + pc->lf_bh(xd->dst.y_buffer, xd->dst.u_buffer, xd->dst.v_buffer, recon_y_stride, recon_uv_stride, &lfi[filter_level]); } } diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h index 5a2568dde..6de4c8517 100644 --- a/vp8/encoder/block.h +++ b/vp8/encoder/block.h @@ -34,7 +34,7 @@ typedef struct // 16 Y blocks, 4 U blocks, 4 V blocks each with 16 entries short *quant; short *quant_fast; - short *quant_shift; + unsigned char *quant_shift; short *zbin; short *zrun_zbin_boost; short *round; diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c index ab4071d35..4a936ec4a 100644 --- a/vp8/encoder/encodeframe.c +++ b/vp8/encoder/encodeframe.c @@ -147,7 +147,7 @@ static const int qzbin_factors_y2[129] = #define EXACT_QUANT #ifdef EXACT_QUANT static void vp8cx_invert_quant(int improved_quant, short *quant, - short *shift, short d) + unsigned char *shift, short d) { if(improved_quant) { @@ -1157,7 +1157,6 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME; -#if !(CONFIG_REALTIME_ONLY) if (cpi->sf.RD && cpi->compressor_speed != 2) { vp8_rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv); @@ -1170,7 +1169,6 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) rate += (Error4x4 < Error16x16) ? rate4x4 : rate16x16; } else -#endif { int rate2, best_distortion; MB_PREDICTION_MODE mode, best_mode = DC_PRED; @@ -1188,7 +1186,7 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t) (&x->e_mbd); distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff); rate2 = x->mbmode_cost[x->e_mbd.frame_type][mode]; - this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2); + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); if (Error16x16 > this_rd) { @@ -1242,8 +1240,6 @@ int vp8cx_encode_inter_macroblock else x->encode_breakout = cpi->oxcf.encode_breakout; -#if !(CONFIG_REALTIME_ONLY) - if (cpi->sf.RD) { int zbin_mode_boost_enabled = cpi->zbin_mode_boost_enabled; @@ -1270,7 +1266,6 @@ int vp8cx_encode_inter_macroblock } else -#endif vp8_pick_inter_mode(cpi, x, recon_yoffset, recon_uvoffset, &rate, &distortion, &intra_error); cpi->prediction_error += distortion; @@ -1386,7 +1381,7 @@ int vp8cx_encode_inter_macroblock cpi->MVcount[1][mv_max+((xd->block[0].bmi.mv.as_mv.col - best_ref_mv.col) >> 1)]++; } - if (!x->skip && !x->e_mbd.mode_info_context->mbmi.force_no_skip) + if (!x->skip) { vp8_encode_inter16x16(IF_RTCD(&cpi->rtcd), x); @@ -1396,7 +1391,10 @@ int vp8cx_encode_inter_macroblock } else - vp8_stuff_inter16x16(x); + vp8_build_inter16x16_predictors_mb(xd, xd->dst.y_buffer, + xd->dst.u_buffer, xd->dst.v_buffer, + xd->dst.y_stride, xd->dst.uv_stride); + } if (!x->skip) @@ -1405,11 +1403,6 @@ int vp8cx_encode_inter_macroblock { if (cpi->common.mb_no_coeff_skip) { - if (xd->mode_info_context->mbmi.mode != B_PRED && xd->mode_info_context->mbmi.mode != SPLITMV) - xd->mode_info_context->mbmi.dc_diff = 0; - else - xd->mode_info_context->mbmi.dc_diff = 1; - xd->mode_info_context->mbmi.mb_skip_coeff = 1; cpi->skip_true_count ++; vp8_fix_contexts(xd); diff --git a/vp8/encoder/encodeintra.c b/vp8/encoder/encodeintra.c index 44000063c..9517a1d89 100644 --- a/vp8/encoder/encodeintra.c +++ b/vp8/encoder/encodeintra.c @@ -32,7 +32,8 @@ #endif void vp8_encode_intra4x4block(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x, BLOCK *be, BLOCKD *b, int best_mode) { - vp8_predict_intra4x4(b, best_mode, b->predictor); + RECON_INVOKE(&rtcd->common->recon, intra4x4_predict) + (b, best_mode, b->predictor); ENCODEMB_INVOKE(&rtcd->encodemb, subb)(be, b, 16); @@ -75,14 +76,9 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) vp8_quantize_mby(x); -#if !(CONFIG_REALTIME_ONLY) -#if 1 if (x->optimize) vp8_optimize_mby(x, rtcd); -#endif -#endif - vp8_inverse_transform_mby(IF_RTCD(&rtcd->common->idct), &x->e_mbd); RECON_INVOKE(&rtcd->common->recon, recon_mby) @@ -118,7 +114,7 @@ void vp8_encode_intra16x16mby(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { - vp8_build_intra_predictors_mbuv(&x->e_mbd); + RECON_INVOKE(&rtcd->common->recon, build_intra_predictors_mbuv)(&x->e_mbd); ENCODEMB_INVOKE(&rtcd->encodemb, submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); @@ -126,15 +122,9 @@ void vp8_encode_intra16x16mbuv(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) vp8_quantize_mbuv(x); -#if !(CONFIG_REALTIME_ONLY) -#if 1 - if (x->optimize==2 ||(x->optimize && x->rddiv > 1)) vp8_optimize_mbuv(x, rtcd); -#endif -#endif - vp8_inverse_transform_mbuv(IF_RTCD(&rtcd->common->idct), &x->e_mbd); vp8_recon_intra_mbuv(IF_RTCD(&rtcd->common->recon), &x->e_mbd); diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c index 463dbcaa9..2509e0698 100644 --- a/vp8/encoder/encodemb.c +++ b/vp8/encoder/encodemb.c @@ -19,6 +19,7 @@ #include "vp8/common/reconintra.h" #include "dct.h" #include "vpx_mem/vpx_mem.h" +#include "rdopt.h" #if CONFIG_RUNTIME_CPU_DETECT #define IF_RTCD(x) (x) @@ -195,42 +196,7 @@ static void transform_mby(MACROBLOCK *x) } -void vp8_stuff_inter16x16(MACROBLOCK *x) -{ - vp8_build_inter_predictors_mb_s(&x->e_mbd); - /* - // recon = copy from predictors to destination - { - BLOCKD *b = &x->e_mbd.block[0]; - unsigned char *pred_ptr = b->predictor; - unsigned char *dst_ptr = *(b->base_dst) + b->dst; - int stride = b->dst_stride; - - int i; - for(i=0;i<16;i++) - vpx_memcpy(dst_ptr+i*stride,pred_ptr+16*i,16); - - b = &x->e_mbd.block[16]; - pred_ptr = b->predictor; - dst_ptr = *(b->base_dst) + b->dst; - stride = b->dst_stride; - - for(i=0;i<8;i++) - vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8); - - b = &x->e_mbd.block[20]; - pred_ptr = b->predictor; - dst_ptr = *(b->base_dst) + b->dst; - stride = b->dst_stride; - - for(i=0;i<8;i++) - vpx_memcpy(dst_ptr+i*stride,pred_ptr+8*i,8); - } - */ -} -#if !(CONFIG_REALTIME_ONLY) -#define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) ) #define RDTRUNC(RM,DM,R,D) ( (128+(R)*(RM)) & 0xFF ) typedef struct vp8_token_state vp8_token_state; @@ -608,7 +574,6 @@ void vp8_optimize_mbuv(MACROBLOCK *x, const VP8_ENCODER_RTCD *rtcd) ta + vp8_block2above[b], tl + vp8_block2left[b], rtcd); } } -#endif void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { @@ -620,10 +585,8 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) vp8_quantize_mb(x); -#if !(CONFIG_REALTIME_ONLY) if (x->optimize) optimize_mb(x, rtcd); -#endif vp8_inverse_transform_mb(IF_RTCD(&rtcd->common->idct), &x->e_mbd); @@ -635,7 +598,7 @@ void vp8_encode_inter16x16(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) /* this funciton is used by first pass only */ void vp8_encode_inter16x16y(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x) { - vp8_build_inter_predictors_mby(&x->e_mbd); + vp8_build_inter16x16_predictors_mby(&x->e_mbd); ENCODEMB_INVOKE(&rtcd->encodemb, submby)(x->src_diff, x->src.y_buffer, x->e_mbd.predictor, x->src.y_stride); diff --git a/vp8/encoder/encodemb.h b/vp8/encoder/encodemb.h index 8c93aa180..47fc72dad 100644 --- a/vp8/encoder/encodemb.h +++ b/vp8/encoder/encodemb.h @@ -95,8 +95,6 @@ typedef struct struct VP8_ENCODER_RTCD; void vp8_encode_inter16x16(const struct VP8_ENCODER_RTCD *rtcd, MACROBLOCK *x); -extern void vp8_stuff_inter16x16(MACROBLOCK *x); - void vp8_build_dcblock(MACROBLOCK *b); void vp8_transform_mb(MACROBLOCK *mb); void vp8_transform_mbuv(MACROBLOCK *x); diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c index f5006ddab..c00494dcf 100644 --- a/vp8/encoder/ethreading.c +++ b/vp8/encoder/ethreading.c @@ -459,15 +459,15 @@ void vp8cx_create_encoder_threads(VP8_COMP *cpi) cpi->b_multi_threaded = 0; cpi->encoding_thread_count = 0; - cpi->processor_core_count = 32; //vp8_get_proc_core_count(); - if (cpi->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1) + if (cm->processor_core_count > 1 && cpi->oxcf.multi_threaded > 1) { int ithread; int th_count = cpi->oxcf.multi_threaded - 1; - if (cpi->oxcf.multi_threaded > cpi->processor_core_count) - th_count = cpi->processor_core_count - 1; + /* don't allocate more threads than cores available */ + if (cpi->oxcf.multi_threaded > cm->processor_core_count) + th_count = cm->processor_core_count - 1; /* we have th_count + 1 (main) threads processing one row each */ /* no point to have more threads than the sync range allows */ diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c index 8f24a11a0..6f330991b 100644 --- a/vp8/encoder/firstpass.c +++ b/vp8/encoder/firstpass.c @@ -786,7 +786,8 @@ void vp8_first_pass(VP8_COMP *cpi) // TODO: handle the case when duration is set to 0, or something less // than the full time between subsequent cpi->source_time_stamp s . - fps.duration = cpi->source_end_time_stamp - cpi->source_time_stamp; + fps.duration = cpi->source->ts_end + - cpi->source->ts_start; // don't want to do output stats with a stack variable! memcpy(cpi->this_frame_stats, diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c index 1d672bef9..d48c95bf7 100644 --- a/vp8/encoder/generic/csystemdependent.c +++ b/vp8/encoder/generic/csystemdependent.c @@ -89,9 +89,7 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi) cpi->rtcd.quantize.quantb = vp8_regular_quantize_b; cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_c; -#if !(CONFIG_REALTIME_ONLY) cpi->rtcd.search.full_search = vp8_full_search_sad; -#endif cpi->rtcd.search.diamond_search = vp8_diamond_search_sad; #if !(CONFIG_REALTIME_ONLY) cpi->rtcd.temporal.apply = vp8_temporal_filter_apply_c; diff --git a/vp8/encoder/lookahead.c b/vp8/encoder/lookahead.c new file mode 100644 index 000000000..3b86d4094 --- /dev/null +++ b/vp8/encoder/lookahead.c @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#include <assert.h> +#include <stdlib.h> +#include "vpx_config.h" +#include "lookahead.h" +#include "vp8/common/extend.h" + +#define MAX_LAG_BUFFERS (CONFIG_REALTIME_ONLY? 1 : 25) + +struct lookahead_ctx +{ + unsigned int max_sz; /* Absolute size of the queue */ + unsigned int sz; /* Number of buffers currently in the queue */ + unsigned int read_idx; /* Read index */ + unsigned int write_idx; /* Write index */ + struct lookahead_entry *buf; /* Buffer list */ +}; + + +/* Return the buffer at the given absolute index and increment the index */ +static struct lookahead_entry * +pop(struct lookahead_ctx *ctx, + unsigned int *idx) +{ + unsigned int index = *idx; + struct lookahead_entry *buf = ctx->buf + index; + + assert(index < ctx->max_sz); + if(++index >= ctx->max_sz) + index -= ctx->max_sz; + *idx = index; + return buf; +} + + +void +vp8_lookahead_destroy(struct lookahead_ctx *ctx) +{ + if(ctx) + { + if(ctx->buf) + { + int i; + + for(i = 0; i < ctx->max_sz; i++) + vp8_yv12_de_alloc_frame_buffer(&ctx->buf[i].img); + free(ctx->buf); + } + free(ctx); + } +} + + +struct lookahead_ctx* +vp8_lookahead_init(unsigned int width, + unsigned int height, + unsigned int depth) +{ + struct lookahead_ctx *ctx = NULL; + int i; + + /* Clamp the lookahead queue depth */ + if(depth < 1) + depth = 1; + else if(depth > MAX_LAG_BUFFERS) + depth = MAX_LAG_BUFFERS; + + /* Align the buffer dimensions */ + width = (width + 15) & ~15; + height = (height + 15) & ~15; + + /* Allocate the lookahead structures */ + ctx = calloc(1, sizeof(*ctx)); + if(ctx) + { + ctx->max_sz = depth; + ctx->buf = calloc(depth, sizeof(*ctx->buf)); + if(!ctx->buf) + goto bail; + for(i=0; i<depth; i++) + if (vp8_yv12_alloc_frame_buffer(&ctx->buf[i].img, width, height, 16)) + goto bail; + } + return ctx; +bail: + vp8_lookahead_destroy(ctx); + return NULL; +} + + +int +vp8_lookahead_push(struct lookahead_ctx *ctx, + YV12_BUFFER_CONFIG *src, + int64_t ts_start, + int64_t ts_end, + unsigned int flags) +{ + struct lookahead_entry* buf; + + if(ctx->sz + 1 > ctx->max_sz) + return 1; + ctx->sz++; + buf = pop(ctx, &ctx->write_idx); + vp8_copy_and_extend_frame(src, &buf->img); + buf->ts_start = ts_start; + buf->ts_end = ts_end; + buf->flags = flags; + return 0; +} + + +struct lookahead_entry* +vp8_lookahead_pop(struct lookahead_ctx *ctx, + int drain) +{ + struct lookahead_entry* buf = NULL; + + if(ctx->sz && (drain || ctx->sz == ctx->max_sz)) + { + buf = pop(ctx, &ctx->read_idx); + ctx->sz--; + } + return buf; +} + + +struct lookahead_entry* +vp8_lookahead_peek(struct lookahead_ctx *ctx, + int index) +{ + struct lookahead_entry* buf = NULL; + + assert(index < ctx->max_sz); + if(index < ctx->sz) + { + index += ctx->read_idx; + if(index >= ctx->max_sz) + index -= ctx->max_sz; + buf = ctx->buf + index; + } + return buf; +} + + +unsigned int +vp8_lookahead_depth(struct lookahead_ctx *ctx) +{ + return ctx->sz; +} diff --git a/vp8/encoder/lookahead.h b/vp8/encoder/lookahead.h new file mode 100644 index 000000000..a483d7e0b --- /dev/null +++ b/vp8/encoder/lookahead.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2011 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ +#ifndef LOOKAHEAD_H +#define LOOKAHEAD_H +#include "vpx_scale/yv12config.h" +#include "vpx/vpx_integer.h" + +struct lookahead_entry +{ + YV12_BUFFER_CONFIG img; + int64_t ts_start; + int64_t ts_end; + unsigned int flags; +}; + + +struct lookahead_ctx; + +/**\brief Initializes the lookahead stage + * + * The lookahead stage is a queue of frame buffers on which some analysis + * may be done when buffers are enqueued. + * + * + */ +struct lookahead_ctx* vp8_lookahead_init(unsigned int width, + unsigned int height, + unsigned int depth + ); + + +/**\brief Destroys the lookahead stage + * + */ +void vp8_lookahead_destroy(struct lookahead_ctx *ctx); + + +/**\brief Enqueue a source buffer + * + * This function will copy the source image into a new framebuffer with + * the expected stride/border. + * + * \param[in] ctx Pointer to the lookahead context + * \param[in] src Pointer to the image to enqueue + * \param[in] ts_start Timestamp for the start of this frame + * \param[in] ts_end Timestamp for the end of this frame + * \param[in] flags Flags set on this frame + */ +int +vp8_lookahead_push(struct lookahead_ctx *ctx, + YV12_BUFFER_CONFIG *src, + int64_t ts_start, + int64_t ts_end, + unsigned int flags); + + +/**\brief Get the next source buffer to encode + * + * + * \param[in] ctx Pointer to the lookahead context + * \param[in] drain Flag indicating the buffer should be drained + * (return a buffer regardless of the current queue depth) + * + * \retval NULL, if drain set and queue is empty + * \retval NULL, if drain not set and queue not of the configured depth + * + */ +struct lookahead_entry* +vp8_lookahead_pop(struct lookahead_ctx *ctx, + int drain); + + +/**\brief Get a future source buffer to encode + * + * \param[in] ctx Pointer to the lookahead context + * \param[in] index Index of the frame to be returned, 0 == next frame + * + * \retval NULL, if no buffer exists at the specified index + * + */ +struct lookahead_entry* +vp8_lookahead_peek(struct lookahead_ctx *ctx, + int index); + + +/**\brief Get the number of frames currently in the lookahead queue + * + * \param[in] ctx Pointer to the lookahead context + */ +unsigned int +vp8_lookahead_depth(struct lookahead_ctx *ctx); + + +#endif diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c index 716f514af..9d447b210 100644 --- a/vp8/encoder/mcomp.c +++ b/vp8/encoder/mcomp.c @@ -194,13 +194,13 @@ void vp8_init3smotion_compensation(MACROBLOCK *x, int stride) #define DIST(r,c) vfp->svf( PRE(r,c), d->pre_stride, SP(c),SP(r), z,b->src_stride,&sse) // returns subpixel variance error function. #define IFMVCV(r,c,s,e) if ( c >= minc && c <= maxc && r >= minr && r <= maxr) s else e; #define ERR(r,c) (MVC(r,c)+DIST(r,c)) // returns distortion + motion vector cost -#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best +#define CHECK_BETTER(v,r,c) IFMVCV(r,c,{thismse = DIST(r,c); if((v = (MVC(r,c)+thismse)) < besterr) { besterr = v; br=r; bc=c; *distortion = thismse; *sse1 = sse; }}, v=INT_MAX;)// checks if (r,c) has better score than previous best #define MIN(x,y) (((x)<(y))?(x):(y)) #define MAX(x,y) (((x)>(y))?(x):(y)) //#define CHECK_BETTER(v,r,c) if((v = ERR(r,c)) < besterr) { besterr = v; br=r; bc=c; } -int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2]) +int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse1) { unsigned char *y = *(d->base_pre) + d->pre + (bestmv->row) * d->pre_stride + bestmv->col; unsigned char *z = (*(b->base_src) + b->src); @@ -214,6 +214,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, unsigned int whichdir; unsigned int halfiters = 4; unsigned int quarteriters = 4; + int thismse; int minc = MAX(x->mv_col_min << 2, (ref_mv->col >> 1) - ((1 << mvlong_width) - 1)); int maxc = MIN(x->mv_col_max << 2, (ref_mv->col >> 1) + ((1 << mvlong_width) - 1)); @@ -225,7 +226,8 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, bestmv->col <<= 3; // calculate central point error - besterr = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse); + besterr = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1); + *distortion = besterr; besterr += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit); // TODO: Each subsequent iteration checks at least one point in common with the last iteration could be 2 ( if diag selected) @@ -314,7 +316,7 @@ int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d, #undef CHECK_BETTER #undef MIN #undef MAX -int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2]) +int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse1) { int bestmse = INT_MAX; MV startmv; @@ -325,6 +327,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, int left, right, up, down, diag; unsigned int sse; int whichdir ; + int thismse; // Trap uncodable vectors @@ -332,6 +335,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, { bestmv->row <<= 3; bestmv->col <<= 3; + *distortion = INT_MAX; return INT_MAX; } @@ -341,51 +345,60 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, startmv = *bestmv; // calculate central point error - bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse); + bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1); + *distortion = bestmse; bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit); // go left then right and check error this_mv.row = startmv.row; this_mv.col = ((startmv.col - 8) | 4); - left = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse); - left += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + thismse = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse); + left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (left < bestmse) { *bestmv = this_mv; bestmse = left; + *distortion = thismse; + *sse1 = sse; } this_mv.col += 8; - right = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse); - right += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + thismse = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse); + right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (right < bestmse) { *bestmv = this_mv; bestmse = right; + *distortion = thismse; + *sse1 = sse; } // go up then down and check error this_mv.col = startmv.col; this_mv.row = ((startmv.row - 8) | 4); - up = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); - up += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + thismse = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); + up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (up < bestmse) { *bestmv = this_mv; bestmse = up; + *distortion = thismse; + *sse1 = sse; } this_mv.row += 8; - down = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse); - down += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + thismse = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse); + down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (down < bestmse) { *bestmv = this_mv; bestmse = down; + *distortion = thismse; + *sse1 = sse; } @@ -400,32 +413,34 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, case 0: this_mv.col = (this_mv.col - 8) | 4; this_mv.row = (this_mv.row - 8) | 4; - diag = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); + thismse = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); break; case 1: this_mv.col += 4; this_mv.row = (this_mv.row - 8) | 4; - diag = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); + thismse = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); break; case 2: this_mv.col = (this_mv.col - 8) | 4; this_mv.row += 4; - diag = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse); + thismse = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse); break; case 3: default: this_mv.col += 4; this_mv.row += 4; - diag = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse); + thismse = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse); break; } - diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (diag < bestmse) { *bestmv = this_mv; bestmse = diag; + *distortion = thismse; + *sse1 = sse; } // } @@ -448,30 +463,34 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, if (startmv.col & 7) { this_mv.col = startmv.col - 2; - left = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); } else { this_mv.col = (startmv.col - 8) | 6; - left = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse); + thismse = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse); } - left += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (left < bestmse) { *bestmv = this_mv; bestmse = left; + *distortion = thismse; + *sse1 = sse; } this_mv.col += 4; - right = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); - right += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (right < bestmse) { *bestmv = this_mv; bestmse = right; + *distortion = thismse; + *sse1 = sse; } // go up then down and check error @@ -480,30 +499,34 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, if (startmv.row & 7) { this_mv.row = startmv.row - 2; - up = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); } else { this_mv.row = (startmv.row - 8) | 6; - up = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse); + thismse = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse); } - up += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (up < bestmse) { *bestmv = this_mv; bestmse = up; + *distortion = thismse; + *sse1 = sse; } this_mv.row += 4; - down = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); - down += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (down < bestmse) { *bestmv = this_mv; bestmse = down; + *distortion = thismse; + *sse1 = sse; } @@ -525,12 +548,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, if (startmv.col & 7) { this_mv.col -= 2; - diag = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); } else { this_mv.col = (startmv.col - 8) | 6; - diag = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);; + thismse = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);; } } else @@ -540,12 +563,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, if (startmv.col & 7) { this_mv.col -= 2; - diag = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse); + thismse = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse); } else { this_mv.col = (startmv.col - 8) | 6; - diag = vfp->svf(y - d->pre_stride - 1, d->pre_stride, 6, 6, z, b->src_stride, &sse); + thismse = vfp->svf(y - d->pre_stride - 1, d->pre_stride, 6, 6, z, b->src_stride, &sse); } } @@ -556,12 +579,12 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, if (startmv.row & 7) { this_mv.row -= 2; - diag = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); } else { this_mv.row = (startmv.row - 8) | 6; - diag = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse); + thismse = vfp->svf(y - d->pre_stride, d->pre_stride, this_mv.col & 7, 6, z, b->src_stride, &sse); } break; @@ -571,36 +594,36 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, if (startmv.col & 7) { this_mv.col -= 2; - diag = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); } else { this_mv.col = (startmv.col - 8) | 6; - diag = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);; + thismse = vfp->svf(y - 1, d->pre_stride, 6, this_mv.row & 7, z, b->src_stride, &sse);; } break; case 3: this_mv.col += 2; this_mv.row += 2; - diag = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); + thismse = vfp->svf(y, d->pre_stride, this_mv.col & 7, this_mv.row & 7, z, b->src_stride, &sse); break; } - diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (diag < bestmse) { *bestmv = this_mv; bestmse = diag; + *distortion = thismse; + *sse1 = sse; } -// } - return bestmse; } -int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2]) +int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse1) { int bestmse = INT_MAX; MV startmv; @@ -610,12 +633,14 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm unsigned char *z = (*(b->base_src) + b->src); int left, right, up, down, diag; unsigned int sse; + int thismse; // Trap uncodable vectors if ((abs((bestmv->col << 3) - ref_mv->col) > MAX_FULL_PEL_VAL) || (abs((bestmv->row << 3) - ref_mv->row) > MAX_FULL_PEL_VAL)) { bestmv->row <<= 3; bestmv->col <<= 3; + *distortion = INT_MAX; return INT_MAX; } @@ -625,51 +650,60 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm startmv = *bestmv; // calculate central point error - bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, &sse); + bestmse = vfp->vf(y, d->pre_stride, z, b->src_stride, sse1); + *distortion = bestmse; bestmse += mv_err_cost(bestmv, ref_mv, mvcost, error_per_bit); // go left then right and check error this_mv.row = startmv.row; this_mv.col = ((startmv.col - 8) | 4); - left = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse); - left += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + thismse = vfp->svf_halfpix_h(y - 1, d->pre_stride, z, b->src_stride, &sse); + left = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (left < bestmse) { *bestmv = this_mv; bestmse = left; + *distortion = thismse; + *sse1 = sse; } this_mv.col += 8; - right = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse); - right += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + thismse = vfp->svf_halfpix_h(y, d->pre_stride, z, b->src_stride, &sse); + right = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (right < bestmse) { *bestmv = this_mv; bestmse = right; + *distortion = thismse; + *sse1 = sse; } // go up then down and check error this_mv.col = startmv.col; this_mv.row = ((startmv.row - 8) | 4); - up = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); - up += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + thismse = vfp->svf_halfpix_v(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); + up = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (up < bestmse) { *bestmv = this_mv; bestmse = up; + *distortion = thismse; + *sse1 = sse; } this_mv.row += 8; - down = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse); - down += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + thismse = vfp->svf_halfpix_v(y, d->pre_stride, z, b->src_stride, &sse); + down = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (down < bestmse) { *bestmv = this_mv; bestmse = down; + *distortion = thismse; + *sse1 = sse; } // somewhat strangely not doing all the diagonals for half pel is slower than doing them. @@ -713,44 +747,52 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm #else this_mv.col = (this_mv.col - 8) | 4; this_mv.row = (this_mv.row - 8) | 4; - diag = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); - diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + thismse = vfp->svf_halfpix_hv(y - 1 - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); + diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (diag < bestmse) { *bestmv = this_mv; bestmse = diag; + *distortion = thismse; + *sse1 = sse; } this_mv.col += 8; - diag = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); - diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + thismse = vfp->svf_halfpix_hv(y - d->pre_stride, d->pre_stride, z, b->src_stride, &sse); + diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (diag < bestmse) { *bestmv = this_mv; bestmse = diag; + *distortion = thismse; + *sse1 = sse; } this_mv.col = (this_mv.col - 8) | 4; this_mv.row = startmv.row + 4; - diag = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse); - diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + thismse = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse); + diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (diag < bestmse) { *bestmv = this_mv; bestmse = diag; + *distortion = thismse; + *sse1 = sse; } this_mv.col += 8; - diag = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse); - diag += mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); + thismse = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse); + diag = thismse + mv_err_cost(&this_mv, ref_mv, mvcost, error_per_bit); if (diag < bestmse) { *bestmv = this_mv; bestmse = diag; + *distortion = thismse; + *sse1 = sse; } #endif @@ -789,7 +831,9 @@ int vp8_hex_search ) { MV hex[6] = { { -1, -2}, {1, -2}, {2, 0}, {1, 2}, { -1, 2}, { -2, 0} } ; - MV neighbors[8] = { { -1, -1}, {0, -1}, {1, -1}, { -1, 0}, {1, 0}, { -1, 1}, {0, 1}, {1, 1} } ; + //MV neighbors[8] = { { -1, -1}, {0, -1}, {1, -1}, { -1, 0}, {1, 0}, { -1, 1}, {0, 1}, {1, 1} } ; + MV neighbors[4] = {{0, -1}, { -1, 0}, {1, 0}, {0, 1}} ; + int i, j; unsigned char *src = (*(b->base_src) + b->src); int src_stride = b->src_stride; @@ -876,24 +920,31 @@ int vp8_hex_search break; } - // check 8 1 away neighbors + // check 4 1-away neighbors cal_neighbors: - tr = br; - tc = bc; - for (i = 0; i < 8; i++) + for (j = 0; j < 32; j++) { - int nr = tr + neighbors[i].row, nc = tc + neighbors[i].col; + tr = br; + tc = bc; - if (nc < x->mv_col_min) continue; + for (i = 0; i < 4; i++) + { + int nr = tr + neighbors[i].row, nc = tc + neighbors[i].col; - if (nc > x->mv_col_max) continue; + if (nc < x->mv_col_min) continue; - if (nr < x->mv_row_min) continue; + if (nc > x->mv_col_max) continue; - if (nr > x->mv_row_max) continue; + if (nr < x->mv_row_min) continue; - CHECK_BETTER(thiserr, nr, nc); + if (nr > x->mv_row_max) continue; + + CHECK_BETTER(thiserr, nr, nc); + } + + if (tr == br && tc == bc) + break; } best_mv->row = br; @@ -1190,8 +1241,6 @@ int vp8_diamond_search_sadx4 + mv_err_cost(&this_mv, center_mv, mvcost, error_per_bit); } - -#if !(CONFIG_REALTIME_ONLY) int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], MV *center_mv) { unsigned char *what = (*(b->base_src) + b->src); @@ -1571,7 +1620,6 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er else return INT_MAX; } -#endif /* !(CONFIG_REALTIME_ONLY) */ #ifdef ENTROPY_STATS void print_mode_context(void) diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h index 5efcec296..b14cbcbc8 100644 --- a/vp8/encoder/mcomp.h +++ b/vp8/encoder/mcomp.h @@ -49,7 +49,7 @@ extern int vp8_hex_search typedef int (fractional_mv_step_fp) (MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, - int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2]); + int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse); extern fractional_mv_step_fp vp8_find_best_sub_pixel_step_iteratively; extern fractional_mv_step_fp vp8_find_best_sub_pixel_step; extern fractional_mv_step_fp vp8_find_best_half_pixel_step; diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index a18447d51..1738e5699 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -56,7 +56,6 @@ extern void vp8_loop_filter_frame(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt extern void vp8_loop_filter_frame_yonly(VP8_COMMON *cm, MACROBLOCKD *mbd, int filt_val, int sharpness_lvl); extern void vp8_dmachine_specific_config(VP8_COMP *cpi); extern void vp8_cmachine_specific_config(VP8_COMP *cpi); -extern void vp8_calc_auto_iframe_target_size(VP8_COMP *cpi); extern void vp8_deblock_frame(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *post, int filt_lvl, int low_var_thresh, int flag); extern void print_parms(VP8_CONFIG *ocf, char *filenam); extern unsigned int vp8_get_processor_freq(); @@ -71,7 +70,7 @@ extern void vp8_yv12_copy_src_frame_func_neon(YV12_BUFFER_CONFIG *src_ybc, YV12_ int vp8_estimate_entropy_savings(VP8_COMP *cpi); int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd); -extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi); +extern void vp8_temporal_filter_prepare_c(VP8_COMP *cpi, int distance); static void set_default_lf_deltas(VP8_COMP *cpi); @@ -96,7 +95,8 @@ extern double vp8_calc_ssimg YV12_BUFFER_CONFIG *dest, double *ssim_y, double *ssim_u, - double *ssim_v + double *ssim_v, + const vp8_variance_rtcd_vtable_t *rtcd ); @@ -287,16 +287,9 @@ static void dealloc_compressor_data(VP8_COMP *cpi) vp8_yv12_de_alloc_frame_buffer(&cpi->last_frame_uf); vp8_yv12_de_alloc_frame_buffer(&cpi->scaled_source); #if VP8_TEMPORAL_ALT_REF - vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer.source_buffer); + vp8_yv12_de_alloc_frame_buffer(&cpi->alt_ref_buffer); #endif - { - int i; - - for (i = 0; i < MAX_LAG_BUFFERS; i++) - vp8_yv12_de_alloc_frame_buffer(&cpi->src_buffer[i].source_buffer); - - cpi->source_buffer_count = 0; - } + vp8_lookahead_destroy(cpi->lookahead); vpx_free(cpi->tok); cpi->tok = 0; @@ -1252,35 +1245,23 @@ void vp8_set_speed_features(VP8_COMP *cpi) } static void alloc_raw_frame_buffers(VP8_COMP *cpi) { - int i, buffers; - /* allocate source_buffer to be multiples of 16 */ int width = (cpi->oxcf.Width + 15) & ~15; + int height = (cpi->oxcf.Height + 15) & ~15; - buffers = cpi->oxcf.lag_in_frames; - - if (buffers > MAX_LAG_BUFFERS) - buffers = MAX_LAG_BUFFERS; - - if (buffers < 1) - buffers = 1; - - for (i = 0; i < buffers; i++) - if (vp8_yv12_alloc_frame_buffer(&cpi->src_buffer[i].source_buffer, - width, cpi->oxcf.Height, - 16)) - vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, - "Failed to allocate lag buffer"); + cpi->lookahead = vp8_lookahead_init(cpi->oxcf.Width, cpi->oxcf.Height, + cpi->oxcf.lag_in_frames); + if(!cpi->lookahead) + vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, + "Failed to allocate lag buffers"); #if VP8_TEMPORAL_ALT_REF - if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer.source_buffer, - width, cpi->oxcf.Height, 16)) + if (vp8_yv12_alloc_frame_buffer(&cpi->alt_ref_buffer, + width, height, 16)) vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR, "Failed to allocate altref buffer"); #endif - - cpi->source_buffer_count = 0; } static int vp8_alloc_partition_data(VP8_COMP *cpi) @@ -1461,10 +1442,7 @@ static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->auto_gold = 1; cpi->auto_adjust_gold_quantizer = 1; - cpi->goldquantizer = 1; cpi->goldfreq = 7; - cpi->auto_adjust_key_quantizer = 1; - cpi->keyquantizer = 1; cm->version = oxcf->Version; vp8_setup_version(cm); @@ -1478,10 +1456,6 @@ static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->avg_frame_qindex = cpi->oxcf.worst_allowed_q; // Initialise the starting buffer levels - cpi->oxcf.starting_buffer_level = - rescale(cpi->oxcf.starting_buffer_level, - cpi->oxcf.target_bandwidth, 1000); - cpi->buffer_level = cpi->oxcf.starting_buffer_level; cpi->bits_off_target = cpi->oxcf.starting_buffer_level; @@ -1542,7 +1516,6 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) break; -#if !(CONFIG_REALTIME_ONLY) case MODE_GOODQUALITY: cpi->pass = 0; cpi->compressor_speed = 1; @@ -1583,7 +1556,6 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->pass = 2; cpi->compressor_speed = 0; break; -#endif } if (cpi->pass == 0) @@ -1656,6 +1628,10 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) // Convert target bandwidth from Kbit/s to Bit/s cpi->oxcf.target_bandwidth *= 1000; + cpi->oxcf.starting_buffer_level = + rescale(cpi->oxcf.starting_buffer_level, + cpi->oxcf.target_bandwidth, 1000); + // Set or reset optimal and maximum buffer levels. if (cpi->oxcf.optimal_buffer_level == 0) cpi->oxcf.optimal_buffer_level = cpi->oxcf.target_bandwidth / 8; @@ -1705,8 +1681,6 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) // Only allow dropped frames in buffered mode cpi->drop_frames_allowed = cpi->oxcf.allow_df && cpi->buffered_mode; - cm->filter_type = (LOOPFILTERTYPE) cpi->filter_type; - if (!cm->use_bilinear_mc_filter) cm->mcomp_filter_type = SIXTAP; else @@ -1720,9 +1694,6 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cm->horiz_scale = cpi->horiz_scale; cm->vert_scale = cpi->vert_scale ; - // As per VP8 - cpi->intra_frame_target = (4 * (cm->Width + cm->Height) / 15) * 1000; - // VP8 sharpness level mapping 0-7 (vs 0-10 in general VPx dialogs) if (cpi->oxcf.Sharpness > 7) cpi->oxcf.Sharpness = 7; @@ -1752,10 +1723,6 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) vp8_alloc_compressor_data(cpi); } - // Clamp KF frame size to quarter of data rate - if (cpi->intra_frame_target > cpi->target_bandwidth >> 2) - cpi->intra_frame_target = cpi->target_bandwidth >> 2; - if (cpi->oxcf.fixed_q >= 0) { cpi->last_q[0] = cpi->oxcf.fixed_q; @@ -1774,7 +1741,7 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf) cpi->oxcf.lag_in_frames = MAX_LAG_BUFFERS; // YX Temp - cpi->last_alt_ref_sei = -1; + cpi->alt_ref_source = NULL; cpi->is_src_frame_alt_ref = 0; cpi->is_next_src_alt_ref = 0; @@ -1981,7 +1948,6 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) cpi->frames_till_gf_update_due = 0; cpi->key_frame_count = 1; - cpi->tot_key_frame_bits = 0; cpi->ni_av_qi = cpi->oxcf.worst_allowed_q; cpi->ni_tot_qi = 0; @@ -2007,7 +1973,6 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) for (i = 0; i < KEY_FRAME_CONTEXT; i++) { - cpi->prior_key_frame_size[i] = cpi->intra_frame_target; cpi->prior_key_frame_distance[i] = (int)cpi->output_frame_rate; } @@ -2117,15 +2082,11 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf) cpi->fn_ptr[BLOCK_4X4].sdx8f = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x8); cpi->fn_ptr[BLOCK_4X4].sdx4df = VARIANCE_INVOKE(&cpi->rtcd.variance, sad4x4x4d); -#if !(CONFIG_REALTIME_ONLY) cpi->full_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, full_search); -#endif cpi->diamond_search_sad = SEARCH_INVOKE(&cpi->rtcd.search, diamond_search); cpi->ready_for_new_frame = 1; - cpi->source_encode_index = 0; - // make sure frame 1 is okay cpi->error_bins[0] = cpi->common.MBs; @@ -2173,7 +2134,8 @@ void vp8_remove_compressor(VP8_PTR *ptr) if (cpi->pass != 1) { FILE *f = fopen("opsnr.stt", "a"); - double time_encoded = (cpi->source_end_time_stamp - cpi->first_time_stamp_ever) / 10000000.000; + double time_encoded = (cpi->last_end_time_stamp_seen + - cpi->first_time_stamp_ever) / 10000000.000; double total_encode_time = (cpi->time_receive_data + cpi->time_compress_data) / 1000.000; double dr = (double)cpi->bytes * (double) 8 / (double)1000 / time_encoded; @@ -2186,7 +2148,7 @@ void vp8_remove_compressor(VP8_PTR *ptr) double total_ssim = 100 * pow(cpi->summed_quality / cpi->summed_weights, 8.0); fprintf(f, "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\tVPXSSIM\t Time(us)\n"); - fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f %8.0f\n", + fprintf(f, "%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%7.3f\t%8.0f\n", dr, cpi->total / cpi->count, total_psnr, cpi->totalp / cpi->count, total_psnr2, total_ssim, total_encode_time); } @@ -2628,37 +2590,13 @@ static void scale_and_extend_source(YV12_BUFFER_CONFIG *sd, VP8_COMP *cpi) vp8_scale_frame(sd, &cpi->scaled_source, cm->temp_scale_frame.y_buffer, tmp_height, hs, hr, vs, vr, 0); + vp8_yv12_extend_frame_borders(&cpi->scaled_source); cpi->Source = &cpi->scaled_source; #endif } - // we may need to copy to a buffer so we can extend the image... - else if (cm->Width != cm->yv12_fb[cm->lst_fb_idx].y_width || - cm->Height != cm->yv12_fb[cm->lst_fb_idx].y_height) - { - //vp8_yv12_copy_frame_ptr(sd, &cpi->scaled_source); -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp8_yv12_copy_src_frame_func_neon(sd, &cpi->scaled_source); - } -#if CONFIG_RUNTIME_CPU_DETECT - else -#endif -#endif -#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT - { - vp8_yv12_copy_frame_ptr(sd, &cpi->scaled_source); - } -#endif - - cpi->Source = &cpi->scaled_source; - } +} - vp8_extend_to_multiple_of16(cpi->Source, cm->Width, cm->Height); -} static void resize_key_frame(VP8_COMP *cpi) { #if CONFIG_SPATIAL_RESAMPLING @@ -2705,64 +2643,7 @@ static void resize_key_frame(VP8_COMP *cpi) #endif } -// return of 0 means drop frame -static int pick_frame_size(VP8_COMP *cpi) -{ - VP8_COMMON *cm = &cpi->common; - - // First Frame is a special case - if (cm->current_video_frame == 0) - { -#if !(CONFIG_REALTIME_ONLY) - if (cpi->pass == 2) - vp8_calc_auto_iframe_target_size(cpi); - - // 1 Pass there is no information on which to base size so use bandwidth per second * fixed fraction - else -#endif - cpi->this_frame_target = cpi->oxcf.target_bandwidth / 2; - - // in error resilient mode the first frame is bigger since it likely contains - // all the static background - if (cpi->oxcf.error_resilient_mode == 1 || (cpi->compressor_speed == 2)) - { - cpi->this_frame_target *= 3; // 5; - } - - // Key frame from VFW/auto-keyframe/first frame - cm->frame_type = KEY_FRAME; - - } - // Special case for forced key frames - // The frame sizing here is still far from ideal for 2 pass. - else if (cm->frame_flags & FRAMEFLAGS_KEY) - { - cm->frame_type = KEY_FRAME; - resize_key_frame(cpi); - vp8_calc_iframe_target_size(cpi); - } - else if (cm->frame_type == KEY_FRAME) - { - vp8_calc_auto_iframe_target_size(cpi); - } - else - { - // INTER frame: compute target frame size - cm->frame_type = INTER_FRAME; - vp8_calc_pframe_target_size(cpi); - - // Check if we're dropping the frame: - if (cpi->drop_frame) - { - cpi->drop_frame = FALSE; - cpi->drop_count++; - return 0; - } - } - - return 1; -} static void set_quantizer(VP8_COMP *cpi, int Q) { @@ -3551,7 +3432,7 @@ static void encode_frame_to_data_rate } // Decide how big to make the frame - if (!pick_frame_size(cpi)) + if (!vp8_pick_frame_size(cpi)) { cm->current_video_frame++; cpi->frames_since_key++; @@ -3571,7 +3452,6 @@ static void encode_frame_to_data_rate if (Adjustment) { int buff_lvl_step; - int tmp_lvl = cpi->buffer_level; if (cpi->buffer_level < cpi->oxcf.maximum_buffer_size) { @@ -3880,7 +3760,10 @@ static void encode_frame_to_data_rate } if (cm->frame_type == KEY_FRAME) + { + resize_key_frame(cpi); vp8_setup_key_frame(cpi); + } // transform / motion compensation build reconstruction frame vp8_encode_frame(cpi); @@ -3904,11 +3787,11 @@ static void encode_frame_to_data_rate #else if (decide_key_frame(cpi)) { - vp8_calc_auto_iframe_target_size(cpi); - // Reset all our sizing numbers and recode cm->frame_type = KEY_FRAME; + vp8_pick_frame_size(cpi); + // Clear the Alt reference frame active flag when we have a key frame cpi->source_alt_ref_active = FALSE; @@ -3937,7 +3820,6 @@ static void encode_frame_to_data_rate loop_count++; Loop = TRUE; - resize_key_frame(cpi); continue; } #endif @@ -4419,9 +4301,9 @@ static void encode_frame_to_data_rate vp8_clear_system_state(); //__asm emms; if (cpi->total_coded_error_left != 0.0) - fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6ld %6ld" - "%6ld %6ld %6ld %5ld %5ld %5ld %8ld %8.2f %10d %10.3f" - "%10.3f %8ld\n", + fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6d %6d" + "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f" + "%10.3f %8d\n", cpi->common.current_video_frame, cpi->this_frame_target, cpi->projected_frame_size, (cpi->projected_frame_size - cpi->this_frame_target), @@ -4438,9 +4320,9 @@ static void encode_frame_to_data_rate (double)cpi->bits_left / cpi->total_coded_error_left, cpi->tot_recode_hits); else - fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6ld %6ld" - "%6ld %6ld %6ld %5ld %5ld %5ld %8ld %8.2f %10d %10.3f" - "%8ld\n", + fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6d %6d" + "%6d %6d %6d %5d %5d %5d %8d %8.2f %10d %10.3f" + "%8d\n", cpi->common.current_video_frame, cpi->this_frame_target, cpi->projected_frame_size, (cpi->projected_frame_size - cpi->this_frame_target), @@ -4672,17 +4554,17 @@ static void Pass2Encode(VP8_COMP *cpi, unsigned long *size, unsigned char *dest, extern void vp8_push_neon(INT64 *store); extern void vp8_pop_neon(INT64 *store); #endif + + int vp8_receive_raw_frame(VP8_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, INT64 time_stamp, INT64 end_time) { #if HAVE_ARMV7 INT64 store_reg[8]; #endif - VP8_COMP *cpi = (VP8_COMP *) ptr; - VP8_COMMON *cm = &cpi->common; + VP8_COMP *cpi = (VP8_COMP *) ptr; + VP8_COMMON *cm = &cpi->common; struct vpx_usec_timer timer; - - if (!cpi) - return -1; + int res = 0; #if HAVE_ARMV7 #if CONFIG_RUNTIME_CPU_DETECT @@ -4694,75 +4576,10 @@ int vp8_receive_raw_frame(VP8_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CON #endif vpx_usec_timer_start(&timer); - - // no more room for frames; - if (cpi->source_buffer_count != 0 && cpi->source_buffer_count >= cpi->oxcf.lag_in_frames) - { -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp8_pop_neon(store_reg); - } -#endif - return -1; - } - - //printf("in-cpi->source_buffer_count: %d\n", cpi->source_buffer_count); - + if(vp8_lookahead_push(cpi->lookahead, sd, time_stamp, end_time, + frame_flags)) + res = -1; cm->clr_type = sd->clrtype; - - // make a copy of the frame for use later... -#if !(CONFIG_REALTIME_ONLY) - - if (cpi->oxcf.allow_lag) - { - int which_buffer = cpi->source_encode_index - 1; - SOURCE_SAMPLE *s; - - if (which_buffer == -1) - which_buffer = cpi->oxcf.lag_in_frames - 1; - - if (cpi->source_buffer_count < cpi->oxcf.lag_in_frames - 1) - which_buffer = cpi->source_buffer_count; - - s = &cpi->src_buffer[which_buffer]; - - s->source_time_stamp = time_stamp; - s->source_end_time_stamp = end_time; - s->source_frame_flags = frame_flags; - vp8_yv12_copy_frame_ptr(sd, &s->source_buffer); - - cpi->source_buffer_count ++; - } - else -#endif - { - SOURCE_SAMPLE *s; - s = &cpi->src_buffer[0]; - s->source_end_time_stamp = end_time; - s->source_time_stamp = time_stamp; - s->source_frame_flags = frame_flags; -#if HAVE_ARMV7 -#if CONFIG_RUNTIME_CPU_DETECT - if (cm->rtcd.flags & HAS_NEON) -#endif - { - vp8_yv12_copy_src_frame_func_neon(sd, &s->source_buffer); - } -#if CONFIG_RUNTIME_CPU_DETECT - else -#endif -#endif -#if !HAVE_ARMV7 || CONFIG_RUNTIME_CPU_DETECT - { - vp8_yv12_copy_frame_ptr(sd, &s->source_buffer); - } -#endif - cpi->source_buffer_count = 1; - } - vpx_usec_timer_mark(&timer); cpi->time_receive_data += vpx_usec_timer_elapsed(&timer); @@ -4775,8 +4592,10 @@ int vp8_receive_raw_frame(VP8_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CON } #endif - return 0; + return res; } + + int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, INT64 *time_stamp, INT64 *time_end, int flush) { #if HAVE_ARMV7 @@ -4787,6 +4606,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon struct vpx_usec_timer tsctimer; struct vpx_usec_timer ticktimer; struct vpx_usec_timer cmptimer; + YV12_BUFFER_CONFIG *force_src_buffer = NULL; if (!cpi) return -1; @@ -4802,95 +4622,24 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon vpx_usec_timer_start(&cmptimer); - - // flush variable tells us that even though we have less than 10 frames - // in our buffer we need to start producing compressed frames. - // Probably because we are at the end of a file.... - if ((cpi->source_buffer_count == cpi->oxcf.lag_in_frames && cpi->oxcf.lag_in_frames > 0) - || (!cpi->oxcf.allow_lag && cpi->source_buffer_count > 0) - || (flush && cpi->source_buffer_count > 0)) - { - - SOURCE_SAMPLE *s; - - s = &cpi->src_buffer[cpi->source_encode_index]; - cpi->source_time_stamp = s->source_time_stamp; - cpi->source_end_time_stamp = s->source_end_time_stamp; + cpi->source = NULL; #if !(CONFIG_REALTIME_ONLY) - - // Should we code an alternate reference frame - if (cpi->oxcf.error_resilient_mode == 0 && - cpi->oxcf.play_alternate && - cpi->source_alt_ref_pending && - (cpi->frames_till_gf_update_due < cpi->source_buffer_count) && - cpi->oxcf.lag_in_frames != 0) + // Should we code an alternate reference frame + if (cpi->oxcf.error_resilient_mode == 0 && + cpi->oxcf.play_alternate && + cpi->source_alt_ref_pending) + { + if ((cpi->source = vp8_lookahead_peek(cpi->lookahead, + cpi->frames_till_gf_update_due))) { - cpi->last_alt_ref_sei = (cpi->source_encode_index + cpi->frames_till_gf_update_due) % cpi->oxcf.lag_in_frames; - -#if VP8_TEMPORAL_ALT_REF - + cpi->alt_ref_source = cpi->source; if (cpi->oxcf.arnr_max_frames > 0) { -#if 0 - // my attempt at a loop that tests the results of strength filter. - int start_frame = cpi->last_alt_ref_sei - 3; - - int i, besti = -1, pastin = cpi->oxcf.arnr_strength; - - int besterr; - - if (start_frame < 0) - start_frame += cpi->oxcf.lag_in_frames; - - besterr = calc_low_ss_err(&cpi->src_buffer[cpi->last_alt_ref_sei].source_buffer, - &cpi->src_buffer[start_frame].source_buffer, IF_RTCD(&cpi->rtcd.variance)); - - for (i = 0; i < 7; i++) - { - int thiserr; - cpi->oxcf.arnr_strength = i; - vp8_temporal_filter_prepare_c(cpi); - - thiserr = calc_low_ss_err(&cpi->alt_ref_buffer.source_buffer, - &cpi->src_buffer[start_frame].source_buffer, IF_RTCD(&cpi->rtcd.variance)); - - if (10 * thiserr < besterr * 8) - { - besterr = thiserr; - besti = i; - } - } - - if (besti != -1) - { - cpi->oxcf.arnr_strength = besti; - vp8_temporal_filter_prepare_c(cpi); - s = &cpi->alt_ref_buffer; - - // FWG not sure if I need to copy this data for the Alt Ref frame - s->source_time_stamp = cpi->src_buffer[cpi->last_alt_ref_sei].source_time_stamp; - s->source_end_time_stamp = cpi->src_buffer[cpi->last_alt_ref_sei].source_end_time_stamp; - s->source_frame_flags = cpi->src_buffer[cpi->last_alt_ref_sei].source_frame_flags; - } - else - s = &cpi->src_buffer[cpi->last_alt_ref_sei]; - -#else - vp8_temporal_filter_prepare_c(cpi); - s = &cpi->alt_ref_buffer; - - // FWG not sure if I need to copy this data for the Alt Ref frame - s->source_time_stamp = cpi->src_buffer[cpi->last_alt_ref_sei].source_time_stamp; - s->source_end_time_stamp = cpi->src_buffer[cpi->last_alt_ref_sei].source_end_time_stamp; - s->source_frame_flags = cpi->src_buffer[cpi->last_alt_ref_sei].source_frame_flags; - -#endif + vp8_temporal_filter_prepare_c(cpi, + cpi->frames_till_gf_update_due); + force_src_buffer = &cpi->alt_ref_buffer; } - else -#endif - s = &cpi->src_buffer[cpi->last_alt_ref_sei]; - cm->frames_till_alt_ref_frame = cpi->frames_till_gf_update_due; cm->refresh_alt_ref_frame = 1; cm->refresh_golden_frame = 0; @@ -4900,40 +4649,33 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon cpi->is_src_frame_alt_ref = 0; cpi->is_next_src_alt_ref = 0; } - else + } #endif + + if (!cpi->source) + { + if ((cpi->source = vp8_lookahead_pop(cpi->lookahead, flush))) { cm->show_frame = 1; -#if !(CONFIG_REALTIME_ONLY) - if (cpi->oxcf.allow_lag) - { - if (cpi->source_encode_index == cpi->last_alt_ref_sei) - { - cpi->is_src_frame_alt_ref = 1; - cpi->last_alt_ref_sei = -1; - } - else - cpi->is_src_frame_alt_ref = 0; - - cpi->source_encode_index = (cpi->source_encode_index + 1) % cpi->oxcf.lag_in_frames; + cpi->is_src_frame_alt_ref = cpi->alt_ref_source + && (cpi->source == cpi->alt_ref_source); - if(cpi->source_encode_index == cpi->last_alt_ref_sei) - cpi->is_next_src_alt_ref = 1; - else - cpi->is_next_src_alt_ref = 0; - } - -#endif - cpi->source_buffer_count--; + cpi->is_next_src_alt_ref = cpi->alt_ref_source + && (vp8_lookahead_peek(cpi->lookahead, 0) + == cpi->alt_ref_source); + if(cpi->is_src_frame_alt_ref) + cpi->alt_ref_source = NULL; } + } - cpi->un_scaled_source = &s->source_buffer; - cpi->Source = &s->source_buffer; - cpi->source_frame_flags = s->source_frame_flags; - - *time_stamp = cpi->source_time_stamp; - *time_end = cpi->source_end_time_stamp; + if (cpi->source) + { + cpi->un_scaled_source = + cpi->Source = force_src_buffer ? force_src_buffer : &cpi->source->img; + *time_stamp = cpi->source->ts_start; + *time_end = cpi->source->ts_end; + *frame_flags = cpi->source->flags; } else { @@ -4959,26 +4701,24 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon return -1; } - *frame_flags = cpi->source_frame_flags; - - if (cpi->source_time_stamp < cpi->first_time_stamp_ever) + if (cpi->source->ts_start < cpi->first_time_stamp_ever) { - cpi->first_time_stamp_ever = cpi->source_time_stamp; - cpi->last_end_time_stamp_seen = cpi->source_time_stamp; + cpi->first_time_stamp_ever = cpi->source->ts_start; + cpi->last_end_time_stamp_seen = cpi->source->ts_start; } // adjust frame rates based on timestamps given if (!cm->refresh_alt_ref_frame) { - if (cpi->source_time_stamp == cpi->first_time_stamp_ever) + if (cpi->source->ts_start == cpi->first_time_stamp_ever) { - double this_fps = 10000000.000 / (cpi->source_end_time_stamp - cpi->source_time_stamp); + double this_fps = 10000000.000 / (cpi->source->ts_end - cpi->source->ts_start); vp8_new_frame_rate(cpi, this_fps); } else { - long long nanosecs = cpi->source_end_time_stamp + long long nanosecs = cpi->source->ts_end - cpi->last_end_time_stamp_seen; if (nanosecs > 0) @@ -4989,8 +4729,8 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon } - cpi->last_time_stamp_seen = cpi->source_time_stamp; - cpi->last_end_time_stamp_seen = cpi->source_end_time_stamp; + cpi->last_time_stamp_seen = cpi->source->ts_start; + cpi->last_end_time_stamp_seen = cpi->source->ts_end; } if (cpi->compressor_speed == 2) @@ -5111,7 +4851,6 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon if (cpi->b_calculate_psnr) { - double y, u, v; double ye,ue,ve; double frame_psnr; YV12_BUFFER_CONFIG *orig = cpi->Source; @@ -5144,7 +4883,7 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon cpi->total_sq_error += sq_error; cpi->total += frame_psnr; { - double y2, u2, v2, frame_psnr2, frame_ssim2 = 0; + double frame_psnr2, frame_ssim2 = 0; double weight = 0; vp8_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc)); @@ -5185,7 +4924,8 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon if (cpi->b_calculate_ssimg) { double y, u, v, frame_all; - frame_all = vp8_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v); + frame_all = vp8_calc_ssimg(cpi->Source, cm->frame_to_show, + &y, &u, &v, IF_RTCD(&cpi->rtcd.variance)); cpi->total_ssimg_y += y; cpi->total_ssimg_u += u; cpi->total_ssimg_v += v; @@ -5376,35 +5116,6 @@ int vp8_calc_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const } -static int calc_low_ss_err(YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, const vp8_variance_rtcd_vtable_t *rtcd) -{ - int i, j; - int Total = 0; - - unsigned char *src = source->y_buffer; - unsigned char *dst = dest->y_buffer; - (void)rtcd; - - // Loop through the Y plane raw and reconstruction data summing (square differences) - for (i = 0; i < source->y_height; i += 16) - { - for (j = 0; j < source->y_width; j += 16) - { - unsigned int sse; - VARIANCE_INVOKE(rtcd, mse16x16)(src + j, source->y_stride, dst + j, dest->y_stride, &sse); - - if (sse < 8096) - Total += sse; - } - - src += 16 * source->y_stride; - dst += 16 * dest->y_stride; - } - - return Total; -} - - int vp8_get_quantizer(VP8_PTR c) { VP8_COMP *cpi = (VP8_COMP *) c; diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h index 0e53f6803..e2e6b367c 100644 --- a/vp8/encoder/onyx_int.h +++ b/vp8/encoder/onyx_int.h @@ -29,6 +29,7 @@ #include "mcomp.h" #include "temporal_filter.h" #include "vp8/common/findnearmv.h" +#include "lookahead.h" //#define SPEEDSTATS 1 #define MIN_GF_INTERVAL 4 @@ -217,14 +218,6 @@ typedef struct void *ptr1; } LPFTHREAD_DATA; -typedef struct -{ - INT64 source_time_stamp; - INT64 source_end_time_stamp; - - DECLARE_ALIGNED(16, YV12_BUFFER_CONFIG, source_buffer); - unsigned int source_frame_flags; -} SOURCE_SAMPLE; typedef struct VP8_ENCODER_RTCD { @@ -251,17 +244,17 @@ typedef struct { DECLARE_ALIGNED(16, short, Y1quant[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, Y1quant_shift[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, unsigned char, Y1quant_shift[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, Y1zbin[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, Y1round[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, Y2quant[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, Y2quant_shift[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, unsigned char, Y2quant_shift[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, Y2zbin[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, Y2round[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, UVquant[QINDEX_RANGE][16]); - DECLARE_ALIGNED(16, short, UVquant_shift[QINDEX_RANGE][16]); + DECLARE_ALIGNED(16, unsigned char, UVquant_shift[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, UVzbin[QINDEX_RANGE][16]); DECLARE_ALIGNED(16, short, UVround[QINDEX_RANGE][16]); @@ -280,19 +273,17 @@ typedef struct VP8_CONFIG oxcf; + struct lookahead_ctx *lookahead; + struct lookahead_entry *source; + struct lookahead_entry *alt_ref_source; + YV12_BUFFER_CONFIG *Source; YV12_BUFFER_CONFIG *un_scaled_source; - INT64 source_time_stamp; - INT64 source_end_time_stamp; - unsigned int source_frame_flags; YV12_BUFFER_CONFIG scaled_source; - int source_buffer_count; // number of src_buffers in use for lagged encoding - int source_encode_index; // index of buffer in src_buffer to encode int source_alt_ref_pending; // frame in src_buffers has been identified to be encoded as an alt ref int source_alt_ref_active; // an alt ref frame has been encoded and is usable - int last_alt_ref_sei; // index into src_buffers of frame used as alt reference int is_src_frame_alt_ref; // source of frame to encode is an exact copy of an alt ref frame int is_next_src_alt_ref; // source of next frame to encode is an exact copy of an alt ref frame @@ -301,8 +292,6 @@ typedef struct int gold_is_alt; // don't do both alt and gold search ( just do gold). //int refresh_alt_ref_frame; - SOURCE_SAMPLE src_buffer[MAX_LAG_BUFFERS]; - YV12_BUFFER_CONFIG last_frame_uf; TOKENEXTRA *tok; @@ -396,14 +385,11 @@ typedef struct int active_arnr_frames; // <= cpi->oxcf.arnr_max_frames INT64 key_frame_count; - INT64 tot_key_frame_bits; - int prior_key_frame_size[KEY_FRAME_CONTEXT]; int prior_key_frame_distance[KEY_FRAME_CONTEXT]; int per_frame_bandwidth; // Current section per frame bandwidth target int av_per_frame_bandwidth; // Average frame size target for clip int min_frame_bandwidth; // Minimum allocation that should be used for any frame int last_key_frame_size; - int intra_frame_target; int inter_frame_target; double output_frame_rate; long long last_time_stamp_seen; @@ -515,12 +501,8 @@ typedef struct int interquantizer; int auto_gold; int auto_adjust_gold_quantizer; - int goldquantizer; int goldfreq; - int auto_adjust_key_quantizer; - int keyquantizer; int auto_worst_q; - int filter_type; int cpu_used; int chroma_boost; int horiz_scale; @@ -594,7 +576,6 @@ typedef struct // multithread data int * mt_current_mb_col; int mt_sync_range; - int processor_core_count; int b_multi_threaded; int encoding_thread_count; @@ -638,7 +619,7 @@ typedef struct VP8_ENCODER_RTCD rtcd; #endif #if VP8_TEMPORAL_ALT_REF - SOURCE_SAMPLE alt_ref_buffer; + YV12_BUFFER_CONFIG alt_ref_buffer; YV12_BUFFER_CONFIG *frames[MAX_LAG_BUFFERS]; int fixed_divide[512]; #endif diff --git a/vp8/encoder/parms.cpp b/vp8/encoder/parms.cpp deleted file mode 100644 index 2a39b2ca3..000000000 --- a/vp8/encoder/parms.cpp +++ /dev/null @@ -1,107 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - - -#if 0 - -#include <map> -#include <string> -#include <fstream> -extern "C" -{ - #include "vp8/common/onyx.h" -} - - -using namespace std; - -typedef map<string,int> Parms; - -#define ALLPARMS(O,DOTHIS) \ - DOTHIS(O, interquantizer )\ - DOTHIS(O, auto_gold )\ - DOTHIS(O, auto_adjust_gold_quantizer )\ - DOTHIS(O, goldquantizer )\ - DOTHIS(O, goldfreq )\ - DOTHIS(O, auto_key )\ - DOTHIS(O, auto_adjust_key_quantizer )\ - DOTHIS(O, keyquantizer )\ - DOTHIS(O, keyfreq )\ - DOTHIS(O, pass )\ - DOTHIS(O, fixed_q )\ - DOTHIS(O, target_bandwidth )\ - DOTHIS(O, auto_worst_q )\ - DOTHIS(O, worst_quality )\ - DOTHIS(O, best_allowed_q )\ - DOTHIS(O, end_usage )\ - DOTHIS(O, starting_buffer_level )\ - DOTHIS(O, optimal_buffer_level )\ - DOTHIS(O, maximum_buffer_size )\ - DOTHIS(O, under_shoot_pct )\ - DOTHIS(O, allow_df )\ - DOTHIS(O, drop_frames_water_mark )\ - DOTHIS(O, max_allowed_datarate )\ - DOTHIS(O, two_pass_vbrbias )\ - DOTHIS(O, two_pass_vbrmin_section )\ - DOTHIS(O, two_pass_vbrmax_section )\ - DOTHIS(O, filter_type )\ - DOTHIS(O, compressor_speed )\ - DOTHIS(O, mbpitch_feature )\ - DOTHIS(O, allow_spatial_resampling )\ - DOTHIS(O, resample_down_water_mark )\ - DOTHIS(O, resample_up_water_mark )\ - DOTHIS(O, noise_sensitivity )\ - DOTHIS(O, horiz_scale )\ - DOTHIS(O, vert_scale ) - - -#define GET(O,V) O->V = x[#V]; -#define PUT(O,V) x[#V] = O->V; - - -extern "C" void get_parms(VP8_CONFIG *ocf,char *filename) -{ - - Parms x; - int value; - string variable; - string equal; - - ifstream config_file(filename); - - ALLPARMS(ocf, PUT); - - // store all the parms in a map (really simple parsing) - while(!config_file.eof() && config_file.is_open()) - { - config_file >> variable; - config_file >> equal; - - if(equal != "=") - continue; - - config_file >> value; - - x[variable] = value; - } - - ALLPARMS(ocf, GET); - -} - -#define PRINT(O,V) debug_file<<#V <<" = " << O->V <<"\n"; -extern "C" void print_parms(VP8_CONFIG *ocf,char *filename) -{ - ofstream debug_file(filename,ios_base::app); - ALLPARMS(ocf, PRINT); - debug_file << "=============================================="<<"\n"; -} - -#endif diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c index 0759e2d5b..ea4f01fad 100644 --- a/vp8/encoder/pickinter.c +++ b/vp8/encoder/pickinter.c @@ -50,7 +50,7 @@ extern int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]); extern void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, MV *mv); -int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2]) +int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, MV *ref_mv, int error_per_bit, const vp8_variance_fn_ptr_t *vfp, int *mvcost[2], int *distortion, unsigned int *sse) { (void) b; (void) d; @@ -58,6 +58,8 @@ int vp8_skip_fractional_mv_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestmv, (void) error_per_bit; (void) vfp; (void) mvcost; + (void) distortion; + (void) sse; bestmv->row <<= 3; bestmv->col <<= 3; return 0; @@ -192,9 +194,10 @@ static int pick_intra4x4block( int this_rd; rate = mode_costs[mode]; - vp8_predict_intra4x4(b, mode, b->predictor); + RECON_INVOKE(&rtcd->common->recon, intra4x4_predict) + (b, mode, b->predictor); distortion = get_prediction_error(be, b, &rtcd->variance); - this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate, distortion); + this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion); if (this_rd < best_rd) { @@ -212,7 +215,13 @@ static int pick_intra4x4block( } -int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb, int *Rate, int *best_dist) +int vp8_pick_intra4x4mby_modes +( + const VP8_ENCODER_RTCD *rtcd, + MACROBLOCK *mb, + int *Rate, + int *best_dist +) { MACROBLOCKD *const xd = &mb->e_mbd; int i; @@ -239,20 +248,18 @@ int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb, int mic->bmi[i].mode = xd->block[i].bmi.mode = best_mode; - // Break out case where we have already exceeded best so far value that was bassed in + // Break out case where we have already exceeded best so far value + // that was passed in if (distortion > *best_dist) break; } - for (i = 0; i < 16; i++) - xd->block[i].bmi.mv.as_int = 0; - *Rate = cost; if (i == 16) { *best_dist = distortion; - error = RD_ESTIMATE(mb->rdmult, mb->rddiv, cost, distortion); + error = RDCOST(mb->rdmult, mb->rddiv, cost, distortion); } else { @@ -260,6 +267,9 @@ int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *rtcd, MACROBLOCK *mb, int error = INT_MAX; } + for (i = 0; i < 16; i++) + xd->block[i].bmi.mv.as_int = 0; + return error; } @@ -435,7 +445,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re int bestsme; //int all_rds[MAX_MODES]; // Experimental debug code. int best_mode_index = 0; - int sse = INT_MAX; + unsigned int sse = INT_MAX; MV mvp; int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7}; @@ -452,6 +462,8 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re int skip_mode[4] = {0, 0, 0, 0}; + int have_subp_search = cpi->sf.half_pixel_search; /* In real-time mode, when Speed >= 15, no sub-pixel search. */ + vpx_memset(mode_mv, 0, sizeof(mode_mv)); vpx_memset(nearest_mv, 0, sizeof(nearest_mv)); vpx_memset(near_mv, 0, sizeof(near_mv)); @@ -632,10 +644,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re switch (this_mode) { case B_PRED: - distortion2 = *returndistortion; // Best so far passed in as breakout value to vp8_pick_intra4x4mby_modes - vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate, &distortion2); - rate2 += rate; - distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff); + // Pass best so far to vp8_pick_intra4x4mby_modes to use as breakout + distortion2 = *returndistortion; + vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, + &rate, &distortion2); if (distortion2 == INT_MAX) { @@ -643,7 +655,12 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re } else { - this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2); + rate2 += rate; + distortion2 = VARIANCE_INVOKE + (&cpi->rtcd.variance, get16x16prederror)( + x->src.y_buffer, x->src.y_stride, + x->e_mbd.predictor, 16, 0x7fffffff); + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); if (this_rd < best_intra_rd) { @@ -667,7 +684,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re (&x->e_mbd); distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16, 0x7fffffff); rate2 += x->mbmode_cost[x->e_mbd.frame_type][x->e_mbd.mode_info_context->mbmi.mode]; - this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2); + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); if (this_rd < best_intra_rd) { @@ -781,7 +798,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re } if (bestsme < INT_MAX) - cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost); + cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], cpi->mb.mvcost, &distortion2, &sse); mode_mv[NEWMV].row = d->bmi.mv.as_mv.row; mode_mv[NEWMV].col = d->bmi.mv.as_mv.col; @@ -811,9 +828,10 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re x->e_mbd.block[0].bmi.mode = this_mode; x->e_mbd.block[0].bmi.mv.as_int = x->e_mbd.mode_info_context->mbmi.mv.as_int; - distortion2 = get_inter_mbpred_error(x, &cpi->fn_ptr[BLOCK_16X16], (unsigned int *)(&sse)); + if((this_mode != NEWMV) || !(have_subp_search)) + distortion2 = get_inter_mbpred_error(x, &cpi->fn_ptr[BLOCK_16X16], &sse); - this_rd = RD_ESTIMATE(x->rdmult, x->rddiv, rate2, distortion2); + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); if (cpi->active_map_enabled && x->active_ptr[0] == 0) { @@ -921,7 +939,6 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re best_mbmode.uv_mode = 0; best_mbmode.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0; best_mbmode.partitioning = 0; - best_mbmode.dc_diff = 0; vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode, sizeof(MB_MODE_INFO)); vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO)); @@ -932,6 +949,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int re } x->e_mbd.mode_info_context->mbmi.mv.as_int = 0; + return; } diff --git a/vp8/encoder/pickinter.h b/vp8/encoder/pickinter.h index 8fea98397..f96fc5376 100644 --- a/vp8/encoder/pickinter.h +++ b/vp8/encoder/pickinter.h @@ -14,7 +14,6 @@ #include "vpx_ports/config.h" #include "vp8/common/onyxc_int.h" -#define RD_ESTIMATE(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) ) extern int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *, MACROBLOCK *mb, int *Rate, int *Distortion); extern void vp8_pick_intra_mbuv_mode(MACROBLOCK *mb); extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra); diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c index 803e3a51d..86ed267fb 100644 --- a/vp8/encoder/quantize.c +++ b/vp8/encoder/quantize.c @@ -27,7 +27,7 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d) short *zbin_ptr = b->zbin; short *round_ptr = b->round; short *quant_ptr = b->quant_fast; - short *quant_shift_ptr = b->quant_shift; + unsigned char *quant_shift_ptr = b->quant_shift; short *qcoeff_ptr = d->qcoeff; short *dqcoeff_ptr = d->dqcoeff; short *dequant_ptr = d->dequant; @@ -112,7 +112,7 @@ void vp8_regular_quantize_b(BLOCK *b, BLOCKD *d) short *zbin_ptr = b->zbin; short *round_ptr = b->round; short *quant_ptr = b->quant; - short *quant_shift_ptr = b->quant_shift; + unsigned char *quant_shift_ptr = b->quant_shift; short *qcoeff_ptr = d->qcoeff; short *dqcoeff_ptr = d->dqcoeff; short *dequant_ptr = d->dequant; @@ -166,7 +166,7 @@ void vp8_strict_quantize_b(BLOCK *b, BLOCKD *d) int sz; short *coeff_ptr; short *quant_ptr; - short *quant_shift_ptr; + unsigned char *quant_shift_ptr; short *qcoeff_ptr; short *dqcoeff_ptr; short *dequant_ptr; diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c index 9821d2990..d87591cb9 100644 --- a/vp8/encoder/ratectrl.c +++ b/vp8/encoder/ratectrl.c @@ -329,62 +329,96 @@ void vp8_setup_key_frame(VP8_COMP *cpi) cpi->common.refresh_alt_ref_frame = TRUE; } -void vp8_calc_auto_iframe_target_size(VP8_COMP *cpi) + +static int estimate_bits_at_q(int frame_kind, int Q, int MBs, + double correction_factor) +{ + int Bpm = (int)(.5 + correction_factor * vp8_bits_per_mb[frame_kind][Q]); + + /* Attempt to retain reasonable accuracy without overflow. The cutoff is + * chosen such that the maximum product of Bpm and MBs fits 31 bits. The + * largest Bpm takes 20 bits. + */ + if (MBs > (1 << 11)) + return (Bpm >> BPER_MB_NORMBITS) * MBs; + else + return (Bpm * MBs) >> BPER_MB_NORMBITS; +} + + +static void calc_iframe_target_size(VP8_COMP *cpi) { // boost defaults to half second int kf_boost; + int target; // Clear down mmx registers to allow floating point in what follows vp8_clear_system_state(); //__asm emms; if (cpi->oxcf.fixed_q >= 0) { - vp8_calc_iframe_target_size(cpi); - return; - } + int Q = cpi->oxcf.key_q; - if (cpi->pass == 2) + target = estimate_bits_at_q(INTRA_FRAME, Q, cpi->common.MBs, + cpi->key_frame_rate_correction_factor); + } + else if (cpi->pass == 2) + { + // New Two pass RC + target = cpi->per_frame_bandwidth; + } + // First Frame is a special case + else if (cpi->common.current_video_frame == 0) { - cpi->this_frame_target = cpi->per_frame_bandwidth; // New Two pass RC + /* 1 Pass there is no information on which to base size so use + * bandwidth per second * fraction of the initial buffer + * level + */ + target = cpi->oxcf.starting_buffer_level / 2; + + if(target > cpi->oxcf.target_bandwidth * 3 / 2) + target = cpi->oxcf.target_bandwidth * 3 / 2; } else { + // if this keyframe was forced, use a more recent Q estimate + int Q = (cpi->common.frame_flags & FRAMEFLAGS_KEY) + ? cpi->avg_frame_qindex : cpi->ni_av_qi; + // Boost depends somewhat on frame rate kf_boost = (int)(2 * cpi->output_frame_rate - 16); // adjustment up based on q - kf_boost = kf_boost * kf_boost_qadjustment[cpi->ni_av_qi] / 100; + kf_boost = kf_boost * kf_boost_qadjustment[Q] / 100; // frame separation adjustment ( down) if (cpi->frames_since_key < cpi->output_frame_rate / 2) - kf_boost = (int)(kf_boost * cpi->frames_since_key / (cpi->output_frame_rate / 2)); + kf_boost = (int)(kf_boost + * cpi->frames_since_key / (cpi->output_frame_rate / 2)); if (kf_boost < 16) kf_boost = 16; - // Reset the active worst quality to the baseline value for key frames. - cpi->active_worst_quality = cpi->worst_quality; - - cpi->this_frame_target = ((16 + kf_boost) * cpi->per_frame_bandwidth) >> 4; + target = ((16 + kf_boost) * cpi->per_frame_bandwidth) >> 4; } - // Should the next frame be an altref frame - if (cpi->pass != 2) + if (cpi->oxcf.rc_max_intra_bitrate_pct) { - // For now Alt ref is not allowed except in 2 pass modes. - cpi->source_alt_ref_pending = FALSE; + unsigned int max_rate = cpi->per_frame_bandwidth + * cpi->oxcf.rc_max_intra_bitrate_pct / 100; - /*if ( cpi->oxcf.fixed_q == -1) - { - if ( cpi->oxcf.play_alternate && ( (cpi->last_boost/2) > (100+(AF_THRESH*cpi->frames_till_gf_update_due)) ) ) - cpi->source_alt_ref_pending = TRUE; - else - cpi->source_alt_ref_pending = FALSE; - }*/ + if (target > max_rate) + target = max_rate; } - if (0) + cpi->this_frame_target = target; + + // TODO: if we separate rate targeting from Q targetting, move this. + // Reset the active worst quality to the baseline value for key frames. + cpi->active_worst_quality = cpi->worst_quality; + +#if 0 { FILE *f; @@ -397,8 +431,10 @@ void vp8_calc_auto_iframe_target_size(VP8_COMP *cpi) fclose(f); } +#endif } + // Do the best we can to define the parameteres for the next GF based on what information we have available. static void calc_gf_params(VP8_COMP *cpi) { @@ -564,106 +600,13 @@ static void calc_gf_params(VP8_COMP *cpi) }*/ } } -/* This is equvialent to estimate_bits_at_q without the rate_correction_factor. */ -static int baseline_bits_at_q(int frame_kind, int Q, int MBs) -{ - int Bpm = vp8_bits_per_mb[frame_kind][Q]; - /* Attempt to retain reasonable accuracy without overflow. The cutoff is - * chosen such that the maximum product of Bpm and MBs fits 31 bits. The - * largest Bpm takes 20 bits. - */ - if (MBs > (1 << 11)) - return (Bpm >> BPER_MB_NORMBITS) * MBs; - else - return (Bpm * MBs) >> BPER_MB_NORMBITS; -} -void vp8_calc_iframe_target_size(VP8_COMP *cpi) -{ - int Q; - int Boost = 100; - - Q = (cpi->oxcf.fixed_q >= 0) ? cpi->oxcf.fixed_q : cpi->avg_frame_qindex; - - if (cpi->auto_adjust_key_quantizer == 1) - { - // If (auto_adjust_key_quantizer==1) then a lower Q is selected for key-frames. - // The enhanced Q is calculated so as to boost the key frame size by a factor - // specified in kf_boost_qadjustment. Also, can adjust based on distance - // between key frames. - - // Adjust boost based upon ambient Q - Boost = kf_boost_qadjustment[Q]; - - // Make the Key frame boost less if the seperation from the previous key frame is small - if (cpi->frames_since_key < 16) - Boost = Boost * kf_boost_seperation_adjustment[cpi->frames_since_key] / 100; - else - Boost = Boost * kf_boost_seperation_adjustment[15] / 100; - - // Apply limits on boost - if (Boost > kf_gf_boost_qlimits[Q]) - Boost = kf_gf_boost_qlimits[Q]; - else if (Boost < 120) - Boost = 120; - } - - // Keep a record of the boost that was used - cpi->last_boost = Boost; - - // Should the next frame be an altref frame - if (cpi->pass != 2) - { - // For now Alt ref is not allowed except in 2 pass modes. - cpi->source_alt_ref_pending = FALSE; - - /*if ( cpi->oxcf.fixed_q == -1) - { - if ( cpi->oxcf.play_alternate && ( (cpi->last_boost/2) > (100+(AF_THRESH*cpi->frames_till_gf_update_due)) ) ) - cpi->source_alt_ref_pending = TRUE; - else - cpi->source_alt_ref_pending = FALSE; - }*/ - } - - if (cpi->oxcf.fixed_q >= 0) - { - cpi->this_frame_target = (baseline_bits_at_q(0, Q, cpi->common.MBs) * Boost) / 100; - } - else - { - - int bits_per_mb_at_this_q ; - - if (cpi->oxcf.error_resilient_mode == 1) - { - cpi->this_frame_target = 2 * cpi->av_per_frame_bandwidth; - return; - } - - // Rate targetted scenario: - // Be careful of 32-bit OVERFLOW if restructuring the caluclation of cpi->this_frame_target - bits_per_mb_at_this_q = (int)(.5 + - cpi->key_frame_rate_correction_factor * vp8_bits_per_mb[0][Q]); - - cpi->this_frame_target = (((bits_per_mb_at_this_q * cpi->common.MBs) >> BPER_MB_NORMBITS) * Boost) / 100; - - // Reset the active worst quality to the baseline value for key frames. - if (cpi->pass < 2) - cpi->active_worst_quality = cpi->worst_quality; - } -} - - - -void vp8_calc_pframe_target_size(VP8_COMP *cpi) +static void calc_pframe_target_size(VP8_COMP *cpi) { int min_frame_target; int Adjustment; - // Set the min frame bandwidth. - //min_frame_target = estimate_min_frame_size( cpi ); min_frame_target = 0; if (cpi->pass == 2) @@ -817,11 +760,6 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi) } } - // Set a reduced data rate target for our initial Q calculation. - // This should help to save bits during earier sections. - if ((cpi->oxcf.under_shoot_pct > 0) && (cpi->oxcf.under_shoot_pct <= 100)) - cpi->this_frame_target = (cpi->this_frame_target * cpi->oxcf.under_shoot_pct) / 100; - // Sanity check that the total sum of adjustments is not above the maximum allowed // That is that having allowed for KF and GF penalties we have not pushed the // current interframe target to low. If the adjustment we apply here is not capable of recovering @@ -858,11 +796,6 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi) percent_low = (cpi->oxcf.optimal_buffer_level - cpi->buffer_level) / one_percent_bits; - - if (percent_low > 100) - percent_low = 100; - else if (percent_low < 0) - percent_low = 0; } // Are we overshooting the long term clip data rate... else if (cpi->bits_off_target < 0) @@ -870,16 +803,16 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi) // Adjust per frame data target downwards to compensate. percent_low = (int)(100 * -cpi->bits_off_target / (cpi->total_byte_count * 8)); - - if (percent_low > 100) - percent_low = 100; - else if (percent_low < 0) - percent_low = 0; } + if (percent_low > cpi->oxcf.under_shoot_pct) + percent_low = cpi->oxcf.under_shoot_pct; + else if (percent_low < 0) + percent_low = 0; + // lower the target bandwidth for this frame. - cpi->this_frame_target = - (cpi->this_frame_target * (100 - (percent_low / 2))) / 100; + cpi->this_frame_target -= (cpi->this_frame_target * percent_low) + / 200; // Are we using allowing control of active_worst_allowed_q // according to buffer level. @@ -950,20 +883,29 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi) } else { - int percent_high; + int percent_high = 0; - if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level) + if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) + && (cpi->buffer_level > cpi->oxcf.optimal_buffer_level)) + { + percent_high = (cpi->buffer_level + - cpi->oxcf.optimal_buffer_level) + / one_percent_bits; + } + else if (cpi->bits_off_target > cpi->oxcf.optimal_buffer_level) { - percent_high = (int)(100 * (cpi->bits_off_target - cpi->oxcf.optimal_buffer_level) / (cpi->total_byte_count * 8)); + percent_high = (int)((100 * cpi->bits_off_target) + / (cpi->total_byte_count * 8)); + } - if (percent_high > 100) - percent_high = 100; - else if (percent_high < 0) - percent_high = 0; + if (percent_high > cpi->oxcf.over_shoot_pct) + percent_high = cpi->oxcf.over_shoot_pct; + else if (percent_high < 0) + percent_high = 0; - cpi->this_frame_target = (cpi->this_frame_target * (100 + (percent_high / 2))) / 100; + cpi->this_frame_target += (cpi->this_frame_target * + percent_high) / 200; - } // Are we allowing control of active_worst_allowed_q according to bufferl level. if (cpi->auto_worst_q) @@ -1152,7 +1094,9 @@ void vp8_calc_pframe_target_size(VP8_COMP *cpi) } } else - cpi->this_frame_target = (baseline_bits_at_q(1, Q, cpi->common.MBs) * cpi->last_boost) / 100; + cpi->this_frame_target = + (estimate_bits_at_q(1, Q, cpi->common.MBs, 1.0) + * cpi->last_boost) / 100; } // If there is an active ARF at this location use the minimum @@ -1274,21 +1218,6 @@ void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var) } } -static int estimate_bits_at_q(VP8_COMP *cpi, int Q) -{ - int Bpm = (int)(.5 + cpi->rate_correction_factor * vp8_bits_per_mb[INTER_FRAME][Q]); - - /* Attempt to retain reasonable accuracy without overflow. The cutoff is - * chosen such that the maximum product of Bpm and MBs fits 31 bits. The - * largest Bpm takes 20 bits. - */ - if (cpi->common.MBs > (1 << 11)) - return (Bpm >> BPER_MB_NORMBITS) * cpi->common.MBs; - else - return (Bpm * cpi->common.MBs) >> BPER_MB_NORMBITS; - -} - int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) { @@ -1419,119 +1348,85 @@ int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame) return Q; } -static int estimate_min_frame_size(VP8_COMP *cpi) -{ - double correction_factor; - int bits_per_mb_at_max_q; - - // This funtion returns a default value for the first few frames untill the correction factor has had time to adapt. - if (cpi->common.current_video_frame < 10) - { - if (cpi->pass == 2) - return (cpi->min_frame_bandwidth); - else - return cpi->per_frame_bandwidth / 3; - } - - /* // Select the appropriate correction factor based upon type of frame. - if ( cpi->common.frame_type == KEY_FRAME ) - correction_factor = cpi->key_frame_rate_correction_factor; - else - { - if ( cpi->common.refresh_alt_ref_frame || cpi->common.refresh_golden_frame ) - correction_factor = cpi->gf_rate_correction_factor; - else - correction_factor = cpi->rate_correction_factor; - }*/ - - // We estimate at half the value we get from vp8_bits_per_mb - correction_factor = cpi->rate_correction_factor / 2.0; - bits_per_mb_at_max_q = (int)(.5 + correction_factor * vp8_bits_per_mb[cpi->common.frame_type][MAXQ]); - - return (bits_per_mb_at_max_q * cpi->common.MBs) >> BPER_MB_NORMBITS; -} - -void vp8_adjust_key_frame_context(VP8_COMP *cpi) +static int estimate_keyframe_frequency(VP8_COMP *cpi) { int i; - int av_key_frames_per_second; - - // Average key frame frequency and size - unsigned int total_weight = 0; - unsigned int av_key_frame_frequency = 0; - unsigned int av_key_frame_bits = 0; - unsigned int output_frame_rate = (unsigned int)(100 * cpi->output_frame_rate); - unsigned int target_bandwidth = (unsigned int)(100 * cpi->target_bandwidth); + // Average key frame frequency + int av_key_frame_frequency = 0; - // Clear down mmx registers to allow floating point in what follows - vp8_clear_system_state(); //__asm emms; - - // Update the count of total key frame bits - cpi->tot_key_frame_bits += cpi->projected_frame_size; - - // First key frame at start of sequence is a special case. We have no frequency data. + /* First key frame at start of sequence is a special case. We have no + * frequency data. + */ if (cpi->key_frame_count == 1) { - av_key_frame_frequency = (int)cpi->output_frame_rate * 2; // Assume a default of 1 kf every 2 seconds - av_key_frame_bits = cpi->projected_frame_size; - av_key_frames_per_second = output_frame_rate / av_key_frame_frequency; // Note output_frame_rate not cpi->output_frame_rate + /* Assume a default of 1 kf every 2 seconds, or the max kf interval, + * whichever is smaller. + */ + int key_freq = cpi->oxcf.key_freq>0 ? cpi->oxcf.key_freq : 1; + av_key_frame_frequency = (int)cpi->output_frame_rate * 2; + + if (cpi->oxcf.auto_key && av_key_frame_frequency > key_freq) + av_key_frame_frequency = cpi->oxcf.key_freq; + + cpi->prior_key_frame_distance[KEY_FRAME_CONTEXT - 1] + = av_key_frame_frequency; } else { + unsigned int total_weight = 0; int last_kf_interval = (cpi->frames_since_key > 0) ? cpi->frames_since_key : 1; - // reset keyframe context and calculate weighted average of last KEY_FRAME_CONTEXT keyframes + /* reset keyframe context and calculate weighted average of last + * KEY_FRAME_CONTEXT keyframes + */ for (i = 0; i < KEY_FRAME_CONTEXT; i++) { if (i < KEY_FRAME_CONTEXT - 1) - { - cpi->prior_key_frame_size[i] = cpi->prior_key_frame_size[i+1]; - cpi->prior_key_frame_distance[i] = cpi->prior_key_frame_distance[i+1]; - } + cpi->prior_key_frame_distance[i] + = cpi->prior_key_frame_distance[i+1]; else - { - cpi->prior_key_frame_size[i] = cpi->projected_frame_size; cpi->prior_key_frame_distance[i] = last_kf_interval; - } - av_key_frame_bits += prior_key_frame_weight[i] * cpi->prior_key_frame_size[i]; - av_key_frame_frequency += prior_key_frame_weight[i] * cpi->prior_key_frame_distance[i]; - total_weight += prior_key_frame_weight[i]; + av_key_frame_frequency += prior_key_frame_weight[i] + * cpi->prior_key_frame_distance[i]; + total_weight += prior_key_frame_weight[i]; } - av_key_frame_bits /= total_weight; av_key_frame_frequency /= total_weight; - av_key_frames_per_second = output_frame_rate / av_key_frame_frequency; } + return av_key_frame_frequency; +} + + +void vp8_adjust_key_frame_context(VP8_COMP *cpi) +{ + // Clear down mmx registers to allow floating point in what follows + vp8_clear_system_state(); // Do we have any key frame overspend to recover? - if ((cpi->pass != 2) && (cpi->projected_frame_size > cpi->per_frame_bandwidth)) + // Two-pass overspend handled elsewhere. + if ((cpi->pass != 2) + && (cpi->projected_frame_size > cpi->per_frame_bandwidth)) { - // Update the count of key frame overspend to be recovered in subsequent frames - // A portion of the KF overspend is treated as gf overspend (and hence recovered more quickly) - // as the kf is also a gf. Otherwise the few frames following each kf tend to get more bits - // allocated than those following other gfs. - cpi->kf_overspend_bits += (cpi->projected_frame_size - cpi->per_frame_bandwidth) * 7 / 8; - cpi->gf_overspend_bits += (cpi->projected_frame_size - cpi->per_frame_bandwidth) * 1 / 8; - if(!av_key_frame_frequency) - av_key_frame_frequency = 60; - - // Work out how much to try and recover per frame. - // For one pass we estimate the number of frames to spread it over based upon past history. - // For two pass we know how many frames there will be till the next kf. - if (cpi->pass == 2) - { - if (cpi->frames_to_key > 16) - cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits / (int)cpi->frames_to_key; - else - cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits / 16; - } - else - cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits / (int)av_key_frame_frequency; + int overspend; + + /* Update the count of key frame overspend to be recovered in + * subsequent frames. A portion of the KF overspend is treated as gf + * overspend (and hence recovered more quickly) as the kf is also a + * gf. Otherwise the few frames following each kf tend to get more + * bits allocated than those following other gfs. + */ + overspend = (cpi->projected_frame_size - cpi->per_frame_bandwidth); + cpi->kf_overspend_bits += overspend * 7 / 8; + cpi->gf_overspend_bits += overspend * 1 / 8; + + /* Work out how much to try and recover per frame. */ + cpi->kf_bitrate_adjustment = cpi->kf_overspend_bits + / estimate_keyframe_frequency(cpi); } cpi->frames_since_key = 0; @@ -1539,6 +1434,7 @@ void vp8_adjust_key_frame_context(VP8_COMP *cpi) cpi->key_frame_count++; } + void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, int *frame_over_shoot_limit) { // Set-up bounds on acceptable frame size: @@ -1605,3 +1501,26 @@ void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, } } } + + +// return of 0 means drop frame +int vp8_pick_frame_size(VP8_COMP *cpi) +{ + VP8_COMMON *cm = &cpi->common; + + if (cm->frame_type == KEY_FRAME) + calc_iframe_target_size(cpi); + else + { + calc_pframe_target_size(cpi); + + // Check if we're dropping the frame: + if (cpi->drop_frame) + { + cpi->drop_frame = FALSE; + cpi->drop_count++; + return 0; + } + } + return 1; +} diff --git a/vp8/encoder/ratectrl.h b/vp8/encoder/ratectrl.h index 766dfdfce..d4f779677 100644 --- a/vp8/encoder/ratectrl.h +++ b/vp8/encoder/ratectrl.h @@ -17,11 +17,12 @@ extern void vp8_save_coding_context(VP8_COMP *cpi); extern void vp8_restore_coding_context(VP8_COMP *cpi); extern void vp8_setup_key_frame(VP8_COMP *cpi); -extern void vp8_calc_iframe_target_size(VP8_COMP *cpi); -extern void vp8_calc_pframe_target_size(VP8_COMP *cpi); extern void vp8_update_rate_correction_factors(VP8_COMP *cpi, int damp_var); extern int vp8_regulate_q(VP8_COMP *cpi, int target_bits_per_frame); extern void vp8_adjust_key_frame_context(VP8_COMP *cpi); extern void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit, int *frame_over_shoot_limit); +// return of 0 means drop frame +extern int vp8_pick_frame_size(VP8_COMP *cpi); + #endif diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c index 863b6d419..2789cffbb 100644 --- a/vp8/encoder/rdopt.c +++ b/vp8/encoder/rdopt.c @@ -31,7 +31,7 @@ #include "vp8/common/g_common.h" #include "variance.h" #include "mcomp.h" - +#include "rdopt.h" #include "vpx_mem/vpx_mem.h" #include "dct.h" #include "vp8/common/systemdependent.h" @@ -46,13 +46,8 @@ extern void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x); extern void vp8_update_zbin_extra(VP8_COMP *cpi, MACROBLOCK *x); - -#define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) ) - #define MAXF(a,b) (((a) > (b)) ? (a) : (b)) - - static const int auto_speed_thresh[17] = { 1000, @@ -248,9 +243,6 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue) vp8_set_speed_features(cpi); - if (cpi->common.simpler_lpf) - cpi->common.filter_type = SIMPLE_LOOPFILTER; - q = (int)pow(Qvalue, 1.25); if (q < 8) @@ -480,7 +472,6 @@ int VP8_UVSSE(MACROBLOCK *x, const vp8_variance_rtcd_vtable_t *rtcd) } -#if !(CONFIG_REALTIME_ONLY) static int cost_coeffs(MACROBLOCK *mb, BLOCKD *b, int type, ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) { int c = !type; /* start at coef 0, unless Y with Y2 */ @@ -629,7 +620,8 @@ static int rd_pick_intra4x4block( rate = bmode_costs[mode]; - vp8_predict_intra4x4(b, mode, b->predictor); + RECON_INVOKE(&cpi->rtcd.common->recon, intra4x4_predict) + (b, mode, b->predictor); ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), subb)(be, b, 16); x->vp8_short_fdct4x4(be->src_diff, be->coeff, 32); x->quantize_b(be, b); @@ -818,7 +810,8 @@ void vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *r int this_rd; x->e_mbd.mode_info_context->mbmi.uv_mode = mode; - vp8_build_intra_predictors_mbuv(&x->e_mbd); + RECON_INVOKE(&cpi->rtcd.common->recon, build_intra_predictors_mbuv) + (&x->e_mbd); ENCODEMB_INVOKE(IF_RTCD(&cpi->rtcd.encodemb), submbuv)(x->src_diff, x->src.u_buffer, x->src.v_buffer, x->e_mbd.predictor, x->src.uv_stride); @@ -847,7 +840,6 @@ void vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *r x->e_mbd.mode_info_context->mbmi.uv_mode = mode_selected; } -#endif int vp8_cost_mv_ref(MB_PREDICTION_MODE m, const int near_mv_ref_ct[4]) { @@ -875,7 +867,6 @@ void vp8_set_mbmode_and_mvs(MACROBLOCK *x, MB_PREDICTION_MODE mb, MV *mv) } } -#if !(CONFIG_REALTIME_ONLY) static int labels2mode( MACROBLOCK *x, int const *labelings, int which_label, @@ -1213,12 +1204,15 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, if (bestsme < INT_MAX) { + int distortion; + unsigned int sse; + if (!cpi->common.full_pixel) cpi->find_fractional_mv_step(x, c, e, &mode_mv[NEW4X4], - bsi->ref_mv, x->errorperbit / 2, v_fn_ptr, x->mvcost); + bsi->ref_mv, x->errorperbit / 2, v_fn_ptr, x->mvcost, &distortion, &sse); else vp8_skip_fractional_mv_step(x, c, e, &mode_mv[NEW4X4], - bsi->ref_mv, x->errorperbit, v_fn_ptr, x->mvcost); + bsi->ref_mv, x->errorperbit, v_fn_ptr, x->mvcost, &distortion, &sse); } } /* NEW4X4 */ @@ -1437,88 +1431,56 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x, return bsi.segment_rd; } -#endif -static void swap(int *x,int *y) +static void insertsortmv(int arr[], int len) { - int tmp; + int i, j, k; - tmp = *x; - *x = *y; - *y = tmp; -} + for ( i = 1 ; i <= len-1 ; i++ ) + { + for ( j = 0 ; j < i ; j++ ) + { + if ( arr[j] > arr[i] ) + { + int temp; -static void quicksortmv(int arr[],int left, int right) -{ - int lidx,ridx,pivot; - - lidx = left; - ridx = right; - - if( left < right) - { - pivot = (left + right)/2; - - while(lidx <=pivot && ridx >=pivot) - { - while(arr[lidx] < arr[pivot] && lidx <= pivot) - lidx++; - while(arr[ridx] > arr[pivot] && ridx >= pivot) - ridx--; - swap(&arr[lidx], &arr[ridx]); - lidx++; - ridx--; - if(lidx-1 == pivot) - { - ridx++; - pivot = ridx; - } - else if(ridx+1 == pivot) - { - lidx--; - pivot = lidx; - } - } - quicksortmv(arr, left, pivot - 1); - quicksortmv(arr, pivot + 1, right); - } + temp = arr[i]; + + for ( k = i; k >j; k--) + arr[k] = arr[k - 1] ; + + arr[j] = temp ; + } + } + } } -static void quicksortsad(int arr[],int idx[], int left, int right) +static void insertsortsad(int arr[],int idx[], int len) { - int lidx,ridx,pivot; - - lidx = left; - ridx = right; - - if( left < right) - { - pivot = (left + right)/2; - - while(lidx <=pivot && ridx >=pivot) - { - while(arr[lidx] < arr[pivot] && lidx <= pivot) - lidx++; - while(arr[ridx] > arr[pivot] && ridx >= pivot) - ridx--; - swap(&arr[lidx], &arr[ridx]); - swap(&idx[lidx], &idx[ridx]); - lidx++; - ridx--; - if(lidx-1 == pivot) - { - ridx++; - pivot = ridx; - } - else if(ridx+1 == pivot) - { - lidx--; - pivot = lidx; - } - } - quicksortsad(arr, idx, left, pivot - 1); - quicksortsad(arr, idx, pivot + 1, right); - } + int i, j, k; + + for ( i = 1 ; i <= len-1 ; i++ ) + { + for ( j = 0 ; j < i ; j++ ) + { + if ( arr[j] > arr[i] ) + { + int temp, tempi; + + temp = arr[i]; + tempi = idx[i]; + + for ( k = i; k >j; k--) + { + arr[k] = arr[k - 1] ; + idx[k] = idx[k - 1]; + } + + arr[j] = temp ; + idx[j] = tempi; + } + } + } } //The improved MV prediction @@ -1654,8 +1616,8 @@ void vp8_mv_pred mvy[i] = near_mvs[i].as_mv.col; } - quicksortmv (mvx, 0, vcnt-1); - quicksortmv (mvy, 0, vcnt-1); + insertsortmv(mvx, vcnt); + insertsortmv(mvy, vcnt); mv.as_mv.row = mvx[vcnt/2]; mv.as_mv.col = mvy[vcnt/2]; @@ -1718,14 +1680,13 @@ void vp8_cal_sad(VP8_COMP *cpi, MACROBLOCKD *xd, MACROBLOCK *x, int recon_yoffse if(cpi->common.last_frame_type != KEY_FRAME) { - quicksortsad(near_sad, near_sadidx, 0, 7); + insertsortsad(near_sad, near_sadidx, 8); }else { - quicksortsad(near_sad, near_sadidx, 0, 2); + insertsortsad(near_sad, near_sadidx, 3); } } -#if !(CONFIG_REALTIME_ONLY) void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra) { BLOCK *b = &x->block[0]; @@ -1760,9 +1721,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int //int intermodecost[MAX_MODES]; MB_PREDICTION_MODE uv_intra_mode; - - int force_no_skip = 0; - MV mvp; int near_sadidx[8] = {0, 1, 2, 3, 4, 5, 6, 7}; int saddone=0; @@ -1865,8 +1823,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int int disable_skip = 0; int other_cost = 0; - force_no_skip = 0; - // Experimental debug code. // Record of rd values recorded for this MB. -1 indicates not measured //all_rds[mode_index] = -1; @@ -2198,8 +2154,11 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int x->mv_row_max = tmp_row_max; if (bestsme < INT_MAX) - // cpi->find_fractional_mv_step(x,b,d,&d->bmi.mv.as_mv,&best_ref_mv,x->errorperbit/2,cpi->fn_ptr.svf,cpi->fn_ptr.vf,x->mvcost); // normal mvc=11 - cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit / 4, &cpi->fn_ptr[BLOCK_16X16], x->mvcost); + { + int dis; /* TODO: use dis in distortion calculation later. */ + unsigned int sse; + cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv, x->errorperbit / 4, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &dis, &sse); + } mode_mv[NEWMV].row = d->bmi.mv.as_mv.row; mode_mv[NEWMV].col = d->bmi.mv.as_mv.col; @@ -2230,8 +2189,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int case ZEROMV: - mv_selected: - // Trap vectors that reach beyond the UMV borders // Note that ALL New MV, Nearest MV Near MV and Zero MV code drops through to this point // because of the lack of break statements in the previous two cases. @@ -2240,14 +2197,15 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int continue; vp8_set_mbmode_and_mvs(x, this_mode, &mode_mv[this_mode]); - vp8_build_inter_predictors_mby(&x->e_mbd); + vp8_build_inter16x16_predictors_mby(&x->e_mbd); if (cpi->active_map_enabled && x->active_ptr[0] == 0) { x->skip = 1; } else if (x->encode_breakout) { - int sum, sse; + int sum; + unsigned int sse; int threshold = (xd->block[0].dequant[1] * xd->block[0].dequant[1] >>4); @@ -2256,7 +2214,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var) (x->src.y_buffer, x->src.y_stride, - x->e_mbd.predictor, 16, (unsigned int *)(&sse), &sum); + x->e_mbd.predictor, 16, &sse, &sum); if (sse < threshold) { @@ -2280,8 +2238,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int distortion_uv = sse2; disable_skip = 1; - this_rd = RDCOST(x->rdmult, x->rddiv, rate2, - distortion2); + this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2); break; } @@ -2376,7 +2333,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int { // Note index of best mode so far best_mode_index = mode_index; - x->e_mbd.mode_info_context->mbmi.force_no_skip = force_no_skip; if (this_mode <= B_PRED) { @@ -2473,7 +2429,6 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int best_mbmode.uv_mode = 0; best_mbmode.mb_skip_coeff = (cpi->common.mb_no_coeff_skip) ? 1 : 0; best_mbmode.partitioning = 0; - best_mbmode.dc_diff = 0; vpx_memcpy(&x->e_mbd.mode_info_context->mbmi, &best_mbmode, sizeof(MB_MODE_INFO)); vpx_memcpy(x->partition_info, &best_partition, sizeof(PARTITION_INFO)); @@ -2484,6 +2439,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int } x->e_mbd.mode_info_context->mbmi.mv.as_int = 0; + return; } @@ -2507,4 +2463,3 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int x->e_mbd.mode_info_context->mbmi.mv.as_mv = x->e_mbd.block[15].bmi.mv.as_mv; } -#endif diff --git a/vp8/encoder/rdopt.h b/vp8/encoder/rdopt.h index 1d1be11a4..1d5f9a3a8 100644 --- a/vp8/encoder/rdopt.h +++ b/vp8/encoder/rdopt.h @@ -11,6 +11,9 @@ #ifndef __INC_RDOPT_H #define __INC_RDOPT_H + +#define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) ) + extern void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue); extern int vp8_rd_pick_intra4x4mby_modes(VP8_COMP *cpi, MACROBLOCK *mb, int *rate, int *rate_to, int *distortion, int best_rd); extern int vp8_rd_pick_intra16x16mby_mode(VP8_COMP *cpi, MACROBLOCK *x, int *returnrate, int *rate_to, int *returndistortion); diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c index 64d67c6dd..8646b5fdb 100644 --- a/vp8/encoder/ssim.c +++ b/vp8/encoder/ssim.c @@ -18,223 +18,6 @@ #else #define IF_RTCD(x) NULL #endif -// Google version of SSIM -// SSIM -#define KERNEL 3 -#define KERNEL_SIZE (2 * KERNEL + 1) - -typedef unsigned char uint8; -typedef unsigned int uint32; - -static const int K[KERNEL_SIZE] = -{ - 1, 4, 11, 16, 11, 4, 1 // 16 * exp(-0.3 * i * i) -}; -static const double ki_w = 1. / 2304.; // 1 / sum(i:0..6, j..6) K[i]*K[j] -double get_ssimg(const uint8 *org, const uint8 *rec, - int xo, int yo, int W, int H, - const int stride1, const int stride2 - ) -{ - // TODO(skal): use summed tables - int y, x; - - const int ymin = (yo - KERNEL < 0) ? 0 : yo - KERNEL; - const int ymax = (yo + KERNEL > H - 1) ? H - 1 : yo + KERNEL; - const int xmin = (xo - KERNEL < 0) ? 0 : xo - KERNEL; - const int xmax = (xo + KERNEL > W - 1) ? W - 1 : xo + KERNEL; - // worst case of accumulation is a weight of 48 = 16 + 2 * (11 + 4 + 1) - // with a diff of 255, squares. That would a max error of 0x8ee0900, - // which fits into 32 bits integers. - uint32 w = 0, xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0; - org += ymin * stride1; - rec += ymin * stride2; - - for (y = ymin; y <= ymax; ++y, org += stride1, rec += stride2) - { - const int Wy = K[KERNEL + y - yo]; - - for (x = xmin; x <= xmax; ++x) - { - const int Wxy = Wy * K[KERNEL + x - xo]; - // TODO(skal): inlined assembly - w += Wxy; - xm += Wxy * org[x]; - ym += Wxy * rec[x]; - xxm += Wxy * org[x] * org[x]; - xym += Wxy * org[x] * rec[x]; - yym += Wxy * rec[x] * rec[x]; - } - } - - { - const double iw = 1. / w; - const double iwx = xm * iw; - const double iwy = ym * iw; - double sxx = xxm * iw - iwx * iwx; - double syy = yym * iw - iwy * iwy; - - // small errors are possible, due to rounding. Clamp to zero. - if (sxx < 0.) sxx = 0.; - - if (syy < 0.) syy = 0.; - - { - const double sxsy = sqrt(sxx * syy); - const double sxy = xym * iw - iwx * iwy; - static const double C11 = (0.01 * 0.01) * (255 * 255); - static const double C22 = (0.03 * 0.03) * (255 * 255); - static const double C33 = (0.015 * 0.015) * (255 * 255); - const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11); - const double c = (2. * sxsy + C22) / (sxx + syy + C22); - - const double s = (sxy + C33) / (sxsy + C33); - return l * c * s; - - } - } - -} - -double get_ssimfull_kernelg(const uint8 *org, const uint8 *rec, - int xo, int yo, int W, int H, - const int stride1, const int stride2) -{ - // TODO(skal): use summed tables - // worst case of accumulation is a weight of 48 = 16 + 2 * (11 + 4 + 1) - // with a diff of 255, squares. That would a max error of 0x8ee0900, - // which fits into 32 bits integers. - int y_, x_; - uint32 xm = 0, ym = 0, xxm = 0, xym = 0, yym = 0; - org += (yo - KERNEL) * stride1; - org += (xo - KERNEL); - rec += (yo - KERNEL) * stride2; - rec += (xo - KERNEL); - - for (y_ = 0; y_ < KERNEL_SIZE; ++y_, org += stride1, rec += stride2) - { - const int Wy = K[y_]; - - for (x_ = 0; x_ < KERNEL_SIZE; ++x_) - { - const int Wxy = Wy * K[x_]; - // TODO(skal): inlined assembly - const int org_x = org[x_]; - const int rec_x = rec[x_]; - xm += Wxy * org_x; - ym += Wxy * rec_x; - xxm += Wxy * org_x * org_x; - xym += Wxy * org_x * rec_x; - yym += Wxy * rec_x * rec_x; - } - } - - { - const double iw = ki_w; - const double iwx = xm * iw; - const double iwy = ym * iw; - double sxx = xxm * iw - iwx * iwx; - double syy = yym * iw - iwy * iwy; - - // small errors are possible, due to rounding. Clamp to zero. - if (sxx < 0.) sxx = 0.; - - if (syy < 0.) syy = 0.; - - { - const double sxsy = sqrt(sxx * syy); - const double sxy = xym * iw - iwx * iwy; - static const double C11 = (0.01 * 0.01) * (255 * 255); - static const double C22 = (0.03 * 0.03) * (255 * 255); - static const double C33 = (0.015 * 0.015) * (255 * 255); - const double l = (2. * iwx * iwy + C11) / (iwx * iwx + iwy * iwy + C11); - const double c = (2. * sxsy + C22) / (sxx + syy + C22); - const double s = (sxy + C33) / (sxsy + C33); - return l * c * s; - } - } -} - -double calc_ssimg(const uint8 *org, const uint8 *rec, - const int image_width, const int image_height, - const int stride1, const int stride2 - ) -{ - int j, i; - double SSIM = 0.; - - for (j = 0; j < KERNEL; ++j) - { - for (i = 0; i < image_width; ++i) - { - SSIM += get_ssimg(org, rec, i, j, image_width, image_height, stride1, stride2); - } - } - - for (j = KERNEL; j < image_height - KERNEL; ++j) - { - for (i = 0; i < KERNEL; ++i) - { - SSIM += get_ssimg(org, rec, i, j, image_width, image_height, stride1, stride2); - } - - for (i = KERNEL; i < image_width - KERNEL; ++i) - { - SSIM += get_ssimfull_kernelg(org, rec, i, j, - image_width, image_height, stride1, stride2); - } - - for (i = image_width - KERNEL; i < image_width; ++i) - { - SSIM += get_ssimg(org, rec, i, j, image_width, image_height, stride1, stride2); - } - } - - for (j = image_height - KERNEL; j < image_height; ++j) - { - for (i = 0; i < image_width; ++i) - { - SSIM += get_ssimg(org, rec, i, j, image_width, image_height, stride1, stride2); - } - } - - return SSIM; -} - - -double vp8_calc_ssimg -( - YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *dest, - double *ssim_y, - double *ssim_u, - double *ssim_v -) -{ - double ssim_all = 0; - int ysize = source->y_width * source->y_height; - int uvsize = ysize / 4; - - *ssim_y = calc_ssimg(source->y_buffer, dest->y_buffer, - source->y_width, source->y_height, - source->y_stride, dest->y_stride); - - - *ssim_u = calc_ssimg(source->u_buffer, dest->u_buffer, - source->uv_width, source->uv_height, - source->uv_stride, dest->uv_stride); - - - *ssim_v = calc_ssimg(source->v_buffer, dest->v_buffer, - source->uv_width, source->uv_height, - source->uv_stride, dest->uv_stride); - - ssim_all = (*ssim_y + *ssim_u + *ssim_v) / (ysize + uvsize + uvsize); - *ssim_y /= ysize; - *ssim_u /= uvsize; - *ssim_v /= uvsize; - return ssim_all; -} void ssim_parms_c @@ -290,8 +73,8 @@ void ssim_parms_8x8_c } } -const static long long c1 = 426148; // (256^2*(.01*255)^2 -const static long long c2 = 3835331; //(256^2*(.03*255)^2 +const static long long cc1 = 26634; // (64^2*(.01*255)^2 +const static long long cc2 = 239708; // (64^2*(.03*255)^2 static double similarity ( @@ -303,10 +86,19 @@ static double similarity int count ) { - long long ssim_n = (2*sum_s*sum_r+ c1)*(2*count*sum_sxr-2*sum_s*sum_r+c2); + long long ssim_n, ssim_d; + long long c1, c2; - long long ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)* - (count*sum_sq_s-sum_s*sum_s + count*sum_sq_r-sum_r*sum_r +c2) ; + //scale the constants by number of pixels + c1 = (cc1*count*count)>>12; + c2 = (cc2*count*count)>>12; + + ssim_n = (2*sum_s*sum_r+ c1)*((long long) 2*count*sum_sxr- + (long long) 2*sum_s*sum_r+c2); + + ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)* + ((long long)count*sum_sq_s-(long long)sum_s*sum_s + + (long long)count*sum_sq_r-(long long) sum_r*sum_r +c2) ; return ssim_n * 1.0 / ssim_d; } @@ -332,23 +124,38 @@ long dssim(unsigned char *s,int sp, unsigned char *r,int rp, const vp8_variance_rtcd_vtable_t *rtcd) { unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0; - double ssim3; - long long ssim_n; - long long ssim_d; + long long ssim3; + long long ssim_n,ssim_n1,ssim_n2; + long long ssim_d,ssim_d1,ssim_d2; + long long ssim_t1,ssim_t2; + long long c1, c2; + + // normalize by 256/64 + c1 = cc1*16; + c2 = cc2*16; rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); - ssim_n = (2*sum_s*sum_r+ c1)*(2*256*sum_sxr-2*sum_s*sum_r+c2); + ssim_n1 = (2*sum_s*sum_r+ c1); - ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)* - (256*sum_sq_s-sum_s*sum_s + 256*sum_sq_r-sum_r*sum_r +c2) ; + ssim_n2 =((long long) 2*256*sum_sxr-(long long) 2*sum_s*sum_r+c2); + + ssim_d1 =((long long)sum_s*sum_s +(long long)sum_r*sum_r+c1); + + ssim_d2 = (256 * (long long) sum_sq_s-(long long) sum_s*sum_s + + (long long) 256*sum_sq_r-(long long) sum_r*sum_r +c2) ; - ssim3 = 256 * (ssim_d-ssim_n) / ssim_d; - return (long)( 256*ssim3 * ssim3 ); + ssim_t1 = 256 - 256 * ssim_n1 / ssim_d1; + ssim_t2 = 256 - 256 * ssim_n2 / ssim_d2; + + ssim3 = 256 *ssim_t1 * ssim_t2; + if(ssim3 <0 ) + ssim3=0; + return (long)( ssim3 ); } -// TODO: (jbb) this 8x8 window might be too big + we may want to pick pixels -// such that the window regions overlap block boundaries to penalize blocking -// artifacts. +// We are using a 8x8 moving window with starting location of each 8x8 window +// on the 4x4 pixel grid. Such arrangement allows the windows to overlap +// block boundaries to penalize blocking artifacts. double vp8_ssim2 ( unsigned char *img1, @@ -361,20 +168,21 @@ double vp8_ssim2 ) { int i,j; - + int samples =0; double ssim_total=0; - // we can sample points as frequently as we like start with 1 per 8x8 - for(i=0; i < height; i+=8, img1 += stride_img1*8, img2 += stride_img2*8) + // sample point start with each 4x4 location + for(i=0; i < height-8; i+=4, img1 += stride_img1*4, img2 += stride_img2*4) { - for(j=0; j < width; j+=8 ) + for(j=0; j < width-8; j+=4 ) { - ssim_total += ssim_8x8(img1, stride_img1, img2, stride_img2, rtcd); + double v = ssim_8x8(img1+j, stride_img1, img2+j, stride_img2, rtcd); + ssim_total += v; + samples++; } } - ssim_total /= (width/8 * height /8); + ssim_total /= samples; return ssim_total; - } double vp8_calc_ssim ( @@ -406,3 +214,35 @@ double vp8_calc_ssim return ssimv; } + +double vp8_calc_ssimg +( + YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *dest, + double *ssim_y, + double *ssim_u, + double *ssim_v, + const vp8_variance_rtcd_vtable_t *rtcd +) +{ + double ssim_all = 0; + double a, b, c; + + a = vp8_ssim2(source->y_buffer, dest->y_buffer, + source->y_stride, dest->y_stride, source->y_width, + source->y_height, rtcd); + + b = vp8_ssim2(source->u_buffer, dest->u_buffer, + source->uv_stride, dest->uv_stride, source->uv_width, + source->uv_height, rtcd); + + c = vp8_ssim2(source->v_buffer, dest->v_buffer, + source->uv_stride, dest->uv_stride, source->uv_width, + source->uv_height, rtcd); + *ssim_y = a; + *ssim_u = b; + *ssim_v = c; + ssim_all = (a * 4 + b + c) /6; + + return ssim_all; +}
\ No newline at end of file diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c index cec951897..b77195511 100644 --- a/vp8/encoder/temporal_filter.c +++ b/vp8/encoder/temporal_filter.c @@ -208,10 +208,12 @@ static int vp8_temporal_filter_find_matching_mb_c // Try sub-pixel MC? //if (bestsme > error_thresh && bestsme < INT_MAX) { + int distortion; + unsigned int sse; bestsme = cpi->find_fractional_mv_step(x, b, d, &d->bmi.mv.as_mv, &best_ref_mv1, x->errorperbit, &cpi->fn_ptr[BLOCK_16X16], - mvcost); + mvcost, &distortion, &sse); } #endif @@ -357,8 +359,8 @@ static void vp8_temporal_filter_iterate_c } // Normalize filter output to produce AltRef frame - dst1 = cpi->alt_ref_buffer.source_buffer.y_buffer; - stride = cpi->alt_ref_buffer.source_buffer.y_stride; + dst1 = cpi->alt_ref_buffer.y_buffer; + stride = cpi->alt_ref_buffer.y_stride; byte = mb_y_offset; for (i = 0,k = 0; i < 16; i++) { @@ -377,9 +379,9 @@ static void vp8_temporal_filter_iterate_c byte += stride - 16; } - dst1 = cpi->alt_ref_buffer.source_buffer.u_buffer; - dst2 = cpi->alt_ref_buffer.source_buffer.v_buffer; - stride = cpi->alt_ref_buffer.source_buffer.uv_stride; + dst1 = cpi->alt_ref_buffer.u_buffer; + dst2 = cpi->alt_ref_buffer.v_buffer; + stride = cpi->alt_ref_buffer.uv_stride; byte = mb_uv_offset; for (i = 0,k = 256; i < 8; i++) { @@ -422,7 +424,8 @@ static void vp8_temporal_filter_iterate_c void vp8_temporal_filter_prepare_c ( - VP8_COMP *cpi + VP8_COMP *cpi, + int distance ) { int frame = 0; @@ -441,12 +444,9 @@ void vp8_temporal_filter_prepare_c int max_frames = cpi->active_arnr_frames; - num_frames_backward = cpi->last_alt_ref_sei - cpi->source_encode_index; - - if (num_frames_backward < 0) - num_frames_backward += cpi->oxcf.lag_in_frames; - - num_frames_forward = cpi->oxcf.lag_in_frames - (num_frames_backward + 1); + num_frames_backward = distance; + num_frames_forward = vp8_lookahead_depth(cpi->lookahead) + - (num_frames_backward + 1); switch (blur_type) { @@ -498,8 +498,7 @@ void vp8_temporal_filter_prepare_c break; } - start_frame = (cpi->last_alt_ref_sei - + frames_to_blur_forward) % cpi->oxcf.lag_in_frames; + start_frame = distance + frames_to_blur_forward; #ifdef DEBUGFWG // DEBUG FWG @@ -520,12 +519,9 @@ void vp8_temporal_filter_prepare_c for (frame = 0; frame < frames_to_blur; frame++) { int which_buffer = start_frame - frame; - - if (which_buffer < 0) - which_buffer += cpi->oxcf.lag_in_frames; - - cpi->frames[frames_to_blur-1-frame] - = &cpi->src_buffer[which_buffer].source_buffer; + struct lookahead_entry* buf = vp8_lookahead_peek(cpi->lookahead, + which_buffer); + cpi->frames[frames_to_blur-1-frame] = &buf->img; } vp8_temporal_filter_iterate_c ( diff --git a/vp8/encoder/tokenize.c b/vp8/encoder/tokenize.c index e3f423f8a..1c5923813 100644 --- a/vp8/encoder/tokenize.c +++ b/vp8/encoder/tokenize.c @@ -224,28 +224,9 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) int plane_type; int b; - TOKENEXTRA *start = *t; - TOKENEXTRA *tp = *t; - - x->mode_info_context->mbmi.dc_diff = 1; - -#if 0 - - if (x->mbmi.force_no_skip) - { - x->mbmi.mb_skip_coeff = 1; - //reset for next_mb. - x->mbmi.force_no_skip = 0; - } - -#endif - -#if 1 - x->mode_info_context->mbmi.mb_skip_coeff = mb_is_skippable(x); if (x->mode_info_context->mbmi.mb_skip_coeff) { - cpi->skip_true_count++; if (!cpi->common.mb_no_coeff_skip) @@ -255,17 +236,11 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) vp8_fix_contexts(x); } - if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV) - x->mode_info_context->mbmi.dc_diff = 0; - else - x->mode_info_context->mbmi.dc_diff = 1; - - return; } cpi->skip_false_count++; -#endif + #if 0 vpx_memcpy(cpi->coef_counts_backup, cpi->coef_counts, sizeof(cpi->coef_counts)); #endif @@ -292,42 +267,6 @@ void vp8_tokenize_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) A + vp8_block2above[b], L + vp8_block2left[b], cpi); -#if 0 - - if (cpi->common.mb_no_coeff_skip) - { - int skip = 1; - - while ((tp != *t) && skip) - { - skip = (skip && (tp->Token == DCT_EOB_TOKEN)); - tp ++; - } - - if (skip != x->mbmi.mb_skip_coeff) - skip += 0; - - x->mbmi.mb_skip_coeff = skip; - - if (x->mbmi.mb_skip_coeff == 1) - { - x->mbmi.dc_diff = 0; - //redo the coutnts - vpx_memcpy(cpi->coef_counts, cpi->coef_counts_backup, sizeof(cpi->coef_counts)); - - *t = start; - cpi->skip_true_count++; - //skip_true_count++; - } - else - { - - cpi->skip_false_count++; - //skip_false_count++; - } - } - -#endif } @@ -510,13 +449,6 @@ void vp8_stuff_mb(VP8_COMP *cpi, MACROBLOCKD *x, TOKENEXTRA **t) A + vp8_block2above[24], L + vp8_block2left[24], cpi); plane_type = 0; - - if (x->mode_info_context->mbmi.mode != B_PRED && x->mode_info_context->mbmi.mode != SPLITMV) - x->mode_info_context->mbmi.dc_diff = 0; - else - x->mode_info_context->mbmi.dc_diff = 1; - - for (b = 0; b < 16; b++) stuff1st_order_b(x->block + b, t, plane_type, x->frame_type, A + vp8_block2above[b], diff --git a/vp8/encoder/x86/dct_sse2.asm b/vp8/encoder/x86/dct_sse2.asm index 652dd9804..3d52a5d54 100644 --- a/vp8/encoder/x86/dct_sse2.asm +++ b/vp8/encoder/x86/dct_sse2.asm @@ -33,6 +33,7 @@ %define input rcx %define output rdx %define pitch r8 + SAVE_XMM 7, u %else %define input rdi %define output rsi @@ -53,6 +54,7 @@ pop rbp %else %ifidn __OUTPUT_FORMAT__,x64 + RESTORE_XMM %endif %endif ret diff --git a/vp8/encoder/x86/encodeopt.asm b/vp8/encoder/x86/encodeopt.asm index c0f06bbbb..994629499 100644 --- a/vp8/encoder/x86/encodeopt.asm +++ b/vp8/encoder/x86/encodeopt.asm @@ -22,33 +22,33 @@ sym(vp8_block_error_xmm): ; end prologue mov rsi, arg(0) ;coeff_ptr - mov rdi, arg(1) ;dcoef_ptr - movdqa xmm3, [rsi] - movdqa xmm4, [rdi] - movdqa xmm5, [rsi+16] + movdqa xmm0, [rsi] + movdqa xmm1, [rdi] + + movdqa xmm2, [rsi+16] + movdqa xmm3, [rdi+16] - movdqa xmm6, [rdi+16] - psubw xmm3, xmm4 + psubw xmm0, xmm1 + psubw xmm2, xmm3 - psubw xmm5, xmm6 - pmaddwd xmm3, xmm3 - pmaddwd xmm5, xmm5 + pmaddwd xmm0, xmm0 + pmaddwd xmm2, xmm2 - paddd xmm3, xmm5 + paddd xmm0, xmm2 - pxor xmm7, xmm7 - movdqa xmm0, xmm3 + pxor xmm5, xmm5 + movdqa xmm1, xmm0 - punpckldq xmm0, xmm7 - punpckhdq xmm3, xmm7 + punpckldq xmm0, xmm5 + punpckhdq xmm1, xmm5 - paddd xmm0, xmm3 - movdqa xmm3, xmm0 + paddd xmm0, xmm1 + movdqa xmm1, xmm0 psrldq xmm0, 8 - paddd xmm0, xmm3 + paddd xmm0, xmm1 movq rax, xmm0 @@ -208,53 +208,54 @@ sym(vp8_mbblock_error_xmm_impl): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 3 + SAVE_XMM 6 push rsi push rdi ; end prolog mov rsi, arg(0) ;coeff_ptr - pxor xmm7, xmm7 + pxor xmm6, xmm6 mov rdi, arg(1) ;dcoef_ptr - pxor xmm2, xmm2 + pxor xmm4, xmm4 - movd xmm1, dword ptr arg(2) ;dc - por xmm1, xmm2 + movd xmm5, dword ptr arg(2) ;dc + por xmm5, xmm4 - pcmpeqw xmm1, xmm7 + pcmpeqw xmm5, xmm6 mov rcx, 16 mberror_loop: - movdqa xmm3, [rsi] - movdqa xmm4, [rdi] + movdqa xmm0, [rsi] + movdqa xmm1, [rdi] - movdqa xmm5, [rsi+16] - movdqa xmm6, [rdi+16] + movdqa xmm2, [rsi+16] + movdqa xmm3, [rdi+16] - psubw xmm5, xmm6 - pmaddwd xmm5, xmm5 + psubw xmm2, xmm3 + pmaddwd xmm2, xmm2 - psubw xmm3, xmm4 - pand xmm3, xmm1 + psubw xmm0, xmm1 + pand xmm0, xmm5 - pmaddwd xmm3, xmm3 + pmaddwd xmm0, xmm0 add rsi, 32 add rdi, 32 sub rcx, 1 - paddd xmm2, xmm5 + paddd xmm4, xmm2 - paddd xmm2, xmm3 + paddd xmm4, xmm0 jnz mberror_loop - movdqa xmm0, xmm2 - punpckldq xmm0, xmm7 + movdqa xmm0, xmm4 + punpckldq xmm0, xmm6 - punpckhdq xmm2, xmm7 - paddd xmm0, xmm2 + punpckhdq xmm4, xmm6 + paddd xmm0, xmm4 movdqa xmm1, xmm0 psrldq xmm0, 8 @@ -265,6 +266,7 @@ mberror_loop: pop rdi pop rsi ; begin epilog + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -342,7 +344,7 @@ sym(vp8_mbuverror_xmm_impl): mov rdi, arg(1) ;d_ptr mov rcx, 16 - pxor xmm7, xmm7 + pxor xmm3, xmm3 mbuverror_loop: @@ -352,7 +354,7 @@ mbuverror_loop: psubw xmm1, xmm2 pmaddwd xmm1, xmm1 - paddd xmm7, xmm1 + paddd xmm3, xmm1 add rsi, 16 add rdi, 16 @@ -361,7 +363,7 @@ mbuverror_loop: jnz mbuverror_loop pxor xmm0, xmm0 - movdqa xmm1, xmm7 + movdqa xmm1, xmm3 movdqa xmm2, xmm1 punpckldq xmm1, xmm0 diff --git a/vp8/encoder/x86/fwalsh_sse2.asm b/vp8/encoder/x86/fwalsh_sse2.asm index 39439f0d8..71efd5613 100644 --- a/vp8/encoder/x86/fwalsh_sse2.asm +++ b/vp8/encoder/x86/fwalsh_sse2.asm @@ -17,7 +17,7 @@ sym(vp8_short_walsh4x4_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 3 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm index 5e40dc7de..056b64c39 100644 --- a/vp8/encoder/x86/quantize_sse2.asm +++ b/vp8/encoder/x86/quantize_sse2.asm @@ -20,7 +20,7 @@ global sym(vp8_regular_quantize_b_sse2) sym(vp8_regular_quantize_b_sse2): push rbp mov rbp, rsp - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx %if ABI_IS_32BIT @@ -142,7 +142,7 @@ sym(vp8_regular_quantize_b_sse2): movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2] ; downshift by quant_shift[rc] - movsx ecx, WORD PTR[rax + %1 * 2] ; quant_shift_ptr[rc] + movsx cx, BYTE PTR[rax + %1] ; quant_shift_ptr[rc] sar edi, cl ; also sets Z bit je rq_zigzag_loop_%1 ; !y mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc] diff --git a/vp8/encoder/x86/quantize_sse4.asm b/vp8/encoder/x86/quantize_sse4.asm new file mode 100644 index 000000000..258899eed --- /dev/null +++ b/vp8/encoder/x86/quantize_sse4.asm @@ -0,0 +1,254 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license and patent +; grant that can be found in the LICENSE file in the root of the source +; tree. All contributing project authors may be found in the AUTHORS +; file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" +%include "asm_enc_offsets.asm" + + +; void vp8_regular_quantize_b_sse4 | arg +; (BLOCK *b, | 0 +; BLOCKD *d) | 1 + +global sym(vp8_regular_quantize_b_sse4) +sym(vp8_regular_quantize_b_sse4): + +%if ABI_IS_32BIT + push rbp + mov rbp, rsp + GET_GOT rbx + push rdi + push rsi + + ALIGN_STACK 16, rax + %define qcoeff 0 ; 32 + %define stack_size 32 + sub rsp, stack_size +%else + %ifidn __OUTPUT_FORMAT__,x64 + SAVE_XMM 8, u + push rdi + push rsi + %endif +%endif + ; end prolog + +%if ABI_IS_32BIT + mov rdi, arg(0) ; BLOCK *b + mov rsi, arg(1) ; BLOCKD *d +%else + %ifidn __OUTPUT_FORMAT__,x64 + mov rdi, rcx ; BLOCK *b + mov rsi, rdx ; BLOCKD *d + %else + ;mov rdi, rdi ; BLOCK *b + ;mov rsi, rsi ; BLOCKD *d + %endif +%endif + + mov rax, [rdi + vp8_block_coeff] + mov rcx, [rdi + vp8_block_zbin] + mov rdx, [rdi + vp8_block_round] + movd xmm7, [rdi + vp8_block_zbin_extra] + + ; z + movdqa xmm0, [rax] + movdqa xmm1, [rax + 16] + + ; duplicate zbin_oq_value + pshuflw xmm7, xmm7, 0 + punpcklwd xmm7, xmm7 + + movdqa xmm2, xmm0 + movdqa xmm3, xmm1 + + ; sz + psraw xmm0, 15 + psraw xmm1, 15 + + ; (z ^ sz) + pxor xmm2, xmm0 + pxor xmm3, xmm1 + + ; x = abs(z) + psubw xmm2, xmm0 + psubw xmm3, xmm1 + + ; zbin + movdqa xmm4, [rcx] + movdqa xmm5, [rcx + 16] + + ; *zbin_ptr + zbin_oq_value + paddw xmm4, xmm7 + paddw xmm5, xmm7 + + movdqa xmm6, xmm2 + movdqa xmm7, xmm3 + + ; x - (*zbin_ptr + zbin_oq_value) + psubw xmm6, xmm4 + psubw xmm7, xmm5 + + ; round + movdqa xmm4, [rdx] + movdqa xmm5, [rdx + 16] + + mov rax, [rdi + vp8_block_quant_shift] + mov rcx, [rdi + vp8_block_quant] + mov rdx, [rdi + vp8_block_zrun_zbin_boost] + + ; x + round + paddw xmm2, xmm4 + paddw xmm3, xmm5 + + ; quant + movdqa xmm4, [rcx] + movdqa xmm5, [rcx + 16] + + ; y = x * quant_ptr >> 16 + pmulhw xmm4, xmm2 + pmulhw xmm5, xmm3 + + ; y += x + paddw xmm2, xmm4 + paddw xmm3, xmm5 + + pxor xmm4, xmm4 +%if ABI_IS_32BIT + movdqa [rsp + qcoeff], xmm4 + movdqa [rsp + qcoeff + 16], xmm4 +%else + pxor xmm8, xmm8 +%endif + + ; quant_shift + movdqa xmm5, [rax] + + ; zrun_zbin_boost + mov rax, rdx + +%macro ZIGZAG_LOOP 5 + ; x + pextrw ecx, %4, %2 + + ; if (x >= zbin) + sub cx, WORD PTR[rdx] ; x - zbin + lea rdx, [rdx + 2] ; zbin_boost_ptr++ + jl rq_zigzag_loop_%1 ; x < zbin + + pextrw edi, %3, %2 ; y + + ; downshift by quant_shift[rc] + pextrb ecx, xmm5, %1 ; quant_shift[rc] + sar edi, cl ; also sets Z bit + je rq_zigzag_loop_%1 ; !y +%if ABI_IS_32BIT + mov WORD PTR[rsp + qcoeff + %1 *2], di +%else + pinsrw %5, edi, %2 ; qcoeff[rc] +%endif + mov rdx, rax ; reset to b->zrun_zbin_boost +rq_zigzag_loop_%1: +%endmacro +; in vp8_default_zig_zag1d order: see vp8/common/entropy.c +ZIGZAG_LOOP 0, 0, xmm2, xmm6, xmm4 +ZIGZAG_LOOP 1, 1, xmm2, xmm6, xmm4 +ZIGZAG_LOOP 4, 4, xmm2, xmm6, xmm4 +ZIGZAG_LOOP 8, 0, xmm3, xmm7, xmm8 +ZIGZAG_LOOP 5, 5, xmm2, xmm6, xmm4 +ZIGZAG_LOOP 2, 2, xmm2, xmm6, xmm4 +ZIGZAG_LOOP 3, 3, xmm2, xmm6, xmm4 +ZIGZAG_LOOP 6, 6, xmm2, xmm6, xmm4 +ZIGZAG_LOOP 9, 1, xmm3, xmm7, xmm8 +ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8 +ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8 +ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8 +ZIGZAG_LOOP 7, 7, xmm2, xmm6, xmm4 +ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8 +ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8 +ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8 + + mov rcx, [rsi + vp8_blockd_dequant] + mov rdi, [rsi + vp8_blockd_dqcoeff] + +%if ABI_IS_32BIT + movdqa xmm4, [rsp + qcoeff] + movdqa xmm5, [rsp + qcoeff + 16] +%else + %define xmm5 xmm8 +%endif + + ; y ^ sz + pxor xmm4, xmm0 + pxor xmm5, xmm1 + ; x = (y ^ sz) - sz + psubw xmm4, xmm0 + psubw xmm5, xmm1 + + ; dequant + movdqa xmm0, [rcx] + movdqa xmm1, [rcx + 16] + + mov rcx, [rsi + vp8_blockd_qcoeff] + + pmullw xmm0, xmm4 + pmullw xmm1, xmm5 + + ; store qcoeff + movdqa [rcx], xmm4 + movdqa [rcx + 16], xmm5 + + ; store dqcoeff + movdqa [rdi], xmm0 + movdqa [rdi + 16], xmm1 + + ; select the last value (in zig_zag order) for EOB + pxor xmm6, xmm6 + pcmpeqw xmm4, xmm6 + pcmpeqw xmm5, xmm6 + + packsswb xmm4, xmm5 + pshufb xmm4, [GLOBAL(zig_zag1d)] + pmovmskb edx, xmm4 + xor rdi, rdi + mov eax, -1 + xor dx, ax + bsr eax, edx + sub edi, edx + sar edi, 31 + add eax, 1 + and eax, edi + + mov [rsi + vp8_blockd_eob], eax + + ; begin epilog +%if ABI_IS_32BIT + add rsp, stack_size + pop rsp + + pop rsi + pop rdi + RESTORE_GOT + pop rbp +%else + %undef xmm5 + %ifidn __OUTPUT_FORMAT__,x64 + pop rsi + pop rdi + RESTORE_XMM + %endif +%endif + + ret + +SECTION_RODATA +align 16 +; vp8/common/entropy.c: vp8_default_zig_zag1d +zig_zag1d: + db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 diff --git a/vp8/encoder/x86/quantize_x86.h b/vp8/encoder/x86/quantize_x86.h index f09358061..bbe475f8c 100644 --- a/vp8/encoder/x86/quantize_x86.h +++ b/vp8/encoder/x86/quantize_x86.h @@ -51,4 +51,17 @@ extern prototype_quantize_block(vp8_fast_quantize_b_ssse3); #endif /* HAVE_SSSE3 */ + +#if HAVE_SSE4_1 +extern prototype_quantize_block(vp8_regular_quantize_b_sse4); + +#if !CONFIG_RUNTIME_CPU_DETECT + +#undef vp8_quantize_quantb +#define vp8_quantize_quantb vp8_regular_quantize_b_sse4 + +#endif /* !CONFIG_RUNTIME_CPU_DETECT */ + +#endif /* HAVE_SSE4_1 */ + #endif /* QUANTIZE_X86_H */ diff --git a/vp8/encoder/x86/sad_sse2.asm b/vp8/encoder/x86/sad_sse2.asm index cc6bc3cd9..04ee72f72 100644 --- a/vp8/encoder/x86/sad_sse2.asm +++ b/vp8/encoder/x86/sad_sse2.asm @@ -21,6 +21,7 @@ sym(vp8_sad16x16_wmt): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 + SAVE_XMM 6 push rsi push rdi ; end prolog @@ -34,7 +35,7 @@ sym(vp8_sad16x16_wmt): lea rcx, [rsi+rax*8] lea rcx, [rcx+rax*8] - pxor xmm7, xmm7 + pxor xmm6, xmm6 x16x16sad_wmt_loop: @@ -52,32 +53,33 @@ x16x16sad_wmt_loop: punpcklbw xmm1, xmm3 psadbw xmm0, xmm1 - movq xmm6, QWORD PTR [rsi+rax+8] + movq xmm2, QWORD PTR [rsi+rax+8] movq xmm3, QWORD PTR [rdi+rdx+8] lea rsi, [rsi+rax*2] lea rdi, [rdi+rdx*2] - punpcklbw xmm4, xmm6 + punpcklbw xmm4, xmm2 punpcklbw xmm5, xmm3 psadbw xmm4, xmm5 - paddw xmm7, xmm0 - paddw xmm7, xmm4 + paddw xmm6, xmm0 + paddw xmm6, xmm4 cmp rsi, rcx jne x16x16sad_wmt_loop - movq xmm0, xmm7 - psrldq xmm7, 8 + movq xmm0, xmm6 + psrldq xmm6, 8 - paddw xmm0, xmm7 + paddw xmm0, xmm6 movq rax, xmm0 ; begin epilog pop rdi pop rsi + RESTORE_XMM UNSHADOW_ARGS pop rbp ret diff --git a/vp8/encoder/x86/sad_sse3.asm b/vp8/encoder/x86/sad_sse3.asm index f0336ab17..2dbcc7dc9 100644 --- a/vp8/encoder/x86/sad_sse3.asm +++ b/vp8/encoder/x86/sad_sse3.asm @@ -33,14 +33,15 @@ movsxd rdx, dword ptr arg(3) ; ref_stride %else %ifidn __OUTPUT_FORMAT__,x64 + SAVE_XMM 7, u %define src_ptr rcx %define src_stride rdx %define ref_ptr r8 %define ref_stride r9 %define end_ptr r10 %define ret_var r11 - %define result_ptr [rsp+8+4*8] - %define max_err [rsp+8+4*8] + %define result_ptr [rsp+xmm_stack_space+8+4*8] + %define max_err [rsp+xmm_stack_space+8+4*8] %else %define src_ptr rdi %define src_stride rsi @@ -72,6 +73,7 @@ pop rbp %else %ifidn __OUTPUT_FORMAT__,x64 + RESTORE_XMM %endif %endif ret @@ -106,6 +108,7 @@ xchg rbx, rax %else %ifidn __OUTPUT_FORMAT__,x64 + SAVE_XMM 7, u %define src_ptr rcx %define src_stride rdx %define r0_ptr rsi @@ -113,7 +116,7 @@ %define r2_ptr r11 %define r3_ptr r8 %define ref_stride r9 - %define result_ptr [rsp+16+4*8] + %define result_ptr [rsp+xmm_stack_space+16+4*8] push rsi LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr @@ -151,6 +154,7 @@ %else %ifidn __OUTPUT_FORMAT__,x64 pop rsi + RESTORE_XMM %endif %endif ret diff --git a/vp8/encoder/x86/sad_ssse3.asm b/vp8/encoder/x86/sad_ssse3.asm index 69c5eaedc..6ecf08184 100644 --- a/vp8/encoder/x86/sad_ssse3.asm +++ b/vp8/encoder/x86/sad_ssse3.asm @@ -157,6 +157,7 @@ sym(vp8_sad16x16x3_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 push rsi push rdi push rcx @@ -253,6 +254,7 @@ vp8_sad16x16x3_ssse3_store_off: pop rcx pop rdi pop rsi + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -268,6 +270,7 @@ sym(vp8_sad16x8x3_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 5 + SAVE_XMM 7 push rsi push rdi push rcx @@ -361,6 +364,7 @@ vp8_sad16x8x3_ssse3_store_off: pop rcx pop rdi pop rsi + RESTORE_XMM UNSHADOW_ARGS pop rbp ret diff --git a/vp8/encoder/x86/ssim_opt.asm b/vp8/encoder/x86/ssim_opt.asm index c267cdb54..d5d267a69 100644 --- a/vp8/encoder/x86/ssim_opt.asm +++ b/vp8/encoder/x86/ssim_opt.asm @@ -16,12 +16,12 @@ paddusw xmm14, xmm4 ; sum_r movdqa xmm1, xmm3 pmaddwd xmm1, xmm1 - paddq xmm13, xmm1 ; sum_sq_s + paddd xmm13, xmm1 ; sum_sq_s movdqa xmm2, xmm4 pmaddwd xmm2, xmm2 - paddq xmm12, xmm2 ; sum_sq_r + paddd xmm12, xmm2 ; sum_sq_r pmaddwd xmm3, xmm4 - paddq xmm11, xmm3 ; sum_sxr + paddd xmm11, xmm3 ; sum_sxr %endmacro ; Sum across the register %1 starting with q words @@ -66,6 +66,7 @@ sym(vp8_ssim_parms_16x16_sse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 push rsi push rdi ; end prolog @@ -115,19 +116,20 @@ NextRow: SUM_ACROSS_Q xmm11 mov rdi,arg(4) - movq [rdi], xmm15; + movd [rdi], xmm15; mov rdi,arg(5) - movq [rdi], xmm14; + movd [rdi], xmm14; mov rdi,arg(6) - movq [rdi], xmm13; + movd [rdi], xmm13; mov rdi,arg(7) - movq [rdi], xmm12; + movd [rdi], xmm12; mov rdi,arg(8) - movq [rdi], xmm11; + movd [rdi], xmm11; ; begin epilog pop rdi pop rsi + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -154,6 +156,7 @@ sym(vp8_ssim_parms_8x8_sse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 + SAVE_XMM 15 push rsi push rdi ; end prolog @@ -174,11 +177,8 @@ sym(vp8_ssim_parms_8x8_sse3): NextRow2: ;grab source and reference pixels - movq xmm5, [rsi] - movq xmm6, [rdi] - - movdqa xmm3, xmm5 - movdqa xmm4, xmm6 + movq xmm3, [rsi] + movq xmm4, [rdi] punpcklbw xmm3, xmm0 ; low_s punpcklbw xmm4, xmm0 ; low_r @@ -197,19 +197,20 @@ NextRow2: SUM_ACROSS_Q xmm11 mov rdi,arg(4) - movq [rdi], xmm15; + movd [rdi], xmm15; mov rdi,arg(5) - movq [rdi], xmm14; + movd [rdi], xmm14; mov rdi,arg(6) - movq [rdi], xmm13; + movd [rdi], xmm13; mov rdi,arg(7) - movq [rdi], xmm12; + movd [rdi], xmm12; mov rdi,arg(8) - movq [rdi], xmm11; + movd [rdi], xmm11; ; begin epilog pop rdi pop rsi + RESTORE_XMM UNSHADOW_ARGS pop rbp ret diff --git a/vp8/encoder/x86/subtract_sse2.asm b/vp8/encoder/x86/subtract_sse2.asm index 3fb23d097..95888f6be 100644 --- a/vp8/encoder/x86/subtract_sse2.asm +++ b/vp8/encoder/x86/subtract_sse2.asm @@ -77,7 +77,7 @@ sym(vp8_subtract_mby_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm index 0127b012e..b777ef566 100644 --- a/vp8/encoder/x86/temporal_filter_apply_sse2.asm +++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm @@ -26,7 +26,7 @@ sym(vp8_temporal_filter_apply_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 8 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -164,10 +164,10 @@ temporal_filter_apply_load_finished: movdqa xmm6, [rdi+32] movdqa xmm7, [rdi+48] ; += modifier - paddw xmm4, xmm0 - paddw xmm5, xmm2 - paddw xmm6, xmm1 - paddw xmm7, xmm3 + paddd xmm4, xmm0 + paddd xmm5, xmm2 + paddd xmm6, xmm1 + paddd xmm7, xmm3 ; write back movdqa [rdi], xmm4 movdqa [rdi+16], xmm5 diff --git a/vp8/encoder/x86/variance_impl_sse2.asm b/vp8/encoder/x86/variance_impl_sse2.asm index c2c30deb2..5becc7344 100644 --- a/vp8/encoder/x86/variance_impl_sse2.asm +++ b/vp8/encoder/x86/variance_impl_sse2.asm @@ -85,6 +85,7 @@ sym(vp8_get16x16var_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 push rbx push rsi push rdi @@ -206,6 +207,7 @@ var16loop: pop rdi pop rsi pop rbx + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -223,6 +225,7 @@ sym(vp8_get16x16pred_error_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 4 + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -321,6 +324,7 @@ var16peloop: pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -341,6 +345,7 @@ sym(vp8_get8x8var_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -506,6 +511,7 @@ sym(vp8_get8x8var_sse2): pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -528,7 +534,7 @@ sym(vp8_filter_block2d_bil_var_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -805,6 +811,7 @@ sym(vp8_half_horiz_vert_variance8x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -906,6 +913,7 @@ vp8_half_horiz_vert_variance8x_h_1: pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -925,7 +933,7 @@ sym(vp8_half_horiz_vert_variance16x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1041,6 +1049,7 @@ sym(vp8_half_vert_variance8x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1127,6 +1136,7 @@ vp8_half_vert_variance8x_h_1: pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -1146,7 +1156,7 @@ sym(vp8_half_vert_variance16x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1254,6 +1264,7 @@ sym(vp8_half_horiz_variance8x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 + SAVE_XMM 7 GET_GOT rbx push rsi push rdi @@ -1338,6 +1349,7 @@ vp8_half_horiz_variance8x_h_1: pop rdi pop rsi RESTORE_GOT + RESTORE_XMM UNSHADOW_ARGS pop rbp ret @@ -1357,7 +1369,7 @@ sym(vp8_half_horiz_variance16x_h_sse2): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 7 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi diff --git a/vp8/encoder/x86/variance_impl_ssse3.asm b/vp8/encoder/x86/variance_impl_ssse3.asm index 3c0fef9b5..a582f8dc5 100644 --- a/vp8/encoder/x86/variance_impl_ssse3.asm +++ b/vp8/encoder/x86/variance_impl_ssse3.asm @@ -34,7 +34,7 @@ sym(vp8_filter_block2d_bil_var_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 9 - SAVE_XMM + SAVE_XMM 7 GET_GOT rbx push rsi push rdi diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index 8f2774b7a..b01319fa4 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -271,9 +271,7 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.sad8x16x3 = vp8_sad8x16x3_sse3; cpi->rtcd.variance.sad8x8x3 = vp8_sad8x8x3_sse3; cpi->rtcd.variance.sad4x4x3 = vp8_sad4x4x3_sse3; -#if !(CONFIG_REALTIME_ONLY) cpi->rtcd.search.full_search = vp8_full_search_sadx3; -#endif cpi->rtcd.variance.sad16x16x4d = vp8_sad16x16x4d_sse3; cpi->rtcd.variance.sad16x8x4d = vp8_sad16x8x4d_sse3; cpi->rtcd.variance.sad8x16x4d = vp8_sad8x16x4d_sse3; @@ -314,9 +312,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.sad8x16x8 = vp8_sad8x16x8_sse4; cpi->rtcd.variance.sad8x8x8 = vp8_sad8x8x8_sse4; cpi->rtcd.variance.sad4x4x8 = vp8_sad4x4x8_sse4; -#if !(CONFIG_REALTIME_ONLY) cpi->rtcd.search.full_search = vp8_full_search_sadx8; -#endif + + cpi->rtcd.quantize.quantb = vp8_regular_quantize_b_sse4; } #endif diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 2a2f0cfad..5f5ba3a35 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -89,6 +89,7 @@ VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/subpixel_mmx.asm VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/loopfilter_mmx.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/idctllm_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_sse2.asm +VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/recon_wrapper_sse2.c VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/subpixel_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/loopfilter_sse2.asm VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/iwalsh_sse2.asm diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c index 2622738ec..db60bfe4f 100644 --- a/vp8/vp8_cx_iface.c +++ b/vp8/vp8_cx_iface.c @@ -137,8 +137,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, const vpx_codec_enc_cfg_t *cfg, const struct vp8_extracfg *vp8_cfg) { - RANGE_CHECK(cfg, g_w, 1, 16384); - RANGE_CHECK(cfg, g_h, 1, 16384); + RANGE_CHECK(cfg, g_w, 1, 16383); /* 14 bits available */ + RANGE_CHECK(cfg, g_h, 1, 16383); /* 14 bits available */ RANGE_CHECK(cfg, g_timebase.den, 1, 1000000000); RANGE_CHECK(cfg, g_timebase.num, 1, cfg->g_timebase.den); RANGE_CHECK_HI(cfg, g_profile, 3); @@ -151,7 +151,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(cfg, g_lag_in_frames, 0); #endif RANGE_CHECK(cfg, rc_end_usage, VPX_VBR, VPX_CQ); - RANGE_CHECK_HI(cfg, rc_undershoot_pct, 100); + RANGE_CHECK_HI(cfg, rc_undershoot_pct, 1000); + RANGE_CHECK_HI(cfg, rc_overshoot_pct, 1000); RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100); RANGE_CHECK(cfg, kf_mode, VPX_KF_DISABLED, VPX_KF_AUTO); //RANGE_CHECK_BOOL(cfg, g_delete_firstpassfile); @@ -174,16 +175,13 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, "or kf_max_dist instead."); RANGE_CHECK_BOOL(vp8_cfg, enable_auto_alt_ref); + RANGE_CHECK(vp8_cfg, cpu_used, -16, 16); + #if !(CONFIG_REALTIME_ONLY) RANGE_CHECK(vp8_cfg, encoding_mode, VP8_BEST_QUALITY_ENCODING, VP8_REAL_TIME_ENCODING); - RANGE_CHECK(vp8_cfg, cpu_used, -16, 16); RANGE_CHECK_HI(vp8_cfg, noise_sensitivity, 6); #else RANGE_CHECK(vp8_cfg, encoding_mode, VP8_REAL_TIME_ENCODING, VP8_REAL_TIME_ENCODING); - - if (!((vp8_cfg->cpu_used >= -16 && vp8_cfg->cpu_used <= -4) || (vp8_cfg->cpu_used >= 4 && vp8_cfg->cpu_used <= 16))) - ERROR("cpu_used out of range [-16..-4] or [4..16]"); - RANGE_CHECK(vp8_cfg, noise_sensitivity, 0, 0); #endif @@ -197,8 +195,6 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, #if !(CONFIG_REALTIME_ONLY) if (cfg->g_pass == VPX_RC_LAST_PASS) { - int mb_r = (cfg->g_h + 15) / 16; - int mb_c = (cfg->g_w + 15) / 16; size_t packet_sz = sizeof(FIRSTPASS_STATS); int n_packets = cfg->rc_twopass_stats_in.sz / packet_sz; FIRSTPASS_STATS *stats; @@ -309,6 +305,7 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, } oxcf->target_bandwidth = cfg.rc_target_bitrate; + oxcf->rc_max_intra_bitrate_pct = cfg.rc_max_intra_bitrate_pct; oxcf->best_allowed_q = cfg.rc_min_quantizer; oxcf->worst_allowed_q = cfg.rc_max_quantizer; @@ -316,7 +313,7 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, oxcf->fixed_q = -1; oxcf->under_shoot_pct = cfg.rc_undershoot_pct; - //oxcf->over_shoot_pct = cfg.rc_overshoot_pct; + oxcf->over_shoot_pct = cfg.rc_overshoot_pct; oxcf->maximum_buffer_size = cfg.rc_buf_sz; oxcf->starting_buffer_level = cfg.rc_buf_initial_sz; @@ -362,6 +359,7 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf, printf("key_freq: %d\n", oxcf->key_freq); printf("end_usage: %d\n", oxcf->end_usage); printf("under_shoot_pct: %d\n", oxcf->under_shoot_pct); + printf("over_shoot_pct: %d\n", oxcf->over_shoot_pct); printf("starting_buffer_level: %d\n", oxcf->starting_buffer_level); printf("optimal_buffer_level: %d\n", oxcf->optimal_buffer_level); printf("maximum_buffer_size: %d\n", oxcf->maximum_buffer_size); @@ -1088,11 +1086,11 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] = {0}, /* rc_twopass_stats_in */ #endif 256, /* rc_target_bandwidth */ - + 0, /* rc_max_intra_bitrate_pct */ 4, /* rc_min_quantizer */ 63, /* rc_max_quantizer */ - 95, /* rc_undershoot_pct */ - 200, /* rc_overshoot_pct */ + 100, /* rc_undershoot_pct */ + 100, /* rc_overshoot_pct */ 6000, /* rc_max_buffer_size */ 4000, /* rc_buffer_initial_size; */ diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index 8037f9adb..c17837164 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -52,6 +52,8 @@ VP8_CX_SRCS-yes += encoder/encodeintra.h VP8_CX_SRCS-yes += encoder/encodemb.h VP8_CX_SRCS-yes += encoder/encodemv.h VP8_CX_SRCS-yes += encoder/firstpass.h +VP8_CX_SRCS-yes += encoder/lookahead.c +VP8_CX_SRCS-yes += encoder/lookahead.h VP8_CX_SRCS-yes += encoder/mcomp.h VP8_CX_SRCS-yes += encoder/modecosts.h VP8_CX_SRCS-yes += encoder/onyx_int.h @@ -115,6 +117,7 @@ VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_ssse3.c VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/variance_impl_ssse3.asm VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm +VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt.asm |