18 files changed, 455 insertions, 422 deletions
diff --git a/vp8/encoder/arm/arm_csystemdependent.c b/vp8/encoder/arm/arm_csystemdependent.c
index db079d5ed..75e3a53d2 100644
--- a/vp8/encoder/arm/arm_csystemdependent.c
+++ b/vp8/encoder/arm/arm_csystemdependent.c
@@ -54,8 +54,6 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
         /*cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;*/
 
         /*cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_c;
-        cpi->rtcd.variance.get8x8var             = vp8_get8x8var_c;
-        cpi->rtcd.variance.get16x16var           = vp8_get16x16var_c;;
         cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_c;*/
 
         /*cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
@@ -104,8 +102,6 @@ void vp8_arch_arm_encoder_init(VP8_COMP *cpi)
         /*cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;*/
 
         cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_neon;
-        /*cpi->rtcd.variance.get8x8var             = vp8_get8x8var_c;
-        cpi->rtcd.variance.get16x16var           = vp8_get16x16var_c;*/
         cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_neon;
 
         cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_neon;
diff --git a/vp8/encoder/arm/variance_arm.h b/vp8/encoder/arm/variance_arm.h
index ad0d37193..cbfc753b3 100644
--- a/vp8/encoder/arm/variance_arm.h
+++ b/vp8/encoder/arm/variance_arm.h
@@ -84,8 +84,6 @@ extern prototype_variance(vp8_variance_halfpixvar16x16_hv_neon);
 //extern prototype_getmbss(vp8_get_mb_ss_c);
 extern prototype_variance(vp8_mse16x16_neon);
 extern prototype_get16x16prederror(vp8_get16x16pred_error_neon);
-//extern prototype_variance2(vp8_get8x8var_c);
-//extern prototype_variance2(vp8_get16x16var_c);
 extern prototype_get16x16prederror(vp8_get4x4sse_cs_neon);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
@@ -152,12 +150,6 @@ extern prototype_get16x16prederror(vp8_get4x4sse_cs_neon);
 #undef  vp8_variance_get16x16prederror
 #define vp8_variance_get16x16prederror vp8_get16x16pred_error_neon
 
-//#undef  vp8_variance_get8x8var
-//#define vp8_variance_get8x8var vp8_get8x8var_c
-
-//#undef  vp8_variance_get16x16var
-//#define vp8_variance_get16x16var vp8_get16x16var_c
-
 #undef  vp8_variance_get4x4sse_cs
 #define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_neon
 #endif
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index ecbf0265f..5cae99f41 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -50,6 +50,7 @@ void vp8_build_block_offsets(MACROBLOCK *x);
 void vp8_setup_block_ptrs(MACROBLOCK *x);
 int vp8cx_encode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t, int recon_yoffset, int recon_uvoffset);
 int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t);
+static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x );
 
 #ifdef MODE_STATS
 unsigned int inter_y_modes[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
@@ -84,8 +85,6 @@ static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )
 {
     unsigned int act;
     unsigned int sse;
-    int sum;
-
     /* TODO: This could also be done over smaller areas (8x8), but that would
      *  require extensive changes elsewhere, as lambda is assumed to be fixed
      *  over an entire MB in most of the code.
@@ -93,14 +92,9 @@ static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )
      *  lambda using a non-linear combination (e.g., the smallest, or second
      *  smallest, etc.).
      */
-    VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var)(x->src.y_buffer,
-                    x->src.y_stride, VP8_VAR_OFFS, 0, &sse, &sum);
-
-    /* This requires a full 32 bits of precision. */
-    act = (sse<<8) - sum*sum;
-
-    /* Drop 4 to give us some headroom to work with. */
-    act = (act + 8) >> 4;
+    act =     VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)(x->src.y_buffer,
+                    x->src.y_stride, VP8_VAR_OFFS, 0, &sse);
+    act = act<<4;
 
     /* If the region is flat, lower the activity some more. */
     if (act < 8<<12)
@@ -110,70 +104,122 @@ static unsigned int tt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )
 }
 
 // Stub for alternative experimental activity measures.
-static unsigned int alt_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )
+extern int encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred);
+static unsigned int alt_activity_measure( VP8_COMP *cpi,
+                                          MACROBLOCK *x, int use_dc_pred )
 {
-    unsigned int mb_activity = VP8_ACTIVITY_AVG_MIN;
-
-    x->e_mbd.mode_info_context->mbmi.mode = DC_PRED;
-    x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
-    x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
-
-    vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
-
-    mb_activity = VARIANCE_INVOKE(&cpi->rtcd.variance, getmbss)(x->src_diff);
-
-    return mb_activity;
+    return encode_intra(cpi,x, use_dc_pred);
 }
 
 
 // Measure the activity of the current macroblock
 // What we measure here is TBD so abstracted to this function
-static unsigned int mb_activity_measure( VP8_COMP *cpi, MACROBLOCK *x )
+#define ALT_ACT_MEASURE 1
+static unsigned int mb_activity_measure( VP8_COMP *cpi, MACROBLOCK *x,
+                                  int mb_row, int mb_col)
 {
     unsigned int mb_activity;
 
-    if  ( 1 )
+    if  ( ALT_ACT_MEASURE )
     {
-        // Original activity measure from Tim T's code.
-        mb_activity = tt_activity_measure( cpi, x );
+        int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
+
+        // Or use an alternative.
+        mb_activity = alt_activity_measure( cpi, x, use_dc_pred );
     }
     else
     {
-        // Or use and alternative.
-        mb_activity = alt_activity_measure( cpi, x );
+        // Original activity measure from Tim T's code.
+        mb_activity = tt_activity_measure( cpi, x );
     }
 
+    if ( mb_activity < VP8_ACTIVITY_AVG_MIN )
+        mb_activity = VP8_ACTIVITY_AVG_MIN;
+
     return mb_activity;
 }
 
 // Calculate an "average" mb activity value for the frame
+#define ACT_MEDIAN 0
 static void calc_av_activity( VP8_COMP *cpi, INT64 activity_sum )
 {
+#if ACT_MEDIAN
+    // Find median: Simple n^2 algorithm for experimentation
+    {
+        unsigned int median;
+        unsigned int i,j;
+        unsigned int * sortlist;
+        unsigned int tmp;
+
+        // Create a list to sort to
+        CHECK_MEM_ERROR(sortlist,
+                        vpx_calloc(sizeof(unsigned int),
+                        cpi->common.MBs));
+
+        // Copy map to sort list
+        vpx_memcpy( sortlist, cpi->mb_activity_map,
+                    sizeof(unsigned int) * cpi->common.MBs );
+
+
+        // Ripple each value down to its correct position
+        for ( i = 1; i < cpi->common.MBs; i ++ )
+        {
+            for ( j = i; j > 0; j -- )
+            {
+                if ( sortlist[j] < sortlist[j-1] )
+                {
+                    // Swap values
+                    tmp = sortlist[j-1];
+                    sortlist[j-1] = sortlist[j];
+                    sortlist[j] = tmp;
+                }
+                else
+                    break;
+            }
+        }
+
+        // Median: rounded mean of the two middle entries (same entry if MBs is odd).
+        median = ( 1 + sortlist[(cpi->common.MBs - 1) >> 1] +
+                   sortlist[cpi->common.MBs >> 1] ) >> 1;
+
+        cpi->activity_avg = median;
+
+        vpx_free(sortlist);
+    }
+#else
     // Simple mean for now
     cpi->activity_avg = (unsigned int)(activity_sum/cpi->common.MBs);
+#endif
+
     if (cpi->activity_avg < VP8_ACTIVITY_AVG_MIN)
         cpi->activity_avg = VP8_ACTIVITY_AVG_MIN;
+
+    // Experimental code: return fixed value normalized for several clips
+    if  ( ALT_ACT_MEASURE )
+        cpi->activity_avg = 100000;
 }
 
+#define USE_ACT_INDEX   0
 #define OUTPUT_NORM_ACT_STATS   0
-// Calculate a normalized activity value for each mb
-static void calc_norm_activity( VP8_COMP *cpi, MACROBLOCK *x )
+
+#if USE_ACT_INDEX
+// Calculate an activity index for each mb
+static void calc_activity_index( VP8_COMP *cpi, MACROBLOCK *x )
 {
     VP8_COMMON *const cm = & cpi->common;
     int mb_row, mb_col;
 
-    unsigned int act;
-    unsigned int a;
-    unsigned int b;
+    INT64 act;
+    INT64 a;
+    INT64 b;
 
 #if OUTPUT_NORM_ACT_STATS
     FILE *f = fopen("norm_act.stt", "a");
-    fprintf(f, "\n");
+    fprintf(f, "\n%12d\n", cpi->activity_avg );
 #endif
 
     // Reset pointers to start of activity map
     x->mb_activity_ptr = cpi->mb_activity_map;
-    x->mb_norm_activity_ptr = cpi->mb_norm_activity_map;
 
     // Calculate normalized mb activity number.
     for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
@@ -185,25 +231,19 @@ static void calc_norm_activity( VP8_COMP *cpi, MACROBLOCK *x )
             act = *(x->mb_activity_ptr);
 
             // Calculate a normalized activity number
-            a = act + 2*cpi->activity_avg;
-            b = 2*act + cpi->activity_avg;
+            a = act + 4*cpi->activity_avg;
+            b = 4*act + cpi->activity_avg;
 
             if ( b >= a )
-                *(x->mb_norm_activity_ptr) = (int)((b + (a>>1))/a);
+                *(x->activity_ptr) = (int)((b + (a>>1))/a) - 1;
             else
-                *(x->mb_norm_activity_ptr) = -(int)((a + (b>>1))/b);
-
-            if ( *(x->mb_norm_activity_ptr) == 0 )
-            {
-                *(x->mb_norm_activity_ptr) = 1;
-            }
+                *(x->activity_ptr) = 1 - (int)((a + (b>>1))/b);
 
 #if OUTPUT_NORM_ACT_STATS
-            fprintf(f, " %6d", *(x->mb_norm_activity_ptr));
+            fprintf(f, " %6d", *(x->mb_activity_ptr));
 #endif
             // Increment activity map pointers
             x->mb_activity_ptr++;
-            x->mb_norm_activity_ptr++;
         }
 
 #if OUTPUT_NORM_ACT_STATS
@@ -217,33 +257,44 @@ static void calc_norm_activity( VP8_COMP *cpi, MACROBLOCK *x )
 #endif
 
 }
-
+#endif
 
 // Loop through all MBs. Note activity of each, average activity and
 // calculate a normalized activity for each
 static void build_activity_map( VP8_COMP *cpi )
 {
     MACROBLOCK *const x = & cpi->mb;
+    MACROBLOCKD *xd = &x->e_mbd;
     VP8_COMMON *const cm = & cpi->common;
 
+#if ALT_ACT_MEASURE
+    YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
+    int recon_yoffset;
+    int recon_y_stride = new_yv12->y_stride;
+#endif
+
     int mb_row, mb_col;
     unsigned int mb_activity;
     INT64 activity_sum = 0;
 
-    // Initialise source buffer pointer
-    x->src = *cpi->Source;
-
-    // Set pointer to start of activity map
-    x->mb_activity_ptr = cpi->mb_activity_map;
-
     // for each macroblock row in image
     for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
     {
+#if ALT_ACT_MEASURE
+        // reset above block coeffs
+        xd->up_available = (mb_row != 0);
+        recon_yoffset = (mb_row * recon_y_stride * 16);
+#endif
         // for each macroblock col in image
         for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
         {
+#if ALT_ACT_MEASURE
+            xd->dst.y_buffer = new_yv12->y_buffer + recon_yoffset;
+            xd->left_available = (mb_col != 0);
+            recon_yoffset += 16;
+#endif
             // measure activity
-            mb_activity = mb_activity_measure( cpi, x );
+            mb_activity = mb_activity_measure( cpi, x, mb_row, mb_col );
 
             // Keep frame sum
             activity_sum += mb_activity;
@@ -258,49 +309,50 @@ static void build_activity_map( VP8_COMP *cpi )
             x->src.y_buffer += 16;
         }
 
+
         // adjust to the next row of mbs
         x->src.y_buffer += 16 * x->src.y_stride - 16 * cm->mb_cols;
+
+#if ALT_ACT_MEASURE
+        //extend the recon for intra prediction
+        vp8_extend_mb_row(new_yv12, xd->dst.y_buffer + 16,
+                          xd->dst.u_buffer + 8, xd->dst.v_buffer + 8);
+#endif
+
     }
 
     // Calculate an "average" MB activity
     calc_av_activity(cpi, activity_sum);
 
-    // Calculate a normalized activity number of each mb
-    calc_norm_activity( cpi, x );
+#if USE_ACT_INDEX
+    // Calculate an activity index number of each mb
+    calc_activity_index( cpi, x );
+#endif
+
 }
 
-// Activity masking based on Tim T's original code
+// Macroblock activity masking
 void vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x)
 {
-
-    unsigned int a;
-    unsigned int b;
-    unsigned int act = *(x->mb_activity_ptr);
+#if USE_ACT_INDEX
+    x->rdmult += *(x->mb_activity_ptr) * (x->rdmult >> 2);
+    x->errorperbit = x->rdmult/x->rddiv;
+#else
+    INT64 a;
+    INT64 b;
+    INT64 act = *(x->mb_activity_ptr);
 
     // Apply the masking to the RD multiplier.
-    a = act + 2*cpi->activity_avg;
-    b = 2*act + cpi->activity_avg;
+    a = act + (2*cpi->activity_avg);
+    b = (2*act) + cpi->activity_avg;
 
-    //tmp = (unsigned int)(((INT64)tmp*b + (a>>1))/a);
     x->rdmult = (unsigned int)(((INT64)x->rdmult*b + (a>>1))/a);
+    x->errorperbit = x->rdmult/x->rddiv;
 
-    // For now now zbin adjustment on mode choice
-    x->act_zbin_adj = 0;
-}
-
-// Stub function to use a normalized activity measure stored at mb level.
-void vp8_norm_activity_masking(VP8_COMP *cpi, MACROBLOCK *x)
-{
-    int norm_act;
-
-    norm_act = *(x->mb_norm_activity_ptr);
-    if (norm_act > 0)
-        x->rdmult = norm_act * (x->rdmult);
-    else
-        x->rdmult = -(x->rdmult / norm_act);
+#endif
 
-    // For now now zbin adjustment on mode choice
-    x->act_zbin_adj = 0;
+    // Activity based Zbin adjustment
+    adjust_act_zbin(cpi, x);
 }
 
 static
@@ -356,7 +408,6 @@ void encode_mb_row(VP8_COMP *cpi,
 
     // Set the mb activity pointer to the start of the row.
     x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
-    x->mb_norm_activity_ptr = &cpi->mb_norm_activity_map[map_index];
 
     // for each macroblock col in image
     for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
@@ -476,7 +527,6 @@ void encode_mb_row(VP8_COMP *cpi,
 
         // Increment the activity mask pointers.
         x->mb_activity_ptr++;
-        x->mb_norm_activity_ptr++;
 
         /* save the block info */
         for (i = 0; i < 16; i++)
@@ -525,6 +575,65 @@ void encode_mb_row(VP8_COMP *cpi,
 #endif
 }
 
+void init_encode_frame_mb_context(VP8_COMP *cpi)
+{
+    MACROBLOCK *const x = & cpi->mb;
+    VP8_COMMON *const cm = & cpi->common;
+    MACROBLOCKD *const xd = & x->e_mbd;
+
+    // GF active flags data structure
+    x->gf_active_ptr = (signed char *)cpi->gf_active_flags;
+
+    // Activity map pointer
+    x->mb_activity_ptr = cpi->mb_activity_map;
+
+    x->vector_range = 32;
+
+    x->act_zbin_adj = 0;
+
+    x->partition_info = x->pi;
+
+    xd->mode_info_context = cm->mi;
+    xd->mode_info_stride = cm->mode_info_stride;
+
+    xd->frame_type = cm->frame_type;
+
+    xd->frames_since_golden = cm->frames_since_golden;
+    xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;
+
+    // reset intra mode contexts
+    if (cm->frame_type == KEY_FRAME)
+        vp8_init_mbmode_probs(cm);
+
+    // Copy data over into macro block data structures.
+    x->src = * cpi->Source;
+    xd->pre = cm->yv12_fb[cm->lst_fb_idx];
+    xd->dst = cm->yv12_fb[cm->new_fb_idx];
+
+    // set up frame for intra coded blocks
+    vp8_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);
+
+    vp8_build_block_offsets(x);
+
+    vp8_setup_block_dptrs(&x->e_mbd);
+
+    vp8_setup_block_ptrs(x);
+
+    xd->mode_info_context->mbmi.mode = DC_PRED;
+    xd->mode_info_context->mbmi.uv_mode = DC_PRED;
+
+    xd->left_context = &cm->left_context;
+
+    vp8_zero(cpi->count_mb_ref_frame_usage)
+    vp8_zero(cpi->ymode_count)
+    vp8_zero(cpi->uv_mode_count)
+
+    x->mvc = cm->fc.mvc;
+
+    vpx_memset(cm->above_context, 0,
+               sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
+}
+
 void vp8_encode_frame(VP8_COMP *cpi)
 {
     int mb_row;
@@ -536,6 +645,17 @@ void vp8_encode_frame(VP8_COMP *cpi)
     int segment_counts[MAX_MB_SEGMENTS];
     int totalrate;
 
+    vpx_memset(segment_counts, 0, sizeof(segment_counts));
+    totalrate = 0;
+
+    if (cpi->compressor_speed == 2)
+    {
+        if (cpi->oxcf.cpu_used < 0)
+            cpi->Speed = -(cpi->oxcf.cpu_used);
+        else
+            vp8_auto_select_speed(cpi);
+    }
+
     // Functions setup for all frame types so we can use MC in AltRef
     if (cm->mcomp_filter_type == SIXTAP)
     {
@@ -560,10 +680,6 @@ void vp8_encode_frame(VP8_COMP *cpi)
                                       &cpi->common.rtcd.subpix, bilinear16x16);
     }
 
-    x->gf_active_ptr = (signed char *)cpi->gf_active_flags;     // Point to base of GF active flags data structure
-
-    x->vector_range = 32;
-
     // Reset frame count of inter 0,0 motion vector useage.
     cpi->inter_zz_count = 0;
 
@@ -574,89 +690,34 @@ void vp8_encode_frame(VP8_COMP *cpi)
     cpi->skip_true_count = 0;
     cpi->skip_false_count = 0;
 
-    x->act_zbin_adj = 0;
-
 #if 0
     // Experimental code
     cpi->frame_distortion = 0;
     cpi->last_mb_distortion = 0;
 #endif
 
-    totalrate = 0;
-
-    x->partition_info = x->pi;
-
-    xd->mode_info_context = cm->mi;
-    xd->mode_info_stride = cm->mode_info_stride;
-
-    xd->frame_type = cm->frame_type;
-
-    xd->frames_since_golden = cm->frames_since_golden;
-    xd->frames_till_alt_ref_frame = cm->frames_till_alt_ref_frame;
     vp8_zero(cpi->MVcount);
-    // vp8_zero( Contexts)
     vp8_zero(cpi->coef_counts);
 
-    // reset intra mode contexts
-    if (cm->frame_type == KEY_FRAME)
-        vp8_init_mbmode_probs(cm);
-
-
     vp8cx_frame_init_quantizer(cpi);
 
-    if (cpi->compressor_speed == 2)
-    {
-        if (cpi->oxcf.cpu_used < 0)
-            cpi->Speed = -(cpi->oxcf.cpu_used);
-        else
-            vp8_auto_select_speed(cpi);
-    }
+    vp8_initialize_rd_consts(cpi,
+                             vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
 
-    vp8_initialize_rd_consts(cpi, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
     vp8cx_initialize_me_consts(cpi, cm->base_qindex);
 
-    // Copy data over into macro block data sturctures.
-    x->src = * cpi->Source;
-    xd->pre = cm->yv12_fb[cm->lst_fb_idx];
-    xd->dst = cm->yv12_fb[cm->new_fb_idx];
-
-    // set up frame new frame for intra coded blocks
-
-    vp8_setup_intra_recon(&cm->yv12_fb[cm->new_fb_idx]);
-
-    vp8_build_block_offsets(x);
-
-    vp8_setup_block_dptrs(&x->e_mbd);
-
-    vp8_setup_block_ptrs(x);
-
-    xd->mode_info_context->mbmi.mode = DC_PRED;
-    xd->mode_info_context->mbmi.uv_mode = DC_PRED;
-
-    xd->left_context = &cm->left_context;
-
-    vp8_zero(cpi->count_mb_ref_frame_usage)
-    vp8_zero(cpi->ymode_count)
-    vp8_zero(cpi->uv_mode_count)
-
-    x->mvc = cm->fc.mvc;
-
-    vpx_memset(cm->above_context, 0, sizeof(ENTROPY_CONTEXT_PLANES) * cm->mb_cols);
-
     if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
     {
-        if(1)
-        {
-            // Build a frame level activity map
-            build_activity_map(cpi);
-        }
+        // Initialize encode frame context.
+        init_encode_frame_mb_context(cpi);
 
-        // Reset various MB pointers.
-        x->src = *cpi->Source;
-        x->mb_activity_ptr = cpi->mb_activity_map;
-        x->mb_norm_activity_ptr = cpi->mb_norm_activity_map;
+        // Build a frame level activity map
+        build_activity_map(cpi);
     }
 
+    // Re-initialize the encode frame context.
+    init_encode_frame_mb_context(cpi);
+
     {
         struct vpx_usec_timer  emr_timer;
         vpx_usec_timer_start(&emr_timer);
@@ -997,27 +1058,24 @@ static void sum_intra_stats(VP8_COMP *cpi, MACROBLOCK *x)
 
 // Experimental stub function to create a per MB zbin adjustment based on
 // some previously calculated measure of MB activity.
-void adjust_act_zbin( VP8_COMP *cpi, int rate, MACROBLOCK *x )
+static void adjust_act_zbin( VP8_COMP *cpi, MACROBLOCK *x )
 {
-    INT64 act;
+#if USE_ACT_INDEX
+    x->act_zbin_adj = *(x->mb_activity_ptr);
+#else
     INT64 a;
     INT64 b;
+    INT64 act = *(x->mb_activity_ptr);
 
-    // Read activity from the map
-    act = (INT64)(*(x->mb_activity_ptr));
-
-    // Calculate a zbin adjustment for this mb
+    // Derive the zbin adjustment from activity relative to the average.
     a = act + 4*cpi->activity_avg;
     b = 4*act + cpi->activity_avg;
-    if ( b > a )
-        //x->act_zbin_adj = (char)((b * 8) / a) - 8;
-        x->act_zbin_adj = 8;
-    else
-        x->act_zbin_adj = 0;
-
-    // Tmp force to 0 to disable.
-    x->act_zbin_adj = 0;
 
+    if ( act > cpi->activity_avg )
+        x->act_zbin_adj = (int)(((INT64)b + (a>>1))/a) - 1;
+    else
+        x->act_zbin_adj = 1 - (int)(((INT64)a + (b>>1))/b);
+#endif
 }
 
 int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
@@ -1041,55 +1099,29 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
 
         Error4x4 = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, &dist4x4, Error16x16);
 
-        rate += (Error4x4 < Error16x16) ? rate4x4 : rate16x16;
+        if (Error4x4 < Error16x16)
+        {
+            x->e_mbd.mode_info_context->mbmi.mode = B_PRED;
+            rate += rate4x4;
+        }
+        else
+        {
+            rate += rate16x16;
+        }
 
         if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
         {
-            adjust_act_zbin( cpi, rate, x );
+            adjust_act_zbin( cpi, x );
             vp8_update_zbin_extra(cpi, x);
         }
     }
     else
-    {
-        int rate2, best_distortion;
-        MB_PREDICTION_MODE mode, best_mode = DC_PRED;
-        int this_rd;
-        Error16x16 = INT_MAX;
-
-        vp8_pick_intra_mbuv_mode(x);
-
-        for (mode = DC_PRED; mode <= TM_PRED; mode ++)
-        {
-            int distortion2;
+        vp8_pick_intra_mode(cpi, x, &rate);
 
-            x->e_mbd.mode_info_context->mbmi.mode = mode;
-            RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
-                (&x->e_mbd);
-            distortion2 = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)(x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16);
-            rate2  = x->mbmode_cost[x->e_mbd.frame_type][mode];
-            this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
-
-            if (Error16x16 > this_rd)
-            {
-                Error16x16 = this_rd;
-                best_mode = mode;
-                best_distortion = distortion2;
-            }
-        }
-        x->e_mbd.mode_info_context->mbmi.mode = best_mode;
-
-        Error4x4 = vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate2, &best_distortion);
-    }
-
-    if (Error4x4 < Error16x16)
-    {
-        x->e_mbd.mode_info_context->mbmi.mode = B_PRED;
+    if (x->e_mbd.mode_info_context->mbmi.mode == B_PRED)
         vp8_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
-    }
     else
-    {
         vp8_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
-    }
 
     vp8_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
     sum_intra_stats(cpi, x);
@@ -1163,7 +1195,7 @@ int vp8cx_encode_inter_macroblock
     if(cpi->oxcf.tuning == VP8_TUNE_SSIM)
     {
         // Adjust the zbin based on this MB rate.
-        adjust_act_zbin( cpi, rate, x );
+        adjust_act_zbin( cpi, x );
     }
 
 #if 0
@@ -1193,11 +1225,10 @@ int vp8cx_encode_inter_macroblock
     {
         // Experimental code. Special case for gf and arf zeromv modes.
         // Increase zbin size to supress noise
+        cpi->zbin_mode_boost = 0;
         if (cpi->zbin_mode_boost_enabled)
         {
-            if ( xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME )
-                 cpi->zbin_mode_boost = 0;
-            else
+            if ( xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME )
             {
                 if (xd->mode_info_context->mbmi.mode == ZEROMV)
                 {
@@ -1212,9 +1243,6 @@ int vp8cx_encode_inter_macroblock
                     cpi->zbin_mode_boost = MV_ZBIN_BOOST;
             }
         }
-        else
-            cpi->zbin_mode_boost = 0;
-
         vp8_update_zbin_extra(cpi, x);
     }
 
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index 665b2d5dc..420ed8eff 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -114,8 +114,6 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
 
                 // Set the mb activity pointer to the start of the row.
                 x->mb_activity_ptr = &cpi->mb_activity_map[map_index];
-                x->mb_norm_activity_ptr =
-                    &cpi->mb_norm_activity_map[map_index];
 
                 // for each macroblock col in image
                 for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
@@ -230,7 +228,6 @@ THREAD_FUNCTION thread_encoding_proc(void *p_data)
 
                     // Increment the activity mask pointers.
                     x->mb_activity_ptr++;
-                    x->mb_norm_activity_ptr++;
 
                     /* save the block info */
                     for (i = 0; i < 16; i++)
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 7b4869534..bc1863471 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -81,7 +81,7 @@ static const int cq_level[QINDEX_RANGE] =
 
 static void find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame);
 
-static int encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
+int encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred)
 {
 
     int i;
@@ -243,33 +243,58 @@ static int frame_max_bits(VP8_COMP *cpi)
     int max_bits;
 
     // For CBR we need to also consider buffer fullness.
-    // If we are running below the optimal level then we need to gradually tighten up on max_bits.
     if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
     {
-        double buffer_fullness_ratio = (double)cpi->buffer_level / DOUBLE_DIVIDE_CHECK((double)cpi->oxcf.optimal_buffer_level);
+        max_bits = 2 * cpi->av_per_frame_bandwidth;
+        max_bits -= cpi->buffered_av_per_frame_bandwidth;
+        max_bits *= ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0);
+    }
+    // VBR
+    else
+    {
+        // For VBR base this on the bits and frames left plus the two_pass_vbrmax_section rate passed in by the user
+        max_bits = (int)(((double)cpi->twopass.bits_left / (cpi->twopass.total_stats->count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
+    }
+
+    // Trap case where we are out of bits
+    if (max_bits < 0)
+        max_bits = 0;
 
-        // For CBR base this on the target average bits per frame plus the maximum sedction rate passed in by the user
-        max_bits = (int)(cpi->av_per_frame_bandwidth * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
+    return max_bits;
+}
 
-        // If our buffer is below the optimum level
-        if (buffer_fullness_ratio < 1.0)
-        {
-            // The lower of max_bits / 4 or cpi->av_per_frame_bandwidth / 4.
-            int min_max_bits = ((cpi->av_per_frame_bandwidth >> 2) < (max_bits >> 2)) ? cpi->av_per_frame_bandwidth >> 2 : max_bits >> 2;
 
-            max_bits = (int)(max_bits * buffer_fullness_ratio);
+static int gf_group_max_bits(VP8_COMP *cpi)
+{
+    // Max allocation for a golden frame group
+    int max_bits;
 
-            if (max_bits < min_max_bits)
-                max_bits = min_max_bits;       // Lowest value we will set ... which should allow the buffer to refil.
+    // For CBR we need to also consider buffer fullness.
+    if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
+    {
+        max_bits = cpi->av_per_frame_bandwidth * cpi->baseline_gf_interval;
+        if (max_bits > cpi->oxcf.optimal_buffer_level)
+        {
+            max_bits -= cpi->oxcf.optimal_buffer_level;
+            max_bits += cpi->buffer_level;
         }
+        else
+        {
+            max_bits -= (cpi->buffered_av_per_frame_bandwidth
+                         - cpi->av_per_frame_bandwidth)
+                        * cpi->baseline_gf_interval;
+        }
+
+        max_bits *= ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0);
     }
-    // VBR
     else
     {
         // For VBR base this on the bits and frames left plus the two_pass_vbrmax_section rate passed in by the user
         max_bits = (int)(((double)cpi->twopass.bits_left / (cpi->twopass.total_stats->count - (double)cpi->common.current_video_frame)) * ((double)cpi->oxcf.two_pass_vbrmax_section / 100.0));
+        max_bits *=  cpi->baseline_gf_interval;
     }
 
+
     // Trap case where we are out of bits
     if (max_bits < 0)
         max_bits = 0;
@@ -1362,7 +1387,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
     double abs_mv_in_out_accumulator = 0.0;
     double mod_err_per_mb_accumulator = 0.0;
 
-    int max_bits = frame_max_bits(cpi);     // Max for a single frame
+    int max_group_bits;
 
     unsigned int allow_alt_ref =
                     cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames;
@@ -1715,8 +1740,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
     cpi->twopass.gf_group_bits = (cpi->twopass.gf_group_bits < 0) ? 0 : (cpi->twopass.gf_group_bits > cpi->twopass.kf_group_bits) ? cpi->twopass.kf_group_bits : cpi->twopass.gf_group_bits;
 
     // Clip cpi->twopass.gf_group_bits based on user supplied data rate variability limit (cpi->oxcf.two_pass_vbrmax_section)
-    if (cpi->twopass.gf_group_bits > max_bits * cpi->baseline_gf_interval)
-        cpi->twopass.gf_group_bits = max_bits * cpi->baseline_gf_interval;
+    max_group_bits = gf_group_max_bits(cpi);
+    if (cpi->twopass.gf_group_bits > max_group_bits)
+        cpi->twopass.gf_group_bits = max_group_bits;
 
     // Reset the file position
     reset_fpf_position(cpi, start_pos);
@@ -1810,13 +1836,6 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
             }
         }
 
-        // Apply an additional limit for CBR
-        if (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)
-        {
-            if (cpi->twopass.gf_bits > (cpi->buffer_level >> 1))
-                cpi->twopass.gf_bits = cpi->buffer_level >> 1;
-        }
-
         // Dont allow a negative value for gf_bits
         if (cpi->twopass.gf_bits < 0)
             cpi->twopass.gf_bits = 0;
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index 9af3f183a..37885dadf 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -68,8 +68,6 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
     cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_c;
 
     cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_c;
-    cpi->rtcd.variance.get8x8var             = vp8_get8x8var_c;
-    cpi->rtcd.variance.get16x16var           = vp8_get16x16var_c;;
     cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_c;
 
     cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_c;
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 0296d9290..bebc9417d 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1477,6 +1477,7 @@ static void init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
     cpi->rolling_actual_bits          = cpi->av_per_frame_bandwidth;
     cpi->long_rolling_target_bits     = cpi->av_per_frame_bandwidth;
     cpi->long_rolling_actual_bits     = cpi->av_per_frame_bandwidth;
+    cpi->buffered_av_per_frame_bandwidth = cpi->av_per_frame_bandwidth;
 
     cpi->total_actual_bits            = 0;
     cpi->total_target_vs_actual       = 0;
@@ -1572,7 +1573,7 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
         break;
     }
 
-    if (cpi->pass == 0)
+    if (cpi->pass == 0 && cpi->oxcf.end_usage != USAGE_STREAM_FROM_SERVER)
         cpi->auto_worst_q = 1;
 
     cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
@@ -3453,7 +3454,8 @@ static void encode_frame_to_data_rate
     // For CBR if the buffer reaches its maximum level then we can no longer
     // save up bits for later frames so we might as well use them up
     // on the current frame.
-    if ((cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
+    if (cpi->pass == 2
+        && (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) &&
         (cpi->buffer_level >= cpi->oxcf.optimal_buffer_level) && cpi->buffered_mode)
     {
         int Adjustment = cpi->active_worst_quality / 4;       // Max adjustment is 1/4
@@ -3544,6 +3546,9 @@ static void encode_frame_to_data_rate
         }
         else
         {
+            if(cpi->pass != 2)
+                Q = cpi->avg_frame_qindex;
+
             cpi->active_best_quality = inter_minq[Q];
 
             // For the constant/constrained quality mode we dont want
@@ -3845,15 +3850,16 @@ static void encode_frame_to_data_rate
             (cpi->active_worst_quality < cpi->worst_quality)      &&
             (cpi->projected_frame_size > frame_over_shoot_limit))
         {
-            int over_size_percent = ((cpi->projected_frame_size - frame_over_shoot_limit) * 100) / frame_over_shoot_limit;
+            /* raise active_worst_quality to the first index (capped at
+             * worst_quality) whose inter_minq[], i.e. the corresponding
+             * active_best_quality, is >= the current active_worst_quality + 1
+             */
+            int i;
 
-            // If so is there any scope for relaxing it
-            while ((cpi->active_worst_quality < cpi->worst_quality) && (over_size_percent > 0))
-            {
-                cpi->active_worst_quality++;
-                top_index = cpi->active_worst_quality;
-                over_size_percent = (int)(over_size_percent * 0.96);        // Assume 1 qstep = about 4% on frame size.
-            }
+            for(i=cpi->active_worst_quality; i<cpi->worst_quality; i++)
+                if(inter_minq[i] >= cpi->active_worst_quality + 1)
+                    break;
+            cpi->active_worst_quality = i;
 
             // If we have updated the active max Q do not call vp8_update_rate_correction_factors() this loop.
             active_worst_qchanged = TRUE;
@@ -4241,10 +4247,9 @@ static void encode_frame_to_data_rate
 
     // Update the buffer level variable.
     // Non-viewable frames are a special case and are treated as pure overhead.
-    if ( !cm->show_frame )
-        cpi->bits_off_target -= cpi->projected_frame_size;
-    else
-        cpi->bits_off_target += cpi->av_per_frame_bandwidth - cpi->projected_frame_size;
+    if ( cm->show_frame )
+        cpi->bits_off_target += cpi->av_per_frame_bandwidth;
+    cpi->bits_off_target -= cpi->projected_frame_size;
 
     // Rolling monitors of whether we are over or underspending used to help regulate min and Max Q in two pass.
     cpi->rolling_target_bits = ((cpi->rolling_target_bits * 3) + cpi->this_frame_target + 2) / 4;
@@ -4258,7 +4263,33 @@ static void encode_frame_to_data_rate
     // Debug stats
     cpi->total_target_vs_actual += (cpi->this_frame_target - cpi->projected_frame_size);
 
-    cpi->buffer_level = cpi->bits_off_target;
+    // Update the buffered average bitrate
+    {
+        long long numerator;
+
+        numerator = cpi->oxcf.maximum_buffer_size
+                    - cpi->buffered_av_per_frame_bandwidth
+                    + cpi->projected_frame_size;
+        numerator *= cpi->buffered_av_per_frame_bandwidth;
+        cpi->buffered_av_per_frame_bandwidth = numerator
+                                               / cpi->oxcf.maximum_buffer_size;
+    }
+
+    {
+        long long tmp = (long long)cpi->buffered_av_per_frame_bandwidth
+                        * cpi->oxcf.maximum_buffer_size
+                        / cpi->av_per_frame_bandwidth;
+        cpi->buffer_level = cpi->oxcf.maximum_buffer_size
+                            - tmp
+                            + cpi->oxcf.optimal_buffer_level;
+    }
+
+    // Accumulate overshoot error.
+    cpi->accumulated_overshoot +=
+        (cpi->projected_frame_size > cpi->av_per_frame_bandwidth)
+        ? cpi->projected_frame_size - cpi->av_per_frame_bandwidth
+        : 0;
+
 
     // Update bits left to the kf and gf groups to account for overshoot or undershoot on these frames
     if (cm->frame_type == KEY_FRAME)
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 663786004..c460b9da9 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -47,8 +47,8 @@
 #define MIN_THRESHMULT  32
 #define MAX_THRESHMULT  512
 
-#define GF_ZEROMV_ZBIN_BOOST 24
-#define LF_ZEROMV_ZBIN_BOOST 12
+#define GF_ZEROMV_ZBIN_BOOST 12
+#define LF_ZEROMV_ZBIN_BOOST 6
 #define MV_ZBIN_BOOST        4
 #define ZBIN_OQ_MAX 192
 
@@ -351,6 +351,10 @@ typedef struct VP8_COMP
     int per_frame_bandwidth;          // Current section per frame bandwidth target
     int av_per_frame_bandwidth;        // Average frame size target for clip
     int min_frame_bandwidth;          // Minimum allocation that should be used for any frame
+    int buffered_av_per_frame_bandwidth; // Average bitrate over the last buffer
+    int buffered_av_per_frame_bandwidth_rem; // Average bitrate remainder
+    int accumulated_overshoot;           // Accumulated # of bits spent > target
+
     int inter_frame_target;
     double output_frame_rate;
     long long last_time_stamp_seen;
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 456059cf8..661ecb79a 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -221,7 +221,7 @@ static int pick_intra4x4block(
 }
 
 
-int vp8_pick_intra4x4mby_modes
+static int pick_intra4x4mby_modes
 (
     const VP8_ENCODER_RTCD *rtcd,
     MACROBLOCK *mb,
@@ -275,7 +275,7 @@ int vp8_pick_intra4x4mby_modes
     return error;
 }
 
-void vp8_pick_intra_mbuv_mode(MACROBLOCK *mb)
+static void pick_intra_mbuv_mode(MACROBLOCK *mb)
 {
 
     MACROBLOCKD *x = &mb->e_mbd;
@@ -659,10 +659,9 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
         switch (this_mode)
         {
         case B_PRED:
-            // Pass best so far to vp8_pick_intra4x4mby_modes to use as breakout
+            // Pass best so far to pick_intra4x4mby_modes to use as breakout
             distortion2 = *returndistortion;
-            vp8_pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x,
-                                         &rate, &distortion2);
+            pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate, &distortion2);
 
             if (distortion2 == INT_MAX)
             {
@@ -956,7 +955,7 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
     if (best_mbmode.mode <= B_PRED)
     {
         /* set mode_info_context->mbmi.uv_mode */
-        vp8_pick_intra_mbuv_mode(x);
+        pick_intra_mbuv_mode(x);
     }
 
     if (x->e_mbd.mode_info_context->mbmi.mode == B_PRED)
@@ -968,3 +967,59 @@ void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset,
     }
     update_mvcount(cpi, &x->e_mbd, &frame_best_ref_mv[xd->mode_info_context->mbmi.ref_frame]);
 }
+
+
+/* Choose the luma intra prediction mode for a macroblock.
+ *
+ * The chroma mode is picked first by pick_intra_mbuv_mode(). Every 16x16
+ * luma mode from DC_PRED to TM_PRED is then scored with RDCOST() on its
+ * mode cost and get16x16prederror() result, and the cheapest one is
+ * compared with the 4x4 (B_PRED) result of pick_intra4x4mby_modes().
+ * The winner is stored in x->e_mbd.mode_info_context->mbmi.mode.
+ *
+ * cpi    encoder instance supplying the rtcd function tables
+ * x      macroblock being coded; its mbmi.mode is overwritten
+ * rate_  out: rate of the mode that was finally selected
+ */
+void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate_)
+{
+    int error4x4, error16x16 = INT_MAX;
+    int rate, best_rate = 0;
+    int distortion, best_distortion = INT_MAX;
+    MB_PREDICTION_MODE mode, best_mode = DC_PRED;
+    int this_rd;
+
+    pick_intra_mbuv_mode(x);
+
+    for (mode = DC_PRED; mode <= TM_PRED; mode ++)
+    {
+        x->e_mbd.mode_info_context->mbmi.mode = mode;
+        RECON_INVOKE(&cpi->common.rtcd.recon, build_intra_predictors_mby)
+            (&x->e_mbd);
+        distortion = VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16prederror)
+            (x->src.y_buffer, x->src.y_stride, x->e_mbd.predictor, 16);
+        rate = x->mbmode_cost[x->e_mbd.frame_type][mode];
+        this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
+
+        if (error16x16 > this_rd)
+        {
+            error16x16 = this_rd;
+            best_mode = mode;
+            best_distortion = distortion;
+            best_rate = rate;
+        }
+    }
+    x->e_mbd.mode_info_context->mbmi.mode = best_mode;
+
+    /* Hand the best 16x16 distortion so far to the 4x4 search. */
+    error4x4 = pick_intra4x4mby_modes(IF_RTCD(&cpi->rtcd), x, &rate,
+                                      &best_distortion);
+    if (error4x4 < error16x16)
+    {
+        x->e_mbd.mode_info_context->mbmi.mode = B_PRED;
+        /* Only report the 4x4 rate when B_PRED is actually chosen. */
+        best_rate = rate;
+    }
+
+    *rate_ = best_rate;
+}
diff --git a/vp8/encoder/pickinter.h b/vp8/encoder/pickinter.h
index f96fc5376..a0103d165 100644
--- a/vp8/encoder/pickinter.h
+++ b/vp8/encoder/pickinter.h
@@ -14,7 +14,6 @@
 #include "vpx_ports/config.h"
 #include "vp8/common/onyxc_int.h"
 
-extern int vp8_pick_intra4x4mby_modes(const VP8_ENCODER_RTCD *, MACROBLOCK *mb, int *Rate, int *Distortion);
-extern void vp8_pick_intra_mbuv_mode(MACROBLOCK *mb);
 extern void vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int recon_uvoffset, int *returnrate, int *returndistortion, int *returnintra);
+extern void vp8_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate);
 #endif
diff --git a/vp8/encoder/ppc/csystemdependent.c b/vp8/encoder/ppc/csystemdependent.c
index 8dfd2a543..0dd097f84 100644
--- a/vp8/encoder/ppc/csystemdependent.c
+++ b/vp8/encoder/ppc/csystemdependent.c
@@ -49,8 +49,6 @@ void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc,
 void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
 
 unsigned int (*vp8_get16x16pred_error)(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
-unsigned int (*vp8_get8x8var)(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-unsigned int (*vp8_get16x16var)(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
 unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride);
 
 // c imports
@@ -89,8 +87,6 @@ extern sub_pixel_variance_function sub_pixel_variance16x16_c;
 
 extern unsigned int vp8_get_mb_ss_c(short *);
 extern unsigned int vp8_get16x16pred_error_c(unsigned char *src_ptr, int src_stride, unsigned char *ref_ptr, int ref_stride);
-extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get16x16var_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride, unsigned int *SSE, int *Sum);
 extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int  source_stride, unsigned char *ref_ptr, int  recon_stride);
 
 // ppc
@@ -150,8 +146,6 @@ void vp8_cmachine_specific_config(void)
 
     vp8_get_mb_ss                 = vp8_get_mb_ss_c;
     vp8_get16x16pred_error       = vp8_get16x16pred_error_c;
-    vp8_get8x8var               = vp8_get8x8var_ppc;
-    vp8_get16x16var             = vp8_get16x16var_ppc;
     vp8_get4x4sse_cs            = vp8_get4x4sse_cs_c;
 
     vp8_sad16x16                = vp8_sad16x16_ppc;
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index 54c394dfc..73e1437b5 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -605,10 +605,10 @@ static void calc_gf_params(VP8_COMP *cpi)
 
 static void calc_pframe_target_size(VP8_COMP *cpi)
 {
-    int min_frame_target;
+    int min_frame_target, max_frame_target;
     int Adjustment;
 
-    min_frame_target = 0;
+    min_frame_target = 1;
 
     if (cpi->pass == 2)
     {
@@ -616,10 +616,19 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
 
         if (min_frame_target < (cpi->av_per_frame_bandwidth >> 5))
             min_frame_target = cpi->av_per_frame_bandwidth >> 5;
+
+        max_frame_target = INT_MAX;
     }
-    else if (min_frame_target < cpi->per_frame_bandwidth / 4)
-        min_frame_target = cpi->per_frame_bandwidth / 4;
+    else
+    {
+        if (min_frame_target < cpi->per_frame_bandwidth / 4)
+            min_frame_target = cpi->per_frame_bandwidth / 4;
 
+        /* Don't allow the target to completely deplete the buffer. */
+        max_frame_target = cpi->buffer_level + cpi->av_per_frame_bandwidth;
+        if(max_frame_target < min_frame_target)
+            max_frame_target = min_frame_target;
+    }
 
     // Special alt reference frame case
     if (cpi->common.refresh_alt_ref_frame)
@@ -1112,6 +1121,32 @@ static void calc_pframe_target_size(VP8_COMP *cpi)
 
         }
     }
+
+    if (cpi->pass==0 && cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER){
+        /* determine the accumulated error to apply to this frame. Apply
+         * more of the error when we've been undershooting, less when
+         * we've been overshooting
+         */
+        long long adjust;
+        int bitrate_error;
+
+        bitrate_error = cpi->av_per_frame_bandwidth
+                        - cpi->buffered_av_per_frame_bandwidth;
+
+        adjust = cpi->accumulated_overshoot;
+        adjust *= cpi->av_per_frame_bandwidth + bitrate_error;
+        adjust /= cpi->oxcf.maximum_buffer_size;
+        if (adjust > (cpi->this_frame_target - min_frame_target))
+            adjust = (cpi->this_frame_target - min_frame_target);
+        else if (adjust < 0)
+            adjust = 0;
+
+        cpi->this_frame_target -= adjust;
+        cpi->accumulated_overshoot -= adjust;
+    }
+
+    if(cpi->this_frame_target > max_frame_target)
+        cpi->this_frame_target = max_frame_target;
 }
 
 
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index d4d6cd7c7..ebfc438f7 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -155,51 +155,50 @@ static int rd_iifactor [ 32 ] =  {    4,   4,   3,   2,   1,   0,   0,   0,
                                       0,   0,   0,   0,   0,   0,   0,   0,
                                  };
 
-
 /* values are now correlated to quantizer */
 static int sad_per_bit16lut[QINDEX_RANGE] =
 {
+    2,  2,  2,  2,  2,  2,  2,  2,
+    2,  2,  2,  2,  2,  2,  2,  2,
+    3,  3,  3,  3,  3,  3,  3,  3,
+    3,  3,  3,  3,  3,  3,  4,  4,
+    4,  4,  4,  4,  4,  4,  4,  4,
+    4,  4,  5,  5,  5,  5,  5,  5,
     5,  5,  5,  5,  5,  5,  6,  6,
-    6,  6,  6,  6,  6,  7,  7,  7,
-    7,  7,  7,  7,  8,  8,  8,  8,
-    8,  8,  8,  8,  8,  8,  9,  9,
-    9,  9,  9,  9, 10, 10, 10, 10,
-    10, 10, 11, 11, 11, 11, 11, 11,
-    12, 12, 12, 12, 12, 12, 12, 13,
-    13, 13, 13, 13, 13, 14, 14, 14,
-    14, 14, 15, 15, 15, 15, 15, 15,
-    16, 16, 16, 16, 16, 16, 17, 17,
-    17, 17, 17, 17, 17, 18, 18, 18,
-    18, 18, 19, 19, 19, 19, 19, 19,
-    20, 20, 20, 21, 21, 21, 21, 22,
-    22, 22, 23, 23, 23, 24, 24, 24,
-    25, 25, 26, 26, 27, 27, 27, 28,
-    28, 28, 29, 29, 30, 30, 31, 31
+    6,  6,  6,  6,  6,  6,  6,  6,
+    6,  6,  7,  7,  7,  7,  7,  7,
+    7,  7,  7,  7,  7,  7,  8,  8,
+    8,  8,  8,  8,  8,  8,  8,  8,
+    8,  8,  9,  9,  9,  9,  9,  9,
+    9,  9,  9,  9,  9,  9,  10, 10,
+    10, 10, 10, 10, 10, 10, 11, 11,
+    11, 11, 11, 11, 12, 12, 12, 12,
+    12, 12, 13, 13, 13, 13, 14, 14
 };
 static int sad_per_bit4lut[QINDEX_RANGE] =
 {
-    5,  5,  5,  5,  5,  5,  7,  7,
+    2,  2,  2,  2,  2,  2,  3,  3,
+    3,  3,  3,  3,  3,  3,  3,  3,
+    3,  3,  3,  3,  4,  4,  4,  4,
+    4,  4,  4,  4,  4,  4,  5,  5,
+    5,  5,  5,  5,  6,  6,  6,  6,
+    6,  6,  6,  6,  6,  6,  6,  6,
+    7,  7,  7,  7,  7,  7,  7,  7,
     7,  7,  7,  7,  7,  8,  8,  8,
-    8,  8,  8,  8,  10, 10, 10, 10,
-    10, 10, 10, 10, 10, 10, 11, 11,
-    11, 11, 11, 11, 13, 13, 13, 13,
-    13, 13, 14, 14, 14, 14, 14, 14,
-    16, 16, 16, 16, 16, 16, 16, 17,
-    17, 17, 17, 17, 17, 19, 19, 19,
-    19, 19, 20, 20, 20, 20, 20, 20,
-    22, 22, 22, 22, 22, 22, 23, 23,
-    23, 23, 23, 23, 23, 25, 25, 25,
-    25, 25, 26, 26, 26, 26, 26, 26,
-    28, 28, 28, 29, 29, 29, 29, 31,
-    31, 31, 32, 32, 32, 34, 34, 34,
-    35, 35, 37, 37, 38, 38, 38, 40,
-    40, 40, 41, 41, 43, 43, 44, 44,
+    8,  8,  9,  9,  9,  9,  9,  9,
+    10, 10, 10, 10, 10, 10, 10, 10,
+    11, 11, 11, 11, 11, 11, 11, 11,
+    12, 12, 12, 12, 12, 12, 12, 12,
+    13, 13, 13, 13, 13, 13, 13, 14,
+    14, 14, 14, 14, 15, 15, 15, 15,
+    16, 16, 16, 16, 17, 17, 17, 18,
+    18, 18, 19, 19, 19, 20, 20, 20,
 };
 
 void vp8cx_initialize_me_consts(VP8_COMP *cpi, int QIndex)
 {
-    cpi->mb.sadperbit16 =  sad_per_bit16lut[QIndex]/2;
-    cpi->mb.sadperbit4  =  sad_per_bit4lut[QIndex]/2;
+    cpi->mb.sadperbit16 =  sad_per_bit16lut[QIndex];
+    cpi->mb.sadperbit4  =  sad_per_bit4lut[QIndex];
 }
 
 void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
@@ -2182,29 +2181,28 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
             }
             else if (x->encode_breakout)
             {
-                int sum;
                 unsigned int sse;
+                unsigned int var;
                 int threshold = (xd->block[0].dequant[1]
                             * xd->block[0].dequant[1] >>4);
 
                 if(threshold < x->encode_breakout)
                     threshold = x->encode_breakout;
 
-                VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var)
-                    (x->src.y_buffer, x->src.y_stride,
-                     x->e_mbd.predictor, 16, &sse, &sum);
+                var = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
+                        (x->src.y_buffer, x->src.y_stride,
+                        x->e_mbd.predictor, 16, &sse);
 
                 if (sse < threshold)
                 {
-                    // Check u and v to make sure skip is ok
-                    int sse2 = 0;
+                     unsigned int q2dc = xd->block[24].dequant[0];
                     /* If theres is no codeable 2nd order dc
                        or a very small uniform pixel change change */
-                    if (abs(sum) < (xd->block[24].dequant[0]<<2)||
-                        ((sum * sum>>8) > sse && abs(sum) <128))
+                    if ((sse - var < q2dc * q2dc >>4) ||
+                        (sse /2 > var && sse-var < 64))
                     {
-                        sse2 = VP8_UVSSE(x, IF_RTCD(&cpi->rtcd.variance));
-
+                        // Check u and v to make sure skip is ok
+                        int sse2=  VP8_UVSSE(x, IF_RTCD(&cpi->rtcd.variance));
                         if (sse2 * 2 < threshold)
                         {
                             x->skip = 1;
diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h
index 0d7d977d7..d52aa1b1d 100644
--- a/vp8/encoder/variance.h
+++ b/vp8/encoder/variance.h
@@ -313,16 +313,6 @@ extern prototype_variance(vp8_variance_mse16x16);
 #endif
 extern prototype_get16x16prederror(vp8_variance_get16x16prederror);
 
-#ifndef vp8_variance_get8x8var
-#define vp8_variance_get8x8var vp8_get8x8var_c
-#endif
-extern prototype_variance2(vp8_variance_get8x8var);
-
-#ifndef vp8_variance_get16x16var
-#define vp8_variance_get16x16var vp8_get16x16var_c
-#endif
-extern prototype_variance2(vp8_variance_get16x16var);
-
 #ifndef vp8_variance_get4x4sse_cs
 #define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_c
 #endif
@@ -377,8 +367,6 @@ typedef struct
     vp8_variance_fn_t        mse16x16;
 
     vp8_get16x16prederror_fn_t get16x16prederror;
-    vp8_variance2_fn_t       get8x8var;
-    vp8_variance2_fn_t       get16x16var;
     vp8_get16x16prederror_fn_t get4x4sse_cs;
 
     vp8_sad_multi_fn_t       sad16x16x3;
diff --git a/vp8/encoder/variance_c.c b/vp8/encoder/variance_c.c
index ede07c8db..c7b9c2209 100644
--- a/vp8/encoder/variance_c.c
+++ b/vp8/encoder/variance_c.c
@@ -61,40 +61,6 @@ static void variance(
     }
 }
 
-unsigned int
-vp8_get8x8var_c
-(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *SSE,
-    int *Sum
-)
-{
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 8, 8, SSE, Sum);
-    return (*SSE - (((*Sum) * (*Sum)) >> 6));
-}
-
-unsigned int
-vp8_get16x16var_c
-(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *SSE,
-    int *Sum
-)
-{
-
-    variance(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16, SSE, Sum);
-    return (*SSE - (((*Sum) * (*Sum)) >> 8));
-
-}
-
-
 
 unsigned int vp8_variance16x16_c(
     const unsigned char *src_ptr,
diff --git a/vp8/encoder/x86/variance_mmx.c b/vp8/encoder/x86/variance_mmx.c
index 4a89868c2..1b05571f1 100644
--- a/vp8/encoder/x86/variance_mmx.c
+++ b/vp8/encoder/x86/variance_mmx.c
@@ -84,36 +84,6 @@ extern unsigned int vp8_get16x16pred_error_mmx
     int ref_stride
 );
 
-unsigned int vp8_get16x16var_mmx(
-    const unsigned char *src_ptr,
-    int  source_stride,
-    const unsigned char *ref_ptr,
-    int  recon_stride,
-    unsigned int *SSE,
-    int *SUM
-)
-{
-    unsigned int sse0, sse1, sse2, sse3, var;
-    int sum0, sum1, sum2, sum3, avg;
-
-
-    vp8_get8x8var_mmx(src_ptr, source_stride, ref_ptr, recon_stride, &sse0, &sum0) ;
-    vp8_get8x8var_mmx(src_ptr + 8, source_stride, ref_ptr + 8, recon_stride, &sse1, &sum1);
-    vp8_get8x8var_mmx(src_ptr + 8 * source_stride, source_stride, ref_ptr + 8 * recon_stride, recon_stride, &sse2, &sum2) ;
-    vp8_get8x8var_mmx(src_ptr + 8 * source_stride + 8, source_stride, ref_ptr + 8 * recon_stride + 8, recon_stride, &sse3, &sum3);
-
-    var = sse0 + sse1 + sse2 + sse3;
-    avg = sum0 + sum1 + sum2 + sum3;
-
-    *SSE = var;
-    *SUM = avg;
-    return (var - ((avg * avg) >> 8));
-
-}
-
-
-
-
 
 unsigned int vp8_variance4x4_mmx(
     const unsigned char *src_ptr,
diff --git a/vp8/encoder/x86/variance_x86.h b/vp8/encoder/x86/variance_x86.h
index 77e05e1e8..4a640d7aa 100644
--- a/vp8/encoder/x86/variance_x86.h
+++ b/vp8/encoder/x86/variance_x86.h
@@ -43,7 +43,6 @@ extern prototype_getmbss(vp8_get_mb_ss_mmx);
 extern prototype_variance(vp8_mse16x16_mmx);
 extern prototype_get16x16prederror(vp8_get16x16pred_error_mmx);
 extern prototype_variance2(vp8_get8x8var_mmx);
-extern prototype_variance2(vp8_get16x16var_mmx);
 extern prototype_get16x16prederror(vp8_get4x4sse_cs_mmx);
 
 #if !CONFIG_RUNTIME_CPU_DETECT
@@ -113,12 +112,6 @@ extern prototype_get16x16prederror(vp8_get4x4sse_cs_mmx);
 #undef  vp8_variance_get16x16prederror
 #define vp8_variance_get16x16prederror vp8_get16x16pred_error_mmx
 
-#undef  vp8_variance_get8x8var
-#define vp8_variance_get8x8var vp8_get8x8var_mmx
-
-#undef  vp8_variance_get16x16var
-#define vp8_variance_get16x16var vp8_get16x16var_mmx
-
 #undef  vp8_variance_get4x4sse_cs
 #define vp8_variance_get4x4sse_cs vp8_get4x4sse_cs_mmx
 
@@ -219,12 +212,6 @@ extern prototype_variance2(vp8_get16x16var_sse2);
 #undef  vp8_variance_get16x16prederror
 #define vp8_variance_get16x16prederror vp8_get16x16pred_error_sse2
 
-#undef  vp8_variance_get8x8var
-#define vp8_variance_get8x8var vp8_get8x8var_sse2
-
-#undef  vp8_variance_get16x16var
-#define vp8_variance_get16x16var vp8_get16x16var_sse2
-
 #endif
 #endif
 
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 378b14066..f33c74a1c 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -176,8 +176,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_mmx;
 
         cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_mmx;
-        cpi->rtcd.variance.get8x8var             = vp8_get8x8var_mmx;
-        cpi->rtcd.variance.get16x16var           = vp8_get16x16var_mmx;
         cpi->rtcd.variance.get4x4sse_cs          = vp8_get4x4sse_cs_mmx;
 
         cpi->rtcd.fdct.short4x4                  = vp8_short_fdct4x4_mmx;
@@ -227,9 +225,6 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.variance.getmbss               = vp8_get_mb_ss_sse2;
 
         cpi->rtcd.variance.get16x16prederror     = vp8_get16x16pred_error_sse2;
-        cpi->rtcd.variance.get8x8var             = vp8_get8x8var_sse2;
-        cpi->rtcd.variance.get16x16var           = vp8_get16x16var_sse2;
-
 
         /* cpi->rtcd.variance.get4x4sse_cs  not implemented for wmt */;