23 files changed, 803 insertions, 356 deletions
diff --git a/vp8/common/onyx.h b/vp8/common/onyx.h
index 3724b11e0..426b8fc2b 100644
--- a/vp8/common/onyx.h
+++ b/vp8/common/onyx.h
@@ -46,7 +46,8 @@ extern "C"
     typedef enum
     {
         USAGE_STREAM_FROM_SERVER    = 0x0,
-        USAGE_LOCAL_FILE_PLAYBACK   = 0x1
+        USAGE_LOCAL_FILE_PLAYBACK   = 0x1,
+        USAGE_CONSTRAINED_QUALITY   = 0x2
     } END_USAGE;
 
 
@@ -150,6 +151,7 @@ extern "C"
         int fixed_q;
         int worst_allowed_q;
         int best_allowed_q;
+        int cq_level;
 
         // allow internal resizing ( currently disabled in the build !!!!!)
         int allow_spatial_resampling;
@@ -187,7 +189,6 @@ extern "C"
         int arnr_strength ;
         int arnr_type     ;
 
-
         struct vpx_fixed_buf         two_pass_stats_in;
         struct vpx_codec_pkt_list  *output_pkt_list;
 
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index 15b1c2c89..d30068ef5 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -680,7 +680,6 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
 
 int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags)
 {
-    char message[512];
     int q = oci->filter_level * 10 / 6;
     int flags = ppflags->post_proc_flag;
     int deblock_level = ppflags->deblocking_level;
@@ -744,6 +743,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
 #if CONFIG_POSTPROC_VISUALIZER
     if (flags & VP8D_DEBUG_TXT_FRAME_INFO)
     {
+        char message[512];
         sprintf(message, "F%1dG%1dQ%3dF%3dP%d_s%dx%d",
                 (oci->frame_type == KEY_FRAME),
                 oci->refresh_golden_frame,
@@ -823,6 +823,7 @@ int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t
 
     if (flags & VP8D_DEBUG_TXT_RATE_INFO)
     {
+        char message[512];
         sprintf(message, "Bitrate: %10.2f frame_rate: %10.2f ", oci->bitrate, oci->framerate);
         vp8_blit_text(message, oci->post_proc_buffer.y_buffer, oci->post_proc_buffer.y_stride);
     }
diff --git a/vp8/decoder/decodframe.c b/vp8/decoder/decodframe.c
index 4702faeed..9305a0556 100644
--- a/vp8/decoder/decodframe.c
+++ b/vp8/decoder/decodframe.c
@@ -485,7 +485,6 @@ static void setup_token_decoder(VP8D_COMP *pbi,
 
 static void stop_token_decoder(VP8D_COMP *pbi)
 {
-    int i;
     VP8_COMMON *pc = &pbi->common;
 
     if (pc->multi_token_partition != ONE_PARTITION)
diff --git a/vp8/decoder/threading.c b/vp8/decoder/threading.c
index fea4e1cc1..dac990a26 100644
--- a/vp8/decoder/threading.c
+++ b/vp8/decoder/threading.c
@@ -451,7 +451,6 @@ void vp8_decoder_create_threads(VP8D_COMP *pbi)
 #if CONFIG_MULTITHREAD
     int core_count = 0;
     int ithread;
-    int i;
 
     pbi->b_multithreaded_rd = 0;
     pbi->allocated_decoding_thread_count = 0;
@@ -721,7 +720,6 @@ void vp8mt_lpf_init( VP8D_COMP *pbi, int default_filt_lvl)
     /*int mb_row;
     int mb_col;
     int baseline_filter_level[MAX_MB_SEGMENTS];*/
-    int filter_level;
     int alt_flt_enabled = mbd->segmentation_enabled;
 
     int i;
@@ -769,7 +767,7 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
 
     int ibc = 0;
     int num_part = 1 << pbi->common.multi_token_partition;
-    int i, j;
+    int i;
     volatile int *last_row_current_mb_col = NULL;
     int nsync = pbi->sync_range;
 
@@ -809,7 +807,6 @@ void vp8mt_decode_mb_rows( VP8D_COMP *pbi, MACROBLOCKD *xd)
 
     for (mb_row = 0; mb_row < pc->mb_rows; mb_row += (pbi->decoding_thread_count + 1))
     {
-        int i;
 
         xd->current_bc = &pbi->mbc[mb_row%num_part];
 
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index cb7cc65d7..4c95f28d6 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -408,7 +408,6 @@ unsigned int vp8_activity_masking(VP8_COMP *cpi, MACROBLOCK *x)
     int sum;
     unsigned int a;
     unsigned int b;
-    unsigned int d;
     /* TODO: This could also be done over smaller areas (8x8), but that would
      *  require extensive changes elsewhere, as lambda is assumed to be fixed
      *  over an entire MB in most of the code.
@@ -629,7 +628,6 @@ void vp8_encode_frame(VP8_COMP *cpi)
     VP8_COMMON *const cm = & cpi->common;
     MACROBLOCKD *const xd = & x->e_mbd;
 
-    int i;
     TOKENEXTRA *tp = cpi->tok;
     int segment_counts[MAX_MB_SEGMENTS];
     int totalrate;
@@ -712,9 +710,7 @@ void vp8_encode_frame(VP8_COMP *cpi)
     }
 
     vp8_initialize_rd_consts(cpi, vp8_dc_quant(cm->base_qindex, cm->y1dc_delta_q));
-    //vp8_initialize_rd_consts( cpi, vp8_dc_quant(cpi->avg_frame_qindex, cm->y1dc_delta_q) );
     vp8cx_initialize_me_consts(cpi, cm->base_qindex);
-    //vp8cx_initialize_me_consts( cpi, cpi->avg_frame_qindex);
 
     // Copy data over into macro block data sturctures.
 
@@ -734,20 +730,6 @@ void vp8_encode_frame(VP8_COMP *cpi)
 
     x->activity_sum = 0;
 
-#if 0
-    // Experimental rd code
-    // 2 Pass - Possibly set Rdmult based on last frame distortion + this frame target bits or other metrics
-    // such as cpi->rate_correction_factor that indicate relative complexity.
-    /*if ( cpi->pass == 2 && (cpi->last_frame_distortion > 0) && (cpi->target_bits_per_mb > 0) )
-    {
-        //x->rdmult = ((cpi->last_frame_distortion * 256)/cpi->common.MBs)/ cpi->target_bits_per_mb;
-        x->rdmult = (int)(cpi->RDMULT * cpi->rate_correction_factor);
-    }
-    else
-        x->rdmult = cpi->RDMULT; */
-    //x->rdmult = (int)(cpi->RDMULT * pow( (cpi->rate_correction_factor * 2.0), 0.75 ));
-#endif
-
     xd->mode_info_context->mbmi.mode = DC_PRED;
     xd->mode_info_context->mbmi.uv_mode = DC_PRED;
 
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index 464d4a236..efcea745b 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -273,7 +273,6 @@ void vp8_optimize_b(MACROBLOCK *mb, int ib, int type,
     int x;
     int sz;
     int next;
-    int path;
     int rdmult;
     int rddiv;
     int final_eob;
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index da4d740cb..32a39c5f2 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -58,6 +58,7 @@ extern const int vp8_gf_boost_qadjustment[QINDEX_RANGE];
 
 #define KF_MB_INTRA_MIN 300
 #define GF_MB_INTRA_MIN 200
+
 #define DOUBLE_DIVIDE_CHECK(X) ((X)<0?(X)-.000001:(X)+.000001)
 
 #define POW1 (double)cpi->oxcf.two_pass_vbrbias/100.0
@@ -67,6 +68,18 @@ static int vscale_lookup[7] = {0, 1, 1, 2, 2, 3, 3};
 static int hscale_lookup[7] = {0, 0, 1, 1, 2, 2, 3};
 
 
+const int cq_level[QINDEX_RANGE] =  // indexed by the Q picked in estimate_cq(); entries rise from 0 to 100
+{
+    0,0,1,1,2,3,3,4,4,5,6,6,7,8,8,9,
+    9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,20,
+    20,21,22,22,23,24,24,25,26,27,27,28,29,30,30,31,
+    32,33,33,34,35,36,36,37,38,39,39,40,41,42,42,43,
+    44,45,46,46,47,48,49,50,50,51,52,53,54,55,55,56,
+    57,58,59,60,60,61,62,63,64,65,66,67,67,68,69,70,
+    71,72,73,74,75,75,76,77,78,79,80,81,82,83,84,85,
+    86,86,87,88,89,90,91,92,93,94,95,96,97,98,99,100
+};
+
 void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame);
 int vp8_input_stats(VP8_COMP *cpi, FIRSTPASS_STATS *fps);
 
@@ -249,7 +262,6 @@ extern size_t vp8_firstpass_stats_sz(unsigned int mb_count)
      * macroblock.
      */
     size_t stats_sz;
-    FIRSTPASS_STATS stats;
 
     stats_sz = sizeof(FIRSTPASS_STATS) + mb_count;
     stats_sz = (stats_sz + 7) & ~7;
@@ -376,8 +388,6 @@ unsigned char *vp8_fpmm_get_pos(VP8_COMP *cpi)
 }
 void vp8_fpmm_reset_pos(VP8_COMP *cpi, unsigned char *target_pos)
 {
-    int Offset;
-
     cpi->fp_motion_map_stats = target_pos;
 }
 
@@ -907,7 +917,7 @@ static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_
     double pow_lowq = 0.40;
 
     if (section_target_bandwitdh <= 0)
-        return MAXQ;
+        return cpi->maxq_max_limit;          // Highest value allowed
 
     target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20)) ? (512 * section_target_bandwitdh) / num_mbs : 512 * (section_target_bandwitdh / num_mbs);
 
@@ -943,10 +953,12 @@ static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_
 
     // Correction factor used for Q values >= 20
     corr_high = pow(err_per_mb / BASE_ERRPERMB, pow_highq);
-    corr_high = (corr_high < 0.05) ? 0.05 : (corr_high > 5.0) ? 5.0 : corr_high;
+    corr_high = (corr_high < 0.05)
+                    ? 0.05 : (corr_high > 5.0) ? 5.0 : corr_high;
 
-    // Try and pick a Q that should be high enough to encode the content at the given rate.
-    for (Q = 0; Q < MAXQ; Q++)
+    // Try and pick a max Q that will be high enough to encode the
+    // content at the given rate.
+    for (Q = cpi->maxq_min_limit; Q < cpi->maxq_max_limit; Q++)
     {
         int bits_per_mb_at_this_q;
 
@@ -965,6 +977,28 @@ static int estimate_max_q(VP8_COMP *cpi, double section_err, int section_target_
             break;
     }
 
+    // Restriction on active max q for constrained quality mode.
+    if ( (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+         (Q < cpi->cq_target_quality) )
+         //(Q < cpi->oxcf.cq_level;) )
+    {
+        Q = cpi->cq_target_quality;
+        //Q = cpi->oxcf.cq_level;
+    }
+
+    // Adjust maxq_min_limit and maxq_max_limit limits based on
+    // average q observed in clip for non kf/gf/arf frames
+    // Give average a chance to settle though.
+    if ( (cpi->ni_frames >
+                  ((unsigned int)cpi->total_stats->count >> 8)) &&
+         (cpi->ni_frames > 150) )
+    {
+        cpi->maxq_max_limit = ((cpi->ni_av_qi + 32) < cpi->worst_quality)
+                                  ? (cpi->ni_av_qi + 32) : cpi->worst_quality;
+        cpi->maxq_min_limit = ((cpi->ni_av_qi - 32) > cpi->best_quality)
+                                  ? (cpi->ni_av_qi - 32) : cpi->best_quality;
+    }
+
     return Q;
 }
 static int estimate_q(VP8_COMP *cpi, double section_err, int section_target_bandwitdh, int Height, int Width)
@@ -1113,6 +1147,79 @@ static int estimate_kf_group_q(VP8_COMP *cpi, double section_err, int section_ta
 
     return Q;
 }
+
+// Constrained quality mode: find the lowest Q whose projected bits per MB
+// fit the section target, and return the cq_level[] entry for that Q.
+static int estimate_cq(VP8_COMP *cpi, double section_err,
+                       int section_target_bandwitdh, int Height, int Width)
+{
+    int Q;
+    int num_mbs = ((Height * Width) / (16 * 16));
+    int target_norm_bits_per_mb;
+
+    double err_per_mb = section_err / num_mbs;
+    double correction_factor;
+    double corr_high;
+    double speed_correction = 1.0;
+    double pow_highq = 0.90;
+    double pow_lowq = 0.40;
+    double clip_iiratio;
+    double clip_iifactor;
+
+    target_norm_bits_per_mb = (section_target_bandwitdh < (1 << 20))
+                              ? (512 * section_target_bandwitdh) / num_mbs
+                              : 512 * (section_target_bandwitdh / num_mbs);
+
+    // For compressor_speed 1 or 3, scale the bits estimate up with cpu_used
+    // (1.04 + 0.04 per step up to cpu_used 5, fixed 1.25 beyond that).
+    if ((cpi->compressor_speed == 3) || (cpi->compressor_speed == 1))
+    {
+        if (cpi->oxcf.cpu_used <= 5)
+            speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+        else
+            speed_correction = 1.25;
+    }
+    // Clip-wide intra/coded error ratio; factor is 1.0 at ratio 10, min 0.80
+    clip_iiratio = cpi->total_stats->intra_error /
+                   DOUBLE_DIVIDE_CHECK(cpi->total_stats->coded_error);
+    clip_iifactor = 1.0 - ((clip_iiratio - 10.0) * 0.025);
+    if (clip_iifactor < 0.80)
+        clip_iifactor = 0.80;
+
+    // Correction factor used for Q values >= 50 (see the Q < 50 test below)
+    corr_high = pow(err_per_mb / BASE_ERRPERMB, pow_highq);
+    corr_high = (corr_high < 0.05) ? 0.05 : (corr_high > 5.0) ? 5.0 : corr_high;
+
+    // Step Q upwards until the projected bits per MB fit the target rate.
+    for (Q = 0; Q < MAXQ; Q++)
+    {
+        int bits_per_mb_at_this_q;
+
+        if (Q < 50)
+        {
+            correction_factor =
+                pow( err_per_mb / BASE_ERRPERMB, (pow_lowq + Q * 0.01));
+
+            correction_factor = (correction_factor < 0.05) ? 0.05
+                                    : (correction_factor > 5.0) ? 5.0
+                                        : correction_factor;
+        }
+        else
+            correction_factor = corr_high;
+
+        bits_per_mb_at_this_q =
+            (int)( .5 + correction_factor *
+                        speed_correction *
+                        clip_iifactor *
+                        (double)vp8_bits_per_mb[INTER_FRAME][Q] / 1.0);
+
+        if (bits_per_mb_at_this_q <= target_norm_bits_per_mb)
+            break;
+    }
+
+    return cq_level[Q];
+}
+
+
 extern void vp8_new_frame_rate(VP8_COMP *cpi, double framerate);
 
 void vp8_init_second_pass(VP8_COMP *cpi)
@@ -1268,7 +1375,7 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
     // what level of boost is appropriate for the GF or ARF that will be coded with the group
     i = 0;
 
-    while (((i < cpi->max_gf_interval) || ((cpi->frames_to_key - i) < MIN_GF_INTERVAL)) && (i < cpi->frames_to_key))
+    while (((i < cpi->static_scene_max_gf_interval) || ((cpi->frames_to_key - i) < MIN_GF_INTERVAL)) && (i < cpi->frames_to_key))
     {
         double r;
         double this_frame_mvr_ratio;
@@ -1378,18 +1485,20 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
         boost_score += (decay_accumulator * r);
 
         // Break out conditions.
-        if (   /* i>4 || */
+        if  (   /* i>4 || */
+            // Break at cpi->max_gf_interval unless almost totally static
+            (i >= cpi->max_gf_interval && (loop_decay_rate < 0.99)) ||
             (
-                (i > MIN_GF_INTERVAL) &&                            // Dont break out with a very short interval
-                ((cpi->frames_to_key - i) >= MIN_GF_INTERVAL) &&      // Dont break out very close to a key frame
+                // Don't break out with a very short interval
+                (i > MIN_GF_INTERVAL) &&
+                // Don't break out very close to a key frame
+                ((cpi->frames_to_key - i) >= MIN_GF_INTERVAL) &&
                 ((boost_score > 20.0) || (next_frame.pcnt_inter < 0.75)) &&
                 ((mv_ratio_accumulator > 100.0) ||
                  (abs_mv_in_out_accumulator > 3.0) ||
                  (mv_in_out_accumulator < -2.0) ||
-                 ((boost_score - old_boost_score) < 2.0)
-                )
-            )
-        )
+                 ((boost_score - old_boost_score) < 2.0))
+            ) )
         {
             boost_score = old_boost_score;
             break;
@@ -1766,7 +1875,9 @@ static void define_gf_group(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 
         vp8_avg_stats(&sectionstats);
 
-        cpi->section_intra_rating = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
+        cpi->section_intra_rating =
+            sectionstats.intra_error /
+            DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
 
         Ratio = sectionstats.intra_error / DOUBLE_DIVIDE_CHECK(sectionstats.coded_error);
         //if( (Ratio > 11) ) //&& (sectionstats.pcnt_second_ref < .20) )
@@ -1993,21 +2104,48 @@ void vp8_second_pass(VP8_COMP *cpi)
 
     if (cpi->common.current_video_frame == 0)
     {
-        // guess at 2nd pass q
         cpi->est_max_qcorrection_factor = 1.0;
-        tmp_q = estimate_max_q(cpi, (cpi->total_coded_error_left / frames_left), (int)(cpi->bits_left / frames_left), cpi->common.Height, cpi->common.Width);
 
-        if (tmp_q < cpi->worst_quality)
-        {
-            cpi->active_worst_quality         = tmp_q;
-            cpi->ni_av_qi                     = tmp_q;
-        }
-        else
+        // Experimental code to try and set a cq_level in constrained
+        // quality mode.
+        if ( cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY )
         {
-            cpi->active_worst_quality         = cpi->worst_quality;
-            cpi->ni_av_qi                     = cpi->worst_quality;
+            int est_cq;
+
+            est_cq =
+                estimate_cq( cpi,
+                             (cpi->total_coded_error_left / frames_left),
+                             (int)(cpi->bits_left / frames_left),
+                             cpi->common.Height, cpi->common.Width);
+
+            cpi->cq_target_quality = cpi->oxcf.cq_level;
+            if ( est_cq > cpi->cq_target_quality )
+                cpi->cq_target_quality = est_cq;
         }
+
+        // guess at maxq needed in 2nd pass
+        cpi->maxq_max_limit = cpi->worst_quality;
+        cpi->maxq_min_limit = cpi->best_quality;
+        tmp_q = estimate_max_q( cpi,
+                                (cpi->total_coded_error_left / frames_left),
+                                (int)(cpi->bits_left / frames_left),
+                                cpi->common.Height,
+                                cpi->common.Width);
+
+        // Limit the maxq value returned subsequently.
+        // This increases the risk of overspend or underspend if the initial
+        // estimate for the clip is bad, but helps prevent excessive
+        // variation in Q, especially near the end of a clip
+        // where for example a small overspend may cause Q to crash
+        cpi->maxq_max_limit = ((tmp_q + 32) < cpi->worst_quality)
+                                  ? (tmp_q + 32) : cpi->worst_quality;
+        cpi->maxq_min_limit = ((tmp_q - 32) > cpi->best_quality)
+                                  ? (tmp_q - 32) : cpi->best_quality;
+
+        cpi->active_worst_quality         = tmp_q;
+        cpi->ni_av_qi                     = tmp_q;
     }
+
     // The last few frames of a clip almost always have to few or too many
     // bits and for the sake of over exact rate control we dont want to make
     // radical adjustments to the allowed quantizer range just to use up a
@@ -2029,13 +2167,6 @@ void vp8_second_pass(VP8_COMP *cpi)
             cpi->active_worst_quality --;
 
         cpi->active_worst_quality = ((cpi->active_worst_quality * 3) + tmp_q + 2) / 4;
-
-        // Clamp to user set limits
-        if (cpi->active_worst_quality > cpi->worst_quality)
-            cpi->active_worst_quality = cpi->worst_quality;
-        else if (cpi->active_worst_quality < cpi->best_quality)
-            cpi->active_worst_quality = cpi->best_quality;
-
     }
 
     cpi->frames_to_key --;
@@ -2157,6 +2288,9 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
 
     cpi->common.frame_type = KEY_FRAME;
 
+    // is this a forced key frame by interval
+    cpi->this_key_frame_forced = cpi->next_key_frame_forced;
+
     // Clear the alt ref active flag as this can never be active on a key frame
     cpi->source_alt_ref_active = FALSE;
 
@@ -2219,7 +2353,11 @@ void vp8_find_next_key_frame(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
         kf_group_err /= 2.0;
         kf_group_intra_err /= 2.0;
         kf_group_coded_err /= 2.0;
+
+        cpi->next_key_frame_forced = TRUE;
     }
+    else
+        cpi->next_key_frame_forced = FALSE;
 
     // Special case for the last frame of the file
     if (cpi->stats_in >= cpi->stats_in_end)
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c
index be00d0218..4738a5b28 100644
--- a/vp8/encoder/generic/csystemdependent.c
+++ b/vp8/encoder/generic/csystemdependent.c
@@ -91,8 +91,9 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi)
 
     cpi->rtcd.quantize.quantb                = vp8_regular_quantize_b;
     cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_c;
-
+#if !(CONFIG_REALTIME_ONLY)
     cpi->rtcd.search.full_search             = vp8_full_search_sad;
+#endif
     cpi->rtcd.search.diamond_search          = vp8_diamond_search_sad;
 
     cpi->rtcd.temporal.apply                 = vp8_temporal_filter_apply_c;
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index 9b91739cc..d9923fbe9 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -408,6 +408,7 @@ int vp8_find_best_sub_pixel_step(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *bestmv,
         diag = vfp->svf_halfpix_hv(y - 1, d->pre_stride, z, b->src_stride, &sse);
         break;
     case 3:
+    default:
         this_mv.col += 4;
         this_mv.row += 4;
         diag = vfp->svf_halfpix_hv(y, d->pre_stride, z, b->src_stride, &sse);
@@ -1387,8 +1388,6 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
     else
         return INT_MAX;
 }
-#endif
-
 
 int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)
 {
@@ -1541,6 +1540,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
     else
         return INT_MAX;
 }
+#endif /* !(CONFIG_REALTIME_ONLY) */
 
 #ifdef ENTROPY_STATS
 void print_mode_context(void)
diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h
index 122debcae..7600f87fc 100644
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -24,7 +24,6 @@ extern void accum_mv_refs(MB_PREDICTION_MODE, const int near_mv_ref_cts[4]);
 #define MAX_MVSEARCH_STEPS 8                                    // The maximum number of steps in a step search given the largest allowed initial step
 #define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS+3)) - 8)    // Max full pel mv specified in 1/8 pel units
 #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))            // Maximum size of the first step in full pel units
-#define MAX_POSSIBLE_MV (1 << 11)                               // Maximum MV in 1/8 pel units
 
 extern void print_mode_context(void);
 extern int vp8_mv_bit_cost(MV *mv, MV *ref, int *mvcost[2], int Weight);
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 56f7ef6f8..77fbb29b1 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -155,25 +155,25 @@ extern const int vp8cx_base_skip_false_prob[128];
 // Tables relating active max Q to active min Q
 static const int kf_low_motion_minq[QINDEX_RANGE] =
 {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4,
-    5, 5, 5, 6, 6, 6, 7, 7, 7, 8, 8, 8, 9, 9, 10,10,
-    11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,
-    19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,
-    27,27,28,28,29,29,30,30,31,32,33,34,35,36,37,38,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,
+    3,3,3,3,3,3,4,4,4,5,5,5,5,5,6,6,
+    6,6,7,7,8,8,8,8,9,9,10,10,10,10,11,11,
+    11,11,12,12,13,13,13,13,14,14,15,15,15,15,16,16,
+    16,16,17,17,18,18,18,18,19,20,20,21,21,22,23,23
 };
 static const int kf_high_motion_minq[QINDEX_RANGE] =
 {
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
-    2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5,
-    6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8, 9, 9, 9, 10,10,
-    11,11,12,12,13,13,14,14,15,15,16,16,17,17,18,18,
-    19,19,20,20,21,21,22,22,23,23,24,24,25,25,26,26,
-    27,27,28,28,29,29,30,30,31,31,32,32,33,33,34,34,
-    35,35,36,36,37,38,39,40,41,42,43,44,45,46,47,48,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    1,1,1,1,1,1,1,1,2,2,2,2,3,3,3,3,
+    3,3,3,3,4,4,4,4,5,5,5,5,5,5,6,6,
+    6,6,7,7,8,8,8,8,9,9,10,10,10,10,11,11,
+    11,11,12,12,13,13,13,13,14,14,15,15,15,15,16,16,
+    16,16,17,17,18,18,18,18,19,19,20,20,20,20,21,21,
+    21,21,22,22,23,23,24,25,25,26,26,27,28,28,29,30
 };
 static const int gf_low_motion_minq[QINDEX_RANGE] =
 {
@@ -195,7 +195,7 @@ static const int gf_mid_motion_minq[QINDEX_RANGE] =
     22,22,23,23,24,24,25,25,26,26,27,27,28,28,29,29,
     30,30,31,31,32,32,33,33,34,34,35,35,36,36,37,37,
     38,39,39,40,40,41,41,42,42,43,43,44,45,46,47,48,
-    49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,
+    49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64
 };
 static const int gf_high_motion_minq[QINDEX_RANGE] =
 {
@@ -206,7 +206,7 @@ static const int gf_high_motion_minq[QINDEX_RANGE] =
     25,25,26,26,27,27,28,28,29,29,30,30,31,31,32,32,
     33,33,34,34,35,35,36,36,37,37,38,38,39,39,40,40,
     41,41,42,42,43,44,45,46,47,48,49,50,51,52,53,54,
-    55,56,57,58,59,60,62,64,66,68,70,72,74,76,78,80,
+    55,56,57,58,59,60,62,64,66,68,70,72,74,76,78,80
 };
 static const int inter_minq[QINDEX_RANGE] =
 {
@@ -314,7 +314,7 @@ void vp8_dealloc_compressor_data(VP8_COMP *cpi)
     vpx_free(cpi->tok);
     cpi->tok = 0;
 
-    // Structure used to minitor GF useage
+    // Structure used to monitor GF usage
     if (cpi->gf_active_flags != 0)
         vpx_free(cpi->gf_active_flags);
 
@@ -325,6 +325,7 @@ void vp8_dealloc_compressor_data(VP8_COMP *cpi)
 
     cpi->mb.pip = 0;
 
+#if !(CONFIG_REALTIME_ONLY)
     if(cpi->total_stats)
         vpx_free(cpi->total_stats);
 
@@ -334,6 +335,7 @@ void vp8_dealloc_compressor_data(VP8_COMP *cpi)
         vpx_free(cpi->this_frame_stats);
 
     cpi->this_frame_stats = 0;
+#endif
 }
 
 static void enable_segmentation(VP8_PTR ptr)
@@ -576,7 +578,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
 
     cpi->mbs_tested_so_far = 0;
 
-    // best quality
+    // best quality defaults
     sf->RD = 1;
     sf->search_method = NSTEP;
     sf->improved_quant = 1;
@@ -592,6 +594,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
     sf->iterative_sub_pixel = 1;
     sf->optimize_coefficients = 1;
     sf->use_fastquant_for_pick = 0;
+    sf->no_skip_block4x4_search = 1;
 
     sf->first_step = 0;
     sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
@@ -794,6 +797,7 @@ void vp8_set_speed_features(VP8_COMP *cpi)
 
             sf->first_step = 1;
             sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
+            sf->no_skip_block4x4_search = 0;
         }
 
         if (Speed > 1)
@@ -1268,6 +1272,15 @@ void vp8_set_speed_features(VP8_COMP *cpi)
 
     };
 
+    // Slow quant, dct and trellis not worthwhile for first pass
+    // so make sure they are always turned off.
+    if ( cpi->pass == 1 )
+    {
+        sf->improved_quant = 0;
+        sf->optimize_coefficients = 0;
+        sf->improved_dct = 0;
+    }
+
     if (cpi->sf.search_method == NSTEP)
     {
         vp8_init3smotion_compensation(&cpi->mb, cm->yv12_fb[cm->lst_fb_idx].y_stride);
@@ -1437,6 +1450,7 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
 
     cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
 
+#if !(CONFIG_REALTIME_ONLY)
     if(cpi->total_stats)
         vpx_free(cpi->total_stats);
 
@@ -1450,6 +1464,7 @@ void vp8_alloc_compressor_data(VP8_COMP *cpi)
     if(!cpi->total_stats || !cpi->this_frame_stats)
         vpx_internal_error(&cpi->common.error, VPX_CODEC_MEM_ERROR,
                            "Failed to allocate firstpass stats");
+#endif
 }
 
 
@@ -1486,21 +1501,28 @@ void vp8_new_frame_rate(VP8_COMP *cpi, double framerate)
     cpi->per_frame_bandwidth          = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
     cpi->av_per_frame_bandwidth        = (int)(cpi->oxcf.target_bandwidth / cpi->output_frame_rate);
     cpi->min_frame_bandwidth          = (int)(cpi->av_per_frame_bandwidth * cpi->oxcf.two_pass_vbrmin_section / 100);
-    cpi->max_gf_interval = (int)(cpi->output_frame_rate / 2) + 2;
 
-    //cpi->max_gf_interval = (int)(cpi->output_frame_rate * 2 / 3) + 1;
-    //cpi->max_gf_interval = 24;
+    // Set Maximum gf/arf interval
+    cpi->max_gf_interval = ((int)(cpi->output_frame_rate / 2.0) + 2);
 
-    if (cpi->max_gf_interval < 12)
+    if(cpi->max_gf_interval < 12)
         cpi->max_gf_interval = 12;
 
+    // Extended interval for genuinely static scenes
+    cpi->static_scene_max_gf_interval = cpi->key_frame_frequency >> 1;
 
-    // Special conditions when altr ref frame enabled in lagged compress mode
+    // Special conditions when alt ref frame enabled in lagged compress mode
     if (cpi->oxcf.play_alternate && cpi->oxcf.lag_in_frames)
     {
         if (cpi->max_gf_interval > cpi->oxcf.lag_in_frames - 1)
             cpi->max_gf_interval = cpi->oxcf.lag_in_frames - 1;
+
+        if (cpi->static_scene_max_gf_interval > cpi->oxcf.lag_in_frames - 1)
+            cpi->static_scene_max_gf_interval = cpi->oxcf.lag_in_frames - 1;
     }
+
+    if ( cpi->max_gf_interval > cpi->static_scene_max_gf_interval )
+        cpi->max_gf_interval = cpi->static_scene_max_gf_interval;
 }
 
 
@@ -1540,6 +1562,7 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
         cpi->auto_worst_q              = 0;
         cpi->oxcf.best_allowed_q            = MINQ;
         cpi->oxcf.worst_allowed_q           = MAXQ;
+        cpi->oxcf.cq_level = MINQ;
 
         cpi->oxcf.end_usage                = USAGE_STREAM_FROM_SERVER;
         cpi->oxcf.starting_buffer_level     =   4000;
@@ -1640,6 +1663,7 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
 
     cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
     cpi->oxcf.best_allowed_q  = q_trans[oxcf->best_allowed_q];
+    cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
 
     if (oxcf->fixed_q >= 0)
     {
@@ -1729,6 +1753,8 @@ void vp8_init_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
     cpi->avg_frame_qindex             = cpi->oxcf.worst_allowed_q;
     cpi->best_quality                = cpi->oxcf.best_allowed_q;
     cpi->active_best_quality          = cpi->oxcf.best_allowed_q;
+    cpi->cq_target_quality = cpi->oxcf.cq_level;
+
     cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
 
     cpi->rolling_target_bits          = cpi->av_per_frame_bandwidth;
@@ -1925,6 +1951,7 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
 
     cpi->oxcf.worst_allowed_q = q_trans[oxcf->worst_allowed_q];
     cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
+    cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
 
     if (oxcf->fixed_q >= 0)
     {
@@ -2017,6 +2044,8 @@ void vp8_change_config(VP8_PTR ptr, VP8_CONFIG *oxcf)
     cpi->active_best_quality          = cpi->oxcf.best_allowed_q;
     cpi->buffered_mode = (cpi->oxcf.optimal_buffer_level > 0) ? TRUE : FALSE;
 
+    cpi->cq_target_quality = cpi->oxcf.cq_level;
+
     cpi->rolling_target_bits          = cpi->av_per_frame_bandwidth;
     cpi->rolling_actual_bits          = cpi->av_per_frame_bandwidth;
     cpi->long_rolling_target_bits      = cpi->av_per_frame_bandwidth;
@@ -2258,6 +2287,8 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
 
     cpi->frames_since_key = 8;        // Give a sensible default for the first frame.
     cpi->key_frame_frequency = cpi->oxcf.key_freq;
+    cpi->this_key_frame_forced = FALSE;
+    cpi->next_key_frame_forced = FALSE;
 
     cpi->source_alt_ref_pending = FALSE;
     cpi->source_alt_ref_active = FALSE;
@@ -3081,9 +3112,6 @@ static int pick_frame_size(VP8_COMP *cpi)
         }
     }
 
-    // Note target_size in bits * 256 per MB
-    cpi->target_bits_per_mb = (cpi->this_frame_target * 256) / cpi->common.MBs;
-
     return 1;
 }
 static void set_quantizer(VP8_COMP *cpi, int Q)
@@ -3133,8 +3161,8 @@ static void update_alt_ref_frame_and_stats(VP8_COMP *cpi)
     // Update data structure that monitors level of reference to last GF
     vpx_memset(cpi->gf_active_flags, 1, (cm->mb_rows * cm->mb_cols));
     cpi->gf_active_count = cm->mb_rows * cm->mb_cols;
-    // this frame refreshes means next frames don't unless specified by user
 
+    // this frame refreshes means next frames don't unless specified by user
     cpi->common.frames_since_golden = 0;
 
     // Clear the alternate reference update pending flag.
@@ -3510,8 +3538,25 @@ static BOOL recode_loop_test( VP8_COMP *cpi,
         {
             force_recode = TRUE;
         }
-        // Specific rate control mode related tests
-        // TBD
+        // Special Constrained quality tests
+        else if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
+        {
+            // Undershoot and below auto cq level
+            if ( (q > cpi->cq_target_quality) &&
+                 (cpi->projected_frame_size <
+                     ((cpi->this_frame_target * 7) >> 3)))
+            {
+                force_recode = TRUE;
+            }
+            // Severe undershoot and between auto and user cq level
+            else if ( (q > cpi->oxcf.cq_level) &&
+                      (cpi->projected_frame_size < cpi->min_frame_bandwidth) &&
+                      (cpi->active_best_quality > cpi->oxcf.cq_level))
+            {
+                force_recode = TRUE;
+                cpi->active_best_quality = cpi->oxcf.cq_level;
+            }
+        }
     }
 
     return force_recode;
@@ -3778,51 +3823,85 @@ static void encode_frame_to_data_rate
     }
 
     // Set an active best quality and if necessary active worst quality
-    if (cpi->pass == 2 || (cm->current_video_frame > 150))
+    // There is some odd behaviour for one pass here that needs attention.
+    if ( (cpi->pass == 2) || (cpi->ni_frames > 150))
     {
-        int Q;
-        int i;
-        int bpm_target;
-        //int tmp;
-
         vp8_clear_system_state();
 
         Q = cpi->active_worst_quality;
 
-        if ((cm->frame_type == KEY_FRAME) || cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame)
+        if ( cm->frame_type == KEY_FRAME )
         {
-            if (cm->frame_type != KEY_FRAME)
+            if ( cpi->pass == 2 )
             {
-                if (cpi->avg_frame_qindex < cpi->active_worst_quality)
-                    Q = cpi->avg_frame_qindex;
+                if (cpi->gfu_boost > 600)
+                   cpi->active_best_quality = kf_low_motion_minq[Q];
+                else
+                   cpi->active_best_quality = kf_high_motion_minq[Q];
 
-               if ( cpi->gfu_boost > 1000 )
+                // Special case for key frames forced because we have reached
+                // the maximum key frame interval. Here force the Q to a range
+                // based on the ambient Q to reduce the risk of popping
+                if ( cpi->this_key_frame_forced )
+                {
+                    if ( cpi->active_best_quality > cpi->avg_frame_qindex * 7/8)
+                        cpi->active_best_quality = cpi->avg_frame_qindex * 7/8;
+                    else if ( cpi->active_best_quality < cpi->avg_frame_qindex >> 2 )
+                        cpi->active_best_quality = cpi->avg_frame_qindex >> 2;
+                }
+            }
+            // One pass more conservative
+            else
+               cpi->active_best_quality = kf_high_motion_minq[Q];
+         }
+
+        else if (cm->refresh_golden_frame || cpi->common.refresh_alt_ref_frame)
+        {
+            // Use the lower of cpi->active_worst_quality and recent
+            // average Q as basis for GF/ARF Q limit unless last frame was
+            // a key frame.
+            if ( (cpi->frames_since_key > 1) &&
+                 (cpi->avg_frame_qindex < cpi->active_worst_quality) )
+            {
+                Q = cpi->avg_frame_qindex;
+
+                if ( (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+                     (Q < cpi->oxcf.cq_level) )
+                {
+                    Q = cpi->oxcf.cq_level;
+                }
+            }
+
+            if ( cpi->pass == 2 )
+            {
+                if ( cpi->gfu_boost > 1000 )
                     cpi->active_best_quality = gf_low_motion_minq[Q];
                 else if ( cpi->gfu_boost < 400 )
                     cpi->active_best_quality = gf_high_motion_minq[Q];
                 else
                     cpi->active_best_quality = gf_mid_motion_minq[Q];
-
-                /*cpi->active_best_quality = gf_arf_minq[Q];
-                tmp = (cpi->gfu_boost > 1000) ? 600 : cpi->gfu_boost - 400;
-                //tmp = (cpi->gfu_boost > 1000) ? 600 :
-                          //(cpi->gfu_boost < 400) ? 0 : cpi->gfu_boost - 400;
-                tmp = 128 - (tmp >> 4);
-                cpi->active_best_quality = (cpi->active_best_quality * tmp)>>7;*/
-
-           }
-           // KEY FRAMES
-           else
-           {
-               if (cpi->gfu_boost > 600)
-                   cpi->active_best_quality = kf_low_motion_minq[Q];
-               else
-                   cpi->active_best_quality = kf_high_motion_minq[Q];
-           }
+            }
+            // One pass more conservative
+            else
+                cpi->active_best_quality = gf_high_motion_minq[Q];
         }
         else
         {
             cpi->active_best_quality = inter_minq[Q];
+
+            // For the constant/constrained quality mode we don't want
+            // the quality to rise above the cq level.
+            if ((cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+                (cpi->active_best_quality < cpi->cq_target_quality) )
+            {
+                // If we are strongly undershooting the target rate in the last
+                // frames then use the user passed in cq value not the auto
+                // cq value.
+                if ( cpi->rolling_actual_bits < cpi->min_frame_bandwidth )
+                    cpi->active_best_quality = cpi->oxcf.cq_level;
+                else
+                    cpi->active_best_quality = cpi->cq_target_quality;
+            }
         }
 
         // If CBR and the buffer is as full then it is reasonable to allow higher quality on the frames
@@ -3840,7 +3919,6 @@ static void encode_frame_to_data_rate
 
                 cpi->active_best_quality -= min_qadjustment;
             }
-
         }
     }
 
@@ -4101,9 +4179,44 @@ static void encode_frame_to_data_rate
             active_worst_qchanged = FALSE;
 
 #if !(CONFIG_REALTIME_ONLY)
+        // Special case handling for forced key frames
+        if ( (cm->frame_type == KEY_FRAME) && cpi->this_key_frame_forced )
+        {
+            int last_q = Q;
+            int kf_err = vp8_calc_ss_err(cpi->Source,
+                                         &cm->yv12_fb[cm->new_fb_idx],
+                                         IF_RTCD(&cpi->rtcd.variance));
+
+            // The key frame is not good enough
+            if ( kf_err > ((cpi->ambient_err * 3) >> 2) )
+            {
+                // Lower q_high
+                q_high = (Q > q_low) ? (Q - 1) : q_low;
+
+                // Adjust Q
+                Q = (q_high + q_low) >> 1;
+            }
+            // The key frame is much better than the previous frame
+            else if ( kf_err < (cpi->ambient_err >> 1) )
+            {
+                // Raise q_low
+                q_low = (Q < q_high) ? (Q + 1) : q_high;
+
+                // Adjust Q
+                Q = (q_high + q_low + 1) >> 1;
+            }
+
+            // Clamp Q to upper and lower limits:
+            if (Q > q_high)
+                Q = q_high;
+            else if (Q < q_low)
+                Q = q_low;
+
+            Loop = ((Q != last_q)) ? TRUE : FALSE;
+        }
 
         // Is the projected frame size out of range and are we allowed to attempt to recode.
-        if ( recode_loop_test( cpi,
+        else if ( recode_loop_test( cpi,
                                frame_over_shoot_limit, frame_under_shoot_limit,
                                Q, top_index, bottom_index ) )
         {
@@ -4119,7 +4232,7 @@ static void encode_frame_to_data_rate
                 //if ( cpi->zbin_over_quant == 0 )
                 q_low = (Q < q_high) ? (Q + 1) : q_high; // Raise Qlow as to at least the current value
 
-                if (cpi->zbin_over_quant > 0)           // If we are using over quant do the same for zbin_oq_low
+                if (cpi->zbin_over_quant > 0)            // If we are using over quant do the same for zbin_oq_low
                     zbin_oq_low = (cpi->zbin_over_quant < zbin_oq_high) ? (cpi->zbin_over_quant + 1) : zbin_oq_high;
 
                 //if ( undershoot_seen || (Q == MAXQ) )
@@ -4188,6 +4301,16 @@ static void encode_frame_to_data_rate
 
                     Q = vp8_regulate_q(cpi, cpi->this_frame_target);
 
+                    // Special case reset for qlow for constrained quality.
+                    // This should only trigger where there is very substantial
+                    // undershoot on a frame and the auto cq level is above
+                    // the user passed in value.
+                    if ( (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY) &&
+                         (Q < q_low) )
+                    {
+                        q_low = Q;
+                    }
+
                     while (((Q > q_high) || (cpi->zbin_over_quant > zbin_oq_high)) && (Retries < 10))
                     {
                         vp8_update_rate_correction_factors(cpi, 0);
@@ -4252,9 +4375,15 @@ static void encode_frame_to_data_rate
     }
 #endif
 
-    // Update the GF useage maps.
-    // This is done after completing the compression of a frame when all modes etc. are finalized but before loop filter
-    vp8_update_gf_useage_maps(cpi, cm, &cpi->mb);
+    // Special case code to reduce pulsing when key frames are forced at a
+    // fixed interval. Note the reconstruction error if it is the frame before
+    // the forced key frame
+    if ( cpi->next_key_frame_forced && (cpi->frames_to_key == 0) )
+    {
+        cpi->ambient_err = vp8_calc_ss_err(cpi->Source,
+                                           &cm->yv12_fb[cm->new_fb_idx],
+                                           IF_RTCD(&cpi->rtcd.variance));
+    }
 
     // This frame's MVs are saved and will be used in next frame's MV prediction.
     if(cm->show_frame)   //do not save for altref frame
@@ -4281,7 +4410,6 @@ static void encode_frame_to_data_rate
       }
     }
 
-
     // Update the GF useage maps.
     // This is done after completing the compression of a frame when all modes etc. are finalized but before loop filter
     vp8_update_gf_useage_maps(cpi, cm, &cpi->mb);
@@ -4312,8 +4440,6 @@ static void encode_frame_to_data_rate
     else
         cm->frame_to_show = &cm->yv12_fb[cm->new_fb_idx];
 
-
-
     //#pragma omp parallel sections
     {
 
@@ -4421,9 +4547,7 @@ static void encode_frame_to_data_rate
     }
 
     // Keep a record of ambient average Q.
-    if (cm->frame_type == KEY_FRAME)
-        cpi->avg_frame_qindex = cm->base_qindex;
-    else
+    if (cm->frame_type != KEY_FRAME)
         cpi->avg_frame_qindex = (2 + 3 * cpi->avg_frame_qindex + cm->base_qindex) >> 2;
 
     // Keep a record from which we can calculate the average Q excluding GF updates and key frames
@@ -4431,30 +4555,38 @@ static void encode_frame_to_data_rate
     {
         cpi->ni_frames++;
 
-        // Calculate the average Q for normal inter frames (not key or GFU frames)
-        // This is used as a basis for setting active worst quality.
-        if (cpi->ni_frames > 150)
+        // Calculate the average Q for normal inter frames (not key or GFU
+        // frames).
+        if ( cpi->pass == 2 )
         {
             cpi->ni_tot_qi += Q;
             cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames);
         }
-        // Early in the clip ... average the current frame Q value with the default
-        // entered by the user as a dampening measure
         else
         {
-            cpi->ni_tot_qi += Q;
-            cpi->ni_av_qi = ((cpi->ni_tot_qi / cpi->ni_frames) + cpi->worst_quality + 1) / 2;
-        }
-
-        // If the average Q is higher than what was used in the last frame
-        // (after going through the recode loop to keep the frame size within range)
-        // then use the last frame value - 1.
-        // The -1 is designed to stop Q and hence the data rate, from progressively
-        // falling away during difficult sections, but at the same time reduce the number of
-        // itterations around the recode loop.
-        if (Q > cpi->ni_av_qi)
-            cpi->ni_av_qi = Q - 1;
+            // Damp value for first few frames
+            if (cpi->ni_frames > 150 )
+            {
+                cpi->ni_tot_qi += Q;
+                cpi->ni_av_qi = (cpi->ni_tot_qi / cpi->ni_frames);
+            }
+            // For one pass, early in the clip ... average the current frame Q
+            // value with the worstq entered by the user as a dampening measure
+            else
+            {
+                cpi->ni_tot_qi += Q;
+                cpi->ni_av_qi = ((cpi->ni_tot_qi / cpi->ni_frames) + cpi->worst_quality + 1) / 2;
+            }
 
+            // If the average Q is higher than what was used in the last frame
+            // (after going through the recode loop to keep the frame size within range)
+            // then use the last frame value - 1.
+            // The -1 is designed to stop Q and hence the data rate, from progressively
+            // falling away during difficult sections, but at the same time reduce the number of
+            // iterations around the recode loop.
+            if (Q > cpi->ni_av_qi)
+                cpi->ni_av_qi = Q - 1;
+        }
     }
 
 #if 0
@@ -4548,7 +4680,7 @@ static void encode_frame_to_data_rate
 
         if (cpi->total_coded_error_left != 0.0)
             fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6ld %6ld"
-                       "%6ld %6ld %5ld %5ld %5ld %8ld %8.2f %10d %10.3f"
+                       "%6ld %6ld %6ld %5ld %5ld %5ld %8ld %8.2f %10d %10.3f"
                        "%10.3f %8ld\n",
                        cpi->common.current_video_frame, cpi->this_frame_target,
                        cpi->projected_frame_size,
@@ -4557,7 +4689,8 @@ static void encode_frame_to_data_rate
                        (cpi->oxcf.starting_buffer_level-cpi->bits_off_target),
                        (int)cpi->total_actual_bits, cm->base_qindex,
                        cpi->active_best_quality, cpi->active_worst_quality,
-                       cpi->avg_frame_qindex, cpi->zbin_over_quant,
+                       cpi->ni_av_qi, cpi->cq_target_quality, cpi->zbin_over_quant,
+                       //cpi->avg_frame_qindex, cpi->zbin_over_quant,
                        cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
                        cm->frame_type, cpi->gfu_boost,
                        cpi->est_max_qcorrection_factor, (int)cpi->bits_left,
@@ -4566,7 +4699,7 @@ static void encode_frame_to_data_rate
                        cpi->tot_recode_hits);
         else
             fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %6ld %6ld"
-                       "%6ld %6ld %5ld %5ld %5ld %8ld %8.2f %10d %10.3f"
+                       "%6ld %6ld %6ld %5ld %5ld %5ld %8ld %8.2f %10d %10.3f"
                        "%8ld\n",
                        cpi->common.current_video_frame,
                        cpi->this_frame_target, cpi->projected_frame_size,
@@ -4575,7 +4708,8 @@ static void encode_frame_to_data_rate
                        (cpi->oxcf.starting_buffer_level-cpi->bits_off_target),
                        (int)cpi->total_actual_bits, cm->base_qindex,
                        cpi->active_best_quality, cpi->active_worst_quality,
-                       cpi->avg_frame_qindex, cpi->zbin_over_quant,
+                       cpi->ni_av_qi, cpi->cq_target_quality, cpi->zbin_over_quant,
+                       //cpi->avg_frame_qindex, cpi->zbin_over_quant,
                        cm->refresh_golden_frame, cm->refresh_alt_ref_frame,
                        cm->frame_type, cpi->gfu_boost,
                        cpi->est_max_qcorrection_factor, (int)cpi->bits_left,
@@ -4807,7 +4941,9 @@ extern void vp8_pop_neon(INT64 *store);
 #endif
 int vp8_receive_raw_frame(VP8_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CONFIG *sd, INT64 time_stamp, INT64 end_time)
 {
+#if HAVE_ARMV7
     INT64 store_reg[8];
+#endif
     VP8_COMP *cpi = (VP8_COMP *) ptr;
     VP8_COMMON *cm = &cpi->common;
     struct vpx_usec_timer  timer;
@@ -4910,7 +5046,9 @@ int vp8_receive_raw_frame(VP8_PTR ptr, unsigned int frame_flags, YV12_BUFFER_CON
 }
 int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned long *size, unsigned char *dest, INT64 *time_stamp, INT64 *time_end, int flush)
 {
+#if HAVE_ARMV7
     INT64 store_reg[8];
+#endif
     VP8_COMP *cpi = (VP8_COMP *) ptr;
     VP8_COMMON *cm = &cpi->common;
     struct vpx_usec_timer  tsctimer;
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 2f9cc4776..8a97e983b 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -183,6 +183,7 @@ typedef struct
     int optimize_coefficients;
 
     int use_fastquant_for_pick;
+    int no_skip_block4x4_search;
 
 } SPEED_FEATURES;
 
@@ -316,7 +317,11 @@ typedef struct
 
     unsigned int frames_since_key;
     unsigned int key_frame_frequency;
-    unsigned int next_key;
+    unsigned int this_key_frame_forced;
+    unsigned int next_key_frame_forced;
+
+    // Ambient reconstruction err target for force key frames
+    int ambient_err;
 
     unsigned int mode_check_freq[MAX_MODES];
     unsigned int mode_test_hit_counts[MAX_MODES];
@@ -365,7 +370,6 @@ typedef struct
     int this_frame_target;
     int projected_frame_size;
     int last_q[2];                   // Separate values for Intra/Inter
-    int target_bits_per_mb;
 
     double rate_correction_factor;
     double key_frame_rate_correction_factor;
@@ -398,6 +402,7 @@ typedef struct
     int kf_overspend_bits;            // Extra bits spent on key frames that need to be recovered on inter frames
     int kf_bitrate_adjustment;        // Current number of bit s to try and recover on each inter frame.
     int max_gf_interval;
+    int static_scene_max_gf_interval;
     int baseline_gf_interval;
     int gf_decay_rate;
     int active_arnr_frames;           // <= cpi->oxcf.arnr_max_frames
@@ -447,6 +452,10 @@ typedef struct
     int best_quality;
     int active_best_quality;
 
+    int cq_target_quality;
+    int maxq_max_limit;
+    int maxq_min_limit;
+
     int drop_frames_allowed;          // Are we permitted to drop frames?
     int drop_frame;                  // Drop this frame?
     int drop_count;                  // How many frames have we dropped?
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 8dfca351c..2b0f57508 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -608,8 +608,10 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
             memcpy(mdcounts, MDCounts[x->e_mbd.mode_info_context->mbmi.ref_frame], sizeof(mdcounts));
         }
 
-        //Only consider ZEROMV/ALTREF_FRAME for alt ref frame.
-        if (cpi->is_src_frame_alt_ref)
+        // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+        // unless ARNR filtering is enabled in which case we want
+        // an unfiltered alternative
+        if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
         {
             if (this_mode != ZEROMV || x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME)
                 continue;
diff --git a/vp8/encoder/quantize.c b/vp8/encoder/quantize.c
index a67299487..be9f26c7f 100644
--- a/vp8/encoder/quantize.c
+++ b/vp8/encoder/quantize.c
@@ -70,7 +70,6 @@ void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
 void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d)
 {
     int i, rc, eob;
-    int zbin;
     int x, y, z, sz;
     short *coeff_ptr   = b->coeff;
     short *round_ptr   = b->round;
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index 8455b7bdb..b69a1965e 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -1550,12 +1550,21 @@ void vp8_compute_frame_size_bounds(VP8_COMP *cpi, int *frame_under_shoot_limit,
                         *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
                     }
                 }
-                // VBR
+                // VBR and CQ mode
                 // Note that tighter restrictions here can help quality but hurt encode speed
                 else
                 {
-                    *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
-                    *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+                    // Strong overshoot limit for constrained quality
+                    if (cpi->oxcf.end_usage == USAGE_CONSTRAINED_QUALITY)
+                    {
+                        *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
+                        *frame_under_shoot_limit = cpi->this_frame_target * 2 / 8;
+                    }
+                    else
+                    {
+                        *frame_over_shoot_limit  = cpi->this_frame_target * 11 / 8;
+                        *frame_under_shoot_limit = cpi->this_frame_target * 5 / 8;
+                    }
                 }
             }
         }
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index e6c7c9ab3..b2a3e117f 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -36,7 +36,6 @@
 #include "dct.h"
 #include "systemdependent.h"
 
-#define DIAMONDSEARCH 1
 #if CONFIG_RUNTIME_CPU_DETECT
 #define IF_RTCD(x)  (x)
 #else
@@ -46,19 +45,6 @@
 
 void vp8cx_mb_init_quantizer(VP8_COMP *cpi, MACROBLOCK *x);
 
-
-#define RDFUNC(RM,DM,R,D,target_rd) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
-/*int  RDFUNC( int RM,int DM, int R, int D, int target_r )
-{
-    int rd_value;
-
-    rd_value =  ( ((128+(R)*(RM)) >> 8) + (DM)*(D) );
-
-    return rd_value;
-}*/
-
-#define UVRDFUNC(RM,DM,R,D,target_r)  RDFUNC(RM,DM,R,D,target_r)
-
 #define RDCOST(RM,DM,R,D) ( ((128+(R)*(RM)) >> 8) + (DM)*(D) )
 
 #define MAXF(a,b)            (((a) > (b)) ? (a) : (b))
@@ -223,8 +209,6 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
 {
     int q;
     int i;
-    int *thresh;
-    int threshmult;
     double capped_q = (Qvalue < 160) ? (double)Qvalue : 160.0;
     double rdconst = 3.00;
 
@@ -271,22 +255,6 @@ void vp8_initialize_rd_consts(VP8_COMP *cpi, int Qvalue)
     if (q < 8)
         q = 8;
 
-    if (cpi->ref_frame_flags == VP8_ALT_FLAG)
-    {
-        thresh      = &cpi->rd_threshes[THR_NEWA];
-        threshmult  = cpi->sf.thresh_mult[THR_NEWA];
-    }
-    else if (cpi->ref_frame_flags == VP8_GOLD_FLAG)
-    {
-        thresh      = &cpi->rd_threshes[THR_NEWG];
-        threshmult  = cpi->sf.thresh_mult[THR_NEWG];
-    }
-    else
-    {
-        thresh      = &cpi->rd_threshes[THR_NEWMV];
-        threshmult  = cpi->sf.thresh_mult[THR_NEWMV];
-    }
-
     if (cpi->RDMULT > 1000)
     {
         cpi->RDDIV = 1;
@@ -775,7 +743,7 @@ static int vp8_rd_inter_uv(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *distort
     *rate       = rd_cost_mbuv(x);
     *distortion = ENCODEMB_INVOKE(&cpi->rtcd.encodemb, mbuverr)(x) / 4;
 
-    return UVRDFUNC(x->rdmult, x->rddiv, *rate, *distortion, cpi->target_bits_per_mb);
+    return RDCOST(x->rdmult, x->rddiv, *rate, *distortion);
 }
 
 int vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *rate_tokenonly, int *distortion)
@@ -800,7 +768,7 @@ int vp8_rd_pick_intra_mbuv_mode(VP8_COMP *cpi, MACROBLOCK *x, int *rate, int *ra
 
         distortion = vp8_get_mbuvrecon_error(IF_RTCD(&cpi->rtcd.variance), x);
 
-        this_rd = UVRDFUNC(x->rdmult, x->rddiv, rate, distortion, cpi->target_bits_per_mb);
+        this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
         if (this_rd < best_rd)
         {
@@ -1097,7 +1065,7 @@ void vp8_rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
     // Segmentation method overheads
     rate = vp8_cost_token(vp8_mbsplit_tree, vp8_mbsplit_probs, vp8_mbsplit_encodings + segmentation);
     rate += vp8_cost_mv_ref(SPLITMV, bsi->mdcounts);
-    this_segment_rd += RDFUNC(x->rdmult, x->rddiv, rate, 0, cpi->target_bits_per_mb);
+    this_segment_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
     br += rate;
 
     for (i = 0; i < label_count; i++)
@@ -1252,7 +1220,7 @@ void vp8_rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x, BEST_SEG_INFO *bsi,
             labelyrate = rdcost_mbsegment_y(x, labels, i, ta_s, tl_s);
             rate += labelyrate;
 
-            this_rd = RDFUNC(x->rdmult, x->rddiv, rate, distortion, cpi->target_bits_per_mb);
+            this_rd = RDCOST(x->rdmult, x->rddiv, rate, distortion);
 
             if (this_rd < best_label_rd)
             {
@@ -1357,10 +1325,10 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
 
         if (bsi.segment_rd < best_rd)
         {
-            int col_min = (best_ref_mv->col - MAX_POSSIBLE_MV) >>3;
-            int col_max = (best_ref_mv->col + MAX_POSSIBLE_MV) >>3;
-            int row_min = (best_ref_mv->row - MAX_POSSIBLE_MV) >>3;
-            int row_max = (best_ref_mv->row + MAX_POSSIBLE_MV) >>3;
+            int col_min = (best_ref_mv->col - MAX_FULL_PEL_VAL) >>3;
+            int col_max = (best_ref_mv->col + MAX_FULL_PEL_VAL) >>3;
+            int row_min = (best_ref_mv->row - MAX_FULL_PEL_VAL) >>3;
+            int row_max = (best_ref_mv->row + MAX_FULL_PEL_VAL) >>3;
 
             int tmp_col_min = x->mv_col_min;
             int tmp_col_max = x->mv_col_max;
@@ -1407,7 +1375,8 @@ static int vp8_rd_pick_best_mbsegmentation(VP8_COMP *cpi, MACROBLOCK *x,
             }
 
             /* If 8x8 is better than 16x8/8x16, then do 4x4 search */
-            if (bsi.segment_num == BLOCK_8X8)  /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */
+            /* Not skip 4x4 if speed=0 (good quality) */
+            if (cpi->sf.no_skip_block4x4_search || bsi.segment_num == BLOCK_8X8)  /* || (sv_segment_rd8x8-bsi.segment_rd) < sv_segment_rd8x8>>5) */
             {
                 bsi.mvp = &bsi.sv_mvp[0];
                 vp8_rd_check_segment(cpi, x, &bsi, BLOCK_4X4);
@@ -1751,7 +1720,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
     //int intermodecost[MAX_MODES];
 
     MB_PREDICTION_MODE uv_intra_mode;
-    int uvintra_eob = 0;
+
     int force_no_skip = 0;
 
     MV mvp;
@@ -1760,36 +1729,65 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
     int saddone=0;
     int sr=0;    //search range got from mv_pred(). It uses step_param levels. (0-7)
 
-    *returnintra = INT_MAX;
+    MV frame_nearest_mv[4];
+    MV frame_near_mv[4];
+    MV frame_best_ref_mv[4];
+    int frame_mdcounts[4][4];
+    int frame_lf_or_gf[4];
+    unsigned char *y_buffer[4];
+    unsigned char *u_buffer[4];
+    unsigned char *v_buffer[4];
 
-    vpx_memset(&best_mbmode, 0, sizeof(best_mbmode)); // clean
+    vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
 
-    cpi->mbs_tested_so_far++;          // Count of the number of MBs tested so far this frame
+    if (cpi->ref_frame_flags & VP8_LAST_FLAG)
+    {
+        YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
 
-    x->skip = 0;
+        vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &frame_nearest_mv[LAST_FRAME], &frame_near_mv[LAST_FRAME],
+                          &frame_best_ref_mv[LAST_FRAME], frame_mdcounts[LAST_FRAME], LAST_FRAME, cpi->common.ref_frame_sign_bias);
 
-    ref_frame_cost[INTRA_FRAME]   = vp8_cost_zero(cpi->prob_intra_coded);
+        y_buffer[LAST_FRAME] = lst_yv12->y_buffer + recon_yoffset;
+        u_buffer[LAST_FRAME] = lst_yv12->u_buffer + recon_uvoffset;
+        v_buffer[LAST_FRAME] = lst_yv12->v_buffer + recon_uvoffset;
+
+        frame_lf_or_gf[LAST_FRAME] = 0;
+    }
+
+    if (cpi->ref_frame_flags & VP8_GOLD_FLAG)
+    {
+        YV12_BUFFER_CONFIG *gld_yv12 = &cpi->common.yv12_fb[cpi->common.gld_fb_idx];
+
+        vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &frame_nearest_mv[GOLDEN_FRAME], &frame_near_mv[GOLDEN_FRAME],
+                          &frame_best_ref_mv[GOLDEN_FRAME], frame_mdcounts[GOLDEN_FRAME], GOLDEN_FRAME, cpi->common.ref_frame_sign_bias);
+
+        y_buffer[GOLDEN_FRAME] = gld_yv12->y_buffer + recon_yoffset;
+        u_buffer[GOLDEN_FRAME] = gld_yv12->u_buffer + recon_uvoffset;
+        v_buffer[GOLDEN_FRAME] = gld_yv12->v_buffer + recon_uvoffset;
+
+        frame_lf_or_gf[GOLDEN_FRAME] = 1;
+    }
 
-    // Experimental code
-    // Adjust the RD multiplier based on the best case distortion we saw in the most recently coded mb
-    //if ( (cpi->last_mb_distortion) > 0 && (cpi->target_bits_per_mb > 0) )
-    /*{
-        int tmprdmult;
+    if (cpi->ref_frame_flags & VP8_ALT_FLAG)
+    {
+        YV12_BUFFER_CONFIG *alt_yv12 = &cpi->common.yv12_fb[cpi->common.alt_fb_idx];
+
+        vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context, &frame_nearest_mv[ALTREF_FRAME], &frame_near_mv[ALTREF_FRAME],
+                          &frame_best_ref_mv[ALTREF_FRAME], frame_mdcounts[ALTREF_FRAME], ALTREF_FRAME, cpi->common.ref_frame_sign_bias);
 
-        //tmprdmult = (cpi->last_mb_distortion * 256) / ((cpi->av_per_frame_bandwidth*256)/cpi->common.MBs);
-        tmprdmult = (cpi->last_mb_distortion * 256) / cpi->target_bits_per_mb;
-        //tmprdmult = tmprdmult;
+        y_buffer[ALTREF_FRAME] = alt_yv12->y_buffer + recon_yoffset;
+        u_buffer[ALTREF_FRAME] = alt_yv12->u_buffer + recon_uvoffset;
+        v_buffer[ALTREF_FRAME] = alt_yv12->v_buffer + recon_uvoffset;
 
-        //if ( tmprdmult > cpi->RDMULT * 2 )
-        //  tmprdmult = cpi->RDMULT * 2;
-        //else if ( tmprdmult < cpi->RDMULT / 2 )
-        //  tmprdmult = cpi->RDMULT / 2;
+        frame_lf_or_gf[ALTREF_FRAME] = 1;
+    }
 
-        //tmprdmult = (tmprdmult < 25) ? 25 : tmprdmult;
+    *returnintra = INT_MAX;
+    cpi->mbs_tested_so_far++;          // Count of the number of MBs tested so far this frame
 
-        //x->rdmult = tmprdmult;
+    x->skip = 0;
 
-    }*/
+    ref_frame_cost[INTRA_FRAME]   = vp8_cost_zero(cpi->prob_intra_coded);
 
     // Special case treatment when GF and ARF are not sensible options for reference
     if (cpi->ref_frame_flags == VP8_LAST_FLAG)
@@ -1820,12 +1818,6 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
     x->e_mbd.mode_info_context->mbmi.ref_frame = INTRA_FRAME;
     vp8_rd_pick_intra_mbuv_mode(cpi, x, &uv_intra_rate, &uv_intra_rate_tokenonly, &uv_intra_distortion);
     uv_intra_mode = x->e_mbd.mode_info_context->mbmi.uv_mode;
-    {
-        uvintra_eob = 0;
-
-        for (i = 16; i < 24; i++)
-            uvintra_eob += x->e_mbd.block[i].eob;
-    }
 
     for (mode_index = 0; mode_index < MAX_MODES; mode_index++)
     {
@@ -1847,8 +1839,6 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
         if (best_rd <= cpi->rd_threshes[mode_index])
             continue;
 
-
-
         // These variables hold are rolling total cost and distortion for this mode
         rate2 = 0;
         distortion2 = 0;
@@ -1859,65 +1849,28 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
         x->e_mbd.mode_info_context->mbmi.uv_mode = DC_PRED;
         x->e_mbd.mode_info_context->mbmi.ref_frame = vp8_ref_frame_order[mode_index];
 
-        //Only consider ZEROMV/ALTREF_FRAME for alt ref frame.
-        if (cpi->is_src_frame_alt_ref)
+        // Only consider ZEROMV/ALTREF_FRAME for alt ref frame,
+        // unless ARNR filtering is enabled in which case we want
+        // an unfiltered alternative
+        if (cpi->is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0))
         {
             if (this_mode != ZEROMV || x->e_mbd.mode_info_context->mbmi.ref_frame != ALTREF_FRAME)
                 continue;
         }
 
-        if (x->e_mbd.mode_info_context->mbmi.ref_frame == LAST_FRAME)
+        /* everything but intra */
+        if (x->e_mbd.mode_info_context->mbmi.ref_frame)
         {
-            YV12_BUFFER_CONFIG *lst_yv12 = &cpi->common.yv12_fb[cpi->common.lst_fb_idx];
-
-            if (!(cpi->ref_frame_flags & VP8_LAST_FLAG))
-                continue;
-
-            lf_or_gf = 0;  // Local last frame vs Golden frame flag
-
-            // Set up pointers for this macro block into the previous frame recon buffer
-            x->e_mbd.pre.y_buffer = lst_yv12->y_buffer + recon_yoffset;
-            x->e_mbd.pre.u_buffer = lst_yv12->u_buffer + recon_uvoffset;
-            x->e_mbd.pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset;
-        }
-        else if (x->e_mbd.mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
-        {
-            YV12_BUFFER_CONFIG *gld_yv12 = &cpi->common.yv12_fb[cpi->common.gld_fb_idx];
-
-            // not supposed to reference gold frame
-            if (!(cpi->ref_frame_flags & VP8_GOLD_FLAG))
-                continue;
-
-            lf_or_gf = 1;  // Local last frame vs Golden frame flag
-
-            // Set up pointers for this macro block into the previous frame recon buffer
-            x->e_mbd.pre.y_buffer = gld_yv12->y_buffer + recon_yoffset;
-            x->e_mbd.pre.u_buffer = gld_yv12->u_buffer + recon_uvoffset;
-            x->e_mbd.pre.v_buffer = gld_yv12->v_buffer + recon_uvoffset;
-        }
-        else if (x->e_mbd.mode_info_context->mbmi.ref_frame == ALTREF_FRAME)
-        {
-            YV12_BUFFER_CONFIG *alt_yv12 = &cpi->common.yv12_fb[cpi->common.alt_fb_idx];
-
-            // not supposed to reference alt ref frame
-            if (!(cpi->ref_frame_flags & VP8_ALT_FLAG))
-                continue;
-
-            //if ( !cpi->source_alt_ref_active )
-            //  continue;
-
-            lf_or_gf = 1;  // Local last frame vs Golden frame flag
-
-            // Set up pointers for this macro block into the previous frame recon buffer
-            x->e_mbd.pre.y_buffer = alt_yv12->y_buffer + recon_yoffset;
-            x->e_mbd.pre.u_buffer = alt_yv12->u_buffer + recon_uvoffset;
-            x->e_mbd.pre.v_buffer = alt_yv12->v_buffer + recon_uvoffset;
+            x->e_mbd.pre.y_buffer = y_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
+            x->e_mbd.pre.u_buffer = u_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
+            x->e_mbd.pre.v_buffer = v_buffer[x->e_mbd.mode_info_context->mbmi.ref_frame];
+            mode_mv[NEARESTMV] = frame_nearest_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
+            mode_mv[NEARMV] = frame_near_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
+            best_ref_mv = frame_best_ref_mv[x->e_mbd.mode_info_context->mbmi.ref_frame];
+            vpx_memcpy(mdcounts, frame_mdcounts[x->e_mbd.mode_info_context->mbmi.ref_frame], sizeof(mdcounts));
+            lf_or_gf = frame_lf_or_gf[x->e_mbd.mode_info_context->mbmi.ref_frame];
         }
 
-        vp8_find_near_mvs(&x->e_mbd,
-                          x->e_mbd.mode_info_context,
-                          &mode_mv[NEARESTMV], &mode_mv[NEARMV], &best_ref_mv,
-                          mdcounts, x->e_mbd.mode_info_context->mbmi.ref_frame, cpi->common.ref_frame_sign_bias);
 
         if(x->e_mbd.mode_info_context->mbmi.mode == NEWMV)
         {
@@ -1986,14 +1939,14 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
                         x->e_mbd.mode_info_context->mbmi.ref_frame, cpi->common.ref_frame_sign_bias, &sr, &near_sadidx[0]);
 
             /* adjust mvp to make sure it is within MV range */
-            if(mvp.row > best_ref_mv.row + MAX_POSSIBLE_MV)
-                mvp.row = best_ref_mv.row + MAX_POSSIBLE_MV;
-            else if(mvp.row < best_ref_mv.row - MAX_POSSIBLE_MV)
-                mvp.row = best_ref_mv.row - MAX_POSSIBLE_MV;
-            if(mvp.col > best_ref_mv.col + MAX_POSSIBLE_MV)
-                mvp.col = best_ref_mv.col + MAX_POSSIBLE_MV;
-            else if(mvp.col < best_ref_mv.col - MAX_POSSIBLE_MV)
-                mvp.col = best_ref_mv.col - MAX_POSSIBLE_MV;
+            if(mvp.row > best_ref_mv.row + MAX_FULL_PEL_VAL)
+                mvp.row = best_ref_mv.row + MAX_FULL_PEL_VAL;
+            else if(mvp.row < best_ref_mv.row - MAX_FULL_PEL_VAL)
+                mvp.row = best_ref_mv.row - MAX_FULL_PEL_VAL;
+            if(mvp.col > best_ref_mv.col + MAX_FULL_PEL_VAL)
+                mvp.col = best_ref_mv.col + MAX_FULL_PEL_VAL;
+            else if(mvp.col < best_ref_mv.col - MAX_FULL_PEL_VAL)
+                mvp.col = best_ref_mv.col - MAX_FULL_PEL_VAL;
         }
 
         // Check to see if the testing frequency for this mode is at its max
@@ -2125,10 +2078,10 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
                 int further_steps;
                 int n;
 
-                int col_min = (best_ref_mv.col - MAX_POSSIBLE_MV) >>3;
-                int col_max = (best_ref_mv.col + MAX_POSSIBLE_MV) >>3;
-                int row_min = (best_ref_mv.row - MAX_POSSIBLE_MV) >>3;
-                int row_max = (best_ref_mv.row + MAX_POSSIBLE_MV) >>3;
+                int col_min = (best_ref_mv.col - MAX_FULL_PEL_VAL) >>3;
+                int col_max = (best_ref_mv.col + MAX_FULL_PEL_VAL) >>3;
+                int row_min = (best_ref_mv.row - MAX_FULL_PEL_VAL) >>3;
+                int row_max = (best_ref_mv.row + MAX_FULL_PEL_VAL) >>3;
 
                 int tmp_col_min = x->mv_col_min;
                 int tmp_col_max = x->mv_col_max;
@@ -2339,8 +2292,8 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
                             distortion_uv = sse2;
 
                             disable_skip = 1;
-                            this_rd = RDFUNC(x->rdmult, x->rddiv, rate2,
-                                             distortion2, cpi->target_bits_per_mb);
+                            this_rd = RDCOST(x->rdmult, x->rddiv, rate2,
+                                             distortion2);
 
                             break;
                         }
@@ -2414,7 +2367,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
                 }
             }
             // Calculate the final RD estimate for this mode
-            this_rd = RDFUNC(x->rdmult, x->rddiv, rate2, distortion2, cpi->target_bits_per_mb);
+            this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
         }
 
         // Experimental debug code.
@@ -2442,8 +2395,8 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
             other_cost += ref_frame_cost[x->e_mbd.mode_info_context->mbmi.ref_frame];
 
             /* Calculate the final y RD estimate for this mode */
-            best_yrd = RDFUNC(x->rdmult, x->rddiv, (rate2-rate_uv-other_cost),
-                              (distortion2-distortion_uv), cpi->target_bits_per_mb);
+            best_yrd = RDCOST(x->rdmult, x->rddiv, (rate2-rate_uv-other_cost),
+                              (distortion2-distortion_uv));
 
             *returnrate = rate2;
             *returndistortion = distortion2;
diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index 2fffaa95f..f28daaff9 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -36,36 +36,9 @@
 
 #define ALT_REF_MC_ENABLED 1    // dis/enable MC in AltRef filtering
 #define ALT_REF_SUBPEL_ENABLED 1 // dis/enable subpel in MC AltRef filtering
-#define USE_FILTER_LUT 0         // use lookup table to improve filter
 
 #if VP8_TEMPORAL_ALT_REF
 
-#if USE_FILTER_LUT
-// for (strength = 0; strength <= 6; strength++) {
-//   for (delta = 0; delta <= 18; delta++) {
-//     float coeff = (3.0 * delta * delta) / pow(2, strength);
-//     printf("%3d", (int)roundf(coeff > 16 ? 0 : 16-coeff));
-//   }
-//   printf("\n");
-// }
-static int modifier_lut[7][19] =
-{
-    // Strength=0
-    {16, 13,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
-    // Strength=1
-    {16, 15, 10,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
-    // Strength=2
-    {16, 15, 13,  9,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
-    // Strength=3
-    {16, 16, 15, 13, 10,  7,  3,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0},
-    // Strength=4
-    {16, 16, 15, 14, 13, 11,  9,  7,  4,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0},
-    // Strength=5
-    {16, 16, 16, 15, 15, 14, 13, 11, 10,  8,  7,  5,  3,  0,  0,  0,  0,  0,  0},
-    // Strength=6
-    {16, 16, 16, 16, 15, 15, 14, 14, 13, 12, 11, 10,  9,  8,  7,  5,  4,  2,  1}
-};
-#endif
 static void vp8_temporal_filter_predictors_mb_c
 (
     MACROBLOCKD *x,
@@ -86,14 +59,11 @@ static void vp8_temporal_filter_predictors_mb_c
 
     if ((mv_row | mv_col) & 7)
     {
-//        vp8_sixtap_predict16x16_c(yptr, stride,
-//                                    mv_col & 7, mv_row & 7, &pred[0], 16);
         x->subpixel_predict16x16(yptr, stride,
                                     mv_col & 7, mv_row & 7, &pred[0], 16);
     }
     else
     {
-        //vp8_copy_mem16x16_c (yptr, stride, &pred[0], 16);
         RECON_INVOKE(&x->rtcd->recon, copy16x16)(yptr, stride, &pred[0], 16);
     }
 
@@ -127,17 +97,13 @@ void vp8_temporal_filter_apply_c
     int strength,
     int filter_weight,
     unsigned int *accumulator,
-    unsigned int *count
+    unsigned short *count
 )
 {
     int i, j, k;
     int modifier;
     int byte = 0;
 
-#if USE_FILTER_LUT
-    int *lut = modifier_lut[strength];
-#endif
-
     for (i = 0,k = 0; i < block_size; i++)
     {
         for (j = 0; j < block_size; j++, k++)
@@ -146,11 +112,10 @@ void vp8_temporal_filter_apply_c
             int src_byte = frame1[byte];
             int pixel_value = *frame2++;
 
-#if USE_FILTER_LUT
-            modifier = abs(src_byte-pixel_value);
-            modifier = modifier>18 ? 0 : lut[modifier];
-#else
             modifier   = src_byte - pixel_value;
+            // This is an integer approximation of:
+            // float coeff = (3.0 * modifier * modifier) / pow(2, strength);
+            // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
             modifier  *= modifier;
             modifier  *= 3;
             modifier  += 1 << (strength - 1);
@@ -160,7 +125,6 @@ void vp8_temporal_filter_apply_c
                 modifier = 16;
 
             modifier = 16 - modifier;
-#endif
             modifier *= filter_weight;
 
             count[k] += modifier;
@@ -326,17 +290,17 @@ static void vp8_temporal_filter_iterate_c
     int mb_col, mb_row;
     unsigned int filter_weight[MAX_LAG_BUFFERS];
     unsigned char *mm_ptr = cpi->fp_motion_map;
-    int cols = cpi->common.mb_cols;
-    int rows = cpi->common.mb_rows;
+    int mb_cols = cpi->common.mb_cols;
+    int mb_rows = cpi->common.mb_rows;
     int MBs  = cpi->common.MBs;
     int mb_y_offset = 0;
     int mb_uv_offset = 0;
-    unsigned int accumulator[384];
-    unsigned int count[384];
+    DECLARE_ALIGNED_ARRAY(16, unsigned int, accumulator, 16*16 + 8*8 + 8*8);
+    DECLARE_ALIGNED_ARRAY(16, unsigned short, count, 16*16 + 8*8 + 8*8);
     MACROBLOCKD *mbd = &cpi->mb.e_mbd;
     YV12_BUFFER_CONFIG *f = cpi->frames[alt_ref_index];
     unsigned char *dst1, *dst2;
-    DECLARE_ALIGNED(16, unsigned char,  predictor[384]);
+    DECLARE_ALIGNED_ARRAY(16, unsigned char,  predictor, 16*16 + 8*8 + 8*8);
 
     // Save input state
     unsigned char *y_buffer = mbd->pre.y_buffer;
@@ -350,7 +314,7 @@ static void vp8_temporal_filter_iterate_c
             filter_weight[frame] = 1;
     }
 
-    for (mb_row = 0; mb_row < rows; mb_row++)
+    for (mb_row = 0; mb_row < mb_rows; mb_row++)
     {
 #if ALT_REF_MC_ENABLED
         // Reduced search extent by 3 for 6-tap filter & smaller UMV border
@@ -359,14 +323,14 @@ static void vp8_temporal_filter_iterate_c
                                 + (VP8BORDERINPIXELS - 19);
 #endif
 
-        for (mb_col = 0; mb_col < cols; mb_col++)
+        for (mb_col = 0; mb_col < mb_cols; mb_col++)
         {
             int i, j, k, w;
             int weight_cap;
             int stride;
 
             vpx_memset(accumulator, 0, 384*sizeof(unsigned int));
-            vpx_memset(count, 0, 384*sizeof(unsigned int));
+            vpx_memset(count, 0, 384*sizeof(unsigned short));
 
 #if ALT_REF_MC_ENABLED
             // Reduced search extent by 3 for 6-tap filter & smaller UMV border
@@ -533,8 +497,8 @@ static void vp8_temporal_filter_iterate_c
             mb_uv_offset += 8;
         }
 
-        mb_y_offset += 16*f->y_stride-f->y_width;
-        mb_uv_offset += 8*f->uv_stride-f->uv_width;
+        mb_y_offset += 16*(f->y_stride-mb_cols);
+        mb_uv_offset += 8*(f->uv_stride-mb_cols);
     }
 
     // Restore input state
diff --git a/vp8/encoder/temporal_filter.h b/vp8/encoder/temporal_filter.h
index 7b8c21c04..740037a85 100644
--- a/vp8/encoder/temporal_filter.h
+++ b/vp8/encoder/temporal_filter.h
@@ -22,9 +22,13 @@
      int strength, \
      int filter_weight, \
      unsigned int *accumulator, \
-     unsigned int *count \
+     unsigned short *count \
     )
 
+#if ARCH_X86 || ARCH_X86_64
+#include "x86/temporal_filter_x86.h"
+#endif
+
 #ifndef vp8_temporal_filter_apply
 #define vp8_temporal_filter_apply vp8_temporal_filter_apply_c
 #endif
diff --git a/vp8/encoder/x86/temporal_filter_apply_sse2.asm b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
new file mode 100644
index 000000000..f2adcccba
--- /dev/null
+++ b/vp8/encoder/x86/temporal_filter_apply_sse2.asm
@@ -0,0 +1,207 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+; void vp8_temporal_filter_apply_sse2 | arg
+;  (unsigned char  *frame1,           |  0
+;   unsigned int    stride,           |  1
+;   unsigned char  *frame2,           |  2
+;   unsigned int    block_size,       |  3
+;   int             strength,         |  4
+;   int             filter_weight,    |  5
+;   unsigned int   *accumulator,      |  6
+;   unsigned short *count)            |  7
+global sym(vp8_temporal_filter_apply_sse2)
+sym(vp8_temporal_filter_apply_sse2):
+
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 8
+    SAVE_XMM
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ALIGN_STACK 16, rax
+    %define block_size    0
+    %define strength      16
+    %define filter_weight 32
+    %define rounding_bit  48
+    %define rbp_backup    64
+    %define stack_size    80
+    sub         rsp,           stack_size
+    mov         [rsp + rbp_backup], rbp
+    ; end prolog
+
+        mov         rdx,            arg(3)
+        mov         [rsp + block_size], rdx
+        movd        xmm6,            arg(4)
+        movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read
+
+        ; calculate the rounding bit outside the loop
+        ; 0x8000 >> (16 - strength)
+        mov         rdx,            16
+        sub         rdx,            arg(4) ; 16 - strength
+        movd        xmm4,           rdx    ; can't use rdx w/ shift
+        movdqa      xmm5,           [GLOBAL(_const_top_bit)]
+        psrlw       xmm5,           xmm4
+        movdqa      [rsp + rounding_bit], xmm5
+
+        mov         rsi,            arg(0) ; src/frame1
+        mov         rdx,            arg(2) ; predictor frame
+        mov         rdi,            arg(6) ; accumulator
+        mov         rax,            arg(7) ; count
+
+        ; dup the filter weight and store for later
+        movd        xmm0,           arg(5) ; filter_weight
+        pshuflw     xmm0,           xmm0, 0
+        punpcklwd   xmm0,           xmm0
+        movdqa      [rsp + filter_weight], xmm0
+
+        mov         rbp,            arg(1) ; stride
+        pxor        xmm7,           xmm7   ; zero for extraction
+
+        lea         rcx,            [rdx + 16*16*1]
+        cmp         dword ptr [rsp + block_size], 8
+        jne         temporal_filter_apply_load_16
+        lea         rcx,            [rdx + 8*8*1]
+
+temporal_filter_apply_load_8:
+        movq        xmm0,           [rsi]  ; first row
+        lea         rsi,            [rsi + rbp] ; += stride
+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
+        movq        xmm1,           [rsi]  ; second row
+        lea         rsi,            [rsi + rbp] ; += stride
+        punpcklbw   xmm1,           xmm7   ; src[ 8-15]
+        jmp         temporal_filter_apply_load_finished
+
+temporal_filter_apply_load_16:
+        movdqu      xmm0,           [rsi]  ; src (frame1)
+        lea         rsi,            [rsi + rbp] ; += stride
+        movdqa      xmm1,           xmm0
+        punpcklbw   xmm0,           xmm7   ; src[ 0- 7]
+        punpckhbw   xmm1,           xmm7   ; src[ 8-15]
+
+temporal_filter_apply_load_finished:
+        movdqa      xmm2,           [rdx]  ; predictor (frame2)
+        movdqa      xmm3,           xmm2
+        punpcklbw   xmm2,           xmm7   ; pred[ 0- 7]
+        punpckhbw   xmm3,           xmm7   ; pred[ 8-15]
+
+        ; modifier = src_byte - pixel_value
+        psubw       xmm0,           xmm2   ; src - pred[ 0- 7]
+        psubw       xmm1,           xmm3   ; src - pred[ 8-15]
+
+        ; modifier *= modifier
+        pmullw      xmm0,           xmm0   ; modifier[ 0- 7]^2
+        pmullw      xmm1,           xmm1   ; modifier[ 8-15]^2
+
+        ; modifier *= 3  (NOTE(review): 16-bit lanes wrap once |src - pred| >= 148, unlike the C int math - confirm acceptable)
+        pmullw      xmm0,           [GLOBAL(_const_3w)]
+        pmullw      xmm1,           [GLOBAL(_const_3w)]
+
+        ; modifier += 0x8000 >> (16 - strength)
+        paddw       xmm0,           [rsp + rounding_bit]
+        paddw       xmm1,           [rsp + rounding_bit]
+
+        ; modifier >>= strength
+        psrlw       xmm0,           [rsp + strength]
+        psrlw       xmm1,           [rsp + strength]
+
+        ; modifier = 16 - modifier
+        ; saturation takes care of modifier > 16
+        movdqa      xmm3,           [GLOBAL(_const_16w)]
+        movdqa      xmm2,           [GLOBAL(_const_16w)]
+        psubusw     xmm3,           xmm1
+        psubusw     xmm2,           xmm0
+
+        ; modifier *= filter_weight
+        pmullw      xmm2,           [rsp + filter_weight]
+        pmullw      xmm3,           [rsp + filter_weight]
+
+        ; count (unsigned short, so 16-bit adds are correct here)
+        movdqa      xmm4,           [rax]
+        movdqa      xmm5,           [rax+16]
+        ; += modifier
+        paddw       xmm4,           xmm2
+        paddw       xmm5,           xmm3
+        ; write back
+        movdqa      [rax],          xmm4
+        movdqa      [rax+16],       xmm5
+        lea         rax,            [rax + 16*2] ; count += 16*(sizeof(short))
+
+        ; load and extract the predictor up to shorts
+        pxor        xmm7,           xmm7
+        movdqa      xmm0,           [rdx]
+        lea         rdx,            [rdx + 16*1] ; pred += 16*(sizeof(char))
+        movdqa      xmm1,           xmm0
+        punpcklbw   xmm0,           xmm7   ; pred[ 0- 7]
+        punpckhbw   xmm1,           xmm7   ; pred[ 8-15]
+
+        ; modifier *= pixel_value
+        pmullw      xmm0,           xmm2
+        pmullw      xmm1,           xmm3
+
+        ; expand to double words
+        movdqa      xmm2,           xmm0
+        punpcklwd   xmm0,           xmm7   ; [ 0- 3]
+        punpckhwd   xmm2,           xmm7   ; [ 4- 7]
+        movdqa      xmm3,           xmm1
+        punpcklwd   xmm1,           xmm7   ; [ 8-11]
+        punpckhwd   xmm3,           xmm7   ; [12-15]
+
+        ; accumulator (unsigned int: four 32-bit lanes per register)
+        movdqa      xmm4,           [rdi]
+        movdqa      xmm5,           [rdi+16]
+        movdqa      xmm6,           [rdi+32]
+        movdqa      xmm7,           [rdi+48]
+        ; += modifier * pixel_value, as dword adds so carries out of bit 15 are kept
+        paddd       xmm4,           xmm0
+        paddd       xmm5,           xmm2
+        paddd       xmm6,           xmm1
+        paddd       xmm7,           xmm3
+        ; write back
+        movdqa      [rdi],          xmm4
+        movdqa      [rdi+16],       xmm5
+        movdqa      [rdi+32],       xmm6
+        movdqa      [rdi+48],       xmm7
+        lea         rdi,            [rdi + 16*4] ; accumulator += 16*(sizeof(int))
+
+        cmp         rdx,            rcx
+        je          temporal_filter_apply_epilog
+        pxor        xmm7,           xmm7   ; zero for extraction
+        cmp         dword ptr [rsp + block_size], 16
+        je          temporal_filter_apply_load_16
+        jmp         temporal_filter_apply_load_8
+
+temporal_filter_apply_epilog:
+    ; begin epilog
+    mov         rbp,            [rsp + rbp_backup]
+    add         rsp,            stack_size
+    pop         rsp
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret
+
+SECTION_RODATA
+align 16
+_const_3w:
+    times 8 dw 3
+align 16
+_const_top_bit:
+    times 8 dw 1<<15
+align 16
+_const_16w:
+    times 8 dw 16
diff --git a/vp8/encoder/x86/temporal_filter_x86.h b/vp8/encoder/x86/temporal_filter_x86.h
new file mode 100644
index 000000000..2daa14018
--- /dev/null
+++ b/vp8/encoder/x86/temporal_filter_x86.h
@@ -0,0 +1,27 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef __INC_VP8_TEMPORAL_FILTER_X86_H
+#define __INC_VP8_TEMPORAL_FILTER_X86_H
+
+#if HAVE_SSE2
+extern prototype_apply(vp8_temporal_filter_apply_sse2);
+
+#if !CONFIG_RUNTIME_CPU_DETECT
+// No runtime CPU detection: rebind the generic name to the SSE2 version at compile time.
+#undef  vp8_temporal_filter_apply
+#define vp8_temporal_filter_apply vp8_temporal_filter_apply_sse2
+
+#endif // !CONFIG_RUNTIME_CPU_DETECT
+
+#endif // HAVE_SSE2
+
+#endif // __INC_VP8_TEMPORAL_FILTER_X86_H
diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c
index 6e317e2a2..f9b3ea1d8 100644
--- a/vp8/encoder/x86/x86_csystemdependent.c
+++ b/vp8/encoder/x86/x86_csystemdependent.c
@@ -309,6 +309,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
 
         /*cpi->rtcd.quantize.quantb            = vp8_regular_quantize_b_sse2;*/
         cpi->rtcd.quantize.fastquantb            = vp8_fast_quantize_b_sse2;
+
+        cpi->rtcd.temporal.apply                 = vp8_temporal_filter_apply_sse2;
     }
 #endif
 
@@ -321,8 +323,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.variance.sad8x16x3             = vp8_sad8x16x3_sse3;
         cpi->rtcd.variance.sad8x8x3              = vp8_sad8x8x3_sse3;
         cpi->rtcd.variance.sad4x4x3              = vp8_sad4x4x3_sse3;
+#if !(CONFIG_REALTIME_ONLY)
         cpi->rtcd.search.full_search             = vp8_full_search_sadx3;
-
+#endif
         cpi->rtcd.variance.sad16x16x4d           = vp8_sad16x16x4d_sse3;
         cpi->rtcd.variance.sad16x8x4d            = vp8_sad16x8x4d_sse3;
         cpi->rtcd.variance.sad8x16x4d            = vp8_sad8x16x4d_sse3;
@@ -351,7 +354,9 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi)
         cpi->rtcd.variance.sad8x16x8             = vp8_sad8x16x8_sse4;
         cpi->rtcd.variance.sad8x8x8              = vp8_sad8x8x8_sse4;
         cpi->rtcd.variance.sad4x4x8              = vp8_sad4x4x8_sse4;
+#if !(CONFIG_REALTIME_ONLY)
         cpi->rtcd.search.full_search             = vp8_full_search_sadx8;
+#endif
     }
 #endif
 
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index f95920775..903c56c88 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -38,6 +38,7 @@ struct vp8_extracfg
     unsigned int                arnr_strength;    /* alt_ref Noise Reduction Strength */
     unsigned int                arnr_type;        /* alt_ref filter type */
     vp8e_tuning                 tuning;
+    unsigned int                cq_level;         /* constrained quality level */
 
 };
 
@@ -69,6 +70,7 @@ static const struct extraconfig_map extracfg_map[] =
             3,                          /* arnr_strength */
             3,                          /* arnr_type*/
             0,                          /* tuning*/
+            10,                         /* cq_level */
         }
     }
 };
@@ -148,7 +150,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
 #else
     RANGE_CHECK_HI(cfg, g_lag_in_frames,    0);
 #endif
-    RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_CBR);
+    RANGE_CHECK(cfg, rc_end_usage,          VPX_VBR, VPX_CQ);
     RANGE_CHECK_HI(cfg, rc_undershoot_pct,  100);
     RANGE_CHECK_HI(cfg, rc_2pass_vbr_bias_pct, 100);
     RANGE_CHECK(cfg, kf_mode,               VPX_KF_DISABLED, VPX_KF_AUTO);
@@ -190,7 +192,9 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
     RANGE_CHECK(vp8_cfg, arnr_max_frames, 0, 15);
     RANGE_CHECK_HI(vp8_cfg, arnr_strength,   6);
     RANGE_CHECK(vp8_cfg, arnr_type,       1, 3);
+    RANGE_CHECK(vp8_cfg, cq_level, 0, 63);
 
+#if !(CONFIG_REALTIME_ONLY)
     if (cfg->g_pass == VPX_RC_LAST_PASS)
     {
         int              mb_r = (cfg->g_h + 15) / 16;
@@ -214,6 +218,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
         if ((int)(stats->count + 0.5) != n_packets - 1)
             ERROR("rc_twopass_stats_in missing EOS stats packet");
     }
+#endif
 
     return VPX_CODEC_OK;
 }
@@ -298,11 +303,16 @@ static vpx_codec_err_t set_vp8e_config(VP8_CONFIG *oxcf,
     {
         oxcf->end_usage          = USAGE_STREAM_FROM_SERVER;
     }
+    else if (cfg.rc_end_usage == VPX_CQ)
+    {
+        oxcf->end_usage          = USAGE_CONSTRAINED_QUALITY;
+    }
 
     oxcf->target_bandwidth       = cfg.rc_target_bitrate;
 
     oxcf->best_allowed_q          = cfg.rc_min_quantizer;
     oxcf->worst_allowed_q         = cfg.rc_max_quantizer;
+    oxcf->cq_level                = vp8_cfg.cq_level;
     oxcf->fixed_q = -1;
 
     oxcf->under_shoot_pct         = cfg.rc_undershoot_pct;
@@ -453,6 +463,7 @@ static vpx_codec_err_t set_param(vpx_codec_alg_priv_t *ctx,
         MAP(VP8E_SET_ARNR_STRENGTH ,        xcfg.arnr_strength);
         MAP(VP8E_SET_ARNR_TYPE     ,        xcfg.arnr_type);
         MAP(VP8E_SET_TUNING,                xcfg.tuning);
+        MAP(VP8E_SET_CQ_LEVEL,              xcfg.cq_level);
 
     }
 
@@ -1034,6 +1045,7 @@ static vpx_codec_ctrl_fn_map_t vp8e_ctf_maps[] =
     {VP8E_SET_ARNR_STRENGTH ,           set_param},
     {VP8E_SET_ARNR_TYPE     ,           set_param},
     {VP8E_SET_TUNING,                   set_param},
+    {VP8E_SET_CQ_LEVEL,                 set_param},
     { -1, NULL},
 };
 
@@ -1069,7 +1081,6 @@ static vpx_codec_enc_cfg_map_t vp8e_usage_cfg_map[] =
 
         4,                  /* rc_min_quantizer */
         63,                 /* rc_max_quantizer */
-
         95,                 /* rc_undershoot_pct */
         200,                /* rc_overshoot_pct */
 
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index 683d785e6..932f145e6 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -94,6 +94,7 @@ VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/dct_x86.h
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/mcomp_x86.h
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/variance_x86.h
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_x86.h
+VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/temporal_filter_x86.h
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/x86_csystemdependent.c
 VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_mmx.c
 VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/variance_impl_mmx.asm
@@ -107,6 +108,7 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/sad_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
+VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE3) += encoder/x86/sad_sse3.asm
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/sad_ssse3.asm
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm