From 3c18a2bb2e5f6cde8189643345e33a1c27189ff8 Mon Sep 17 00:00:00 2001
From: Scott LaVarnway <slavarnway@google.com>
Date: Wed, 26 Jan 2011 16:42:56 -0500
Subject: Performance improvement of first pass

Improved the performance of the first pass only (~6% on a 720p
test clip) by using a lookup table (LUT) instead of the float
calculations.  Might try a SIMD version later.
Also started to make use of int_mv instead of MV.

Change-Id: If2a217c7d6b59cd2c25c5553e0ca7e0502403af8
---
 vp8/encoder/firstpass.c | 99 ++++++++++++++++++++++++++++++-------------------
 1 file changed, 61 insertions(+), 38 deletions(-)

diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 06e26be85..fc6f043c3 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -8,7 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #include "math.h"
 #include "limits.h"
 #include "block.h"
@@ -178,40 +177,68 @@ static double calculate_modified_err(VP8_COMP *cpi, FIRSTPASS_STATS *this_frame)
     return modified_err;
 }
 
+static const double weight_table[256] = {
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000, 0.020000,
+0.020000, 0.031250, 0.062500, 0.093750, 0.125000, 0.156250, 0.187500, 0.218750,
+0.250000, 0.281250, 0.312500, 0.343750, 0.375000, 0.406250, 0.437500, 0.468750,
+0.500000, 0.531250, 0.562500, 0.593750, 0.625000, 0.656250, 0.687500, 0.718750,
+0.750000, 0.781250, 0.812500, 0.843750, 0.875000, 0.906250, 0.937500, 0.968750,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000,
+1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000, 1.000000
+};
+
 double vp8_simple_weight(YV12_BUFFER_CONFIG *source)
 {
     int i, j;
 
     unsigned char *src = source->y_buffer;
-    unsigned char value;
     double sum_weights = 0.0;
-    double Weight;
 
     // Loop throught the Y plane raw examining levels and creating a weight for the image
-    for (i = 0; i < source->y_height; i++)
+    i = source->y_height;
+    do
     {
-        for (j = 0; j < source->y_width; j++)
+        j = source->y_width;
+        do
         {
-            value = src[j];
-
-            if (value >= 64)
-                Weight = 1.0;
-            else if (value > 32)
-                Weight = (value - 32.0f) / 32.0f;
-            else
-                Weight = 0.02;
-
-            sum_weights += Weight;
-        }
-
+            sum_weights += weight_table[ *src];
+            src++;
+        }while(--j);
+        src -= source->y_width;
         src += source->y_stride;
-    }
+    }while(--i);
 
     sum_weights /= (source->y_height * source->y_width);
 
     return sum_weights;
 }
 
+
 // This function returns the current per frame maximum bitrate target
 int frame_max_bits(VP8_COMP *cpi)
 {
@@ -440,7 +467,6 @@ void vp8_end_first_pass(VP8_COMP *cpi)
     vp8_output_stats(cpi, cpi->output_pkt_list, cpi->total_stats);
 }
 
-
 void vp8_zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * recon_buffer, int * best_motion_err, int recon_yoffset )
 {
     MACROBLOCKD * const xd = & x->e_mbd;
@@ -460,7 +486,6 @@ void vp8_zz_motion_search( VP8_COMP *cpi, MACROBLOCK * x, YV12_BUFFER_CONFIG * r
     VARIANCE_INVOKE(IF_RTCD(&cpi->rtcd.variance), mse16x16) ( src_ptr, src_stride, ref_ptr, ref_stride, (unsigned int *)(best_motion_err));
 }
 
-
 void vp8_first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, MV *best_mv, YV12_BUFFER_CONFIG *recon_buffer, int *best_motion_err, int recon_yoffset )
 {
     MACROBLOCKD *const xd = & x->e_mbd;
@@ -548,7 +573,6 @@ void vp8_first_pass(VP8_COMP *cpi)
 
     int sum_in_vectors = 0;
 
-    MV best_ref_mv = {0, 0};
     MV zero_ref_mv = {0, 0};
 
     unsigned char *fp_motion_map_ptr = cpi->fp_motion_map;
@@ -586,13 +610,20 @@ void vp8_first_pass(VP8_COMP *cpi)
     // for each macroblock row in image
     for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
     {
-        MV best_ref_mv = {0, 0};
+        int_mv best_ref_mv;
+
+        best_ref_mv.as_int = 0;
 
         // reset above block coeffs
         xd->up_available = (mb_row != 0);
         recon_yoffset = (mb_row * recon_y_stride * 16);
         recon_uvoffset = (mb_row * recon_uv_stride * 8);
 
+        // Set up limit values for motion vectors to prevent them extending outside the UMV borders
+        x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
+        x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
+
+
         // for each macroblock col in image
         for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
         {
@@ -625,8 +656,6 @@ void vp8_first_pass(VP8_COMP *cpi)
             // Set up limit values for motion vectors to prevent them extending outside the UMV borders
             x->mv_col_min = -((mb_col * 16) + (VP8BORDERINPIXELS - 16));
             x->mv_col_max = ((cm->mb_cols - 1 - mb_col) * 16) + (VP8BORDERINPIXELS - 16);
-            x->mv_row_min = -((mb_row * 16) + (VP8BORDERINPIXELS - 16));
-            x->mv_row_max = ((cm->mb_rows - 1 - mb_row) * 16) + (VP8BORDERINPIXELS - 16);
 
             // Other than for the first frame do a motion search
             if (cm->current_video_frame > 0)
@@ -647,12 +676,12 @@ void vp8_first_pass(VP8_COMP *cpi)
 
                 // Test last reference frame using the previous best mv as the
                 // starting point (best reference) for the search
-                vp8_first_pass_motion_search(cpi, x, &best_ref_mv,
+                vp8_first_pass_motion_search(cpi, x, &best_ref_mv.as_mv,
                                         &d->bmi.mv.as_mv, lst_yv12,
                                         &motion_error, recon_yoffset);
 
                 // If the current best reference mv is not centred on 0,0 then do a 0,0 based search as well
-                if ((best_ref_mv.col != 0) || (best_ref_mv.row != 0))
+                if (best_ref_mv.as_int)
                 {
                    tmp_err = INT_MAX;
                    vp8_first_pass_motion_search(cpi, x, &zero_ref_mv, &tmp_mv,
@@ -664,7 +693,6 @@ void vp8_first_pass(VP8_COMP *cpi)
                         d->bmi.mv.as_mv.row = tmp_mv.row;
                         d->bmi.mv.as_mv.col = tmp_mv.col;
                    }
-
                 }
 
                 // Experimental search in a second reference frame ((0,0) based only)
@@ -693,6 +721,9 @@ void vp8_first_pass(VP8_COMP *cpi)
                     xd->pre.v_buffer = lst_yv12->v_buffer + recon_uvoffset;
                 }
 
+                /* Intra assumed best */
+                best_ref_mv.as_int = 0;
+
                 if (motion_error <= this_error)
                 {
                     d->bmi.mv.as_mv.row <<= 3;
@@ -708,13 +739,10 @@ void vp8_first_pass(VP8_COMP *cpi)
                     sum_mvcs += d->bmi.mv.as_mv.col * d->bmi.mv.as_mv.col;
                     intercount++;
 
-                    best_ref_mv.row = d->bmi.mv.as_mv.row;
-                    best_ref_mv.col = d->bmi.mv.as_mv.col;
-                    //best_ref_mv.row = 0;
-                    //best_ref_mv.col = 0;
+                    best_ref_mv.as_int = d->bmi.mv.as_int;
 
                     // Was the vector non-zero
-                    if (d->bmi.mv.as_mv.row || d->bmi.mv.as_mv.col)
+                    if (d->bmi.mv.as_int)
                     {
                         mvcount++;
 
@@ -770,12 +798,6 @@ void vp8_first_pass(VP8_COMP *cpi)
                             *fp_motion_map_ptr = 1;
                     }
                 }
-                else
-                {
-                    // Intra was best
-                    best_ref_mv.row = 0;
-                    best_ref_mv.col = 0;
-                }
             }
 
             coded_error += this_error;
@@ -813,6 +835,7 @@ void vp8_first_pass(VP8_COMP *cpi)
         fps.coded_error = coded_error >> 8;
         weight = vp8_simple_weight(cpi->Source);
 
+
         if (weight < 0.1)
             weight = 0.1;
 
-- 
cgit v1.2.3


From e9f513d74ae9cfc88f5423cb25bd65000bc32c0d Mon Sep 17 00:00:00 2001
From: Adrian Grange <agrange@google.com>
Date: Fri, 28 Jan 2011 14:47:36 +0000
Subject: Changed condition for using RD in Intra Mode

The condition for using RD when selecting the intra coding mode
for a MB is that the RD flag is set AND we're not in real-time
mode.

Previously the code used RD if either the RD flag was set OR
we were not using real-time mode.

Change-Id: Ic711151298468a3f99babad39ba8375f66d55a08
---
 vp8/encoder/encodeframe.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 1689b43d1..793191d24 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -1160,7 +1160,7 @@ int vp8cx_encode_intra_macro_block(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t)
 
 #if !(CONFIG_REALTIME_ONLY)
 
-    if (cpi->sf.RD || cpi->compressor_speed != 2)
+    if (cpi->sf.RD && cpi->compressor_speed != 2)
     {
         Error4x4 = vp8_rd_pick_intra4x4mby_modes(cpi, x, &rate4x4, &rate4x4_tokenonly, &dist4x4);
 
-- 
cgit v1.2.3


From 8f279596cbb7a6a3016fdc00624bc33ba36641bf Mon Sep 17 00:00:00 2001
From: Yaowu Xu <yaowu@google.com>
Date: Wed, 19 Jan 2011 16:21:01 -0800
Subject: change the threshold of DC check for encode breakout

Previously, the DC check made sure there was no code-able DC
shift for quantizer Q0, which has been verified to be rather
conservative. This commit changes the criteria to have two
components, DC and AC, to address the conservativeness. First,
it checks whether all the AC energy is enough to contribute a
single non-zero quantized AC coefficient. Second, for DC, the
decision to skip further considers two possible scenarios:
1. There is no code-able 2nd order DC coefficient at all;
2. The residue is relatively flat and the uniform DC change is
very small, i.e. less than 1/2 gray level per pixel.

Compared to the previous criteria, the new criteria are about
10% to 15% faster in encoding time with a very small quality
loss (threshold ~1000 and quality range 33dB-45dB).

It should be noted that this commit enables an "automatic" static
threshold for encode breakout if a small non-zero value is passed
in to the encoder.

Change-Id: I0f77719a1ac2c2dfddbd950d84920df374515ce3
---
 vp8/encoder/rdopt.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 36420aad1..fcff74778 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -2267,22 +2267,28 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
             else if (x->encode_breakout)
             {
                 int sum, sse;
+                int threshold = (xd->block[0].dequant[1]
+                            * xd->block[0].dequant[1] >>4);
+
+                if(threshold < x->encode_breakout)
+                    threshold = x->encode_breakout;
 
                 VARIANCE_INVOKE(&cpi->rtcd.variance, get16x16var)
                     (x->src.y_buffer, x->src.y_stride,
                      x->e_mbd.predictor, 16, (unsigned int *)(&sse), &sum);
 
-                if (sse < x->encode_breakout)
+                if (sse < threshold)
                 {
                     // Check u and v to make sure skip is ok
                     int sse2 = 0;
-
-                    // add dc check
-                    if (abs(sum) < (cpi->common.Y2dequant[0][0] << 2))
+                    /* If theres is no codeable 2nd order dc
+                       or a very small uniform pixel change change */
+                    if (abs(sum) < (xd->block[24].dequant[0]<<2)||
+                        ((sum * sum>>8) > sse && abs(sum) <128))
                     {
                         sse2 = VP8_UVSSE(x, IF_RTCD(&cpi->rtcd.variance));
 
-                        if (sse2 * 2 < x->encode_breakout)
+                        if (sse2 * 2 < threshold)
                         {
                             x->skip = 1;
                             distortion2 = sse + sse2;
@@ -2428,6 +2434,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
 
         if (x->skip)
             break;
+
     }
 
     // Reduce the activation RD thresholds for the best choice mode
-- 
cgit v1.2.3


From 2d03f073a729b74e8c92448dcfd7291f810b2fb1 Mon Sep 17 00:00:00 2001
From: John Koleszar <jkoleszar@google.com>
Date: Fri, 28 Jan 2011 11:56:18 -0500
Subject: validate min_q against max_q

min_q is required to be <= max_q.

Change-Id: I28eccf96df3b52a94913762b54c4fbe0d021ce5e
---
 vp8/vp8_cx_iface.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 903c56c88..b23bd951d 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -142,8 +142,8 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t      *ctx,
     RANGE_CHECK(cfg, g_timebase.den,        1, 1000000000);
     RANGE_CHECK(cfg, g_timebase.num,        1, cfg->g_timebase.den);
     RANGE_CHECK_HI(cfg, g_profile,          3);
-    RANGE_CHECK_HI(cfg, rc_min_quantizer,   63);
     RANGE_CHECK_HI(cfg, rc_max_quantizer,   63);
+    RANGE_CHECK_HI(cfg, rc_min_quantizer,   cfg->rc_max_quantizer);
     RANGE_CHECK_HI(cfg, g_threads,          64);
 #if !(CONFIG_REALTIME_ONLY)
     RANGE_CHECK_HI(cfg, g_lag_in_frames,    25);
-- 
cgit v1.2.3