10 files changed, 428 insertions, 40 deletions
diff --git a/vp9/common/ppc/vp9_idct_vsx.c b/vp9/common/ppc/vp9_idct_vsx.c
new file mode 100644
index 000000000..1b2a93edb
--- /dev/null
+++ b/vp9/common/ppc/vp9_idct_vsx.c
@@ -0,0 +1,115 @@
+/*
+ *  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/ppc/inv_txfm_vsx.h"
+#include "vpx_dsp/ppc/bitdepth_conversion_vsx.h"
+
+#include "vp9/common/vp9_enums.h"
+
+void vp9_iht4x4_16_add_vsx(const tran_low_t *input, uint8_t *dest, int stride,
+                           int tx_type) {
+  int16x8_t in[2], out[2];
+
+  in[0] = load_tran_low(0, input);
+  in[1] = load_tran_low(8 * sizeof(*input), input);
+
+  switch (tx_type) {
+    case DCT_DCT:
+      vpx_idct4_vsx(in, out);
+      vpx_idct4_vsx(out, in);
+      break;
+    case ADST_DCT:
+      vpx_idct4_vsx(in, out);
+      vp9_iadst4_vsx(out, in);
+      break;
+    case DCT_ADST:
+      vp9_iadst4_vsx(in, out);
+      vpx_idct4_vsx(out, in);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      vp9_iadst4_vsx(in, out);
+      vp9_iadst4_vsx(out, in);
+      break;
+  }
+
+  vpx_round_store4x4_vsx(in, out, dest, stride);
+}
+
+void vp9_iht8x8_64_add_vsx(const tran_low_t *input, uint8_t *dest, int stride,
+                           int tx_type) {
+  int16x8_t in[8], out[8];
+
+  // load input data
+  in[0] = load_tran_low(0, input);
+  in[1] = load_tran_low(8 * sizeof(*input), input);
+  in[2] = load_tran_low(2 * 8 * sizeof(*input), input);
+  in[3] = load_tran_low(3 * 8 * sizeof(*input), input);
+  in[4] = load_tran_low(4 * 8 * sizeof(*input), input);
+  in[5] = load_tran_low(5 * 8 * sizeof(*input), input);
+  in[6] = load_tran_low(6 * 8 * sizeof(*input), input);
+  in[7] = load_tran_low(7 * 8 * sizeof(*input), input);
+
+  switch (tx_type) {
+    case DCT_DCT:
+      vpx_idct8_vsx(in, out);
+      vpx_idct8_vsx(out, in);
+      break;
+    case ADST_DCT:
+      vpx_idct8_vsx(in, out);
+      vp9_iadst8_vsx(out, in);
+      break;
+    case DCT_ADST:
+      vp9_iadst8_vsx(in, out);
+      vpx_idct8_vsx(out, in);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      vp9_iadst8_vsx(in, out);
+      vp9_iadst8_vsx(out, in);
+      break;
+  }
+
+  vpx_round_store8x8_vsx(in, dest, stride);
+}
+
+void vp9_iht16x16_256_add_vsx(const tran_low_t *input, uint8_t *dest,
+                              int stride, int tx_type) {
+  int16x8_t in0[16], in1[16];
+
+  LOAD_INPUT16(load_tran_low, input, 0, 8 * sizeof(*input), in0);
+  LOAD_INPUT16(load_tran_low, input, 8 * 8 * 2 * sizeof(*input),
+               8 * sizeof(*input), in1);
+
+  switch (tx_type) {
+    case DCT_DCT:
+      vpx_idct16_vsx(in0, in1);
+      vpx_idct16_vsx(in0, in1);
+      break;
+    case ADST_DCT:
+      vpx_idct16_vsx(in0, in1);
+      vpx_iadst16_vsx(in0, in1);
+      break;
+    case DCT_ADST:
+      vpx_iadst16_vsx(in0, in1);
+      vpx_idct16_vsx(in0, in1);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      vpx_iadst16_vsx(in0, in1);
+      vpx_iadst16_vsx(in0, in1);
+      break;
+  }
+
+  vpx_round_store16x16_vsx(in0, in1, dest, stride);
+}
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 8f5b0bf30..6d7f95260 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -67,9 +67,9 @@ add_proto qw/void vp9_iht16x16_256_add/, "const tran_low_t *input, uint8_t *outp
 if (vpx_config("CONFIG_EMULATE_HARDWARE") ne "yes") {
   # Note that there are more specializations appended when
   # CONFIG_VP9_HIGHBITDEPTH is off.
-  specialize qw/vp9_iht4x4_16_add neon sse2/;
-  specialize qw/vp9_iht8x8_64_add neon sse2/;
-  specialize qw/vp9_iht16x16_256_add neon sse2/;
+  specialize qw/vp9_iht4x4_16_add neon sse2 vsx/;
+  specialize qw/vp9_iht8x8_64_add neon sse2 vsx/;
+  specialize qw/vp9_iht16x16_256_add neon sse2 vsx/;
   if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
     # Note that these specializations are appended to the above ones.
     specialize qw/vp9_iht4x4_16_add dspr2 msa/;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 383f7a8d7..f6fcd9d33 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1967,6 +1967,8 @@ static void rd_pick_sb_modes(VP9_COMP *cpi, TileDataEnc *tile_data,
     vp9_caq_select_segment(cpi, x, bsize, mi_row, mi_col, rd_cost->rate);
   }
 
+  rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv, rd_cost->rate, rd_cost->dist);
+
   x->rdmult = orig_rdmult;
 
   // TODO(jingning) The rate-distortion optimization flow needs to be
@@ -3317,20 +3319,73 @@ static int ml_pruning_partition(VP9_COMMON *const cm, MACROBLOCKD *const xd,
 }
 
 #define FEATURES 4
-static const float partition_breakout_weights_64[FEATURES + 1] = {
-  -0.016673f, -0.001025f, -0.000032f, 0.000833f, 1.94261885f - 2.1f,
+#define Q_CTX 2
+static const float partition_breakout_weights_64[Q_CTX][FEATURES + 1] = {
+  {
+      -0.016673f,
+      -0.001025f,
+      -0.000032f,
+      0.000833f,
+      1.94261885f - 2.1f,
+  },
+  {
+      -0.160867f,
+      -0.002101f,
+      0.000011f,
+      0.002448f,
+      1.65738142f - 2.5f,
+  },
 };
 
-static const float partition_breakout_weights_32[FEATURES + 1] = {
-  -0.010554f, -0.003081f, -0.000134f, 0.004491f, 1.68445992f - 3.5f,
+static const float partition_breakout_weights_32[Q_CTX][FEATURES + 1] = {
+  {
+      -0.010554f,
+      -0.003081f,
+      -0.000134f,
+      0.004491f,
+      1.68445992f - 3.5f,
+  },
+  {
+      -0.051489f,
+      -0.007609f,
+      0.000016f,
+      0.009792f,
+      1.28089404f - 2.5f,
+  },
 };
 
-static const float partition_breakout_weights_16[FEATURES + 1] = {
-  -0.013154f, -0.002404f, -0.000977f, 0.008450f, 2.57404566f - 5.5f,
+static const float partition_breakout_weights_16[Q_CTX][FEATURES + 1] = {
+  {
+      -0.013154f,
+      -0.002404f,
+      -0.000977f,
+      0.008450f,
+      2.57404566f - 5.5f,
+  },
+  {
+      -0.019146f,
+      -0.004018f,
+      0.000064f,
+      0.008187f,
+      2.15043926f - 2.5f,
+  },
 };
 
-static const float partition_breakout_weights_8[FEATURES + 1] = {
-  -0.011807f, -0.009873f, -0.000931f, 0.034768f, 1.32254851f - 2.0f,
+static const float partition_breakout_weights_8[Q_CTX][FEATURES + 1] = {
+  {
+      -0.011807f,
+      -0.009873f,
+      -0.000931f,
+      0.034768f,
+      1.32254851f - 2.0f,
+  },
+  {
+      -0.003861f,
+      -0.002701f,
+      0.000100f,
+      0.013876f,
+      1.96755111f - 1.5f,
+  },
 };
 
 // ML-based partition search breakout.
@@ -3338,22 +3393,30 @@ static int ml_predict_breakout(const VP9_COMP *const cpi, BLOCK_SIZE bsize,
                                const MACROBLOCK *const x,
                                const RD_COST *const rd_cost) {
   DECLARE_ALIGNED(16, static const uint8_t, vp9_64_zeros[64]) = { 0 };
+  const VP9_COMMON *const cm = &cpi->common;
   float features[FEATURES];
   const float *linear_weights = NULL;  // Linear model weights.
   float linear_score = 0.0f;
+  const int qindex = cm->base_qindex;
+  const int q_ctx = qindex >= 200 ? 0 : 1;
 
   switch (bsize) {
-    case BLOCK_64X64: linear_weights = partition_breakout_weights_64; break;
-    case BLOCK_32X32: linear_weights = partition_breakout_weights_32; break;
-    case BLOCK_16X16: linear_weights = partition_breakout_weights_16; break;
-    case BLOCK_8X8: linear_weights = partition_breakout_weights_8; break;
+    case BLOCK_64X64:
+      linear_weights = partition_breakout_weights_64[q_ctx];
+      break;
+    case BLOCK_32X32:
+      linear_weights = partition_breakout_weights_32[q_ctx];
+      break;
+    case BLOCK_16X16:
+      linear_weights = partition_breakout_weights_16[q_ctx];
+      break;
+    case BLOCK_8X8: linear_weights = partition_breakout_weights_8[q_ctx]; break;
     default: assert(0 && "Unexpected block size."); return 0;
   }
   if (!linear_weights) return 0;
 
   {  // Generate feature values.
-    const VP9_COMMON *const cm = &cpi->common;
-    const int ac_q = vp9_ac_quant(cm->base_qindex, 0, cm->bit_depth);
+    const int ac_q = vp9_ac_quant(qindex, 0, cm->bit_depth);
     const int num_pels_log2 = num_pels_log2_lookup[bsize];
     int feature_index = 0;
     unsigned int var, sse;
@@ -3385,9 +3448,10 @@ static int ml_predict_breakout(const VP9_COMP *const cpi, BLOCK_SIZE bsize,
       linear_score += linear_weights[i] * features[i];
   }
 
-  return linear_score >= 0;
+  return linear_score >= cpi->sf.ml_partition_search_breakout_thresh[q_ctx];
 }
 #undef FEATURES
+#undef Q_CTX
 
 // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
 // unlikely to be selected depending on previous rate-distortion optimization
@@ -3559,8 +3623,6 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
                      best_rdc.rdcost);
     if (this_rdc.rate != INT_MAX) {
       if (bsize >= BLOCK_8X8) {
-        this_rdc.rdcost =
-            RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist);
         this_rdc.rdcost += RDCOST(x->rdmult, x->rddiv,
                                   cpi->partition_cost[pl][PARTITION_NONE], 0);
         this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
@@ -3579,7 +3641,7 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
           if (!x->e_mbd.lossless && ctx->skippable) {
             int use_ml_based_breakout =
                 cpi->sf.use_ml_partition_search_breakout &&
-                cm->base_qindex >= 200;
+                cm->base_qindex >= 150;
 #if CONFIG_VP9_HIGHBITDEPTH
             if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
               use_ml_based_breakout = 0;
@@ -3714,7 +3776,6 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost && i == 4) {
-      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
       sum_rdc.rdcost += RDCOST(x->rdmult, x->rddiv,
                                cpi->partition_cost[pl][PARTITION_SPLIT], 0);
       sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
@@ -3777,7 +3838,6 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
-      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
       sum_rdc.rdcost += RDCOST(x->rdmult, x->rddiv,
                                cpi->partition_cost[pl][PARTITION_HORZ], 0);
       sum_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ];
@@ -3827,7 +3887,6 @@ static void rd_pick_partition(VP9_COMP *cpi, ThreadData *td,
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
-      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
       sum_rdc.rdcost += RDCOST(x->rdmult, x->rddiv,
                                cpi->partition_cost[pl][PARTITION_VERT], 0);
       sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT];
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 6ec7a5ee8..74e0d85a5 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -2952,7 +2952,7 @@ static int recode_loop_test(VP9_COMP *cpi, int high_limit, int low_limit, int q,
   return force_recode;
 }
 
-void vp9_update_reference_frames(VP9_COMP *cpi) {
+void update_ref_frames(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   BufferPool *const pool = cm->buffer_pool;
 
@@ -3016,6 +3016,14 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
              cpi->interp_filter_selected[0],
              sizeof(cpi->interp_filter_selected[0]));
   }
+}
+
+void vp9_update_reference_frames(VP9_COMP *cpi) {
+  VP9_COMMON *const cm = &cpi->common;
+  BufferPool *const pool = cm->buffer_pool;
+
+  update_ref_frames(cpi);
+
 #if CONFIG_VP9_TEMPORAL_DENOISING
   if (cpi->oxcf.noise_sensitivity > 0 && denoise_svc(cpi) &&
       cpi->denoiser.denoising_level > kDenLowLow) {
@@ -3054,6 +3062,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
         denoise_svc_second_layer);
   }
 #endif
+
   if (is_one_pass_cbr_svc(cpi)) {
     // Keep track of frame index for each reference frame.
     SVC *const svc = &cpi->svc;
@@ -5670,6 +5679,15 @@ void setup_tpl_stats(VP9_COMP *cpi) {
   int tpl_group_frames = 0;
   int frame_idx;
 
+  // TODO(jingning): Make the model support high bit-depth route.
+#if CONFIG_VP9_HIGHBITDEPTH
+  (void)gf_picture;
+  (void)gf_group;
+  (void)tpl_group_frames;
+  (void)frame_idx;
+  return;
+#endif
+
   init_gop_frames(cpi, gf_picture, gf_group, &tpl_group_frames);
 
   init_tpl_stats(cpi);
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index a3d39266f..ec02a78ee 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -758,9 +758,7 @@ typedef struct VP9_COMP {
   int num_extra_arfs;
   int arf_pos_in_gf[MAX_EXT_ARFS + 1];
   int arf_pos_for_ovrly[MAX_EXT_ARFS + 1];
-
   int extra_arf_allowed;
-  int bwd_ref_allowed;
 
   vpx_roi_map_t roi;
 } VP9_COMP;
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index c13576343..6717d961d 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -2135,7 +2135,7 @@ static void define_gf_multi_arf_structure(VP9_COMP *cpi) {
   // (3) The bi-predictive group interval is strictly smaller than the
   //     golden group interval.
   const int is_bipred_enabled =
-      cpi->bwd_ref_allowed && rc->source_alt_ref_pending &&
+      cpi->extra_arf_allowed && rc->source_alt_ref_pending &&
       rc->bipred_group_interval &&
       rc->bipred_group_interval <=
           (rc->baseline_gf_interval - rc->source_alt_ref_pending);
@@ -2439,6 +2439,151 @@ static void define_gf_group_structure(VP9_COMP *cpi) {
   cpi->multi_arf_last_grp_enabled = cpi->multi_arf_enabled;
 }
 
+static void allocate_gf_multi_arf_bits(VP9_COMP *cpi, int64_t gf_group_bits,
+                                       int gf_arf_bits) {
+  VP9EncoderConfig *const oxcf = &cpi->oxcf;
+  RATE_CONTROL *const rc = &cpi->rc;
+  TWO_PASS *const twopass = &cpi->twopass;
+  GF_GROUP *const gf_group = &twopass->gf_group;
+  FIRSTPASS_STATS frame_stats;
+  int i;
+  int frame_index = 0;
+  int target_frame_size;
+  int key_frame;
+  const int max_bits = frame_max_bits(&cpi->rc, oxcf);
+  int64_t total_group_bits = gf_group_bits;
+  int normal_frames;
+  int normal_frame_bits;
+  int last_frame_reduction = 0;
+  double av_score = 1.0;
+  double tot_norm_frame_score = 1.0;
+  double this_frame_score = 1.0;
+
+  // Define the GF structure and specify
+  define_gf_multi_arf_structure(cpi);
+
+  //========================================
+
+  key_frame = cpi->common.frame_type == KEY_FRAME;
+
+  // For key frames the frame target rate is already set and it
+  // is also the golden frame.
+  // === [frame_index == 0] ===
+  if (!key_frame) {
+    gf_group->bit_allocation[frame_index] =
+        rc->source_alt_ref_active ? 0 : gf_arf_bits;
+  }
+
+  // Deduct the boost bits for arf (or gf if it is not a key frame)
+  // from the group total.
+  if (rc->source_alt_ref_pending || !key_frame) total_group_bits -= gf_arf_bits;
+
+  ++frame_index;
+
+  // === [frame_index == 1] ===
+  // Store the bits to spend on the ARF if there is one.
+  if (rc->source_alt_ref_pending) {
+    gf_group->bit_allocation[frame_index] = gf_arf_bits;
+
+    ++frame_index;
+
+    // Skip all the extra-ARF's right after ARF at the starting segment of
+    // the current GF group.
+    if (cpi->num_extra_arfs) {
+      while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
+        ++frame_index;
+    }
+  }
+
+  normal_frames = (rc->baseline_gf_interval - rc->source_alt_ref_pending);
+  if (normal_frames > 1)
+    normal_frame_bits = (int)(total_group_bits / normal_frames);
+  else
+    normal_frame_bits = (int)total_group_bits;
+
+  if (oxcf->vbr_corpus_complexity) {
+    av_score = get_distribution_av_err(cpi, twopass);
+    tot_norm_frame_score = calculate_group_score(cpi, av_score, normal_frames);
+  }
+
+  // Allocate bits to the other frames in the group.
+  for (i = 0; i < normal_frames; ++i) {
+    if (EOF == input_stats(twopass, &frame_stats)) break;
+
+    if (oxcf->vbr_corpus_complexity) {
+      this_frame_score = calculate_norm_frame_score(cpi, twopass, oxcf,
+                                                    &frame_stats, av_score);
+      normal_frame_bits = (int)((double)total_group_bits *
+                                (this_frame_score / tot_norm_frame_score));
+    }
+
+    target_frame_size = normal_frame_bits;
+    if ((i == (normal_frames - 1)) && (i >= 1)) {
+      last_frame_reduction = normal_frame_bits / 16;
+      target_frame_size -= last_frame_reduction;
+    }
+
+    // TODO(zoeliu): Further check whether following is needed for
+    //               hierarchical GF group structure.
+    if (rc->source_alt_ref_pending && cpi->multi_arf_enabled) {
+      target_frame_size -= (target_frame_size >> 4);
+    }
+
+    target_frame_size =
+        clamp(target_frame_size, 0, VPXMIN(max_bits, (int)total_group_bits));
+
+    if (gf_group->update_type[frame_index] == BRF_UPDATE) {
+      // Boost up the allocated bits on BWDREF_FRAME
+      gf_group->bit_allocation[frame_index] =
+          target_frame_size + (target_frame_size >> 2);
+    } else if (gf_group->update_type[frame_index] == LAST_BIPRED_UPDATE) {
+      // Press down the allocated bits on LAST_BIPRED_UPDATE frames
+      gf_group->bit_allocation[frame_index] =
+          target_frame_size - (target_frame_size >> 1);
+    } else if (gf_group->update_type[frame_index] == BIPRED_UPDATE) {
+      // TODO(zoeliu): Investigate whether the allocated bits on BIPRED_UPDATE
+      //               frames need to be further adjusted.
+      gf_group->bit_allocation[frame_index] = target_frame_size;
+    } else {
+      assert(gf_group->update_type[frame_index] == LF_UPDATE ||
+             gf_group->update_type[frame_index] == INTNL_OVERLAY_UPDATE);
+      gf_group->bit_allocation[frame_index] = target_frame_size;
+    }
+
+    ++frame_index;
+
+    // Skip all the extra-ARF's.
+    if (cpi->num_extra_arfs) {
+      while (gf_group->update_type[frame_index] == INTNL_ARF_UPDATE)
+        ++frame_index;
+    }
+  }
+
+  // NOTE: We need to configure the frame at the end of the sequence + 1 that
+  //       will be the start frame for the next group. Otherwise prior to the
+  //       call to av1_rc_get_second_pass_params() the data will be undefined.
+  if (rc->source_alt_ref_pending) {
+    if (cpi->num_extra_arfs) {
+      // NOTE: For bit allocation, move the allocated bits associated with
+      //       INTNL_OVERLAY_UPDATE to the corresponding INTNL_ARF_UPDATE.
+      //       i > 0 for extra-ARF's and i == 0 for ARF:
+      //         arf_pos_for_ovrly[i]: Position for INTNL_OVERLAY_UPDATE
+      //         arf_pos_in_gf[i]: Position for INTNL_ARF_UPDATE
+      for (i = cpi->num_extra_arfs; i > 0; --i) {
+        assert(gf_group->update_type[cpi->arf_pos_for_ovrly[i]] ==
+               INTNL_OVERLAY_UPDATE);
+
+        // Encoder's choice:
+        //   Set show_existing_frame == 1 for all extra-ARF's, and hence
+        //   allocate zero bit for both all internal OVERLAY frames.
+        gf_group->bit_allocation[cpi->arf_pos_in_gf[i]] =
+            gf_group->bit_allocation[cpi->arf_pos_for_ovrly[i]];
+        gf_group->bit_allocation[cpi->arf_pos_for_ovrly[i]] = 0;
+      }
+    }
+  }
+}
+
 static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
                                    int gf_arf_bits) {
   VP9EncoderConfig *const oxcf = &cpi->oxcf;
@@ -2462,17 +2607,7 @@ static void allocate_gf_group_bits(VP9_COMP *cpi, int64_t gf_group_bits,
   double this_frame_score = 1.0;
 
   // Define the GF structure and specify
-  cpi->bwd_ref_allowed = 0;
-  cpi->extra_arf_allowed = 0;
-
-  cpi->num_extra_arfs = 0;
-  cpi->num_extra_arfs = cpi->extra_arf_allowed ? cpi->num_extra_arfs : 0;
-
-  if (cpi->bwd_ref_allowed) {
-    define_gf_multi_arf_structure(cpi);
-  } else {
-    define_gf_group_structure(cpi);
-  }
+  define_gf_group_structure(cpi);
 
   key_frame = cpi->common.frame_type == KEY_FRAME;
 
@@ -2620,6 +2755,8 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   const int is_key_frame = frame_is_intra_only(cm);
   const int arf_active_or_kf = is_key_frame || rc->source_alt_ref_active;
 
+  int disable_bwd_extarf;
+
   // Reset the GF group data structures unless this is a key
   // frame in which case it will already have been done.
   if (is_key_frame == 0) {
@@ -2800,6 +2937,39 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
 
   rc->frames_till_gf_update_due = rc->baseline_gf_interval;
 
+  // TODO(zoeliu): Turn on the option to disable extra ALTREFs for still GF
+  //               groups.
+  // Disable extra altrefs for "still" gf group:
+  //   zero_motion_accumulator: minimum percentage of (0,0) motion;
+  //   avg_sr_coded_error:      average of the SSE per pixel of each frame;
+  //   avg_raw_err_stdev:       average of the standard deviation of (0,0)
+  //                            motion error per block of each frame.
+#if 0
+  assert(num_mbs > 0);
+  disable_bwd_extarf =
+      (zero_motion_accumulator > MIN_ZERO_MOTION &&
+       avg_sr_coded_error / num_mbs < MAX_SR_CODED_ERROR &&
+       avg_raw_err_stdev < MAX_RAW_ERR_VAR);
+#else
+  disable_bwd_extarf = 0;
+#endif  // 0
+
+  if (disable_bwd_extarf) cpi->extra_arf_allowed = 0;
+
+  if (!cpi->extra_arf_allowed) {
+    cpi->num_extra_arfs = 0;
+  } else {
+    // Compute how many extra alt_refs we can have
+    cpi->num_extra_arfs = get_number_of_extra_arfs(rc->baseline_gf_interval,
+                                                   rc->source_alt_ref_pending);
+  }
+  // Currently at maximum two extra ARFs' are allowed
+  assert(cpi->num_extra_arfs <= MAX_EXT_ARFS);
+
+  rc->bipred_group_interval = BFG_INTERVAL;
+  // The minimum bi-predictive frame group interval is 2.
+  if (rc->bipred_group_interval < 2) rc->bipred_group_interval = 0;
+
   // Reset the file position.
   reset_fpf_position(twopass, start_pos);
 
@@ -2851,7 +3021,11 @@ static void define_gf_group(VP9_COMP *cpi, FIRSTPASS_STATS *this_frame) {
   twopass->kf_group_error_left -= gf_group_err;
 
   // Allocate bits to each of the frames in the GF group.
-  allocate_gf_group_bits(cpi, gf_group_bits, gf_arf_bits);
+  if (cpi->extra_arf_allowed) {
+    allocate_gf_multi_arf_bits(cpi, gf_group_bits, gf_arf_bits);
+  } else {
+    allocate_gf_group_bits(cpi, gf_group_bits, gf_arf_bits);
+  }
 
   // Reset the file position.
   reset_fpf_position(twopass, start_pos);
diff --git a/vp9/encoder/vp9_firstpass.h b/vp9/encoder/vp9_firstpass.h
index 958dc128d..404175d92 100644
--- a/vp9/encoder/vp9_firstpass.h
+++ b/vp9/encoder/vp9_firstpass.h
@@ -11,6 +11,8 @@
 #ifndef VP9_ENCODER_VP9_FIRSTPASS_H_
 #define VP9_ENCODER_VP9_FIRSTPASS_H_
 
+#include <assert.h>
+
 #include "vp9/encoder/vp9_lookahead.h"
 #include "vp9/encoder/vp9_ratectrl.h"
 
@@ -41,7 +43,12 @@ typedef struct {
 
 #define INVALID_ROW -1
 
+// Length of the bi-predictive frame group (BFG)
+// NOTE: Currently each BFG contains one backward ref (BWF) frame plus a certain
+//       number of bi-predictive frames.
+#define BFG_INTERVAL 2
 #define MAX_EXT_ARFS 2
+#define MIN_EXT_ARF_INTERVAL 4
 
 typedef struct {
   double frame_mb_intra_factor;
@@ -210,6 +217,17 @@ void vp9_twopass_postencode_update(struct VP9_COMP *cpi);
 void calculate_coded_size(struct VP9_COMP *cpi, int *scaled_frame_width,
                           int *scaled_frame_height);
 
+static INLINE int get_number_of_extra_arfs(int interval, int arf_pending) {
+  assert(MAX_EXT_ARFS > 0);
+  if (arf_pending) {
+    if (interval >= MIN_EXT_ARF_INTERVAL * (MAX_EXT_ARFS + 1))
+      return MAX_EXT_ARFS;
+    else if (interval >= MIN_EXT_ARF_INTERVAL * MAX_EXT_ARFS)
+      return MAX_EXT_ARFS - 1;
+  }
+  return 0;
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 254c4e2b1..7a02623dc 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -83,6 +83,8 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
       sf->disable_split_mask = DISABLE_COMPOUND_SPLIT;
       sf->partition_search_breakout_thr.dist = (1 << 21);
       sf->use_ml_partition_search_breakout = 1;
+      sf->ml_partition_search_breakout_thresh[0] = 0.0f;
+      sf->ml_partition_search_breakout_thresh[1] = 0.0f;
     }
   }
 
@@ -97,6 +99,8 @@ static void set_good_speed_feature_framesize_dependent(VP9_COMP *cpi,
       sf->disable_split_mask = LAST_AND_INTRA_SPLIT_ONLY;
       sf->partition_search_breakout_thr.dist = (1 << 22);
       sf->partition_search_breakout_thr.rate = 100;
+      sf->ml_partition_search_breakout_thresh[0] = 0.0f;
+      sf->ml_partition_search_breakout_thresh[1] = -1.0f;
     }
     sf->rd_auto_partition_min_limit = set_partition_min_limit(cm);
 
diff --git a/vp9/encoder/vp9_speed_features.h b/vp9/encoder/vp9_speed_features.h
index eede9cbe2..7a9b3a622 100644
--- a/vp9/encoder/vp9_speed_features.h
+++ b/vp9/encoder/vp9_speed_features.h
@@ -472,6 +472,7 @@ typedef struct SPEED_FEATURES {
 
   // Use ML-based partition search early breakout.
   int use_ml_partition_search_breakout;
+  float ml_partition_search_breakout_thresh[2];
 
   // Machine-learning based partition search early termination
   int ml_partition_search_early_termination;
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index d40d3c445..7ca4004b0 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -68,6 +68,7 @@ VP9_COMMON_SRCS-$(HAVE_MSA)   += common/mips/msa/vp9_idct4x4_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA)   += common/mips/msa/vp9_idct8x8_msa.c
 VP9_COMMON_SRCS-$(HAVE_MSA)   += common/mips/msa/vp9_idct16x16_msa.c
 VP9_COMMON_SRCS-$(HAVE_SSE2)  += common/x86/vp9_idct_intrin_sse2.c
+VP9_COMMON_SRCS-$(HAVE_VSX)   += common/ppc/vp9_idct_vsx.c
 VP9_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp9_iht4x4_add_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp9_iht8x8_add_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON)  += common/arm/neon/vp9_iht16x16_add_neon.c