Diffstat (limited to 'vp9')
-rw-r--r--  vp9/common/vp9_onyxc_int.h        |   2
-rw-r--r--  vp9/common/vp9_thread_common.c    | 147
-rw-r--r--  vp9/common/vp9_thread_common.h    |  18
-rw-r--r--  vp9/decoder/vp9_decodeframe.c     |  43
-rw-r--r--  vp9/decoder/vp9_decoder.h         |   2
-rw-r--r--  vp9/encoder/vp9_temporal_filter.c |  31
6 files changed, 220 insertions, 23 deletions
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index 1d96d92c2..e053e2ee0 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -256,6 +256,8 @@ typedef struct VP9Common {
PARTITION_CONTEXT *above_seg_context;
ENTROPY_CONTEXT *above_context;
int above_context_alloc_cols;
+
+ int lf_row;
} VP9_COMMON;
static INLINE YV12_BUFFER_CONFIG *get_ref_frame(VP9_COMMON *cm, int index) {
diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c
index e0f5e0d83..dc9aa405a 100644
--- a/vp9/common/vp9_thread_common.c
+++ b/vp9/common/vp9_thread_common.c
@@ -229,6 +229,26 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, VP9_COMMON *cm,
workers, num_workers, lf_sync);
}
+void vp9_lpf_mt_init(VP9LfSync *lf_sync, VP9_COMMON *cm, int frame_filter_level,
+ int num_workers) {
+ const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+
+ if (!frame_filter_level) return;
+
+ if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
+ num_workers > lf_sync->num_workers) {
+ vp9_loop_filter_dealloc(lf_sync);
+ vp9_loop_filter_alloc(lf_sync, cm, sb_rows, cm->width, num_workers);
+ }
+
+ // Initialize cur_sb_col to -1 for all SB rows.
+ memset(lf_sync->cur_sb_col, -1, sizeof(*lf_sync->cur_sb_col) * sb_rows);
+
+ memset(lf_sync->num_tiles_done, 0,
+ sizeof(*lf_sync->num_tiles_done) * sb_rows);
+ cm->lf_row = 0;
+}
+
// Set up nsync by width.
static INLINE int get_sync_range(int width) {
// nsync numbers are picked by testing. For example, for 4k
@@ -266,6 +286,25 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
pthread_cond_init(&lf_sync->cond_[i], NULL);
}
}
+ pthread_mutex_init(&lf_sync->lf_mutex, NULL);
+
+ CHECK_MEM_ERROR(cm, lf_sync->recon_done_mutex,
+ vpx_malloc(sizeof(*lf_sync->recon_done_mutex) * rows));
+ if (lf_sync->recon_done_mutex) {
+ int i;
+ for (i = 0; i < rows; ++i) {
+ pthread_mutex_init(&lf_sync->recon_done_mutex[i], NULL);
+ }
+ }
+
+ CHECK_MEM_ERROR(cm, lf_sync->recon_done_cond,
+ vpx_malloc(sizeof(*lf_sync->recon_done_cond) * rows));
+ if (lf_sync->recon_done_cond) {
+ int i;
+ for (i = 0; i < rows; ++i) {
+ pthread_cond_init(&lf_sync->recon_done_cond[i], NULL);
+ }
+ }
}
#endif // CONFIG_MULTITHREAD
@@ -276,6 +315,11 @@ void vp9_loop_filter_alloc(VP9LfSync *lf_sync, VP9_COMMON *cm, int rows,
CHECK_MEM_ERROR(cm, lf_sync->cur_sb_col,
vpx_malloc(sizeof(*lf_sync->cur_sb_col) * rows));
+ CHECK_MEM_ERROR(cm, lf_sync->num_tiles_done,
+ vpx_malloc(sizeof(*lf_sync->num_tiles_done) *
+ mi_cols_aligned_to_sb(cm->mi_rows) >>
+ MI_BLOCK_SIZE_LOG2));
+
// Set up nsync.
lf_sync->sync_range = get_sync_range(width);
}
@@ -298,15 +342,118 @@ void vp9_loop_filter_dealloc(VP9LfSync *lf_sync) {
}
vpx_free(lf_sync->cond_);
}
+ if (lf_sync->recon_done_mutex != NULL) {
+ int i;
+ for (i = 0; i < lf_sync->rows; ++i) {
+ pthread_mutex_destroy(&lf_sync->recon_done_mutex[i]);
+ }
+ vpx_free(lf_sync->recon_done_mutex);
+ }
+
+ pthread_mutex_destroy(&lf_sync->lf_mutex);
+ if (lf_sync->recon_done_cond != NULL) {
+ int i;
+ for (i = 0; i < lf_sync->rows; ++i) {
+ pthread_cond_destroy(&lf_sync->recon_done_cond[i]);
+ }
+ vpx_free(lf_sync->recon_done_cond);
+ }
#endif // CONFIG_MULTITHREAD
+
vpx_free(lf_sync->lfdata);
vpx_free(lf_sync->cur_sb_col);
+ vpx_free(lf_sync->num_tiles_done);
// clear the structure as the source of this call may be a resize in which
// case this call will be followed by an _alloc() which may fail.
vp9_zero(*lf_sync);
}
}
+static int get_next_row(VP9_COMMON *cm, VP9LfSync *lf_sync) {
+ int return_val = -1;
+ int cur_row;
+ const int max_rows = cm->mi_rows;
+
+#if CONFIG_MULTITHREAD
+ const int tile_cols = 1 << cm->log2_tile_cols;
+
+ pthread_mutex_lock(&lf_sync->lf_mutex);
+ if (cm->lf_row < max_rows) {
+ cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2;
+ return_val = cm->lf_row;
+ cm->lf_row += MI_BLOCK_SIZE;
+ if (cm->lf_row < max_rows) {
+ /* If this is not the last row, make sure the next row is also decoded.
+ * This is because the intra predict has to happen before loop filter */
+ cur_row += 1;
+ }
+ }
+ pthread_mutex_unlock(&lf_sync->lf_mutex);
+
+ if (return_val == -1) return return_val;
+
+ pthread_mutex_lock(&lf_sync->recon_done_mutex[cur_row]);
+ if (lf_sync->num_tiles_done[cur_row] < tile_cols) {
+ pthread_cond_wait(&lf_sync->recon_done_cond[cur_row],
+ &lf_sync->recon_done_mutex[cur_row]);
+ }
+ pthread_mutex_unlock(&lf_sync->recon_done_mutex[cur_row]);
+#else
+ (void)lf_sync;
+ if (cm->lf_row < max_rows) {
+ cur_row = cm->lf_row >> MI_BLOCK_SIZE_LOG2;
+ return_val = cm->lf_row;
+ cm->lf_row += MI_BLOCK_SIZE;
+ if (cm->lf_row < max_rows) {
+ /* If this is not the last row, make sure the next row is also decoded.
+ * This is because the intra predict has to happen before loop filter */
+ cur_row += 1;
+ }
+ }
+#endif // CONFIG_MULTITHREAD
+
+ return return_val;
+}
+
+void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync,
+ MACROBLOCKD *xd) {
+ int mi_row;
+ VP9_COMMON *cm = lf_data->cm;
+
+ while (!xd->corrupted && (mi_row = get_next_row(cm, lf_sync)) != -1 &&
+ mi_row < cm->mi_rows) {
+ lf_data->start = mi_row;
+ lf_data->stop = mi_row + MI_BLOCK_SIZE;
+
+ thread_loop_filter_rows(lf_data->frame_buffer, lf_data->cm, lf_data->planes,
+ lf_data->start, lf_data->stop, lf_data->y_only,
+ lf_sync);
+ }
+}
+
+void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row) {
+#if CONFIG_MULTITHREAD
+ pthread_mutex_lock(&lf_sync->recon_done_mutex[row]);
+ lf_sync->num_tiles_done[row] += 1;
+ if (num_tiles == lf_sync->num_tiles_done[row]) {
+ if (is_last_row) {
+ /* The last 2 rows wait on the last row to be done.
+ * So, we have to broadcast the signal in this case.
+ */
+ pthread_cond_broadcast(&lf_sync->recon_done_cond[row]);
+ } else {
+ pthread_cond_signal(&lf_sync->recon_done_cond[row]);
+ }
+ }
+ pthread_mutex_unlock(&lf_sync->recon_done_mutex[row]);
+#else
+ (void)lf_sync;
+ (void)num_tiles;
+ (void)row;
+ (void)is_last_row;
+#endif // CONFIG_MULTITHREAD
+}
+
// Accumulate frame counts.
void vp9_accumulate_frame_counts(FRAME_COUNTS *accum,
const FRAME_COUNTS *counts, int is_dec) {
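
Note: the get_next_row()/vp9_set_row() pair above is a per-superblock-row handshake between the tile decoders and the in-loop filter: each of the tile_cols tile workers bumps num_tiles_done[row] once it has reconstructed that SB row, and the filter side blocks on recon_done_cond[row] until the count reaches tile_cols. A minimal standalone sketch of the same pattern (hypothetical names, pthreads only, error handling omitted; not the patch's code):

    #include <pthread.h>

    typedef struct {
      pthread_mutex_t mutex;
      pthread_cond_t cond;
      int tiles_done; /* tile columns that have reconstructed this SB row */
      int tile_cols;  /* total tile columns in the frame */
    } row_gate;

    /* Tile worker: called after reconstructing one SB row of its tile. */
    static void row_recon_done(row_gate *gate, int is_last_row) {
      pthread_mutex_lock(&gate->mutex);
      if (++gate->tiles_done == gate->tile_cols) {
        /* The filter job for row N waits on row N+1, so the last recon row
         * can have two waiters (the last and second-to-last filter jobs);
         * hence broadcast there, signal everywhere else. */
        if (is_last_row)
          pthread_cond_broadcast(&gate->cond);
        else
          pthread_cond_signal(&gate->cond);
      }
      pthread_mutex_unlock(&gate->mutex);
    }

    /* Loopfilter worker: block until every tile column has reconstructed
     * the row this filter pass depends on. */
    static void wait_row_recon(row_gate *gate) {
      pthread_mutex_lock(&gate->mutex);
      while (gate->tiles_done < gate->tile_cols)
        pthread_cond_wait(&gate->cond, &gate->mutex);
      pthread_mutex_unlock(&gate->mutex);
    }

The sketch re-checks the predicate in a loop, the conventional guard against spurious wakeups.
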
diff --git a/vp9/common/vp9_thread_common.h b/vp9/common/vp9_thread_common.h
index 0f7c3ff74..09609f821 100644
--- a/vp9/common/vp9_thread_common.h
+++ b/vp9/common/vp9_thread_common.h
@@ -37,6 +37,13 @@ typedef struct VP9LfSyncData {
// Row-based parallel loopfilter data
LFWorkerData *lfdata;
int num_workers;
+
+#if CONFIG_MULTITHREAD
+ pthread_mutex_t lf_mutex;
+ pthread_mutex_t *recon_done_mutex;
+ pthread_cond_t *recon_done_cond;
+#endif
+ int *num_tiles_done;
} VP9LfSync;
// Allocate memory for loopfilter row synchronization.
@@ -53,6 +60,17 @@ void vp9_loop_filter_frame_mt(YV12_BUFFER_CONFIG *frame, struct VP9Common *cm,
int partial_frame, VPxWorker *workers,
int num_workers, VP9LfSync *lf_sync);
+// Multi-threaded loopfilter initialisations
+void vp9_lpf_mt_init(VP9LfSync *lf_sync, struct VP9Common *cm,
+ int frame_filter_level, int num_workers);
+
+void vp9_loopfilter_rows(LFWorkerData *lf_data, VP9LfSync *lf_sync,
+ MACROBLOCKD *xd);
+
+void vp9_set_row(VP9LfSync *lf_sync, int num_tiles, int row, int is_last_row);
+
+void vp9_set_last_decoded_row(struct VP9Common *cm, int tile_col, int mi_row);
+
void vp9_accumulate_frame_counts(struct FRAME_COUNTS *accum,
const struct FRAME_COUNTS *counts, int is_dec);
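
Note: the declarations above are exercised by the decoder in the next file in this order (condensed from the vp9_decodeframe.c hunks below; not a compilable unit on its own):

    /* once per frame, before the tile workers launch: */
    vp9_lpf_mt_init(lf_row_sync, cm, cm->lf.filter_level, pbi->num_tile_workers);

    /* in each tile worker, after reconstructing one SB row of a tile: */
    vp9_set_row(lf_sync, 1 << cm->log2_tile_cols, mi_row >> MI_BLOCK_SIZE_LOG2,
                is_last_row);

    /* in each tile worker, once all of its tile columns are decoded: */
    vp9_loopfilter_rows(lf_data, lf_sync, &tile_data->xd);

vp9_set_last_decoded_row() is declared above but is neither defined nor called in the hunks shown here.
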
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 3c17c3dc3..b38f868fb 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -1487,6 +1487,11 @@ static int tile_worker_hook(void *arg1, void *arg2) {
TileInfo *volatile tile = &tile_data->xd.tile;
const int final_col = (1 << pbi->common.log2_tile_cols) - 1;
const uint8_t *volatile bit_reader_end = NULL;
+ VP9_COMMON *cm = &pbi->common;
+
+ LFWorkerData *lf_data = tile_data->lf_data;
+ VP9LfSync *lf_sync = tile_data->lf_sync;
+
volatile int n = tile_data->buf_start;
tile_data->error_info.setjmp = 1;
@@ -1519,6 +1524,13 @@ static int tile_worker_hook(void *arg1, void *arg2) {
mi_col += MI_BLOCK_SIZE) {
decode_partition(tile_data, pbi, mi_row, mi_col, BLOCK_64X64, 4);
}
+ if (cm->lf.filter_level && !cm->skip_loop_filter) {
+ const int aligned_rows = mi_cols_aligned_to_sb(cm->mi_rows);
+ const int sb_rows = (aligned_rows >> MI_BLOCK_SIZE_LOG2);
+ const int is_last_row = (sb_rows - 1 == mi_row >> MI_BLOCK_SIZE_LOG2);
+ vp9_set_row(lf_sync, 1 << cm->log2_tile_cols,
+ mi_row >> MI_BLOCK_SIZE_LOG2, is_last_row);
+ }
}
if (buf->col == final_col) {
@@ -1526,6 +1538,11 @@ static int tile_worker_hook(void *arg1, void *arg2) {
}
} while (!tile_data->xd.corrupted && ++n <= tile_data->buf_end);
+ if (!tile_data->xd.corrupted && cm->lf.filter_level &&
+ !cm->skip_loop_filter) {
+ vp9_loopfilter_rows(lf_data, lf_sync, &tile_data->xd);
+ }
+
tile_data->data_end = bit_reader_end;
return !tile_data->xd.corrupted;
}
@@ -1542,6 +1559,8 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data,
VP9_COMMON *const cm = &pbi->common;
const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
const uint8_t *bit_reader_end = NULL;
+ VP9LfSync *lf_row_sync = &pbi->lf_row_sync;
+ YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
const int tile_cols = 1 << cm->log2_tile_cols;
const int tile_rows = 1 << cm->log2_tile_rows;
@@ -1568,12 +1587,26 @@ static const uint8_t *decode_tiles_mt(VP9Decoder *pbi, const uint8_t *data,
}
}
+ // Initialize LPF
+ if (cm->lf.filter_level && !cm->skip_loop_filter) {
+ vp9_lpf_mt_init(lf_row_sync, cm, cm->lf.filter_level,
+ pbi->num_tile_workers);
+ }
+
// Reset tile decoding hook
for (n = 0; n < num_workers; ++n) {
VPxWorker *const worker = &pbi->tile_workers[n];
TileWorkerData *const tile_data =
&pbi->tile_worker_data[n + pbi->total_tiles];
winterface->sync(worker);
+
+ if (cm->lf.filter_level && !cm->skip_loop_filter) {
+ tile_data->lf_sync = lf_row_sync;
+ tile_data->lf_data = &tile_data->lf_sync->lfdata[n];
+ vp9_loop_filter_data_reset(tile_data->lf_data, new_fb, cm, pbi->mb.plane);
+ tile_data->lf_data->y_only = 0;
+ }
+
tile_data->xd = pbi->mb;
tile_data->xd.counts =
cm->frame_parallel_decoding_mode ? NULL : &tile_data->counts;
@@ -2095,15 +2128,7 @@ void vp9_decode_frame(VP9Decoder *pbi, const uint8_t *data,
if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) {
// Multi-threaded tile decoder
*p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
- if (!xd->corrupted) {
- if (!cm->skip_loop_filter) {
- // If multiple threads are used to decode tiles, then we use those
- // threads to do parallel loopfiltering.
- vp9_loop_filter_frame_mt(new_fb, cm, pbi->mb.plane, cm->lf.filter_level,
- 0, 0, pbi->tile_workers, pbi->num_tile_workers,
- &pbi->lf_row_sync);
- }
- } else {
+ if (xd->corrupted) {
vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
"Decode failed. Frame data is corrupted.");
}
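
Note: a worked example of the is_last_row arithmetic in tile_worker_hook(), assuming a 1920x1080 frame (one mode-info unit covers 8x8 pixels and a 64x64 superblock spans MI_BLOCK_SIZE = 8 of them, so MI_BLOCK_SIZE_LOG2 = 3):

    cm->mi_rows  = 1080 / 8;                   /* = 135 MI rows             */
    aligned_rows = mi_cols_aligned_to_sb(135); /* rounds up to 136          */
    sb_rows      = 136 >> 3;                   /* = 17 superblock rows      */
    /* The mi_row loop advances in steps of MI_BLOCK_SIZE, so its final pass
     * uses mi_row = 128, and 128 >> 3 == 16 == sb_rows - 1: only that pass
     * sets is_last_row, which is what makes vp9_set_row() broadcast. */
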
diff --git a/vp9/decoder/vp9_decoder.h b/vp9/decoder/vp9_decoder.h
index 5f22c00cb..532f5f848 100644
--- a/vp9/decoder/vp9_decoder.h
+++ b/vp9/decoder/vp9_decoder.h
@@ -37,6 +37,8 @@ typedef struct TileWorkerData {
int buf_start, buf_end; // pbi->tile_buffers to decode, inclusive
vpx_reader bit_reader;
FRAME_COUNTS counts;
+ LFWorkerData *lf_data;
+ VP9LfSync *lf_sync;
DECLARE_ALIGNED(16, MACROBLOCKD, xd);
/* dqcoeff are shared by all the planes. So planes must be decoded serially */
DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index ee4c1d059..5bb0f04f9 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c
@@ -134,7 +134,8 @@ static void apply_temporal_filter(
// non-local mean approach
int diff_sse[9] = { 0 };
- int idx, idy, index = 0;
+ int idx, idy;
+ int y_index = 0;
const int uv_r = i >> ss_y;
const int uv_c = j >> ss_x;
@@ -150,13 +151,13 @@ static void apply_temporal_filter(
col < (int)block_width) {
const int diff = y_frame1[row * (int)y_stride + col] -
y_pred[row * (int)block_width + col];
- diff_sse[index] = diff * diff;
- ++index;
+ diff_sse[y_index] = diff * diff;
+ ++y_index;
}
}
}
- assert(index > 0);
+ assert(y_index > 0);
modifier = 0;
for (idx = 0; idx < 9; ++idx) modifier += diff_sse[idx];
@@ -169,9 +170,10 @@ static void apply_temporal_filter(
v_pred[uv_r * uv_buf_stride + uv_c];
modifier += diff * diff;
- index += 2;
+ y_index += 2;
- modifier = mod_index(modifier, index, rounding, strength, filter_weight);
+ modifier =
+ mod_index(modifier, y_index, rounding, strength, filter_weight);
y_count[k] += modifier;
y_accumulator[k] += modifier * pixel_value;
@@ -186,7 +188,8 @@ static void apply_temporal_filter(
// non-local mean approach
int u_diff_sse[9] = { 0 };
int v_diff_sse[9] = { 0 };
- int idx, idy, index = 0;
+ int idx, idy;
+ int cr_index = 0;
int u_mod = 0, v_mod = 0;
int y_diff = 0;
@@ -199,18 +202,18 @@ static void apply_temporal_filter(
col < uv_block_width) {
int diff = u_frame1[row * uv_stride + col] -
u_pred[row * uv_buf_stride + col];
- u_diff_sse[index] = diff * diff;
+ u_diff_sse[cr_index] = diff * diff;
diff = v_frame1[row * uv_stride + col] -
v_pred[row * uv_buf_stride + col];
- v_diff_sse[index] = diff * diff;
+ v_diff_sse[cr_index] = diff * diff;
- ++index;
+ ++cr_index;
}
}
}
- assert(index > 0);
+ assert(cr_index > 0);
for (idx = 0; idx < 9; ++idx) {
u_mod += u_diff_sse[idx];
@@ -224,15 +227,15 @@ static void apply_temporal_filter(
const int diff = y_frame1[row * (int)y_stride + col] -
y_pred[row * (int)block_width + col];
y_diff += diff * diff;
- ++index;
+ ++cr_index;
}
}
u_mod += y_diff;
v_mod += y_diff;
- u_mod = mod_index(u_mod, index, rounding, strength, filter_weight);
- v_mod = mod_index(v_mod, index, rounding, strength, filter_weight);
+ u_mod = mod_index(u_mod, cr_index, rounding, strength, filter_weight);
+ v_mod = mod_index(v_mod, cr_index, rounding, strength, filter_weight);
u_count[m] += u_mod;
u_accumulator[m] += u_mod * u_pixel_value;
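
Note: in the vp9_temporal_filter.c hunks above, the renamed counters (y_index for the luma pass, cr_index for the chroma pass) record how many squared differences were actually accumulated; that count is passed to mod_index() together with the accumulated error so it can be normalized per contributing sample (the asserts guard against a zero count). A rough, purely illustrative normalization in the same spirit (hypothetical helper, not libvpx's mod_index, whose exact rounding and clamping live elsewhere in this file):

    /* Hypothetical sketch: average the accumulated squared error over the
     * number of contributing samples, then map small error -> large weight. */
    static int weight_from_sse(int sse_sum, int count, int strength,
                               int filter_weight) {
      const int avg = (sse_sum + count / 2) / count; /* per-sample error      */
      const int attenuated = avg >> strength;        /* larger strength keeps
                                                        more of the filter    */
      const int w = attenuated >= 16 ? 0 : 16 - attenuated;
      return w * filter_weight;                      /* 16 = full strength    */
    }
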