Encoder denoiser performance improvement

The denoiser function was modified to reduce the computational complexity. 1. The denoiser c function modification: The original implementation calculated pixel's filter_coefficient based on the pixel value difference between current raw frame and last denoised raw frame, and stored them in lookup tables. For each pixel c, find its coefficient using filter_coefficient[c] = LUT[abs_diff[c]]; and then apply filtering operation for the pixel. The denoising filter costed about 12% of encoding time when it was turned on, and half of the time was spent on finding coefficients in lookup tables. In order to simplify the process, a short cut was taken. The pixel adjustments vs. pixel diff value were calculated ahead of time. adjustment = filtered_value - current_raw = (filter_coefficient * diff + 128) >> 8 The adjustment vs. diff curve becomes flat very quick when diff increases. This allowed us to use only several levels to get a close approximation of the curve. Following the denoiser algorithm, the adjustments are further modified according to how big the motion magnitude is. 2. The sse2 function was rewritten. This change made denoiser filter function 3x faster, and improved the encoder performance by 7% ~ 10% with the denoiser on. Change-Id: I93a4308963b8e80c7307f96ffa8b8c667425bf50
author: Yunqing Wang <yunqingwang@google.com> 2012-08-10 12:35:55 -0700
committer: Yunqing Wang <yunqingwang@google.com> 2012-08-31 13:48:13 -0700
commit: 64075c9b0129efcdba8c85d7519dbe385dbc56c5 (patch)
tree: 2228c9057962a580891cb4a2420399e1d06fda7b /vp8/encoder/denoising.c
parent: c533f2a43e102f6cd330834ffdb74575d8382bbf (diff)
download: libvpx-64075c9b0129efcdba8c85d7519dbe385dbc56c5.tar
libvpx-64075c9b0129efcdba8c85d7519dbe385dbc56c5.tar.gz
libvpx-64075c9b0129efcdba8c85d7519dbe385dbc56c5.tar.bz2
libvpx-64075c9b0129efcdba8c85d7519dbe385dbc56c5.zip
1 files changed, 75 insertions, 108 deletions
diff --git a/vp8/encoder/denoising.c b/vp8/encoder/denoising.c
index d6b03e63a..c0dd7c106 100644
--- a/vp8/encoder/denoising.c
+++ b/vp8/encoder/denoising.c
@@ -23,151 +23,118 @@ static const unsigned int SSE_DIFF_THRESHOLD = 16 * 16 * 20;
 static const unsigned int SSE_THRESHOLD = 16 * 16 * 40;
 
 /*
- * The filtering coefficients used for denoizing are adjusted for static
- * blocks, or blocks with very small motion vectors. This is done through
- * the motion magnitude parameter.
+ * The filter function was modified to reduce the computational complexity.
+ * Step 1:
+ * Instead of applying tap coefficients for each pixel, we calculated the
+ * pixel adjustments vs. pixel diff value ahead of time.
+ *     adjustment = filtered_value - current_raw
+ *                = (filter_coefficient * diff + 128) >> 8
+ * where
+ *     filter_coefficient = (255 << 8) / (256 + ((absdiff * 330) >> 3));
+ *     filter_coefficient += filter_coefficient /
+ *                           (3 + motion_magnitude_adjustment);
+ *     filter_coefficient is clamped to 0 ~ 255.
  *
- * There are currently 2048 possible mapping from absolute difference to
- * filter coefficient depending on the motion magnitude. Each mapping is
- * in a LUT table. All these tables are staticly allocated but they are only
- * filled on their first use.
- *
- * Each entry is a pair of 16b values, the coefficient and its complement
- * to 256. Each of these value should only be 8b but they are 16b wide to
- * avoid slow partial register manipulations.
+ * Step 2:
+ * The adjustment vs. diff curve becomes flat very quick when diff increases.
+ * This allowed us to use only several levels to approximate the curve without
+ * changing the filtering algorithm too much.
+ * The adjustments were further corrected by checking the motion magnitude.
+ * The levels used are:
+ * diff       adjustment w/o motion correction   adjustment w/ motion correction
+ * [-255, -16]           -6                                   -7
+ * [-15, -8]             -4                                   -5
+ * [-7, -4]              -3                                   -4
+ * [-3, 3]               diff                                 diff
+ * [4, 7]                 3                                    4
+ * [8, 15]                4                                    5
+ * [16, 255]              6                                    7
  */
-enum {num_motion_magnitude_adjustments = 2048};
-
-static union coeff_pair filter_coeff_LUT[num_motion_magnitude_adjustments][256];
-static uint8_t filter_coeff_LUT_initialized[num_motion_magnitude_adjustments] =
-    { 0 };
-
-
-union coeff_pair *vp8_get_filter_coeff_LUT(unsigned int motion_magnitude)
-{
-    union coeff_pair *LUT;
-    unsigned int motion_magnitude_adjustment = motion_magnitude >> 3;
-
-    if (motion_magnitude_adjustment >= num_motion_magnitude_adjustments)
-    {
-        motion_magnitude_adjustment = num_motion_magnitude_adjustments - 1;
-    }
-
-    LUT = filter_coeff_LUT[motion_magnitude_adjustment];
-
-    if (!filter_coeff_LUT_initialized[motion_magnitude_adjustment])
-    {
-        int absdiff;
-
-        for (absdiff = 0; absdiff < 256; ++absdiff)
-        {
-            unsigned int filter_coefficient;
-            filter_coefficient = (255 << 8) / (256 + ((absdiff * 330) >> 3));
-            filter_coefficient += filter_coefficient /
-                                  (3 + motion_magnitude_adjustment);
-
-            if (filter_coefficient > 255)
-            {
-                filter_coefficient = 255;
-            }
-
-            LUT[absdiff].as_short[0] = filter_coefficient ;
-            LUT[absdiff].as_short[1] = 256 - filter_coefficient;
-        }
-
-        filter_coeff_LUT_initialized[motion_magnitude_adjustment] = 1;
-    }
-
-    return LUT;
-}
-
-
 
 int vp8_denoiser_filter_c(YV12_BUFFER_CONFIG *mc_running_avg,
-                          YV12_BUFFER_CONFIG *running_avg,
-                          MACROBLOCK *signal,
-                          unsigned int motion_magnitude,
-                          int y_offset,
+                          YV12_BUFFER_CONFIG *running_avg, MACROBLOCK *signal,
+                          unsigned int motion_magnitude, int y_offset,
                           int uv_offset)
 {
-    unsigned char filtered_buf[16*16];
-    unsigned char *filtered = filtered_buf;
     unsigned char *sig = signal->thismb;
     int sig_stride = 16;
     unsigned char *mc_running_avg_y = mc_running_avg->y_buffer + y_offset;
     int mc_avg_y_stride = mc_running_avg->y_stride;
     unsigned char *running_avg_y = running_avg->y_buffer + y_offset;
     int avg_y_stride = running_avg->y_stride;
-    const union coeff_pair *LUT = vp8_get_filter_coeff_LUT(motion_magnitude);
-    int r, c;
+    int r, c, i;
     int sum_diff = 0;
+    int adj_val[3] = {3, 4, 6};
 
-    for (r = 0; r < 16; ++r)
+    /* If motion_magnitude is small, making the denoiser more aggressive by
+     * increasing the adjustment for each level. */
+    if (motion_magnitude <= MOTION_MAGNITUDE_THRESHOLD)
     {
-        /* Calculate absolute differences */
-        unsigned char abs_diff[16];
-
-        union coeff_pair filter_coefficient[16];
-
-        for (c = 0; c < 16; ++c)
-        {
-            int absdiff = sig[c] - mc_running_avg_y[c];
-            absdiff = absdiff > 0 ? absdiff : -absdiff;
-            abs_diff[c] = absdiff;
-        }
-
-        /* Use LUT to get filter coefficients (two 16b value; f and 256-f) */
-        for (c = 0; c < 16; ++c)
-        {
-            filter_coefficient[c] = LUT[abs_diff[c]];
-        }
+        for (i = 0; i < 3; i++)
+            adj_val[i] += 1;
+    }
 
-        /* Filtering... */
+    for (r = 0; r < 16; ++r)
+    {
         for (c = 0; c < 16; ++c)
         {
-            const uint16_t state = (uint16_t)(mc_running_avg_y[c]);
-            const uint16_t sample = (uint16_t)(sig[c]);
-
-            running_avg_y[c] = (filter_coefficient[c].as_short[0] * state +
-                    filter_coefficient[c].as_short[1] * sample + 128) >> 8;
-        }
+            int diff = 0;
+            int adjustment = 0;
+            int absdiff = 0;
 
-        /* Depending on the magnitude of the difference between the signal and
-         * filtered version, either replace the signal by the filtered one or
-         * update the filter state with the signal when the change in a pixel
-         * isn't classified as noise.
-         */
-        for (c = 0; c < 16; ++c)
-        {
-            const int diff = sig[c] - running_avg_y[c];
-            sum_diff += diff;
+            diff = mc_running_avg_y[c] - sig[c];
+            absdiff = abs(diff);
 
-            if (diff * diff < NOISE_DIFF2_THRESHOLD)
+            /* When |diff| < 4, use pixel value from last denoised raw. */
+            if (absdiff <= 3)
             {
-                filtered[c] = running_avg_y[c];
+                running_avg_y[c] = mc_running_avg_y[c];
+                sum_diff += diff;
             }
             else
             {
-                filtered[c] = sig[c];
-                running_avg_y[c] = sig[c];
+                if (absdiff >= 4 && absdiff <= 7)
+                    adjustment = adj_val[0];
+                else if (absdiff >= 8 && absdiff <= 15)
+                    adjustment = adj_val[1];
+                else
+                    adjustment = adj_val[2];
+
+                if (diff > 0)
+                {
+                    if ((sig[c] + adjustment) > 255)
+                        running_avg_y[c] = 255;
+                    else
+                        running_avg_y[c] = sig[c] + adjustment;
+
+                    sum_diff += adjustment;
+                }
+                else
+                {
+                    if ((sig[c] - adjustment) < 0)
+                        running_avg_y[c] = 0;
+                    else
+                        running_avg_y[c] = sig[c] - adjustment;
+
+                    sum_diff -= adjustment;
+                }
             }
         }
 
         /* Update pointers for next iteration. */
         sig += sig_stride;
-        filtered += 16;
         mc_running_avg_y += mc_avg_y_stride;
         running_avg_y += avg_y_stride;
     }
+
     if (abs(sum_diff) > SUM_DIFF_THRESHOLD)
-    {
         return COPY_BLOCK;
-    }
-    vp8_copy_mem16x16(filtered_buf, 16, signal->thismb, sig_stride);
+
+    vp8_copy_mem16x16(running_avg->y_buffer + y_offset, avg_y_stride,
+                      signal->thismb, sig_stride);
     return FILTER_BLOCK;
 }
 
-
 int vp8_denoiser_allocate(VP8_DENOISER *denoiser, int width, int height)
 {
     int i;
author	Yunqing Wang <yunqingwang@google.com>	2012-08-10 12:35:55 -0700
committer	Yunqing Wang <yunqingwang@google.com>	2012-08-31 13:48:13 -0700
commit	64075c9b0129efcdba8c85d7519dbe385dbc56c5 (patch)
tree	2228c9057962a580891cb4a2420399e1d06fda7b /vp8/encoder/denoising.c
parent	c533f2a43e102f6cd330834ffdb74575d8382bbf (diff)
download	libvpx-64075c9b0129efcdba8c85d7519dbe385dbc56c5.tar libvpx-64075c9b0129efcdba8c85d7519dbe385dbc56c5.tar.gz libvpx-64075c9b0129efcdba8c85d7519dbe385dbc56c5.tar.bz2 libvpx-64075c9b0129efcdba8c85d7519dbe385dbc56c5.zip