2 files changed, 107 insertions, 90 deletions
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index b1510c648..9bdce69ff 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -456,45 +456,86 @@ DECLARE_ALIGNED(256, const int16_t, kChangeFilters[16][8]) = {
     { 128}
 };
 
+/* This test exercises the horizontal and vertical filter functions. */
 TEST_P(ConvolveTest, ChangeFilterWorks) {
   uint8_t* const in = input();
   uint8_t* const out = output();
+
+  /* Assume that the first input sample is at the 8/16th position. */
+  const int kInitialSubPelOffset = 8;
+
+  /* Filters are 8-tap, so the first filter tap will be applied to the pixel
+   * at position -3 with respect to the current filtering position. Since
+   * kInitialSubPelOffset is set to 8, we first select sub-pixel filter 8,
+   * which is non-zero only in the last tap. So, applying the filter at the
+   * current input position will result in an output equal to the pixel at
+   * offset +4 (-3 + 7) with respect to the current filtering position.
+   */
   const int kPixelSelected = 4;
 
+  /* Assume that each output pixel requires us to step on by 17/16th pixels in
+   * the input.
+   */
+  const int kInputPixelStep = 17;
+
+  /* The filters are setup in such a way that the expected output produces
+   * sets of 8 identical output samples. As the filter position moves to the
+   * next 1/16th pixel position the only active (=128) filter tap moves one
+   * position to the left, resulting in the same input pixel being replicated
+   * in to the output for 8 consecutive samples. After each set of 8 positions
+   * the filters select a different input pixel. kFilterPeriodAdjust below
+   * computes which input pixel is written to the output for a specified
+   * x or y position.
+   */
+
+  /* Test the horizontal filter. */
   REGISTER_STATE_CHECK(UUT_->h8_(in, kInputStride, out, kOutputStride,
-                                 kChangeFilters[8], 17, kChangeFilters[4], 16,
-                                 Width(), Height()));
+                                 kChangeFilters[kInitialSubPelOffset],
+                                 kInputPixelStep, NULL, 0, Width(), Height()));
 
   for (int x = 0; x < Width(); ++x) {
-    const int kQ4StepAdjust = x >> 4;
     const int kFilterPeriodAdjust = (x >> 3) << 3;
-    const int ref_x = kQ4StepAdjust + kFilterPeriodAdjust + kPixelSelected;
-    ASSERT_EQ(in[ref_x], out[x]) << "x == " << x;
+    const int ref_x =
+        kPixelSelected + ((kInitialSubPelOffset
+            + kFilterPeriodAdjust * kInputPixelStep)
+                          >> SUBPEL_BITS);
+    ASSERT_EQ(in[ref_x], out[x]) << "x == " << x << "width = " << Width();
   }
 
+  /* Test the vertical filter. */
   REGISTER_STATE_CHECK(UUT_->v8_(in, kInputStride, out, kOutputStride,
-                                 kChangeFilters[4], 16, kChangeFilters[8], 17,
-                                 Width(), Height()));
+                                 NULL, 0, kChangeFilters[kInitialSubPelOffset],
+                                 kInputPixelStep, Width(), Height()));
 
   for (int y = 0; y < Height(); ++y) {
-    const int kQ4StepAdjust = y >> 4;
     const int kFilterPeriodAdjust = (y >> 3) << 3;
-    const int ref_y = kQ4StepAdjust + kFilterPeriodAdjust + kPixelSelected;
+    const int ref_y =
+        kPixelSelected + ((kInitialSubPelOffset
+            + kFilterPeriodAdjust * kInputPixelStep)
+                          >> SUBPEL_BITS);
     ASSERT_EQ(in[ref_y * kInputStride], out[y * kInputStride]) << "y == " << y;
   }
 
+  /* Test the horizontal and vertical filters in combination. */
   REGISTER_STATE_CHECK(UUT_->hv8_(in, kInputStride, out, kOutputStride,
-                                  kChangeFilters[8], 17, kChangeFilters[8], 17,
+                                  kChangeFilters[kInitialSubPelOffset],
+                                  kInputPixelStep,
+                                  kChangeFilters[kInitialSubPelOffset],
+                                  kInputPixelStep,
                                   Width(), Height()));
 
   for (int y = 0; y < Height(); ++y) {
-    const int kQ4StepAdjustY = y >> 4;
     const int kFilterPeriodAdjustY = (y >> 3) << 3;
-    const int ref_y = kQ4StepAdjustY + kFilterPeriodAdjustY + kPixelSelected;
+    const int ref_y =
+        kPixelSelected + ((kInitialSubPelOffset
+            + kFilterPeriodAdjustY * kInputPixelStep)
+                          >> SUBPEL_BITS);
     for (int x = 0; x < Width(); ++x) {
-      const int kQ4StepAdjustX = x >> 4;
       const int kFilterPeriodAdjustX = (x >> 3) << 3;
-      const int ref_x = kQ4StepAdjustX + kFilterPeriodAdjustX + kPixelSelected;
+      const int ref_x =
+          kPixelSelected + ((kInitialSubPelOffset
+              + kFilterPeriodAdjustX * kInputPixelStep)
+                            >> SUBPEL_BITS);
 
       ASSERT_EQ(in[ref_y * kInputStride + ref_x], out[y * kOutputStride + x])
           << "x == " << x << ", y == " << y;
@@ -502,7 +543,6 @@ TEST_P(ConvolveTest, ChangeFilterWorks) {
   }
 }
 
-
 using std::tr1::make_tuple;
 
 const ConvolveFunctions convolve8_c(
diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c
index 4e79bbc1b..1d9684992 100644
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c
@@ -14,63 +14,45 @@
 #include "./vpx_config.h"
 #include "./vp9_rtcd.h"
 #include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_filter.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
 
-/* Assume a bank of 16 filters to choose from. There are two implementations
- * for filter wrapping behavior, since we want to be able to pick which filter
- * to start with. We could either:
- *
- * 1) make filter_ a pointer to the base of the filter array, and then add an
- *    additional offset parameter, to choose the starting filter.
- * 2) use a pointer to 2 periods worth of filters, so that even if the original
- *    phase offset is at 15/16, we'll have valid data to read. The filter
- *    tables become [32][8], and the second half is duplicated.
- * 3) fix the alignment of the filter tables, so that we know the 0/16 is
- *    always 256 byte aligned.
- *
- * Implementations 2 and 3 are likely preferable, as they avoid an extra 2
- * parameters, and switching between them is trivial, with the
- * ALIGN_FILTERS_256 macro, below.
- */
- #define ALIGN_FILTERS_256 1
-
 static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x0, int x_step_q4,
                              const int16_t *filter_y, int y_step_q4,
                              int w, int h, int taps) {
   int x, y, k;
-  const int16_t *filter_x_base = filter_x0;
 
-#if ALIGN_FILTERS_256
-  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
-#endif
+  /* NOTE: This assumes that the filter table is 256-byte aligned. */
+  /* TODO(agrange) Modify to make independent of table alignment. */
+  const int16_t *const filter_x_base =
+      (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
 
   /* Adjust base pointer address for this source line */
   src -= taps / 2 - 1;
 
   for (y = 0; y < h; ++y) {
-    /* Pointer to filter to use */
-    const int16_t *filter_x = filter_x0;
-
     /* Initial phase offset */
-    int x0_q4 = (filter_x - filter_x_base) / taps;
-    int x_q4 = x0_q4;
+    int x_q4 = (filter_x0 - filter_x_base) / taps;
 
     for (x = 0; x < w; ++x) {
       /* Per-pixel src offset */
-      int src_x = (x_q4 - x0_q4) >> 4;
+      const int src_x = x_q4 >> SUBPEL_BITS;
       int sum = 0;
 
+      /* Pointer to filter to use */
+      const int16_t *const filter_x = filter_x_base +
+          (x_q4 & SUBPEL_MASK) * taps;
+
       for (k = 0; k < taps; ++k)
         sum += src[src_x + k] * filter_x[k];
 
       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, VP9_FILTER_BITS));
 
-      /* Adjust source and filter to use for the next pixel */
+      /* Move to the next source pixel */
       x_q4 += x_step_q4;
-      filter_x = filter_x_base + (x_q4 & 0xf) * taps;
     }
     src += src_stride;
     dst += dst_stride;
@@ -83,37 +65,36 @@ static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
                                  const int16_t *filter_y, int y_step_q4,
                                  int w, int h, int taps) {
   int x, y, k;
-  const int16_t *filter_x_base = filter_x0;
 
-#if ALIGN_FILTERS_256
-  filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
-#endif
+  /* NOTE: This assumes that the filter table is 256-byte aligned. */
+  /* TODO(agrange) Modify to make independent of table alignment. */
+  const int16_t *const filter_x_base =
+      (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
 
   /* Adjust base pointer address for this source line */
   src -= taps / 2 - 1;
 
   for (y = 0; y < h; ++y) {
-    /* Pointer to filter to use */
-    const int16_t *filter_x = filter_x0;
-
     /* Initial phase offset */
-    int x0_q4 = (filter_x - filter_x_base) / taps;
-    int x_q4 = x0_q4;
+    int x_q4 = (filter_x0 - filter_x_base) / taps;
 
     for (x = 0; x < w; ++x) {
       /* Per-pixel src offset */
-      int src_x = (x_q4 - x0_q4) >> 4;
+      const int src_x = x_q4 >> SUBPEL_BITS;
       int sum = 0;
 
+      /* Pointer to filter to use */
+      const int16_t *const filter_x = filter_x_base +
+          (x_q4 & SUBPEL_MASK) * taps;
+
       for (k = 0; k < taps; ++k)
         sum += src[src_x + k] * filter_x[k];
 
       dst[x] = ROUND_POWER_OF_TWO(dst[x] +
                    clip_pixel(ROUND_POWER_OF_TWO(sum, VP9_FILTER_BITS)), 1);
 
-      /* Adjust source and filter to use for the next pixel */
+      /* Move to the next source pixel */
       x_q4 += x_step_q4;
-      filter_x = filter_x_base + (x_q4 & 0xf) * taps;
     }
     src += src_stride;
     dst += dst_stride;
@@ -127,36 +108,35 @@ static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                             int w, int h, int taps) {
   int x, y, k;
 
-  const int16_t *filter_y_base = filter_y0;
-
-#if ALIGN_FILTERS_256
-  filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
-#endif
+  /* NOTE: This assumes that the filter table is 256-byte aligned. */
+  /* TODO(agrange) Modify to make independent of table alignment. */
+  const int16_t *const filter_y_base =
+      (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
 
   /* Adjust base pointer address for this source column */
   src -= src_stride * (taps / 2 - 1);
-  for (x = 0; x < w; ++x) {
-    /* Pointer to filter to use */
-    const int16_t *filter_y = filter_y0;
 
+  for (x = 0; x < w; ++x) {
     /* Initial phase offset */
-    int y0_q4 = (filter_y - filter_y_base) / taps;
-    int y_q4 = y0_q4;
+    int y_q4 = (filter_y0 - filter_y_base) / taps;
 
     for (y = 0; y < h; ++y) {
       /* Per-pixel src offset */
-      int src_y = (y_q4 - y0_q4) >> 4;
+      const int src_y = y_q4 >> SUBPEL_BITS;
       int sum = 0;
 
+      /* Pointer to filter to use */
+      const int16_t *const filter_y = filter_y_base +
+          (y_q4 & SUBPEL_MASK) * taps;
+
       for (k = 0; k < taps; ++k)
         sum += src[(src_y + k) * src_stride] * filter_y[k];
 
       dst[y * dst_stride] =
           clip_pixel(ROUND_POWER_OF_TWO(sum, VP9_FILTER_BITS));
 
-      /* Adjust source and filter to use for the next pixel */
+      /* Move to the next source pixel */
       y_q4 += y_step_q4;
-      filter_y = filter_y_base + (y_q4 & 0xf) * taps;
     }
     ++src;
     ++dst;
@@ -170,36 +150,35 @@ static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
                                 int w, int h, int taps) {
   int x, y, k;
 
-  const int16_t *filter_y_base = filter_y0;
-
-#if ALIGN_FILTERS_256
-  filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
-#endif
+  /* NOTE: This assumes that the filter table is 256-byte aligned. */
+  /* TODO(agrange) Modify to make independent of table alignment. */
+  const int16_t *const filter_y_base =
+      (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
 
   /* Adjust base pointer address for this source column */
   src -= src_stride * (taps / 2 - 1);
-  for (x = 0; x < w; ++x) {
-    /* Pointer to filter to use */
-    const int16_t *filter_y = filter_y0;
 
+  for (x = 0; x < w; ++x) {
     /* Initial phase offset */
-    int y0_q4 = (filter_y - filter_y_base) / taps;
-    int y_q4 = y0_q4;
+    int y_q4 = (filter_y0 - filter_y_base) / taps;
 
     for (y = 0; y < h; ++y) {
       /* Per-pixel src offset */
-      int src_y = (y_q4 - y0_q4) >> 4;
+      const int src_y = y_q4 >> SUBPEL_BITS;
       int sum = 0;
 
+      /* Pointer to filter to use */
+      const int16_t *const filter_y = filter_y_base +
+          (y_q4 & SUBPEL_MASK) * taps;
+
       for (k = 0; k < taps; ++k)
         sum += src[(src_y + k) * src_stride] * filter_y[k];
 
       dst[y * dst_stride] = ROUND_POWER_OF_TWO(dst[y * dst_stride] +
            clip_pixel(ROUND_POWER_OF_TWO(sum, VP9_FILTER_BITS)), 1);
 
-      /* Adjust source and filter to use for the next pixel */
+      /* Move to the next source pixel */
       y_q4 += y_step_q4;
-      filter_y = filter_y_base + (y_q4 & 0xf) * taps;
     }
     ++src;
     ++dst;
@@ -227,13 +206,11 @@ static void convolve_c(const uint8_t *src, ptrdiff_t src_stride,
   if (intermediate_height < h)
     intermediate_height = h;
 
-  convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,
-                   temp, 64,
-                   filter_x, x_step_q4, filter_y, y_step_q4,
-                   w, intermediate_height, taps);
-  convolve_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride,
-                  filter_x, x_step_q4, filter_y, y_step_q4,
-                  w, h, taps);
+  convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride, temp, 64,
+                   filter_x, x_step_q4, filter_y, y_step_q4, w,
+                   intermediate_height, taps);
+  convolve_vert_c(temp + 64 * (taps / 2 - 1), 64, dst, dst_stride, filter_x,
+                  x_step_q4, filter_y, y_step_q4, w, h, taps);
 }
 
 void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,