convolve: support larger blocks, fix asm saturation bug

Updates the common convolution code to support blocks larger than 16x16, and rectangular blocks. This uncovered a bug in the SSSE3 filtering routines due to the order of application of saturation. This commit fixes that bug, adjusts the unit test to bias its random values towards the extremes, and adds a test to ensure that all filters conform to the expected pairwise addition structure. Change-Id: I81f69668b1de0de5a8ed43f0643845641525c8f0
author: John Koleszar <jkoleszar@google.com> 2013-04-18 13:05:38 -0700
committer: John Koleszar <jkoleszar@google.com> 2013-04-18 13:57:59 -0700
commit: a9ebbcc33836d8666f98de7a6d416a6dcd0cae9a (patch)
tree: 466865fec91567c8c72481c194ad9b1c6b8d5db8 /vp9/common/x86
parent: ff3f93639c147fa9b8dcf6d014b3c80bf3b7c8ef (diff)
download: libvpx-a9ebbcc33836d8666f98de7a6d416a6dcd0cae9a.tar
libvpx-a9ebbcc33836d8666f98de7a6d416a6dcd0cae9a.tar.gz
libvpx-a9ebbcc33836d8666f98de7a6d416a6dcd0cae9a.tar.bz2
libvpx-a9ebbcc33836d8666f98de7a6d416a6dcd0cae9a.zip
2 files changed, 12 insertions, 16 deletions
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index 6d3bb021a..310f8ed24 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -278,11 +278,9 @@ void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
                          const int16_t *filter_x, int x_step_q4,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*71);
 
-  // check w/h due to fixed size fdata2 array
-  assert(w <= 16);
-  assert(h <= 16);
+  assert(h <= 64);
 
   if (x_step_q4 == 16 && y_step_q4 == 16 &&
       filter_x[3] != 128 && filter_y[3] != 128) {
@@ -324,11 +322,9 @@ void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride,
                          const int16_t *filter_x, int x_step_q4,
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
-  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*71);
 
-  // check w/h due to fixed size fdata2 array
-  assert(w <= 16);
-  assert(h <= 16);
+  assert(h <= 64);
 
   if (x_step_q4 == 16 && y_step_q4 == 16 &&
       filter_x[3] != 128 && filter_y[3] != 128) {
diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
index 32f00e289..bbf9888ca 100644
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -81,10 +81,10 @@
     pmaddubsw   xmm4, k4k5
     pmaddubsw   xmm6, k6k7
 
+    paddsw      xmm0, xmm6
     paddsw      xmm0, xmm2
-    paddsw      xmm0, krd
-    paddsw      xmm4, xmm6
     paddsw      xmm0, xmm4
+    paddsw      xmm0, krd
 
     psraw       xmm0, 7
     packuswb    xmm0, xmm0
@@ -165,10 +165,10 @@
     pmaddubsw   xmm4, k4k5
     pmaddubsw   xmm6, k6k7
 
+    paddsw      xmm0, xmm6
     paddsw      xmm0, xmm2
-    paddsw      xmm0, krd
-    paddsw      xmm4, xmm6
     paddsw      xmm0, xmm4
+    paddsw      xmm0, krd
 
     psraw       xmm0, 7
     packuswb    xmm0, xmm0
@@ -250,10 +250,10 @@
     pmaddubsw   xmm4, k4k5
     pmaddubsw   xmm6, k6k7
 
+    paddsw      xmm0, xmm6
     paddsw      xmm0, xmm2
-    paddsw      xmm0, krd
-    paddsw      xmm4, xmm6
     paddsw      xmm0, xmm4
+    paddsw      xmm0, krd
 
     psraw       xmm0, 7
     packuswb    xmm0, xmm0
@@ -285,10 +285,10 @@
     pmaddubsw   xmm4, k4k5
     pmaddubsw   xmm6, k6k7
 
+    paddsw      xmm0, xmm6
     paddsw      xmm0, xmm2
-    paddsw      xmm4, xmm6
-    paddsw      xmm0, krd
     paddsw      xmm0, xmm4
+    paddsw      xmm0, krd
 
     psraw       xmm0, 7
     packuswb    xmm0, xmm0
author	John Koleszar <jkoleszar@google.com>	2013-04-18 13:05:38 -0700
committer	John Koleszar <jkoleszar@google.com>	2013-04-18 13:57:59 -0700
commit	a9ebbcc33836d8666f98de7a6d416a6dcd0cae9a (patch)
tree	466865fec91567c8c72481c194ad9b1c6b8d5db8 /vp9/common/x86
parent	ff3f93639c147fa9b8dcf6d014b3c80bf3b7c8ef (diff)
download	libvpx-a9ebbcc33836d8666f98de7a6d416a6dcd0cae9a.tar libvpx-a9ebbcc33836d8666f98de7a6d416a6dcd0cae9a.tar.gz libvpx-a9ebbcc33836d8666f98de7a6d416a6dcd0cae9a.tar.bz2 libvpx-a9ebbcc33836d8666f98de7a6d416a6dcd0cae9a.zip