summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRonald S. Bultje <rbultje@google.com>2013-07-10 11:17:19 -0700
committerGerrit Code Review <gerrit@gerrit.golo.chromium.org>2013-07-10 18:27:24 -0700
commitdecead7336246aa9f17469ae784c1ba0dc24b210 (patch)
treee268f83f3ac2f17500b198f1151d0cf95dbee3bc
parentc13e0bcb520d64c0b08222355ad86c8feb637b4e (diff)
downloadlibvpx-decead7336246aa9f17469ae784c1ba0dc24b210.tar
libvpx-decead7336246aa9f17469ae784c1ba0dc24b210.tar.gz
libvpx-decead7336246aa9f17469ae784c1ba0dc24b210.tar.bz2
libvpx-decead7336246aa9f17469ae784c1ba0dc24b210.zip
Replace copy_memNxM functions with a generic copy/avg function.
Change-Id: I3ce849452ed4f08527de9565a9914d5ee36170aa
-rw-r--r--test/convolve_test.cc4
-rw-r--r--vp9/common/vp9_convolve.c88
-rw-r--r--vp9/common/vp9_convolve.h18
-rw-r--r--vp9/common/vp9_reconinter.c87
-rw-r--r--vp9/common/vp9_rtcd_defs.sh29
-rw-r--r--vp9/common/x86/vp9_asm_stubs.c24
-rw-r--r--vp9/common/x86/vp9_copy_sse2.asm152
-rw-r--r--vp9/common/x86/vp9_recon_mmx.asm272
-rw-r--r--vp9/common/x86/vp9_recon_sse2.asm115
-rw-r--r--vp9/vp9_common.mk3
10 files changed, 221 insertions, 571 deletions
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index fd2bd36c1..6e5002fe9 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -22,8 +22,8 @@ extern "C" {
}
namespace {
-typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int filter_x_stride,
const int16_t *filter_y, int filter_y_stride,
int w, int h);
diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c
index 914afa702..6f1e41866 100644
--- a/vp9/common/vp9_convolve.c
+++ b/vp9/common/vp9_convolve.c
@@ -38,8 +38,8 @@
*/
#define ALIGN_FILTERS_256 1
-static void convolve_horiz_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+static void convolve_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x0, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
@@ -80,8 +80,8 @@ static void convolve_horiz_c(const uint8_t *src, int src_stride,
}
}
-static void convolve_avg_horiz_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+static void convolve_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x0, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
@@ -122,8 +122,8 @@ static void convolve_avg_horiz_c(const uint8_t *src, int src_stride,
}
}
-static void convolve_vert_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+static void convolve_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y0, int y_step_q4,
int w, int h, int taps) {
@@ -164,8 +164,8 @@ static void convolve_vert_c(const uint8_t *src, int src_stride,
}
}
-static void convolve_avg_vert_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+static void convolve_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y0, int y_step_q4,
int w, int h, int taps) {
@@ -207,8 +207,8 @@ static void convolve_avg_vert_c(const uint8_t *src, int src_stride,
}
}
-static void convolve_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+static void convolve_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
@@ -237,8 +237,8 @@ static void convolve_c(const uint8_t *src, int src_stride,
w, h, taps);
}
-static void convolve_avg_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+static void convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h, int taps) {
@@ -267,8 +267,8 @@ static void convolve_avg_c(const uint8_t *src, int src_stride,
w, h, taps);
}
-void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -277,8 +277,8 @@ void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride,
w, h, 8);
}
-void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -287,8 +287,8 @@ void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride,
w, h, 8);
}
-void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -297,8 +297,8 @@ void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,
w, h, 8);
}
-void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -307,8 +307,8 @@ void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride,
w, h, 8);
}
-void vp9_convolve8_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -317,8 +317,8 @@ void vp9_convolve8_c(const uint8_t *src, int src_stride,
w, h, 8);
}
-void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -339,33 +339,25 @@ void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,
w, h);
}
-void vp9_convolve_copy(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h) {
- if (w == 16 && h == 16) {
- vp9_copy_mem16x16(src, src_stride, dst, dst_stride);
- } else if (w == 8 && h == 8) {
- vp9_copy_mem8x8(src, src_stride, dst, dst_stride);
- } else if (w == 8 && h == 4) {
- vp9_copy_mem8x4(src, src_stride, dst, dst_stride);
- } else {
- int r;
-
- for (r = h; r > 0; --r) {
- memcpy(dst, src, w);
- src += src_stride;
- dst += dst_stride;
- }
+void vp9_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int r;
+
+ for (r = h; r > 0; --r) {
+ memcpy(dst, src, w);
+ src += src_stride;
+ dst += dst_stride;
}
}
-void vp9_convolve_avg(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const int16_t *filter_x, int filter_x_stride,
- const int16_t *filter_y, int filter_y_stride,
- int w, int h) {
+void vp9_convolve_avg_c(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
int x, y;
for (y = 0; y < h; ++y) {
diff --git a/vp9/common/vp9_convolve.h b/vp9/common/vp9_convolve.h
index 0596080c0..3de81119f 100644
--- a/vp9/common/vp9_convolve.h
+++ b/vp9/common/vp9_convolve.h
@@ -13,26 +13,12 @@
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
-typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h);
-// Not a convolution, a block copy conforming to the convolution prototype
-void vp9_convolve_copy(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-
-// Not a convolution, a block average conforming to the convolution prototype
-void vp9_convolve_avg(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
- const int16_t *filter_x, int x_step_q4,
- const int16_t *filter_y, int y_step_q4,
- int w, int h);
-
struct subpix_fn_table {
const int16_t (*filter_x)[8];
const int16_t (*filter_y)[8];
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 265a19ab1..c29fd147e 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -194,93 +194,6 @@ void vp9_setup_interp_filters(MACROBLOCKD *xd,
assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
}
-void vp9_copy_mem16x16_c(const uint8_t *src,
- int src_stride,
- uint8_t *dst,
- int dst_stride) {
- int r;
-
- for (r = 0; r < 16; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- dst[4] = src[4];
- dst[5] = src[5];
- dst[6] = src[6];
- dst[7] = src[7];
- dst[8] = src[8];
- dst[9] = src[9];
- dst[10] = src[10];
- dst[11] = src[11];
- dst[12] = src[12];
- dst[13] = src[13];
- dst[14] = src[14];
- dst[15] = src[15];
-
-#else
- ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
- ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
- ((uint32_t *)dst)[2] = ((const uint32_t *)src)[2];
- ((uint32_t *)dst)[3] = ((const uint32_t *)src)[3];
-
-#endif
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void vp9_copy_mem8x8_c(const uint8_t *src,
- int src_stride,
- uint8_t *dst,
- int dst_stride) {
- int r;
-
- for (r = 0; r < 8; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- dst[4] = src[4];
- dst[5] = src[5];
- dst[6] = src[6];
- dst[7] = src[7];
-#else
- ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
- ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
-#endif
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void vp9_copy_mem8x4_c(const uint8_t *src,
- int src_stride,
- uint8_t *dst,
- int dst_stride) {
- int r;
-
- for (r = 0; r < 4; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
- dst[0] = src[0];
- dst[1] = src[1];
- dst[2] = src[2];
- dst[3] = src[3];
- dst[4] = src[4];
- dst[5] = src[5];
- dst[6] = src[6];
- dst[7] = src[7];
-#else
- ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
- ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
-#endif
- src += src_stride;
- dst += dst_stride;
- }
-}
-
void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
uint8_t *dst, int dst_stride,
const int_mv *src_mv,
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index d861a7a5e..459336e4d 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -43,17 +43,6 @@ specialize vp9_idct_add_32x32
#
# RECON
#
-prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_copy_mem16x16 mmx sse2 dspr2
-vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
-
-prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_copy_mem8x8 mmx dspr2
-vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
-
-prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_copy_mem8x4 mmx
-
prototype void vp9_d27_predictor_4x4 "uint8_t *ypred_ptr, ptrdiff_t y_stride, uint8_t *yabove_row, uint8_t *yleft_col"
specialize vp9_d27_predictor_4x4
@@ -275,22 +264,28 @@ specialize vp9_blend_b
#
# Sub Pixel Filters
#
-prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve_copy "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve_copy sse2
+
+prototype void vp9_convolve_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve_avg sse2
+
+prototype void vp9_convolve8 "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8 ssse3
-prototype void vp9_convolve8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve8_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_horiz ssse3
-prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve8_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_vert ssse3
-prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve8_avg "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg ssse3
-prototype void vp9_convolve8_avg_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve8_avg_horiz "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg_horiz ssse3
-prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+prototype void vp9_convolve8_avg_vert "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
specialize vp9_convolve8_avg_vert ssse3
#
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index 2b66834a7..98fc4dc83 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -121,8 +121,8 @@ void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr,
unsigned int output_height,
const short *filter);
-void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -159,8 +159,8 @@ void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
}
}
-void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -197,8 +197,8 @@ void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
}
}
-void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -235,8 +235,8 @@ void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride,
}
}
-void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -273,8 +273,8 @@ void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride,
}
}
-void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
@@ -294,8 +294,8 @@ void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
}
}
-void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride,
- uint8_t *dst, int dst_stride,
+void vp9_convolve8_avg_ssse3(const uint8_t *src, ptrdiff_t src_stride,
+ uint8_t *dst, ptrdiff_t dst_stride,
const int16_t *filter_x, int x_step_q4,
const int16_t *filter_y, int y_step_q4,
int w, int h) {
diff --git a/vp9/common/x86/vp9_copy_sse2.asm b/vp9/common/x86/vp9_copy_sse2.asm
new file mode 100644
index 000000000..dd522c698
--- /dev/null
+++ b/vp9/common/x86/vp9_copy_sse2.asm
@@ -0,0 +1,152 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+%macro convolve_fn 1
+INIT_XMM sse2
+cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
+ fx, fxs, fy, fys, w, h
+ mov r4d, dword wm
+ cmp r4d, 4
+ je .w4
+ cmp r4d, 8
+ je .w8
+ cmp r4d, 16
+ je .w16
+ cmp r4d, 32
+ je .w32
+
+ mov r4d, dword hm
+.loop64:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+32]
+ movu m3, [srcq+48]
+ add srcq, src_strideq
+%ifidn %1, avg
+ pavgb m0, [dstq]
+ pavgb m1, [dstq+16]
+ pavgb m2, [dstq+32]
+ pavgb m3, [dstq+48]
+%endif
+ mova [dstq ], m0
+ mova [dstq+16], m1
+ mova [dstq+32], m2
+ mova [dstq+48], m3
+ add dstq, dst_strideq
+ dec r4d
+ jnz .loop64
+ RET
+
+.w32:
+ mov r4d, dword hm
+.loop32:
+ movu m0, [srcq]
+ movu m1, [srcq+16]
+ movu m2, [srcq+src_strideq]
+ movu m3, [srcq+src_strideq+16]
+ lea srcq, [srcq+src_strideq*2]
+%ifidn %1, avg
+ pavgb m0, [dstq]
+ pavgb m1, [dstq +16]
+ pavgb m2, [dstq+dst_strideq]
+ pavgb m3, [dstq+dst_strideq+16]
+%endif
+ mova [dstq ], m0
+ mova [dstq +16], m1
+ mova [dstq+dst_strideq ], m2
+ mova [dstq+dst_strideq+16], m3
+ lea dstq, [dstq+dst_strideq*2]
+ sub r4d, 2
+ jnz .loop32
+ RET
+
+.w16:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop16:
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq]
+ movu m2, [srcq+src_strideq*2]
+ movu m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ pavgb m0, [dstq]
+ pavgb m1, [dstq+dst_strideq]
+ pavgb m2, [dstq+dst_strideq*2]
+ pavgb m3, [dstq+r6q]
+%endif
+ mova [dstq ], m0
+ mova [dstq+dst_strideq ], m1
+ mova [dstq+dst_strideq*2], m2
+ mova [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop16
+ RET
+
+INIT_MMX sse
+.w8:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop8:
+ movu m0, [srcq]
+ movu m1, [srcq+src_strideq]
+ movu m2, [srcq+src_strideq*2]
+ movu m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ pavgb m0, [dstq]
+ pavgb m1, [dstq+dst_strideq]
+ pavgb m2, [dstq+dst_strideq*2]
+ pavgb m3, [dstq+r6q]
+%endif
+ mova [dstq ], m0
+ mova [dstq+dst_strideq ], m1
+ mova [dstq+dst_strideq*2], m2
+ mova [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop8
+ RET
+
+.w4:
+ mov r4d, dword hm
+ lea r5q, [src_strideq*3]
+ lea r6q, [dst_strideq*3]
+.loop4:
+ movh m0, [srcq]
+ movh m1, [srcq+src_strideq]
+ movh m2, [srcq+src_strideq*2]
+ movh m3, [srcq+r5q]
+ lea srcq, [srcq+src_strideq*4]
+%ifidn %1, avg
+ pavgb m0, [dstq]
+ pavgb m1, [dstq+dst_strideq]
+ pavgb m2, [dstq+dst_strideq*2]
+ pavgb m3, [dstq+r6q]
+%endif
+ movh [dstq ], m0
+ movh [dstq+dst_strideq ], m1
+ movh [dstq+dst_strideq*2], m2
+ movh [dstq+r6q ], m3
+ lea dstq, [dstq+dst_strideq*4]
+ sub r4d, 4
+ jnz .loop4
+ RET
+%endmacro
+
+convolve_fn copy
+convolve_fn avg
diff --git a/vp9/common/x86/vp9_recon_mmx.asm b/vp9/common/x86/vp9_recon_mmx.asm
deleted file mode 100644
index 6fbbe48cb..000000000
--- a/vp9/common/x86/vp9_recon_mmx.asm
+++ /dev/null
@@ -1,272 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-;void copy_mem8x8_mmx(
-; unsigned char *src,
-; int src_stride,
-; unsigned char *dst,
-; int dst_stride
-; )
-global sym(vp9_copy_mem8x8_mmx) PRIVATE
-sym(vp9_copy_mem8x8_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src;
- movq mm0, [rsi]
-
- movsxd rax, dword ptr arg(1) ;src_stride;
- mov rdi, arg(2) ;dst;
-
- movq mm1, [rsi+rax]
- movq mm2, [rsi+rax*2]
-
- movsxd rcx, dword ptr arg(3) ;dst_stride
- lea rsi, [rsi+rax*2]
-
- movq [rdi], mm0
- add rsi, rax
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx*2], mm2
-
-
- lea rdi, [rdi+rcx*2]
- movq mm3, [rsi]
-
- add rdi, rcx
- movq mm4, [rsi+rax]
-
- movq mm5, [rsi+rax*2]
- movq [rdi], mm3
-
- lea rsi, [rsi+rax*2]
- movq [rdi+rcx], mm4
-
- movq [rdi+rcx*2], mm5
- lea rdi, [rdi+rcx*2]
-
- movq mm0, [rsi+rax]
- movq mm1, [rsi+rax*2]
-
- movq [rdi+rcx], mm0
- movq [rdi+rcx*2],mm1
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void copy_mem8x4_mmx(
-; unsigned char *src,
-; int src_stride,
-; unsigned char *dst,
-; int dst_stride
-; )
-global sym(vp9_copy_mem8x4_mmx) PRIVATE
-sym(vp9_copy_mem8x4_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src;
- movq mm0, [rsi]
-
- movsxd rax, dword ptr arg(1) ;src_stride;
- mov rdi, arg(2) ;dst;
-
- movq mm1, [rsi+rax]
- movq mm2, [rsi+rax*2]
-
- movsxd rcx, dword ptr arg(3) ;dst_stride
- lea rsi, [rsi+rax*2]
-
- movq [rdi], mm0
- movq [rdi+rcx], mm1
-
- movq [rdi+rcx*2], mm2
- lea rdi, [rdi+rcx*2]
-
- movq mm3, [rsi+rax]
- movq [rdi+rcx], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void copy_mem16x16_mmx(
-; unsigned char *src,
-; int src_stride,
-; unsigned char *dst,
-; int dst_stride
-; )
-global sym(vp9_copy_mem16x16_mmx) PRIVATE
-sym(vp9_copy_mem16x16_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src;
- movsxd rax, dword ptr arg(1) ;src_stride;
-
- mov rdi, arg(2) ;dst;
- movsxd rcx, dword ptr arg(3) ;dst_stride
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq mm1, [rsi+rax]
- movq mm4, [rsi+rax+8]
-
- movq mm2, [rsi+rax*2]
- movq mm5, [rsi+rax*2+8]
-
- lea rsi, [rsi+rax*2]
- add rsi, rax
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- movq [rdi+rcx], mm1
- movq [rdi+rcx+8], mm4
-
- movq [rdi+rcx*2], mm2
- movq [rdi+rcx*2+8], mm5
-
- lea rdi, [rdi+rcx*2]
- add rdi, rcx
-
- movq mm0, [rsi]
- movq mm3, [rsi+8];
-
- movq [rdi], mm0
- movq [rdi+8], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/vp9/common/x86/vp9_recon_sse2.asm b/vp9/common/x86/vp9_recon_sse2.asm
deleted file mode 100644
index f7cc611f4..000000000
--- a/vp9/common/x86/vp9_recon_sse2.asm
+++ /dev/null
@@ -1,115 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-;void copy_mem16x16_sse2(
-; unsigned char *src,
-; int src_stride,
-; unsigned char *dst,
-; int dst_stride
-; )
-global sym(vp9_copy_mem16x16_sse2) PRIVATE
-sym(vp9_copy_mem16x16_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src;
- movdqu xmm0, [rsi]
-
- movsxd rax, dword ptr arg(1) ;src_stride;
- mov rdi, arg(2) ;dst;
-
- movdqu xmm1, [rsi+rax]
- movdqu xmm2, [rsi+rax*2]
-
- movsxd rcx, dword ptr arg(3) ;dst_stride
- lea rsi, [rsi+rax*2]
-
- movdqa [rdi], xmm0
- add rsi, rax
-
- movdqa [rdi+rcx], xmm1
- movdqa [rdi+rcx*2],xmm2
-
- lea rdi, [rdi+rcx*2]
- movdqu xmm3, [rsi]
-
- add rdi, rcx
- movdqu xmm4, [rsi+rax]
-
- movdqu xmm5, [rsi+rax*2]
- lea rsi, [rsi+rax*2]
-
- movdqa [rdi], xmm3
- add rsi, rax
-
- movdqa [rdi+rcx], xmm4
- movdqa [rdi+rcx*2],xmm5
-
- lea rdi, [rdi+rcx*2]
- movdqu xmm0, [rsi]
-
- add rdi, rcx
- movdqu xmm1, [rsi+rax]
-
- movdqu xmm2, [rsi+rax*2]
- lea rsi, [rsi+rax*2]
-
- movdqa [rdi], xmm0
- add rsi, rax
-
- movdqa [rdi+rcx], xmm1
-
- movdqa [rdi+rcx*2], xmm2
- movdqu xmm3, [rsi]
-
- movdqu xmm4, [rsi+rax]
- lea rdi, [rdi+rcx*2]
-
- add rdi, rcx
- movdqu xmm5, [rsi+rax*2]
-
- lea rsi, [rsi+rax*2]
- movdqa [rdi], xmm3
-
- add rsi, rax
- movdqa [rdi+rcx], xmm4
-
- movdqa [rdi+rcx*2],xmm5
- movdqu xmm0, [rsi]
-
- lea rdi, [rdi+rcx*2]
- movdqu xmm1, [rsi+rax]
-
- add rdi, rcx
- movdqu xmm2, [rsi+rax*2]
-
- lea rsi, [rsi+rax*2]
- movdqa [rdi], xmm0
-
- movdqa [rdi+rcx], xmm1
- movdqa [rdi+rcx*2],xmm2
-
- movdqu xmm3, [rsi+rax]
- lea rdi, [rdi+rcx*2]
-
- movdqa [rdi+rcx], xmm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 1079e2048..ee744d541 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -75,10 +75,9 @@ VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_asm_stubs.c
VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_loopfilter_intrin_sse2.c
VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.h
VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c
-VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm
VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm
+VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_copy_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm
-VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_intrapred_sse2.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_intrapred_ssse3.asm
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_subpixel_8t_ssse3.asm