summaryrefslogtreecommitdiff
path: root/vp9/common
diff options
context:
space:
mode:
authorJohn Koleszar <jkoleszar@google.com>2013-02-08 13:19:47 -0800
committerGerrit Code Review <gerrit@gerrit.golo.chromium.org>2013-02-08 13:19:47 -0800
commit3de8ee6ba1db38f14643c10793f57ed5d0b7122f (patch)
tree2e924af27c81965d6b08884f5d336aabaea11d6c /vp9/common
parente6ad9ab02c9914182fec83b9170c5bc86baf4102 (diff)
parent29d47ac80eb934368dee41d2a7e6265aa8ccee9c (diff)
downloadlibvpx-3de8ee6ba1db38f14643c10793f57ed5d0b7122f.tar
libvpx-3de8ee6ba1db38f14643c10793f57ed5d0b7122f.tar.gz
libvpx-3de8ee6ba1db38f14643c10793f57ed5d0b7122f.tar.bz2
libvpx-3de8ee6ba1db38f14643c10793f57ed5d0b7122f.zip
Merge changes Ife0d8147,I7d469716,Ic9a5615f into experimental
* changes: Restore SSSE3 subpixel filters in new convolve framework Convert subpixel filters to use convolve framework Add 8-tap generic convolver
Diffstat (limited to 'vp9/common')
-rw-r--r--vp9/common/generic/vp9_systemdependent.c2
-rw-r--r--vp9/common/ppc/vp9_systemdependent.c1
-rw-r--r--vp9/common/vp9_blockd.h11
-rw-r--r--vp9/common/vp9_convolve.c345
-rw-r--r--vp9/common/vp9_convolve.h43
-rw-r--r--vp9/common/vp9_filter.c1119
-rw-r--r--vp9/common/vp9_filter.h11
-rw-r--r--vp9/common/vp9_findnearmv.c8
-rw-r--r--vp9/common/vp9_reconinter.c324
-rw-r--r--vp9/common/vp9_reconinter.h6
-rw-r--r--vp9/common/vp9_rtcd_defs.sh142
-rw-r--r--vp9/common/vp9_subpixel.h20
-rw-r--r--vp9/common/x86/vp9_asm_stubs.c617
-rw-r--r--vp9/common/x86/vp9_filter_sse2.c290
-rw-r--r--vp9/common/x86/vp9_filter_sse4.c362
-rw-r--r--vp9/common/x86/vp9_subpixel_mmx.asm268
-rw-r--r--vp9/common/x86/vp9_subpixel_sse2.asm1372
-rw-r--r--vp9/common/x86/vp9_subpixel_ssse3.asm1515
-rw-r--r--vp9/common/x86/vp9_subpixel_x86.h109
19 files changed, 671 insertions, 5894 deletions
diff --git a/vp9/common/generic/vp9_systemdependent.c b/vp9/common/generic/vp9_systemdependent.c
index b02f3f083..79092cd0e 100644
--- a/vp9/common/generic/vp9_systemdependent.c
+++ b/vp9/common/generic/vp9_systemdependent.c
@@ -11,8 +11,6 @@
#include "./vpx_config.h"
#include "vp9_rtcd.h"
-#include "vp9/common/vp9_subpixel.h"
-#include "vp9/common/vp9_loopfilter.h"
#include "vp9/common/vp9_onyxc_int.h"
void vp9_machine_specific_config(VP9_COMMON *ctx) {
diff --git a/vp9/common/ppc/vp9_systemdependent.c b/vp9/common/ppc/vp9_systemdependent.c
index 106a2b763..02035191f 100644
--- a/vp9/common/ppc/vp9_systemdependent.c
+++ b/vp9/common/ppc/vp9_systemdependent.c
@@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vp9/common/vp9_subpixel.h"
#include "vp9/common/vp9_loopfilter.h"
#include "recon.h"
#include "vp9/common/vp9_onyxc_int.h"
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index c6702ae31..28fafbdc9 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -16,9 +16,9 @@ void vpx_log(const char *format, ...);
#include "./vpx_config.h"
#include "vpx_scale/yv12config.h"
+#include "vp9/common/vp9_convolve.h"
#include "vp9/common/vp9_mv.h"
#include "vp9/common/vp9_treecoder.h"
-#include "vp9/common/vp9_subpixel.h"
#include "vpx_ports/mem.h"
#include "vp9/common/vp9_common.h"
@@ -394,15 +394,8 @@ typedef struct macroblockd {
void (*inv_walsh4x4_1)(int16_t *in, int16_t *out);
void (*inv_walsh4x4_lossless)(int16_t *in, int16_t *out);
+ struct subpix_fn_table subpix;
- vp9_subpix_fn_t subpixel_predict4x4;
- vp9_subpix_fn_t subpixel_predict8x4;
- vp9_subpix_fn_t subpixel_predict8x8;
- vp9_subpix_fn_t subpixel_predict16x16;
- vp9_subpix_fn_t subpixel_predict_avg4x4;
- vp9_subpix_fn_t subpixel_predict_avg8x4;
- vp9_subpix_fn_t subpixel_predict_avg8x8;
- vp9_subpix_fn_t subpixel_predict_avg16x16;
int allow_high_precision_mv;
int corrupted;
diff --git a/vp9/common/vp9_convolve.c b/vp9/common/vp9_convolve.c
new file mode 100644
index 000000000..f21f1d84e
--- /dev/null
+++ b/vp9/common/vp9_convolve.c
@@ -0,0 +1,345 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "vp9/common/vp9_common.h"
+#include "vpx/vpx_integer.h"
+
+#define VP9_FILTER_WEIGHT 128
+#define VP9_FILTER_SHIFT 7
+#define ALIGN_FILTERS_256 0
+
+/* Assume a bank of 16 filters to choose from. There are two implementations
+ * for filter wrapping behavior, since we want to be able to pick which filter
+ * to start with. We could either:
+ *
+ * 1) make filter_ a pointer to the base of the filter array, and then add an
+ * additional offset parameter, to choose the starting filter.
+ * 2) use a pointer to 2 periods worth of filters, so that even if the original
+ * phase offset is at 15/16, we'll have valid data to read. The filter
+ * tables become [32][8], and the second half is duplicated.
+ * 3) fix the alignment of the filter tables, so that we know the 0/16 is
+ * always 256 byte aligned.
+ *
+ * Implementations 2 and 3 are likely preferable, as they avoid an extra 2
+ * parameters, and switching between them is trivial.
+ */
+static void convolve_horiz_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x0, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int taps) {
+ int x, y, k, sum;
+ const int16_t *filter_x_base = filter_x0;
+
+#if ALIGN_FILTERS_256
+ filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
+#endif
+
+ /* Adjust base pointer address for this source line */
+ src -= taps / 2 - 1;
+
+ for (y = 0; y < h; ++y) {
+ /* Pointer to filter to use */
+ const int16_t *filter_x = filter_x0;
+
+ /* Initial phase offset */
+ int x_q4 = (filter_x - filter_x_base) / taps;
+
+ for (x = 0; x < w; ++x) {
+ /* Per-pixel src offset */
+ int src_x = x_q4 >> 4;
+
+ for (sum = 0, k = 0; k < taps; ++k) {
+ sum += src[src_x + k] * filter_x[k];
+ }
+ sum += (VP9_FILTER_WEIGHT >> 1);
+ dst[x] = clip_pixel(sum >> VP9_FILTER_SHIFT);
+
+ /* Adjust source and filter to use for the next pixel */
+ x_q4 += x_step_q4;
+ filter_x = filter_x_base + (x_q4 & 0xf) * taps;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_avg_horiz_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x0, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int taps) {
+ int x, y, k, sum;
+ const int16_t *filter_x_base = filter_x0;
+
+#if ALIGN_FILTERS_256
+ filter_x_base = (const int16_t *)(((intptr_t)filter_x0) & ~(intptr_t)0xff);
+#endif
+
+ /* Adjust base pointer address for this source line */
+ src -= taps / 2 - 1;
+
+ for (y = 0; y < h; ++y) {
+ /* Pointer to filter to use */
+ const int16_t *filter_x = filter_x0;
+
+ /* Initial phase offset */
+ int x_q4 = (filter_x - filter_x_base) / taps;
+
+ for (x = 0; x < w; ++x) {
+ /* Per-pixel src offset */
+ int src_x = x_q4 >> 4;
+
+ for (sum = 0, k = 0; k < taps; ++k) {
+ sum += src[src_x + k] * filter_x[k];
+ }
+ sum += (VP9_FILTER_WEIGHT >> 1);
+ dst[x] = (dst[x] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1;
+
+ /* Adjust source and filter to use for the next pixel */
+ x_q4 += x_step_q4;
+ filter_x = filter_x_base + (x_q4 & 0xf) * taps;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+static void convolve_vert_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y0, int y_step_q4,
+ int w, int h, int taps) {
+ int x, y, k, sum;
+
+ const int16_t *filter_y_base = filter_y0;
+
+#if ALIGN_FILTERS_256
+ filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
+#endif
+
+ /* Adjust base pointer address for this source column */
+ src -= src_stride * (taps / 2 - 1);
+ for (x = 0; x < w; ++x) {
+ /* Pointer to filter to use */
+ const int16_t *filter_y = filter_y0;
+
+ /* Initial phase offset */
+ int y_q4 = (filter_y - filter_y_base) / taps;
+
+ for (y = 0; y < h; ++y) {
+ /* Per-pixel src offset */
+ int src_y = y_q4 >> 4;
+
+ for (sum = 0, k = 0; k < taps; ++k) {
+ sum += src[(src_y + k) * src_stride] * filter_y[k];
+ }
+ sum += (VP9_FILTER_WEIGHT >> 1);
+ dst[y * dst_stride] = clip_pixel(sum >> VP9_FILTER_SHIFT);
+
+ /* Adjust source and filter to use for the next pixel */
+ y_q4 += y_step_q4;
+ filter_y = filter_y_base + (y_q4 & 0xf) * taps;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void convolve_avg_vert_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y0, int y_step_q4,
+ int w, int h, int taps) {
+ int x, y, k, sum;
+
+ const int16_t *filter_y_base = filter_y0;
+
+#if ALIGN_FILTERS_256
+ filter_y_base = (const int16_t *)(((intptr_t)filter_y0) & ~(intptr_t)0xff);
+#endif
+
+ /* Adjust base pointer address for this source column */
+ src -= src_stride * (taps / 2 - 1);
+ for (x = 0; x < w; ++x) {
+ /* Pointer to filter to use */
+ const int16_t *filter_y = filter_y0;
+
+ /* Initial phase offset */
+ int y_q4 = (filter_y - filter_y_base) / taps;
+
+ for (y = 0; y < h; ++y) {
+ /* Per-pixel src offset */
+ int src_y = y_q4 >> 4;
+
+ for (sum = 0, k = 0; k < taps; ++k) {
+ sum += src[(src_y + k) * src_stride] * filter_y[k];
+ }
+ sum += (VP9_FILTER_WEIGHT >> 1);
+ dst[y * dst_stride] =
+ (dst[y * dst_stride] + clip_pixel(sum >> VP9_FILTER_SHIFT) + 1) >> 1;
+
+ /* Adjust source and filter to use for the next pixel */
+ y_q4 += y_step_q4;
+ filter_y = filter_y_base + (y_q4 & 0xf) * taps;
+ }
+ ++src;
+ ++dst;
+ }
+}
+
+static void convolve_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int taps) {
+ /* Fixed size intermediate buffer places limits on parameters. */
+ uint8_t temp[16 * 23];
+ assert(w <= 16);
+ assert(h <= 16);
+ assert(taps <= 8);
+
+ convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,
+ temp, 16,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h + taps - 1, taps);
+ convolve_vert_c(temp + 16 * (taps / 2 - 1), 16, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h, taps);
+}
+
+static void convolve_avg_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h, int taps) {
+ /* Fixed size intermediate buffer places limits on parameters. */
+ uint8_t temp[16 * 23];
+ assert(w <= 16);
+ assert(h <= 16);
+ assert(taps <= 8);
+
+ convolve_horiz_c(src - src_stride * (taps / 2 - 1), src_stride,
+ temp, 16,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h + taps - 1, taps);
+ convolve_avg_vert_c(temp + 16 * (taps / 2 - 1), 16, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h, taps);
+}
+
+void vp9_convolve8_horiz_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ convolve_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h, 8);
+}
+
+void vp9_convolve8_avg_horiz_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ convolve_avg_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h, 8);
+}
+
+void vp9_convolve8_vert_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ convolve_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h, 8);
+}
+
+void vp9_convolve8_avg_vert_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ convolve_avg_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h, 8);
+}
+
+void vp9_convolve8_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ convolve_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h, 8);
+}
+
+void vp9_convolve8_avg_c(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ convolve_avg_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h, 8);
+}
+
+void vp9_convolve_copy(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ if (h == 16) {
+ vp9_copy_mem16x16(src, src_stride, dst, dst_stride);
+ } else if (h == 8) {
+ vp9_copy_mem8x8(src, src_stride, dst, dst_stride);
+ } else if (w == 8) {
+ vp9_copy_mem8x4(src, src_stride, dst, dst_stride);
+ } else {
+ // 4x4
+ int r;
+
+ for (r = 0; r < 4; ++r) {
+#if !(CONFIG_FAST_UNALIGNED)
+ dst[0] = src[0];
+ dst[1] = src[1];
+ dst[2] = src[2];
+ dst[3] = src[3];
+#else
+ *(uint32_t *)dst = *(const uint32_t *)src;
+#endif
+ src += src_stride;
+ dst += dst_stride;
+ }
+ }
+}
+
+void vp9_convolve_avg(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int filter_x_stride,
+ const int16_t *filter_y, int filter_y_stride,
+ int w, int h) {
+ int x, y;
+
+ for (y = 0; y < h; ++y) {
+ for (x = 0; x < w; ++x) {
+ dst[x] = (dst[x] + src[x] + 1) >> 1;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
diff --git a/vp9/common/vp9_convolve.h b/vp9/common/vp9_convolve.h
new file mode 100644
index 000000000..46c935ab7
--- /dev/null
+++ b/vp9/common/vp9_convolve.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VP9_COMMON_CONVOLVE_H_
+#define VP9_COMMON_CONVOLVE_H_
+
+#include "vpx/vpx_integer.h"
+
+typedef void (*convolve_fn_t)(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+// Not a convolution, a block copy conforming to the convolution prototype
+void vp9_convolve_copy(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+// Not a convolution, a block average conforming to the convolution prototype
+void vp9_convolve_avg(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h);
+
+struct subpix_fn_table {
+ convolve_fn_t predict[2][2][2]; // horiz, vert, avg
+ const int16_t (*filter_x)[8];
+ const int16_t (*filter_y)[8];
+ int x_step_q4;
+ int y_step_q4;
+};
+
+#endif // VP9_COMMON_CONVOLVE_H_
diff --git a/vp9/common/vp9_filter.c b/vp9/common/vp9_filter.c
index 07d8a169f..5e425895f 100644
--- a/vp9/common/vp9_filter.c
+++ b/vp9/common/vp9_filter.c
@@ -15,23 +15,23 @@
#include "vp9_rtcd.h"
#include "vp9/common/vp9_common.h"
-DECLARE_ALIGNED(16, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][2]) = {
- { 128, 0 },
- { 120, 8 },
- { 112, 16 },
- { 104, 24 },
- { 96, 32 },
- { 88, 40 },
- { 80, 48 },
- { 72, 56 },
- { 64, 64 },
- { 56, 72 },
- { 48, 80 },
- { 40, 88 },
- { 32, 96 },
- { 24, 104 },
- { 16, 112 },
- { 8, 120 }
+DECLARE_ALIGNED(16, const int16_t, vp9_bilinear_filters[SUBPEL_SHIFTS][8]) = {
+ { 0, 0, 0, 128, 0, 0, 0, 0 },
+ { 0, 0, 0, 120, 8, 0, 0, 0 },
+ { 0, 0, 0, 112, 16, 0, 0, 0 },
+ { 0, 0, 0, 104, 24, 0, 0, 0 },
+ { 0, 0, 0, 96, 32, 0, 0, 0 },
+ { 0, 0, 0, 88, 40, 0, 0, 0 },
+ { 0, 0, 0, 80, 48, 0, 0, 0 },
+ { 0, 0, 0, 72, 56, 0, 0, 0 },
+ { 0, 0, 0, 64, 64, 0, 0, 0 },
+ { 0, 0, 0, 56, 72, 0, 0, 0 },
+ { 0, 0, 0, 48, 80, 0, 0, 0 },
+ { 0, 0, 0, 40, 88, 0, 0, 0 },
+ { 0, 0, 0, 32, 96, 0, 0, 0 },
+ { 0, 0, 0, 24, 104, 0, 0, 0 },
+ { 0, 0, 0, 16, 112, 0, 0, 0 },
+ { 0, 0, 0, 8, 120, 0, 0, 0 }
};
#define FILTER_ALPHA 0
@@ -144,1072 +144,21 @@ DECLARE_ALIGNED(16, const int16_t,
{ 1, -2, -7, 37, 80, 28, -8, -1}
};
-DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6]) = {
- {0, 0, 128, 0, 0, 0},
- {1, -5, 125, 8, -2, 1},
- {1, -8, 122, 17, -5, 1},
- {2, -11, 116, 27, -8, 2},
- {3, -14, 110, 37, -10, 2},
- {3, -15, 103, 47, -12, 2},
- {3, -16, 95, 57, -14, 3},
- {3, -16, 86, 67, -15, 3},
- {3, -16, 77, 77, -16, 3},
- {3, -15, 67, 86, -16, 3},
- {3, -14, 57, 95, -16, 3},
- {2, -12, 47, 103, -15, 3},
- {2, -10, 37, 110, -14, 3},
- {2, -8, 27, 116, -11, 2},
- {1, -5, 17, 122, -8, 1},
- {1, -2, 8, 125, -5, 1}
+DECLARE_ALIGNED(16, const int16_t, vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8]) = {
+ {0, 0, 0, 128, 0, 0, 0, 0},
+ {0, 1, -5, 125, 8, -2, 1, 0},
+ {0, 1, -8, 122, 17, -5, 1, 0},
+ {0, 2, -11, 116, 27, -8, 2, 0},
+ {0, 3, -14, 110, 37, -10, 2, 0},
+ {0, 3, -15, 103, 47, -12, 2, 0},
+ {0, 3, -16, 95, 57, -14, 3, 0},
+ {0, 3, -16, 86, 67, -15, 3, 0},
+ {0, 3, -16, 77, 77, -16, 3, 0},
+ {0, 3, -15, 67, 86, -16, 3, 0},
+ {0, 3, -14, 57, 95, -16, 3, 0},
+ {0, 2, -12, 47, 103, -15, 3, 0},
+ {0, 2, -10, 37, 110, -14, 3, 0},
+ {0, 2, -8, 27, 116, -11, 2, 0},
+ {0, 1, -5, 17, 122, -8, 1, 0},
+ {0, 1, -2, 8, 125, -5, 1, 0}
};
-
-static void filter_block2d_first_pass_6(uint8_t *src_ptr,
- int *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const int16_t *vp9_filter) {
- unsigned int i, j;
- int temp;
-
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
- ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
- ((int)src_ptr[0] * vp9_filter[2]) +
- ((int)src_ptr[pixel_step] * vp9_filter[3]) +
- ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) +
- ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) +
- (VP9_FILTER_WEIGHT >> 1); /* Rounding */
-
- /* Normalize back to 0-255 */
- output_ptr[j] = clip_pixel(temp >> VP9_FILTER_SHIFT);
- src_ptr++;
- }
-
- /* Next row... */
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_width;
- }
-}
-
-static void filter_block2d_second_pass_6(int *src_ptr,
- uint8_t *output_ptr,
- int output_pitch,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const int16_t *vp9_filter) {
- unsigned int i, j;
- int temp;
-
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- /* Apply filter */
- temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
- ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
- ((int)src_ptr[0] * vp9_filter[2]) +
- ((int)src_ptr[pixel_step] * vp9_filter[3]) +
- ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) +
- ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) +
- (VP9_FILTER_WEIGHT >> 1); /* Rounding */
-
- /* Normalize back to 0-255 */
- output_ptr[j] = clip_pixel(temp >> VP9_FILTER_SHIFT);
- src_ptr++;
- }
-
- /* Start next row */
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_pitch;
- }
-}
-
-/*
- * The only functional difference between filter_block2d_second_pass()
- * and this function is that filter_block2d_second_pass() does a sixtap
- * filter on the input and stores it in the output. This function
- * (filter_block2d_second_pass_avg()) does a sixtap filter on the input,
- * and then averages that with the content already present in the output
- * ((filter_result + dest + 1) >> 1) and stores that in the output.
- */
-static void filter_block2d_second_pass_avg_6(int *src_ptr,
- uint8_t *output_ptr,
- int output_pitch,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const int16_t *vp9_filter) {
- unsigned int i, j;
- int temp;
-
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- /* Apply filter */
- temp = ((int)src_ptr[-2 * (int)pixel_step] * vp9_filter[0]) +
- ((int)src_ptr[-1 * (int)pixel_step] * vp9_filter[1]) +
- ((int)src_ptr[0] * vp9_filter[2]) +
- ((int)src_ptr[pixel_step] * vp9_filter[3]) +
- ((int)src_ptr[2 * pixel_step] * vp9_filter[4]) +
- ((int)src_ptr[3 * pixel_step] * vp9_filter[5]) +
- (VP9_FILTER_WEIGHT >> 1); /* Rounding */
-
- /* Normalize back to 0-255 */
- output_ptr[j] = (clip_pixel(temp >> VP9_FILTER_SHIFT) +
- output_ptr[j] + 1) >> 1;
- src_ptr++;
- }
-
- /* Start next row */
- src_ptr += src_pixels_per_line - output_width;
- output_ptr += output_pitch;
- }
-}
-
-#define Interp_Extend 3
-static void filter_block2d_6(uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int output_pitch,
- const int16_t *HFilter,
- const int16_t *VFilter) {
- int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer */
-
- /* First filter 1-D horizontally... */
- filter_block2d_first_pass_6(
- src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
- src_pixels_per_line, 1, 3 + Interp_Extend * 2, 4, HFilter);
-
- /* then filter vertically... */
- filter_block2d_second_pass_6(FData + 4 * (Interp_Extend - 1), output_ptr,
- output_pitch, 4, 4, 4, 4, VFilter);
-}
-
-
-void vp9_sixtap_predict4x4_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter;
- const int16_t *VFilter;
-
- HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
- VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
-
- filter_block2d_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter,
- VFilter);
-}
-
-/*
- * The difference between filter_block2d_6() and filter_block2d_avg_6 is
- * that filter_block2d_6() does a 6-tap filter and stores it in the output
- * buffer, whereas filter_block2d_avg_6() does the same 6-tap filter, and
- * then averages that with the content already present in the output
- * ((filter_result + dest + 1) >> 1) and stores that in the output.
- */
-static void filter_block2d_avg_6(uint8_t *src_ptr,
- uint8_t *output_ptr,
- unsigned int src_pixels_per_line,
- int output_pitch,
- const int16_t *HFilter,
- const int16_t *VFilter) {
- int FData[(3 + Interp_Extend * 2) * 4]; /* Temp data buffer */
-
- /* First filter 1-D horizontally... */
- filter_block2d_first_pass_6(
- src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
- src_pixels_per_line, 1, 3 + Interp_Extend * 2, 4, HFilter);
-
- /* then filter vertically... */
- filter_block2d_second_pass_avg_6(FData + 4 * (Interp_Extend - 1), output_ptr,
- output_pitch, 4, 4, 4, 4, VFilter);
-}
-
-void vp9_sixtap_predict_avg4x4_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter;
- const int16_t *VFilter;
-
- HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
- VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
-
- filter_block2d_avg_6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch,
- HFilter, VFilter);
-}
-
-void vp9_sixtap_predict8x8_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter;
- const int16_t *VFilter;
- int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer */
-
- HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
- VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
-
- /* First filter 1-D horizontally... */
- filter_block2d_first_pass_6(
- src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
- src_pixels_per_line, 1, 7 + Interp_Extend * 2, 8, HFilter);
-
- /* then filter vertically... */
- filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr,
- dst_pitch, 8, 8, 8, 8, VFilter);
-
-}
-
-void vp9_sixtap_predict_avg8x8_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter;
- const int16_t *VFilter;
- int FData[(7 + Interp_Extend * 2) * 8]; /* Temp data buffer */
-
- HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
- VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
-
- /* First filter 1-D horizontally... */
- filter_block2d_first_pass_6(
- src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
- src_pixels_per_line, 1, 7 + Interp_Extend * 2, 8, HFilter);
-
- /* then filter vertically... */
- filter_block2d_second_pass_avg_6(FData + 8 * (Interp_Extend - 1), dst_ptr,
- dst_pitch, 8, 8, 8, 8, VFilter);
-}
-
-void vp9_sixtap_predict8x4_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter;
- const int16_t *VFilter;
- int FData[(3 + Interp_Extend * 2) * 8]; /* Temp data buffer */
-
- HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
- VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
-
- /* First filter 1-D horizontally... */
- filter_block2d_first_pass_6(
- src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
- src_pixels_per_line, 1, 3 + Interp_Extend * 2, 8, HFilter);
-
- /* then filter vertically... */
- filter_block2d_second_pass_6(FData + 8 * (Interp_Extend - 1), dst_ptr,
- dst_pitch, 8, 8, 4, 8, VFilter);
-}
-
-void vp9_sixtap_predict16x16_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter;
- const int16_t *VFilter;
- int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer */
-
- HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
- VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
-
- /* First filter 1-D horizontally... */
- filter_block2d_first_pass_6(
- src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
- src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter);
-
- /* then filter vertically... */
- filter_block2d_second_pass_6(FData + 16 * (Interp_Extend - 1), dst_ptr,
- dst_pitch, 16, 16, 16, 16, VFilter);
-}
-
-void vp9_sixtap_predict_avg16x16_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter;
- const int16_t *VFilter;
- int FData[(15 + Interp_Extend * 2) * 16]; /* Temp data buffer */
-
- HFilter = vp9_sub_pel_filters_6[xoffset]; /* 6 tap */
- VFilter = vp9_sub_pel_filters_6[yoffset]; /* 6 tap */
-
- /* First filter 1-D horizontally... */
- filter_block2d_first_pass_6(
- src_ptr - ((Interp_Extend - 1) * src_pixels_per_line), FData,
- src_pixels_per_line, 1, 15 + Interp_Extend * 2, 16, HFilter);
-
- /* then filter vertically... */
- filter_block2d_second_pass_avg_6(FData + 16 * (Interp_Extend - 1), dst_ptr,
- dst_pitch, 16, 16, 16, 16, VFilter);
-}
-
-typedef enum {
- VPX_FILTER_4x4 = 0,
- VPX_FILTER_8x8 = 1,
- VPX_FILTER_8x4 = 2,
- VPX_FILTER_16x16 = 3,
-} filter_size_t;
-
-static const unsigned int filter_size_to_wh[][2] = {
- {4, 4},
- {8, 8},
- {8, 4},
- {16,16},
-};
-
-static void filter_block2d_8_c(const uint8_t *src_ptr,
- const unsigned int src_stride,
- const int16_t *HFilter,
- const int16_t *VFilter,
- const filter_size_t filter_size,
- uint8_t *dst_ptr,
- unsigned int dst_stride) {
- const unsigned int output_width = filter_size_to_wh[filter_size][0];
- const unsigned int output_height = filter_size_to_wh[filter_size][1];
-
- // Between passes, we use an intermediate buffer whose height is extended to
- // have enough horizontally filtered values as input for the vertical pass.
- // This buffer is allocated to be big enough for the largest block type we
- // support.
- const int kInterp_Extend = 4;
- const unsigned int intermediate_height =
- (kInterp_Extend - 1) + output_height + kInterp_Extend;
-
- /* Size of intermediate_buffer is max_intermediate_height * filter_max_width,
- * where max_intermediate_height = (kInterp_Extend - 1) + filter_max_height
- * + kInterp_Extend
- * = 3 + 16 + 4
- * = 23
- * and filter_max_width = 16
- */
- uint8_t intermediate_buffer[23 * 16];
- const int intermediate_next_stride = 1 - intermediate_height * output_width;
-
- // Horizontal pass (src -> transposed intermediate).
- {
- uint8_t *output_ptr = intermediate_buffer;
- const int src_next_row_stride = src_stride - output_width;
- unsigned int i, j;
- src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
- for (i = 0; i < intermediate_height; i++) {
- for (j = 0; j < output_width; j++) {
- // Apply filter...
- int temp = ((int)src_ptr[0] * HFilter[0]) +
- ((int)src_ptr[1] * HFilter[1]) +
- ((int)src_ptr[2] * HFilter[2]) +
- ((int)src_ptr[3] * HFilter[3]) +
- ((int)src_ptr[4] * HFilter[4]) +
- ((int)src_ptr[5] * HFilter[5]) +
- ((int)src_ptr[6] * HFilter[6]) +
- ((int)src_ptr[7] * HFilter[7]) +
- (VP9_FILTER_WEIGHT >> 1); // Rounding
-
- // Normalize back to 0-255...
- *output_ptr = clip_pixel(temp >> VP9_FILTER_SHIFT);
- src_ptr++;
- output_ptr += intermediate_height;
- }
- src_ptr += src_next_row_stride;
- output_ptr += intermediate_next_stride;
- }
- }
-
- // Vertical pass (transposed intermediate -> dst).
- {
- uint8_t *src_ptr = intermediate_buffer;
- const int dst_next_row_stride = dst_stride - output_width;
- unsigned int i, j;
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- // Apply filter...
- int temp = ((int)src_ptr[0] * VFilter[0]) +
- ((int)src_ptr[1] * VFilter[1]) +
- ((int)src_ptr[2] * VFilter[2]) +
- ((int)src_ptr[3] * VFilter[3]) +
- ((int)src_ptr[4] * VFilter[4]) +
- ((int)src_ptr[5] * VFilter[5]) +
- ((int)src_ptr[6] * VFilter[6]) +
- ((int)src_ptr[7] * VFilter[7]) +
- (VP9_FILTER_WEIGHT >> 1); // Rounding
-
- // Normalize back to 0-255...
- *dst_ptr++ = clip_pixel(temp >> VP9_FILTER_SHIFT);
- src_ptr += intermediate_height;
- }
- src_ptr += intermediate_next_stride;
- dst_ptr += dst_next_row_stride;
- }
- }
-}
-
-void vp9_filter_block2d_4x4_8_c(const uint8_t *src_ptr,
- const unsigned int src_stride,
- const int16_t *HFilter_aligned16,
- const int16_t *VFilter_aligned16,
- uint8_t *dst_ptr,
- unsigned int dst_stride) {
- filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,
- VPX_FILTER_4x4, dst_ptr, dst_stride);
-}
-
-void vp9_filter_block2d_8x4_8_c(const uint8_t *src_ptr,
- const unsigned int src_stride,
- const int16_t *HFilter_aligned16,
- const int16_t *VFilter_aligned16,
- uint8_t *dst_ptr,
- unsigned int dst_stride) {
- filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,
- VPX_FILTER_8x4, dst_ptr, dst_stride);
-}
-
-void vp9_filter_block2d_8x8_8_c(const uint8_t *src_ptr,
- const unsigned int src_stride,
- const int16_t *HFilter_aligned16,
- const int16_t *VFilter_aligned16,
- uint8_t *dst_ptr,
- unsigned int dst_stride) {
- filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,
- VPX_FILTER_8x8, dst_ptr, dst_stride);
-}
-
-void vp9_filter_block2d_16x16_8_c(const uint8_t *src_ptr,
- const unsigned int src_stride,
- const int16_t *HFilter_aligned16,
- const int16_t *VFilter_aligned16,
- uint8_t *dst_ptr,
- unsigned int dst_stride) {
- filter_block2d_8_c(src_ptr, src_stride, HFilter_aligned16, VFilter_aligned16,
- VPX_FILTER_16x16, dst_ptr, dst_stride);
-}
-
-static void block2d_average_c(uint8_t *src,
- unsigned int src_stride,
- uint8_t *output_ptr,
- unsigned int output_stride,
- const filter_size_t filter_size) {
- const unsigned int output_width = filter_size_to_wh[filter_size][0];
- const unsigned int output_height = filter_size_to_wh[filter_size][1];
-
- unsigned int i, j;
- for (i = 0; i < output_height; i++) {
- for (j = 0; j < output_width; j++) {
- output_ptr[j] = (output_ptr[j] + src[i * src_stride + j] + 1) >> 1;
- }
- output_ptr += output_stride;
- }
-}
-
-#define block2d_average block2d_average_c
-
-void vp9_eighttap_predict4x4_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter;
- const int16_t *VFilter;
-
- HFilter = vp9_sub_pel_filters_8[xoffset];
- VFilter = vp9_sub_pel_filters_8[yoffset];
-
- vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
- dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg4x4_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
- const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
- uint8_t tmp[4 * 4];
-
- vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
- 4);
- block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
-}
-
-void vp9_eighttap_predict4x4_sharp_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter;
- const int16_t *VFilter;
-
- HFilter = vp9_sub_pel_filters_8s[xoffset];
- VFilter = vp9_sub_pel_filters_8s[yoffset];
-
- vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
- dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict4x4_smooth_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter;
- const int16_t *VFilter;
-
- HFilter = vp9_sub_pel_filters_8lp[xoffset];
- VFilter = vp9_sub_pel_filters_8lp[yoffset];
-
- vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line,
- HFilter, VFilter,
- dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg4x4_sharp_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
- const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
- uint8_t tmp[4 * 4];
-
- vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
- 4);
- block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
-}
-
-void vp9_eighttap_predict_avg4x4_smooth_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
- const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
- uint8_t tmp[4 * 4];
-
- vp9_filter_block2d_4x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
- 4);
- block2d_average(tmp, 4, dst_ptr, dst_pitch, VPX_FILTER_4x4);
-}
-
-
-void vp9_eighttap_predict8x8_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
- const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
-
- vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
- dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict8x8_sharp_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
- const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
- vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
- dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict8x8_smooth_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
- const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
-
- vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
- dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg8x8_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- uint8_t tmp[8 * 8];
- const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
- const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
-
- vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
- 8);
- block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
-}
-
-void vp9_eighttap_predict_avg8x8_sharp_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- uint8_t tmp[8 * 8];
- const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
- const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
- vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
- 8);
- block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
-}
-
-void vp9_eighttap_predict_avg8x8_smooth_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- uint8_t tmp[8 * 8];
- const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
- const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
-
- vp9_filter_block2d_8x8_8(src_ptr, src_pixels_per_line, HFilter, VFilter, tmp,
- 8);
- block2d_average(tmp, 8, dst_ptr, dst_pitch, VPX_FILTER_8x8);
-}
-
-void vp9_eighttap_predict8x4_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
- const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
-
- vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
- dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict8x4_sharp_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
- const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
- vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
- dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict8x4_smooth_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
- const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
-
- vp9_filter_block2d_8x4_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
- dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict16x16_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
- const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
-
- vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
- dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict16x16_sharp_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
- const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
- vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
- dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict16x16_smooth_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
- const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
-
- vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
- dst_ptr, dst_pitch);
-}
-
-void vp9_eighttap_predict_avg16x16_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16);
- const int16_t *HFilter = vp9_sub_pel_filters_8[xoffset];
- const int16_t *VFilter = vp9_sub_pel_filters_8[yoffset];
-
- vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
- tmp, 16);
- block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
-}
-
-void vp9_eighttap_predict_avg16x16_sharp_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16);
- const int16_t *HFilter = vp9_sub_pel_filters_8s[xoffset];
- const int16_t *VFilter = vp9_sub_pel_filters_8s[yoffset];
-
- vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
- tmp, 16);
- block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
-}
-
-void vp9_eighttap_predict_avg16x16_smooth_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp, 16 * 16);
- const int16_t *HFilter = vp9_sub_pel_filters_8lp[xoffset];
- const int16_t *VFilter = vp9_sub_pel_filters_8lp[yoffset];
-
- vp9_filter_block2d_16x16_8(src_ptr, src_pixels_per_line, HFilter, VFilter,
- tmp, 16);
- block2d_average(tmp, 16, dst_ptr, dst_pitch, VPX_FILTER_16x16);
-}
-
-/****************************************************************************
- *
- * ROUTINE : filter_block2d_bil_first_pass
- *
- * INPUTS : uint8_t *src_ptr : Pointer to source block.
- * uint32_t src_stride : Stride of source block.
- * uint32_t height : Block height.
- * uint32_t width : Block width.
- * int32_t *vp9_filter : Array of 2 bi-linear filter taps.
- *
- * OUTPUTS : int32_t *dst_ptr : Pointer to filtered block.
- *
- * RETURNS : void
- *
- * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block
- * in the horizontal direction to produce the filtered output
- * block. Used to implement first-pass of 2-D separable filter.
- *
- * SPECIAL NOTES : Produces int32_t output to retain precision for next pass.
- * Two filter taps should sum to VP9_FILTER_WEIGHT.
- *
- ****************************************************************************/
-static void filter_block2d_bil_first_pass(uint8_t *src_ptr,
- uint16_t *dst_ptr,
- unsigned int src_stride,
- unsigned int height,
- unsigned int width,
- const int16_t *vp9_filter) {
- unsigned int i, j;
-
- for (i = 0; i < height; i++) {
- for (j = 0; j < width; j++) {
- /* Apply bilinear filter */
- dst_ptr[j] = (((int)src_ptr[0] * vp9_filter[0]) +
- ((int)src_ptr[1] * vp9_filter[1]) +
- (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
- src_ptr++;
- }
-
- /* Next row... */
- src_ptr += src_stride - width;
- dst_ptr += width;
- }
-}
-
-/****************************************************************************
- *
- * ROUTINE : filter_block2d_bil_second_pass
- *
- * INPUTS : int32_t *src_ptr : Pointer to source block.
- * uint32_t dst_pitch : Destination block pitch.
- * uint32_t height : Block height.
- * uint32_t width : Block width.
- * int32_t *vp9_filter : Array of 2 bi-linear filter taps.
- *
- * OUTPUTS : uint16_t *dst_ptr : Pointer to filtered block.
- *
- * RETURNS : void
- *
- * FUNCTION : Applies a 1-D 2-tap bi-linear filter to the source block
- * in the vertical direction to produce the filtered output
- * block. Used to implement second-pass of 2-D separable filter.
- *
- * SPECIAL NOTES : Requires 32-bit input as produced by filter_block2d_bil_first_pass.
- * Two filter taps should sum to VP9_FILTER_WEIGHT.
- *
- ****************************************************************************/
-static void filter_block2d_bil_second_pass(uint16_t *src_ptr,
- uint8_t *dst_ptr,
- int dst_pitch,
- unsigned int height,
- unsigned int width,
- const int16_t *vp9_filter) {
- unsigned int i, j;
- int temp;
-
- for (i = 0; i < height; i++) {
- for (j = 0; j < width; j++) {
- /* Apply filter */
- temp = ((int)src_ptr[0] * vp9_filter[0]) +
- ((int)src_ptr[width] * vp9_filter[1]) +
- (VP9_FILTER_WEIGHT / 2);
- dst_ptr[j] = (unsigned int)(temp >> VP9_FILTER_SHIFT);
- src_ptr++;
- }
-
- /* Next row... */
- dst_ptr += dst_pitch;
- }
-}
-
-/*
- * As before for filter_block2d_second_pass_avg(), the functional difference
- * between filter_block2d_bil_second_pass() and filter_block2d_bil_second_pass_avg()
- * is that filter_block2d_bil_second_pass() does a bilinear filter on input
- * and stores the result in output; filter_block2d_bil_second_pass_avg(),
- * instead, does a bilinear filter on input, averages the resulting value
- * with the values already present in the output and stores the result of
- * that back into the output ((filter_result + dest + 1) >> 1).
- */
-static void filter_block2d_bil_second_pass_avg(uint16_t *src_ptr,
- uint8_t *dst_ptr,
- int dst_pitch,
- unsigned int height,
- unsigned int width,
- const int16_t *vp9_filter) {
- unsigned int i, j;
- int temp;
-
- for (i = 0; i < height; i++) {
- for (j = 0; j < width; j++) {
- /* Apply filter */
- temp = (((int)src_ptr[0] * vp9_filter[0]) +
- ((int)src_ptr[width] * vp9_filter[1]) +
- (VP9_FILTER_WEIGHT / 2)) >> VP9_FILTER_SHIFT;
- dst_ptr[j] = (unsigned int)((temp + dst_ptr[j] + 1) >> 1);
- src_ptr++;
- }
-
- /* Next row... */
- dst_ptr += dst_pitch;
- }
-}
-
-/****************************************************************************
- *
- * ROUTINE : filter_block2d_bil
- *
- * INPUTS : uint8_t *src_ptr : Pointer to source block.
- * uint32_t src_pitch : Stride of source block.
- * uint32_t dst_pitch : Stride of destination block.
- * int32_t *HFilter : Array of 2 horizontal filter taps.
- * int32_t *VFilter : Array of 2 vertical filter taps.
- * int32_t Width : Block width
- * int32_t Height : Block height
- *
- * OUTPUTS : uint16_t *dst_ptr : Pointer to filtered block.
- *
- * RETURNS : void
- *
- * FUNCTION : 2-D filters an input block by applying a 2-tap
- * bi-linear filter horizontally followed by a 2-tap
- * bi-linear filter vertically on the result.
- *
- * SPECIAL NOTES : The largest block size can be handled here is 16x16
- *
- ****************************************************************************/
-static void filter_block2d_bil(uint8_t *src_ptr,
- uint8_t *dst_ptr,
- unsigned int src_pitch,
- unsigned int dst_pitch,
- const int16_t *HFilter,
- const int16_t *VFilter,
- int Width,
- int Height) {
-
- uint16_t FData[17 * 16]; /* Temp data buffer used in filtering */
-
- /* First filter 1-D horizontally... */
- filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
-
- /* then 1-D vertically... */
- filter_block2d_bil_second_pass(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
-}
-
-static void filter_block2d_bil_avg(uint8_t *src_ptr,
- uint8_t *dst_ptr,
- unsigned int src_pitch,
- unsigned int dst_pitch,
- const int16_t *HFilter,
- const int16_t *VFilter,
- int Width,
- int Height) {
- uint16_t FData[17 * 16]; /* Temp data buffer used in filtering */
-
- /* First filter 1-D horizontally... */
- filter_block2d_bil_first_pass(src_ptr, FData, src_pitch, Height + 1, Width, HFilter);
-
- /* then 1-D vertically... */
- filter_block2d_bil_second_pass_avg(FData, dst_ptr, dst_pitch, Height, Width, VFilter);
-}
-
-void vp9_bilinear_predict4x4_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter;
- const int16_t *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 4, 4);
-}
-
-void vp9_bilinear_predict_avg4x4_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter;
- const int16_t *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
- dst_pitch, HFilter, VFilter, 4, 4);
-}
-
-void vp9_bilinear_predict8x8_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter;
- const int16_t *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 8);
-
-}
-
-void vp9_bilinear_predict_avg8x8_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter;
- const int16_t *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
- dst_pitch, HFilter, VFilter, 8, 8);
-}
-
-void vp9_bilinear_predict8x4_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter;
- const int16_t *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 8, 4);
-
-}
-
-void vp9_bilinear_predict16x16_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter;
- const int16_t *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- filter_block2d_bil(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, HFilter, VFilter, 16, 16);
-}
-
-void vp9_bilinear_predict_avg16x16_c(uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- uint8_t *dst_ptr,
- int dst_pitch) {
- const int16_t *HFilter;
- const int16_t *VFilter;
-
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
-
- filter_block2d_bil_avg(src_ptr, dst_ptr, src_pixels_per_line,
- dst_pitch, HFilter, VFilter, 16, 16);
-}
diff --git a/vp9/common/vp9_filter.h b/vp9/common/vp9_filter.h
index cd666578d..1ccfdaac2 100644
--- a/vp9/common/vp9_filter.h
+++ b/vp9/common/vp9_filter.h
@@ -21,10 +21,17 @@
#define SUBPEL_SHIFTS 16
-extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][2];
-extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][6];
+extern const int16_t vp9_bilinear_filters[SUBPEL_SHIFTS][8];
+extern const int16_t vp9_sub_pel_filters_6[SUBPEL_SHIFTS][8];
extern const int16_t vp9_sub_pel_filters_8[SUBPEL_SHIFTS][8];
extern const int16_t vp9_sub_pel_filters_8s[SUBPEL_SHIFTS][8];
extern const int16_t vp9_sub_pel_filters_8lp[SUBPEL_SHIFTS][8];
+// The VP9_BILINEAR_FILTERS_2TAP macro returns a pointer to the bilinear
+// filter kernel as a 2 tap filter.
+#define BF_LENGTH (sizeof(vp9_bilinear_filters[0]) / \
+ sizeof(vp9_bilinear_filters[0][0]))
+#define BF_OFFSET (BF_LENGTH / 2 - 1)
+#define VP9_BILINEAR_FILTERS_2TAP(x) (vp9_bilinear_filters[x] + BF_OFFSET)
+
#endif // VP9_COMMON_VP9_FILTER_H_
diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c
index 9585611dd..f713212a7 100644
--- a/vp9/common/vp9_findnearmv.c
+++ b/vp9/common/vp9_findnearmv.c
@@ -87,8 +87,8 @@ unsigned int vp9_sub_pixel_variance16x2_c(const uint8_t *src_ptr,
uint8_t temp2[2 * 16];
const int16_t *HFilter, *VFilter;
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
+ HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, FData3,
src_pixels_per_line, 1, 3, 16, HFilter);
@@ -108,8 +108,8 @@ unsigned int vp9_sub_pixel_variance2x16_c(const uint8_t *src_ptr,
uint8_t temp2[2 * 16];
const int16_t *HFilter, *VFilter;
- HFilter = vp9_bilinear_filters[xoffset];
- VFilter = vp9_bilinear_filters[yoffset];
+ HFilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ VFilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
var_filter_block2d_bil_first_pass(src_ptr, FData3,
src_pixels_per_line, 1, 17, 2, HFilter);
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 20de7b7f1..d4435d872 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -8,66 +8,58 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
#include "./vpx_config.h"
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
void vp9_setup_interp_filters(MACROBLOCKD *xd,
INTERPOLATIONFILTERTYPE mcomp_filter_type,
VP9_COMMON *cm) {
+ // TODO(agrange): Investigate the best choice of functions to use here
+ // for EIGHTTAP_SMOOTH. Since it is not interpolating, need to choose what
+ // to do at full-pel offsets. The current selection, where the filter is
+ // applied in one direction only, and not at all for 0,0, seems to give the
+ // best quality, but it may be worth trying an additional mode that does
+ // do the filtering on full-pel.
+ xd->subpix.predict[0][0][0] = vp9_convolve_copy;
+ xd->subpix.predict[0][0][1] = vp9_convolve_avg;
+ xd->subpix.predict[0][1][0] = vp9_convolve8_vert;
+ xd->subpix.predict[0][1][1] = vp9_convolve8_avg_vert;
+ xd->subpix.predict[1][0][0] = vp9_convolve8_horiz;
+ xd->subpix.predict[1][0][1] = vp9_convolve8_avg_horiz;
+ xd->subpix.predict[1][1][0] = vp9_convolve8;
+ xd->subpix.predict[1][1][1] = vp9_convolve8_avg;
+
+ xd->subpix.x_step_q4 = 16;
+ xd->subpix.y_step_q4 = 16;
+ switch (mcomp_filter_type) {
+ case EIGHTTAP:
+ case SWITCHABLE:
+ xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8;
+ break;
+ case EIGHTTAP_SMOOTH:
+ xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8lp;
+ break;
+ case EIGHTTAP_SHARP:
+ xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_8s;
+ break;
+ case BILINEAR:
+ xd->subpix.filter_x = xd->subpix.filter_y = vp9_bilinear_filters;
+ break;
#if CONFIG_ENABLE_6TAP
- if (mcomp_filter_type == SIXTAP) {
- xd->subpixel_predict4x4 = vp9_sixtap_predict4x4;
- xd->subpixel_predict8x4 = vp9_sixtap_predict8x4;
- xd->subpixel_predict8x8 = vp9_sixtap_predict8x8;
- xd->subpixel_predict16x16 = vp9_sixtap_predict16x16;
- xd->subpixel_predict_avg4x4 = vp9_sixtap_predict_avg4x4;
- xd->subpixel_predict_avg8x8 = vp9_sixtap_predict_avg8x8;
- xd->subpixel_predict_avg16x16 = vp9_sixtap_predict_avg16x16;
- } else {
+ case SIXTAP:
+ xd->subpix.filter_x = xd->subpix.filter_y = vp9_sub_pel_filters_6;
+ break;
#endif
- if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) {
- xd->subpixel_predict4x4 = vp9_eighttap_predict4x4;
- xd->subpixel_predict8x4 = vp9_eighttap_predict8x4;
- xd->subpixel_predict8x8 = vp9_eighttap_predict8x8;
- xd->subpixel_predict16x16 = vp9_eighttap_predict16x16;
- xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4;
- xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8;
- xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16;
- } else if (mcomp_filter_type == EIGHTTAP_SMOOTH) {
- xd->subpixel_predict4x4 = vp9_eighttap_predict4x4_smooth;
- xd->subpixel_predict8x4 = vp9_eighttap_predict8x4_smooth;
- xd->subpixel_predict8x8 = vp9_eighttap_predict8x8_smooth;
- xd->subpixel_predict16x16 = vp9_eighttap_predict16x16_smooth;
- xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_smooth;
- xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_smooth;
- xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_smooth;
- } else if (mcomp_filter_type == EIGHTTAP_SHARP) {
- xd->subpixel_predict4x4 = vp9_eighttap_predict4x4_sharp;
- xd->subpixel_predict8x4 = vp9_eighttap_predict8x4_sharp;
- xd->subpixel_predict8x8 = vp9_eighttap_predict8x8_sharp;
- xd->subpixel_predict16x16 = vp9_eighttap_predict16x16_sharp;
- xd->subpixel_predict_avg4x4 = vp9_eighttap_predict_avg4x4_sharp;
- xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_sharp;
- xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_sharp_c;
- } else {
- xd->subpixel_predict4x4 = vp9_bilinear_predict4x4;
- xd->subpixel_predict8x4 = vp9_bilinear_predict8x4;
- xd->subpixel_predict8x8 = vp9_bilinear_predict8x8;
- xd->subpixel_predict16x16 = vp9_bilinear_predict16x16;
- xd->subpixel_predict_avg4x4 = vp9_bilinear_predict_avg4x4;
- xd->subpixel_predict_avg8x8 = vp9_bilinear_predict_avg8x8;
- xd->subpixel_predict_avg16x16 = vp9_bilinear_predict_avg16x16;
- }
-#if CONFIG_ENABLE_6TAP
}
-#endif
}
-void vp9_copy_mem16x16_c(uint8_t *src,
+void vp9_copy_mem16x16_c(const uint8_t *src,
int src_stride,
uint8_t *dst,
int dst_stride) {
@@ -93,10 +85,10 @@ void vp9_copy_mem16x16_c(uint8_t *src,
dst[15] = src[15];
#else
- ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
- ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
- ((uint32_t *)dst)[2] = ((uint32_t *)src)[2];
- ((uint32_t *)dst)[3] = ((uint32_t *)src)[3];
+ ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
+ ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
+ ((uint32_t *)dst)[2] = ((const uint32_t *)src)[2];
+ ((uint32_t *)dst)[3] = ((const uint32_t *)src)[3];
#endif
src += src_stride;
@@ -104,25 +96,7 @@ void vp9_copy_mem16x16_c(uint8_t *src,
}
}
-void vp9_avg_mem16x16_c(uint8_t *src,
- int src_stride,
- uint8_t *dst,
- int dst_stride) {
- int r;
-
- for (r = 0; r < 16; r++) {
- int n;
-
- for (n = 0; n < 16; n++) {
- dst[n] = (dst[n] + src[n] + 1) >> 1;
- }
-
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void vp9_copy_mem8x8_c(uint8_t *src,
+void vp9_copy_mem8x8_c(const uint8_t *src,
int src_stride,
uint8_t *dst,
int dst_stride) {
@@ -139,33 +113,15 @@ void vp9_copy_mem8x8_c(uint8_t *src,
dst[6] = src[6];
dst[7] = src[7];
#else
- ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
- ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
+ ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
+ ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
#endif
src += src_stride;
dst += dst_stride;
}
}
-void vp9_avg_mem8x8_c(uint8_t *src,
- int src_stride,
- uint8_t *dst,
- int dst_stride) {
- int r;
-
- for (r = 0; r < 8; r++) {
- int n;
-
- for (n = 0; n < 8; n++) {
- dst[n] = (dst[n] + src[n] + 1) >> 1;
- }
-
- src += src_stride;
- dst += dst_stride;
- }
-}
-
-void vp9_copy_mem8x4_c(uint8_t *src,
+void vp9_copy_mem8x4_c(const uint8_t *src,
int src_stride,
uint8_t *dst,
int dst_stride) {
@@ -182,16 +138,16 @@ void vp9_copy_mem8x4_c(uint8_t *src,
dst[6] = src[6];
dst[7] = src[7];
#else
- ((uint32_t *)dst)[0] = ((uint32_t *)src)[0];
- ((uint32_t *)dst)[1] = ((uint32_t *)src)[1];
+ ((uint32_t *)dst)[0] = ((const uint32_t *)src)[0];
+ ((uint32_t *)dst)[1] = ((const uint32_t *)src)[1];
#endif
src += src_stride;
dst += dst_stride;
}
}
-void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {
- int r;
+void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,
+ struct subpix_fn_table *subpix) {
uint8_t *ptr_base;
uint8_t *ptr;
uint8_t *pred_ptr = d->predictor;
@@ -199,30 +155,14 @@ void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {
ptr_base = *(d->base_pre);
mv.as_int = d->bmi.as_mv.first.as_int;
+ ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+ (mv.as_mv.col >> 3);
- if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
- ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
- (mv.as_mv.col >> 3);
- sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
- pred_ptr, pitch);
- } else {
- ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
- (mv.as_mv.col >> 3);
- ptr = ptr_base;
-
- for (r = 0; r < 4; r++) {
-#if !(CONFIG_FAST_UNALIGNED)
- pred_ptr[0] = ptr[0];
- pred_ptr[1] = ptr[1];
- pred_ptr[2] = ptr[2];
- pred_ptr[3] = ptr[3];
-#else
- *(uint32_t *)pred_ptr = *(uint32_t *)ptr;
-#endif
- pred_ptr += pitch;
- ptr += d->pre_stride;
- }
- }
+ subpix->predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0](
+ ptr, d->pre_stride, pred_ptr, pitch,
+ subpix->filter_x[(mv.as_mv.col & 7) << 1], subpix->x_step_q4,
+ subpix->filter_y[(mv.as_mv.row & 7) << 1], subpix->y_step_q4,
+ 4, 4);
}
/*
@@ -232,8 +172,7 @@ void vp9_build_inter_predictors_b(BLOCKD *d, int pitch, vp9_subpix_fn_t sppf) {
* predictor of the second reference frame / motion vector.
*/
void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
- vp9_subpix_fn_t sppf) {
- int r;
+ struct subpix_fn_table *subpix) {
uint8_t *ptr_base;
uint8_t *ptr;
uint8_t *pred_ptr = d->predictor;
@@ -241,26 +180,14 @@ void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
ptr_base = *(d->base_second_pre);
mv.as_int = d->bmi.as_mv.second.as_int;
+ ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
+ (mv.as_mv.col >> 3);
- if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
- ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
- (mv.as_mv.col >> 3);
- sppf(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1, (mv.as_mv.row & 7) << 1,
- pred_ptr, pitch);
- } else {
- ptr_base += d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
- (mv.as_mv.col >> 3);
- ptr = ptr_base;
-
- for (r = 0; r < 4; r++) {
- pred_ptr[0] = (pred_ptr[0] + ptr[0] + 1) >> 1;
- pred_ptr[1] = (pred_ptr[1] + ptr[1] + 1) >> 1;
- pred_ptr[2] = (pred_ptr[2] + ptr[2] + 1) >> 1;
- pred_ptr[3] = (pred_ptr[3] + ptr[3] + 1) >> 1;
- pred_ptr += pitch;
- ptr += d->pre_stride;
- }
- }
+ subpix->predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][1](
+ ptr, d->pre_stride, pred_ptr, pitch,
+ subpix->filter_x[(mv.as_mv.col & 7) << 1], subpix->x_step_q4,
+ subpix->filter_y[(mv.as_mv.row & 7) << 1], subpix->y_step_q4,
+ 4, 4);
}
void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
@@ -274,12 +201,11 @@ void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
(mv.as_mv.col >> 3);
- if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
- xd->subpixel_predict8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
- (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
- } else {
- vp9_copy_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);
- }
+ xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0](
+ ptr, d->pre_stride, pred_ptr, pitch,
+ xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4,
+ xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4,
+ 8, 8);
}
/*
@@ -300,12 +226,11 @@ void vp9_build_2nd_inter_predictors4b(MACROBLOCKD *xd,
ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
(mv.as_mv.col >> 3);
- if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
- xd->subpixel_predict_avg8x8(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
- (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
- } else {
- vp9_avg_mem8x8(ptr, d->pre_stride, pred_ptr, pitch);
- }
+ xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][1](
+ ptr, d->pre_stride, pred_ptr, pitch,
+ xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4,
+ xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4,
+ 8, 8);
}
static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
@@ -319,12 +244,11 @@ static void build_inter_predictors2b(MACROBLOCKD *xd, BLOCKD *d, int pitch) {
ptr = ptr_base + d->pre + (mv.as_mv.row >> 3) * d->pre_stride +
(mv.as_mv.col >> 3);
- if (mv.as_mv.row & 7 || mv.as_mv.col & 7) {
- xd->subpixel_predict8x4(ptr, d->pre_stride, (mv.as_mv.col & 7) << 1,
- (mv.as_mv.row & 7) << 1, pred_ptr, pitch);
- } else {
- vp9_copy_mem8x4(ptr, d->pre_stride, pred_ptr, pitch);
- }
+ xd->subpix.predict[!!(mv.as_mv.col & 7)][!!(mv.as_mv.row & 7)][0](
+ ptr, d->pre_stride, pred_ptr, pitch,
+ xd->subpix.filter_x[(mv.as_mv.col & 7) << 1], xd->subpix.x_step_q4,
+ xd->subpix.filter_y[(mv.as_mv.row & 7) << 1], xd->subpix.y_step_q4,
+ 8, 4);
}
/*encoder only*/
@@ -411,13 +335,13 @@ void vp9_build_inter4x4_predictors_mbuv(MACROBLOCKD *xd) {
if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
build_inter_predictors2b(xd, d0, 8);
else {
- vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4);
- vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4);
+ vp9_build_inter_predictors_b(d0, 8, &xd->subpix);
+ vp9_build_inter_predictors_b(d1, 8, &xd->subpix);
}
if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
- vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4);
- vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4);
+ vp9_build_2nd_inter_predictors_b(d0, 8, &xd->subpix);
+ vp9_build_2nd_inter_predictors_b(d1, 8, &xd->subpix);
}
}
}
@@ -475,14 +399,11 @@ void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
ptr = ptr_base + (ymv.as_mv.row >> 3) * pre_stride + (ymv.as_mv.col >> 3);
- if ((ymv.as_mv.row | ymv.as_mv.col) & 7) {
- xd->subpixel_predict16x16(ptr, pre_stride,
- (ymv.as_mv.col & 7) << 1,
- (ymv.as_mv.row & 7) << 1,
- dst_y, dst_ystride);
- } else {
- vp9_copy_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
- }
+ xd->subpix.predict[!!(ymv.as_mv.col & 7)][!!(ymv.as_mv.row & 7)][0](
+ ptr, pre_stride, dst_y, dst_ystride,
+ xd->subpix.filter_x[(ymv.as_mv.col & 7) << 1], xd->subpix.x_step_q4,
+ xd->subpix.filter_y[(ymv.as_mv.row & 7) << 1], xd->subpix.y_step_q4,
+ 16, 16);
}
void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
@@ -523,15 +444,19 @@ void vp9_build_1st_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
uptr = xd->pre.u_buffer + offset;
vptr = xd->pre.v_buffer + offset;
- if (_o16x16mv.as_int & 0x000f000f) {
- xd->subpixel_predict8x8(uptr, pre_stride, _o16x16mv.as_mv.col & 15,
- _o16x16mv.as_mv.row & 15, dst_u, dst_uvstride);
- xd->subpixel_predict8x8(vptr, pre_stride, _o16x16mv.as_mv.col & 15,
- _o16x16mv.as_mv.row & 15, dst_v, dst_uvstride);
- } else {
- vp9_copy_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
- vp9_copy_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
- }
+ xd->subpix.predict[!!(_o16x16mv.as_mv.col & 15)]
+ [!!(_o16x16mv.as_mv.row & 15)][0](
+ uptr, pre_stride, dst_u, dst_uvstride,
+ xd->subpix.filter_x[_o16x16mv.as_mv.col & 15], xd->subpix.x_step_q4,
+ xd->subpix.filter_y[_o16x16mv.as_mv.row & 15], xd->subpix.y_step_q4,
+ 8, 8);
+
+ xd->subpix.predict[!!(_o16x16mv.as_mv.col & 15)]
+ [!!(_o16x16mv.as_mv.row & 15)][0](
+ vptr, pre_stride, dst_v, dst_uvstride,
+ xd->subpix.filter_x[_o16x16mv.as_mv.col & 15], xd->subpix.x_step_q4,
+ xd->subpix.filter_y[_o16x16mv.as_mv.row & 15], xd->subpix.y_step_q4,
+ 8, 8);
}
@@ -714,12 +639,11 @@ void vp9_build_2nd_inter16x16_predictors_mby(MACROBLOCKD *xd,
ptr = ptr_base + (mv_row >> 3) * pre_stride + (mv_col >> 3);
- if ((mv_row | mv_col) & 7) {
- xd->subpixel_predict_avg16x16(ptr, pre_stride, (mv_col & 7) << 1,
- (mv_row & 7) << 1, dst_y, dst_ystride);
- } else {
- vp9_avg_mem16x16(ptr, pre_stride, dst_y, dst_ystride);
- }
+ xd->subpix.predict[!!(mv_col & 7)][!!(mv_row & 7)][1](
+ ptr, pre_stride, dst_y, dst_ystride,
+ xd->subpix.filter_x[(mv_col & 7) << 1], xd->subpix.x_step_q4,
+ xd->subpix.filter_y[(mv_row & 7) << 1], xd->subpix.y_step_q4,
+ 16, 16);
}
void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
@@ -758,15 +682,17 @@ void vp9_build_2nd_inter16x16_predictors_mbuv(MACROBLOCKD *xd,
uptr = xd->second_pre.u_buffer + offset;
vptr = xd->second_pre.v_buffer + offset;
- if ((omv_row | omv_col) & 15) {
- xd->subpixel_predict_avg8x8(uptr, pre_stride, omv_col & 15,
- omv_row & 15, dst_u, dst_uvstride);
- xd->subpixel_predict_avg8x8(vptr, pre_stride, omv_col & 15,
- omv_row & 15, dst_v, dst_uvstride);
- } else {
- vp9_avg_mem8x8(uptr, pre_stride, dst_u, dst_uvstride);
- vp9_avg_mem8x8(vptr, pre_stride, dst_v, dst_uvstride);
- }
+ xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][1](
+ uptr, pre_stride, dst_u, dst_uvstride,
+ xd->subpix.filter_x[omv_col & 15], xd->subpix.x_step_q4,
+ xd->subpix.filter_y[omv_row & 15], xd->subpix.y_step_q4,
+ 8, 8);
+
+ xd->subpix.predict[!!(omv_col & 15)][!!(omv_row & 15)][1](
+ vptr, pre_stride, dst_v, dst_uvstride,
+ xd->subpix.filter_x[omv_col & 15], xd->subpix.x_step_q4,
+ xd->subpix.filter_y[omv_row & 15], xd->subpix.y_step_q4,
+ 8, 8);
}
void vp9_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
@@ -835,13 +761,13 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
build_inter_predictors2b(xd, d0, 16);
else {
- vp9_build_inter_predictors_b(d0, 16, xd->subpixel_predict4x4);
- vp9_build_inter_predictors_b(d1, 16, xd->subpixel_predict4x4);
+ vp9_build_inter_predictors_b(d0, 16, &xd->subpix);
+ vp9_build_inter_predictors_b(d1, 16, &xd->subpix);
}
if (mbmi->second_ref_frame > 0) {
- vp9_build_2nd_inter_predictors_b(d0, 16, xd->subpixel_predict_avg4x4);
- vp9_build_2nd_inter_predictors_b(d1, 16, xd->subpixel_predict_avg4x4);
+ vp9_build_2nd_inter_predictors_b(d0, 16, &xd->subpix);
+ vp9_build_2nd_inter_predictors_b(d1, 16, &xd->subpix);
}
}
}
@@ -853,13 +779,13 @@ static void build_inter4x4_predictors_mb(MACROBLOCKD *xd) {
if (d0->bmi.as_mv.first.as_int == d1->bmi.as_mv.first.as_int)
build_inter_predictors2b(xd, d0, 8);
else {
- vp9_build_inter_predictors_b(d0, 8, xd->subpixel_predict4x4);
- vp9_build_inter_predictors_b(d1, 8, xd->subpixel_predict4x4);
+ vp9_build_inter_predictors_b(d0, 8, &xd->subpix);
+ vp9_build_inter_predictors_b(d1, 8, &xd->subpix);
}
if (mbmi->second_ref_frame > 0) {
- vp9_build_2nd_inter_predictors_b(d0, 8, xd->subpixel_predict_avg4x4);
- vp9_build_2nd_inter_predictors_b(d1, 8, xd->subpixel_predict_avg4x4);
+ vp9_build_2nd_inter_predictors_b(d0, 8, &xd->subpix);
+ vp9_build_2nd_inter_predictors_b(d1, 8, &xd->subpix);
}
}
}
diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 89868b95e..903bd2e86 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h
@@ -14,6 +14,8 @@
#include "vpx/vpx_integer.h"
#include "vp9/common/vp9_onyxc_int.h"
+struct subpix_fn_table;
+
extern void vp9_build_1st_inter16x16_predictors_mby(MACROBLOCKD *xd,
uint8_t *dst_y,
int dst_ystride,
@@ -64,10 +66,10 @@ extern void vp9_build_inter64x64_predictors_sb(MACROBLOCKD *x,
extern void vp9_build_inter_predictors_mb(MACROBLOCKD *xd);
extern void vp9_build_inter_predictors_b(BLOCKD *d, int pitch,
- vp9_subpix_fn_t sppf);
+ struct subpix_fn_table *sppf);
extern void vp9_build_2nd_inter_predictors_b(BLOCKD *d, int pitch,
- vp9_subpix_fn_t sppf);
+ struct subpix_fn_table *sppf);
extern void vp9_build_inter_predictors4b(MACROBLOCKD *xd, BLOCKD *d,
int pitch);
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 5e4d485b5..9259f7d99 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -23,21 +23,6 @@ EOF
}
forward_decls vp9_common_forward_decls
-prototype void vp9_filter_block2d_4x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_8x4_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_8x8_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
-prototype void vp9_filter_block2d_16x16_8 "const uint8_t *src_ptr, const unsigned int src_stride, const int16_t *HFilter_aligned16, const int16_t *VFilter_aligned16, uint8_t *dst_ptr, unsigned int dst_stride"
-
-# At the very least, MSVC 2008 has compiler bug exhibited by this code; code
-# compiles warning free but a dissassembly of generated code show bugs. To be
-# on the safe side, only enabled when compiled with 'gcc'.
-if [ "$CONFIG_GCC" = "yes" ]; then
- specialize vp9_filter_block2d_4x4_8 sse4_1 sse2
-fi
- specialize vp9_filter_block2d_8x4_8 ssse3 #sse4_1 sse2
- specialize vp9_filter_block2d_8x8_8 ssse3 #sse4_1 sse2
- specialize vp9_filter_block2d_16x16_8 ssse3 #sse4_1 sse2
-
#
# Dequant
#
@@ -86,27 +71,17 @@ specialize vp9_dequant_idct_add_uv_block_16x16
#
# RECON
#
-prototype void vp9_copy_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
+prototype void vp9_copy_mem16x16 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
specialize vp9_copy_mem16x16 mmx sse2 dspr2
vp9_copy_mem16x16_dspr2=vp9_copy_mem16x16_dspr2
-prototype void vp9_copy_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
+prototype void vp9_copy_mem8x8 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
specialize vp9_copy_mem8x8 mmx dspr2
vp9_copy_mem8x8_dspr2=vp9_copy_mem8x8_dspr2
-prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
+prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
specialize vp9_copy_mem8x4 mmx
-prototype void vp9_avg_mem16x16 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_avg_mem16x16
-
-prototype void vp9_avg_mem8x8 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_avg_mem8x8
-
-prototype void vp9_copy_mem8x4 "uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
-specialize vp9_copy_mem8x4 mmx dspr2
-vp9_copy_mem8x4_dspr2=vp9_copy_mem8x4_dspr2
-
prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, uint8_t *dst_ptr, int stride"
specialize vp9_recon_b
@@ -269,110 +244,23 @@ specialize vp9_sub_pixel_variance16x2 sse2
#
# Sub Pixel Filters
#
-prototype void vp9_eighttap_predict16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict16x16
-
-prototype void vp9_eighttap_predict8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict8x8
-
-prototype void vp9_eighttap_predict_avg16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict_avg16x16
-
-prototype void vp9_eighttap_predict_avg8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict_avg8x8
-
-prototype void vp9_eighttap_predict_avg4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict_avg4x4
-
-prototype void vp9_eighttap_predict8x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict8x4
-
-prototype void vp9_eighttap_predict4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict4x4
-
-prototype void vp9_eighttap_predict16x16_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict16x16_sharp
-
-prototype void vp9_eighttap_predict8x8_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict8x8_sharp
-
-prototype void vp9_eighttap_predict_avg16x16_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict_avg16x16_sharp
-
-prototype void vp9_eighttap_predict_avg8x8_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict_avg8x8_sharp
-
-prototype void vp9_eighttap_predict_avg4x4_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict_avg4x4_sharp
-
-prototype void vp9_eighttap_predict8x4_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict8x4_sharp
-
-prototype void vp9_eighttap_predict4x4_sharp "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict4x4_sharp
-
-prototype void vp9_eighttap_predict16x16_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict16x16_smooth
-
-prototype void vp9_eighttap_predict8x8_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict8x8_smooth
-
-prototype void vp9_eighttap_predict_avg16x16_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict_avg16x16_smooth
-
-prototype void vp9_eighttap_predict_avg8x8_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict_avg8x8_smooth
-
-prototype void vp9_eighttap_predict_avg4x4_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict_avg4x4_smooth
-
-prototype void vp9_eighttap_predict8x4_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict8x4_smooth
-
-prototype void vp9_eighttap_predict4x4_smooth "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_eighttap_predict4x4_smooth
-
-prototype void vp9_sixtap_predict16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_sixtap_predict16x16
-
-prototype void vp9_sixtap_predict8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_sixtap_predict8x8
-
-prototype void vp9_sixtap_predict_avg16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_sixtap_predict_avg16x16
-
-prototype void vp9_sixtap_predict_avg8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_sixtap_predict_avg8x8
-
-prototype void vp9_sixtap_predict8x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_sixtap_predict8x4
-
-prototype void vp9_sixtap_predict4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_sixtap_predict4x4
-
-prototype void vp9_sixtap_predict_avg4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_sixtap_predict_avg4x4
-
-prototype void vp9_bilinear_predict16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_bilinear_predict16x16 sse2
-
-prototype void vp9_bilinear_predict8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_bilinear_predict8x8 sse2
+prototype void vp9_convolve8 "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve8 ssse3
-prototype void vp9_bilinear_predict_avg16x16 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_bilinear_predict_avg16x16
+prototype void vp9_convolve8_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve8_horiz ssse3
-prototype void vp9_bilinear_predict_avg8x8 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_bilinear_predict_avg8x8
+prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve8_vert ssse3
-prototype void vp9_bilinear_predict8x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_bilinear_predict8x4
+prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve8_avg
-prototype void vp9_bilinear_predict4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_bilinear_predict4x4
+prototype void vp9_convolve8_avg_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve8_avg_horiz
-prototype void vp9_bilinear_predict_avg4x4 "uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, uint8_t *dst_ptr, int dst_pitch"
-specialize vp9_bilinear_predict_avg4x4
+prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h"
+specialize vp9_convolve8_avg_vert
#
# dct
diff --git a/vp9/common/vp9_subpixel.h b/vp9/common/vp9_subpixel.h
deleted file mode 100644
index dc4eadfb1..000000000
--- a/vp9/common/vp9_subpixel.h
+++ /dev/null
@@ -1,20 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_COMMON_VP9_SUBPIXEL_H_
-#define VP9_COMMON_VP9_SUBPIXEL_H_
-
-#define prototype_subpixel_predict(sym) \
- void sym(uint8_t *src, int src_pitch, int xofst, int yofst, \
- uint8_t *dst, int dst_pitch)
-
-typedef prototype_subpixel_predict((*vp9_subpix_fn_t));
-
-#endif // VP9_COMMON_VP9_SUBPIXEL_H_
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index f09e2d78b..3e2346f29 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -8,91 +8,11 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
#include "vpx_ports/mem.h"
-#include "vp9/common/vp9_subpixel.h"
-
-extern const short vp9_six_tap_mmx[8][6 * 8];
-
-extern void vp9_filter_block1d_h6_mmx(unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr,
- unsigned char *output_ptr,
- int output_pitch,
- unsigned int pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d8_h6_sse2(unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d16_h6_sse2(unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr,
- unsigned char *output_ptr,
- int dst_ptich,
- unsigned int pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr,
- unsigned char *output_ptr,
- int dst_ptich,
- unsigned int pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int output_height,
- unsigned int output_width);
-
-extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- int dst_pitch,
- unsigned int output_height,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
- unsigned int src_pixels_per_lin,
- unsigned char *output_ptr,
- int dst_pitch,
- unsigned int output_height,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- int dst_pitch,
- unsigned int output_height,
- const short *vp9_filter);
-
///////////////////////////////////////////////////////////////////////////
// the mmx function that does the bilinear filtering and var calculation //
// int one pass //
@@ -116,389 +36,7 @@ DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
{ 8, 8, 8, 8, 120, 120, 120, 120 }
};
-#if HAVE_MMX
-void vp9_sixtap_predict4x4_mmx(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict4x4_mmx\n");
-#endif
- /* Temp data bufffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16);
- const short *hfilter, *vfilter;
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2,
- src_pixels_per_line, 1, 9, 8, hfilter);
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch,
- 8, 4, 4, 4, vfilter);
-}
-
-void vp9_sixtap_predict16x16_mmx(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict16x16_mmx\n");
-#endif
- /* Temp data bufffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
- const short *hfilter, *vfilter;
-
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
- fdata2, src_pixels_per_line, 1, 21, 32,
- hfilter);
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
- fdata2 + 4, src_pixels_per_line, 1, 21, 32,
- hfilter);
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8,
- fdata2 + 8, src_pixels_per_line, 1, 21, 32,
- hfilter);
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12,
- fdata2 + 12, src_pixels_per_line, 1, 21, 32,
- hfilter);
-
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr, dst_pitch,
- 32, 16, 16, 16, vfilter);
- vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4, dst_pitch,
- 32, 16, 16, 16, vfilter);
- vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8, dst_pitch,
- 32, 16, 16, 16, vfilter);
- vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch,
- 32, 16, 16, 16, vfilter);
-}
-
-void vp9_sixtap_predict8x8_mmx(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x8_mmx\n");
-#endif
- /* Temp data bufffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
- const short *hfilter, *vfilter;
-
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
- fdata2, src_pixels_per_line, 1, 13, 16,
- hfilter);
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
- fdata2 + 4, src_pixels_per_line, 1, 13, 16,
- hfilter);
-
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch,
- 16, 8, 8, 8, vfilter);
- vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
- 16, 8, 8, 8, vfilter);
-}
-
-void vp9_sixtap_predict8x4_mmx(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x4_mmx\n");
-#endif
- /* Temp data bufffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
- const short *hfilter, *vfilter;
-
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
- fdata2, src_pixels_per_line, 1, 9, 16, hfilter);
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
- fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter);
-
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch,
- 16, 8, 4, 8, vfilter);
- vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
- 16, 8, 4, 8, vfilter);
-}
-#endif
-
-#if HAVE_SSE2
-void vp9_sixtap_predict16x16_sse2(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- /* Temp data bufffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
- const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict16x16_sse2\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
- src_pixels_per_line, 1, 21, 32, hfilter);
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
- 32, 16, 16, dst_pitch, vfilter);
- } else {
- /* First-pass only */
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 16, hfilter);
- }
- } else {
- /* Second-pass only */
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
- src_pixels_per_line, 21, 32);
- vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
- 32, 16, 16, dst_pitch, vfilter);
- }
-}
-
-void vp9_sixtap_predict8x8_sse2(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- /* Temp data bufffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
- const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x8_sse2\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
- src_pixels_per_line, 1, 13, 16, hfilter);
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
- 16, 8, 8, dst_pitch, vfilter);
- } else {
- /* First-pass only */
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 8, hfilter);
- }
- } else {
- /* Second-pass only */
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 8, vfilter);
- }
-}
-
-void vp9_sixtap_predict8x4_sse2(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- /* Temp data bufffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
- const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x4_sse2\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
- src_pixels_per_line, 1, 9, 16, hfilter);
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
- 16, 8, 4, dst_pitch, vfilter);
- } else {
- /* First-pass only */
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 4, hfilter);
- }
- } else {
- /* Second-pass only */
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 4, vfilter);
- }
-}
-#endif
-
#if HAVE_SSSE3
-extern void vp9_filter_block1d8_h6_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-extern void vp9_filter_block1d16_h6_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-extern void vp9_filter_block1d4_h6_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-void vp9_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24);
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict16x16_ssse3\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- fdata2, 16, 21, xoffset);
- vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch,
- 16, yoffset);
- } else {
- /* First-pass only */
- vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 16, xoffset);
- }
- } else {
- /* Second-pass only */
- vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 16, yoffset);
- }
-}
-
-void vp9_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x8_ssse3\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line, fdata2, 8, 13, xoffset);
- vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset);
- } else {
- vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 8, xoffset);
- }
- } else {
- /* Second-pass only */
- vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 8, yoffset);
- }
-}
-
-void vp9_sixtap_predict8x4_ssse3(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x4_ssse3\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line, fdata2, 8, 9, xoffset);
- vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset);
- } else {
- /* First-pass only */
- vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 4, xoffset);
- }
- } else {
- /* Second-pass only */
- vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 4, yoffset);
- }
-}
-
-void vp9_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9);
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict4x4_ssse3\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line, fdata2, 4, 9, xoffset);
- vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset);
- } else {
- vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 4, xoffset);
- }
- } else {
- vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 4, yoffset);
- }
-}
-
void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
const unsigned int src_pitch,
unsigned char *output_ptr,
@@ -513,30 +51,6 @@ void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,
unsigned int output_height,
const short *filter);
-void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_stride,
- const short *hfilter_aligned16,
- const short *vfilter_aligned16,
- unsigned char *dst_ptr,
- unsigned int dst_stride) {
- if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
- vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
- fdata2, 16, 23, hfilter_aligned16);
- vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16,
- vfilter_aligned16);
- } else {
- if (hfilter_aligned16[3] != 128) {
- vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride,
- 16, hfilter_aligned16);
- } else {
- vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
- dst_ptr, dst_stride, 16, vfilter_aligned16);
- }
- }
-}
-
void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
const unsigned int src_pitch,
unsigned char *output_ptr,
@@ -551,51 +65,100 @@ void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
unsigned int output_height,
const short *filter);
-void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_stride,
- const short *hfilter_aligned16,
- const short *vfilter_aligned16,
- unsigned char *dst_ptr,
- unsigned int dst_stride) {
- if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
- vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
- fdata2, 16, 15, hfilter_aligned16);
- vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8,
- vfilter_aligned16);
- } else {
- if (hfilter_aligned16[3] != 128) {
- vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8,
- hfilter_aligned16);
- } else {
- vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
- dst_ptr, dst_stride, 8, vfilter_aligned16);
+void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (x_step_q4 == 16 && filter_x[3] != 128) {
+ while (w >= 16) {
+ vp9_filter_block1d16_h8_ssse3(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 16;
+ dst += 16;
+ w -= 16;
}
+ while (w >= 8) {
+ vp9_filter_block1d8_h8_ssse3(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ }
+ }
+ if (w) {
+ vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
}
}
-void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_stride,
- const short *hfilter_aligned16,
- const short *vfilter_aligned16,
- unsigned char *dst_ptr,
- unsigned int dst_stride) {
- if (hfilter_aligned16[3] !=128 && vfilter_aligned16[3] != 128) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
+void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (y_step_q4 == 16 && filter_y[3] != 128) {
+ while (w >= 16) {
+ vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 16;
+ dst += 16;
+ w -= 16;
+ }
+ while (w >= 8) {
+ vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ }
+ }
+ if (w) {
+ vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ }
+}
- vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
- fdata2, 16, 11, hfilter_aligned16);
- vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4,
- vfilter_aligned16);
- } else {
- if (hfilter_aligned16[3] != 128) {
- vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4,
- hfilter_aligned16);
- } else {
- vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
- dst_ptr, dst_stride, 4, vfilter_aligned16);
+void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23);
+
+ // check w/h due to fixed size fdata2 array
+ assert(w <= 16);
+ assert(h <= 16);
+
+ if (x_step_q4 == 16 && y_step_q4 == 16 &&
+ filter_x[3] != 128 && filter_y[3] != 128) {
+ if (w == 16) {
+ vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,
+ fdata2, 16,
+ h + 7, filter_x);
+ vp9_filter_block1d16_v8_ssse3(fdata2, 16,
+ dst, dst_stride,
+ h, filter_y);
+ return;
+ }
+ if (w == 8) {
+ vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,
+ fdata2, 16,
+ h + 7, filter_x);
+ vp9_filter_block1d8_v8_ssse3(fdata2, 16,
+ dst, dst_stride,
+ h, filter_y);
+ return;
}
}
+ vp9_convolve8_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
}
#endif
diff --git a/vp9/common/x86/vp9_filter_sse2.c b/vp9/common/x86/vp9_filter_sse2.c
deleted file mode 100644
index 8e02ac197..000000000
--- a/vp9/common/x86/vp9_filter_sse2.c
+++ /dev/null
@@ -1,290 +0,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h> // for alignment checks
-#include <emmintrin.h> // SSE2
-#include "vp9/common/vp9_filter.h"
-#include "vpx_ports/emmintrin_compat.h"
-#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
-#include "vp9_rtcd.h"
-
-// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
-// just a quick partial snapshot so that other can already use some
-// speedup.
-// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
-// filtering.
-// TODO(cd): Add some comments, better variable naming.
-// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum
-// of positive above 128), or have higher precision filter
-// coefficients.
-
-DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
- VP9_FILTER_WEIGHT >> 1,
- VP9_FILTER_WEIGHT >> 1,
- VP9_FILTER_WEIGHT >> 1,
- VP9_FILTER_WEIGHT >> 1,
-};
-
-// Creating a macro to do more than four pixels at once to hide instruction
-// latency is actually slower :-(
-#define DO_FOUR_PIXELS(result, src_ptr, offset) \
- { \
- /* Do shifted load to achieve require shuffles through unpacking */ \
- const __m128i src0 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \
- const __m128i src1 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \
- const __m128i src2 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \
- const __m128i src3 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \
- const __m128i src01 = _mm_unpacklo_epi8(src0, src1); \
- const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero); \
- const __m128i src23 = _mm_unpacklo_epi8(src2, src3); \
- const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero); \
- /* Shit by 4 bytes through suffle to get additional shifted loads */ \
- const __m128i src4 = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1)); \
- const __m128i src5 = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1)); \
- const __m128i src6 = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1)); \
- const __m128i src7 = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1)); \
- const __m128i src45 = _mm_unpacklo_epi8(src4, src5); \
- const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero); \
- const __m128i src67 = _mm_unpacklo_epi8(src6, src7); \
- const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero); \
- /* multiply accumulate them */ \
- const __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \
- const __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \
- const __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \
- const __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \
- const __m128i mad0123 = _mm_add_epi32(mad01, mad23); \
- const __m128i mad4567 = _mm_add_epi32(mad45, mad67); \
- __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \
- mad_all = _mm_add_epi32(mad_all, rounding); \
- result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \
- }
-
-void vp9_filter_block2d_4x4_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- __m128i intermediateA, intermediateB, intermediateC;
-
- const int kInterp_Extend = 4;
-
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
-
- // check alignment
- assert(0 == ((long)HFilter_aligned16)%16);
- assert(0 == ((long)VFilter_aligned16)%16);
-
- {
- __m128i transpose3_0;
- __m128i transpose3_1;
- __m128i transpose3_2;
- __m128i transpose3_3;
-
- // Horizontal pass (src -> intermediate).
- {
- const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
- // get first two columns filter coefficients
- __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
- __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
- __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
- __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
- src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
-
- {
- __m128i mad_all0;
- __m128i mad_all1;
- __m128i mad_all2;
- __m128i mad_all3;
- DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
- DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
- DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
- DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
- mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
- mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
- intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
- // --
- src_ptr += src_stride*4;
- // --
- DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
- DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
- DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
- DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
- mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
- mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
- intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
- // --
- src_ptr += src_stride*4;
- // --
- DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
- DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
- DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
- mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
- mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
- intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
- }
- }
-
- // Transpose result (intermediate -> transpose3_x)
- {
- // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
- // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
- // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
- const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB);
- const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB);
- const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC);
- const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC);
- // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53
- // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73
- // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx
- // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx
- const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
- const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
- const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3);
- const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3);
- // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63
- // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73
- // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx
- // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx
- const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1);
- const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1);
- const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3);
- const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3);
- // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
- // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx
- // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx
- transpose3_0 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
- _mm_castsi128_ps(transpose2_2),
- _MM_SHUFFLE(1, 0, 1, 0)));
- transpose3_1 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
- _mm_castsi128_ps(transpose2_2),
- _MM_SHUFFLE(3, 2, 3, 2)));
- transpose3_2 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
- _mm_castsi128_ps(transpose2_3),
- _MM_SHUFFLE(1, 0, 1, 0)));
- transpose3_3 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
- _mm_castsi128_ps(transpose2_3),
- _MM_SHUFFLE(3, 2, 3, 2)));
- // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
- // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
- // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
- // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
- }
-
- // Vertical pass (transpose3_x -> dst).
- {
- const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
- // get first two columns filter coefficients
- __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
- __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
- __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
- __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
- __m128i col0, col1, col2, col3;
- DECLARE_ALIGNED(16, unsigned char, temp[32]);
- {
- _mm_store_si128((__m128i *)temp, transpose3_0);
- DO_FOUR_PIXELS(col0, temp, 0);
- }
- {
- _mm_store_si128((__m128i *)temp, transpose3_1);
- DO_FOUR_PIXELS(col1, temp, 0);
- }
- {
- _mm_store_si128((__m128i *)temp, transpose3_2);
- DO_FOUR_PIXELS(col2, temp, 0);
- }
- {
- _mm_store_si128((__m128i *)temp, transpose3_3);
- DO_FOUR_PIXELS(col3, temp, 0);
- }
- // transpose
- {
- __m128i T0 = _mm_unpacklo_epi32(col0, col1);
- __m128i T1 = _mm_unpacklo_epi32(col2, col3);
- __m128i T2 = _mm_unpackhi_epi32(col0, col1);
- __m128i T3 = _mm_unpackhi_epi32(col2, col3);
- col0 = _mm_unpacklo_epi64(T0, T1);
- col1 = _mm_unpackhi_epi64(T0, T1);
- col2 = _mm_unpacklo_epi64(T2, T3);
- col3 = _mm_unpackhi_epi64(T2, T3);
- }
- // saturate to 8 bit
- {
- col0 = _mm_packs_epi32(col0, col0);
- col0 = _mm_packus_epi16(col0, col0);
- col1 = _mm_packs_epi32(col1, col1);
- col1 = _mm_packus_epi16(col1, col1);
- col2 = _mm_packs_epi32 (col2, col2);
- col2 = _mm_packus_epi16(col2, col2);
- col3 = _mm_packs_epi32 (col3, col3);
- col3 = _mm_packus_epi16(col3, col3);
- }
- // store
- {
- *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0);
- *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1);
- *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2);
- *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3);
- }
- }
- }
-}
-
-void vp9_filter_block2d_8x4_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- int j;
- for (j=0; j<8; j+=4) {
- vp9_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- dst_ptr + j, dst_stride);
- }
-}
-
-void vp9_filter_block2d_8x8_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- int i, j;
- for (i=0; i<8; i+=4) {
- for (j=0; j<8; j+=4) {
- vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- dst_ptr + j + i*dst_stride, dst_stride);
- }
- }
-}
-
-void vp9_filter_block2d_16x16_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- int i, j;
- for (i=0; i<16; i+=4) {
- for (j=0; j<16; j+=4) {
- vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- dst_ptr + j + i*dst_stride, dst_stride);
- }
- }
-}
diff --git a/vp9/common/x86/vp9_filter_sse4.c b/vp9/common/x86/vp9_filter_sse4.c
deleted file mode 100644
index 52c35b296..000000000
--- a/vp9/common/x86/vp9_filter_sse4.c
+++ /dev/null
@@ -1,362 +0,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h> // for alignment checks
-#include <smmintrin.h> // SSE4.1
-#include "vp9/common/vp9_filter.h"
-#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
-#include "vp9_rtcd.h"
-
-// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
-// just a quick partial snapshot so that other can already use some
-// speedup.
-// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
-// filtering.
-// TODO(cd): Reduce source size by using macros instead of current code
-// duplication.
-// TODO(cd): Add some comments, better variable naming.
-// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum
-// of positive above 128), or have higher precision filter
-// coefficients.
-
-DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = {
- 0x00, 0x01,
- 0x01, 0x02,
- 0x02, 0x03,
- 0x03, 0x04,
- 0x02, 0x03,
- 0x03, 0x04,
- 0x04, 0x05,
- 0x05, 0x06,
-};
-DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = {
- 0x04, 0x05,
- 0x05, 0x06,
- 0x06, 0x07,
- 0x07, 0x08,
- 0x06, 0x07,
- 0x07, 0x08,
- 0x08, 0x09,
- 0x09, 0x0A,
-};
-DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
- VP9_FILTER_WEIGHT >> 1,
- VP9_FILTER_WEIGHT >> 1,
- VP9_FILTER_WEIGHT >> 1,
- VP9_FILTER_WEIGHT >> 1,
-};
-DECLARE_ALIGNED(16, static const unsigned char, transpose_c[16]) = {
- 0, 4, 8, 12,
- 1, 5, 9, 13,
- 2, 6, 10, 14,
- 3, 7, 11, 15
-};
-
-// Creating a macro to do more than four pixels at once to hide instruction
-// latency is actually slower :-(
-#define DO_FOUR_PIXELS(result, offset) \
- { \
- /*load pixels*/ \
- __m128i src = _mm_loadu_si128((const __m128i *)(src_ptr + offset)); \
- /* extract the ones used for first column */ \
- __m128i src0123 = _mm_shuffle_epi8(src, mask0123); \
- __m128i src4567 = _mm_shuffle_epi8(src, mask4567); \
- __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); \
- __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); \
- __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); \
- __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); \
- /* multiply accumulate them */ \
- __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \
- __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \
- __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \
- __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \
- __m128i mad0123 = _mm_add_epi32(mad01, mad23); \
- __m128i mad4567 = _mm_add_epi32(mad45, mad67); \
- __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \
- mad_all = _mm_add_epi32(mad_all, rounding); \
- result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \
- }
-
-void vp9_filter_block2d_4x4_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- __m128i intermediateA, intermediateB, intermediateC;
-
- const int kInterp_Extend = 4;
-
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c);
- const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c);
- const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
- const __m128i transpose = _mm_load_si128((const __m128i *)transpose_c);
-
- // check alignment
- assert(0 == ((long)HFilter_aligned16)%16);
- assert(0 == ((long)VFilter_aligned16)%16);
-
- {
- __m128i transpose3_0;
- __m128i transpose3_1;
- __m128i transpose3_2;
- __m128i transpose3_3;
-
- // Horizontal pass (src -> intermediate).
- {
- const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
- // get first two columns filter coefficients
- __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
- __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
- __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
- __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
- src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
-
- {
- __m128i mad_all0;
- __m128i mad_all1;
- __m128i mad_all2;
- __m128i mad_all3;
- DO_FOUR_PIXELS(mad_all0, 0*src_stride)
- DO_FOUR_PIXELS(mad_all1, 1*src_stride)
- DO_FOUR_PIXELS(mad_all2, 2*src_stride)
- DO_FOUR_PIXELS(mad_all3, 3*src_stride)
- mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
- mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
- intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
- // --
- src_ptr += src_stride*4;
- // --
- DO_FOUR_PIXELS(mad_all0, 0*src_stride)
- DO_FOUR_PIXELS(mad_all1, 1*src_stride)
- DO_FOUR_PIXELS(mad_all2, 2*src_stride)
- DO_FOUR_PIXELS(mad_all3, 3*src_stride)
- mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
- mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
- intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
- // --
- src_ptr += src_stride*4;
- // --
- DO_FOUR_PIXELS(mad_all0, 0*src_stride)
- DO_FOUR_PIXELS(mad_all1, 1*src_stride)
- DO_FOUR_PIXELS(mad_all2, 2*src_stride)
- mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
- mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
- intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
- }
- }
-
- // Transpose result (intermediate -> transpose3_x)
- {
- // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
- // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
- // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
- const __m128i transpose1_0 = _mm_shuffle_epi8(intermediateA, transpose);
- const __m128i transpose1_1 = _mm_shuffle_epi8(intermediateB, transpose);
- const __m128i transpose1_2 = _mm_shuffle_epi8(intermediateC, transpose);
- // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
- // 80 90 A0 xx 81 91 A1 xx 82 92 A2 xx 83 93 A3 xx
- const __m128i transpose2_0 = _mm_unpacklo_epi32(transpose1_0, transpose1_1);
- const __m128i transpose2_1 = _mm_unpackhi_epi32(transpose1_0, transpose1_1);
- // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
- transpose3_0 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
- _mm_castsi128_ps(transpose1_2),
- _MM_SHUFFLE(0, 0, 1, 0)));
- transpose3_1 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
- _mm_castsi128_ps(transpose1_2),
- _MM_SHUFFLE(1, 1, 3, 2)));
- transpose3_2 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
- _mm_castsi128_ps(transpose1_2),
- _MM_SHUFFLE(2, 2, 1, 0)));
- transpose3_3 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
- _mm_castsi128_ps(transpose1_2),
- _MM_SHUFFLE(3, 3, 3, 2)));
- // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
- // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
- // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
- // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
- }
-
- // Vertical pass (transpose3_x -> dst).
- {
- const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
- // get first two columns filter coefficients
- __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
- __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
- __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
- __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
- __m128i col0, col1, col2, col3;
- {
- //load pixels
- __m128i src = transpose3_0;
- // extract the ones used for first column
- __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
- __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
- __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
- __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
- __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
- __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
- // multiply accumulate them
- __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
- __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
- __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
- __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
- __m128i mad0123 = _mm_add_epi32(mad01, mad23);
- __m128i mad4567 = _mm_add_epi32(mad45, mad67);
- __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
- mad_all = _mm_add_epi32(mad_all, rounding);
- mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
- mad_all = _mm_packs_epi32(mad_all, mad_all);
- col0 = _mm_packus_epi16(mad_all, mad_all);
- }
- {
- //load pixels
- __m128i src = transpose3_1;
- // extract the ones used for first column
- __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
- __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
- __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
- __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
- __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
- __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
- // multiply accumulate them
- __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
- __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
- __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
- __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
- __m128i mad0123 = _mm_add_epi32(mad01, mad23);
- __m128i mad4567 = _mm_add_epi32(mad45, mad67);
- __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
- mad_all = _mm_add_epi32(mad_all, rounding);
- mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
- mad_all = _mm_packs_epi32(mad_all, mad_all);
- col1 = _mm_packus_epi16(mad_all, mad_all);
- }
- {
- //load pixels
- __m128i src = transpose3_2;
- // extract the ones used for first column
- __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
- __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
- __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
- __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
- __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
- __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
- // multiply accumulate them
- __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
- __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
- __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
- __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
- __m128i mad0123 = _mm_add_epi32(mad01, mad23);
- __m128i mad4567 = _mm_add_epi32(mad45, mad67);
- __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
- mad_all = _mm_add_epi32(mad_all, rounding);
- mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
- mad_all = _mm_packs_epi32(mad_all, mad_all);
- col2 = _mm_packus_epi16(mad_all, mad_all);
- }
- {
- //load pixels
- __m128i src = transpose3_3;
- // extract the ones used for first column
- __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
- __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
- __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
- __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
- __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
- __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
- // multiply accumulate them
- __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
- __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
- __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
- __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
- __m128i mad0123 = _mm_add_epi32(mad01, mad23);
- __m128i mad4567 = _mm_add_epi32(mad45, mad67);
- __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
- mad_all = _mm_add_epi32(mad_all, rounding);
- mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
- mad_all = _mm_packs_epi32(mad_all, mad_all);
- col3 = _mm_packus_epi16(mad_all, mad_all);
- }
- {
- __m128i col01 = _mm_unpacklo_epi8(col0, col1);
- __m128i col23 = _mm_unpacklo_epi8(col2, col3);
- __m128i col0123 = _mm_unpacklo_epi16(col01, col23);
- //TODO(cd): look into Ronald's comment:
- // Future suggestion: I believe here, too, you can merge the
- // packs_epi32() and pacus_epi16() for the 4 cols above, so that
- // you get the data in a single register, and then use pshufb
- // (shuffle_epi8()) instead of the unpacks here. Should be
- // 2+3+2 instructions faster.
- *((unsigned int *)&dst_ptr[dst_stride * 0]) =
- _mm_extract_epi32(col0123, 0);
- *((unsigned int *)&dst_ptr[dst_stride * 1]) =
- _mm_extract_epi32(col0123, 1);
- *((unsigned int *)&dst_ptr[dst_stride * 2]) =
- _mm_extract_epi32(col0123, 2);
- *((unsigned int *)&dst_ptr[dst_stride * 3]) =
- _mm_extract_epi32(col0123, 3);
- }
- }
- }
-}
-
-void vp9_filter_block2d_8x4_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- int j;
- for (j=0; j<8; j+=4) {
- vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- dst_ptr + j, dst_stride);
- }
-}
-
-void vp9_filter_block2d_8x8_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- int i, j;
- for (i=0; i<8; i+=4) {
- for (j=0; j<8; j+=4) {
- vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- dst_ptr + j + i*dst_stride, dst_stride);
- }
- }
-}
-
-void vp9_filter_block2d_16x16_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- int i, j;
- for (i=0; i<16; i+=4) {
- for (j=0; j<16; j+=4) {
- vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- dst_ptr + j + i*dst_stride, dst_stride);
- }
- }
-}
diff --git a/vp9/common/x86/vp9_subpixel_mmx.asm b/vp9/common/x86/vp9_subpixel_mmx.asm
deleted file mode 100644
index dee29b8fb..000000000
--- a/vp9/common/x86/vp9_subpixel_mmx.asm
+++ /dev/null
@@ -1,268 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define vp9_filter_weight 128
-%define VP9_FILTER_SHIFT 7
-
-
-;void vp9_filter_block1d_h6_mmx
-;(
-; unsigned char *src_ptr,
-; unsigned short *output_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; short * vp9_filter
-;)
-global sym(vp9_filter_block1d_h6_mmx) PRIVATE
-sym(vp9_filter_block1d_h6_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdx, arg(6) ;vp9_filter
-
- movq mm1, [rdx + 16] ; do both the negative taps first!!!
- movq mm2, [rdx + 32] ;
- movq mm6, [rdx + 48] ;
- movq mm7, [rdx + 64] ;
-
- mov rdi, arg(1) ;output_ptr
- mov rsi, arg(0) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(5) ;output_width ; destination pitch?
- pxor mm0, mm0 ; mm0 = 00000000
-
-.nextrow:
- movq mm3, [rsi-2] ; mm3 = p-2..p5
- movq mm4, mm3 ; mm4 = p-2..p5
- psrlq mm3, 8 ; mm3 = p-1..p5
- punpcklbw mm3, mm0 ; mm3 = p-1..p2
- pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
-
- movq mm5, mm4 ; mm5 = p-2..p5
- punpckhbw mm4, mm0 ; mm5 = p2..p5
- pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers
- paddsw mm3, mm4 ; mm3 += mm5
-
- movq mm4, mm5 ; mm4 = p-2..p5;
- psrlq mm5, 16 ; mm5 = p0..p5;
- punpcklbw mm5, mm0 ; mm5 = p0..p3
- pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers
- paddsw mm3, mm5 ; mm3 += mm5
-
- movq mm5, mm4 ; mm5 = p-2..p5
- psrlq mm4, 24 ; mm4 = p1..p5
- punpcklbw mm4, mm0 ; mm4 = p1..p4
- pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers
- paddsw mm3, mm4 ; mm3 += mm5
-
- ; do outer positive taps
- movd mm4, [rsi+3]
- punpcklbw mm4, mm0 ; mm5 = p3..p6
- pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers
- paddsw mm3, mm4 ; mm3 += mm5
-
- punpcklbw mm5, mm0 ; mm5 = p-2..p1
- pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers
- paddsw mm3, mm5 ; mm3 += mm5
-
- paddsw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack and unpack to saturate
- punpcklbw mm3, mm0 ;
-
- movq [rdi], mm3 ; store the results in the destination
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
- add rdi, rax;
-%else
- movsxd r8, dword ptr arg(2) ;src_pixels_per_line
- add rdi, rax;
-
- add rsi, r8 ; next line
-%endif
-
- dec rcx ; decrement count
- jnz .nextrow ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1dc_v6_mmx
-;(
-; short *src_ptr,
-; unsigned char *output_ptr,
-; int output_pitch,
-; unsigned int pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; short * vp9_filter
-;)
-global sym(vp9_filter_block1dc_v6_mmx) PRIVATE
-sym(vp9_filter_block1dc_v6_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movq mm5, [GLOBAL(rd)]
- push rbx
- mov rbx, arg(7) ;vp9_filter
- movq mm1, [rbx + 16] ; do both the negative taps first!!!
- movq mm2, [rbx + 32] ;
- movq mm6, [rbx + 48] ;
- movq mm7, [rbx + 64] ;
-
- movsxd rdx, dword ptr arg(3) ;pixels_per_line
- mov rdi, arg(1) ;output_ptr
- mov rsi, arg(0) ;src_ptr
- sub rsi, rdx
- sub rsi, rdx
- movsxd rcx, DWORD PTR arg(5) ;output_height
- movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch?
- pxor mm0, mm0 ; mm0 = 00000000
-
-
-.nextrow_cv:
- movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1
- pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
-
-
- movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2
- pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
-
- movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0
- pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
-
- movq mm4, [rsi] ; mm4 = p0..p3 = row -2
- pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
-
-
- add rsi, rdx ; move source forward 1 line to avoid 3 * pitch
- movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1
- pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
-
- movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3
- pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
-
-
- paddsw mm3, mm5 ; mm3 += round value
- psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack and saturate
-
- movd [rdi],mm3 ; store the results in the destination
- ; the subsequent iterations repeat 3 out of 4 of these reads. Since the
- ; recon block should be in cache this shouldn't cost much. Its obviously
- ; avoidable!!!.
- lea rdi, [rdi+rax] ;
- dec rcx ; decrement count
- jnz .nextrow_cv ; next row
-
- pop rbx
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-rd:
- times 4 dw 0x40
-
-align 16
-global HIDDEN_DATA(sym(vp9_six_tap_mmx))
-sym(vp9_six_tap_mmx):
- times 8 dw 0
- times 8 dw 0
- times 8 dw 128
- times 8 dw 0
- times 8 dw 0
- times 8 dw 0
-
- times 8 dw 0
- times 8 dw -6
- times 8 dw 123
- times 8 dw 12
- times 8 dw -1
- times 8 dw 0
-
- times 8 dw 2
- times 8 dw -11
- times 8 dw 108
- times 8 dw 36
- times 8 dw -8
- times 8 dw 1
-
- times 8 dw 0
- times 8 dw -9
- times 8 dw 93
- times 8 dw 50
- times 8 dw -6
- times 8 dw 0
-
- times 8 dw 3
- times 8 dw -16
- times 8 dw 77
- times 8 dw 77
- times 8 dw -16
- times 8 dw 3
-
- times 8 dw 0
- times 8 dw -6
- times 8 dw 50
- times 8 dw 93
- times 8 dw -9
- times 8 dw 0
-
- times 8 dw 1
- times 8 dw -8
- times 8 dw 36
- times 8 dw 108
- times 8 dw -11
- times 8 dw 2
-
- times 8 dw 0
- times 8 dw -1
- times 8 dw 12
- times 8 dw 123
- times 8 dw -6
- times 8 dw 0
-
diff --git a/vp9/common/x86/vp9_subpixel_sse2.asm b/vp9/common/x86/vp9_subpixel_sse2.asm
deleted file mode 100644
index b0c4f1282..000000000
--- a/vp9/common/x86/vp9_subpixel_sse2.asm
+++ /dev/null
@@ -1,1372 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT 7
-
-
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;*************************************************************************************/
-;void vp9_filter_block1d8_h6_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned short *output_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; short *vp9_filter
-;)
-global sym(vp9_filter_block1d8_h6_sse2) PRIVATE
-sym(vp9_filter_block1d8_h6_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdx, arg(6) ;vp9_filter
- mov rsi, arg(0) ;src_ptr
-
- mov rdi, arg(1) ;output_ptr
-
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(5) ;output_width
-%endif
- pxor xmm0, xmm0 ; clear xmm0 for unpack
-
-.filter_block1d8_h6_rowloop:
- movq xmm3, MMWORD PTR [rsi - 2]
- movq xmm1, MMWORD PTR [rsi + 6]
-
- prefetcht2 [rsi+rax-2]
-
- pslldq xmm1, 8
- por xmm1, xmm3
-
- movdqa xmm4, xmm1
- movdqa xmm5, xmm1
-
- movdqa xmm6, xmm1
- movdqa xmm7, xmm1
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm1
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0
- punpcklbw xmm4, xmm0
-
- movdqa XMMWORD Ptr [rdi], xmm4
- lea rsi, [rsi + rax]
-
-%if ABI_IS_32BIT
- add rdi, DWORD Ptr arg(5) ;[output_width]
-%else
- add rdi, r8
-%endif
- dec rcx
-
- jnz .filter_block1d8_h6_rowloop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1d16_h6_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned short *output_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; short *vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;*************************************************************************************/
-global sym(vp9_filter_block1d16_h6_sse2) PRIVATE
-sym(vp9_filter_block1d16_h6_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdx, arg(6) ;vp9_filter
- mov rsi, arg(0) ;src_ptr
-
- mov rdi, arg(1) ;output_ptr
-
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(5) ;output_width
-%endif
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
-
-.filter_block1d16_h6_sse2_rowloop:
- movq xmm3, MMWORD PTR [rsi - 2]
- movq xmm1, MMWORD PTR [rsi + 6]
-
- movq xmm2, MMWORD PTR [rsi +14]
- pslldq xmm2, 8
-
- por xmm2, xmm1
- prefetcht2 [rsi+rax-2]
-
- pslldq xmm1, 8
- por xmm1, xmm3
-
- movdqa xmm4, xmm1
- movdqa xmm5, xmm1
-
- movdqa xmm6, xmm1
- movdqa xmm7, xmm1
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm1
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0
- punpcklbw xmm4, xmm0
-
- movdqa XMMWORD Ptr [rdi], xmm4
-
- movdqa xmm3, xmm2
- movdqa xmm4, xmm2
-
- movdqa xmm5, xmm2
- movdqa xmm6, xmm2
-
- movdqa xmm7, xmm2
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm2
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0
- punpcklbw xmm4, xmm0
-
- movdqa XMMWORD Ptr [rdi+16], xmm4
-
- lea rsi, [rsi + rax]
-%if ABI_IS_32BIT
- add rdi, DWORD Ptr arg(5) ;[output_width]
-%else
- add rdi, r8
-%endif
-
- dec rcx
- jnz .filter_block1d16_h6_sse2_rowloop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1d8_v6_sse2
-;(
-; short *src_ptr,
-; unsigned char *output_ptr,
-; int dst_ptich,
-; unsigned int pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; short * vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
-; input pixel array has output_height rows.
-;*************************************************************************************/
-global sym(vp9_filter_block1d8_v6_sse2) PRIVATE
-sym(vp9_filter_block1d8_v6_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rax, arg(7) ;vp9_filter
- movsxd rdx, dword ptr arg(3) ;pixels_per_line
-
- mov rdi, arg(1) ;output_ptr
- mov rsi, arg(0) ;src_ptr
-
- sub rsi, rdx
- sub rsi, rdx
-
- movsxd rcx, DWORD PTR arg(5) ;[output_height]
- pxor xmm0, xmm0 ; clear xmm0
-
- movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(2) ; dst_ptich
-%endif
-
-.vp9_filter_block1d8_v6_sse2_loop:
- movdqa xmm1, XMMWORD PTR [rsi]
- pmullw xmm1, [rax]
-
- movdqa xmm2, XMMWORD PTR [rsi + rdx]
- pmullw xmm2, [rax + 16]
-
- movdqa xmm3, XMMWORD PTR [rsi + rdx * 2]
- pmullw xmm3, [rax + 32]
-
- movdqa xmm5, XMMWORD PTR [rsi + rdx * 4]
- pmullw xmm5, [rax + 64]
-
- add rsi, rdx
- movdqa xmm4, XMMWORD PTR [rsi + rdx * 2]
-
- pmullw xmm4, [rax + 48]
- movdqa xmm6, XMMWORD PTR [rsi + rdx * 4]
-
- pmullw xmm6, [rax + 80]
-
- paddsw xmm2, xmm5
- paddsw xmm2, xmm3
-
- paddsw xmm2, xmm1
- paddsw xmm2, xmm4
-
- paddsw xmm2, xmm6
- paddsw xmm2, xmm7
-
- psraw xmm2, 7
- packuswb xmm2, xmm0 ; pack and saturate
-
- movq QWORD PTR [rdi], xmm2 ; store the results in the destination
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(2) ;[dst_ptich]
-%else
- add rdi, r8
-%endif
- dec rcx ; decrement count
- jnz .vp9_filter_block1d8_v6_sse2_loop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1d16_v6_sse2
-;(
-; unsigned short *src_ptr,
-; unsigned char *output_ptr,
-; int dst_ptich,
-; unsigned int pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; const short *vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
-; input pixel array has output_height rows.
-;*************************************************************************************/
-global sym(vp9_filter_block1d16_v6_sse2) PRIVATE
-sym(vp9_filter_block1d16_v6_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rax, arg(7) ;vp9_filter
- movsxd rdx, dword ptr arg(3) ;pixels_per_line
-
- mov rdi, arg(1) ;output_ptr
- mov rsi, arg(0) ;src_ptr
-
- sub rsi, rdx
- sub rsi, rdx
-
- movsxd rcx, DWORD PTR arg(5) ;[output_height]
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(2) ; dst_ptich
-%endif
-
-.vp9_filter_block1d16_v6_sse2_loop:
-; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
- movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2
- movdqa xmm2, XMMWORD PTR [rsi + rdx + 16]
- pmullw xmm1, [rax + 16]
- pmullw xmm2, [rax + 16]
-
- movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5
- movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16]
- pmullw xmm3, [rax + 64]
- pmullw xmm4, [rax + 64]
-
- movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3
- movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16]
- pmullw xmm5, [rax + 32]
- pmullw xmm6, [rax + 32]
-
- movdqa xmm7, XMMWORD PTR [rsi] ; line 1
- movdqa xmm0, XMMWORD PTR [rsi + 16]
- pmullw xmm7, [rax]
- pmullw xmm0, [rax]
-
- paddsw xmm1, xmm3
- paddsw xmm2, xmm4
- paddsw xmm1, xmm5
- paddsw xmm2, xmm6
- paddsw xmm1, xmm7
- paddsw xmm2, xmm0
-
- add rsi, rdx
-
- movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4
- movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16]
- pmullw xmm3, [rax + 48]
- pmullw xmm4, [rax + 48]
-
- movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6
- movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16]
- pmullw xmm5, [rax + 80]
- pmullw xmm6, [rax + 80]
-
- movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
- pxor xmm0, xmm0 ; clear xmm0
-
- paddsw xmm1, xmm3
- paddsw xmm2, xmm4
- paddsw xmm1, xmm5
- paddsw xmm2, xmm6
-
- paddsw xmm1, xmm7
- paddsw xmm2, xmm7
-
- psraw xmm1, 7
- psraw xmm2, 7
-
- packuswb xmm1, xmm2 ; pack and saturate
- movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(2) ;[dst_ptich]
-%else
- add rdi, r8
-%endif
- dec rcx ; decrement count
- jnz .vp9_filter_block1d16_v6_sse2_loop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1d8_h6_only_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; int dst_ptich,
-; unsigned int output_height,
-; const short *vp9_filter
-;)
-; First-pass filter only when yoffset==0
-global sym(vp9_filter_block1d8_h6_only_sse2) PRIVATE
-sym(vp9_filter_block1d8_h6_only_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdx, arg(5) ;vp9_filter
- mov rsi, arg(0) ;src_ptr
-
- mov rdi, arg(2) ;output_ptr
-
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(3) ;dst_ptich
-%endif
- pxor xmm0, xmm0 ; clear xmm0 for unpack
-
-.filter_block1d8_h6_only_rowloop:
- movq xmm3, MMWORD PTR [rsi - 2]
- movq xmm1, MMWORD PTR [rsi + 6]
-
- prefetcht2 [rsi+rax-2]
-
- pslldq xmm1, 8
- por xmm1, xmm3
-
- movdqa xmm4, xmm1
- movdqa xmm5, xmm1
-
- movdqa xmm6, xmm1
- movdqa xmm7, xmm1
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm1
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0
-
- movq QWORD PTR [rdi], xmm4 ; store the results in the destination
- lea rsi, [rsi + rax]
-
-%if ABI_IS_32BIT
- add rdi, DWORD Ptr arg(3) ;dst_ptich
-%else
- add rdi, r8
-%endif
- dec rcx
-
- jnz .filter_block1d8_h6_only_rowloop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1d16_h6_only_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; int dst_ptich,
-; unsigned int output_height,
-; const short *vp9_filter
-;)
-; First-pass filter only when yoffset==0
-global sym(vp9_filter_block1d16_h6_only_sse2) PRIVATE
-sym(vp9_filter_block1d16_h6_only_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdx, arg(5) ;vp9_filter
- mov rsi, arg(0) ;src_ptr
-
- mov rdi, arg(2) ;output_ptr
-
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(3) ;dst_ptich
-%endif
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
-
-.filter_block1d16_h6_only_sse2_rowloop:
- movq xmm3, MMWORD PTR [rsi - 2]
- movq xmm1, MMWORD PTR [rsi + 6]
-
- movq xmm2, MMWORD PTR [rsi +14]
- pslldq xmm2, 8
-
- por xmm2, xmm1
- prefetcht2 [rsi+rax-2]
-
- pslldq xmm1, 8
- por xmm1, xmm3
-
- movdqa xmm4, xmm1
- movdqa xmm5, xmm1
-
- movdqa xmm6, xmm1
- movdqa xmm7, xmm1
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm1
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0 ; lower 8 bytes
-
- movq QWORD Ptr [rdi], xmm4 ; store the results in the destination
-
- movdqa xmm3, xmm2
- movdqa xmm4, xmm2
-
- movdqa xmm5, xmm2
- movdqa xmm6, xmm2
-
- movdqa xmm7, xmm2
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm2
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0 ; higher 8 bytes
-
- movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination
-
- lea rsi, [rsi + rax]
-%if ABI_IS_32BIT
- add rdi, DWORD Ptr arg(3) ;dst_ptich
-%else
- add rdi, r8
-%endif
-
- dec rcx
- jnz .filter_block1d16_h6_only_sse2_rowloop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1d8_v6_only_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; int dst_ptich,
-; unsigned int output_height,
-; const short *vp9_filter
-;)
-; Second-pass filter only when xoffset==0
-global sym(vp9_filter_block1d8_v6_only_sse2) PRIVATE
-sym(vp9_filter_block1d8_v6_only_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
-
- mov rax, arg(5) ;vp9_filter
-
- pxor xmm0, xmm0 ; clear xmm0
-
- movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(3) ; dst_ptich
-%endif
-
-.vp9_filter_block1d8_v6_only_sse2_loop:
- movq xmm1, MMWORD PTR [rsi]
- movq xmm2, MMWORD PTR [rsi + rdx]
- movq xmm3, MMWORD PTR [rsi + rdx * 2]
- movq xmm5, MMWORD PTR [rsi + rdx * 4]
- add rsi, rdx
- movq xmm4, MMWORD PTR [rsi + rdx * 2]
- movq xmm6, MMWORD PTR [rsi + rdx * 4]
-
- punpcklbw xmm1, xmm0
- pmullw xmm1, [rax]
-
- punpcklbw xmm2, xmm0
- pmullw xmm2, [rax + 16]
-
- punpcklbw xmm3, xmm0
- pmullw xmm3, [rax + 32]
-
- punpcklbw xmm5, xmm0
- pmullw xmm5, [rax + 64]
-
- punpcklbw xmm4, xmm0
- pmullw xmm4, [rax + 48]
-
- punpcklbw xmm6, xmm0
- pmullw xmm6, [rax + 80]
-
- paddsw xmm2, xmm5
- paddsw xmm2, xmm3
-
- paddsw xmm2, xmm1
- paddsw xmm2, xmm4
-
- paddsw xmm2, xmm6
- paddsw xmm2, xmm7
-
- psraw xmm2, 7
- packuswb xmm2, xmm0 ; pack and saturate
-
- movq QWORD PTR [rdi], xmm2 ; store the results in the destination
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[dst_ptich]
-%else
- add rdi, r8
-%endif
- dec rcx ; decrement count
- jnz .vp9_filter_block1d8_v6_only_sse2_loop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_unpack_block1d16_h6_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned short *output_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned int output_height,
-; unsigned int output_width
-;)
-global sym(vp9_unpack_block1d16_h6_sse2) PRIVATE
-sym(vp9_unpack_block1d16_h6_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(1) ;output_ptr
-
- movsxd rcx, dword ptr arg(3) ;output_height
- movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source
-%endif
-
-.unpack_block1d16_h6_sse2_rowloop:
- movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2
- movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- punpcklbw xmm1, xmm0
-
- movdqa XMMWORD Ptr [rdi], xmm1
- movdqa XMMWORD Ptr [rdi + 16], xmm3
-
- lea rsi, [rsi + rax]
-%if ABI_IS_32BIT
- add rdi, DWORD Ptr arg(4) ;[output_width]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .unpack_block1d16_h6_sse2_rowloop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_bilinear_predict16x16_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-extern sym(vp9_bilinear_filters_mmx)
-global sym(vp9_bilinear_predict16x16_sse2) PRIVATE
-sym(vp9_bilinear_predict16x16_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ;const short *HFilter = bilinear_filters_mmx[xoffset]
- ;const short *VFilter = bilinear_filters_mmx[yoffset]
-
- lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))]
- movsxd rax, dword ptr arg(2) ;xoffset
-
- cmp rax, 0 ;skip first_pass filter if xoffset=0
- je .b16x16_sp_only
-
- shl rax, 5
- add rax, rcx ;HFilter
-
- mov rdi, arg(4) ;dst_ptr
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, dword ptr arg(5) ;dst_pitch
-
- movdqa xmm1, [rax]
- movdqa xmm2, [rax+16]
-
- movsxd rax, dword ptr arg(3) ;yoffset
-
- cmp rax, 0 ;skip second_pass filter if yoffset=0
- je .b16x16_fp_only
-
- shl rax, 5
- add rax, rcx ;VFilter
-
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
-
- pxor xmm0, xmm0
-
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(5) ;dst_pitch
-%endif
- ; get the first horizontal line done
- movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movdqa xmm4, xmm3 ; make a copy of current line
-
- punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
- punpckhbw xmm4, xmm0
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm1
-
- movdqu xmm5, [rsi+1]
- movdqa xmm6, xmm5
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
-
- pmullw xmm5, xmm2
- pmullw xmm6, xmm2
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP9_FILTER_SHIFT
-
- movdqa xmm7, xmm3
- packuswb xmm7, xmm4
-
- add rsi, rdx ; next line
-.next_row:
- movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movdqa xmm4, xmm3 ; make a copy of current line
-
- punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
- punpckhbw xmm4, xmm0
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm1
-
- movdqu xmm5, [rsi+1]
- movdqa xmm6, xmm5
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
-
- pmullw xmm5, xmm2
- pmullw xmm6, xmm2
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- movdqa xmm5, xmm7
- movdqa xmm6, xmm7
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
-
- pmullw xmm5, [rax]
- pmullw xmm6, [rax]
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP9_FILTER_SHIFT
-
- movdqa xmm7, xmm3
- packuswb xmm7, xmm4
-
- pmullw xmm3, [rax+16]
- pmullw xmm4, [rax+16]
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP9_FILTER_SHIFT
-
- packuswb xmm3, xmm4
- movdqa [rdi], xmm3 ; store the results in the destination
-
- add rsi, rdx ; next line
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(5) ;dst_pitch
-%else
- add rdi, r8
-%endif
-
- cmp rdi, rcx
- jne .next_row
-
- jmp .done
-
-.b16x16_sp_only:
- movsxd rax, dword ptr arg(3) ;yoffset
- shl rax, 5
- add rax, rcx ;VFilter
-
- mov rdi, arg(4) ;dst_ptr
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, dword ptr arg(5) ;dst_pitch
-
- movdqa xmm1, [rax]
- movdqa xmm2, [rax+16]
-
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
-
- pxor xmm0, xmm0
-
- ; get the first horizontal line done
- movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-
- add rsi, rax ; next line
-.next_row_spo:
- movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-
- movdqa xmm5, xmm7
- movdqa xmm6, xmm7
-
- movdqa xmm4, xmm3 ; make a copy of current line
- movdqa xmm7, xmm3
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
- punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
- punpckhbw xmm4, xmm0
-
- pmullw xmm5, xmm1
- pmullw xmm6, xmm1
- pmullw xmm3, xmm2
- pmullw xmm4, xmm2
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP9_FILTER_SHIFT
-
- packuswb xmm3, xmm4
- movdqa [rdi], xmm3 ; store the results in the destination
-
- add rsi, rax ; next line
- add rdi, rdx ;dst_pitch
- cmp rdi, rcx
- jne .next_row_spo
-
- jmp .done
-
-.b16x16_fp_only:
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- pxor xmm0, xmm0
-
-.next_row_fpo:
- movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movdqa xmm4, xmm3 ; make a copy of current line
-
- punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
- punpckhbw xmm4, xmm0
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm1
-
- movdqu xmm5, [rsi+1]
- movdqa xmm6, xmm5
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
-
- pmullw xmm5, xmm2
- pmullw xmm6, xmm2
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP9_FILTER_SHIFT
-
- packuswb xmm3, xmm4
- movdqa [rdi], xmm3 ; store the results in the destination
-
- add rsi, rax ; next line
- add rdi, rdx ; dst_pitch
- cmp rdi, rcx
- jne .next_row_fpo
-
-.done:
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_bilinear_predict8x8_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-extern sym(vp9_bilinear_filters_mmx)
-global sym(vp9_bilinear_predict8x8_sse2) PRIVATE
-sym(vp9_bilinear_predict8x8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 144 ; reserve 144 bytes
-
- ;const short *HFilter = bilinear_filters_mmx[xoffset]
- ;const short *VFilter = bilinear_filters_mmx[yoffset]
- lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))]
-
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
-
- ;Read 9-line unaligned data in and put them on stack. This gives a big
- ;performance boost.
- movdqu xmm0, [rsi]
- lea rax, [rdx + rdx*2]
- movdqu xmm1, [rsi+rdx]
- movdqu xmm2, [rsi+rdx*2]
- add rsi, rax
- movdqu xmm3, [rsi]
- movdqu xmm4, [rsi+rdx]
- movdqu xmm5, [rsi+rdx*2]
- add rsi, rax
- movdqu xmm6, [rsi]
- movdqu xmm7, [rsi+rdx]
-
- movdqa XMMWORD PTR [rsp], xmm0
-
- movdqu xmm0, [rsi+rdx*2]
-
- movdqa XMMWORD PTR [rsp+16], xmm1
- movdqa XMMWORD PTR [rsp+32], xmm2
- movdqa XMMWORD PTR [rsp+48], xmm3
- movdqa XMMWORD PTR [rsp+64], xmm4
- movdqa XMMWORD PTR [rsp+80], xmm5
- movdqa XMMWORD PTR [rsp+96], xmm6
- movdqa XMMWORD PTR [rsp+112], xmm7
- movdqa XMMWORD PTR [rsp+128], xmm0
-
- movsxd rax, dword ptr arg(2) ;xoffset
- shl rax, 5
- add rax, rcx ;HFilter
-
- mov rdi, arg(4) ;dst_ptr
- movsxd rdx, dword ptr arg(5) ;dst_pitch
-
- movdqa xmm1, [rax]
- movdqa xmm2, [rax+16]
-
- movsxd rax, dword ptr arg(3) ;yoffset
- shl rax, 5
- add rax, rcx ;VFilter
-
- lea rcx, [rdi+rdx*8]
-
- movdqa xmm5, [rax]
- movdqa xmm6, [rax+16]
-
- pxor xmm0, xmm0
-
- ; get the first horizontal line done
- movdqa xmm3, XMMWORD PTR [rsp]
- movdqa xmm4, xmm3 ; make a copy of current line
- psrldq xmm4, 1
-
- punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
- punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm2
-
- paddw xmm3, xmm4
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- movdqa xmm7, xmm3
- add rsp, 16 ; next line
-.next_row8x8:
- movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
- movdqa xmm4, xmm3 ; make a copy of current line
- psrldq xmm4, 1
-
- punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
- punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm2
-
- paddw xmm3, xmm4
- pmullw xmm7, xmm5
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- movdqa xmm4, xmm3
-
- pmullw xmm3, xmm6
- paddw xmm3, xmm7
-
- movdqa xmm7, xmm4
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- packuswb xmm3, xmm0
- movq [rdi], xmm3 ; store the results in the destination
-
- add rsp, 16 ; next line
- add rdi, rdx
-
- cmp rdi, rcx
- jne .next_row8x8
-
- ;add rsp, 144
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-SECTION_RODATA
-align 16
-rd:
- times 8 dw 0x40
diff --git a/vp9/common/x86/vp9_subpixel_ssse3.asm b/vp9/common/x86/vp9_subpixel_ssse3.asm
deleted file mode 100644
index b260480e0..000000000
--- a/vp9/common/x86/vp9_subpixel_ssse3.asm
+++ /dev/null
@@ -1,1515 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT 7
-
-
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;
-; This is an implementation of some of the SSE optimizations first seen in ffvp8
-;
-;*************************************************************************************/
-;void vp9_filter_block1d8_h6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; unsigned int vp9_filter_index
-;)
-global sym(vp9_filter_block1d8_h6_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4
-
- movdqa xmm7, [GLOBAL(rd)]
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
- mov rdi, arg(2) ;output_ptr
-
- cmp esi, DWORD PTR [rax]
- je vp9_filter_block1d8_h4_ssse3
-
- movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
-
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
- sub rdi, rdx
-;xmm3 free
-.filter_block1d8_h6_rowloop_ssse3:
- movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
-
- movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
-
- punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
-
- movdqa xmm1, xmm0
- pmaddubsw xmm0, xmm4
-
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf2bfrom1)]
-
- pshufb xmm2, [GLOBAL(shuf3bfrom1)]
- pmaddubsw xmm1, xmm5
-
- lea rdi, [rdi + rdx]
- pmaddubsw xmm2, xmm6
-
- lea rsi, [rsi + rax]
- dec rcx
-
- paddsw xmm0, xmm1
- paddsw xmm2, xmm7
-
- paddsw xmm0, xmm2
-
- psraw xmm0, 7
-
- packuswb xmm0, xmm0
-
- movq MMWORD Ptr [rdi], xmm0
- jnz .filter_block1d8_h6_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-vp9_filter_block1d8_h4_ssse3:
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
- movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
-
- mov rsi, arg(0) ;src_ptr
-
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
-
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
- sub rdi, rdx
-
-.filter_block1d8_h4_rowloop_ssse3:
- movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
-
- movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
-
- punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
-
- movdqa xmm2, xmm0
- pshufb xmm0, xmm3
-
- pshufb xmm2, xmm4
- pmaddubsw xmm0, xmm5
-
- lea rdi, [rdi + rdx]
- pmaddubsw xmm2, xmm6
-
- lea rsi, [rsi + rax]
- dec rcx
-
- paddsw xmm0, xmm7
-
- paddsw xmm0, xmm2
-
- psraw xmm0, 7
-
- packuswb xmm0, xmm0
-
- movq MMWORD Ptr [rdi], xmm0
-
- jnz .filter_block1d8_h4_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-;void vp9_filter_block1d16_h6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; unsigned int vp9_filter_index
-;)
-global sym(vp9_filter_block1d16_h6_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
-
- mov rdi, arg(2) ;output_ptr
-
- mov rsi, arg(0) ;src_ptr
-
- movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
-.filter_block1d16_h6_rowloop_ssse3:
- movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
-
- movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
-
- punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
-
- movdqa xmm1, xmm0
- pmaddubsw xmm0, xmm4
-
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf2bfrom1)]
-
- pshufb xmm2, [GLOBAL(shuf3bfrom1)]
- movq xmm3, MMWORD PTR [rsi + 6]
-
- pmaddubsw xmm1, xmm5
- movq xmm7, MMWORD PTR [rsi + 11]
-
- pmaddubsw xmm2, xmm6
- punpcklbw xmm3, xmm7
-
- paddsw xmm0, xmm1
- movdqa xmm1, xmm3
-
- pmaddubsw xmm3, xmm4
- paddsw xmm0, xmm2
-
- movdqa xmm2, xmm1
- paddsw xmm0, [GLOBAL(rd)]
-
- pshufb xmm1, [GLOBAL(shuf2bfrom1)]
- pshufb xmm2, [GLOBAL(shuf3bfrom1)]
-
- psraw xmm0, 7
- pmaddubsw xmm1, xmm5
-
- pmaddubsw xmm2, xmm6
- packuswb xmm0, xmm0
-
- lea rsi, [rsi + rax]
- paddsw xmm3, xmm1
-
- paddsw xmm3, xmm2
-
- paddsw xmm3, [GLOBAL(rd)]
-
- psraw xmm3, 7
-
- packuswb xmm3, xmm3
-
- punpcklqdq xmm0, xmm3
-
- movdqa XMMWORD Ptr [rdi], xmm0
-
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .filter_block1d16_h6_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d4_h6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; unsigned int vp9_filter_index
-;)
-global sym(vp9_filter_block1d4_h6_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
- movdqa xmm7, [GLOBAL(rd)]
-
- cmp esi, DWORD PTR [rax]
- je .vp9_filter_block1d4_h4_ssse3
-
- movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
-
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
-;xmm3 free
-.filter_block1d4_h6_rowloop_ssse3:
- movdqu xmm0, XMMWORD PTR [rsi - 2]
-
- movdqa xmm1, xmm0
- pshufb xmm0, [GLOBAL(shuf1b)]
-
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf2b)]
- pmaddubsw xmm0, xmm4
- pshufb xmm2, [GLOBAL(shuf3b)]
- pmaddubsw xmm1, xmm5
-
-;--
- pmaddubsw xmm2, xmm6
-
- lea rsi, [rsi + rax]
-;--
- paddsw xmm0, xmm1
- paddsw xmm0, xmm7
- pxor xmm1, xmm1
- paddsw xmm0, xmm2
- psraw xmm0, 7
- packuswb xmm0, xmm0
-
- movd DWORD PTR [rdi], xmm0
-
- add rdi, rdx
- dec rcx
- jnz .filter_block1d4_h6_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-.vp9_filter_block1d4_h4_ssse3:
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
- movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
- movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
-
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
-.filter_block1d4_h4_rowloop_ssse3:
- movdqu xmm1, XMMWORD PTR [rsi - 2]
-
- movdqa xmm2, xmm1
- pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)]
- pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)]
- pmaddubsw xmm1, xmm5
-
-;--
- pmaddubsw xmm2, xmm6
-
- lea rsi, [rsi + rax]
-;--
- paddsw xmm1, xmm7
- paddsw xmm1, xmm2
- psraw xmm1, 7
- packuswb xmm1, xmm1
-
- movd DWORD PTR [rdi], xmm1
-
- add rdi, rdx
- dec rcx
- jnz .filter_block1d4_h4_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-;void vp9_filter_block1d16_v6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; unsigned int vp9_filter_index
-;)
-global sym(vp9_filter_block1d16_v6_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
-
- cmp esi, DWORD PTR [rax]
- je .vp9_filter_block1d16_v4_ssse3
-
- movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
- mov rdi, arg(2) ;output_ptr
-
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ;out_pitch
-%endif
- mov rax, rsi
- movsxd rcx, DWORD PTR arg(4) ;output_height
- add rax, rdx
-
-
-.vp9_filter_block1d16_v6_ssse3_loop:
- movq xmm1, MMWORD PTR [rsi] ;A
- movq xmm2, MMWORD PTR [rsi + rdx] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
-
- pmaddubsw xmm3, xmm6
- punpcklbw xmm1, xmm0 ;A F
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm5
-
- paddsw xmm2, xmm3
- paddsw xmm2, xmm1
- paddsw xmm2, [GLOBAL(rd)]
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- movq MMWORD PTR [rdi], xmm2 ;store the results
-
- movq xmm1, MMWORD PTR [rsi + 8] ;A
- movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F
- pmaddubsw xmm3, xmm6
- punpcklbw xmm1, xmm0 ;A F
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm5
-
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw xmm2, xmm3
- paddsw xmm2, xmm1
- paddsw xmm2, [GLOBAL(rd)]
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- movq MMWORD PTR [rdi+8], xmm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;out_pitch
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp9_filter_block1d16_v6_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-.vp9_filter_block1d16_v4_ssse3:
- movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
- mov rdi, arg(2) ;output_ptr
-
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ;out_pitch
-%endif
- mov rax, rsi
- movsxd rcx, DWORD PTR arg(4) ;output_height
- add rax, rdx
-
-.vp9_filter_block1d16_v4_ssse3_loop:
- movq xmm2, MMWORD PTR [rsi + rdx] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- pmaddubsw xmm3, xmm6
- pmaddubsw xmm2, xmm7
- movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B
- movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
-
- paddsw xmm2, [GLOBAL(rd)]
- paddsw xmm2, xmm3
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- punpcklbw xmm5, xmm4 ;B D
- punpcklbw xmm1, xmm0 ;C E
-
- pmaddubsw xmm1, xmm6
- pmaddubsw xmm5, xmm7
-
- movdqa xmm4, [GLOBAL(rd)]
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw xmm5, xmm1
- paddsw xmm5, xmm4
- psraw xmm5, 7
- packuswb xmm5, xmm5
-
- punpcklqdq xmm2, xmm5
-
- movdqa XMMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;out_pitch
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp9_filter_block1d16_v4_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d8_v6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; unsigned int vp9_filter_index
-;)
-global sym(vp9_filter_block1d8_v6_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
-
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
- mov rdi, arg(2) ;output_ptr
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ; out_pitch
-%endif
- movsxd rcx, DWORD PTR arg(4) ;[output_height]
-
- cmp esi, DWORD PTR [rax]
- je .vp9_filter_block1d8_v4_ssse3
-
- movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
-
- mov rax, rsi
- add rax, rdx
-
-.vp9_filter_block1d8_v6_ssse3_loop:
- movq xmm1, MMWORD PTR [rsi] ;A
- movq xmm2, MMWORD PTR [rsi + rdx] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
- movdqa xmm4, [GLOBAL(rd)]
-
- pmaddubsw xmm3, xmm6
- punpcklbw xmm1, xmm0 ;A F
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm5
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw xmm2, xmm3
- paddsw xmm2, xmm1
- paddsw xmm2, xmm4
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- movq MMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[out_pitch]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp9_filter_block1d8_v6_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-.vp9_filter_block1d8_v4_ssse3:
- movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
- movdqa xmm5, [GLOBAL(rd)]
-
- mov rsi, arg(0) ;src_ptr
-
- mov rax, rsi
- add rax, rdx
-
-.vp9_filter_block1d8_v4_ssse3_loop:
- movq xmm2, MMWORD PTR [rsi + rdx] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- pmaddubsw xmm3, xmm6
- pmaddubsw xmm2, xmm7
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw xmm2, xmm3
- paddsw xmm2, xmm5
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- movq MMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[out_pitch]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp9_filter_block1d8_v4_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-;void vp9_filter_block1d4_v6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; unsigned int vp9_filter_index
-;)
-global sym(vp9_filter_block1d4_v6_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
-
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
- mov rdi, arg(2) ;output_ptr
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ; out_pitch
-%endif
- movsxd rcx, DWORD PTR arg(4) ;[output_height]
-
- cmp esi, DWORD PTR [rax]
- je .vp9_filter_block1d4_v4_ssse3
-
- movq mm5, MMWORD PTR [rax] ;k0_k5
- movq mm6, MMWORD PTR [rax+256] ;k2_k4
- movq mm7, MMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
-
- mov rax, rsi
- add rax, rdx
-
-.vp9_filter_block1d4_v6_ssse3_loop:
- movd mm1, DWORD PTR [rsi] ;A
- movd mm2, DWORD PTR [rsi + rdx] ;B
- movd mm3, DWORD PTR [rsi + rdx * 2] ;C
- movd mm4, DWORD PTR [rax + rdx * 2] ;D
- movd mm0, DWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw mm2, mm4 ;B D
- punpcklbw mm3, mm0 ;C E
-
- movd mm0, DWORD PTR [rax + rdx * 4] ;F
-
- movq mm4, [GLOBAL(rd)]
-
- pmaddubsw mm3, mm6
- punpcklbw mm1, mm0 ;A F
- pmaddubsw mm2, mm7
- pmaddubsw mm1, mm5
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw mm2, mm3
- paddsw mm2, mm1
- paddsw mm2, mm4
- psraw mm2, 7
- packuswb mm2, mm2
-
- movd DWORD PTR [rdi], mm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[out_pitch]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp9_filter_block1d4_v6_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-.vp9_filter_block1d4_v4_ssse3:
- movq mm6, MMWORD PTR [rax+256] ;k2_k4
- movq mm7, MMWORD PTR [rax+128] ;k1_k3
- movq mm5, MMWORD PTR [GLOBAL(rd)]
-
- mov rsi, arg(0) ;src_ptr
-
- mov rax, rsi
- add rax, rdx
-
-.vp9_filter_block1d4_v4_ssse3_loop:
- movd mm2, DWORD PTR [rsi + rdx] ;B
- movd mm3, DWORD PTR [rsi + rdx * 2] ;C
- movd mm4, DWORD PTR [rax + rdx * 2] ;D
- movd mm0, DWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw mm2, mm4 ;B D
- punpcklbw mm3, mm0 ;C E
-
- pmaddubsw mm3, mm6
- pmaddubsw mm2, mm7
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw mm2, mm3
- paddsw mm2, mm5
- psraw mm2, 7
- packuswb mm2, mm2
-
- movd DWORD PTR [rdi], mm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[out_pitch]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp9_filter_block1d4_v4_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_bilinear_predict16x16_ssse3
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-global sym(vp9_bilinear_predict16x16_ssse3) PRIVATE
-sym(vp9_bilinear_predict16x16_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- lea rcx, [GLOBAL(bilinear_filters_ssse3)]
- movsxd rax, dword ptr arg(2) ; xoffset
-
- cmp rax, 0 ; skip first_pass filter if xoffset=0
- je .b16x16_sp_only
-
- shl rax, 4
- lea rax, [rax + rcx] ; HFilter
-
- mov rdi, arg(4) ; dst_ptr
- mov rsi, arg(0) ; src_ptr
- movsxd rdx, dword ptr arg(5) ; dst_pitch
-
- movdqa xmm1, [rax]
-
- movsxd rax, dword ptr arg(3) ; yoffset
-
- cmp rax, 0 ; skip second_pass filter if yoffset=0
- je .b16x16_fp_only
-
- shl rax, 4
- lea rax, [rax + rcx] ; VFilter
-
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rdx, dword ptr arg(1) ; src_pixels_per_line
-
- movdqa xmm2, [rax]
-
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(5) ; dst_pitch
-%endif
- movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07
- movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
-
- punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
- movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
-
- movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
-
- lea rsi, [rsi + rdx] ; next line
-
- pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14
-
- punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
- pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
- psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128
-
- movdqa xmm7, xmm3
- packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-.next_row:
- movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07
- movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
-
- punpcklbw xmm6, xmm5
- movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
-
- movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
- lea rsi, [rsi + rdx] ; next line
-
- pmaddubsw xmm6, xmm1
-
- punpcklbw xmm4, xmm5
- pmaddubsw xmm4, xmm1
-
- paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
- psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128
-
- paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
- psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128
-
- packuswb xmm6, xmm4
- movdqa xmm5, xmm7
-
- punpcklbw xmm5, xmm6
- pmaddubsw xmm5, xmm2
-
- punpckhbw xmm7, xmm6
- pmaddubsw xmm7, xmm2
-
- paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value
- psraw xmm5, VP9_FILTER_SHIFT ; xmm5 /= 128
-
- paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
- psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128
-
- packuswb xmm5, xmm7
- movdqa xmm7, xmm6
-
- movdqa [rdi], xmm5 ; store the results in the destination
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(5) ; dst_pitch
-%else
- add rdi, r8
-%endif
-
- cmp rdi, rcx
- jne .next_row
-
- jmp .done
-
-.b16x16_sp_only:
- movsxd rax, dword ptr arg(3) ; yoffset
- shl rax, 4
- lea rax, [rax + rcx] ; VFilter
-
- mov rdi, arg(4) ; dst_ptr
- mov rsi, arg(0) ; src_ptr
- movsxd rdx, dword ptr arg(5) ; dst_pitch
-
- movdqa xmm1, [rax] ; VFilter
-
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rax, dword ptr arg(1) ; src_pixels_per_line
-
- ; get the first horizontal line done
- movq xmm4, [rsi] ; load row 0
- movq xmm2, [rsi + 8] ; load row 0
-
- lea rsi, [rsi + rax] ; next line
-.next_row_sp:
- movq xmm3, [rsi] ; load row + 1
- movq xmm5, [rsi + 8] ; load row + 1
-
- punpcklbw xmm4, xmm3
- punpcklbw xmm2, xmm5
-
- pmaddubsw xmm4, xmm1
- movq xmm7, [rsi + rax] ; load row + 2
-
- pmaddubsw xmm2, xmm1
- movq xmm6, [rsi + rax + 8] ; load row + 2
-
- punpcklbw xmm3, xmm7
- punpcklbw xmm5, xmm6
-
- pmaddubsw xmm3, xmm1
- paddw xmm4, [GLOBAL(rd)]
-
- pmaddubsw xmm5, xmm1
- paddw xmm2, [GLOBAL(rd)]
-
- psraw xmm4, VP9_FILTER_SHIFT
- psraw xmm2, VP9_FILTER_SHIFT
-
- packuswb xmm4, xmm2
- paddw xmm3, [GLOBAL(rd)]
-
- movdqa [rdi], xmm4 ; store row 0
- paddw xmm5, [GLOBAL(rd)]
-
- psraw xmm3, VP9_FILTER_SHIFT
- psraw xmm5, VP9_FILTER_SHIFT
-
- packuswb xmm3, xmm5
- movdqa xmm4, xmm7
-
- movdqa [rdi + rdx],xmm3 ; store row 1
- lea rsi, [rsi + 2*rax]
-
- movdqa xmm2, xmm6
- lea rdi, [rdi + 2*rdx]
-
- cmp rdi, rcx
- jne .next_row_sp
-
- jmp .done
-
-.b16x16_fp_only:
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rax, dword ptr arg(1) ; src_pixels_per_line
-
-.next_row_fp:
- movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07
- movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08
-
- punpcklbw xmm2, xmm4
- movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15
-
- pmaddubsw xmm2, xmm1
- movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16
-
- lea rsi, [rsi + rax] ; next line
- punpcklbw xmm3, xmm4
-
- pmaddubsw xmm3, xmm1
- movq xmm5, [rsi]
-
- paddw xmm2, [GLOBAL(rd)]
- movq xmm7, [rsi+1]
-
- movq xmm6, [rsi+8]
- psraw xmm2, VP9_FILTER_SHIFT
-
- punpcklbw xmm5, xmm7
- movq xmm7, [rsi+9]
-
- paddw xmm3, [GLOBAL(rd)]
- pmaddubsw xmm5, xmm1
-
- psraw xmm3, VP9_FILTER_SHIFT
- punpcklbw xmm6, xmm7
-
- packuswb xmm2, xmm3
- pmaddubsw xmm6, xmm1
-
- movdqa [rdi], xmm2 ; store the results in the destination
- paddw xmm5, [GLOBAL(rd)]
-
- lea rdi, [rdi + rdx] ; dst_pitch
- psraw xmm5, VP9_FILTER_SHIFT
-
- paddw xmm6, [GLOBAL(rd)]
- psraw xmm6, VP9_FILTER_SHIFT
-
- packuswb xmm5, xmm6
- lea rsi, [rsi + rax] ; next line
-
- movdqa [rdi], xmm5 ; store the results in the destination
- lea rdi, [rdi + rdx] ; dst_pitch
-
- cmp rdi, rcx
-
- jne .next_row_fp
-
-.done:
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_bilinear_predict8x8_ssse3
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-global sym(vp9_bilinear_predict8x8_ssse3) PRIVATE
-sym(vp9_bilinear_predict8x8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 144 ; reserve 144 bytes
-
- lea rcx, [GLOBAL(bilinear_filters_ssse3)]
-
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
-
- ;Read 9-line unaligned data in and put them on stack. This gives a big
- ;performance boost.
- movdqu xmm0, [rsi]
- lea rax, [rdx + rdx*2]
- movdqu xmm1, [rsi+rdx]
- movdqu xmm2, [rsi+rdx*2]
- add rsi, rax
- movdqu xmm3, [rsi]
- movdqu xmm4, [rsi+rdx]
- movdqu xmm5, [rsi+rdx*2]
- add rsi, rax
- movdqu xmm6, [rsi]
- movdqu xmm7, [rsi+rdx]
-
- movdqa XMMWORD PTR [rsp], xmm0
-
- movdqu xmm0, [rsi+rdx*2]
-
- movdqa XMMWORD PTR [rsp+16], xmm1
- movdqa XMMWORD PTR [rsp+32], xmm2
- movdqa XMMWORD PTR [rsp+48], xmm3
- movdqa XMMWORD PTR [rsp+64], xmm4
- movdqa XMMWORD PTR [rsp+80], xmm5
- movdqa XMMWORD PTR [rsp+96], xmm6
- movdqa XMMWORD PTR [rsp+112], xmm7
- movdqa XMMWORD PTR [rsp+128], xmm0
-
- movsxd rax, dword ptr arg(2) ; xoffset
- cmp rax, 0 ; skip first_pass filter if xoffset=0
- je .b8x8_sp_only
-
- shl rax, 4
- add rax, rcx ; HFilter
-
- mov rdi, arg(4) ; dst_ptr
- movsxd rdx, dword ptr arg(5) ; dst_pitch
-
- movdqa xmm0, [rax]
-
- movsxd rax, dword ptr arg(3) ; yoffset
- cmp rax, 0 ; skip second_pass filter if yoffset=0
- je .b8x8_fp_only
-
- shl rax, 4
- lea rax, [rax + rcx] ; VFilter
-
- lea rcx, [rdi+rdx*8]
-
- movdqa xmm1, [rax]
-
- ; get the first horizontal line done
- movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
- movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
-
- psrldq xmm5, 1
- lea rsp, [rsp + 16] ; next line
-
- punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
- pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- movdqa xmm7, xmm3
- packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-.next_row:
- movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
- lea rsp, [rsp + 16] ; next line
-
- movdqa xmm5, xmm6
-
- psrldq xmm5, 1
-
- punpcklbw xmm6, xmm5
- pmaddubsw xmm6, xmm0
-
- paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
- psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128
-
- packuswb xmm6, xmm6
-
- punpcklbw xmm7, xmm6
- pmaddubsw xmm7, xmm1
-
- paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
- psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128
-
- packuswb xmm7, xmm7
-
- movq [rdi], xmm7 ; store the results in the destination
- lea rdi, [rdi + rdx]
-
- movdqa xmm7, xmm6
-
- cmp rdi, rcx
- jne .next_row
-
- jmp .done8x8
-
-.b8x8_sp_only:
- movsxd rax, dword ptr arg(3) ; yoffset
- shl rax, 4
- lea rax, [rax + rcx] ; VFilter
-
- mov rdi, arg(4) ;dst_ptr
- movsxd rdx, dword ptr arg(5) ; dst_pitch
-
- movdqa xmm0, [rax] ; VFilter
-
- movq xmm1, XMMWORD PTR [rsp]
- movq xmm2, XMMWORD PTR [rsp+16]
-
- movq xmm3, XMMWORD PTR [rsp+32]
- punpcklbw xmm1, xmm2
-
- movq xmm4, XMMWORD PTR [rsp+48]
- punpcklbw xmm2, xmm3
-
- movq xmm5, XMMWORD PTR [rsp+64]
- punpcklbw xmm3, xmm4
-
- movq xmm6, XMMWORD PTR [rsp+80]
- punpcklbw xmm4, xmm5
-
- movq xmm7, XMMWORD PTR [rsp+96]
- punpcklbw xmm5, xmm6
-
- pmaddubsw xmm1, xmm0
- pmaddubsw xmm2, xmm0
-
- pmaddubsw xmm3, xmm0
- pmaddubsw xmm4, xmm0
-
- pmaddubsw xmm5, xmm0
- punpcklbw xmm6, xmm7
-
- pmaddubsw xmm6, xmm0
- paddw xmm1, [GLOBAL(rd)]
-
- paddw xmm2, [GLOBAL(rd)]
- psraw xmm1, VP9_FILTER_SHIFT
-
- paddw xmm3, [GLOBAL(rd)]
- psraw xmm2, VP9_FILTER_SHIFT
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm3, VP9_FILTER_SHIFT
-
- paddw xmm5, [GLOBAL(rd)]
- psraw xmm4, VP9_FILTER_SHIFT
-
- paddw xmm6, [GLOBAL(rd)]
- psraw xmm5, VP9_FILTER_SHIFT
-
- psraw xmm6, VP9_FILTER_SHIFT
- packuswb xmm1, xmm1
-
- packuswb xmm2, xmm2
- movq [rdi], xmm1
-
- packuswb xmm3, xmm3
- movq [rdi+rdx], xmm2
-
- packuswb xmm4, xmm4
- movq xmm1, XMMWORD PTR [rsp+112]
-
- lea rdi, [rdi + 2*rdx]
- movq xmm2, XMMWORD PTR [rsp+128]
-
- packuswb xmm5, xmm5
- movq [rdi], xmm3
-
- packuswb xmm6, xmm6
- movq [rdi+rdx], xmm4
-
- lea rdi, [rdi + 2*rdx]
- punpcklbw xmm7, xmm1
-
- movq [rdi], xmm5
- pmaddubsw xmm7, xmm0
-
- movq [rdi+rdx], xmm6
- punpcklbw xmm1, xmm2
-
- pmaddubsw xmm1, xmm0
- paddw xmm7, [GLOBAL(rd)]
-
- psraw xmm7, VP9_FILTER_SHIFT
- paddw xmm1, [GLOBAL(rd)]
-
- psraw xmm1, VP9_FILTER_SHIFT
- packuswb xmm7, xmm7
-
- packuswb xmm1, xmm1
- lea rdi, [rdi + 2*rdx]
-
- movq [rdi], xmm7
-
- movq [rdi+rdx], xmm1
- lea rsp, [rsp + 144]
-
- jmp .done8x8
-
-.b8x8_fp_only:
- lea rcx, [rdi+rdx*8]
-
-.next_row_fp:
- movdqa xmm1, XMMWORD PTR [rsp]
- movdqa xmm3, XMMWORD PTR [rsp+16]
-
- movdqa xmm2, xmm1
- movdqa xmm5, XMMWORD PTR [rsp+32]
-
- psrldq xmm2, 1
- movdqa xmm7, XMMWORD PTR [rsp+48]
-
- movdqa xmm4, xmm3
- psrldq xmm4, 1
-
- movdqa xmm6, xmm5
- psrldq xmm6, 1
-
- punpcklbw xmm1, xmm2
- pmaddubsw xmm1, xmm0
-
- punpcklbw xmm3, xmm4
- pmaddubsw xmm3, xmm0
-
- punpcklbw xmm5, xmm6
- pmaddubsw xmm5, xmm0
-
- movdqa xmm2, xmm7
- psrldq xmm2, 1
-
- punpcklbw xmm7, xmm2
- pmaddubsw xmm7, xmm0
-
- paddw xmm1, [GLOBAL(rd)]
- psraw xmm1, VP9_FILTER_SHIFT
-
- paddw xmm3, [GLOBAL(rd)]
- psraw xmm3, VP9_FILTER_SHIFT
-
- paddw xmm5, [GLOBAL(rd)]
- psraw xmm5, VP9_FILTER_SHIFT
-
- paddw xmm7, [GLOBAL(rd)]
- psraw xmm7, VP9_FILTER_SHIFT
-
- packuswb xmm1, xmm1
- packuswb xmm3, xmm3
-
- packuswb xmm5, xmm5
- movq [rdi], xmm1
-
- packuswb xmm7, xmm7
- movq [rdi+rdx], xmm3
-
- lea rdi, [rdi + 2*rdx]
- movq [rdi], xmm5
-
- lea rsp, [rsp + 4*16]
- movq [rdi+rdx], xmm7
-
- lea rdi, [rdi + 2*rdx]
- cmp rdi, rcx
-
- jne .next_row_fp
-
- lea rsp, [rsp + 16]
-
-.done8x8:
- ;add rsp, 144
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-shuf1b:
- db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
-shuf2b:
- db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
-shuf3b:
- db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
-
-align 16
-shuf2bfrom1:
- db 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13
-align 16
-shuf3bfrom1:
- db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11
-
-align 16
-rd:
- times 8 dw 0x40
-
-align 16
-k0_k5:
- times 8 db 0, 0 ;placeholder
- times 8 db 0, 0
- times 8 db 2, 1
- times 8 db 0, 0
- times 8 db 3, 3
- times 8 db 0, 0
- times 8 db 1, 2
- times 8 db 0, 0
-k1_k3:
- times 8 db 0, 0 ;placeholder
- times 8 db -6, 12
- times 8 db -11, 36
- times 8 db -9, 50
- times 8 db -16, 77
- times 8 db -6, 93
- times 8 db -8, 108
- times 8 db -1, 123
-k2_k4:
- times 8 db 128, 0 ;placeholder
- times 8 db 123, -1
- times 8 db 108, -8
- times 8 db 93, -6
- times 8 db 77, -16
- times 8 db 50, -9
- times 8 db 36, -11
- times 8 db 12, -6
-align 16
-bilinear_filters_ssse3:
- times 8 db 128, 0
- times 8 db 120, 8
- times 8 db 112, 16
- times 8 db 104, 24
- times 8 db 96, 32
- times 8 db 88, 40
- times 8 db 80, 48
- times 8 db 72, 56
- times 8 db 64, 64
- times 8 db 56, 72
- times 8 db 48, 80
- times 8 db 40, 88
- times 8 db 32, 96
- times 8 db 24, 104
- times 8 db 16, 112
- times 8 db 8, 120
-
diff --git a/vp9/common/x86/vp9_subpixel_x86.h b/vp9/common/x86/vp9_subpixel_x86.h
deleted file mode 100644
index 25bc26d9b..000000000
--- a/vp9/common/x86/vp9_subpixel_x86.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_
-#define VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code
- */
-
-#if HAVE_MMX
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx
-
-#undef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx
-
-#undef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx
-
-#undef vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx
-
-#undef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx
-
-#endif
-#endif
-
-
-#if HAVE_SSE2
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2);
-
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2
-
-#undef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2
-
-#undef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2
-
-#undef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2
-
-#undef vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2
-
-#endif
-#endif
-
-#if HAVE_SSSE3
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3
-
-#undef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3
-
-#undef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3
-
-#undef vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3
-
-
-#undef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_ssse3
-
-#undef vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3
-
-#endif
-#endif
-
-
-
-#endif