author    John Koleszar <jkoleszar@google.com>  2013-04-12 15:33:04 -0700
committer John Koleszar <jkoleszar@google.com>  2013-04-16 06:49:46 -0700
commit    7f7d1357a2732e0a1c36f3baded7dd14f449e535 (patch)
tree      6bee68dd36c842cd700ee8f670d1380e37acd77d /vp9/common/x86
parent    282c963923eb969c146d63e934bbece433a95282 (diff)
parent    868ecb55a1528ca3f19286e7d1551572bf89b642 (diff)
Merge branch 'experimental' into master
VP9 preview bitstream 2, commit '868ecb55a1528ca3f19286e7d1551572bf89b642'

Conflicts:
    vp9/vp9_common.mk

Change-Id: I3f0f6e692c987ff24f98ceafbb86cb9cf64ad8d3
Diffstat (limited to 'vp9/common/x86')
-rw-r--r--  vp9/common/x86/vp9_asm_stubs.c                 818
-rw-r--r--  vp9/common/x86/vp9_filter_sse2.c               290
-rw-r--r--  vp9/common/x86/vp9_filter_sse4.c               362
-rw-r--r--  vp9/common/x86/vp9_idct_sse2.asm (renamed from vp9/common/x86/vp9_idctllm_sse2.asm)    0
-rw-r--r--  vp9/common/x86/vp9_idct_x86.c                 1975
-rw-r--r--  vp9/common/x86/vp9_idct_x86.h                   13
-rw-r--r--  vp9/common/x86/vp9_idctllm_mmx.asm             241
-rw-r--r--  vp9/common/x86/vp9_loopfilter_intrin_sse2.c    600
-rw-r--r--  vp9/common/x86/vp9_postproc_mmx.asm              4
-rw-r--r--  vp9/common/x86/vp9_postproc_sse2.asm             4
-rw-r--r--  vp9/common/x86/vp9_subpixel_8t_ssse3.asm       729
-rw-r--r--  vp9/common/x86/vp9_subpixel_mmx.asm            268
-rw-r--r--  vp9/common/x86/vp9_subpixel_sse2.asm          1372
-rw-r--r--  vp9/common/x86/vp9_subpixel_ssse3.asm         1515
-rw-r--r--  vp9/common/x86/vp9_subpixel_x86.h              109
15 files changed, 3161 insertions, 5139 deletions
diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c
index f09e2d78b..6d3bb021a 100644
--- a/vp9/common/x86/vp9_asm_stubs.c
+++ b/vp9/common/x86/vp9_asm_stubs.c
@@ -8,91 +8,11 @@
* be found in the AUTHORS file in the root of the source tree.
*/
+#include <assert.h>
#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
#include "vpx_ports/mem.h"
-#include "vp9/common/vp9_subpixel.h"
-
-extern const short vp9_six_tap_mmx[8][6 * 8];
-
-extern void vp9_filter_block1d_h6_mmx(unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_filter_block1dc_v6_mmx(unsigned short *src_ptr,
- unsigned char *output_ptr,
- int output_pitch,
- unsigned int pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d8_h6_sse2(unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d16_h6_sse2(unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d8_v6_sse2(unsigned short *src_ptr,
- unsigned char *output_ptr,
- int dst_pitch,
- unsigned int pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d16_v6_sse2(unsigned short *src_ptr,
- unsigned char *output_ptr,
- int dst_pitch,
- unsigned int pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- const short *vp9_filter);
-
-extern void vp9_unpack_block1d16_h6_sse2(unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int output_height,
- unsigned int output_width);
-
-extern void vp9_filter_block1d8_h6_only_sse2(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- int dst_pitch,
- unsigned int output_height,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d16_h6_only_sse2(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- int dst_pitch,
- unsigned int output_height,
- const short *vp9_filter);
-
-extern void vp9_filter_block1d8_v6_only_sse2(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- int dst_pitch,
- unsigned int output_height,
- const short *vp9_filter);
-
///////////////////////////////////////////////////////////////////////////
// the mmx function that does the bilinear filtering and var calculation //
// in one pass //
@@ -116,389 +36,7 @@ DECLARE_ALIGNED(16, const short, vp9_bilinear_filters_mmx[16][8]) = {
{ 8, 8, 8, 8, 120, 120, 120, 120 }
};
-#if HAVE_MMX
-void vp9_sixtap_predict4x4_mmx(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict4x4_mmx\n");
-#endif
- /* Temp data buffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 16 * 16);
- const short *hfilter, *vfilter;
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line), fdata2,
- src_pixels_per_line, 1, 9, 8, hfilter);
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1dc_v6_mmx(fdata2 + 8, dst_ptr, dst_pitch,
- 8, 4, 4, 4, vfilter);
-}
-
-void vp9_sixtap_predict16x16_mmx(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict16x16_mmx\n");
-#endif
- /* Temp data buffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
- const short *hfilter, *vfilter;
-
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
- fdata2, src_pixels_per_line, 1, 21, 32,
- hfilter);
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
- fdata2 + 4, src_pixels_per_line, 1, 21, 32,
- hfilter);
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 8,
- fdata2 + 8, src_pixels_per_line, 1, 21, 32,
- hfilter);
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 12,
- fdata2 + 12, src_pixels_per_line, 1, 21, 32,
- hfilter);
-
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1dc_v6_mmx(fdata2 + 32, dst_ptr, dst_pitch,
- 32, 16, 16, 16, vfilter);
- vp9_filter_block1dc_v6_mmx(fdata2 + 36, dst_ptr + 4, dst_pitch,
- 32, 16, 16, 16, vfilter);
- vp9_filter_block1dc_v6_mmx(fdata2 + 40, dst_ptr + 8, dst_pitch,
- 32, 16, 16, 16, vfilter);
- vp9_filter_block1dc_v6_mmx(fdata2 + 44, dst_ptr + 12, dst_pitch,
- 32, 16, 16, 16, vfilter);
-}
-
-void vp9_sixtap_predict8x8_mmx(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x8_mmx\n");
-#endif
- /* Temp data buffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
- const short *hfilter, *vfilter;
-
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
- fdata2, src_pixels_per_line, 1, 13, 16,
- hfilter);
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
- fdata2 + 4, src_pixels_per_line, 1, 13, 16,
- hfilter);
-
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch,
- 16, 8, 8, 8, vfilter);
- vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
- 16, 8, 8, 8, vfilter);
-}
-
-void vp9_sixtap_predict8x4_mmx(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x4_mmx\n");
-#endif
- /* Temp data buffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
- const short *hfilter, *vfilter;
-
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line),
- fdata2, src_pixels_per_line, 1, 9, 16, hfilter);
- vp9_filter_block1d_h6_mmx(src_ptr - (2 * src_pixels_per_line) + 4,
- fdata2 + 4, src_pixels_per_line, 1, 9, 16, hfilter);
-
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1dc_v6_mmx(fdata2 + 16, dst_ptr, dst_pitch,
- 16, 8, 4, 8, vfilter);
- vp9_filter_block1dc_v6_mmx(fdata2 + 20, dst_ptr + 4, dst_pitch,
- 16, 8, 4, 8, vfilter);
-}
-#endif
-
-#if HAVE_SSE2
-void vp9_sixtap_predict16x16_sse2(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- /* Temp data buffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 24 * 24);
- const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict16x16_sse2\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
- src_pixels_per_line, 1, 21, 32, hfilter);
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
- 32, 16, 16, dst_pitch, vfilter);
- } else {
- /* First-pass only */
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d16_h6_only_sse2(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 16, hfilter);
- }
- } else {
- /* Second-pass only */
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_unpack_block1d16_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
- src_pixels_per_line, 21, 32);
- vp9_filter_block1d16_v6_sse2(fdata2 + 32, dst_ptr, dst_pitch,
- 32, 16, 16, dst_pitch, vfilter);
- }
-}
-
-void vp9_sixtap_predict8x8_sse2(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- /* Temp data buffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
- const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x8_sse2\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
- src_pixels_per_line, 1, 13, 16, hfilter);
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
- 16, 8, 8, dst_pitch, vfilter);
- } else {
- /* First-pass only */
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 8, hfilter);
- }
- } else {
- /* Second-pass only */
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 8, vfilter);
- }
-}
-
-void vp9_sixtap_predict8x4_sse2(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- /* Temp data buffer used in filtering */
- DECLARE_ALIGNED_ARRAY(16, unsigned short, fdata2, 256);
- const short *hfilter, *vfilter;
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x4_sse2\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d8_h6_sse2(src_ptr - (2 * src_pixels_per_line), fdata2,
- src_pixels_per_line, 1, 9, 16, hfilter);
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1d8_v6_sse2(fdata2 + 16, dst_ptr, dst_pitch,
- 16, 8, 4, dst_pitch, vfilter);
- } else {
- /* First-pass only */
- hfilter = vp9_six_tap_mmx[xoffset];
- vp9_filter_block1d8_h6_only_sse2(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 4, hfilter);
- }
- } else {
- /* Second-pass only */
- vfilter = vp9_six_tap_mmx[yoffset];
- vp9_filter_block1d8_v6_only_sse2(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 4, vfilter);
- }
-}
-#endif
-
#if HAVE_SSSE3
-extern void vp9_filter_block1d8_h6_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-extern void vp9_filter_block1d16_h6_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-extern void vp9_filter_block1d16_v6_ssse3(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-extern void vp9_filter_block1d8_v6_ssse3(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-extern void vp9_filter_block1d4_h6_ssse3(unsigned char *src_ptr,
- unsigned int src_pixels_per_line,
- unsigned char *output_ptr,
- unsigned int output_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-extern void vp9_filter_block1d4_v6_ssse3(unsigned char *src_ptr,
- unsigned int src_pitch,
- unsigned char *output_ptr,
- unsigned int out_pitch,
- unsigned int output_height,
- unsigned int vp9_filter_index);
-
-void vp9_sixtap_predict16x16_ssse3(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 24 * 24);
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict16x16_ssse3\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- vp9_filter_block1d16_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- fdata2, 16, 21, xoffset);
- vp9_filter_block1d16_v6_ssse3(fdata2, 16, dst_ptr, dst_pitch,
- 16, yoffset);
- } else {
- /* First-pass only */
- vp9_filter_block1d16_h6_ssse3(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 16, xoffset);
- }
- } else {
- /* Second-pass only */
- vp9_filter_block1d16_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 16, yoffset);
- }
-}
-
-void vp9_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x8_ssse3\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line, fdata2, 8, 13, xoffset);
- vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 8, yoffset);
- } else {
- vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 8, xoffset);
- }
- } else {
- /* Second-pass only */
- vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 8, yoffset);
- }
-}
-
-void vp9_sixtap_predict8x4_ssse3(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 256);
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict8x4_ssse3\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- vp9_filter_block1d8_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line, fdata2, 8, 9, xoffset);
- vp9_filter_block1d8_v6_ssse3(fdata2, 8, dst_ptr, dst_pitch, 4, yoffset);
- } else {
- /* First-pass only */
- vp9_filter_block1d8_h6_ssse3(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 4, xoffset);
- }
- } else {
- /* Second-pass only */
- vp9_filter_block1d8_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 4, yoffset);
- }
-}
-
-void vp9_sixtap_predict4x4_ssse3(unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- unsigned char *dst_ptr,
- int dst_pitch) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 4 * 9);
-#ifdef ANNOUNCE_FUNCTION
- printf("vp9_sixtap_predict4x4_ssse3\n");
-#endif
-
- if (xoffset) {
- if (yoffset) {
- vp9_filter_block1d4_h6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line, fdata2, 4, 9, xoffset);
- vp9_filter_block1d4_v6_ssse3(fdata2, 4, dst_ptr, dst_pitch, 4, yoffset);
- } else {
- vp9_filter_block1d4_h6_ssse3(src_ptr, src_pixels_per_line,
- dst_ptr, dst_pitch, 4, xoffset);
- }
- } else {
- vp9_filter_block1d4_v6_ssse3(src_ptr - (2 * src_pixels_per_line),
- src_pixels_per_line,
- dst_ptr, dst_pitch, 4, yoffset);
- }
-}
-
void vp9_filter_block1d16_v8_ssse3(const unsigned char *src_ptr,
const unsigned int src_pitch,
unsigned char *output_ptr,
@@ -513,30 +51,6 @@ void vp9_filter_block1d16_h8_ssse3(const unsigned char *src_ptr,
unsigned int output_height,
const short *filter);
-void vp9_filter_block2d_16x16_8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_stride,
- const short *hfilter_aligned16,
- const short *vfilter_aligned16,
- unsigned char *dst_ptr,
- unsigned int dst_stride) {
- if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
-
- vp9_filter_block1d16_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
- fdata2, 16, 23, hfilter_aligned16);
- vp9_filter_block1d16_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 16,
- vfilter_aligned16);
- } else {
- if (hfilter_aligned16[3] != 128) {
- vp9_filter_block1d16_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride,
- 16, hfilter_aligned16);
- } else {
- vp9_filter_block1d16_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
- dst_ptr, dst_stride, 16, vfilter_aligned16);
- }
- }
-}
-
void vp9_filter_block1d8_v8_ssse3(const unsigned char *src_ptr,
const unsigned int src_pitch,
unsigned char *output_ptr,
@@ -551,51 +65,303 @@ void vp9_filter_block1d8_h8_ssse3(const unsigned char *src_ptr,
unsigned int output_height,
const short *filter);
-void vp9_filter_block2d_8x8_8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_stride,
- const short *hfilter_aligned16,
- const short *vfilter_aligned16,
- unsigned char *dst_ptr,
- unsigned int dst_stride) {
- if (hfilter_aligned16[3] != 128 && vfilter_aligned16[3] != 128) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
+void vp9_filter_block1d4_v8_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
+void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
- vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
- fdata2, 16, 15, hfilter_aligned16);
- vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 8,
- vfilter_aligned16);
- } else {
- if (hfilter_aligned16[3] != 128) {
- vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 8,
- hfilter_aligned16);
- } else {
- vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
- dst_ptr, dst_stride, 8, vfilter_aligned16);
+void vp9_filter_block1d16_v8_avg_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
+void vp9_filter_block1d16_h8_avg_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
+void vp9_filter_block1d8_v8_avg_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
+void vp9_filter_block1d8_h8_avg_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
+void vp9_filter_block1d4_v8_avg_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
+void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr,
+ const unsigned int src_pitch,
+ unsigned char *output_ptr,
+ unsigned int out_pitch,
+ unsigned int output_height,
+ const short *filter);
+
+void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (x_step_q4 == 16 && filter_x[3] != 128) {
+ while (w >= 16) {
+ vp9_filter_block1d16_h8_ssse3(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 16;
+ dst += 16;
+ w -= 16;
}
+ while (w >= 8) {
+ vp9_filter_block1d8_h8_ssse3(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ }
+ while (w >= 4) {
+ vp9_filter_block1d4_h8_ssse3(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 4;
+ dst += 4;
+ w -= 4;
+ }
+ }
+ if (w) {
+ vp9_convolve8_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ }
+}
+
+void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (y_step_q4 == 16 && filter_y[3] != 128) {
+ while (w >= 16) {
+ vp9_filter_block1d16_v8_ssse3(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 16;
+ dst += 16;
+ w -= 16;
+ }
+ while (w >= 8) {
+ vp9_filter_block1d8_v8_ssse3(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ }
+ while (w >= 4) {
+ vp9_filter_block1d4_v8_ssse3(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 4;
+ dst += 4;
+ w -= 4;
+ }
+ }
+ if (w) {
+ vp9_convolve8_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
}
}
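
The horizontal and vertical wrappers above share one dispatch idiom: when the subpixel step is exactly one full pixel per output (x/y_step_q4 == 16) and tap 3 of the 8-tap filter is not 128 (the pure-copy filter, which likely also cannot be represented in the signed-byte coefficients that pmaddubsw-style kernels use), the width is consumed in 16-, 8-, and then 4-pixel columns by the specialized kernels, and whatever remains falls through to the C reference. Below is a minimal sketch of that tiling with the kernel table abstracted behind a function-pointer type; the typedef and helper names are illustrative, not libvpx API.

#include <stdint.h>

typedef void (*col_kernel_fn)(const uint8_t *src, int src_stride,
                              uint8_t *dst, int dst_stride,
                              int h, const int16_t *filter);

/* Consume the width in 16/8/4-pixel strips; return the leftover width
 * that the caller should hand to the generic C fallback. */
static int convolve_tile_cols(const col_kernel_fn kernels[3],
                              const uint8_t **src, int src_stride,
                              uint8_t **dst, int dst_stride,
                              const int16_t *filter, int w, int h) {
  static const int widths[3] = { 16, 8, 4 };
  int i;
  for (i = 0; i < 3; ++i) {
    while (w >= widths[i]) {
      kernels[i](*src, src_stride, *dst, dst_stride, h, filter);
      *src += widths[i];
      *dst += widths[i];
      w -= widths[i];
    }
  }
  return w;
}

The avg variants further down follow the same shape with averaging kernels substituted.
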
-void vp9_filter_block2d_8x4_8_ssse3(const unsigned char *src_ptr,
- const unsigned int src_stride,
- const short *hfilter_aligned16,
- const short *vfilter_aligned16,
- unsigned char *dst_ptr,
- unsigned int dst_stride) {
- if (hfilter_aligned16[3] !=128 && vfilter_aligned16[3] != 128) {
- DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 23 * 16);
+void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (x_step_q4 == 16 && filter_x[3] != 128) {
+ while (w >= 16) {
+ vp9_filter_block1d16_h8_avg_ssse3(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 16;
+ dst += 16;
+ w -= 16;
+ }
+ while (w >= 8) {
+ vp9_filter_block1d8_h8_avg_ssse3(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ }
+ while (w >= 4) {
+ vp9_filter_block1d4_h8_avg_ssse3(src, src_stride,
+ dst, dst_stride,
+ h, filter_x);
+ src += 4;
+ dst += 4;
+ w -= 4;
+ }
+ }
+ if (w) {
+ vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ }
+}
- vp9_filter_block1d8_h8_ssse3(src_ptr - (3 * src_stride), src_stride,
- fdata2, 16, 11, hfilter_aligned16);
- vp9_filter_block1d8_v8_ssse3(fdata2, 16, dst_ptr, dst_stride, 4,
- vfilter_aligned16);
- } else {
- if (hfilter_aligned16[3] != 128) {
- vp9_filter_block1d8_h8_ssse3(src_ptr, src_stride, dst_ptr, dst_stride, 4,
- hfilter_aligned16);
- } else {
- vp9_filter_block1d8_v8_ssse3(src_ptr - (3 * src_stride), src_stride,
- dst_ptr, dst_stride, 4, vfilter_aligned16);
+void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ if (y_step_q4 == 16 && filter_y[3] != 128) {
+ while (w >= 16) {
+ vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 16;
+ dst += 16;
+ w -= 16;
+ }
+ while (w >= 8) {
+ vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 8;
+ dst += 8;
+ w -= 8;
+ }
+ while (w >= 4) {
+ vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride,
+ dst, dst_stride,
+ h, filter_y);
+ src += 4;
+ dst += 4;
+ w -= 4;
+ }
+ }
+ if (w) {
+ vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+ }
+}
+
+void vp9_convolve8_ssse3(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23);
+
+ // Check w/h due to the fixed-size fdata2 array
+ assert(w <= 16);
+ assert(h <= 16);
+
+ if (x_step_q4 == 16 && y_step_q4 == 16 &&
+ filter_x[3] != 128 && filter_y[3] != 128) {
+ if (w == 16) {
+ vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,
+ fdata2, 16,
+ h + 7, filter_x);
+ vp9_filter_block1d16_v8_ssse3(fdata2, 16,
+ dst, dst_stride,
+ h, filter_y);
+ return;
+ }
+ if (w == 8) {
+ vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,
+ fdata2, 16,
+ h + 7, filter_x);
+ vp9_filter_block1d8_v8_ssse3(fdata2, 16,
+ dst, dst_stride,
+ h, filter_y);
+ return;
+ }
+ if (w == 4) {
+ vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride,
+ fdata2, 16,
+ h + 7, filter_x);
+ vp9_filter_block1d4_v8_ssse3(fdata2, 16,
+ dst, dst_stride,
+ h, filter_y);
+ return;
+ }
+ }
+ vp9_convolve8_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
+}
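
vp9_convolve8_ssse3 above is the classic two-pass scheme: the horizontal pass starts three source rows above the block and writes h + 7 rows into the fixed fdata2 scratch (an 8-tap vertical filter needs 3 rows of context above and 4 below each output row), and the vertical pass then reduces the scratch to h output rows. A scalar sketch of the same data flow follows, assuming VP9's filter fixed point (weight 128, shift 7) and a clip helper of my own; the real work happens in the assembly kernels.

#include <assert.h>
#include <stdint.h>

static uint8_t clip8(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

static void convolve8_2pass_c(const uint8_t *src, int src_stride,
                              uint8_t *dst, int dst_stride,
                              const int16_t *fx, const int16_t *fy,
                              int w, int h) {
  uint8_t tmp[16 * 23];            /* same capacity as fdata2: w <= 16, h + 7 <= 23 */
  int x, y, k;
  assert(w <= 16 && h <= 16);
  src -= 3 * src_stride + 3;       /* 3 rows above and 3 pixels left of the block */
  for (y = 0; y < h + 7; ++y)      /* horizontal pass: h + 7 intermediate rows */
    for (x = 0; x < w; ++x) {
      int sum = 64;                /* VP9_FILTER_WEIGHT >> 1 rounding */
      for (k = 0; k < 8; ++k)
        sum += src[y * src_stride + x + k] * fx[k];
      tmp[y * 16 + x] = clip8(sum >> 7);  /* VP9_FILTER_SHIFT */
    }
  for (y = 0; y < h; ++y)          /* vertical pass over the scratch buffer */
    for (x = 0; x < w; ++x) {
      int sum = 64;
      for (k = 0; k < 8; ++k)
        sum += tmp[(y + k) * 16 + x] * fy[k];
      dst[y * dst_stride + x] = clip8(sum >> 7);
    }
}

vp9_convolve8_avg_ssse3 below is identical except that the vertical kernels average into dst instead of overwriting it.
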
+
+void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride,
+ uint8_t *dst, int dst_stride,
+ const int16_t *filter_x, int x_step_q4,
+ const int16_t *filter_y, int y_step_q4,
+ int w, int h) {
+ DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23);
+
+ // Check w/h due to the fixed-size fdata2 array
+ assert(w <= 16);
+ assert(h <= 16);
+
+ if (x_step_q4 == 16 && y_step_q4 == 16 &&
+ filter_x[3] != 128 && filter_y[3] != 128) {
+ if (w == 16) {
+ vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride,
+ fdata2, 16,
+ h + 7, filter_x);
+ vp9_filter_block1d16_v8_avg_ssse3(fdata2, 16,
+ dst, dst_stride,
+ h, filter_y);
+ return;
+ }
+ if (w == 8) {
+ vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride,
+ fdata2, 16,
+ h + 7, filter_x);
+ vp9_filter_block1d8_v8_avg_ssse3(fdata2, 16,
+ dst, dst_stride,
+ h, filter_y);
+ return;
+ }
+ if (w == 4) {
+ vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride,
+ fdata2, 16,
+ h + 7, filter_x);
+ vp9_filter_block1d4_v8_avg_ssse3(fdata2, 16,
+ dst, dst_stride,
+ h, filter_y);
+ return;
}
}
+ vp9_convolve8_avg_c(src, src_stride, dst, dst_stride,
+ filter_x, x_step_q4, filter_y, y_step_q4,
+ w, h);
}
#endif
diff --git a/vp9/common/x86/vp9_filter_sse2.c b/vp9/common/x86/vp9_filter_sse2.c
deleted file mode 100644
index 8e02ac197..000000000
--- a/vp9/common/x86/vp9_filter_sse2.c
+++ /dev/null
@@ -1,290 +0,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h> // for alignment checks
-#include <emmintrin.h> // SSE2
-#include "vp9/common/vp9_filter.h"
-#include "vpx_ports/emmintrin_compat.h"
-#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
-#include "vp9_rtcd.h"
-
-// TODO(cd): After cleanup, commit faster versions for non-4x4 sizes. This is
-//           just a quick partial snapshot so that others can already use some
-//           speedup.
-// TODO(cd): Use vectorized 8-tap filtering code as a speedup over the pure C
-//           6-tap filtering.
-// TODO(cd): Add some comments, better variable naming.
-// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coefficients (no sum
-//           of positives above 128), or have higher-precision filter
-//           coefficients.
-
-DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
- VP9_FILTER_WEIGHT >> 1,
- VP9_FILTER_WEIGHT >> 1,
- VP9_FILTER_WEIGHT >> 1,
- VP9_FILTER_WEIGHT >> 1,
-};
-
-// Creating a macro to do more than four pixels at once to hide instruction
-// latency is actually slower :-(
-#define DO_FOUR_PIXELS(result, src_ptr, offset) \
- { \
- /* Do shifted loads to achieve the required shuffles through unpacking */ \
- const __m128i src0 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \
- const __m128i src1 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \
- const __m128i src2 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \
- const __m128i src3 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \
- const __m128i src01 = _mm_unpacklo_epi8(src0, src1); \
- const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero); \
- const __m128i src23 = _mm_unpacklo_epi8(src2, src3); \
- const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero); \
- /* Shift by 4 bytes through shuffle to get additional shifted loads */ \
- const __m128i src4 = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1)); \
- const __m128i src5 = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1)); \
- const __m128i src6 = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1)); \
- const __m128i src7 = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1)); \
- const __m128i src45 = _mm_unpacklo_epi8(src4, src5); \
- const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero); \
- const __m128i src67 = _mm_unpacklo_epi8(src6, src7); \
- const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero); \
- /* multiply accumulate them */ \
- const __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \
- const __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \
- const __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \
- const __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \
- const __m128i mad0123 = _mm_add_epi32(mad01, mad23); \
- const __m128i mad4567 = _mm_add_epi32(mad45, mad67); \
- __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \
- mad_all = _mm_add_epi32(mad_all, rounding); \
- result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \
- }
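
Spelled out in scalar C, DO_FOUR_PIXELS in this removed SSE2 path computes four adjacent outputs of an 8-tap FIR: the shifted loads and unpacks pair neighbouring pixels so that each _mm_madd_epi16 accumulates two taps at a time. The equivalent arithmetic, assuming VP9_FILTER_WEIGHT == 128 and VP9_FILTER_SHIFT == 7 from vp9_filter.h, is:

/* Scalar restatement of DO_FOUR_PIXELS: result[i] is the rounded,
 * shifted 8-tap sum for output pixel i of the group of four. */
static void do_four_pixels_c(int result[4], const unsigned char *src_ptr,
                             int offset, const short filter[8]) {
  int i, k;
  for (i = 0; i < 4; ++i) {
    int sum = 128 >> 1;                 /* VP9_FILTER_WEIGHT >> 1 */
    for (k = 0; k < 8; ++k)
      sum += src_ptr[offset + i + k] * filter[k];
    result[i] = sum >> 7;               /* VP9_FILTER_SHIFT, as _mm_srai_epi32 */
  }
}
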
-
-void vp9_filter_block2d_4x4_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- __m128i intermediateA, intermediateB, intermediateC;
-
- const int kInterp_Extend = 4;
-
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
-
- // check alignment
- assert(0 == ((long)HFilter_aligned16)%16);
- assert(0 == ((long)VFilter_aligned16)%16);
-
- {
- __m128i transpose3_0;
- __m128i transpose3_1;
- __m128i transpose3_2;
- __m128i transpose3_3;
-
- // Horizontal pass (src -> intermediate).
- {
- const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
- // get first two columns filter coefficients
- __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
- __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
- __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
- __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
- src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
-
- {
- __m128i mad_all0;
- __m128i mad_all1;
- __m128i mad_all2;
- __m128i mad_all3;
- DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
- DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
- DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
- DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
- mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
- mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
- intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
- // --
- src_ptr += src_stride*4;
- // --
- DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
- DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
- DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
- DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
- mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
- mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
- intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
- // --
- src_ptr += src_stride*4;
- // --
- DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
- DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
- DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
- mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
- mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
- intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
- }
- }
-
- // Transpose result (intermediate -> transpose3_x)
- {
- // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
- // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
- // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
- const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB);
- const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB);
- const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC);
- const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC);
- // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53
- // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73
- // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx
- // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx
- const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
- const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
- const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3);
- const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3);
- // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63
- // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73
- // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx
- // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx
- const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1);
- const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1);
- const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3);
- const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3);
- // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
- // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx
- // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx
- transpose3_0 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
- _mm_castsi128_ps(transpose2_2),
- _MM_SHUFFLE(1, 0, 1, 0)));
- transpose3_1 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
- _mm_castsi128_ps(transpose2_2),
- _MM_SHUFFLE(3, 2, 3, 2)));
- transpose3_2 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
- _mm_castsi128_ps(transpose2_3),
- _MM_SHUFFLE(1, 0, 1, 0)));
- transpose3_3 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
- _mm_castsi128_ps(transpose2_3),
- _MM_SHUFFLE(3, 2, 3, 2)));
- // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
- // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
- // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
- // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
- }
-
- // Vertical pass (transpose3_x -> dst).
- {
- const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
- // get first two columns filter coefficients
- __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
- __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
- __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
- __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
- __m128i col0, col1, col2, col3;
- DECLARE_ALIGNED(16, unsigned char, temp[32]);
- {
- _mm_store_si128((__m128i *)temp, transpose3_0);
- DO_FOUR_PIXELS(col0, temp, 0);
- }
- {
- _mm_store_si128((__m128i *)temp, transpose3_1);
- DO_FOUR_PIXELS(col1, temp, 0);
- }
- {
- _mm_store_si128((__m128i *)temp, transpose3_2);
- DO_FOUR_PIXELS(col2, temp, 0);
- }
- {
- _mm_store_si128((__m128i *)temp, transpose3_3);
- DO_FOUR_PIXELS(col3, temp, 0);
- }
- // transpose
- {
- __m128i T0 = _mm_unpacklo_epi32(col0, col1);
- __m128i T1 = _mm_unpacklo_epi32(col2, col3);
- __m128i T2 = _mm_unpackhi_epi32(col0, col1);
- __m128i T3 = _mm_unpackhi_epi32(col2, col3);
- col0 = _mm_unpacklo_epi64(T0, T1);
- col1 = _mm_unpackhi_epi64(T0, T1);
- col2 = _mm_unpacklo_epi64(T2, T3);
- col3 = _mm_unpackhi_epi64(T2, T3);
- }
- // saturate to 8 bit
- {
- col0 = _mm_packs_epi32(col0, col0);
- col0 = _mm_packus_epi16(col0, col0);
- col1 = _mm_packs_epi32(col1, col1);
- col1 = _mm_packus_epi16(col1, col1);
- col2 = _mm_packs_epi32 (col2, col2);
- col2 = _mm_packus_epi16(col2, col2);
- col3 = _mm_packs_epi32 (col3, col3);
- col3 = _mm_packus_epi16(col3, col3);
- }
- // store
- {
- *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0);
- *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1);
- *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2);
- *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3);
- }
- }
- }
-}
-
-void vp9_filter_block2d_8x4_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- int j;
- for (j=0; j<8; j+=4) {
- vp9_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- dst_ptr + j, dst_stride);
- }
-}
-
-void vp9_filter_block2d_8x8_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- int i, j;
- for (i=0; i<8; i+=4) {
- for (j=0; j<8; j+=4) {
- vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- dst_ptr + j + i*dst_stride, dst_stride);
- }
- }
-}
-
-void vp9_filter_block2d_16x16_8_sse2
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- int i, j;
- for (i=0; i<16; i+=4) {
- for (j=0; j<16; j+=4) {
- vp9_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- dst_ptr + j + i*dst_stride, dst_stride);
- }
- }
-}
diff --git a/vp9/common/x86/vp9_filter_sse4.c b/vp9/common/x86/vp9_filter_sse4.c
deleted file mode 100644
index 52c35b296..000000000
--- a/vp9/common/x86/vp9_filter_sse4.c
+++ /dev/null
@@ -1,362 +0,0 @@
-/*
- * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <assert.h> // for alignment checks
-#include <smmintrin.h> // SSE4.1
-#include "vp9/common/vp9_filter.h"
-#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
-#include "vp9_rtcd.h"
-
-// TODO(cd): After cleanup, commit faster versions for non-4x4 sizes. This is
-//           just a quick partial snapshot so that others can already use some
-//           speedup.
-// TODO(cd): Use vectorized 8-tap filtering code as a speedup over the pure C
-//           6-tap filtering.
-// TODO(cd): Reduce source size by using macros instead of the current code
-//           duplication.
-// TODO(cd): Add some comments, better variable naming.
-// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coefficients (no sum
-//           of positives above 128), or have higher-precision filter
-//           coefficients.
-
-DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = {
- 0x00, 0x01,
- 0x01, 0x02,
- 0x02, 0x03,
- 0x03, 0x04,
- 0x02, 0x03,
- 0x03, 0x04,
- 0x04, 0x05,
- 0x05, 0x06,
-};
-DECLARE_ALIGNED(16, static const unsigned char, mask4567_c[16]) = {
- 0x04, 0x05,
- 0x05, 0x06,
- 0x06, 0x07,
- 0x07, 0x08,
- 0x06, 0x07,
- 0x07, 0x08,
- 0x08, 0x09,
- 0x09, 0x0A,
-};
-DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
- VP9_FILTER_WEIGHT >> 1,
- VP9_FILTER_WEIGHT >> 1,
- VP9_FILTER_WEIGHT >> 1,
- VP9_FILTER_WEIGHT >> 1,
-};
-DECLARE_ALIGNED(16, static const unsigned char, transpose_c[16]) = {
- 0, 4, 8, 12,
- 1, 5, 9, 13,
- 2, 6, 10, 14,
- 3, 7, 11, 15
-};
-
-// Creating a macro to do more than four pixels at once to hide instruction
-// latency is actually slower :-(
-#define DO_FOUR_PIXELS(result, offset) \
- { \
- /*load pixels*/ \
- __m128i src = _mm_loadu_si128((const __m128i *)(src_ptr + offset)); \
- /* extract the ones used for first column */ \
- __m128i src0123 = _mm_shuffle_epi8(src, mask0123); \
- __m128i src4567 = _mm_shuffle_epi8(src, mask4567); \
- __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero); \
- __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero); \
- __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero); \
- __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero); \
- /* multiply accumulate them */ \
- __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \
- __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \
- __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \
- __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \
- __m128i mad0123 = _mm_add_epi32(mad01, mad23); \
- __m128i mad4567 = _mm_add_epi32(mad45, mad67); \
- __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \
- mad_all = _mm_add_epi32(mad_all, rounding); \
- result = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT); \
- }
-
-void vp9_filter_block2d_4x4_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- __m128i intermediateA, intermediateB, intermediateC;
-
- const int kInterp_Extend = 4;
-
- const __m128i zero = _mm_set1_epi16(0);
- const __m128i mask0123 = _mm_load_si128((const __m128i *)mask0123_c);
- const __m128i mask4567 = _mm_load_si128((const __m128i *)mask4567_c);
- const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
- const __m128i transpose = _mm_load_si128((const __m128i *)transpose_c);
-
- // check alignment
- assert(0 == ((long)HFilter_aligned16)%16);
- assert(0 == ((long)VFilter_aligned16)%16);
-
- {
- __m128i transpose3_0;
- __m128i transpose3_1;
- __m128i transpose3_2;
- __m128i transpose3_3;
-
- // Horizontal pass (src -> intermediate).
- {
- const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
- // get first two columns filter coefficients
- __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
- __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
- __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
- __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
- src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
-
- {
- __m128i mad_all0;
- __m128i mad_all1;
- __m128i mad_all2;
- __m128i mad_all3;
- DO_FOUR_PIXELS(mad_all0, 0*src_stride)
- DO_FOUR_PIXELS(mad_all1, 1*src_stride)
- DO_FOUR_PIXELS(mad_all2, 2*src_stride)
- DO_FOUR_PIXELS(mad_all3, 3*src_stride)
- mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
- mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
- intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
- // --
- src_ptr += src_stride*4;
- // --
- DO_FOUR_PIXELS(mad_all0, 0*src_stride)
- DO_FOUR_PIXELS(mad_all1, 1*src_stride)
- DO_FOUR_PIXELS(mad_all2, 2*src_stride)
- DO_FOUR_PIXELS(mad_all3, 3*src_stride)
- mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
- mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
- intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
- // --
- src_ptr += src_stride*4;
- // --
- DO_FOUR_PIXELS(mad_all0, 0*src_stride)
- DO_FOUR_PIXELS(mad_all1, 1*src_stride)
- DO_FOUR_PIXELS(mad_all2, 2*src_stride)
- mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
- mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
- intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
- }
- }
-
- // Transpose result (intermediate -> transpose3_x)
- {
- // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
- // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
- // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
- const __m128i transpose1_0 = _mm_shuffle_epi8(intermediateA, transpose);
- const __m128i transpose1_1 = _mm_shuffle_epi8(intermediateB, transpose);
- const __m128i transpose1_2 = _mm_shuffle_epi8(intermediateC, transpose);
- // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
- // 40 50 60 70 41 51 61 71 42 52 62 72 43 53 63 73
- // 80 90 A0 xx 81 91 A1 xx 82 92 A2 xx 83 93 A3 xx
- const __m128i transpose2_0 = _mm_unpacklo_epi32(transpose1_0, transpose1_1);
- const __m128i transpose2_1 = _mm_unpackhi_epi32(transpose1_0, transpose1_1);
- // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
- // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
- transpose3_0 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
- _mm_castsi128_ps(transpose1_2),
- _MM_SHUFFLE(0, 0, 1, 0)));
- transpose3_1 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
- _mm_castsi128_ps(transpose1_2),
- _MM_SHUFFLE(1, 1, 3, 2)));
- transpose3_2 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
- _mm_castsi128_ps(transpose1_2),
- _MM_SHUFFLE(2, 2, 1, 0)));
- transpose3_3 = _mm_castps_si128(
- _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
- _mm_castsi128_ps(transpose1_2),
- _MM_SHUFFLE(3, 3, 3, 2)));
- // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
- // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
- // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
- // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
- }
-
- // Vertical pass (transpose3_x -> dst).
- {
- const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
- // get first two columns filter coefficients
- __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
- __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
- __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
- __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
- __m128i col0, col1, col2, col3;
- {
- //load pixels
- __m128i src = transpose3_0;
- // extract the ones used for first column
- __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
- __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
- __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
- __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
- __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
- __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
- // multiply accumulate them
- __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
- __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
- __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
- __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
- __m128i mad0123 = _mm_add_epi32(mad01, mad23);
- __m128i mad4567 = _mm_add_epi32(mad45, mad67);
- __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
- mad_all = _mm_add_epi32(mad_all, rounding);
- mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
- mad_all = _mm_packs_epi32(mad_all, mad_all);
- col0 = _mm_packus_epi16(mad_all, mad_all);
- }
- {
- //load pixels
- __m128i src = transpose3_1;
- // extract the ones used for first column
- __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
- __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
- __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
- __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
- __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
- __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
- // multiply accumulate them
- __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
- __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
- __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
- __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
- __m128i mad0123 = _mm_add_epi32(mad01, mad23);
- __m128i mad4567 = _mm_add_epi32(mad45, mad67);
- __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
- mad_all = _mm_add_epi32(mad_all, rounding);
- mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
- mad_all = _mm_packs_epi32(mad_all, mad_all);
- col1 = _mm_packus_epi16(mad_all, mad_all);
- }
- {
- //load pixels
- __m128i src = transpose3_2;
- // extract the ones used for first column
- __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
- __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
- __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
- __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
- __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
- __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
- // multiply accumulate them
- __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
- __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
- __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
- __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
- __m128i mad0123 = _mm_add_epi32(mad01, mad23);
- __m128i mad4567 = _mm_add_epi32(mad45, mad67);
- __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
- mad_all = _mm_add_epi32(mad_all, rounding);
- mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
- mad_all = _mm_packs_epi32(mad_all, mad_all);
- col2 = _mm_packus_epi16(mad_all, mad_all);
- }
- {
- //load pixels
- __m128i src = transpose3_3;
- // extract the ones used for first column
- __m128i src0123 = _mm_shuffle_epi8(src, mask0123);
- __m128i src4567 = _mm_shuffle_epi8(src, mask4567);
- __m128i src01_16 = _mm_unpacklo_epi8(src0123, zero);
- __m128i src23_16 = _mm_unpackhi_epi8(src0123, zero);
- __m128i src45_16 = _mm_unpacklo_epi8(src4567, zero);
- __m128i src67_16 = _mm_unpackhi_epi8(src4567, zero);
- // multiply accumulate them
- __m128i mad01 = _mm_madd_epi16(src01_16, fil01);
- __m128i mad23 = _mm_madd_epi16(src23_16, fil23);
- __m128i mad45 = _mm_madd_epi16(src45_16, fil45);
- __m128i mad67 = _mm_madd_epi16(src67_16, fil67);
- __m128i mad0123 = _mm_add_epi32(mad01, mad23);
- __m128i mad4567 = _mm_add_epi32(mad45, mad67);
- __m128i mad_all = _mm_add_epi32(mad0123, mad4567);
- mad_all = _mm_add_epi32(mad_all, rounding);
- mad_all = _mm_srai_epi32(mad_all, VP9_FILTER_SHIFT);
- mad_all = _mm_packs_epi32(mad_all, mad_all);
- col3 = _mm_packus_epi16(mad_all, mad_all);
- }
- {
- __m128i col01 = _mm_unpacklo_epi8(col0, col1);
- __m128i col23 = _mm_unpacklo_epi8(col2, col3);
- __m128i col0123 = _mm_unpacklo_epi16(col01, col23);
- // TODO(cd): look into Ronald's comment:
- // Future suggestion: I believe here, too, you can merge the
- // packs_epi32() and packus_epi16() for the 4 cols above, so that
- // you get the data in a single register, and then use pshufb
- // (shuffle_epi8()) instead of the unpacks here. Should be
- // 2+3+2 instructions faster.
- *((unsigned int *)&dst_ptr[dst_stride * 0]) =
- _mm_extract_epi32(col0123, 0);
- *((unsigned int *)&dst_ptr[dst_stride * 1]) =
- _mm_extract_epi32(col0123, 1);
- *((unsigned int *)&dst_ptr[dst_stride * 2]) =
- _mm_extract_epi32(col0123, 2);
- *((unsigned int *)&dst_ptr[dst_stride * 3]) =
- _mm_extract_epi32(col0123, 3);
- }
- }
- }
-}
-
-void vp9_filter_block2d_8x4_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- int j;
- for (j=0; j<8; j+=4) {
- vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- dst_ptr + j, dst_stride);
- }
-}
-
-void vp9_filter_block2d_8x8_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- int i, j;
- for (i=0; i<8; i+=4) {
- for (j=0; j<8; j+=4) {
- vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- dst_ptr + j + i*dst_stride, dst_stride);
- }
- }
-}
-
-void vp9_filter_block2d_16x16_8_sse4_1
-(
- const unsigned char *src_ptr, const unsigned int src_stride,
- const short *HFilter_aligned16, const short *VFilter_aligned16,
- unsigned char *dst_ptr, unsigned int dst_stride
-) {
- int i, j;
- for (i=0; i<16; i+=4) {
- for (j=0; j<16; j+=4) {
- vp9_filter_block2d_4x4_8_sse4_1(src_ptr + j + i*src_stride, src_stride,
- HFilter_aligned16, VFilter_aligned16,
- dst_ptr + j + i*dst_stride, dst_stride);
- }
- }
-}
diff --git a/vp9/common/x86/vp9_idctllm_sse2.asm b/vp9/common/x86/vp9_idct_sse2.asm
index 8f3c6dfc3..8f3c6dfc3 100644
--- a/vp9/common/x86/vp9_idctllm_sse2.asm
+++ b/vp9/common/x86/vp9_idct_sse2.asm
diff --git a/vp9/common/x86/vp9_idct_x86.c b/vp9/common/x86/vp9_idct_x86.c
new file mode 100644
index 000000000..811ed9899
--- /dev/null
+++ b/vp9/common/x86/vp9_idct_x86.c
@@ -0,0 +1,1975 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h> // SSE2
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_idct.h"
+
+#if HAVE_SSE2
+// In order to improve performance, clip absolute diff values to [0, 255],
+// which allows the additions/subtractions to stay in 8 bits.
+void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr,
+ uint8_t *dst_ptr, int pitch, int stride) {
+ int a1;
+ int16_t out;
+ uint8_t abs_diff;
+ __m128i p0, p1, p2, p3;
+ unsigned int extended_diff;
+ __m128i diff;
+
+ out = dct_const_round_shift(input_dc * cospi_16_64);
+ out = dct_const_round_shift(out * cospi_16_64);
+ a1 = ROUND_POWER_OF_TWO(out, 4);
+
+ // Read prediction data.
+ p0 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 0 * pitch));
+ p1 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 1 * pitch));
+ p2 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 2 * pitch));
+ p3 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 3 * pitch));
+
+ // Unpack prediction data, and store 4x4 array in 1 XMM register.
+ p0 = _mm_unpacklo_epi32(p0, p1);
+ p2 = _mm_unpacklo_epi32(p2, p3);
+ p0 = _mm_unpacklo_epi64(p0, p2);
+
+ // Clip dc value to [0, 255] range. Then, do addition or subtraction
+ // according to its sign.
+ if (a1 >= 0) {
+ abs_diff = (a1 > 255) ? 255 : a1;
+ extended_diff = abs_diff * 0x01010101u;
+ diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);
+
+ p1 = _mm_adds_epu8(p0, diff);
+ } else {
+ abs_diff = (a1 < -255) ? 255 : -a1;
+ extended_diff = abs_diff * 0x01010101u;
+ diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);
+
+ p1 = _mm_subs_epu8(p0, diff);
+ }
+
+ // Store results to dst.
+ *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
+ dst_ptr += stride;
+
+ p1 = _mm_srli_si128(p1, 4);
+ *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
+ dst_ptr += stride;
+
+ p1 = _mm_srli_si128(p1, 4);
+ *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
+ dst_ptr += stride;
+
+ p1 = _mm_srli_si128(p1, 4);
+ *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
+}
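
The comment at the top of the function is the whole trick: a DC-only inverse transform adds the same constant a1 to all 16 predicted pixels, so after clamping |a1| to 255 the update reduces to one splatted byte constant plus saturating byte adds or subtracts. A scalar sketch of the arithmetic being vectorized (not the libvpx C reference) looks like this:

#include <stdint.h>

static void dc_only_idct_add_scalar(int a1, const uint8_t *pred_ptr,
                                    uint8_t *dst_ptr, int pitch, int stride) {
  int r, c;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c) {
      int v = pred_ptr[r * pitch + c] + a1;  /* same DC term for every pixel */
      /* saturate to [0, 255], as _mm_adds_epu8/_mm_subs_epu8 do per lane */
      dst_ptr[r * stride + c] = (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }
}
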
+
+void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
+ (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
+ (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
+ (int16_t)cospi_8_64, (int16_t)cospi_24_64);
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ const int half_pitch = pitch >> 1;
+ __m128i input0, input1, input2, input3;
+
+ // Rows
+ input0 = _mm_loadl_epi64((__m128i *)input);
+ input1 = _mm_loadl_epi64((__m128i *)(input + 4));
+ input2 = _mm_loadl_epi64((__m128i *)(input + 8));
+ input3 = _mm_loadl_epi64((__m128i *)(input + 12));
+
+ // Construct i3, i1, i3, i1, i2, i0, i2, i0
+ input0 = _mm_shufflelo_epi16(input0, 0xd8);
+ input1 = _mm_shufflelo_epi16(input1, 0xd8);
+ input2 = _mm_shufflelo_epi16(input2, 0xd8);
+ input3 = _mm_shufflelo_epi16(input3, 0xd8);
+
+ input0 = _mm_unpacklo_epi32(input0, input0);
+ input1 = _mm_unpacklo_epi32(input1, input1);
+ input2 = _mm_unpacklo_epi32(input2, input2);
+ input3 = _mm_unpacklo_epi32(input3, input3);
+
+ // Stage 1
+ input0 = _mm_madd_epi16(input0, cst);
+ input1 = _mm_madd_epi16(input1, cst);
+ input2 = _mm_madd_epi16(input2, cst);
+ input3 = _mm_madd_epi16(input3, cst);
+
+ input0 = _mm_add_epi32(input0, rounding);
+ input1 = _mm_add_epi32(input1, rounding);
+ input2 = _mm_add_epi32(input2, rounding);
+ input3 = _mm_add_epi32(input3, rounding);
+
+ input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+ input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+ input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+ input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+ // Stage 2
+ input0 = _mm_packs_epi32(input0, zero);
+ input1 = _mm_packs_epi32(input1, zero);
+ input2 = _mm_packs_epi32(input2, zero);
+ input3 = _mm_packs_epi32(input3, zero);
+
+ // Transpose
+ input1 = _mm_unpacklo_epi16(input0, input1);
+ input3 = _mm_unpacklo_epi16(input2, input3);
+ input0 = _mm_unpacklo_epi32(input1, input3);
+ input1 = _mm_unpackhi_epi32(input1, input3);
+
+  // Swap columns 2 and 3; after the add/sub below we get:
+  // input2: column 1, column 0; input3: column 2, column 3.
+ input1 = _mm_shuffle_epi32(input1, 0x4e);
+ input2 = _mm_add_epi16(input0, input1);
+ input3 = _mm_sub_epi16(input0, input1);
+
+ // Columns
+ // Construct i3, i1, i3, i1, i2, i0, i2, i0
+ input0 = _mm_shufflelo_epi16(input2, 0xd8);
+ input1 = _mm_shufflehi_epi16(input2, 0xd8);
+ input2 = _mm_shufflehi_epi16(input3, 0xd8);
+ input3 = _mm_shufflelo_epi16(input3, 0xd8);
+
+ input0 = _mm_unpacklo_epi32(input0, input0);
+ input1 = _mm_unpackhi_epi32(input1, input1);
+ input2 = _mm_unpackhi_epi32(input2, input2);
+ input3 = _mm_unpacklo_epi32(input3, input3);
+
+ // Stage 1
+ input0 = _mm_madd_epi16(input0, cst);
+ input1 = _mm_madd_epi16(input1, cst);
+ input2 = _mm_madd_epi16(input2, cst);
+ input3 = _mm_madd_epi16(input3, cst);
+
+ input0 = _mm_add_epi32(input0, rounding);
+ input1 = _mm_add_epi32(input1, rounding);
+ input2 = _mm_add_epi32(input2, rounding);
+ input3 = _mm_add_epi32(input3, rounding);
+
+ input0 = _mm_srai_epi32(input0, DCT_CONST_BITS);
+ input1 = _mm_srai_epi32(input1, DCT_CONST_BITS);
+ input2 = _mm_srai_epi32(input2, DCT_CONST_BITS);
+ input3 = _mm_srai_epi32(input3, DCT_CONST_BITS);
+
+ // Stage 2
+ input0 = _mm_packs_epi32(input0, zero);
+ input1 = _mm_packs_epi32(input1, zero);
+ input2 = _mm_packs_epi32(input2, zero);
+ input3 = _mm_packs_epi32(input3, zero);
+
+ // Transpose
+ input1 = _mm_unpacklo_epi16(input0, input1);
+ input3 = _mm_unpacklo_epi16(input2, input3);
+ input0 = _mm_unpacklo_epi32(input1, input3);
+ input1 = _mm_unpackhi_epi32(input1, input3);
+
+  // Swap columns 2 and 3; after the add/sub below we get:
+  // input2: column 1, column 0; input3: column 2, column 3.
+ input1 = _mm_shuffle_epi32(input1, 0x4e);
+ input2 = _mm_add_epi16(input0, input1);
+ input3 = _mm_sub_epi16(input0, input1);
+
+ // Final round and shift
+ input2 = _mm_add_epi16(input2, eight);
+ input3 = _mm_add_epi16(input3, eight);
+
+ input2 = _mm_srai_epi16(input2, 4);
+ input3 = _mm_srai_epi16(input3, 4);
+
+  // Store results. input3 holds rows 3 and 2 (in that order), hence the
+  // swapped store offsets below.
+ _mm_storel_epi64((__m128i *)output, input2);
+ input2 = _mm_srli_si128(input2, 8);
+ _mm_storel_epi64((__m128i *)(output + half_pitch), input2);
+
+ _mm_storel_epi64((__m128i *)(output + 3 * half_pitch), input3);
+ input3 = _mm_srli_si128(input3, 8);
+ _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3);
+}
+
+void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i c1 = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
+ (int16_t)cospi_16_64, (int16_t)-cospi_16_64,
+ (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
+ (int16_t)cospi_8_64, (int16_t)cospi_24_64);
+ const __m128i c2 = _mm_setr_epi16(1, 1, 1, 1, 1, -1, 1, -1);
+
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+ __m128i in, temp;
+
+ // Load input data.
+ in = _mm_loadl_epi64((__m128i *)input);
+
+ // Construct i3, i1, i3, i1, i2, i0, i2, i0
+ in = _mm_shufflelo_epi16(in, 0xd8);
+ in = _mm_unpacklo_epi32(in, in);
+
+ // Stage 1
+ in = _mm_madd_epi16(in, c1);
+ in = _mm_add_epi32(in, rounding);
+ in = _mm_srai_epi32(in, DCT_CONST_BITS);
+ in = _mm_packs_epi32(in, zero);
+
+ // Stage 2
+ temp = _mm_shufflelo_epi16(in, 0x9c);
+ in = _mm_shufflelo_epi16(in, 0xc9);
+ in = _mm_unpacklo_epi64(temp, in);
+ in = _mm_madd_epi16(in, c2);
+ in = _mm_packs_epi32(in, zero);
+
+ // Store results
+ _mm_storel_epi64((__m128i *)output, in);
+}
+
+#define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7) \
+ { \
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in4, in5); \
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in6, in7); \
+ \
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); \
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); \
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); \
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); \
+ \
+ out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+ out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+ out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+ out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+ out4 = _mm_unpacklo_epi64(tr1_1, tr1_5); \
+ out5 = _mm_unpackhi_epi64(tr1_1, tr1_5); \
+ out6 = _mm_unpacklo_epi64(tr1_3, tr1_7); \
+ out7 = _mm_unpackhi_epi64(tr1_3, tr1_7); \
+ }
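+
+// The macro above is the standard unpack-based transpose for 16-bit lanes:
+// three interleave rounds at 16-, 32- and 64-bit granularity (24 unpack
+// instructions) turn eight row registers into eight column registers.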
+
+#define TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, \
+ out0, out1, out2, out3, out4, out5, out6, out7) \
+ { \
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in4, in5); \
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in6, in7); \
+ \
+ const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); \
+ const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); \
+ const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); \
+ \
+ out0 = _mm_unpacklo_epi64(tr1_0, tr1_4); \
+ out1 = _mm_unpackhi_epi64(tr1_0, tr1_4); \
+ out2 = _mm_unpacklo_epi64(tr1_2, tr1_6); \
+ out3 = _mm_unpackhi_epi64(tr1_2, tr1_6); \
+ out4 = out5 = out6 = out7 = zero; \
+ }
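+
+// Note: TRANSPOSE_4X8 transposes an 8x4 block (only the low four words of
+// each input register carry data); out4..out7 are cleared via a `zero'
+// vector that must exist in the caller's scope.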
+
+#define TRANSPOSE_8X4(in0, in1, in2, in3, out0, out1, out2, out3) \
+ { \
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in0, in1); \
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in2, in3); \
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in0, in1); \
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in2, in3); \
+ \
+ in0 = _mm_unpacklo_epi32(tr0_0, tr0_1); /* i1 i0 */ \
+ in1 = _mm_unpackhi_epi32(tr0_0, tr0_1); /* i3 i2 */ \
+ in2 = _mm_unpacklo_epi32(tr0_2, tr0_3); /* i5 i4 */ \
+ in3 = _mm_unpackhi_epi32(tr0_2, tr0_3); /* i7 i6 */ \
+ }
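+
+// Note: TRANSPOSE_8X4 writes its result back into in0..in3; the out
+// parameters are accepted for symmetry but unused, so callers must pass
+// the same registers for both.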
+
+// Macro for multiplying elements by constants and adding the products.
+#define MULTIPLICATION_AND_ADD(lo_0, hi_0, lo_1, hi_1, \
+ cst0, cst1, cst2, cst3, res0, res1, res2, res3) \
+ { \
+ tmp0 = _mm_madd_epi16(lo_0, cst0); \
+ tmp1 = _mm_madd_epi16(hi_0, cst0); \
+ tmp2 = _mm_madd_epi16(lo_0, cst1); \
+ tmp3 = _mm_madd_epi16(hi_0, cst1); \
+ tmp4 = _mm_madd_epi16(lo_1, cst2); \
+ tmp5 = _mm_madd_epi16(hi_1, cst2); \
+ tmp6 = _mm_madd_epi16(lo_1, cst3); \
+ tmp7 = _mm_madd_epi16(hi_1, cst3); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ tmp4 = _mm_add_epi32(tmp4, rounding); \
+ tmp5 = _mm_add_epi32(tmp5, rounding); \
+ tmp6 = _mm_add_epi32(tmp6, rounding); \
+ tmp7 = _mm_add_epi32(tmp7, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS); \
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS); \
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS); \
+ tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS); \
+ \
+ res0 = _mm_packs_epi32(tmp0, tmp1); \
+ res1 = _mm_packs_epi32(tmp2, tmp3); \
+ res2 = _mm_packs_epi32(tmp4, tmp5); \
+ res3 = _mm_packs_epi32(tmp6, tmp7); \
+ }
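+
+// Per 32-bit lane, each _mm_madd_epi16(lo_x, cst) above computes
+// x0 * c0 + x1 * c1 with constants in Q14, i.e. one output of the
+// butterfly rotation (x0, x1) -> (x0 * c + x1 * s); the add/shift/pack
+// sequence rounds the result back to 16 bits.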
+
+#define IDCT8x8_1D \
+ /* Stage1 */ \
+ { \
+ const __m128i lo_17 = _mm_unpacklo_epi16(in1, in7); \
+ const __m128i hi_17 = _mm_unpackhi_epi16(in1, in7); \
+ const __m128i lo_35 = _mm_unpacklo_epi16(in3, in5); \
+ const __m128i hi_35 = _mm_unpackhi_epi16(in3, in5); \
+ \
+ MULTIPLICATION_AND_ADD(lo_17, hi_17, lo_35, hi_35, stg1_0, \
+ stg1_1, stg1_2, stg1_3, stp1_4, \
+ stp1_7, stp1_5, stp1_6) \
+ } \
+ \
+ /* Stage2 */ \
+ { \
+ const __m128i lo_04 = _mm_unpacklo_epi16(in0, in4); \
+ const __m128i hi_04 = _mm_unpackhi_epi16(in0, in4); \
+ const __m128i lo_26 = _mm_unpacklo_epi16(in2, in6); \
+ const __m128i hi_26 = _mm_unpackhi_epi16(in2, in6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_04, hi_04, lo_26, hi_26, stg2_0, \
+ stg2_1, stg2_2, stg2_3, stp2_0, \
+ stp2_1, stp2_2, stp2_3) \
+ \
+ stp2_4 = _mm_adds_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_subs_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_subs_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_adds_epi16(stp1_7, stp1_6); \
+ } \
+ \
+ /* Stage3 */ \
+ { \
+ const __m128i lo_56 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_56 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ \
+ stp1_0 = _mm_adds_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_adds_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_subs_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_subs_epi16(stp2_0, stp2_3); \
+ \
+ tmp0 = _mm_madd_epi16(lo_56, stg2_1); \
+ tmp1 = _mm_madd_epi16(hi_56, stg2_1); \
+ tmp2 = _mm_madd_epi16(lo_56, stg2_0); \
+ tmp3 = _mm_madd_epi16(hi_56, stg2_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ } \
+ \
+ /* Stage4 */ \
+ in0 = _mm_adds_epi16(stp1_0, stp2_7); \
+ in1 = _mm_adds_epi16(stp1_1, stp1_6); \
+ in2 = _mm_adds_epi16(stp1_2, stp1_5); \
+ in3 = _mm_adds_epi16(stp1_3, stp2_4); \
+ in4 = _mm_subs_epi16(stp1_3, stp2_4); \
+ in5 = _mm_subs_epi16(stp1_2, stp1_5); \
+ in6 = _mm_subs_epi16(stp1_1, stp1_6); \
+ in7 = _mm_subs_epi16(stp1_0, stp2_7);
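+
+// Note: IDCT8x8_1D leaves its results in in0..in7 and expects the
+// stp1_*/stp2_*/tmp* temporaries, the stg1_*/stg2_* constants and the
+// rounding vector to be declared in the enclosing scope.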
+
+void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
+ const int half_pitch = pitch >> 1;
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i;
+
+ // Load input data.
+ in0 = _mm_load_si128((__m128i *)input);
+ in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
+ in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
+ in3 = _mm_load_si128((__m128i *)(input + 8 * 3));
+ in4 = _mm_load_si128((__m128i *)(input + 8 * 4));
+ in5 = _mm_load_si128((__m128i *)(input + 8 * 5));
+ in6 = _mm_load_si128((__m128i *)(input + 8 * 6));
+ in7 = _mm_load_si128((__m128i *)(input + 8 * 7));
+
+ // 2-D
+ for (i = 0; i < 2; i++) {
+ // 8x8 Transpose is copied from vp9_short_fdct8x8_sse2()
+ TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+
+ // 4-stage 1D idct8x8
+ IDCT8x8_1D
+ }
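+
+  // The first pass transforms rows and the second transforms columns;
+  // the transpose at the top of each pass re-orients the data so the
+  // same 1-D kernel serves both directions.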
+
+ // Final rounding and shift
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 5);
+ in1 = _mm_srai_epi16(in1, 5);
+ in2 = _mm_srai_epi16(in2, 5);
+ in3 = _mm_srai_epi16(in3, 5);
+ in4 = _mm_srai_epi16(in4, 5);
+ in5 = _mm_srai_epi16(in5, 5);
+ in6 = _mm_srai_epi16(in6, 5);
+ in7 = _mm_srai_epi16(in7, 5);
+
+ // Store results
+ _mm_store_si128((__m128i *)output, in0);
+ _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
+ _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
+ _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
+ _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
+ _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
+ _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
+ _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
+}
+
+void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
+ const int half_pitch = pitch >> 1;
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 4);
+ const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg1_2 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg2_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i stg3_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
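+  // "idct10" assumes at most ten nonzero coefficients, confined to the
+  // top-left 4x4 corner, so only the first four input rows are loaded.
+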
+ // Rows. Load 4-row input data.
+ in0 = _mm_load_si128((__m128i *)input);
+ in1 = _mm_load_si128((__m128i *)(input + 8 * 1));
+ in2 = _mm_load_si128((__m128i *)(input + 8 * 2));
+ in3 = _mm_load_si128((__m128i *)(input + 8 * 3));
+
+ // 8x4 Transpose
+ TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3)
+
+ // Stage1
+ {
+ const __m128i lo_17 = _mm_unpackhi_epi16(in0, in3);
+ const __m128i lo_35 = _mm_unpackhi_epi16(in1, in2);
+
+ tmp0 = _mm_madd_epi16(lo_17, stg1_0);
+ tmp2 = _mm_madd_epi16(lo_17, stg1_1);
+ tmp4 = _mm_madd_epi16(lo_35, stg1_2);
+ tmp6 = _mm_madd_epi16(lo_35, stg1_3);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp6 = _mm_add_epi32(tmp6, rounding);
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+ stp1_4 = _mm_packs_epi32(tmp0, zero);
+ stp1_7 = _mm_packs_epi32(tmp2, zero);
+ stp1_5 = _mm_packs_epi32(tmp4, zero);
+ stp1_6 = _mm_packs_epi32(tmp6, zero);
+ }
+
+ // Stage2
+ {
+ const __m128i lo_04 = _mm_unpacklo_epi16(in0, in2);
+ const __m128i lo_26 = _mm_unpacklo_epi16(in1, in3);
+
+ tmp0 = _mm_madd_epi16(lo_04, stg2_0);
+ tmp2 = _mm_madd_epi16(lo_04, stg2_1);
+ tmp4 = _mm_madd_epi16(lo_26, stg2_2);
+ tmp6 = _mm_madd_epi16(lo_26, stg2_3);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp6 = _mm_add_epi32(tmp6, rounding);
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+ stp2_0 = _mm_packs_epi32(tmp0, zero);
+ stp2_1 = _mm_packs_epi32(tmp2, zero);
+ stp2_2 = _mm_packs_epi32(tmp4, zero);
+ stp2_3 = _mm_packs_epi32(tmp6, zero);
+
+ stp2_4 = _mm_adds_epi16(stp1_4, stp1_5);
+ stp2_5 = _mm_subs_epi16(stp1_4, stp1_5);
+ stp2_6 = _mm_subs_epi16(stp1_7, stp1_6);
+ stp2_7 = _mm_adds_epi16(stp1_7, stp1_6);
+ }
+
+ // Stage3
+ {
+ const __m128i lo_56 = _mm_unpacklo_epi16(stp2_5, stp2_6);
+ stp1_0 = _mm_adds_epi16(stp2_0, stp2_3);
+ stp1_1 = _mm_adds_epi16(stp2_1, stp2_2);
+ stp1_2 = _mm_subs_epi16(stp2_1, stp2_2);
+ stp1_3 = _mm_subs_epi16(stp2_0, stp2_3);
+
+ tmp0 = _mm_madd_epi16(lo_56, stg3_0);
+ tmp2 = _mm_madd_epi16(lo_56, stg2_0); // stg3_1 = stg2_0
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+
+ stp1_5 = _mm_packs_epi32(tmp0, zero);
+ stp1_6 = _mm_packs_epi32(tmp2, zero);
+ }
+
+ // Stage4
+ in0 = _mm_adds_epi16(stp1_0, stp2_7);
+ in1 = _mm_adds_epi16(stp1_1, stp1_6);
+ in2 = _mm_adds_epi16(stp1_2, stp1_5);
+ in3 = _mm_adds_epi16(stp1_3, stp2_4);
+ in4 = _mm_subs_epi16(stp1_3, stp2_4);
+ in5 = _mm_subs_epi16(stp1_2, stp1_5);
+ in6 = _mm_subs_epi16(stp1_1, stp1_6);
+ in7 = _mm_subs_epi16(stp1_0, stp2_7);
+
+ // Columns. 4x8 Transpose
+ TRANSPOSE_4X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7)
+
+ // 1D idct8x8
+ IDCT8x8_1D
+
+ // Final rounding and shift
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 5);
+ in1 = _mm_srai_epi16(in1, 5);
+ in2 = _mm_srai_epi16(in2, 5);
+ in3 = _mm_srai_epi16(in3, 5);
+ in4 = _mm_srai_epi16(in4, 5);
+ in5 = _mm_srai_epi16(in5, 5);
+ in6 = _mm_srai_epi16(in6, 5);
+ in7 = _mm_srai_epi16(in7, 5);
+
+ // Store results
+ _mm_store_si128((__m128i *)output, in0);
+ _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
+ _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
+ _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
+ _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
+ _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
+ _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
+ _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
+}
+
+#define IDCT16x16_1D \
+ /* Stage2 */ \
+ { \
+ const __m128i lo_1_15 = _mm_unpacklo_epi16(in1, in15); \
+ const __m128i hi_1_15 = _mm_unpackhi_epi16(in1, in15); \
+ const __m128i lo_9_7 = _mm_unpacklo_epi16(in9, in7); \
+ const __m128i hi_9_7 = _mm_unpackhi_epi16(in9, in7); \
+ const __m128i lo_5_11 = _mm_unpacklo_epi16(in5, in11); \
+ const __m128i hi_5_11 = _mm_unpackhi_epi16(in5, in11); \
+ const __m128i lo_13_3 = _mm_unpacklo_epi16(in13, in3); \
+ const __m128i hi_13_3 = _mm_unpackhi_epi16(in13, in3); \
+ \
+ MULTIPLICATION_AND_ADD(lo_1_15, hi_1_15, lo_9_7, hi_9_7, \
+ stg2_0, stg2_1, stg2_2, stg2_3, \
+ stp2_8, stp2_15, stp2_9, stp2_14) \
+ \
+ MULTIPLICATION_AND_ADD(lo_5_11, hi_5_11, lo_13_3, hi_13_3, \
+ stg2_4, stg2_5, stg2_6, stg2_7, \
+ stp2_10, stp2_13, stp2_11, stp2_12) \
+ } \
+ \
+ /* Stage3 */ \
+ { \
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(in2, in14); \
+ const __m128i hi_2_14 = _mm_unpackhi_epi16(in2, in14); \
+ const __m128i lo_10_6 = _mm_unpacklo_epi16(in10, in6); \
+ const __m128i hi_10_6 = _mm_unpackhi_epi16(in10, in6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_2_14, hi_2_14, lo_10_6, hi_10_6, \
+ stg3_0, stg3_1, stg3_2, stg3_3, \
+ stp1_4, stp1_7, stp1_5, stp1_6) \
+ \
+ stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9); \
+ stp1_9 = _mm_sub_epi16(stp2_8, stp2_9); \
+ stp1_10 = _mm_sub_epi16(stp2_11, stp2_10); \
+ stp1_11 = _mm_add_epi16(stp2_11, stp2_10); \
+ \
+ stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13); \
+ stp1_13 = _mm_sub_epi16(stp2_12, stp2_13); \
+ stp1_14 = _mm_sub_epi16(stp2_15, stp2_14); \
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_14); \
+ } \
+ \
+ /* Stage4 */ \
+ { \
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8); \
+ const __m128i hi_0_8 = _mm_unpackhi_epi16(in0, in8); \
+ const __m128i lo_4_12 = _mm_unpacklo_epi16(in4, in12); \
+ const __m128i hi_4_12 = _mm_unpackhi_epi16(in4, in12); \
+ \
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14); \
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14); \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ \
+ MULTIPLICATION_AND_ADD(lo_0_8, hi_0_8, lo_4_12, hi_4_12, \
+ stg4_0, stg4_1, stg4_2, stg4_3, \
+ stp2_0, stp2_1, stp2_2, stp2_3) \
+ \
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5); \
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6); \
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6); \
+ \
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, \
+ stg4_4, stg4_5, stg4_6, stg4_7, \
+ stp2_9, stp2_14, stp2_10, stp2_13) \
+ } \
+ \
+ /* Stage5 */ \
+ { \
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5); \
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5); \
+ \
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3); \
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2); \
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2); \
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3); \
+ \
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1); \
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1); \
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0); \
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0); \
+ \
+ tmp0 = _mm_add_epi32(tmp0, rounding); \
+ tmp1 = _mm_add_epi32(tmp1, rounding); \
+ tmp2 = _mm_add_epi32(tmp2, rounding); \
+ tmp3 = _mm_add_epi32(tmp3, rounding); \
+ \
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS); \
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS); \
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS); \
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS); \
+ \
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1); \
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3); \
+ \
+ stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11); \
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10); \
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10); \
+ stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11); \
+ \
+ stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0); \
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13); \
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13); \
+ stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0); \
+ } \
+ \
+ /* Stage6 */ \
+ { \
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13); \
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13); \
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12); \
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12); \
+ \
+ stp2_0 = _mm_add_epi16(stp1_0, stp2_7); \
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6); \
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5); \
+ stp2_3 = _mm_add_epi16(stp1_3, stp2_4); \
+ stp2_4 = _mm_sub_epi16(stp1_3, stp2_4); \
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5); \
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6); \
+ stp2_7 = _mm_sub_epi16(stp1_0, stp2_7); \
+ \
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12, \
+ stg6_0, stg4_0, stg6_0, stg4_0, \
+ stp2_10, stp2_13, stp2_11, stp2_12) \
+ }
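+
+// IDCT16x16_1D covers stages 2-6 of the 16-point idct; the stage-7
+// adds/subs differ between passes (and carry the final rounding on the
+// last pass), so they are left to the caller.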
+
+void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
+ const int half_pitch = pitch >> 1;
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ const __m128i zero = _mm_setzero_si128();
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
+ in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
+ in10 = zero, in11 = zero, in12 = zero, in13 = zero,
+ in14 = zero, in15 = zero;
+ __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
+ l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
+ l12 = zero, l13 = zero, l14 = zero, l15 = zero;
+ __m128i r0 = zero, r1 = zero, r2 = zero, r3 = zero, r4 = zero, r5 = zero,
+ r6 = zero, r7 = zero, r8 = zero, r9 = zero, r10 = zero, r11 = zero,
+ r12 = zero, r13 = zero, r14 = zero, r15 = zero;
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_8_0, stp1_12_0;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i;
+
+  // We work on an 8x16 block each time, and loop four times for the
+  // 2-D 16x16 idct.
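+  // Pass schedule: i = 0/1 run the row transform on the top and bottom
+  // input halves (results kept in l0..l15 / r0..r15); i = 2/3 run the
+  // column transform on the left and right halves and store the output.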
+ for (i = 0; i < 4; i++) {
+ // 1-D idct
+ if (i < 2) {
+ if (i == 1) input += 128;
+
+ // Load input data.
+ in0 = _mm_load_si128((__m128i *)input);
+ in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
+ in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
+ in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
+ in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
+ in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
+ in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
+ in11 = _mm_load_si128((__m128i *)(input + 8 * 7));
+ in4 = _mm_load_si128((__m128i *)(input + 8 * 8));
+ in12 = _mm_load_si128((__m128i *)(input + 8 * 9));
+ in5 = _mm_load_si128((__m128i *)(input + 8 * 10));
+ in13 = _mm_load_si128((__m128i *)(input + 8 * 11));
+ in6 = _mm_load_si128((__m128i *)(input + 8 * 12));
+ in14 = _mm_load_si128((__m128i *)(input + 8 * 13));
+ in7 = _mm_load_si128((__m128i *)(input + 8 * 14));
+ in15 = _mm_load_si128((__m128i *)(input + 8 * 15));
+
+ TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ }
+
+ if (i == 2) {
+ TRANSPOSE_8X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ TRANSPOSE_8X8(r0, r1, r2, r3, r4, r5, r6, r7, in8, in9, in10, in11, in12,
+ in13, in14, in15);
+ }
+
+ if (i == 3) {
+ TRANSPOSE_8X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE_8X8(r8, r9, r10, r11, r12, r13, r14, r15, in8, in9, in10, in11,
+ in12, in13, in14, in15);
+ }
+
+ IDCT16x16_1D
+
+ // Stage7
+ if (i == 0) {
+ // Left 8x16
+ l0 = _mm_add_epi16(stp2_0, stp1_15);
+ l1 = _mm_add_epi16(stp2_1, stp1_14);
+ l2 = _mm_add_epi16(stp2_2, stp2_13);
+ l3 = _mm_add_epi16(stp2_3, stp2_12);
+ l4 = _mm_add_epi16(stp2_4, stp2_11);
+ l5 = _mm_add_epi16(stp2_5, stp2_10);
+ l6 = _mm_add_epi16(stp2_6, stp1_9);
+ l7 = _mm_add_epi16(stp2_7, stp1_8);
+ l8 = _mm_sub_epi16(stp2_7, stp1_8);
+ l9 = _mm_sub_epi16(stp2_6, stp1_9);
+ l10 = _mm_sub_epi16(stp2_5, stp2_10);
+ l11 = _mm_sub_epi16(stp2_4, stp2_11);
+ l12 = _mm_sub_epi16(stp2_3, stp2_12);
+ l13 = _mm_sub_epi16(stp2_2, stp2_13);
+ l14 = _mm_sub_epi16(stp2_1, stp1_14);
+ l15 = _mm_sub_epi16(stp2_0, stp1_15);
+ } else if (i == 1) {
+ // Right 8x16
+ r0 = _mm_add_epi16(stp2_0, stp1_15);
+ r1 = _mm_add_epi16(stp2_1, stp1_14);
+ r2 = _mm_add_epi16(stp2_2, stp2_13);
+ r3 = _mm_add_epi16(stp2_3, stp2_12);
+ r4 = _mm_add_epi16(stp2_4, stp2_11);
+ r5 = _mm_add_epi16(stp2_5, stp2_10);
+ r6 = _mm_add_epi16(stp2_6, stp1_9);
+ r7 = _mm_add_epi16(stp2_7, stp1_8);
+ r8 = _mm_sub_epi16(stp2_7, stp1_8);
+ r9 = _mm_sub_epi16(stp2_6, stp1_9);
+ r10 = _mm_sub_epi16(stp2_5, stp2_10);
+ r11 = _mm_sub_epi16(stp2_4, stp2_11);
+ r12 = _mm_sub_epi16(stp2_3, stp2_12);
+ r13 = _mm_sub_epi16(stp2_2, stp2_13);
+ r14 = _mm_sub_epi16(stp2_1, stp1_14);
+ r15 = _mm_sub_epi16(stp2_0, stp1_15);
+ } else {
+ // 2-D
+ in0 = _mm_add_epi16(stp2_0, stp1_15);
+ in1 = _mm_add_epi16(stp2_1, stp1_14);
+ in2 = _mm_add_epi16(stp2_2, stp2_13);
+ in3 = _mm_add_epi16(stp2_3, stp2_12);
+ in4 = _mm_add_epi16(stp2_4, stp2_11);
+ in5 = _mm_add_epi16(stp2_5, stp2_10);
+ in6 = _mm_add_epi16(stp2_6, stp1_9);
+ in7 = _mm_add_epi16(stp2_7, stp1_8);
+ in8 = _mm_sub_epi16(stp2_7, stp1_8);
+ in9 = _mm_sub_epi16(stp2_6, stp1_9);
+ in10 = _mm_sub_epi16(stp2_5, stp2_10);
+ in11 = _mm_sub_epi16(stp2_4, stp2_11);
+ in12 = _mm_sub_epi16(stp2_3, stp2_12);
+ in13 = _mm_sub_epi16(stp2_2, stp2_13);
+ in14 = _mm_sub_epi16(stp2_1, stp1_14);
+ in15 = _mm_sub_epi16(stp2_0, stp1_15);
+
+ // Final rounding and shift
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+ in8 = _mm_adds_epi16(in8, final_rounding);
+ in9 = _mm_adds_epi16(in9, final_rounding);
+ in10 = _mm_adds_epi16(in10, final_rounding);
+ in11 = _mm_adds_epi16(in11, final_rounding);
+ in12 = _mm_adds_epi16(in12, final_rounding);
+ in13 = _mm_adds_epi16(in13, final_rounding);
+ in14 = _mm_adds_epi16(in14, final_rounding);
+ in15 = _mm_adds_epi16(in15, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 6);
+ in1 = _mm_srai_epi16(in1, 6);
+ in2 = _mm_srai_epi16(in2, 6);
+ in3 = _mm_srai_epi16(in3, 6);
+ in4 = _mm_srai_epi16(in4, 6);
+ in5 = _mm_srai_epi16(in5, 6);
+ in6 = _mm_srai_epi16(in6, 6);
+ in7 = _mm_srai_epi16(in7, 6);
+ in8 = _mm_srai_epi16(in8, 6);
+ in9 = _mm_srai_epi16(in9, 6);
+ in10 = _mm_srai_epi16(in10, 6);
+ in11 = _mm_srai_epi16(in11, 6);
+ in12 = _mm_srai_epi16(in12, 6);
+ in13 = _mm_srai_epi16(in13, 6);
+ in14 = _mm_srai_epi16(in14, 6);
+ in15 = _mm_srai_epi16(in15, 6);
+
+ // Store results
+ _mm_store_si128((__m128i *)output, in0);
+ _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
+ _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
+ _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
+ _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
+ _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
+ _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
+ _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
+ _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
+ _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
+ _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
+ _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
+ _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
+ _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
+ _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
+ _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
+
+ output += 8;
+ }
+ }
+}
+
+void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
+ const int half_pitch = pitch >> 1;
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+ const __m128i zero = _mm_setzero_si128();
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+ const __m128i stg4_7 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in0 = zero, in1 = zero, in2 = zero, in3 = zero, in4 = zero,
+ in5 = zero, in6 = zero, in7 = zero, in8 = zero, in9 = zero,
+ in10 = zero, in11 = zero, in12 = zero, in13 = zero,
+ in14 = zero, in15 = zero;
+ __m128i l0 = zero, l1 = zero, l2 = zero, l3 = zero, l4 = zero, l5 = zero,
+ l6 = zero, l7 = zero, l8 = zero, l9 = zero, l10 = zero, l11 = zero,
+ l12 = zero, l13 = zero, l14 = zero, l15 = zero;
+
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_8_0, stp1_12_0;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i;
+
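+  // "idct10": at most ten nonzero coefficients, confined to the top-left
+  // corner, so only the first four rows of the 16x16 input are read and
+  // a single row pass suffices.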
+ // 1-D idct. Load input data.
+ in0 = _mm_load_si128((__m128i *)input);
+ in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
+ in1 = _mm_load_si128((__m128i *)(input + 8 * 2));
+ in9 = _mm_load_si128((__m128i *)(input + 8 * 3));
+ in2 = _mm_load_si128((__m128i *)(input + 8 * 4));
+ in10 = _mm_load_si128((__m128i *)(input + 8 * 5));
+ in3 = _mm_load_si128((__m128i *)(input + 8 * 6));
+ in11 = _mm_load_si128((__m128i *)(input + 8 * 7));
+
+ TRANSPOSE_8X4(in0, in1, in2, in3, in0, in1, in2, in3);
+ TRANSPOSE_8X4(in8, in9, in10, in11, in8, in9, in10, in11);
+
+ // Stage2
+ {
+ const __m128i lo_1_15 = _mm_unpackhi_epi16(in0, in11);
+ const __m128i lo_9_7 = _mm_unpackhi_epi16(in8, in3);
+ const __m128i lo_5_11 = _mm_unpackhi_epi16(in2, in9);
+ const __m128i lo_13_3 = _mm_unpackhi_epi16(in10, in1);
+
+ tmp0 = _mm_madd_epi16(lo_1_15, stg2_0);
+ tmp2 = _mm_madd_epi16(lo_1_15, stg2_1);
+ tmp4 = _mm_madd_epi16(lo_9_7, stg2_2);
+ tmp6 = _mm_madd_epi16(lo_9_7, stg2_3);
+ tmp1 = _mm_madd_epi16(lo_5_11, stg2_4);
+ tmp3 = _mm_madd_epi16(lo_5_11, stg2_5);
+ tmp5 = _mm_madd_epi16(lo_13_3, stg2_6);
+ tmp7 = _mm_madd_epi16(lo_13_3, stg2_7);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp6 = _mm_add_epi32(tmp6, rounding);
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp5 = _mm_add_epi32(tmp5, rounding);
+ tmp7 = _mm_add_epi32(tmp7, rounding);
+
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+ tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
+
+ stp2_8 = _mm_packs_epi32(tmp0, zero);
+ stp2_15 = _mm_packs_epi32(tmp2, zero);
+ stp2_9 = _mm_packs_epi32(tmp4, zero);
+ stp2_14 = _mm_packs_epi32(tmp6, zero);
+
+ stp2_10 = _mm_packs_epi32(tmp1, zero);
+ stp2_13 = _mm_packs_epi32(tmp3, zero);
+ stp2_11 = _mm_packs_epi32(tmp5, zero);
+ stp2_12 = _mm_packs_epi32(tmp7, zero);
+ }
+
+ // Stage3
+ {
+ const __m128i lo_2_14 = _mm_unpacklo_epi16(in1, in11);
+ const __m128i lo_10_6 = _mm_unpacklo_epi16(in9, in3);
+
+ tmp0 = _mm_madd_epi16(lo_2_14, stg3_0);
+ tmp2 = _mm_madd_epi16(lo_2_14, stg3_1);
+ tmp4 = _mm_madd_epi16(lo_10_6, stg3_2);
+ tmp6 = _mm_madd_epi16(lo_10_6, stg3_3);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp6 = _mm_add_epi32(tmp6, rounding);
+
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+ stp1_4 = _mm_packs_epi32(tmp0, zero);
+ stp1_7 = _mm_packs_epi32(tmp2, zero);
+ stp1_5 = _mm_packs_epi32(tmp4, zero);
+ stp1_6 = _mm_packs_epi32(tmp6, zero);
+
+ stp1_8_0 = _mm_add_epi16(stp2_8, stp2_9);
+ stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
+ stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
+ stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
+
+ stp1_12_0 = _mm_add_epi16(stp2_12, stp2_13);
+ stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
+ stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
+ }
+
+ // Stage4
+ {
+ const __m128i lo_0_8 = _mm_unpacklo_epi16(in0, in8);
+ const __m128i lo_4_12 = _mm_unpacklo_epi16(in2, in10);
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+
+ tmp0 = _mm_madd_epi16(lo_0_8, stg4_0);
+ tmp2 = _mm_madd_epi16(lo_0_8, stg4_1);
+ tmp4 = _mm_madd_epi16(lo_4_12, stg4_2);
+ tmp6 = _mm_madd_epi16(lo_4_12, stg4_3);
+ tmp1 = _mm_madd_epi16(lo_9_14, stg4_4);
+ tmp3 = _mm_madd_epi16(lo_9_14, stg4_5);
+ tmp5 = _mm_madd_epi16(lo_10_13, stg4_6);
+ tmp7 = _mm_madd_epi16(lo_10_13, stg4_7);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp6 = _mm_add_epi32(tmp6, rounding);
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp5 = _mm_add_epi32(tmp5, rounding);
+ tmp7 = _mm_add_epi32(tmp7, rounding);
+
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ tmp5 = _mm_srai_epi32(tmp5, DCT_CONST_BITS);
+ tmp7 = _mm_srai_epi32(tmp7, DCT_CONST_BITS);
+
+ stp2_0 = _mm_packs_epi32(tmp0, zero);
+ stp2_1 = _mm_packs_epi32(tmp2, zero);
+ stp2_2 = _mm_packs_epi32(tmp4, zero);
+ stp2_3 = _mm_packs_epi32(tmp6, zero);
+ stp2_9 = _mm_packs_epi32(tmp1, zero);
+ stp2_14 = _mm_packs_epi32(tmp3, zero);
+ stp2_10 = _mm_packs_epi32(tmp5, zero);
+ stp2_13 = _mm_packs_epi32(tmp7, zero);
+
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
+ }
+
+ // Stage5 and Stage6
+ {
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
+
+ stp1_8 = _mm_add_epi16(stp1_8_0, stp1_11);
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
+ stp1_11 = _mm_sub_epi16(stp1_8_0, stp1_11);
+
+ stp1_12 = _mm_sub_epi16(stp1_15, stp1_12_0);
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
+ stp1_15 = _mm_add_epi16(stp1_15, stp1_12_0);
+ }
+
+ // Stage6
+ {
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
+
+ tmp1 = _mm_madd_epi16(lo_6_5, stg4_1);
+ tmp3 = _mm_madd_epi16(lo_6_5, stg4_0);
+ tmp0 = _mm_madd_epi16(lo_10_13, stg6_0);
+ tmp2 = _mm_madd_epi16(lo_10_13, stg4_0);
+ tmp4 = _mm_madd_epi16(lo_11_12, stg6_0);
+ tmp6 = _mm_madd_epi16(lo_11_12, stg4_0);
+
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp4 = _mm_add_epi32(tmp4, rounding);
+ tmp6 = _mm_add_epi32(tmp6, rounding);
+
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp4 = _mm_srai_epi32(tmp4, DCT_CONST_BITS);
+ tmp6 = _mm_srai_epi32(tmp6, DCT_CONST_BITS);
+
+ stp1_5 = _mm_packs_epi32(tmp1, zero);
+ stp1_6 = _mm_packs_epi32(tmp3, zero);
+ stp2_10 = _mm_packs_epi32(tmp0, zero);
+ stp2_13 = _mm_packs_epi32(tmp2, zero);
+ stp2_11 = _mm_packs_epi32(tmp4, zero);
+ stp2_12 = _mm_packs_epi32(tmp6, zero);
+
+ stp2_0 = _mm_add_epi16(stp1_0, stp2_7);
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
+ stp2_3 = _mm_add_epi16(stp1_3, stp2_4);
+ stp2_4 = _mm_sub_epi16(stp1_3, stp2_4);
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
+ stp2_7 = _mm_sub_epi16(stp1_0, stp2_7);
+ }
+
+ // Stage7. Left 8x16 only.
+ l0 = _mm_add_epi16(stp2_0, stp1_15);
+ l1 = _mm_add_epi16(stp2_1, stp1_14);
+ l2 = _mm_add_epi16(stp2_2, stp2_13);
+ l3 = _mm_add_epi16(stp2_3, stp2_12);
+ l4 = _mm_add_epi16(stp2_4, stp2_11);
+ l5 = _mm_add_epi16(stp2_5, stp2_10);
+ l6 = _mm_add_epi16(stp2_6, stp1_9);
+ l7 = _mm_add_epi16(stp2_7, stp1_8);
+ l8 = _mm_sub_epi16(stp2_7, stp1_8);
+ l9 = _mm_sub_epi16(stp2_6, stp1_9);
+ l10 = _mm_sub_epi16(stp2_5, stp2_10);
+ l11 = _mm_sub_epi16(stp2_4, stp2_11);
+ l12 = _mm_sub_epi16(stp2_3, stp2_12);
+ l13 = _mm_sub_epi16(stp2_2, stp2_13);
+ l14 = _mm_sub_epi16(stp2_1, stp1_14);
+ l15 = _mm_sub_epi16(stp2_0, stp1_15);
+
+  // 2-D idct. We process two 8x16 blocks.
+ for (i = 0; i < 2; i++) {
+ if (i == 0)
+ TRANSPOSE_4X8(l0, l1, l2, l3, l4, l5, l6, l7, in0, in1, in2, in3, in4,
+ in5, in6, in7);
+
+ if (i == 1)
+ TRANSPOSE_4X8(l8, l9, l10, l11, l12, l13, l14, l15, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+
+ in8 = in9 = in10 = in11 = in12 = in13 = in14 = in15 = zero;
+
+ IDCT16x16_1D
+
+ // Stage7
+ in0 = _mm_add_epi16(stp2_0, stp1_15);
+ in1 = _mm_add_epi16(stp2_1, stp1_14);
+ in2 = _mm_add_epi16(stp2_2, stp2_13);
+ in3 = _mm_add_epi16(stp2_3, stp2_12);
+ in4 = _mm_add_epi16(stp2_4, stp2_11);
+ in5 = _mm_add_epi16(stp2_5, stp2_10);
+ in6 = _mm_add_epi16(stp2_6, stp1_9);
+ in7 = _mm_add_epi16(stp2_7, stp1_8);
+ in8 = _mm_sub_epi16(stp2_7, stp1_8);
+ in9 = _mm_sub_epi16(stp2_6, stp1_9);
+ in10 = _mm_sub_epi16(stp2_5, stp2_10);
+ in11 = _mm_sub_epi16(stp2_4, stp2_11);
+ in12 = _mm_sub_epi16(stp2_3, stp2_12);
+ in13 = _mm_sub_epi16(stp2_2, stp2_13);
+ in14 = _mm_sub_epi16(stp2_1, stp1_14);
+ in15 = _mm_sub_epi16(stp2_0, stp1_15);
+
+ // Final rounding and shift
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+ in8 = _mm_adds_epi16(in8, final_rounding);
+ in9 = _mm_adds_epi16(in9, final_rounding);
+ in10 = _mm_adds_epi16(in10, final_rounding);
+ in11 = _mm_adds_epi16(in11, final_rounding);
+ in12 = _mm_adds_epi16(in12, final_rounding);
+ in13 = _mm_adds_epi16(in13, final_rounding);
+ in14 = _mm_adds_epi16(in14, final_rounding);
+ in15 = _mm_adds_epi16(in15, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 6);
+ in1 = _mm_srai_epi16(in1, 6);
+ in2 = _mm_srai_epi16(in2, 6);
+ in3 = _mm_srai_epi16(in3, 6);
+ in4 = _mm_srai_epi16(in4, 6);
+ in5 = _mm_srai_epi16(in5, 6);
+ in6 = _mm_srai_epi16(in6, 6);
+ in7 = _mm_srai_epi16(in7, 6);
+ in8 = _mm_srai_epi16(in8, 6);
+ in9 = _mm_srai_epi16(in9, 6);
+ in10 = _mm_srai_epi16(in10, 6);
+ in11 = _mm_srai_epi16(in11, 6);
+ in12 = _mm_srai_epi16(in12, 6);
+ in13 = _mm_srai_epi16(in13, 6);
+ in14 = _mm_srai_epi16(in14, 6);
+ in15 = _mm_srai_epi16(in15, 6);
+
+ // Store results
+ _mm_store_si128((__m128i *)output, in0);
+ _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
+ _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
+ _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
+ _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
+ _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
+ _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
+ _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
+ _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
+ _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
+ _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
+ _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
+ _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
+ _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
+ _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
+ _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
+ output += 8;
+ }
+}
+
+void vp9_short_idct32x32_sse2(int16_t *input, int16_t *output, int pitch) {
+ const int half_pitch = pitch >> 1;
+ const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
+  const __m128i final_rounding = _mm_set1_epi16(1 << 5);
+
+ // idct constants for each stage
+ const __m128i stg1_0 = pair_set_epi16(cospi_31_64, -cospi_1_64);
+ const __m128i stg1_1 = pair_set_epi16(cospi_1_64, cospi_31_64);
+ const __m128i stg1_2 = pair_set_epi16(cospi_15_64, -cospi_17_64);
+ const __m128i stg1_3 = pair_set_epi16(cospi_17_64, cospi_15_64);
+ const __m128i stg1_4 = pair_set_epi16(cospi_23_64, -cospi_9_64);
+ const __m128i stg1_5 = pair_set_epi16(cospi_9_64, cospi_23_64);
+ const __m128i stg1_6 = pair_set_epi16(cospi_7_64, -cospi_25_64);
+ const __m128i stg1_7 = pair_set_epi16(cospi_25_64, cospi_7_64);
+ const __m128i stg1_8 = pair_set_epi16(cospi_27_64, -cospi_5_64);
+ const __m128i stg1_9 = pair_set_epi16(cospi_5_64, cospi_27_64);
+ const __m128i stg1_10 = pair_set_epi16(cospi_11_64, -cospi_21_64);
+ const __m128i stg1_11 = pair_set_epi16(cospi_21_64, cospi_11_64);
+ const __m128i stg1_12 = pair_set_epi16(cospi_19_64, -cospi_13_64);
+ const __m128i stg1_13 = pair_set_epi16(cospi_13_64, cospi_19_64);
+ const __m128i stg1_14 = pair_set_epi16(cospi_3_64, -cospi_29_64);
+ const __m128i stg1_15 = pair_set_epi16(cospi_29_64, cospi_3_64);
+
+ const __m128i stg2_0 = pair_set_epi16(cospi_30_64, -cospi_2_64);
+ const __m128i stg2_1 = pair_set_epi16(cospi_2_64, cospi_30_64);
+ const __m128i stg2_2 = pair_set_epi16(cospi_14_64, -cospi_18_64);
+ const __m128i stg2_3 = pair_set_epi16(cospi_18_64, cospi_14_64);
+ const __m128i stg2_4 = pair_set_epi16(cospi_22_64, -cospi_10_64);
+ const __m128i stg2_5 = pair_set_epi16(cospi_10_64, cospi_22_64);
+ const __m128i stg2_6 = pair_set_epi16(cospi_6_64, -cospi_26_64);
+ const __m128i stg2_7 = pair_set_epi16(cospi_26_64, cospi_6_64);
+
+ const __m128i stg3_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
+ const __m128i stg3_1 = pair_set_epi16(cospi_4_64, cospi_28_64);
+ const __m128i stg3_2 = pair_set_epi16(cospi_12_64, -cospi_20_64);
+ const __m128i stg3_3 = pair_set_epi16(cospi_20_64, cospi_12_64);
+ const __m128i stg3_4 = pair_set_epi16(-cospi_4_64, cospi_28_64);
+ const __m128i stg3_5 = pair_set_epi16(cospi_28_64, cospi_4_64);
+ const __m128i stg3_6 = pair_set_epi16(-cospi_28_64, -cospi_4_64);
+ const __m128i stg3_8 = pair_set_epi16(-cospi_20_64, cospi_12_64);
+ const __m128i stg3_9 = pair_set_epi16(cospi_12_64, cospi_20_64);
+ const __m128i stg3_10 = pair_set_epi16(-cospi_12_64, -cospi_20_64);
+
+ const __m128i stg4_0 = pair_set_epi16(cospi_16_64, cospi_16_64);
+ const __m128i stg4_1 = pair_set_epi16(cospi_16_64, -cospi_16_64);
+ const __m128i stg4_2 = pair_set_epi16(cospi_24_64, -cospi_8_64);
+ const __m128i stg4_3 = pair_set_epi16(cospi_8_64, cospi_24_64);
+ const __m128i stg4_4 = pair_set_epi16(-cospi_8_64, cospi_24_64);
+ const __m128i stg4_5 = pair_set_epi16(cospi_24_64, cospi_8_64);
+ const __m128i stg4_6 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+
+ const __m128i stg6_0 = pair_set_epi16(-cospi_16_64, cospi_16_64);
+
+ __m128i in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, in12,
+ in13, in14, in15, in16, in17, in18, in19, in20, in21, in22, in23,
+ in24, in25, in26, in27, in28, in29, in30, in31;
+ __m128i col[128];
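+  // col[] buffers the full 32x32 intermediate of the first pass:
+  // 128 vectors x 8 int16 lanes = 1024 coefficients.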
+ __m128i stp1_0, stp1_1, stp1_2, stp1_3, stp1_4, stp1_5, stp1_6, stp1_7,
+ stp1_8, stp1_9, stp1_10, stp1_11, stp1_12, stp1_13, stp1_14, stp1_15,
+ stp1_16, stp1_17, stp1_18, stp1_19, stp1_20, stp1_21, stp1_22,
+ stp1_23, stp1_24, stp1_25, stp1_26, stp1_27, stp1_28, stp1_29,
+ stp1_30, stp1_31;
+ __m128i stp2_0, stp2_1, stp2_2, stp2_3, stp2_4, stp2_5, stp2_6, stp2_7,
+ stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15,
+ stp2_16, stp2_17, stp2_18, stp2_19, stp2_20, stp2_21, stp2_22,
+ stp2_23, stp2_24, stp2_25, stp2_26, stp2_27, stp2_28, stp2_29,
+ stp2_30, stp2_31;
+ __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ int i, j;
+
+  // We work on an 8x32 block each time, and loop eight times for the
+  // 2-D 32x32 idct.
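+  // Passes 0-3 run the row transform on successive eight-row strips of
+  // the input (results are saved into col[]); passes 4-7 transpose
+  // col[] back and run the column transform.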
+ for (i = 0; i < 8; i++) {
+ if (i < 4) {
+ // First 1-D idct
+ // Load input data.
+ in0 = _mm_load_si128((__m128i *)input);
+ in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
+ in16 = _mm_load_si128((__m128i *)(input + 8 * 2));
+ in24 = _mm_load_si128((__m128i *)(input + 8 * 3));
+ in1 = _mm_load_si128((__m128i *)(input + 8 * 4));
+ in9 = _mm_load_si128((__m128i *)(input + 8 * 5));
+ in17 = _mm_load_si128((__m128i *)(input + 8 * 6));
+ in25 = _mm_load_si128((__m128i *)(input + 8 * 7));
+ in2 = _mm_load_si128((__m128i *)(input + 8 * 8));
+ in10 = _mm_load_si128((__m128i *)(input + 8 * 9));
+ in18 = _mm_load_si128((__m128i *)(input + 8 * 10));
+ in26 = _mm_load_si128((__m128i *)(input + 8 * 11));
+ in3 = _mm_load_si128((__m128i *)(input + 8 * 12));
+ in11 = _mm_load_si128((__m128i *)(input + 8 * 13));
+ in19 = _mm_load_si128((__m128i *)(input + 8 * 14));
+ in27 = _mm_load_si128((__m128i *)(input + 8 * 15));
+
+ in4 = _mm_load_si128((__m128i *)(input + 8 * 16));
+ in12 = _mm_load_si128((__m128i *)(input + 8 * 17));
+ in20 = _mm_load_si128((__m128i *)(input + 8 * 18));
+ in28 = _mm_load_si128((__m128i *)(input + 8 * 19));
+ in5 = _mm_load_si128((__m128i *)(input + 8 * 20));
+ in13 = _mm_load_si128((__m128i *)(input + 8 * 21));
+ in21 = _mm_load_si128((__m128i *)(input + 8 * 22));
+ in29 = _mm_load_si128((__m128i *)(input + 8 * 23));
+ in6 = _mm_load_si128((__m128i *)(input + 8 * 24));
+ in14 = _mm_load_si128((__m128i *)(input + 8 * 25));
+ in22 = _mm_load_si128((__m128i *)(input + 8 * 26));
+ in30 = _mm_load_si128((__m128i *)(input + 8 * 27));
+ in7 = _mm_load_si128((__m128i *)(input + 8 * 28));
+ in15 = _mm_load_si128((__m128i *)(input + 8 * 29));
+ in23 = _mm_load_si128((__m128i *)(input + 8 * 30));
+ in31 = _mm_load_si128((__m128i *)(input + 8 * 31));
+
+ input += 256;
+
+ // Transpose 32x8 block to 8x32 block
+ TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+ in4, in5, in6, in7);
+ TRANSPOSE_8X8(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+ in10, in11, in12, in13, in14, in15);
+ TRANSPOSE_8X8(in16, in17, in18, in19, in20, in21, in22, in23, in16, in17,
+ in18, in19, in20, in21, in22, in23);
+ TRANSPOSE_8X8(in24, in25, in26, in27, in28, in29, in30, in31, in24, in25,
+ in26, in27, in28, in29, in30, in31);
+ } else {
+ // Second 1-D idct
+ j = i - 4;
+
+ // Transpose 32x8 block to 8x32 block
+ TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+ col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+ col[j * 8 + 6], col[j * 8 + 7], in0, in1, in2, in3, in4,
+ in5, in6, in7);
+ j += 4;
+ TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+ col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+ col[j * 8 + 6], col[j * 8 + 7], in8, in9, in10,
+ in11, in12, in13, in14, in15);
+ j += 4;
+ TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+ col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+ col[j * 8 + 6], col[j * 8 + 7], in16, in17, in18,
+ in19, in20, in21, in22, in23);
+ j += 4;
+ TRANSPOSE_8X8(col[j * 8 + 0], col[j * 8 + 1], col[j * 8 + 2],
+ col[j * 8 + 3], col[j * 8 + 4], col[j * 8 + 5],
+ col[j * 8 + 6], col[j * 8 + 7], in24, in25, in26, in27,
+ in28, in29, in30, in31);
+ }
+
+ // Stage1
+ {
+ const __m128i lo_1_31 = _mm_unpacklo_epi16(in1, in31);
+ const __m128i hi_1_31 = _mm_unpackhi_epi16(in1, in31);
+ const __m128i lo_17_15 = _mm_unpacklo_epi16(in17, in15);
+ const __m128i hi_17_15 = _mm_unpackhi_epi16(in17, in15);
+
+ const __m128i lo_9_23 = _mm_unpacklo_epi16(in9, in23);
+ const __m128i hi_9_23 = _mm_unpackhi_epi16(in9, in23);
+      const __m128i lo_25_7 = _mm_unpacklo_epi16(in25, in7);
+ const __m128i hi_25_7 = _mm_unpackhi_epi16(in25, in7);
+
+ const __m128i lo_5_27 = _mm_unpacklo_epi16(in5, in27);
+ const __m128i hi_5_27 = _mm_unpackhi_epi16(in5, in27);
+ const __m128i lo_21_11 = _mm_unpacklo_epi16(in21, in11);
+ const __m128i hi_21_11 = _mm_unpackhi_epi16(in21, in11);
+
+ const __m128i lo_13_19 = _mm_unpacklo_epi16(in13, in19);
+ const __m128i hi_13_19 = _mm_unpackhi_epi16(in13, in19);
+ const __m128i lo_29_3 = _mm_unpacklo_epi16(in29, in3);
+ const __m128i hi_29_3 = _mm_unpackhi_epi16(in29, in3);
+
+ MULTIPLICATION_AND_ADD(lo_1_31, hi_1_31, lo_17_15, hi_17_15, stg1_0,
+ stg1_1, stg1_2, stg1_3, stp1_16, stp1_31,
+ stp1_17, stp1_30)
+ MULTIPLICATION_AND_ADD(lo_9_23, hi_9_23, lo_25_7, hi_25_7, stg1_4,
+ stg1_5, stg1_6, stg1_7, stp1_18, stp1_29,
+ stp1_19, stp1_28)
+ MULTIPLICATION_AND_ADD(lo_5_27, hi_5_27, lo_21_11, hi_21_11, stg1_8,
+ stg1_9, stg1_10, stg1_11, stp1_20, stp1_27,
+ stp1_21, stp1_26)
+ MULTIPLICATION_AND_ADD(lo_13_19, hi_13_19, lo_29_3, hi_29_3, stg1_12,
+ stg1_13, stg1_14, stg1_15, stp1_22, stp1_25,
+ stp1_23, stp1_24)
+ }
+
+ // Stage2
+ {
+ const __m128i lo_2_30 = _mm_unpacklo_epi16(in2, in30);
+ const __m128i hi_2_30 = _mm_unpackhi_epi16(in2, in30);
+ const __m128i lo_18_14 = _mm_unpacklo_epi16(in18, in14);
+ const __m128i hi_18_14 = _mm_unpackhi_epi16(in18, in14);
+
+ const __m128i lo_10_22 = _mm_unpacklo_epi16(in10, in22);
+ const __m128i hi_10_22 = _mm_unpackhi_epi16(in10, in22);
+ const __m128i lo_26_6 = _mm_unpacklo_epi16(in26, in6);
+ const __m128i hi_26_6 = _mm_unpackhi_epi16(in26, in6);
+
+ MULTIPLICATION_AND_ADD(lo_2_30, hi_2_30, lo_18_14, hi_18_14, stg2_0,
+ stg2_1, stg2_2, stg2_3, stp2_8, stp2_15, stp2_9,
+ stp2_14)
+ MULTIPLICATION_AND_ADD(lo_10_22, hi_10_22, lo_26_6, hi_26_6, stg2_4,
+ stg2_5, stg2_6, stg2_7, stp2_10, stp2_13,
+ stp2_11, stp2_12)
+
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_17);
+ stp2_17 = _mm_sub_epi16(stp1_16, stp1_17);
+ stp2_18 = _mm_sub_epi16(stp1_19, stp1_18);
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_18);
+
+ stp2_20 = _mm_add_epi16(stp1_20, stp1_21);
+ stp2_21 = _mm_sub_epi16(stp1_20, stp1_21);
+ stp2_22 = _mm_sub_epi16(stp1_23, stp1_22);
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_22);
+
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_25);
+ stp2_25 = _mm_sub_epi16(stp1_24, stp1_25);
+ stp2_26 = _mm_sub_epi16(stp1_27, stp1_26);
+ stp2_27 = _mm_add_epi16(stp1_27, stp1_26);
+
+ stp2_28 = _mm_add_epi16(stp1_28, stp1_29);
+ stp2_29 = _mm_sub_epi16(stp1_28, stp1_29);
+ stp2_30 = _mm_sub_epi16(stp1_31, stp1_30);
+ stp2_31 = _mm_add_epi16(stp1_31, stp1_30);
+ }
+
+ // Stage3
+ {
+ const __m128i lo_4_28 = _mm_unpacklo_epi16(in4, in28);
+ const __m128i hi_4_28 = _mm_unpackhi_epi16(in4, in28);
+ const __m128i lo_20_12 = _mm_unpacklo_epi16(in20, in12);
+ const __m128i hi_20_12 = _mm_unpackhi_epi16(in20, in12);
+
+ const __m128i lo_17_30 = _mm_unpacklo_epi16(stp2_17, stp2_30);
+ const __m128i hi_17_30 = _mm_unpackhi_epi16(stp2_17, stp2_30);
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
+
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
+
+ MULTIPLICATION_AND_ADD(lo_4_28, hi_4_28, lo_20_12, hi_20_12, stg3_0,
+ stg3_1, stg3_2, stg3_3, stp1_4, stp1_7, stp1_5,
+ stp1_6)
+
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_9);
+ stp1_9 = _mm_sub_epi16(stp2_8, stp2_9);
+ stp1_10 = _mm_sub_epi16(stp2_11, stp2_10);
+ stp1_11 = _mm_add_epi16(stp2_11, stp2_10);
+ stp1_12 = _mm_add_epi16(stp2_12, stp2_13);
+ stp1_13 = _mm_sub_epi16(stp2_12, stp2_13);
+ stp1_14 = _mm_sub_epi16(stp2_15, stp2_14);
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_14);
+
+ MULTIPLICATION_AND_ADD(lo_17_30, hi_17_30, lo_18_29, hi_18_29, stg3_4,
+ stg3_5, stg3_6, stg3_4, stp1_17, stp1_30,
+ stp1_18, stp1_29)
+ MULTIPLICATION_AND_ADD(lo_21_26, hi_21_26, lo_22_25, hi_22_25, stg3_8,
+ stg3_9, stg3_10, stg3_8, stp1_21, stp1_26,
+ stp1_22, stp1_25)
+
+ stp1_16 = stp2_16;
+ stp1_31 = stp2_31;
+ stp1_19 = stp2_19;
+ stp1_20 = stp2_20;
+ stp1_23 = stp2_23;
+ stp1_24 = stp2_24;
+ stp1_27 = stp2_27;
+ stp1_28 = stp2_28;
+ }
+
+ // Stage4
+ {
+ const __m128i lo_0_16 = _mm_unpacklo_epi16(in0, in16);
+ const __m128i hi_0_16 = _mm_unpackhi_epi16(in0, in16);
+ const __m128i lo_8_24 = _mm_unpacklo_epi16(in8, in24);
+ const __m128i hi_8_24 = _mm_unpackhi_epi16(in8, in24);
+
+ const __m128i lo_9_14 = _mm_unpacklo_epi16(stp1_9, stp1_14);
+ const __m128i hi_9_14 = _mm_unpackhi_epi16(stp1_9, stp1_14);
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
+
+ MULTIPLICATION_AND_ADD(lo_0_16, hi_0_16, lo_8_24, hi_8_24, stg4_0,
+ stg4_1, stg4_2, stg4_3, stp2_0, stp2_1,
+ stp2_2, stp2_3)
+
+ stp2_4 = _mm_add_epi16(stp1_4, stp1_5);
+ stp2_5 = _mm_sub_epi16(stp1_4, stp1_5);
+ stp2_6 = _mm_sub_epi16(stp1_7, stp1_6);
+ stp2_7 = _mm_add_epi16(stp1_7, stp1_6);
+
+ MULTIPLICATION_AND_ADD(lo_9_14, hi_9_14, lo_10_13, hi_10_13, stg4_4,
+ stg4_5, stg4_6, stg4_4, stp2_9, stp2_14,
+ stp2_10, stp2_13)
+
+ stp2_8 = stp1_8;
+ stp2_15 = stp1_15;
+ stp2_11 = stp1_11;
+ stp2_12 = stp1_12;
+
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_19);
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_18);
+ stp2_18 = _mm_sub_epi16(stp1_17, stp1_18);
+ stp2_19 = _mm_sub_epi16(stp1_16, stp1_19);
+ stp2_20 = _mm_sub_epi16(stp1_23, stp1_20);
+ stp2_21 = _mm_sub_epi16(stp1_22, stp1_21);
+ stp2_22 = _mm_add_epi16(stp1_22, stp1_21);
+ stp2_23 = _mm_add_epi16(stp1_23, stp1_20);
+
+ stp2_24 = _mm_add_epi16(stp1_24, stp1_27);
+ stp2_25 = _mm_add_epi16(stp1_25, stp1_26);
+ stp2_26 = _mm_sub_epi16(stp1_25, stp1_26);
+ stp2_27 = _mm_sub_epi16(stp1_24, stp1_27);
+ stp2_28 = _mm_sub_epi16(stp1_31, stp1_28);
+ stp2_29 = _mm_sub_epi16(stp1_30, stp1_29);
+ stp2_30 = _mm_add_epi16(stp1_29, stp1_30);
+ stp2_31 = _mm_add_epi16(stp1_28, stp1_31);
+ }
+
+ // Stage5
+ {
+ const __m128i lo_6_5 = _mm_unpacklo_epi16(stp2_6, stp2_5);
+ const __m128i hi_6_5 = _mm_unpackhi_epi16(stp2_6, stp2_5);
+ const __m128i lo_18_29 = _mm_unpacklo_epi16(stp2_18, stp2_29);
+ const __m128i hi_18_29 = _mm_unpackhi_epi16(stp2_18, stp2_29);
+
+ const __m128i lo_19_28 = _mm_unpacklo_epi16(stp2_19, stp2_28);
+ const __m128i hi_19_28 = _mm_unpackhi_epi16(stp2_19, stp2_28);
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
+
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
+
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_3);
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_2);
+ stp1_2 = _mm_sub_epi16(stp2_1, stp2_2);
+ stp1_3 = _mm_sub_epi16(stp2_0, stp2_3);
+
+ tmp0 = _mm_madd_epi16(lo_6_5, stg4_1);
+ tmp1 = _mm_madd_epi16(hi_6_5, stg4_1);
+ tmp2 = _mm_madd_epi16(lo_6_5, stg4_0);
+ tmp3 = _mm_madd_epi16(hi_6_5, stg4_0);
+
+ tmp0 = _mm_add_epi32(tmp0, rounding);
+ tmp1 = _mm_add_epi32(tmp1, rounding);
+ tmp2 = _mm_add_epi32(tmp2, rounding);
+ tmp3 = _mm_add_epi32(tmp3, rounding);
+
+ tmp0 = _mm_srai_epi32(tmp0, DCT_CONST_BITS);
+ tmp1 = _mm_srai_epi32(tmp1, DCT_CONST_BITS);
+ tmp2 = _mm_srai_epi32(tmp2, DCT_CONST_BITS);
+ tmp3 = _mm_srai_epi32(tmp3, DCT_CONST_BITS);
+
+ stp1_5 = _mm_packs_epi32(tmp0, tmp1);
+ stp1_6 = _mm_packs_epi32(tmp2, tmp3);
+
+ stp1_4 = stp2_4;
+ stp1_7 = stp2_7;
+
+ stp1_8 = _mm_add_epi16(stp2_8, stp2_11);
+ stp1_9 = _mm_add_epi16(stp2_9, stp2_10);
+ stp1_10 = _mm_sub_epi16(stp2_9, stp2_10);
+ stp1_11 = _mm_sub_epi16(stp2_8, stp2_11);
+ stp1_12 = _mm_sub_epi16(stp2_15, stp2_12);
+ stp1_13 = _mm_sub_epi16(stp2_14, stp2_13);
+ stp1_14 = _mm_add_epi16(stp2_14, stp2_13);
+ stp1_15 = _mm_add_epi16(stp2_15, stp2_12);
+
+ stp1_16 = stp2_16;
+ stp1_17 = stp2_17;
+
+ MULTIPLICATION_AND_ADD(lo_18_29, hi_18_29, lo_19_28, hi_19_28, stg4_4,
+ stg4_5, stg4_4, stg4_5, stp1_18, stp1_29,
+ stp1_19, stp1_28)
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg4_6,
+ stg4_4, stg4_6, stg4_4, stp1_20, stp1_27,
+ stp1_21, stp1_26)
+
+ stp1_22 = stp2_22;
+ stp1_23 = stp2_23;
+ stp1_24 = stp2_24;
+ stp1_25 = stp2_25;
+ stp1_30 = stp2_30;
+ stp1_31 = stp2_31;
+ }
+
+ // Stage6
+ {
+ const __m128i lo_10_13 = _mm_unpacklo_epi16(stp1_10, stp1_13);
+ const __m128i hi_10_13 = _mm_unpackhi_epi16(stp1_10, stp1_13);
+ const __m128i lo_11_12 = _mm_unpacklo_epi16(stp1_11, stp1_12);
+ const __m128i hi_11_12 = _mm_unpackhi_epi16(stp1_11, stp1_12);
+
+ stp2_0 = _mm_add_epi16(stp1_0, stp1_7);
+ stp2_1 = _mm_add_epi16(stp1_1, stp1_6);
+ stp2_2 = _mm_add_epi16(stp1_2, stp1_5);
+ stp2_3 = _mm_add_epi16(stp1_3, stp1_4);
+ stp2_4 = _mm_sub_epi16(stp1_3, stp1_4);
+ stp2_5 = _mm_sub_epi16(stp1_2, stp1_5);
+ stp2_6 = _mm_sub_epi16(stp1_1, stp1_6);
+ stp2_7 = _mm_sub_epi16(stp1_0, stp1_7);
+
+ stp2_8 = stp1_8;
+ stp2_9 = stp1_9;
+ stp2_14 = stp1_14;
+ stp2_15 = stp1_15;
+
+ MULTIPLICATION_AND_ADD(lo_10_13, hi_10_13, lo_11_12, hi_11_12,
+ stg6_0, stg4_0, stg6_0, stg4_0, stp2_10,
+ stp2_13, stp2_11, stp2_12)
+
+ stp2_16 = _mm_add_epi16(stp1_16, stp1_23);
+ stp2_17 = _mm_add_epi16(stp1_17, stp1_22);
+ stp2_18 = _mm_add_epi16(stp1_18, stp1_21);
+ stp2_19 = _mm_add_epi16(stp1_19, stp1_20);
+ stp2_20 = _mm_sub_epi16(stp1_19, stp1_20);
+ stp2_21 = _mm_sub_epi16(stp1_18, stp1_21);
+ stp2_22 = _mm_sub_epi16(stp1_17, stp1_22);
+ stp2_23 = _mm_sub_epi16(stp1_16, stp1_23);
+
+ stp2_24 = _mm_sub_epi16(stp1_31, stp1_24);
+ stp2_25 = _mm_sub_epi16(stp1_30, stp1_25);
+ stp2_26 = _mm_sub_epi16(stp1_29, stp1_26);
+ stp2_27 = _mm_sub_epi16(stp1_28, stp1_27);
+ stp2_28 = _mm_add_epi16(stp1_27, stp1_28);
+ stp2_29 = _mm_add_epi16(stp1_26, stp1_29);
+ stp2_30 = _mm_add_epi16(stp1_25, stp1_30);
+ stp2_31 = _mm_add_epi16(stp1_24, stp1_31);
+ }
+
+ // Stage7
+ {
+ const __m128i lo_20_27 = _mm_unpacklo_epi16(stp2_20, stp2_27);
+ const __m128i hi_20_27 = _mm_unpackhi_epi16(stp2_20, stp2_27);
+ const __m128i lo_21_26 = _mm_unpacklo_epi16(stp2_21, stp2_26);
+ const __m128i hi_21_26 = _mm_unpackhi_epi16(stp2_21, stp2_26);
+
+ const __m128i lo_22_25 = _mm_unpacklo_epi16(stp2_22, stp2_25);
+ const __m128i hi_22_25 = _mm_unpackhi_epi16(stp2_22, stp2_25);
+ const __m128i lo_23_24 = _mm_unpacklo_epi16(stp2_23, stp2_24);
+ const __m128i hi_23_24 = _mm_unpackhi_epi16(stp2_23, stp2_24);
+
+ stp1_0 = _mm_add_epi16(stp2_0, stp2_15);
+ stp1_1 = _mm_add_epi16(stp2_1, stp2_14);
+ stp1_2 = _mm_add_epi16(stp2_2, stp2_13);
+ stp1_3 = _mm_add_epi16(stp2_3, stp2_12);
+ stp1_4 = _mm_add_epi16(stp2_4, stp2_11);
+ stp1_5 = _mm_add_epi16(stp2_5, stp2_10);
+ stp1_6 = _mm_add_epi16(stp2_6, stp2_9);
+ stp1_7 = _mm_add_epi16(stp2_7, stp2_8);
+ stp1_8 = _mm_sub_epi16(stp2_7, stp2_8);
+ stp1_9 = _mm_sub_epi16(stp2_6, stp2_9);
+ stp1_10 = _mm_sub_epi16(stp2_5, stp2_10);
+ stp1_11 = _mm_sub_epi16(stp2_4, stp2_11);
+ stp1_12 = _mm_sub_epi16(stp2_3, stp2_12);
+ stp1_13 = _mm_sub_epi16(stp2_2, stp2_13);
+ stp1_14 = _mm_sub_epi16(stp2_1, stp2_14);
+ stp1_15 = _mm_sub_epi16(stp2_0, stp2_15);
+
+ stp1_16 = stp2_16;
+ stp1_17 = stp2_17;
+ stp1_18 = stp2_18;
+ stp1_19 = stp2_19;
+
+ MULTIPLICATION_AND_ADD(lo_20_27, hi_20_27, lo_21_26, hi_21_26, stg6_0,
+ stg4_0, stg6_0, stg4_0, stp1_20, stp1_27,
+ stp1_21, stp1_26)
+ MULTIPLICATION_AND_ADD(lo_22_25, hi_22_25, lo_23_24, hi_23_24, stg6_0,
+ stg4_0, stg6_0, stg4_0, stp1_22, stp1_25,
+ stp1_23, stp1_24)
+
+ stp1_28 = stp2_28;
+ stp1_29 = stp2_29;
+ stp1_30 = stp2_30;
+ stp1_31 = stp2_31;
+ }
+
+ // final stage
+ if (i < 4) {
+      // First 1-D pass: store the 32 intermediate results for each 8x32 block.
+ col[i * 32 + 0] = _mm_add_epi16(stp1_0, stp1_31);
+ col[i * 32 + 1] = _mm_add_epi16(stp1_1, stp1_30);
+ col[i * 32 + 2] = _mm_add_epi16(stp1_2, stp1_29);
+ col[i * 32 + 3] = _mm_add_epi16(stp1_3, stp1_28);
+ col[i * 32 + 4] = _mm_add_epi16(stp1_4, stp1_27);
+ col[i * 32 + 5] = _mm_add_epi16(stp1_5, stp1_26);
+ col[i * 32 + 6] = _mm_add_epi16(stp1_6, stp1_25);
+ col[i * 32 + 7] = _mm_add_epi16(stp1_7, stp1_24);
+ col[i * 32 + 8] = _mm_add_epi16(stp1_8, stp1_23);
+ col[i * 32 + 9] = _mm_add_epi16(stp1_9, stp1_22);
+ col[i * 32 + 10] = _mm_add_epi16(stp1_10, stp1_21);
+ col[i * 32 + 11] = _mm_add_epi16(stp1_11, stp1_20);
+ col[i * 32 + 12] = _mm_add_epi16(stp1_12, stp1_19);
+ col[i * 32 + 13] = _mm_add_epi16(stp1_13, stp1_18);
+ col[i * 32 + 14] = _mm_add_epi16(stp1_14, stp1_17);
+ col[i * 32 + 15] = _mm_add_epi16(stp1_15, stp1_16);
+ col[i * 32 + 16] = _mm_sub_epi16(stp1_15, stp1_16);
+ col[i * 32 + 17] = _mm_sub_epi16(stp1_14, stp1_17);
+ col[i * 32 + 18] = _mm_sub_epi16(stp1_13, stp1_18);
+ col[i * 32 + 19] = _mm_sub_epi16(stp1_12, stp1_19);
+ col[i * 32 + 20] = _mm_sub_epi16(stp1_11, stp1_20);
+ col[i * 32 + 21] = _mm_sub_epi16(stp1_10, stp1_21);
+ col[i * 32 + 22] = _mm_sub_epi16(stp1_9, stp1_22);
+ col[i * 32 + 23] = _mm_sub_epi16(stp1_8, stp1_23);
+ col[i * 32 + 24] = _mm_sub_epi16(stp1_7, stp1_24);
+ col[i * 32 + 25] = _mm_sub_epi16(stp1_6, stp1_25);
+ col[i * 32 + 26] = _mm_sub_epi16(stp1_5, stp1_26);
+ col[i * 32 + 27] = _mm_sub_epi16(stp1_4, stp1_27);
+ col[i * 32 + 28] = _mm_sub_epi16(stp1_3, stp1_28);
+ col[i * 32 + 29] = _mm_sub_epi16(stp1_2, stp1_29);
+ col[i * 32 + 30] = _mm_sub_epi16(stp1_1, stp1_30);
+ col[i * 32 + 31] = _mm_sub_epi16(stp1_0, stp1_31);
+ } else {
+      // Second 1-D pass: calculate the final results and store them to the destination.
+ in0 = _mm_add_epi16(stp1_0, stp1_31);
+ in1 = _mm_add_epi16(stp1_1, stp1_30);
+ in2 = _mm_add_epi16(stp1_2, stp1_29);
+ in3 = _mm_add_epi16(stp1_3, stp1_28);
+ in4 = _mm_add_epi16(stp1_4, stp1_27);
+ in5 = _mm_add_epi16(stp1_5, stp1_26);
+ in6 = _mm_add_epi16(stp1_6, stp1_25);
+ in7 = _mm_add_epi16(stp1_7, stp1_24);
+ in8 = _mm_add_epi16(stp1_8, stp1_23);
+ in9 = _mm_add_epi16(stp1_9, stp1_22);
+ in10 = _mm_add_epi16(stp1_10, stp1_21);
+ in11 = _mm_add_epi16(stp1_11, stp1_20);
+ in12 = _mm_add_epi16(stp1_12, stp1_19);
+ in13 = _mm_add_epi16(stp1_13, stp1_18);
+ in14 = _mm_add_epi16(stp1_14, stp1_17);
+ in15 = _mm_add_epi16(stp1_15, stp1_16);
+ in16 = _mm_sub_epi16(stp1_15, stp1_16);
+ in17 = _mm_sub_epi16(stp1_14, stp1_17);
+ in18 = _mm_sub_epi16(stp1_13, stp1_18);
+ in19 = _mm_sub_epi16(stp1_12, stp1_19);
+ in20 = _mm_sub_epi16(stp1_11, stp1_20);
+ in21 = _mm_sub_epi16(stp1_10, stp1_21);
+ in22 = _mm_sub_epi16(stp1_9, stp1_22);
+ in23 = _mm_sub_epi16(stp1_8, stp1_23);
+ in24 = _mm_sub_epi16(stp1_7, stp1_24);
+ in25 = _mm_sub_epi16(stp1_6, stp1_25);
+ in26 = _mm_sub_epi16(stp1_5, stp1_26);
+ in27 = _mm_sub_epi16(stp1_4, stp1_27);
+ in28 = _mm_sub_epi16(stp1_3, stp1_28);
+ in29 = _mm_sub_epi16(stp1_2, stp1_29);
+ in30 = _mm_sub_epi16(stp1_1, stp1_30);
+ in31 = _mm_sub_epi16(stp1_0, stp1_31);
+
+ // Final rounding and shift
+ in0 = _mm_adds_epi16(in0, final_rounding);
+ in1 = _mm_adds_epi16(in1, final_rounding);
+ in2 = _mm_adds_epi16(in2, final_rounding);
+ in3 = _mm_adds_epi16(in3, final_rounding);
+ in4 = _mm_adds_epi16(in4, final_rounding);
+ in5 = _mm_adds_epi16(in5, final_rounding);
+ in6 = _mm_adds_epi16(in6, final_rounding);
+ in7 = _mm_adds_epi16(in7, final_rounding);
+ in8 = _mm_adds_epi16(in8, final_rounding);
+ in9 = _mm_adds_epi16(in9, final_rounding);
+ in10 = _mm_adds_epi16(in10, final_rounding);
+ in11 = _mm_adds_epi16(in11, final_rounding);
+ in12 = _mm_adds_epi16(in12, final_rounding);
+ in13 = _mm_adds_epi16(in13, final_rounding);
+ in14 = _mm_adds_epi16(in14, final_rounding);
+ in15 = _mm_adds_epi16(in15, final_rounding);
+ in16 = _mm_adds_epi16(in16, final_rounding);
+ in17 = _mm_adds_epi16(in17, final_rounding);
+ in18 = _mm_adds_epi16(in18, final_rounding);
+ in19 = _mm_adds_epi16(in19, final_rounding);
+ in20 = _mm_adds_epi16(in20, final_rounding);
+ in21 = _mm_adds_epi16(in21, final_rounding);
+ in22 = _mm_adds_epi16(in22, final_rounding);
+ in23 = _mm_adds_epi16(in23, final_rounding);
+ in24 = _mm_adds_epi16(in24, final_rounding);
+ in25 = _mm_adds_epi16(in25, final_rounding);
+ in26 = _mm_adds_epi16(in26, final_rounding);
+ in27 = _mm_adds_epi16(in27, final_rounding);
+ in28 = _mm_adds_epi16(in28, final_rounding);
+ in29 = _mm_adds_epi16(in29, final_rounding);
+ in30 = _mm_adds_epi16(in30, final_rounding);
+ in31 = _mm_adds_epi16(in31, final_rounding);
+
+ in0 = _mm_srai_epi16(in0, 6);
+ in1 = _mm_srai_epi16(in1, 6);
+ in2 = _mm_srai_epi16(in2, 6);
+ in3 = _mm_srai_epi16(in3, 6);
+ in4 = _mm_srai_epi16(in4, 6);
+ in5 = _mm_srai_epi16(in5, 6);
+ in6 = _mm_srai_epi16(in6, 6);
+ in7 = _mm_srai_epi16(in7, 6);
+ in8 = _mm_srai_epi16(in8, 6);
+ in9 = _mm_srai_epi16(in9, 6);
+ in10 = _mm_srai_epi16(in10, 6);
+ in11 = _mm_srai_epi16(in11, 6);
+ in12 = _mm_srai_epi16(in12, 6);
+ in13 = _mm_srai_epi16(in13, 6);
+ in14 = _mm_srai_epi16(in14, 6);
+ in15 = _mm_srai_epi16(in15, 6);
+ in16 = _mm_srai_epi16(in16, 6);
+ in17 = _mm_srai_epi16(in17, 6);
+ in18 = _mm_srai_epi16(in18, 6);
+ in19 = _mm_srai_epi16(in19, 6);
+ in20 = _mm_srai_epi16(in20, 6);
+ in21 = _mm_srai_epi16(in21, 6);
+ in22 = _mm_srai_epi16(in22, 6);
+ in23 = _mm_srai_epi16(in23, 6);
+ in24 = _mm_srai_epi16(in24, 6);
+ in25 = _mm_srai_epi16(in25, 6);
+ in26 = _mm_srai_epi16(in26, 6);
+ in27 = _mm_srai_epi16(in27, 6);
+ in28 = _mm_srai_epi16(in28, 6);
+ in29 = _mm_srai_epi16(in29, 6);
+ in30 = _mm_srai_epi16(in30, 6);
+ in31 = _mm_srai_epi16(in31, 6);
+
+ // Store results
+ _mm_store_si128((__m128i *)output, in0);
+ _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
+ _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
+ _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
+ _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
+ _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
+ _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
+ _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
+ _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
+ _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
+ _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
+ _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
+ _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
+ _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
+ _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
+ _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
+ _mm_store_si128((__m128i *)(output + half_pitch * 16), in16);
+ _mm_store_si128((__m128i *)(output + half_pitch * 17), in17);
+ _mm_store_si128((__m128i *)(output + half_pitch * 18), in18);
+ _mm_store_si128((__m128i *)(output + half_pitch * 19), in19);
+ _mm_store_si128((__m128i *)(output + half_pitch * 20), in20);
+ _mm_store_si128((__m128i *)(output + half_pitch * 21), in21);
+ _mm_store_si128((__m128i *)(output + half_pitch * 22), in22);
+ _mm_store_si128((__m128i *)(output + half_pitch * 23), in23);
+ _mm_store_si128((__m128i *)(output + half_pitch * 24), in24);
+ _mm_store_si128((__m128i *)(output + half_pitch * 25), in25);
+ _mm_store_si128((__m128i *)(output + half_pitch * 26), in26);
+ _mm_store_si128((__m128i *)(output + half_pitch * 27), in27);
+ _mm_store_si128((__m128i *)(output + half_pitch * 28), in28);
+ _mm_store_si128((__m128i *)(output + half_pitch * 29), in29);
+ _mm_store_si128((__m128i *)(output + half_pitch * 30), in30);
+ _mm_store_si128((__m128i *)(output + half_pitch * 31), in31);
+
+ output += 8;
+ }
+ }
+}
+#endif
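Note: the Stage5 block above inlines the same multiply/round/shift/pack
sequence that the MULTIPLICATION_AND_ADD macro applies throughout this
file, and the final_rounding / srai-by-6 pair implements the usual
(x + 32) >> 6 output scaling of the 2-D inverse transform. A minimal
sketch of one such butterfly, assuming the standard VP9 constants
DCT_CONST_BITS = 14 and DCT_CONST_ROUNDING = 1 << 13:

    #include <emmintrin.h>

    #define DCT_CONST_BITS 14
    #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

    /* lo/hi come from _mm_unpack{lo,hi}_epi16(a, b); c0/c1 hold the
       interleaved cosine-constant pairs, as in the stages above. */
    static void butterfly(__m128i lo, __m128i hi, __m128i c0, __m128i c1,
                          __m128i *out0, __m128i *out1) {
      const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
      __m128i t0 = _mm_madd_epi16(lo, c0);  /* a*c + b*s per 32-bit lane */
      __m128i t1 = _mm_madd_epi16(hi, c0);
      __m128i t2 = _mm_madd_epi16(lo, c1);
      __m128i t3 = _mm_madd_epi16(hi, c1);
      t0 = _mm_srai_epi32(_mm_add_epi32(t0, rounding), DCT_CONST_BITS);
      t1 = _mm_srai_epi32(_mm_add_epi32(t1, rounding), DCT_CONST_BITS);
      t2 = _mm_srai_epi32(_mm_add_epi32(t2, rounding), DCT_CONST_BITS);
      t3 = _mm_srai_epi32(_mm_add_epi32(t3, rounding), DCT_CONST_BITS);
      *out0 = _mm_packs_epi32(t0, t1);      /* back to int16, saturating */
      *out1 = _mm_packs_epi32(t2, t3);
    }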
diff --git a/vp9/common/x86/vp9_idct_x86.h b/vp9/common/x86/vp9_idct_x86.h
index 8320cf87d..bd66d8c72 100644
--- a/vp9/common/x86/vp9_idct_x86.h
+++ b/vp9/common/x86/vp9_idct_x86.h
@@ -20,23 +20,10 @@
*/
#if HAVE_MMX
-extern prototype_idct(vp9_short_idct4x4llm_1_mmx);
-extern prototype_idct(vp9_short_idct4x4llm_mmx);
-extern prototype_idct_scalar_add(vp9_dc_only_idct_add_mmx);
-
extern prototype_second_order(vp9_short_inv_walsh4x4_mmx);
extern prototype_second_order(vp9_short_inv_walsh4x4_1_mmx);
#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_idct_idct1
-#define vp9_idct_idct1 vp9_short_idct4x4llm_1_mmx
-
-#undef vp9_idct_idct16
-#define vp9_idct_idct16 vp9_short_idct4x4llm_mmx
-
-#undef vp9_idct_idct1_scalar_add
-#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_mmx
-
#undef vp9_idct_iwalsh16
#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_mmx
diff --git a/vp9/common/x86/vp9_idctllm_mmx.asm b/vp9/common/x86/vp9_idctllm_mmx.asm
deleted file mode 100644
index 15e81addb..000000000
--- a/vp9/common/x86/vp9_idctllm_mmx.asm
+++ /dev/null
@@ -1,241 +0,0 @@
-;
-; Copyright (c) 2012 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "third_party/x86inc/x86inc.asm"
-
-SECTION_RODATA
-align 16
-x_s1sqr2: times 4 dw 0x8A8C
-align 16
-x_c1sqr2less1: times 4 dw 0x4E7B
-align 16
-pw_16: times 4 dw 16
-
-SECTION .text
-
-
-; /****************************************************************************
-; * Notes:
-; *
-; * This implementation makes use of 16 bit fixed point versions of two
-; * multiply constants:
-; * 1. sqrt(2) * cos (pi/8)
-; * 2. sqrt(2) * sin (pi/8)
-; * Because the first constant is bigger than 1, to maintain the same 16 bit
-; * fixed point precision as the second one, we use a trick of
-; * x * a = x + x*(a-1)
-; * so
-; * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
-; *
-; * For the second constant, the 16 bit version is 35468, which is bigger
-; * than 32768, so in a signed 16 bit multiply it becomes a negative
-; * number.
-; * (x * (unsigned)35468) >> 16 = ((x * (signed)35468) >> 16) + x
-; *
-; **************************************************************************/
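Note: a quick scalar check of the two tricks described above. 0x4E7B = 20091
is sqrt(2)*cos(pi/8) - 1 in Q16 fixed point, and 0x8A8C = 35468 wraps to
-30068 as a signed 16-bit value; pmulhw computes (a * b) >> 16 on signed
16-bit lanes. A minimal sketch:

    #include <stdint.h>

    static int16_t mul_c1sqr2(int16_t x) {  /* x * sqrt(2)*cos(pi/8) */
      /* multiply by (constant - 1) in Q16, then add x back */
      return (int16_t)(((int32_t)x * 20091) >> 16) + x;
    }
    static int16_t mul_s1sqr2(int16_t x) {  /* x * sqrt(2)*sin(pi/8) */
      /* ((x * -30068) >> 16) + x equals (x * 35468) >> 16 */
      return (int16_t)(((int32_t)x * -30068) >> 16) + x;
    }
    /* e.g. x = 1000: mul_c1sqr2 -> 1306 (~1000 * 1.3066),
                      mul_s1sqr2 ->  541 (~1000 * 0.5412) */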
-
-INIT_MMX
-
-;void short_idct4x4llm_mmx(short *input, short *output, int pitch)
-cglobal short_idct4x4llm_mmx, 3,3,0, inp, out, pit
- mova m0, [inpq +0]
- mova m1, [inpq +8]
-
- mova m2, [inpq+16]
- mova m3, [inpq+24]
-
- psubw m0, m2 ; b1= 0-2
- paddw m2, m2 ;
-
- mova m5, m1
- paddw m2, m0 ; a1 =0+2
-
- pmulhw m5, [x_s1sqr2] ;
- paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
-
- mova m7, m3 ;
- pmulhw m7, [x_c1sqr2less1] ;
-
- paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw m7, m5 ; c1
-
- mova m5, m1
- mova m4, m3
-
- pmulhw m5, [x_c1sqr2less1]
- paddw m5, m1
-
- pmulhw m3, [x_s1sqr2]
- paddw m3, m4
-
- paddw m3, m5 ; d1
- mova m6, m2 ; a1
-
- mova m4, m0 ; b1
- paddw m2, m3 ;0
-
- paddw m4, m7 ;1
- psubw m0, m7 ;2
-
- psubw m6, m3 ;3
-
- mova m1, m2 ; 03 02 01 00
- mova m3, m4 ; 23 22 21 20
-
- punpcklwd m1, m0 ; 11 01 10 00
- punpckhwd m2, m0 ; 13 03 12 02
-
- punpcklwd m3, m6 ; 31 21 30 20
- punpckhwd m4, m6 ; 33 23 32 22
-
- mova m0, m1 ; 11 01 10 00
- mova m5, m2 ; 13 03 12 02
-
- punpckldq m0, m3 ; 30 20 10 00
- punpckhdq m1, m3 ; 31 21 11 01
-
- punpckldq m2, m4 ; 32 22 12 02
- punpckhdq m5, m4 ; 33 23 13 03
-
- mova m3, m5 ; 33 23 13 03
-
- psubw m0, m2 ; b1= 0-2
- paddw m2, m2 ;
-
- mova m5, m1
- paddw m2, m0 ; a1 =0+2
-
- pmulhw m5, [x_s1sqr2] ;
- paddw m5, m1 ; ip1 * sin(pi/8) * sqrt(2)
-
- mova m7, m3 ;
- pmulhw m7, [x_c1sqr2less1] ;
-
- paddw m7, m3 ; ip3 * cos(pi/8) * sqrt(2)
- psubw m7, m5 ; c1
-
- mova m5, m1
- mova m4, m3
-
- pmulhw m5, [x_c1sqr2less1]
- paddw m5, m1
-
- pmulhw m3, [x_s1sqr2]
- paddw m3, m4
-
- paddw m3, m5 ; d1
- paddw m0, [pw_16]
-
- paddw m2, [pw_16]
- mova m6, m2 ; a1
-
- mova m4, m0 ; b1
- paddw m2, m3 ;0
-
- paddw m4, m7 ;1
- psubw m0, m7 ;2
-
- psubw m6, m3 ;3
- psraw m2, 5
-
- psraw m0, 5
- psraw m4, 5
-
- psraw m6, 5
-
- mova m1, m2 ; 03 02 01 00
- mova m3, m4 ; 23 22 21 20
-
- punpcklwd m1, m0 ; 11 01 10 00
- punpckhwd m2, m0 ; 13 03 12 02
-
- punpcklwd m3, m6 ; 31 21 30 20
- punpckhwd m4, m6 ; 33 23 32 22
-
- mova m0, m1 ; 11 01 10 00
- mova m5, m2 ; 13 03 12 02
-
- punpckldq m0, m3 ; 30 20 10 00
- punpckhdq m1, m3 ; 31 21 11 01
-
- punpckldq m2, m4 ; 32 22 12 02
- punpckhdq m5, m4 ; 33 23 13 03
-
- mova [outq], m0
-
- mova [outq+r2], m1
- mova [outq+pitq*2], m2
-
- add outq, pitq
- mova [outq+pitq*2], m5
- RET
-
-;void short_idct4x4llm_1_mmx(short *input, short *output, int pitch)
-cglobal short_idct4x4llm_1_mmx,3,3,0,inp,out,pit
- movh m0, [inpq]
- paddw m0, [pw_16]
- psraw m0, 5
- punpcklwd m0, m0
- punpckldq m0, m0
-
- mova [outq], m0
- mova [outq+pitq], m0
-
- mova [outq+pitq*2], m0
- add r1, r2
-
- mova [outq+pitq*2], m0
- RET
-
-
-;void dc_only_idct_add_mmx(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride)
-cglobal dc_only_idct_add_mmx, 4,5,0,in_dc,pred,dst,pit,stride
-%if ARCH_X86_64
- movsxd strideq, dword stridem
-%else
- mov strideq, stridem
-%endif
- pxor m0, m0
-
- movh m5, in_dcq ; dc
- paddw m5, [pw_16]
-
- psraw m5, 5
-
- punpcklwd m5, m5
- punpckldq m5, m5
-
- movh m1, [predq]
- punpcklbw m1, m0
- paddsw m1, m5
- packuswb m1, m0 ; pack and unpack to saturate
- movh [dstq], m1
-
- movh m2, [predq+pitq]
- punpcklbw m2, m0
- paddsw m2, m5
- packuswb m2, m0 ; pack and unpack to saturate
- movh [dstq+strideq], m2
-
- movh m3, [predq+2*pitq]
- punpcklbw m3, m0
- paddsw m3, m5
- packuswb m3, m0 ; pack and unpack to saturate
- movh [dstq+2*strideq], m3
-
- add dstq, strideq
- add predq, pitq
- movh m4, [predq+2*pitq]
- punpcklbw m4, m0
- paddsw m4, m5
- packuswb m4, m0 ; pack and unpack to saturate
- movh [dstq+2*strideq], m4
- RET
-
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
index d319bf2d5..08447a62d 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -26,14 +26,16 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]);
DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]);
- DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
- DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]);
+ DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]);
+
+ DECLARE_ALIGNED(16, unsigned char, ap[8][16]);
+ DECLARE_ALIGNED(16, unsigned char, aq[8][16]);
+
+
__m128i mask, hev, flat, flat2;
const __m128i zero = _mm_set1_epi16(0);
+ const __m128i one = _mm_set1_epi8(1);
__m128i p7, p6, p5;
__m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
__m128i q5, q6, q7;
@@ -58,12 +60,24 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
+
+ _mm_store_si128((__m128i *)ap[4], p4);
+ _mm_store_si128((__m128i *)ap[3], p3);
+ _mm_store_si128((__m128i *)ap[2], p2);
+ _mm_store_si128((__m128i *)ap[1], p1);
+ _mm_store_si128((__m128i *)ap[0], p0);
+ _mm_store_si128((__m128i *)aq[4], q4);
+ _mm_store_si128((__m128i *)aq[3], q3);
+ _mm_store_si128((__m128i *)aq[2], q2);
+ _mm_store_si128((__m128i *)aq[1], q1);
+ _mm_store_si128((__m128i *)aq[0], q0);
+
+
{
const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
_mm_subs_epu8(p0, p1));
const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
_mm_subs_epu8(q0, q1));
- const __m128i one = _mm_set1_epi8(1);
const __m128i fe = _mm_set1_epi8(0xfe);
const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
__m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
@@ -95,246 +109,8 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
mask = _mm_max_epu8(work, mask);
mask = _mm_subs_epu8(mask, limit);
mask = _mm_cmpeq_epi8(mask, zero);
-
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
- _mm_subs_epu8(p0, p2)),
- _mm_or_si128(_mm_subs_epu8(q2, q0),
- _mm_subs_epu8(q0, q2)));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
- _mm_subs_epu8(p0, p3)),
- _mm_or_si128(_mm_subs_epu8(q3, q0),
- _mm_subs_epu8(q0, q3)));
- flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
- _mm_subs_epu8(p0, p4)),
- _mm_or_si128(_mm_subs_epu8(q4, q0),
- _mm_subs_epu8(q0, q4)));
- flat = _mm_max_epu8(work, flat);
- flat = _mm_subs_epu8(flat, one);
- flat = _mm_cmpeq_epi8(flat, zero);
- flat = _mm_and_si128(flat, mask);
}
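Note: per pixel column, the mask built above is the standard VP9 filter-mask
test; subs_epu8 against limit followed by cmpeq with zero turns "maximum
difference <= limit" into a byte-wise compare. A scalar sketch of the
equivalent check (px[0..3] = p0..p3, qx[0..3] = q0..q3):

    #include <stdlib.h>

    static int filter_mask(const unsigned char *px, const unsigned char *qx,
                           int blimit, int limit) {
      int k, d, inner = 0;
      for (k = 0; k < 3; ++k) {
        d = abs(px[k + 1] - px[k]);  /* |p1-p0|, |p2-p1|, |p3-p2| */
        if (d > inner) inner = d;
        d = abs(qx[k + 1] - qx[k]);  /* |q1-q0|, |q2-q1|, |q3-q2| */
        if (d > inner) inner = d;
      }
      return inner <= limit &&
             abs(px[0] - qx[0]) * 2 + abs(px[1] - qx[1]) / 2 <= blimit;
    }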
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // calculate flat2
- p4 = _mm_loadu_si128((__m128i *)(s - 8 * p));
- p3 = _mm_loadu_si128((__m128i *)(s - 7 * p));
- p2 = _mm_loadu_si128((__m128i *)(s - 6 * p));
- p1 = _mm_loadu_si128((__m128i *)(s - 5 * p));
-// p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
-// q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
- q1 = _mm_loadu_si128((__m128i *)(s + 4 * p));
- q2 = _mm_loadu_si128((__m128i *)(s + 5 * p));
- q3 = _mm_loadu_si128((__m128i *)(s + 6 * p));
- q4 = _mm_loadu_si128((__m128i *)(s + 7 * p));
-
- {
- const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
- _mm_subs_epu8(p0, p1));
- const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
- _mm_subs_epu8(q0, q1));
- const __m128i one = _mm_set1_epi8(1);
- __m128i work;
- flat2 = _mm_max_epu8(abs_p1p0, abs_q1q0);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
- _mm_subs_epu8(p0, p2)),
- _mm_or_si128(_mm_subs_epu8(q2, q0),
- _mm_subs_epu8(q0, q2)));
- flat2 = _mm_max_epu8(work, flat2);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
- _mm_subs_epu8(p0, p3)),
- _mm_or_si128(_mm_subs_epu8(q3, q0),
- _mm_subs_epu8(q0, q3)));
- flat2 = _mm_max_epu8(work, flat2);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
- _mm_subs_epu8(p0, p4)),
- _mm_or_si128(_mm_subs_epu8(q4, q0),
- _mm_subs_epu8(q0, q4)));
- flat2 = _mm_max_epu8(work, flat2);
- flat2 = _mm_subs_epu8(flat2, one);
- flat2 = _mm_cmpeq_epi8(flat2, zero);
- flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
- }
- // calculate flat2
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
- {
- const __m128i four = _mm_set1_epi16(4);
- unsigned char *src = s;
- i = 0;
- do {
- __m128i workp_a, workp_b, workp_shft;
- p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
- p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
- p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
- p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
- p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
- q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
- q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
- q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
- q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
- q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
-
- workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1));
- workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
- workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_op2[i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_op1[i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_op0[i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_oq0[i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_oq1[i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
- _mm_storel_epi64((__m128i *)&flat_oq2[i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- src += 8;
- } while (++i < 2);
- }
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- // wide flat
- // TODO(slavarnway): interleave with the flat pixel calculations (see above)
- {
- const __m128i eight = _mm_set1_epi16(8);
- unsigned char *src = s;
- int i = 0;
- do {
- __m128i workp_a, workp_b, workp_shft;
- p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 8 * p)), zero);
- p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 7 * p)), zero);
- p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 6 * p)), zero);
- p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
- p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
- p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
- p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
- p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
- q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
- q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
- q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
- q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
- q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
- q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 5 * p)), zero);
- q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 6 * p)), zero);
- q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 7 * p)), zero);
-
-
- workp_a = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7
- workp_a = _mm_add_epi16(_mm_slli_epi16(p6, 1), workp_a);
- workp_b = _mm_add_epi16(_mm_add_epi16(p5, p4), _mm_add_epi16(p3, p2));
- workp_a = _mm_add_epi16(_mm_add_epi16(p1, p0), workp_a);
- workp_b = _mm_add_epi16(_mm_add_epi16(q0, eight), workp_b);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p5);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p6), q1);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p4);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p5), q2);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p4), q3);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p2);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p3), q4);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p1);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p2), q5);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), p0);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), q6);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
- _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p7), q0);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q7);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p6), q1);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q7);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p5), q2);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q7);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q3);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q2), q7);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q4);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q3), q7);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q5);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q4), q7);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q6);
- workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q5), q7);
- workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 4);
- _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
- _mm_packus_epi16(workp_shft, workp_shft));
-
- src += 8;
- } while (++i < 2);
- }
- // wide flat
- // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
// lp filter
{
const __m128i t4 = _mm_set1_epi8(4);
@@ -345,14 +121,10 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
const __m128i t1 = _mm_set1_epi8(0x1);
const __m128i t7f = _mm_set1_epi8(0x7f);
- __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
- t80);
- __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
- t80);
- __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
- t80);
- __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
- t80);
+ __m128i ps1 = _mm_xor_si128(p1, t80);
+ __m128i ps0 = _mm_xor_si128(p0, t80);
+ __m128i qs0 = _mm_xor_si128(q0, t80);
+ __m128i qs1 = _mm_xor_si128(q1, t80);
__m128i filt;
__m128i work_a;
__m128i filter1, filter2;
@@ -374,6 +146,7 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
work_a = _mm_and_si128(work_a, te0);
filter1 = _mm_and_si128(filter1, t1f);
filter1 = _mm_or_si128(filter1, work_a);
+ qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
/* Filter2 >> 3 */
work_a = _mm_cmpgt_epi8(zero, filter2);
@@ -381,6 +154,7 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
work_a = _mm_and_si128(work_a, te0);
filter2 = _mm_and_si128(filter2, t1f);
filter2 = _mm_or_si128(filter2, work_a);
+ ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
/* filt >> 1 */
filt = _mm_adds_epi8(filter1, t1);
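Note: SSE2 has no per-byte arithmetic shift, so the Filter1/Filter2 ">> 3"
steps above emulate one with a 16-bit logical shift plus sign repair. A
sketch of the operation in isolation, assuming te0 and t1f are
_mm_set1_epi8(0xe0) and _mm_set1_epi8(0x1f) as their names suggest:

    #include <emmintrin.h>

    static __m128i signed_byte_shr3(__m128i x) {
      const __m128i zero = _mm_setzero_si128();
      const __m128i te0 = _mm_set1_epi8((char)0xe0);
      const __m128i t1f = _mm_set1_epi8(0x1f);
      __m128i sign = _mm_cmpgt_epi8(zero, x);  /* 0xff where x < 0 */
      x = _mm_srli_epi16(x, 3);        /* bits leak across byte boundaries */
      x = _mm_and_si128(x, t1f);       /* keep the 5 valid bits per byte */
      sign = _mm_and_si128(sign, te0); /* sign-extend into bits 7..5 */
      return _mm_or_si128(x, sign);
    }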
@@ -389,20 +163,265 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
work_a = _mm_and_si128(work_a, t80);
filt = _mm_and_si128(filt, t7f);
filt = _mm_or_si128(filt, work_a);
-
filt = _mm_andnot_si128(hev, filt);
-
- ps0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
ps1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
- qs0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);
qs1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+ // loopfilter done
+
+ {
+ __m128i work;
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
+ _mm_subs_epu8(p0, p2)),
+ _mm_or_si128(_mm_subs_epu8(q2, q0),
+ _mm_subs_epu8(q0, q2)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
+ _mm_subs_epu8(p0, p3)),
+ _mm_or_si128(_mm_subs_epu8(q3, q0),
+ _mm_subs_epu8(q0, q3)));
+ flat = _mm_max_epu8(work, flat);
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
+ _mm_subs_epu8(p0, p4)),
+ _mm_or_si128(_mm_subs_epu8(q4, q0),
+ _mm_subs_epu8(q0, q4)));
+ flat = _mm_subs_epu8(flat, one);
+ flat = _mm_cmpeq_epi8(flat, zero);
+ flat = _mm_and_si128(flat, mask);
+
+ p5 = _mm_loadu_si128((__m128i *)(s - 6 * p));
+ q5 = _mm_loadu_si128((__m128i *)(s + 5 * p));
+ flat2 = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p5, p0),
+ _mm_subs_epu8(p0, p5)),
+ _mm_or_si128(_mm_subs_epu8(q5, q0),
+ _mm_subs_epu8(q0, q5)));
+ _mm_store_si128((__m128i *)ap[5], p5);
+ _mm_store_si128((__m128i *)aq[5], q5);
+ flat2 = _mm_max_epu8(work, flat2);
+ p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
+ q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p6, p0),
+ _mm_subs_epu8(p0, p6)),
+ _mm_or_si128(_mm_subs_epu8(q6, q0),
+ _mm_subs_epu8(q0, q6)));
+ _mm_store_si128((__m128i *)ap[6], p6);
+ _mm_store_si128((__m128i *)aq[6], q6);
+ flat2 = _mm_max_epu8(work, flat2);
+
+ p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
+ q7 = _mm_loadu_si128((__m128i *)(s + 7 * p));
+ work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p7, p0),
+ _mm_subs_epu8(p0, p7)),
+ _mm_or_si128(_mm_subs_epu8(q7, q0),
+ _mm_subs_epu8(q0, q7)));
+ _mm_store_si128((__m128i *)ap[7], p7);
+ _mm_store_si128((__m128i *)aq[7], q7);
+ flat2 = _mm_max_epu8(work, flat2);
+ flat2 = _mm_subs_epu8(flat2, one);
+ flat2 = _mm_cmpeq_epi8(flat2, zero);
+ flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask
+ }
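Note: subs_epu8(x, 1) followed by cmpeq(x, 0) is an unsigned "x <= 1" test,
so per pixel column the two masks above reduce to the scalar sketch below;
flat gates the 7-tap filter and flat2 the 15-tap one:

    #include <stdlib.h>

    /* p[k] = p_k and q[k] = q_k, with p[0]/q[0] nearest the edge */
    static int all_within_one(const unsigned char *p, const unsigned char *q,
                              int first_tap, int last_tap) {
      int k;
      for (k = first_tap; k <= last_tap; ++k)
        if (abs(p[k] - p[0]) > 1 || abs(q[k] - q[0]) > 1)
          return 0;
      return 1;
    }
    /* flat  ~ all_within_one(p, q, 1, 3) & mask
       flat2 ~ all_within_one(p, q, 4, 7) & flat */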
+
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ // flat and wide flat calculations
+ {
+ const __m128i eight = _mm_set1_epi16(8);
+ const __m128i four = _mm_set1_epi16(4);
+ __m128i temp_flat2 = flat2;
+ unsigned char *src = s;
+ int i = 0;
+ do {
+ __m128i workp_shft;
+ __m128i a, b, c;
+
+ unsigned int off = i * 8;
+ p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero);
+ p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero);
+ p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero);
+ p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero);
+ p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero);
+ p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero);
+ p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero);
+ p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero);
+ q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero);
+ q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero);
+ q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero);
+ q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero);
+ q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero);
+ q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero);
+ q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero);
+ q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero);
+
+ c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7); // p7 * 7
+ c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));
+
+ b = _mm_add_epi16(_mm_add_epi16(p3, four), _mm_add_epi16(p3, p2));
+ a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
+ a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);
+
+ _mm_storel_epi64((__m128i *)&flat_op[2][i*8],
+                       _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
+                                        b));
+
+ c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q1, a);
+ b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
+ _mm_storel_epi64((__m128i *)&flat_op[1][i*8],
+                       _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
+                                        b));
+
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q2, a);
+ b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
+ _mm_storel_epi64((__m128i *)&flat_op[0][i*8],
+                       _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
+                                        b));
+
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q3, a);
+ b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
+ _mm_storel_epi64((__m128i *)&flat_oq[0][i*8],
+                       _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
+                                        b));
+
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ b = _mm_add_epi16(q3, b);
+ b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
+ _mm_storel_epi64((__m128i *)&flat_oq[1][i*8],
+                       _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
+                                        b));
+
+ c = _mm_add_epi16(q4, c);
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ b = _mm_add_epi16(q3, b);
+ b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
+ _mm_storel_epi64((__m128i *)&flat_oq[2][i*8],
+                       _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3),
+                                        b));
+ a = _mm_add_epi16(q5, a);
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q6, a);
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q7, a);
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q7, a);
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q7, a);
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q7, a);
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q7, a);
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q7, a);
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ a = _mm_add_epi16(q7, a);
+ c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
+ workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
+ _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
+ _mm_packus_epi16(workp_shft, workp_shft));
+
+ temp_flat2 = _mm_srli_si128(temp_flat2, 8);
+ src += 8;
+ } while (++i < 2);
+ }
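Note: the a/b/c running sums above avoid re-summing the filter window for
each output; one sample enters and one leaves per step. The 15-tap outputs
they produce match this scalar reference, where x[0..15] = p7..q7, the
window is clamped at the block edges, and the center pixel is counted twice:

    /* outputs op6..oq6 for one pixel column */
    static void wide_flat_filter(const unsigned char *x /* [16] */,
                                 unsigned char *out /* [14] */) {
      int i, j, sum;
      for (i = 1; i <= 14; ++i) {
        sum = x[i];                          /* second copy of the center */
        for (j = i - 7; j <= i + 7; ++j)
          sum += x[j < 0 ? 0 : (j > 15 ? 15 : j)];
        out[i - 1] = (unsigned char)((sum + 8) >> 4);
      }
    }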
+ // wide flat
+ // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+ work_a = _mm_load_si128((__m128i *)ap[2]);
+ p2 = _mm_load_si128((__m128i *)flat_op[2]);
+ work_a = _mm_andnot_si128(flat, work_a);
+ p2 = _mm_and_si128(flat, p2);
+ p2 = _mm_or_si128(work_a, p2);
+ _mm_store_si128((__m128i *)flat_op[2], p2);
+
+ p1 = _mm_load_si128((__m128i *)flat_op[1]);
+ work_a = _mm_andnot_si128(flat, ps1);
+ p1 = _mm_and_si128(flat, p1);
+ p1 = _mm_or_si128(work_a, p1);
+ _mm_store_si128((__m128i *)flat_op[1], p1);
+
+ p0 = _mm_load_si128((__m128i *)flat_op[0]);
+ work_a = _mm_andnot_si128(flat, ps0);
+ p0 = _mm_and_si128(flat, p0);
+ p0 = _mm_or_si128(work_a, p0);
+ _mm_store_si128((__m128i *)flat_op[0], p0);
+
+ q0 = _mm_load_si128((__m128i *)flat_oq[0]);
+ work_a = _mm_andnot_si128(flat, qs0);
+ q0 = _mm_and_si128(flat, q0);
+ q0 = _mm_or_si128(work_a, q0);
+ _mm_store_si128((__m128i *)flat_oq[0], q0);
+
+ q1 = _mm_load_si128((__m128i *)flat_oq[1]);
+ work_a = _mm_andnot_si128(flat, qs1);
+ q1 = _mm_and_si128(flat, q1);
+ q1 = _mm_or_si128(work_a, q1);
+ _mm_store_si128((__m128i *)flat_oq[1], q1);
+
+ work_a = _mm_load_si128((__m128i *)aq[2]);
+ q2 = _mm_load_si128((__m128i *)flat_oq[2]);
+ work_a = _mm_andnot_si128(flat, work_a);
+ q2 = _mm_and_si128(flat, q2);
+ q2 = _mm_or_si128(work_a, q2);
+ _mm_store_si128((__m128i *)flat_oq[2], q2);
// write out op6 - op3
{
unsigned char *dst = (s - 7 * p);
for (i = 6; i > 2; i--) {
__m128i flat2_output;
- work_a = _mm_loadu_si128((__m128i *)dst);
+ work_a = _mm_load_si128((__m128i *)ap[i]);
flat2_output = _mm_load_si128((__m128i *)flat2_op[i]);
work_a = _mm_andnot_si128(flat2, work_a);
flat2_output = _mm_and_si128(flat2, flat2_output);
@@ -412,62 +431,42 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
}
}
- work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));
- p2 = _mm_load_si128((__m128i *)flat_op2);
- work_a = _mm_andnot_si128(flat, work_a);
- p2 = _mm_and_si128(flat, p2);
- work_a = _mm_or_si128(work_a, p2);
+ work_a = _mm_load_si128((__m128i *)flat_op[2]);
p2 = _mm_load_si128((__m128i *)flat2_op[2]);
work_a = _mm_andnot_si128(flat2, work_a);
p2 = _mm_and_si128(flat2, p2);
p2 = _mm_or_si128(work_a, p2);
_mm_storeu_si128((__m128i *)(s - 3 * p), p2);
- p1 = _mm_load_si128((__m128i *)flat_op1);
- work_a = _mm_andnot_si128(flat, ps1);
- p1 = _mm_and_si128(flat, p1);
- work_a = _mm_or_si128(work_a, p1);
+ work_a = _mm_load_si128((__m128i *)flat_op[1]);
p1 = _mm_load_si128((__m128i *)flat2_op[1]);
work_a = _mm_andnot_si128(flat2, work_a);
p1 = _mm_and_si128(flat2, p1);
p1 = _mm_or_si128(work_a, p1);
_mm_storeu_si128((__m128i *)(s - 2 * p), p1);
- p0 = _mm_load_si128((__m128i *)flat_op0);
- work_a = _mm_andnot_si128(flat, ps0);
- p0 = _mm_and_si128(flat, p0);
- work_a = _mm_or_si128(work_a, p0);
+ work_a = _mm_load_si128((__m128i *)flat_op[0]);
p0 = _mm_load_si128((__m128i *)flat2_op[0]);
work_a = _mm_andnot_si128(flat2, work_a);
p0 = _mm_and_si128(flat2, p0);
p0 = _mm_or_si128(work_a, p0);
_mm_storeu_si128((__m128i *)(s - 1 * p), p0);
- q0 = _mm_load_si128((__m128i *)flat_oq0);
- work_a = _mm_andnot_si128(flat, qs0);
- q0 = _mm_and_si128(flat, q0);
- work_a = _mm_or_si128(work_a, q0);
+ work_a = _mm_load_si128((__m128i *)flat_oq[0]);
q0 = _mm_load_si128((__m128i *)flat2_oq[0]);
work_a = _mm_andnot_si128(flat2, work_a);
q0 = _mm_and_si128(flat2, q0);
q0 = _mm_or_si128(work_a, q0);
_mm_storeu_si128((__m128i *)(s - 0 * p), q0);
- q1 = _mm_load_si128((__m128i *)flat_oq1);
- work_a = _mm_andnot_si128(flat, qs1);
- q1 = _mm_and_si128(flat, q1);
- work_a = _mm_or_si128(work_a, q1);
+ work_a = _mm_load_si128((__m128i *)flat_oq[1]);
q1 = _mm_load_si128((__m128i *)flat2_oq[1]);
work_a = _mm_andnot_si128(flat2, work_a);
q1 = _mm_and_si128(flat2, q1);
q1 = _mm_or_si128(work_a, q1);
_mm_storeu_si128((__m128i *)(s + 1 * p), q1);
- work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));
- q2 = _mm_load_si128((__m128i *)flat_oq2);
- work_a = _mm_andnot_si128(flat, work_a);
- q2 = _mm_and_si128(flat, q2);
- work_a = _mm_or_si128(work_a, q2);
+ work_a = _mm_load_si128((__m128i *)flat_oq[2]);
q2 = _mm_load_si128((__m128i *)flat2_oq[2]);
work_a = _mm_andnot_si128(flat2, work_a);
q2 = _mm_and_si128(flat2, q2);
@@ -479,7 +478,7 @@ void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
unsigned char *dst = (s + 3 * p);
for (i = 3; i < 7; i++) {
__m128i flat2_output;
- work_a = _mm_loadu_si128((__m128i *)dst);
+ work_a = _mm_load_si128((__m128i *)aq[i]);
flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]);
work_a = _mm_andnot_si128(flat2, work_a);
flat2_output = _mm_and_si128(flat2, flat2_output);
@@ -504,7 +503,7 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
__m128i mask, hev, flat;
const __m128i zero = _mm_set1_epi16(0);
- __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
+ __m128i p3, p2, p1, p0, q0, q1, q2, q3;
const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
const unsigned int extended_limit = _limit[0] * 0x01010101u;
const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
@@ -515,7 +514,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
const __m128i blimit =
_mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
- p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
@@ -524,7 +522,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
- q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
{
const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
_mm_subs_epu8(p0, p1));
@@ -573,11 +570,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
_mm_or_si128(_mm_subs_epu8(q3, q0),
_mm_subs_epu8(q0, q3)));
flat = _mm_max_epu8(work, flat);
- work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p4, p0),
- _mm_subs_epu8(p0, p4)),
- _mm_or_si128(_mm_subs_epu8(q4, q0),
- _mm_subs_epu8(q0, q4)));
- flat = _mm_max_epu8(work, flat);
flat = _mm_subs_epu8(flat, one);
flat = _mm_cmpeq_epi8(flat, zero);
flat = _mm_and_si128(flat, mask);
@@ -588,7 +580,6 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
int i = 0;
do {
__m128i workp_a, workp_b, workp_shft;
- p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 5 * p)), zero);
p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);
p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
@@ -597,11 +588,10 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
- q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 4 * p)), zero);
- workp_a = _mm_add_epi16(_mm_add_epi16(p4, p3), _mm_add_epi16(p2, p1));
+ workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
- workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p4);
+ workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_storel_epi64((__m128i *)&flat_op2[i*8],
_mm_packus_epi16(workp_shft, workp_shft));
@@ -611,7 +601,7 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
_mm_storel_epi64((__m128i *)&flat_op1[i*8],
_mm_packus_epi16(workp_shft, workp_shft));
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p4), q2);
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_storel_epi64((__m128i *)&flat_op0[i*8],
@@ -623,13 +613,13 @@ void vp9_mbloop_filter_horizontal_edge_sse2(unsigned char *s,
_mm_storel_epi64((__m128i *)&flat_oq0[i*8],
_mm_packus_epi16(workp_shft, workp_shft));
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q4);
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_storel_epi64((__m128i *)&flat_oq1[i*8],
_mm_packus_epi16(workp_shft, workp_shft));
- workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q4);
+ workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);
_mm_storel_epi64((__m128i *)&flat_oq2[i*8],
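Note: with p4/q4 replaced by p3/q3, the flat filter's window now replicates
the outermost supported sample instead of reading past it, so the support
shrinks to p3..q3; e.g. the op2 output above becomes

    op2 = (p3 * 3 + p2 * 2 + p1 + p0 + q0 + 4) >> 3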
@@ -813,8 +803,8 @@ void vp9_mbloop_filter_horizontal_edge_uv_sse2(unsigned char *u,
_mm_loadl_epi64((__m128i *)(src + 120)));
}
-static __inline void transpose8x16(unsigned char *in0, unsigned char *in1,
- int in_p, unsigned char *out, int out_p) {
+static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
+ int in_p, unsigned char *out, int out_p) {
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
__m128i x8, x9, x10, x11, x12, x13, x14, x15;
@@ -879,9 +869,9 @@ static __inline void transpose8x16(unsigned char *in0, unsigned char *in1,
_mm_storeu_si128((__m128i *)(out + 7 * out_p), _mm_unpackhi_epi64(x7, x15));
}
-static __inline void transpose(unsigned char *src[], int in_p,
- unsigned char *dst[], int out_p,
- int num_8x8_to_transpose) {
+static INLINE void transpose(unsigned char *src[], int in_p,
+ unsigned char *dst[], int out_p,
+ int num_8x8_to_transpose) {
int idx8x8 = 0;
__m128i x0, x1, x2, x3, x4, x5, x6, x7;
do {
diff --git a/vp9/common/x86/vp9_postproc_mmx.asm b/vp9/common/x86/vp9_postproc_mmx.asm
index 5f06f0ea0..c2118dbb7 100644
--- a/vp9/common/x86/vp9_postproc_mmx.asm
+++ b/vp9/common/x86/vp9_postproc_mmx.asm
@@ -459,11 +459,11 @@ sym(vp9_mbpost_proc_down_mmx):
%undef flimit2
-;void vp9_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
+;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise,
; unsigned char blackclamp[16],
; unsigned char whiteclamp[16],
; unsigned char bothclamp[16],
-; unsigned int Width, unsigned int Height, int Pitch)
+; unsigned int width, unsigned int height, int pitch)
extern sym(rand)
global sym(vp9_plane_add_noise_mmx) PRIVATE
sym(vp9_plane_add_noise_mmx):
diff --git a/vp9/common/x86/vp9_postproc_sse2.asm b/vp9/common/x86/vp9_postproc_sse2.asm
index 8bbb3794b..858fc99b6 100644
--- a/vp9/common/x86/vp9_postproc_sse2.asm
+++ b/vp9/common/x86/vp9_postproc_sse2.asm
@@ -624,11 +624,11 @@ sym(vp9_mbpost_proc_across_ip_xmm):
%undef flimit4
-;void vp9_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
+;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
; unsigned char blackclamp[16],
; unsigned char whiteclamp[16],
; unsigned char bothclamp[16],
-; unsigned int Width, unsigned int Height, int Pitch)
+; unsigned int width, unsigned int height, int pitch)
extern sym(rand)
global sym(vp9_plane_add_noise_wmt) PRIVATE
sym(vp9_plane_add_noise_wmt):
diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
index b644da64c..32f00e289 100644
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
@@ -21,34 +21,92 @@
;
;*************************************************************************************/
-;void vp9_filter_block1d8_v8_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
+%macro VERTx4 1
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00400040
+
+ movdqa xmm4, [rdx] ;load filters
+ movd xmm5, rcx
+ packsswb xmm4, xmm4
+ pshuflw xmm0, xmm4, 00000000b ;k0_k1
+ pshuflw xmm1, xmm4, 01010101b ;k2_k3
+ pshuflw xmm2, xmm4, 10101010b ;k4_k5
+ pshuflw xmm3, xmm4, 11111111b ;k6_k7
+
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
+
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm1
+ pshufd xmm5, xmm5, 0
+ movdqa k4k5, xmm2
+ movdqa k6k7, xmm3
+ movdqa krd, xmm5
+
+ movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
+
+%if ABI_IS_32BIT=0
+ movsxd r8, DWORD PTR arg(3) ;out_pitch
+%endif
+ mov rax, rsi
+ movsxd rcx, DWORD PTR arg(4) ;output_height
+ add rax, rdx
+
+ lea rbx, [rdx + rdx*4]
+ add rbx, rdx ;pitch * 6
+
+.loop:
+ movd xmm0, [rsi] ;A
+ movd xmm1, [rsi + rdx] ;B
+ movd xmm2, [rsi + rdx * 2] ;C
+ movd xmm3, [rax + rdx * 2] ;D
+ movd xmm4, [rsi + rdx * 4] ;E
+ movd xmm5, [rax + rdx * 4] ;F
+
+ punpcklbw xmm0, xmm1 ;A B
+ punpcklbw xmm2, xmm3 ;C D
+ punpcklbw xmm4, xmm5 ;E F
+
+ movd xmm6, [rsi + rbx] ;G
+ movd xmm7, [rax + rbx] ;H
+
+ pmaddubsw xmm0, k0k1
+ pmaddubsw xmm2, k2k3
+ punpcklbw xmm6, xmm7 ;G H
+ pmaddubsw xmm4, k4k5
+ pmaddubsw xmm6, k6k7
+
+ paddsw xmm0, xmm2
+ paddsw xmm0, krd
+ paddsw xmm4, xmm6
+ paddsw xmm0, xmm4
+ psraw xmm0, 7
+ packuswb xmm0, xmm0
+
+ add rsi, rdx
+ add rax, rdx
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+ movd [rdi], xmm0
+
+%if ABI_IS_32BIT
+ add rdi, DWORD PTR arg(3) ;out_pitch
+%else
+ add rdi, r8
+%endif
+ dec rcx
+ jnz .loop
+%endm
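+
+; The VERTx4 macro above and the VERTx8/VERTx16 macros below compute the
+; same 8-tap vertical result; only the store width differs, and the macro
+; argument selects the pavgb averaging store. A scalar sketch of one output
+; pixel, ignoring the 16-bit saturating intermediate adds (names are
+; illustrative, not from the tree):
+;
+;   #include <stdint.h>
+;
+;   static uint8_t clip_u8(int v) {
+;     return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
+;   }
+;
+;   /* Taps are packed to signed bytes (packsswb) and applied to rows
+;    * A..H; krd holds the 64 rounding constant built from 0x00400040,
+;    * and psraw 7 divides by 128. do_avg mirrors the %1 pavgb path. */
+;   static uint8_t vert8(const uint8_t *src, int pitch,
+;                        const int8_t taps[8], int do_avg, uint8_t dst) {
+;     int k, sum = 0;
+;     for (k = 0; k < 8; ++k)
+;       sum += taps[k] * src[k * pitch];
+;     sum = (sum + 64) >> 7;
+;     if (do_avg)
+;       return (uint8_t)((clip_u8(sum) + dst + 1) >> 1); /* pavgb */
+;     return clip_u8(sum);
+;   }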
+
+%macro VERTx8 1
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
@@ -86,7 +144,7 @@ sym(vp9_filter_block1d8_v8_ssse3):
lea rbx, [rdx + rdx*4]
add rbx, rdx ;pitch * 6
-.vp9_filter_block1d8_v8_ssse3_loop:
+.loop:
movq xmm0, [rsi] ;A
movq xmm1, [rsi + rdx] ;B
movq xmm2, [rsi + rdx * 2] ;C
@@ -117,7 +175,10 @@ sym(vp9_filter_block1d8_v8_ssse3):
add rsi, rdx
add rax, rdx
-
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
movq [rdi], xmm0
%if ABI_IS_32BIT
@@ -126,47 +187,11 @@ sym(vp9_filter_block1d8_v8_ssse3):
add rdi, r8
%endif
dec rcx
- jnz .vp9_filter_block1d8_v8_ssse3_loop
-
- add rsp, 16*5
- pop rsp
- pop rbx
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d16_v8_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- push rsi
- push rdi
- push rbx
- ; end prolog
+ jnz .loop
+%endm
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
+%macro VERTx16 1
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
@@ -204,7 +229,7 @@ sym(vp9_filter_block1d16_v8_ssse3):
lea rbx, [rdx + rdx*4]
add rbx, rdx ;pitch * 6
-.vp9_filter_block1d16_v8_ssse3_loop:
+.loop:
movq xmm0, [rsi] ;A
movq xmm1, [rsi + rdx] ;B
movq xmm2, [rsi + rdx * 2] ;C
@@ -232,7 +257,10 @@ sym(vp9_filter_block1d16_v8_ssse3):
psraw xmm0, 7
packuswb xmm0, xmm0
-
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
movq [rdi], xmm0
movq xmm0, [rsi + 8] ;A
@@ -267,6 +295,10 @@ sym(vp9_filter_block1d16_v8_ssse3):
add rsi, rdx
add rax, rdx
+%if %1
+ movq xmm1, [rdi+8]
+ pavgb xmm0, xmm1
+%endif
movq [rdi+8], xmm0
@@ -276,7 +308,38 @@ sym(vp9_filter_block1d16_v8_ssse3):
add rdi, r8
%endif
dec rcx
- jnz .vp9_filter_block1d16_v8_ssse3_loop
+ jnz .loop
+%endm
+
+;void vp9_filter_block1d4_v8_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
+sym(vp9_filter_block1d4_v8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ VERTx4 0
add rsp, 16*5
pop rsp
@@ -289,24 +352,65 @@ sym(vp9_filter_block1d16_v8_ssse3):
pop rbp
ret
-;void vp9_filter_block1d8_h8_ssse3
+;void vp9_filter_block1d8_v8_ssse3
;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
; short *filter
;)
-global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h8_ssse3):
+global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
+sym(vp9_filter_block1d8_v8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ VERTx8 0
+
+ add rsp, 16*5
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d16_v8_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pitch,
+; unsigned char *output_ptr,
+; unsigned int out_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
+sym(vp9_filter_block1d16_v8_ssse3):
push rbp
mov rbp, rsp
SHADOW_ARGS_TO_STACK 6
SAVE_XMM 7
- GET_GOT rbx
push rsi
push rdi
+ push rbx
; end prolog
ALIGN_STACK 16, rax
@@ -317,6 +421,121 @@ sym(vp9_filter_block1d8_h8_ssse3):
%define k6k7 [rsp + 16*3]
%define krd [rsp + 16*4]
+ VERTx16 0
+
+ add rsp, 16*5
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+
+global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d4_v8_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ VERTx4 1
+
+ add rsp, 16*5
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d8_v8_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ VERTx8 1
+
+ add rsp, 16*5
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d16_v8_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ push rsi
+ push rdi
+ push rbx
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ VERTx16 1
+
+ add rsp, 16*5
+ pop rsp
+ pop rbx
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+%macro HORIZx4 1
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
@@ -340,19 +559,16 @@ sym(vp9_filter_block1d8_h8_ssse3):
pshufd xmm5, xmm5, 0
movdqa k4k5, xmm2
movdqa k6k7, xmm3
-; movdqa krd, xmm5
+ movdqa krd, xmm5
movsxd rax, dword ptr arg(1) ;src_pixels_per_line
movsxd rdx, dword ptr arg(3) ;output_pitch
movsxd rcx, dword ptr arg(4) ;output_height
-.filter_block1d8_h8_rowloop_ssse3:
+.loop:
movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
-; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11
movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
-;note: if we create a k0_k7 filter, we can save a pshufb
-; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
punpcklqdq xmm0, xmm3
movdqa xmm1, xmm0
@@ -371,59 +587,94 @@ sym(vp9_filter_block1d8_h8_ssse3):
pmaddubsw xmm4, k6k7
paddsw xmm0, xmm1
- paddsw xmm0, xmm2
- paddsw xmm0, xmm5
paddsw xmm0, xmm4
+ paddsw xmm0, xmm2
+ paddsw xmm0, krd
psraw xmm0, 7
packuswb xmm0, xmm0
-
+%if %1
+ movd xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
lea rsi, [rsi + rax]
- movq [rdi], xmm0
+ movd [rdi], xmm0
lea rdi, [rdi + rdx]
dec rcx
- jnz .filter_block1d8_h8_rowloop_ssse3
+ jnz .loop
+%endm
- add rsp, 16*5
- pop rsp
+%macro HORIZx8 1
+ mov rdx, arg(5) ;filter ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;output_ptr
+ mov rcx, 0x00400040
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
+ movdqa xmm4, [rdx] ;load filters
+ movd xmm5, rcx
+ packsswb xmm4, xmm4
+ pshuflw xmm0, xmm4, 00000000b ;k0_k1
+ pshuflw xmm1, xmm4, 01010101b ;k2_k3
+ pshuflw xmm2, xmm4, 10101010b ;k4_k5
+ pshuflw xmm3, xmm4, 11111111b ;k6_k7
-;void vp9_filter_block1d16_h8_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; short *filter
-;)
-global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
+ punpcklqdq xmm0, xmm0
+ punpcklqdq xmm1, xmm1
+ punpcklqdq xmm2, xmm2
+ punpcklqdq xmm3, xmm3
- ALIGN_STACK 16, rax
- sub rsp, 16*5
- %define k0k1 [rsp + 16*0]
- %define k2k3 [rsp + 16*1]
- %define k4k5 [rsp + 16*2]
- %define k6k7 [rsp + 16*3]
- %define krd [rsp + 16*4]
+ movdqa k0k1, xmm0
+ movdqa k2k3, xmm1
+ pshufd xmm5, xmm5, 0
+ movdqa k4k5, xmm2
+ movdqa k6k7, xmm3
+ movdqa krd, xmm5
+
+ movsxd rax, dword ptr arg(1) ;src_pixels_per_line
+ movsxd rdx, dword ptr arg(3) ;output_pitch
+ movsxd rcx, dword ptr arg(4) ;output_height
+.loop:
+ movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
+
+ movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
+ punpcklqdq xmm0, xmm3
+
+ movdqa xmm1, xmm0
+ pshufb xmm0, [GLOBAL(shuf_t0t1)]
+ pmaddubsw xmm0, k0k1
+
+ movdqa xmm2, xmm1
+ pshufb xmm1, [GLOBAL(shuf_t2t3)]
+ pmaddubsw xmm1, k2k3
+
+ movdqa xmm4, xmm2
+ pshufb xmm2, [GLOBAL(shuf_t4t5)]
+ pmaddubsw xmm2, k4k5
+
+ pshufb xmm4, [GLOBAL(shuf_t6t7)]
+ pmaddubsw xmm4, k6k7
+
+ paddsw xmm0, xmm1
+ paddsw xmm0, xmm4
+ paddsw xmm0, xmm2
+ paddsw xmm0, krd
+ psraw xmm0, 7
+ packuswb xmm0, xmm0
+%if %1
+ movq xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
+
+ lea rsi, [rsi + rax]
+ movq [rdi], xmm0
+
+ lea rdi, [rdi + rdx]
+ dec rcx
+ jnz .loop
+%endm
+
+%macro HORIZx16 1
mov rdx, arg(5) ;filter ptr
mov rsi, arg(0) ;src_ptr
mov rdi, arg(2) ;output_ptr
@@ -453,13 +704,10 @@ sym(vp9_filter_block1d16_h8_ssse3):
movsxd rdx, dword ptr arg(3) ;output_pitch
movsxd rcx, dword ptr arg(4) ;output_height
-.filter_block1d16_h8_rowloop_ssse3:
+.loop:
movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4
-; movq xmm3, [rsi + 4] ; 4 5 6 7 8 9 10 11
movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12
-;note: if we create a k0_k7 filter, we can save a pshufb
-; punpcklbw xmm0, xmm3 ; -3 4 -2 5 -1 6 0 7 1 8 2 9 3 10 4 11
punpcklqdq xmm0, xmm3
movdqa xmm1, xmm0
@@ -486,10 +734,7 @@ sym(vp9_filter_block1d16_h8_ssse3):
movq xmm3, [rsi + 5]
-; movq xmm7, [rsi + 12]
movq xmm7, [rsi + 13]
-;note: same as above
-; punpcklbw xmm3, xmm7
punpcklqdq xmm3, xmm7
movdqa xmm1, xmm3
@@ -508,19 +753,54 @@ sym(vp9_filter_block1d16_h8_ssse3):
pmaddubsw xmm4, k6k7
paddsw xmm3, xmm1
+ paddsw xmm3, xmm4
paddsw xmm3, xmm2
paddsw xmm3, krd
- paddsw xmm3, xmm4
psraw xmm3, 7
packuswb xmm3, xmm3
punpcklqdq xmm0, xmm3
+%if %1
+ movdqa xmm1, [rdi]
+ pavgb xmm0, xmm1
+%endif
lea rsi, [rsi + rax]
movdqa [rdi], xmm0
lea rdi, [rdi + rdx]
dec rcx
- jnz .filter_block1d16_h8_rowloop_ssse3
+ jnz .loop
+%endm
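+
+; The horizontal macros rely on pmaddubsw, which multiplies unsigned
+; pixels by the signed byte taps and adds adjacent products, so the
+; eight taps are consumed as the four pairs gathered by the
+; shuf_t0t1..shuf_t6t7 tables. A scalar sketch of one output pixel
+; (illustrative names; the taps span src[-3]..src[4] to match the
+; [rsi - 3] load above):
+;
+;   #include <stdint.h>
+;
+;   /* Each parenthesized term is one pmaddubsw pair; the sum takes the
+;    * krd round value (64) and psraw 7, then packuswb saturation. */
+;   static uint8_t horiz8(const uint8_t *src, const int8_t t[8]) {
+;     int sum = (src[-3] * t[0] + src[-2] * t[1])
+;             + (src[-1] * t[2] + src[ 0] * t[3])
+;             + (src[ 1] * t[4] + src[ 2] * t[5])
+;             + (src[ 3] * t[6] + src[ 4] * t[7]);
+;     sum = (sum + 64) >> 7;
+;     return (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
+;   }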
+
+;void vp9_filter_block1d4_h8_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ HORIZx4 0
add rsp, 16*5
pop rsp
@@ -534,7 +814,188 @@ sym(vp9_filter_block1d16_h8_ssse3):
pop rbp
ret
+;void vp9_filter_block1d8_h8_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
+sym(vp9_filter_block1d8_h8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ HORIZx8 0
+
+ add rsp, 16*5
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+;void vp9_filter_block1d16_h8_ssse3
+;(
+; unsigned char *src_ptr,
+; unsigned int src_pixels_per_line,
+; unsigned char *output_ptr,
+; unsigned int output_pitch,
+; unsigned int output_height,
+; short *filter
+;)
+global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
+sym(vp9_filter_block1d16_h8_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ HORIZx16 0
+ add rsp, 16*5
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d4_h8_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ HORIZx4 1
+
+ add rsp, 16*5
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d8_h8_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ HORIZx8 1
+
+ add rsp, 16*5
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
+sym(vp9_filter_block1d16_h8_avg_ssse3):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ SAVE_XMM 7
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ ALIGN_STACK 16, rax
+ sub rsp, 16*5
+ %define k0k1 [rsp + 16*0]
+ %define k2k3 [rsp + 16*1]
+ %define k4k5 [rsp + 16*2]
+ %define k6k7 [rsp + 16*3]
+ %define krd [rsp + 16*4]
+
+ HORIZx16 1
+
+ add rsp, 16*5
+ pop rsp
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+ ret
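+
+; Each width now has a plain and an _avg entry point that differ only in
+; the macro argument. The following is a hypothetical sketch of how a C
+; stub could pick among them, using the argument list from the headers
+; above; it is an assumption about wrapper shape, not code from
+; vp9_asm_stubs.c:
+;
+;   /* Hypothetical dispatch sketch; DECL and pick_horiz are
+;    * illustrative, only the symbol names come from this file. */
+;   #define DECL(fn) \
+;     void fn(unsigned char *src_ptr, unsigned int src_pixels_per_line, \
+;             unsigned char *output_ptr, unsigned int output_pitch, \
+;             unsigned int output_height, short *filter)
+;   DECL(vp9_filter_block1d4_h8_ssse3);
+;   DECL(vp9_filter_block1d4_h8_avg_ssse3);
+;   DECL(vp9_filter_block1d8_h8_ssse3);
+;   DECL(vp9_filter_block1d8_h8_avg_ssse3);
+;   DECL(vp9_filter_block1d16_h8_ssse3);
+;   DECL(vp9_filter_block1d16_h8_avg_ssse3);
+;
+;   typedef void (*subpix_fn)(unsigned char *, unsigned int,
+;                             unsigned char *, unsigned int,
+;                             unsigned int, short *);
+;
+;   static subpix_fn pick_horiz(int width, int avg) {
+;     if (width == 4)
+;       return avg ? vp9_filter_block1d4_h8_avg_ssse3
+;                  : vp9_filter_block1d4_h8_ssse3;
+;     if (width == 8)
+;       return avg ? vp9_filter_block1d8_h8_avg_ssse3
+;                  : vp9_filter_block1d8_h8_ssse3;
+;     return avg ? vp9_filter_block1d16_h8_avg_ssse3
+;                : vp9_filter_block1d16_h8_ssse3;
+;   }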
SECTION_RODATA
align 16
shuf_t0t1:
diff --git a/vp9/common/x86/vp9_subpixel_mmx.asm b/vp9/common/x86/vp9_subpixel_mmx.asm
deleted file mode 100644
index dee29b8fb..000000000
--- a/vp9/common/x86/vp9_subpixel_mmx.asm
+++ /dev/null
@@ -1,268 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define vp9_filter_weight 128
-%define VP9_FILTER_SHIFT 7
-
-
-;void vp9_filter_block1d_h6_mmx
-;(
-; unsigned char *src_ptr,
-; unsigned short *output_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; short * vp9_filter
-;)
-global sym(vp9_filter_block1d_h6_mmx) PRIVATE
-sym(vp9_filter_block1d_h6_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdx, arg(6) ;vp9_filter
-
- movq mm1, [rdx + 16] ; do both the negative taps first!!!
- movq mm2, [rdx + 32] ;
- movq mm6, [rdx + 48] ;
- movq mm7, [rdx + 64] ;
-
- mov rdi, arg(1) ;output_ptr
- mov rsi, arg(0) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(5) ;output_width ; destination pitch?
- pxor mm0, mm0 ; mm0 = 00000000
-
-.nextrow:
- movq mm3, [rsi-2] ; mm3 = p-2..p5
- movq mm4, mm3 ; mm4 = p-2..p5
- psrlq mm3, 8 ; mm3 = p-1..p5
- punpcklbw mm3, mm0 ; mm3 = p-1..p2
- pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
-
- movq mm5, mm4 ; mm5 = p-2..p5
- punpckhbw mm4, mm0 ; mm5 = p2..p5
- pmullw mm4, mm7 ; mm5 *= kernel 4 modifiers
- paddsw mm3, mm4 ; mm3 += mm5
-
- movq mm4, mm5 ; mm4 = p-2..p5;
- psrlq mm5, 16 ; mm5 = p0..p5;
- punpcklbw mm5, mm0 ; mm5 = p0..p3
- pmullw mm5, mm2 ; mm5 *= kernel 2 modifiers
- paddsw mm3, mm5 ; mm3 += mm5
-
- movq mm5, mm4 ; mm5 = p-2..p5
- psrlq mm4, 24 ; mm4 = p1..p5
- punpcklbw mm4, mm0 ; mm4 = p1..p4
- pmullw mm4, mm6 ; mm5 *= kernel 3 modifiers
- paddsw mm3, mm4 ; mm3 += mm5
-
- ; do outer positive taps
- movd mm4, [rsi+3]
- punpcklbw mm4, mm0 ; mm5 = p3..p6
- pmullw mm4, [rdx+80] ; mm5 *= kernel 0 modifiers
- paddsw mm3, mm4 ; mm3 += mm5
-
- punpcklbw mm5, mm0 ; mm5 = p-2..p1
- pmullw mm5, [rdx] ; mm5 *= kernel 5 modifiers
- paddsw mm3, mm5 ; mm3 += mm5
-
- paddsw mm3, [GLOBAL(rd)] ; mm3 += round value
- psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack and unpack to saturate
- punpcklbw mm3, mm0 ;
-
- movq [rdi], mm3 ; store the results in the destination
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(2) ;src_pixels_per_line ; next line
- add rdi, rax;
-%else
- movsxd r8, dword ptr arg(2) ;src_pixels_per_line
- add rdi, rax;
-
- add rsi, r8 ; next line
-%endif
-
- dec rcx ; decrement count
- jnz .nextrow ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1dc_v6_mmx
-;(
-; short *src_ptr,
-; unsigned char *output_ptr,
-; int output_pitch,
-; unsigned int pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; short * vp9_filter
-;)
-global sym(vp9_filter_block1dc_v6_mmx) PRIVATE
-sym(vp9_filter_block1dc_v6_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movq mm5, [GLOBAL(rd)]
- push rbx
- mov rbx, arg(7) ;vp9_filter
- movq mm1, [rbx + 16] ; do both the negative taps first!!!
- movq mm2, [rbx + 32] ;
- movq mm6, [rbx + 48] ;
- movq mm7, [rbx + 64] ;
-
- movsxd rdx, dword ptr arg(3) ;pixels_per_line
- mov rdi, arg(1) ;output_ptr
- mov rsi, arg(0) ;src_ptr
- sub rsi, rdx
- sub rsi, rdx
- movsxd rcx, DWORD PTR arg(5) ;output_height
- movsxd rax, DWORD PTR arg(2) ;output_pitch ; destination pitch?
- pxor mm0, mm0 ; mm0 = 00000000
-
-
-.nextrow_cv:
- movq mm3, [rsi+rdx] ; mm3 = p0..p8 = row -1
- pmullw mm3, mm1 ; mm3 *= kernel 1 modifiers.
-
-
- movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 2
- pmullw mm4, mm7 ; mm4 *= kernel 4 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
-
- movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 0
- pmullw mm4, mm2 ; mm4 *= kernel 2 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
-
- movq mm4, [rsi] ; mm4 = p0..p3 = row -2
- pmullw mm4, [rbx] ; mm4 *= kernel 0 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
-
-
- add rsi, rdx ; move source forward 1 line to avoid 3 * pitch
- movq mm4, [rsi + 2*rdx] ; mm4 = p0..p3 = row 1
- pmullw mm4, mm6 ; mm4 *= kernel 3 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
-
- movq mm4, [rsi + 4*rdx] ; mm4 = p0..p3 = row 3
- pmullw mm4, [rbx +80] ; mm4 *= kernel 3 modifiers.
- paddsw mm3, mm4 ; mm3 += mm4
-
-
- paddsw mm3, mm5 ; mm3 += round value
- psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128
- packuswb mm3, mm0 ; pack and saturate
-
- movd [rdi],mm3 ; store the results in the destination
- ; the subsequent iterations repeat 3 out of 4 of these reads. Since the
- ; recon block should be in cache this shouldn't cost much. Its obviously
- ; avoidable!!!.
- lea rdi, [rdi+rax] ;
- dec rcx ; decrement count
- jnz .nextrow_cv ; next row
-
- pop rbx
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-rd:
- times 4 dw 0x40
-
-align 16
-global HIDDEN_DATA(sym(vp9_six_tap_mmx))
-sym(vp9_six_tap_mmx):
- times 8 dw 0
- times 8 dw 0
- times 8 dw 128
- times 8 dw 0
- times 8 dw 0
- times 8 dw 0
-
- times 8 dw 0
- times 8 dw -6
- times 8 dw 123
- times 8 dw 12
- times 8 dw -1
- times 8 dw 0
-
- times 8 dw 2
- times 8 dw -11
- times 8 dw 108
- times 8 dw 36
- times 8 dw -8
- times 8 dw 1
-
- times 8 dw 0
- times 8 dw -9
- times 8 dw 93
- times 8 dw 50
- times 8 dw -6
- times 8 dw 0
-
- times 8 dw 3
- times 8 dw -16
- times 8 dw 77
- times 8 dw 77
- times 8 dw -16
- times 8 dw 3
-
- times 8 dw 0
- times 8 dw -6
- times 8 dw 50
- times 8 dw 93
- times 8 dw -9
- times 8 dw 0
-
- times 8 dw 1
- times 8 dw -8
- times 8 dw 36
- times 8 dw 108
- times 8 dw -11
- times 8 dw 2
-
- times 8 dw 0
- times 8 dw -1
- times 8 dw 12
- times 8 dw 123
- times 8 dw -6
- times 8 dw 0
-
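
For reference, the 6-tap paths removed in this file (and the SSE2 ones below) all reduce to the same scalar form; rows of the vp9_six_tap_mmx table above supply the taps for each subpel position, and each row sums to 128. A sketch, following the Tap 1..Tap 6 comments in the deleted code (names illustrative):

#include <stdint.h>

/* Scalar form of the deleted 6-tap filters: taps h[0]..h[5] apply to
 * x[-2]..x[3], the sum takes the rd round value (64) and psraw 7. */
static uint8_t six_tap(const uint8_t *x, const int16_t h[6]) {
  int sum = x[-2] * h[0] + x[-1] * h[1] + x[0] * h[2] +
            x[ 1] * h[3] + x[ 2] * h[4] + x[3] * h[5];
  sum = (sum + 64) >> 7;
  return (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
}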
diff --git a/vp9/common/x86/vp9_subpixel_sse2.asm b/vp9/common/x86/vp9_subpixel_sse2.asm
deleted file mode 100644
index b0c4f1282..000000000
--- a/vp9/common/x86/vp9_subpixel_sse2.asm
+++ /dev/null
@@ -1,1372 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT 7
-
-
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;*************************************************************************************/
-;void vp9_filter_block1d8_h6_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned short *output_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; short *vp9_filter
-;)
-global sym(vp9_filter_block1d8_h6_sse2) PRIVATE
-sym(vp9_filter_block1d8_h6_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdx, arg(6) ;vp9_filter
- mov rsi, arg(0) ;src_ptr
-
- mov rdi, arg(1) ;output_ptr
-
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(5) ;output_width
-%endif
- pxor xmm0, xmm0 ; clear xmm0 for unpack
-
-.filter_block1d8_h6_rowloop:
- movq xmm3, MMWORD PTR [rsi - 2]
- movq xmm1, MMWORD PTR [rsi + 6]
-
- prefetcht2 [rsi+rax-2]
-
- pslldq xmm1, 8
- por xmm1, xmm3
-
- movdqa xmm4, xmm1
- movdqa xmm5, xmm1
-
- movdqa xmm6, xmm1
- movdqa xmm7, xmm1
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm1
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0
- punpcklbw xmm4, xmm0
-
- movdqa XMMWORD Ptr [rdi], xmm4
- lea rsi, [rsi + rax]
-
-%if ABI_IS_32BIT
- add rdi, DWORD Ptr arg(5) ;[output_width]
-%else
- add rdi, r8
-%endif
- dec rcx
-
- jnz .filter_block1d8_h6_rowloop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1d16_h6_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned short *output_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; short *vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;*************************************************************************************/
-global sym(vp9_filter_block1d16_h6_sse2) PRIVATE
-sym(vp9_filter_block1d16_h6_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 7
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdx, arg(6) ;vp9_filter
- mov rsi, arg(0) ;src_ptr
-
- mov rdi, arg(1) ;output_ptr
-
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(5) ;output_width
-%endif
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
-
-.filter_block1d16_h6_sse2_rowloop:
- movq xmm3, MMWORD PTR [rsi - 2]
- movq xmm1, MMWORD PTR [rsi + 6]
-
- movq xmm2, MMWORD PTR [rsi +14]
- pslldq xmm2, 8
-
- por xmm2, xmm1
- prefetcht2 [rsi+rax-2]
-
- pslldq xmm1, 8
- por xmm1, xmm3
-
- movdqa xmm4, xmm1
- movdqa xmm5, xmm1
-
- movdqa xmm6, xmm1
- movdqa xmm7, xmm1
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm1
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0
- punpcklbw xmm4, xmm0
-
- movdqa XMMWORD Ptr [rdi], xmm4
-
- movdqa xmm3, xmm2
- movdqa xmm4, xmm2
-
- movdqa xmm5, xmm2
- movdqa xmm6, xmm2
-
- movdqa xmm7, xmm2
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm2
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0
- punpcklbw xmm4, xmm0
-
- movdqa XMMWORD Ptr [rdi+16], xmm4
-
- lea rsi, [rsi + rax]
-%if ABI_IS_32BIT
- add rdi, DWORD Ptr arg(5) ;[output_width]
-%else
- add rdi, r8
-%endif
-
- dec rcx
- jnz .filter_block1d16_h6_sse2_rowloop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1d8_v6_sse2
-;(
-; short *src_ptr,
-; unsigned char *output_ptr,
-; int dst_ptich,
-; unsigned int pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; short * vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The
-; input pixel array has output_height rows.
-;*************************************************************************************/
-global sym(vp9_filter_block1d8_v6_sse2) PRIVATE
-sym(vp9_filter_block1d8_v6_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rax, arg(7) ;vp9_filter
- movsxd rdx, dword ptr arg(3) ;pixels_per_line
-
- mov rdi, arg(1) ;output_ptr
- mov rsi, arg(0) ;src_ptr
-
- sub rsi, rdx
- sub rsi, rdx
-
- movsxd rcx, DWORD PTR arg(5) ;[output_height]
- pxor xmm0, xmm0 ; clear xmm0
-
- movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(2) ; dst_ptich
-%endif
-
-.vp9_filter_block1d8_v6_sse2_loop:
- movdqa xmm1, XMMWORD PTR [rsi]
- pmullw xmm1, [rax]
-
- movdqa xmm2, XMMWORD PTR [rsi + rdx]
- pmullw xmm2, [rax + 16]
-
- movdqa xmm3, XMMWORD PTR [rsi + rdx * 2]
- pmullw xmm3, [rax + 32]
-
- movdqa xmm5, XMMWORD PTR [rsi + rdx * 4]
- pmullw xmm5, [rax + 64]
-
- add rsi, rdx
- movdqa xmm4, XMMWORD PTR [rsi + rdx * 2]
-
- pmullw xmm4, [rax + 48]
- movdqa xmm6, XMMWORD PTR [rsi + rdx * 4]
-
- pmullw xmm6, [rax + 80]
-
- paddsw xmm2, xmm5
- paddsw xmm2, xmm3
-
- paddsw xmm2, xmm1
- paddsw xmm2, xmm4
-
- paddsw xmm2, xmm6
- paddsw xmm2, xmm7
-
- psraw xmm2, 7
- packuswb xmm2, xmm0 ; pack and saturate
-
- movq QWORD PTR [rdi], xmm2 ; store the results in the destination
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(2) ;[dst_ptich]
-%else
- add rdi, r8
-%endif
- dec rcx ; decrement count
- jnz .vp9_filter_block1d8_v6_sse2_loop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1d16_v6_sse2
-;(
-; unsigned short *src_ptr,
-; unsigned char *output_ptr,
-; int dst_ptich,
-; unsigned int pixels_per_line,
-; unsigned int pixel_step,
-; unsigned int output_height,
-; unsigned int output_width,
-; const short *vp9_filter
-;)
-;/************************************************************************************
-; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The
-; input pixel array has output_height rows.
-;*************************************************************************************/
-global sym(vp9_filter_block1d16_v6_sse2) PRIVATE
-sym(vp9_filter_block1d16_v6_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rax, arg(7) ;vp9_filter
- movsxd rdx, dword ptr arg(3) ;pixels_per_line
-
- mov rdi, arg(1) ;output_ptr
- mov rsi, arg(0) ;src_ptr
-
- sub rsi, rdx
- sub rsi, rdx
-
- movsxd rcx, DWORD PTR arg(5) ;[output_height]
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(2) ; dst_ptich
-%endif
-
-.vp9_filter_block1d16_v6_sse2_loop:
-; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order.
- movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2
- movdqa xmm2, XMMWORD PTR [rsi + rdx + 16]
- pmullw xmm1, [rax + 16]
- pmullw xmm2, [rax + 16]
-
- movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5
- movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16]
- pmullw xmm3, [rax + 64]
- pmullw xmm4, [rax + 64]
-
- movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3
- movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16]
- pmullw xmm5, [rax + 32]
- pmullw xmm6, [rax + 32]
-
- movdqa xmm7, XMMWORD PTR [rsi] ; line 1
- movdqa xmm0, XMMWORD PTR [rsi + 16]
- pmullw xmm7, [rax]
- pmullw xmm0, [rax]
-
- paddsw xmm1, xmm3
- paddsw xmm2, xmm4
- paddsw xmm1, xmm5
- paddsw xmm2, xmm6
- paddsw xmm1, xmm7
- paddsw xmm2, xmm0
-
- add rsi, rdx
-
- movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4
- movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16]
- pmullw xmm3, [rax + 48]
- pmullw xmm4, [rax + 48]
-
- movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6
- movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16]
- pmullw xmm5, [rax + 80]
- pmullw xmm6, [rax + 80]
-
- movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
- pxor xmm0, xmm0 ; clear xmm0
-
- paddsw xmm1, xmm3
- paddsw xmm2, xmm4
- paddsw xmm1, xmm5
- paddsw xmm2, xmm6
-
- paddsw xmm1, xmm7
- paddsw xmm2, xmm7
-
- psraw xmm1, 7
- psraw xmm2, 7
-
- packuswb xmm1, xmm2 ; pack and saturate
- movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(2) ;[dst_ptich]
-%else
- add rdi, r8
-%endif
- dec rcx ; decrement count
- jnz .vp9_filter_block1d16_v6_sse2_loop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1d8_h6_only_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; int dst_ptich,
-; unsigned int output_height,
-; const short *vp9_filter
-;)
-; First-pass filter only when yoffset==0
-global sym(vp9_filter_block1d8_h6_only_sse2) PRIVATE
-sym(vp9_filter_block1d8_h6_only_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdx, arg(5) ;vp9_filter
- mov rsi, arg(0) ;src_ptr
-
- mov rdi, arg(2) ;output_ptr
-
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(3) ;dst_ptich
-%endif
- pxor xmm0, xmm0 ; clear xmm0 for unpack
-
-.filter_block1d8_h6_only_rowloop:
- movq xmm3, MMWORD PTR [rsi - 2]
- movq xmm1, MMWORD PTR [rsi + 6]
-
- prefetcht2 [rsi+rax-2]
-
- pslldq xmm1, 8
- por xmm1, xmm3
-
- movdqa xmm4, xmm1
- movdqa xmm5, xmm1
-
- movdqa xmm6, xmm1
- movdqa xmm7, xmm1
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm1
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0
-
- movq QWORD PTR [rdi], xmm4 ; store the results in the destination
- lea rsi, [rsi + rax]
-
-%if ABI_IS_32BIT
- add rdi, DWORD Ptr arg(3) ;dst_ptich
-%else
- add rdi, r8
-%endif
- dec rcx
-
- jnz .filter_block1d8_h6_only_rowloop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1d16_h6_only_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; int dst_ptich,
-; unsigned int output_height,
-; const short *vp9_filter
-;)
-; First-pass filter only when yoffset==0
-global sym(vp9_filter_block1d16_h6_only_sse2) PRIVATE
-sym(vp9_filter_block1d16_h6_only_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdx, arg(5) ;vp9_filter
- mov rsi, arg(0) ;src_ptr
-
- mov rdi, arg(2) ;output_ptr
-
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(3) ;dst_ptich
-%endif
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
-
-.filter_block1d16_h6_only_sse2_rowloop:
- movq xmm3, MMWORD PTR [rsi - 2]
- movq xmm1, MMWORD PTR [rsi + 6]
-
- movq xmm2, MMWORD PTR [rsi +14]
- pslldq xmm2, 8
-
- por xmm2, xmm1
- prefetcht2 [rsi+rax-2]
-
- pslldq xmm1, 8
- por xmm1, xmm3
-
- movdqa xmm4, xmm1
- movdqa xmm5, xmm1
-
- movdqa xmm6, xmm1
- movdqa xmm7, xmm1
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm1
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0 ; lower 8 bytes
-
- movq QWORD Ptr [rdi], xmm4 ; store the results in the destination
-
- movdqa xmm3, xmm2
- movdqa xmm4, xmm2
-
- movdqa xmm5, xmm2
- movdqa xmm6, xmm2
-
- movdqa xmm7, xmm2
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1
-
- pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1
- punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1
-
- psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00
- pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2
-
- punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00
- psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01
-
- pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3
-
- punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01
- psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02
-
- pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4
-
- punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02
- psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03
-
- pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5
-
- punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03
- pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6
-
- paddsw xmm4, xmm7
- paddsw xmm4, xmm5
-
- paddsw xmm4, xmm3
- paddsw xmm4, xmm6
-
- paddsw xmm4, xmm2
- paddsw xmm4, [GLOBAL(rd)]
-
- psraw xmm4, 7
-
- packuswb xmm4, xmm0 ; higher 8 bytes
-
- movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination
-
- lea rsi, [rsi + rax]
-%if ABI_IS_32BIT
- add rdi, DWORD Ptr arg(3) ;dst_ptich
-%else
- add rdi, r8
-%endif
-
- dec rcx
- jnz .filter_block1d16_h6_only_sse2_rowloop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_filter_block1d8_v6_only_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; int dst_ptich,
-; unsigned int output_height,
-; const short *vp9_filter
-;)
-; Second-pass filter only when xoffset==0
-global sym(vp9_filter_block1d8_v6_only_sse2) PRIVATE
-sym(vp9_filter_block1d8_v6_only_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
-
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
-
- mov rax, arg(5) ;vp9_filter
-
- pxor xmm0, xmm0 ; clear xmm0
-
- movdqa xmm7, XMMWORD PTR [GLOBAL(rd)]
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(3) ; dst_ptich
-%endif
-
-.vp9_filter_block1d8_v6_only_sse2_loop:
- movq xmm1, MMWORD PTR [rsi]
- movq xmm2, MMWORD PTR [rsi + rdx]
- movq xmm3, MMWORD PTR [rsi + rdx * 2]
- movq xmm5, MMWORD PTR [rsi + rdx * 4]
- add rsi, rdx
- movq xmm4, MMWORD PTR [rsi + rdx * 2]
- movq xmm6, MMWORD PTR [rsi + rdx * 4]
-
- punpcklbw xmm1, xmm0
- pmullw xmm1, [rax]
-
- punpcklbw xmm2, xmm0
- pmullw xmm2, [rax + 16]
-
- punpcklbw xmm3, xmm0
- pmullw xmm3, [rax + 32]
-
- punpcklbw xmm5, xmm0
- pmullw xmm5, [rax + 64]
-
- punpcklbw xmm4, xmm0
- pmullw xmm4, [rax + 48]
-
- punpcklbw xmm6, xmm0
- pmullw xmm6, [rax + 80]
-
- paddsw xmm2, xmm5
- paddsw xmm2, xmm3
-
- paddsw xmm2, xmm1
- paddsw xmm2, xmm4
-
- paddsw xmm2, xmm6
- paddsw xmm2, xmm7
-
- psraw xmm2, 7
- packuswb xmm2, xmm0 ; pack and saturate
-
- movq QWORD PTR [rdi], xmm2 ; store the results in the destination
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[dst_ptich]
-%else
- add rdi, r8
-%endif
- dec rcx ; decrement count
- jnz .vp9_filter_block1d8_v6_only_sse2_loop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_unpack_block1d16_h6_sse2
-;(
-; unsigned char *src_ptr,
-; unsigned short *output_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned int output_height,
-; unsigned int output_width
-;)
-global sym(vp9_unpack_block1d16_h6_sse2) PRIVATE
-sym(vp9_unpack_block1d16_h6_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(1) ;output_ptr
-
- movsxd rcx, dword ptr arg(3) ;output_height
- movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source
-
- pxor xmm0, xmm0 ; clear xmm0 for unpack
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Source
-%endif
-
-.unpack_block1d16_h6_sse2_rowloop:
- movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 -2
- movq xmm3, MMWORD PTR [rsi+8] ; make copy of xmm1
-
- punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx01 xx-1 xx-2
- punpcklbw xmm1, xmm0
-
- movdqa XMMWORD Ptr [rdi], xmm1
- movdqa XMMWORD Ptr [rdi + 16], xmm3
-
- lea rsi, [rsi + rax]
-%if ABI_IS_32BIT
- add rdi, DWORD Ptr arg(4) ;[output_width]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .unpack_block1d16_h6_sse2_rowloop ; next row
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_bilinear_predict16x16_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-extern sym(vp9_bilinear_filters_mmx)
-global sym(vp9_bilinear_predict16x16_sse2) PRIVATE
-sym(vp9_bilinear_predict16x16_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ;const short *HFilter = bilinear_filters_mmx[xoffset]
- ;const short *VFilter = bilinear_filters_mmx[yoffset]
-
- lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))]
- movsxd rax, dword ptr arg(2) ;xoffset
-
- cmp rax, 0 ;skip first_pass filter if xoffset=0
- je .b16x16_sp_only
-
- shl rax, 5
- add rax, rcx ;HFilter
-
- mov rdi, arg(4) ;dst_ptr
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, dword ptr arg(5) ;dst_pitch
-
- movdqa xmm1, [rax]
- movdqa xmm2, [rax+16]
-
- movsxd rax, dword ptr arg(3) ;yoffset
-
- cmp rax, 0 ;skip second_pass filter if yoffset=0
- je .b16x16_fp_only
-
- shl rax, 5
- add rax, rcx ;VFilter
-
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
-
- pxor xmm0, xmm0
-
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(5) ;dst_pitch
-%endif
- ; get the first horizontal line done
- movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movdqa xmm4, xmm3 ; make a copy of current line
-
- punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
- punpckhbw xmm4, xmm0
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm1
-
- movdqu xmm5, [rsi+1]
- movdqa xmm6, xmm5
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
-
- pmullw xmm5, xmm2
- pmullw xmm6, xmm2
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP9_FILTER_SHIFT
-
- movdqa xmm7, xmm3
- packuswb xmm7, xmm4
-
- add rsi, rdx ; next line
-.next_row:
- movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movdqa xmm4, xmm3 ; make a copy of current line
-
- punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
- punpckhbw xmm4, xmm0
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm1
-
- movdqu xmm5, [rsi+1]
- movdqa xmm6, xmm5
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
-
- pmullw xmm5, xmm2
- pmullw xmm6, xmm2
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- movdqa xmm5, xmm7
- movdqa xmm6, xmm7
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
-
- pmullw xmm5, [rax]
- pmullw xmm6, [rax]
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP9_FILTER_SHIFT
-
- movdqa xmm7, xmm3
- packuswb xmm7, xmm4
-
- pmullw xmm3, [rax+16]
- pmullw xmm4, [rax+16]
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP9_FILTER_SHIFT
-
- packuswb xmm3, xmm4
- movdqa [rdi], xmm3 ; store the results in the destination
-
- add rsi, rdx ; next line
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(5) ;dst_pitch
-%else
- add rdi, r8
-%endif
-
- cmp rdi, rcx
- jne .next_row
-
- jmp .done
-
-.b16x16_sp_only:
- movsxd rax, dword ptr arg(3) ;yoffset
- shl rax, 5
- add rax, rcx ;VFilter
-
- mov rdi, arg(4) ;dst_ptr
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, dword ptr arg(5) ;dst_pitch
-
- movdqa xmm1, [rax]
- movdqa xmm2, [rax+16]
-
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
-
- pxor xmm0, xmm0
-
- ; get the first horizontal line done
- movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-
- add rsi, rax ; next line
-.next_row_spo:
- movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
-
- movdqa xmm5, xmm7
- movdqa xmm6, xmm7
-
- movdqa xmm4, xmm3 ; make a copy of current line
- movdqa xmm7, xmm3
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
- punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
- punpckhbw xmm4, xmm0
-
- pmullw xmm5, xmm1
- pmullw xmm6, xmm1
- pmullw xmm3, xmm2
- pmullw xmm4, xmm2
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP9_FILTER_SHIFT
-
- packuswb xmm3, xmm4
- movdqa [rdi], xmm3 ; store the results in the destination
-
- add rsi, rax ; next line
- add rdi, rdx ;dst_pitch
- cmp rdi, rcx
- jne .next_row_spo
-
- jmp .done
-
-.b16x16_fp_only:
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- pxor xmm0, xmm0
-
-.next_row_fpo:
- movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
- movdqa xmm4, xmm3 ; make a copy of current line
-
- punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06
- punpckhbw xmm4, xmm0
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm1
-
- movdqu xmm5, [rsi+1]
- movdqa xmm6, xmm5
-
- punpcklbw xmm5, xmm0
- punpckhbw xmm6, xmm0
-
- pmullw xmm5, xmm2
- pmullw xmm6, xmm2
-
- paddw xmm3, xmm5
- paddw xmm4, xmm6
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm4, VP9_FILTER_SHIFT
-
- packuswb xmm3, xmm4
- movdqa [rdi], xmm3 ; store the results in the destination
-
- add rsi, rax ; next line
- add rdi, rdx ; dst_pitch
- cmp rdi, rcx
- jne .next_row_fpo
-
-.done:
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_bilinear_predict8x8_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-extern sym(vp9_bilinear_filters_mmx)
-global sym(vp9_bilinear_predict8x8_sse2) PRIVATE
-sym(vp9_bilinear_predict8x8_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 144 ; reserve 144 bytes
-
- ;const short *HFilter = bilinear_filters_mmx[xoffset]
- ;const short *VFilter = bilinear_filters_mmx[yoffset]
- lea rcx, [GLOBAL(sym(vp9_bilinear_filters_mmx))]
-
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
-
- ;Read 9-line unaligned data in and put them on stack. This gives a big
- ;performance boost.
- movdqu xmm0, [rsi]
- lea rax, [rdx + rdx*2]
- movdqu xmm1, [rsi+rdx]
- movdqu xmm2, [rsi+rdx*2]
- add rsi, rax
- movdqu xmm3, [rsi]
- movdqu xmm4, [rsi+rdx]
- movdqu xmm5, [rsi+rdx*2]
- add rsi, rax
- movdqu xmm6, [rsi]
- movdqu xmm7, [rsi+rdx]
-
- movdqa XMMWORD PTR [rsp], xmm0
-
- movdqu xmm0, [rsi+rdx*2]
-
- movdqa XMMWORD PTR [rsp+16], xmm1
- movdqa XMMWORD PTR [rsp+32], xmm2
- movdqa XMMWORD PTR [rsp+48], xmm3
- movdqa XMMWORD PTR [rsp+64], xmm4
- movdqa XMMWORD PTR [rsp+80], xmm5
- movdqa XMMWORD PTR [rsp+96], xmm6
- movdqa XMMWORD PTR [rsp+112], xmm7
- movdqa XMMWORD PTR [rsp+128], xmm0
-
- movsxd rax, dword ptr arg(2) ;xoffset
- shl rax, 5
- add rax, rcx ;HFilter
-
- mov rdi, arg(4) ;dst_ptr
- movsxd rdx, dword ptr arg(5) ;dst_pitch
-
- movdqa xmm1, [rax]
- movdqa xmm2, [rax+16]
-
- movsxd rax, dword ptr arg(3) ;yoffset
- shl rax, 5
- add rax, rcx ;VFilter
-
- lea rcx, [rdi+rdx*8]
-
- movdqa xmm5, [rax]
- movdqa xmm6, [rax+16]
-
- pxor xmm0, xmm0
-
- ; get the first horizontal line done
- movdqa xmm3, XMMWORD PTR [rsp]
- movdqa xmm4, xmm3 ; make a copy of current line
- psrldq xmm4, 1
-
- punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
- punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm2
-
- paddw xmm3, xmm4
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- movdqa xmm7, xmm3
- add rsp, 16 ; next line
-.next_row8x8:
- movdqa xmm3, XMMWORD PTR [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
- movdqa xmm4, xmm3 ; make a copy of current line
- psrldq xmm4, 1
-
- punpcklbw xmm3, xmm0 ; 00 01 02 03 04 05 06 07
- punpcklbw xmm4, xmm0 ; 01 02 03 04 05 06 07 08
-
- pmullw xmm3, xmm1
- pmullw xmm4, xmm2
-
- paddw xmm3, xmm4
- pmullw xmm7, xmm5
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- movdqa xmm4, xmm3
-
- pmullw xmm3, xmm6
- paddw xmm3, xmm7
-
- movdqa xmm7, xmm4
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- packuswb xmm3, xmm0
- movq [rdi], xmm3 ; store the results in the destination
-
- add rsp, 16 ; next line
- add rdi, rdx
-
- cmp rdi, rcx
- jne .next_row8x8
-
- ;add rsp, 144
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-SECTION_RODATA
-align 16
-rd:
- times 8 dw 0x40
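
The bilinear predictors removed with this file were two chained 2-tap passes sharing the same 64/7 rounding; when xoffset or yoffset is zero the corresponding pass is skipped (the sp_only/fp_only branches above). A scalar sketch of the full two-pass case, assuming hf and vf are the HFilter/VFilter rows selected by the offsets (illustrative names, not from the tree):

#include <stdint.h>

/* One output pixel of the removed two-pass bilinear path: horizontal
 * 2-tap on two adjacent rows, then vertical 2-tap across the results,
 * each pass rounded by rd (64) and shifted by VP9_FILTER_SHIFT (7). */
static int bilin2(int a, int b, const int16_t f[2]) {
  return (a * f[0] + b * f[1] + 64) >> 7;
}

static uint8_t bilinear_pixel(const uint8_t *src, int stride,
                              const int16_t hf[2], const int16_t vf[2]) {
  int top = bilin2(src[0], src[1], hf);
  int bot = bilin2(src[stride], src[stride + 1], hf);
  int out = bilin2(top, bot, vf);
  return (uint8_t)(out < 0 ? 0 : out > 255 ? 255 : out);
}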
diff --git a/vp9/common/x86/vp9_subpixel_ssse3.asm b/vp9/common/x86/vp9_subpixel_ssse3.asm
deleted file mode 100644
index b260480e0..000000000
--- a/vp9/common/x86/vp9_subpixel_ssse3.asm
+++ /dev/null
@@ -1,1515 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define BLOCK_HEIGHT_WIDTH 4
-%define VP9_FILTER_WEIGHT 128
-%define VP9_FILTER_SHIFT 7
-
-
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;
-; This is an implementation of some of the SSE optimizations first seen in ffvp8
-;
-;*************************************************************************************/
-;void vp9_filter_block1d8_h6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; unsigned int vp9_filter_index
-;)
-global sym(vp9_filter_block1d8_h6_ssse3) PRIVATE
-sym(vp9_filter_block1d8_h6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4
-
- movdqa xmm7, [GLOBAL(rd)]
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
- mov rdi, arg(2) ;output_ptr
-
- cmp esi, DWORD PTR [rax]
- je vp9_filter_block1d8_h4_ssse3
-
- movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
-
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
- sub rdi, rdx
-;xmm3 free
-.filter_block1d8_h6_rowloop_ssse3:
- movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
-
- movq xmm2, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
-
- punpcklbw xmm0, xmm2 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
-
- movdqa xmm1, xmm0
- pmaddubsw xmm0, xmm4
-
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf2bfrom1)]
-
- pshufb xmm2, [GLOBAL(shuf3bfrom1)]
- pmaddubsw xmm1, xmm5
-
- lea rdi, [rdi + rdx]
- pmaddubsw xmm2, xmm6
-
- lea rsi, [rsi + rax]
- dec rcx
-
- paddsw xmm0, xmm1
- paddsw xmm2, xmm7
-
- paddsw xmm0, xmm2
-
- psraw xmm0, 7
-
- packuswb xmm0, xmm0
-
-    movq        MMWORD PTR [rdi], xmm0
- jnz .filter_block1d8_h6_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-vp9_filter_block1d8_h4_ssse3:
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- movdqa xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
- movdqa xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]
-
- mov rsi, arg(0) ;src_ptr
-
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
-
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
- sub rdi, rdx
-
-.filter_block1d8_h4_rowloop_ssse3:
- movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
-
- movq xmm1, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
-
- punpcklbw xmm0, xmm1 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
-
- movdqa xmm2, xmm0
- pshufb xmm0, xmm3
-
- pshufb xmm2, xmm4
- pmaddubsw xmm0, xmm5
-
- lea rdi, [rdi + rdx]
- pmaddubsw xmm2, xmm6
-
- lea rsi, [rsi + rax]
- dec rcx
-
- paddsw xmm0, xmm7
-
- paddsw xmm0, xmm2
-
- psraw xmm0, 7
-
- packuswb xmm0, xmm0
-
-    movq        MMWORD PTR [rdi], xmm0
-
- jnz .filter_block1d8_h4_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
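Both paths above lean on pmaddubsw, which multiplies eight pairs of an unsigned source byte and a signed coefficient byte and sums each pair into a signed word; pairing the taps as (k0,k5), (k1,k3) and (k2,k4) is what the shuffle masks and the three coefficient tables at the end of this file arrange. When a filter's outer taps are zero (the cmp of esi against the k0_k5 row), the shorter h4 path drops that pair entirely. A scalar C model of the 6-tap horizontal pass, under the assumption that taps[] holds one reconstructed 128-weight filter (names hypothetical):

    /* Scalar model of vp9_filter_block1d8_h6_ssse3: a 6-tap window starting
     * two pixels left of the output position, rounded and shifted by 7. */
    static void filter_h6_c(const unsigned char *src, int src_stride,
                            unsigned char *dst, int dst_stride,
                            int width, int height, const int taps[6]) {
      for (int r = 0; r < height; ++r) {
        for (int c = 0; c < width; ++c) {
          int sum = 64;                          /* [GLOBAL(rd)] */
          for (int t = 0; t < 6; ++t)
            sum += src[c + t - 2] * taps[t];     /* window at src - 2 */
          sum >>= 7;                             /* VP9_FILTER_SHIFT */
          if (sum < 0) sum = 0;                  /* packuswb saturation */
          if (sum > 255) sum = 255;
          dst[c] = (unsigned char)sum;
        }
        src += src_stride;
        dst += dst_stride;
      }
    }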
-;void vp9_filter_block1d16_h6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; unsigned int vp9_filter_index
-;)
-global sym(vp9_filter_block1d16_h6_ssse3) PRIVATE
-sym(vp9_filter_block1d16_h6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
-
- mov rdi, arg(2) ;output_ptr
-
- mov rsi, arg(0) ;src_ptr
-
- movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
-.filter_block1d16_h6_rowloop_ssse3:
- movq xmm0, MMWORD PTR [rsi - 2] ; -2 -1 0 1 2 3 4 5
-
- movq xmm3, MMWORD PTR [rsi + 3] ; 3 4 5 6 7 8 9 10
-
- punpcklbw xmm0, xmm3 ; -2 3 -1 4 0 5 1 6 2 7 3 8 4 9 5 10
-
- movdqa xmm1, xmm0
- pmaddubsw xmm0, xmm4
-
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf2bfrom1)]
-
- pshufb xmm2, [GLOBAL(shuf3bfrom1)]
- movq xmm3, MMWORD PTR [rsi + 6]
-
- pmaddubsw xmm1, xmm5
- movq xmm7, MMWORD PTR [rsi + 11]
-
- pmaddubsw xmm2, xmm6
- punpcklbw xmm3, xmm7
-
- paddsw xmm0, xmm1
- movdqa xmm1, xmm3
-
- pmaddubsw xmm3, xmm4
- paddsw xmm0, xmm2
-
- movdqa xmm2, xmm1
- paddsw xmm0, [GLOBAL(rd)]
-
- pshufb xmm1, [GLOBAL(shuf2bfrom1)]
- pshufb xmm2, [GLOBAL(shuf3bfrom1)]
-
- psraw xmm0, 7
- pmaddubsw xmm1, xmm5
-
- pmaddubsw xmm2, xmm6
- packuswb xmm0, xmm0
-
- lea rsi, [rsi + rax]
- paddsw xmm3, xmm1
-
- paddsw xmm3, xmm2
-
- paddsw xmm3, [GLOBAL(rd)]
-
- psraw xmm3, 7
-
- packuswb xmm3, xmm3
-
- punpcklqdq xmm0, xmm3
-
-    movdqa      XMMWORD PTR [rdi], xmm0
-
- lea rdi, [rdi + rdx]
- dec rcx
- jnz .filter_block1d16_h6_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_filter_block1d4_h6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pixels_per_line,
-; unsigned char *output_ptr,
-; unsigned int output_pitch,
-; unsigned int output_height,
-; unsigned int vp9_filter_index
-;)
-global sym(vp9_filter_block1d4_h6_ssse3) PRIVATE
-sym(vp9_filter_block1d4_h6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
- movdqa xmm7, [GLOBAL(rd)]
-
- cmp esi, DWORD PTR [rax]
- je .vp9_filter_block1d4_h4_ssse3
-
- movdqa xmm4, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
-
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
-;xmm3 free
-.filter_block1d4_h6_rowloop_ssse3:
- movdqu xmm0, XMMWORD PTR [rsi - 2]
-
- movdqa xmm1, xmm0
- pshufb xmm0, [GLOBAL(shuf1b)]
-
- movdqa xmm2, xmm1
- pshufb xmm1, [GLOBAL(shuf2b)]
- pmaddubsw xmm0, xmm4
- pshufb xmm2, [GLOBAL(shuf3b)]
- pmaddubsw xmm1, xmm5
-
-;--
- pmaddubsw xmm2, xmm6
-
- lea rsi, [rsi + rax]
-;--
- paddsw xmm0, xmm1
- paddsw xmm0, xmm7
- pxor xmm1, xmm1
- paddsw xmm0, xmm2
- psraw xmm0, 7
- packuswb xmm0, xmm0
-
- movd DWORD PTR [rdi], xmm0
-
- add rdi, rdx
- dec rcx
- jnz .filter_block1d4_h6_rowloop_ssse3
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop rbp
-    ret
-
-.vp9_filter_block1d4_h4_ssse3:
- movdqa xmm5, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm6, XMMWORD PTR [rax+128] ;k1_k3
- movdqa xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
- movdqa xmm3, XMMWORD PTR [GLOBAL(shuf3b)]
-
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;output_ptr
- movsxd rax, dword ptr arg(1) ;src_pixels_per_line
- movsxd rcx, dword ptr arg(4) ;output_height
-
- movsxd rdx, dword ptr arg(3) ;output_pitch
-
-.filter_block1d4_h4_rowloop_ssse3:
- movdqu xmm1, XMMWORD PTR [rsi - 2]
-
- movdqa xmm2, xmm1
- pshufb xmm1, xmm0 ;;[GLOBAL(shuf2b)]
- pshufb xmm2, xmm3 ;;[GLOBAL(shuf3b)]
- pmaddubsw xmm1, xmm5
-
-;--
- pmaddubsw xmm2, xmm6
-
- lea rsi, [rsi + rax]
-;--
- paddsw xmm1, xmm7
- paddsw xmm1, xmm2
- psraw xmm1, 7
- packuswb xmm1, xmm1
-
- movd DWORD PTR [rdi], xmm1
-
- add rdi, rdx
- dec rcx
- jnz .filter_block1d4_h4_rowloop_ssse3
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-;void vp9_filter_block1d16_v6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; unsigned int vp9_filter_index
-;)
-global sym(vp9_filter_block1d16_v6_ssse3) PRIVATE
-sym(vp9_filter_block1d16_v6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
-
- cmp esi, DWORD PTR [rax]
- je .vp9_filter_block1d16_v4_ssse3
-
- movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
- mov rdi, arg(2) ;output_ptr
-
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ;out_pitch
-%endif
- mov rax, rsi
- movsxd rcx, DWORD PTR arg(4) ;output_height
- add rax, rdx
-
-
-.vp9_filter_block1d16_v6_ssse3_loop:
- movq xmm1, MMWORD PTR [rsi] ;A
- movq xmm2, MMWORD PTR [rsi + rdx] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
-
- pmaddubsw xmm3, xmm6
- punpcklbw xmm1, xmm0 ;A F
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm5
-
- paddsw xmm2, xmm3
- paddsw xmm2, xmm1
- paddsw xmm2, [GLOBAL(rd)]
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- movq MMWORD PTR [rdi], xmm2 ;store the results
-
- movq xmm1, MMWORD PTR [rsi + 8] ;A
- movq xmm2, MMWORD PTR [rsi + rdx + 8] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- movq xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F
- pmaddubsw xmm3, xmm6
- punpcklbw xmm1, xmm0 ;A F
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm5
-
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw xmm2, xmm3
- paddsw xmm2, xmm1
- paddsw xmm2, [GLOBAL(rd)]
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- movq MMWORD PTR [rdi+8], xmm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;out_pitch
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp9_filter_block1d16_v6_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-.vp9_filter_block1d16_v4_ssse3:
- movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
- mov rdi, arg(2) ;output_ptr
-
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ;out_pitch
-%endif
- mov rax, rsi
- movsxd rcx, DWORD PTR arg(4) ;output_height
- add rax, rdx
-
-.vp9_filter_block1d16_v4_ssse3_loop:
- movq xmm2, MMWORD PTR [rsi + rdx] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- pmaddubsw xmm3, xmm6
- pmaddubsw xmm2, xmm7
- movq xmm5, MMWORD PTR [rsi + rdx + 8] ;B
- movq xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E
-
- paddsw xmm2, [GLOBAL(rd)]
- paddsw xmm2, xmm3
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- punpcklbw xmm5, xmm4 ;B D
- punpcklbw xmm1, xmm0 ;C E
-
- pmaddubsw xmm1, xmm6
- pmaddubsw xmm5, xmm7
-
- movdqa xmm4, [GLOBAL(rd)]
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw xmm5, xmm1
- paddsw xmm5, xmm4
- psraw xmm5, 7
- packuswb xmm5, xmm5
-
- punpcklqdq xmm2, xmm5
-
- movdqa XMMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;out_pitch
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp9_filter_block1d16_v4_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
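The vertical kernels read six source rows A..F (rax tracks rsi plus one pitch, so [rax + rdx*4] is row F) and interleave them with punpcklbw as (B,D), (C,E) and (A,F), so that one pmaddubsw per pair produces B*k1 + D*k3, C*k2 + E*k4 and A*k0 + F*k5. A scalar sketch of one output pixel, with the same hedges as the horizontal model earlier:

    /* Scalar model of the vertical 6-tap pairing used above: row 0 is A,
     * row 5 is F, 'pitch' is the source stride in bytes. */
    static unsigned char filter_v6_pixel(const unsigned char *src, int pitch,
                                         int c, const int taps[6]) {
      int sum = 64;                               /* rd rounding */
      for (int t = 0; t < 6; ++t)
        sum += src[c + t * pitch] * taps[t];
      sum >>= 7;                                  /* VP9_FILTER_SHIFT */
      if (sum < 0) sum = 0;
      if (sum > 255) sum = 255;
      return (unsigned char)sum;
    }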
-;void vp9_filter_block1d8_v6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; unsigned int vp9_filter_index
-;)
-global sym(vp9_filter_block1d8_v6_ssse3) PRIVATE
-sym(vp9_filter_block1d8_v6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
-
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
- mov rdi, arg(2) ;output_ptr
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ; out_pitch
-%endif
- movsxd rcx, DWORD PTR arg(4) ;[output_height]
-
- cmp esi, DWORD PTR [rax]
- je .vp9_filter_block1d8_v4_ssse3
-
- movdqa xmm5, XMMWORD PTR [rax] ;k0_k5
- movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
-
- mov rax, rsi
- add rax, rdx
-
-.vp9_filter_block1d8_v6_ssse3_loop:
- movq xmm1, MMWORD PTR [rsi] ;A
- movq xmm2, MMWORD PTR [rsi + rdx] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- movq xmm0, MMWORD PTR [rax + rdx * 4] ;F
- movdqa xmm4, [GLOBAL(rd)]
-
- pmaddubsw xmm3, xmm6
- punpcklbw xmm1, xmm0 ;A F
- pmaddubsw xmm2, xmm7
- pmaddubsw xmm1, xmm5
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw xmm2, xmm3
- paddsw xmm2, xmm1
- paddsw xmm2, xmm4
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- movq MMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[out_pitch]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp9_filter_block1d8_v6_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-.vp9_filter_block1d8_v4_ssse3:
- movdqa xmm6, XMMWORD PTR [rax+256] ;k2_k4
- movdqa xmm7, XMMWORD PTR [rax+128] ;k1_k3
- movdqa xmm5, [GLOBAL(rd)]
-
- mov rsi, arg(0) ;src_ptr
-
- mov rax, rsi
- add rax, rdx
-
-.vp9_filter_block1d8_v4_ssse3_loop:
- movq xmm2, MMWORD PTR [rsi + rdx] ;B
- movq xmm3, MMWORD PTR [rsi + rdx * 2] ;C
- movq xmm4, MMWORD PTR [rax + rdx * 2] ;D
- movq xmm0, MMWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw xmm2, xmm4 ;B D
- punpcklbw xmm3, xmm0 ;C E
-
- pmaddubsw xmm3, xmm6
- pmaddubsw xmm2, xmm7
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw xmm2, xmm3
- paddsw xmm2, xmm5
- psraw xmm2, 7
- packuswb xmm2, xmm2
-
- movq MMWORD PTR [rdi], xmm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[out_pitch]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp9_filter_block1d8_v4_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-;void vp9_filter_block1d4_v6_ssse3
-;(
-; unsigned char *src_ptr,
-; unsigned int src_pitch,
-; unsigned char *output_ptr,
-; unsigned int out_pitch,
-; unsigned int output_height,
-; unsigned int vp9_filter_index
-;)
-global sym(vp9_filter_block1d4_v6_ssse3) PRIVATE
-sym(vp9_filter_block1d4_v6_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- movsxd rdx, DWORD PTR arg(5) ;table index
- xor rsi, rsi
- shl rdx, 4 ;
-
- lea rax, [GLOBAL(k0_k5)]
- add rax, rdx
-
- movsxd rdx, DWORD PTR arg(1) ;pixels_per_line
- mov rdi, arg(2) ;output_ptr
-%if ABI_IS_32BIT=0
- movsxd r8, DWORD PTR arg(3) ; out_pitch
-%endif
- movsxd rcx, DWORD PTR arg(4) ;[output_height]
-
- cmp esi, DWORD PTR [rax]
- je .vp9_filter_block1d4_v4_ssse3
-
- movq mm5, MMWORD PTR [rax] ;k0_k5
- movq mm6, MMWORD PTR [rax+256] ;k2_k4
- movq mm7, MMWORD PTR [rax+128] ;k1_k3
-
- mov rsi, arg(0) ;src_ptr
-
- mov rax, rsi
- add rax, rdx
-
-.vp9_filter_block1d4_v6_ssse3_loop:
- movd mm1, DWORD PTR [rsi] ;A
- movd mm2, DWORD PTR [rsi + rdx] ;B
- movd mm3, DWORD PTR [rsi + rdx * 2] ;C
- movd mm4, DWORD PTR [rax + rdx * 2] ;D
- movd mm0, DWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw mm2, mm4 ;B D
- punpcklbw mm3, mm0 ;C E
-
- movd mm0, DWORD PTR [rax + rdx * 4] ;F
-
- movq mm4, [GLOBAL(rd)]
-
- pmaddubsw mm3, mm6
- punpcklbw mm1, mm0 ;A F
- pmaddubsw mm2, mm7
- pmaddubsw mm1, mm5
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw mm2, mm3
- paddsw mm2, mm1
- paddsw mm2, mm4
- psraw mm2, 7
- packuswb mm2, mm2
-
- movd DWORD PTR [rdi], mm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[out_pitch]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp9_filter_block1d4_v6_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-.vp9_filter_block1d4_v4_ssse3:
- movq mm6, MMWORD PTR [rax+256] ;k2_k4
- movq mm7, MMWORD PTR [rax+128] ;k1_k3
- movq mm5, MMWORD PTR [GLOBAL(rd)]
-
- mov rsi, arg(0) ;src_ptr
-
- mov rax, rsi
- add rax, rdx
-
-.vp9_filter_block1d4_v4_ssse3_loop:
- movd mm2, DWORD PTR [rsi + rdx] ;B
- movd mm3, DWORD PTR [rsi + rdx * 2] ;C
- movd mm4, DWORD PTR [rax + rdx * 2] ;D
- movd mm0, DWORD PTR [rsi + rdx * 4] ;E
-
- punpcklbw mm2, mm4 ;B D
- punpcklbw mm3, mm0 ;C E
-
- pmaddubsw mm3, mm6
- pmaddubsw mm2, mm7
- add rsi, rdx
- add rax, rdx
-;--
-;--
- paddsw mm2, mm3
- paddsw mm2, mm5
- psraw mm2, 7
- packuswb mm2, mm2
-
- movd DWORD PTR [rdi], mm2
-
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(3) ;[out_pitch]
-%else
- add rdi, r8
-%endif
- dec rcx
- jnz .vp9_filter_block1d4_v4_ssse3_loop
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
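Unlike its 8- and 16-wide siblings, the 4-wide vertical routine works in the 64-bit mm registers, so its prolog never issues SAVE_XMM and its epilogs correctly omit RESTORE_XMM: on Win64 only xmm6 and upward are callee-saved, and they are never touched here.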
-;void vp9_bilinear_predict16x16_ssse3
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-global sym(vp9_bilinear_predict16x16_ssse3) PRIVATE
-sym(vp9_bilinear_predict16x16_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- lea rcx, [GLOBAL(bilinear_filters_ssse3)]
- movsxd rax, dword ptr arg(2) ; xoffset
-
- cmp rax, 0 ; skip first_pass filter if xoffset=0
- je .b16x16_sp_only
-
- shl rax, 4
- lea rax, [rax + rcx] ; HFilter
-
- mov rdi, arg(4) ; dst_ptr
- mov rsi, arg(0) ; src_ptr
- movsxd rdx, dword ptr arg(5) ; dst_pitch
-
- movdqa xmm1, [rax]
-
- movsxd rax, dword ptr arg(3) ; yoffset
-
- cmp rax, 0 ; skip second_pass filter if yoffset=0
- je .b16x16_fp_only
-
- shl rax, 4
- lea rax, [rax + rcx] ; VFilter
-
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rdx, dword ptr arg(1) ; src_pixels_per_line
-
- movdqa xmm2, [rax]
-
-%if ABI_IS_32BIT=0
- movsxd r8, dword ptr arg(5) ; dst_pitch
-%endif
- movq xmm3, [rsi] ; 00 01 02 03 04 05 06 07
- movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
-
- punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
- movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
-
- movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
-
- lea rsi, [rsi + rdx] ; next line
-
- pmaddubsw xmm3, xmm1 ; 00 02 04 06 08 10 12 14
-
- punpcklbw xmm4, xmm5 ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
- pmaddubsw xmm4, xmm1 ; 01 03 05 07 09 11 13 15
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
- psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128
-
- movdqa xmm7, xmm3
- packuswb xmm7, xmm4 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-.next_row:
- movq xmm6, [rsi] ; 00 01 02 03 04 05 06 07
- movq xmm5, [rsi+1] ; 01 02 03 04 05 06 07 08
-
- punpcklbw xmm6, xmm5
- movq xmm4, [rsi+8] ; 08 09 10 11 12 13 14 15
-
- movq xmm5, [rsi+9] ; 09 10 11 12 13 14 15 16
- lea rsi, [rsi + rdx] ; next line
-
- pmaddubsw xmm6, xmm1
-
- punpcklbw xmm4, xmm5
- pmaddubsw xmm4, xmm1
-
- paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
- psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128
-
- paddw xmm4, [GLOBAL(rd)] ; xmm4 += round value
- psraw xmm4, VP9_FILTER_SHIFT ; xmm4 /= 128
-
- packuswb xmm6, xmm4
- movdqa xmm5, xmm7
-
- punpcklbw xmm5, xmm6
- pmaddubsw xmm5, xmm2
-
- punpckhbw xmm7, xmm6
- pmaddubsw xmm7, xmm2
-
- paddw xmm5, [GLOBAL(rd)] ; xmm5 += round value
- psraw xmm5, VP9_FILTER_SHIFT ; xmm5 /= 128
-
- paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
- psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128
-
- packuswb xmm5, xmm7
- movdqa xmm7, xmm6
-
- movdqa [rdi], xmm5 ; store the results in the destination
-%if ABI_IS_32BIT
- add rdi, DWORD PTR arg(5) ; dst_pitch
-%else
- add rdi, r8
-%endif
-
- cmp rdi, rcx
- jne .next_row
-
- jmp .done
-
-.b16x16_sp_only:
- movsxd rax, dword ptr arg(3) ; yoffset
- shl rax, 4
- lea rax, [rax + rcx] ; VFilter
-
- mov rdi, arg(4) ; dst_ptr
- mov rsi, arg(0) ; src_ptr
- movsxd rdx, dword ptr arg(5) ; dst_pitch
-
- movdqa xmm1, [rax] ; VFilter
-
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rax, dword ptr arg(1) ; src_pixels_per_line
-
- ; get the first horizontal line done
- movq xmm4, [rsi] ; load row 0
- movq xmm2, [rsi + 8] ; load row 0
-
- lea rsi, [rsi + rax] ; next line
-.next_row_sp:
- movq xmm3, [rsi] ; load row + 1
- movq xmm5, [rsi + 8] ; load row + 1
-
- punpcklbw xmm4, xmm3
- punpcklbw xmm2, xmm5
-
- pmaddubsw xmm4, xmm1
- movq xmm7, [rsi + rax] ; load row + 2
-
- pmaddubsw xmm2, xmm1
- movq xmm6, [rsi + rax + 8] ; load row + 2
-
- punpcklbw xmm3, xmm7
- punpcklbw xmm5, xmm6
-
- pmaddubsw xmm3, xmm1
- paddw xmm4, [GLOBAL(rd)]
-
- pmaddubsw xmm5, xmm1
- paddw xmm2, [GLOBAL(rd)]
-
- psraw xmm4, VP9_FILTER_SHIFT
- psraw xmm2, VP9_FILTER_SHIFT
-
- packuswb xmm4, xmm2
- paddw xmm3, [GLOBAL(rd)]
-
- movdqa [rdi], xmm4 ; store row 0
- paddw xmm5, [GLOBAL(rd)]
-
- psraw xmm3, VP9_FILTER_SHIFT
- psraw xmm5, VP9_FILTER_SHIFT
-
- packuswb xmm3, xmm5
- movdqa xmm4, xmm7
-
- movdqa [rdi + rdx],xmm3 ; store row 1
- lea rsi, [rsi + 2*rax]
-
- movdqa xmm2, xmm6
- lea rdi, [rdi + 2*rdx]
-
- cmp rdi, rcx
- jne .next_row_sp
-
- jmp .done
-
-.b16x16_fp_only:
- lea rcx, [rdi+rdx*8]
- lea rcx, [rcx+rdx*8]
- movsxd rax, dword ptr arg(1) ; src_pixels_per_line
-
-.next_row_fp:
- movq xmm2, [rsi] ; 00 01 02 03 04 05 06 07
- movq xmm4, [rsi+1] ; 01 02 03 04 05 06 07 08
-
- punpcklbw xmm2, xmm4
- movq xmm3, [rsi+8] ; 08 09 10 11 12 13 14 15
-
- pmaddubsw xmm2, xmm1
- movq xmm4, [rsi+9] ; 09 10 11 12 13 14 15 16
-
- lea rsi, [rsi + rax] ; next line
- punpcklbw xmm3, xmm4
-
- pmaddubsw xmm3, xmm1
- movq xmm5, [rsi]
-
- paddw xmm2, [GLOBAL(rd)]
- movq xmm7, [rsi+1]
-
- movq xmm6, [rsi+8]
- psraw xmm2, VP9_FILTER_SHIFT
-
- punpcklbw xmm5, xmm7
- movq xmm7, [rsi+9]
-
- paddw xmm3, [GLOBAL(rd)]
- pmaddubsw xmm5, xmm1
-
- psraw xmm3, VP9_FILTER_SHIFT
- punpcklbw xmm6, xmm7
-
- packuswb xmm2, xmm3
- pmaddubsw xmm6, xmm1
-
- movdqa [rdi], xmm2 ; store the results in the destination
- paddw xmm5, [GLOBAL(rd)]
-
- lea rdi, [rdi + rdx] ; dst_pitch
- psraw xmm5, VP9_FILTER_SHIFT
-
- paddw xmm6, [GLOBAL(rd)]
- psraw xmm6, VP9_FILTER_SHIFT
-
- packuswb xmm5, xmm6
- lea rsi, [rsi + rax] ; next line
-
- movdqa [rdi], xmm5 ; store the results in the destination
- lea rdi, [rdi + rdx] ; dst_pitch
-
- cmp rdi, rcx
-
- jne .next_row_fp
-
-.done:
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
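The 16x16 bilinear predictor dispatches on the two subpel offsets: xoffset == 0 takes the vertical-only .b16x16_sp_only path, yoffset == 0 the horizontal-only .b16x16_fp_only path, and the general case fuses the two passes row by row, carrying the previous filtered row in xmm7 so each source row is filtered only once. A C sketch of that structure, using a scratch buffer instead of the rolling register (the 0..15 offset range and the (128 - 8*i, 8*i) pairs come from the bilinear_filters_ssse3 table below; the code is a scalar model, not the shipped implementation):

    static void bilinear_predict_c(const unsigned char *src, int src_pitch,
                                   int xoffset, int yoffset,  /* 0..15 */
                                   unsigned char *dst, int dst_pitch,
                                   int size) {                /* 8 or 16 */
      const int hf1 = 8 * xoffset, hf0 = 128 - hf1;  /* HFilter byte pair */
      const int vf1 = 8 * yoffset, vf0 = 128 - vf1;  /* VFilter byte pair */
      unsigned char tmp[17 * 16];
      const int rows = size + (yoffset != 0);  /* extra row for 2nd pass */
      for (int r = 0; r < rows; ++r)           /* first pass: horizontal */
        for (int c = 0; c < size; ++c)
          tmp[r * 16 + c] = xoffset
              ? (unsigned char)((src[r * src_pitch + c] * hf0 +
                                 src[r * src_pitch + c + 1] * hf1 + 64) >> 7)
              : src[r * src_pitch + c];
      for (int r = 0; r < size; ++r)           /* second pass: vertical */
        for (int c = 0; c < size; ++c)
          dst[r * dst_pitch + c] = yoffset
              ? (unsigned char)((tmp[r * 16 + c] * vf0 +
                                 tmp[(r + 1) * 16 + c] * vf1 + 64) >> 7)
              : tmp[r * 16 + c];
    }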
-;void vp9_bilinear_predict8x8_ssse3
-;(
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; int xoffset,
-; int yoffset,
-; unsigned char *dst_ptr,
-; int dst_pitch
-;)
-global sym(vp9_bilinear_predict8x8_ssse3) PRIVATE
-sym(vp9_bilinear_predict8x8_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 144 ; reserve 144 bytes
-
- lea rcx, [GLOBAL(bilinear_filters_ssse3)]
-
- mov rsi, arg(0) ;src_ptr
- movsxd rdx, dword ptr arg(1) ;src_pixels_per_line
-
-    ;Read in 9 lines of unaligned data and put them on the stack. This gives a big
-    ;performance boost.
- movdqu xmm0, [rsi]
- lea rax, [rdx + rdx*2]
- movdqu xmm1, [rsi+rdx]
- movdqu xmm2, [rsi+rdx*2]
- add rsi, rax
- movdqu xmm3, [rsi]
- movdqu xmm4, [rsi+rdx]
- movdqu xmm5, [rsi+rdx*2]
- add rsi, rax
- movdqu xmm6, [rsi]
- movdqu xmm7, [rsi+rdx]
-
- movdqa XMMWORD PTR [rsp], xmm0
-
- movdqu xmm0, [rsi+rdx*2]
-
- movdqa XMMWORD PTR [rsp+16], xmm1
- movdqa XMMWORD PTR [rsp+32], xmm2
- movdqa XMMWORD PTR [rsp+48], xmm3
- movdqa XMMWORD PTR [rsp+64], xmm4
- movdqa XMMWORD PTR [rsp+80], xmm5
- movdqa XMMWORD PTR [rsp+96], xmm6
- movdqa XMMWORD PTR [rsp+112], xmm7
- movdqa XMMWORD PTR [rsp+128], xmm0
-
- movsxd rax, dword ptr arg(2) ; xoffset
- cmp rax, 0 ; skip first_pass filter if xoffset=0
- je .b8x8_sp_only
-
- shl rax, 4
- add rax, rcx ; HFilter
-
- mov rdi, arg(4) ; dst_ptr
- movsxd rdx, dword ptr arg(5) ; dst_pitch
-
- movdqa xmm0, [rax]
-
- movsxd rax, dword ptr arg(3) ; yoffset
- cmp rax, 0 ; skip second_pass filter if yoffset=0
- je .b8x8_fp_only
-
- shl rax, 4
- lea rax, [rax + rcx] ; VFilter
-
- lea rcx, [rdi+rdx*8]
-
- movdqa xmm1, [rax]
-
- ; get the first horizontal line done
- movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
- movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx
-
- psrldq xmm5, 1
- lea rsp, [rsp + 16] ; next line
-
- punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
- pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14
-
- paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value
- psraw xmm3, VP9_FILTER_SHIFT ; xmm3 /= 128
-
- movdqa xmm7, xmm3
- packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
-
-.next_row:
- movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15
- lea rsp, [rsp + 16] ; next line
-
- movdqa xmm5, xmm6
-
- psrldq xmm5, 1
-
- punpcklbw xmm6, xmm5
- pmaddubsw xmm6, xmm0
-
- paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value
- psraw xmm6, VP9_FILTER_SHIFT ; xmm6 /= 128
-
- packuswb xmm6, xmm6
-
- punpcklbw xmm7, xmm6
- pmaddubsw xmm7, xmm1
-
- paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value
- psraw xmm7, VP9_FILTER_SHIFT ; xmm7 /= 128
-
- packuswb xmm7, xmm7
-
- movq [rdi], xmm7 ; store the results in the destination
- lea rdi, [rdi + rdx]
-
- movdqa xmm7, xmm6
-
- cmp rdi, rcx
- jne .next_row
-
- jmp .done8x8
-
-.b8x8_sp_only:
- movsxd rax, dword ptr arg(3) ; yoffset
- shl rax, 4
- lea rax, [rax + rcx] ; VFilter
-
- mov rdi, arg(4) ;dst_ptr
- movsxd rdx, dword ptr arg(5) ; dst_pitch
-
- movdqa xmm0, [rax] ; VFilter
-
- movq xmm1, XMMWORD PTR [rsp]
- movq xmm2, XMMWORD PTR [rsp+16]
-
- movq xmm3, XMMWORD PTR [rsp+32]
- punpcklbw xmm1, xmm2
-
- movq xmm4, XMMWORD PTR [rsp+48]
- punpcklbw xmm2, xmm3
-
- movq xmm5, XMMWORD PTR [rsp+64]
- punpcklbw xmm3, xmm4
-
- movq xmm6, XMMWORD PTR [rsp+80]
- punpcklbw xmm4, xmm5
-
- movq xmm7, XMMWORD PTR [rsp+96]
- punpcklbw xmm5, xmm6
-
- pmaddubsw xmm1, xmm0
- pmaddubsw xmm2, xmm0
-
- pmaddubsw xmm3, xmm0
- pmaddubsw xmm4, xmm0
-
- pmaddubsw xmm5, xmm0
- punpcklbw xmm6, xmm7
-
- pmaddubsw xmm6, xmm0
- paddw xmm1, [GLOBAL(rd)]
-
- paddw xmm2, [GLOBAL(rd)]
- psraw xmm1, VP9_FILTER_SHIFT
-
- paddw xmm3, [GLOBAL(rd)]
- psraw xmm2, VP9_FILTER_SHIFT
-
- paddw xmm4, [GLOBAL(rd)]
- psraw xmm3, VP9_FILTER_SHIFT
-
- paddw xmm5, [GLOBAL(rd)]
- psraw xmm4, VP9_FILTER_SHIFT
-
- paddw xmm6, [GLOBAL(rd)]
- psraw xmm5, VP9_FILTER_SHIFT
-
- psraw xmm6, VP9_FILTER_SHIFT
- packuswb xmm1, xmm1
-
- packuswb xmm2, xmm2
- movq [rdi], xmm1
-
- packuswb xmm3, xmm3
- movq [rdi+rdx], xmm2
-
- packuswb xmm4, xmm4
- movq xmm1, XMMWORD PTR [rsp+112]
-
- lea rdi, [rdi + 2*rdx]
- movq xmm2, XMMWORD PTR [rsp+128]
-
- packuswb xmm5, xmm5
- movq [rdi], xmm3
-
- packuswb xmm6, xmm6
- movq [rdi+rdx], xmm4
-
- lea rdi, [rdi + 2*rdx]
- punpcklbw xmm7, xmm1
-
- movq [rdi], xmm5
- pmaddubsw xmm7, xmm0
-
- movq [rdi+rdx], xmm6
- punpcklbw xmm1, xmm2
-
- pmaddubsw xmm1, xmm0
- paddw xmm7, [GLOBAL(rd)]
-
- psraw xmm7, VP9_FILTER_SHIFT
- paddw xmm1, [GLOBAL(rd)]
-
- psraw xmm1, VP9_FILTER_SHIFT
- packuswb xmm7, xmm7
-
- packuswb xmm1, xmm1
- lea rdi, [rdi + 2*rdx]
-
- movq [rdi], xmm7
-
- movq [rdi+rdx], xmm1
- lea rsp, [rsp + 144]
-
- jmp .done8x8
-
-.b8x8_fp_only:
- lea rcx, [rdi+rdx*8]
-
-.next_row_fp:
- movdqa xmm1, XMMWORD PTR [rsp]
- movdqa xmm3, XMMWORD PTR [rsp+16]
-
- movdqa xmm2, xmm1
- movdqa xmm5, XMMWORD PTR [rsp+32]
-
- psrldq xmm2, 1
- movdqa xmm7, XMMWORD PTR [rsp+48]
-
- movdqa xmm4, xmm3
- psrldq xmm4, 1
-
- movdqa xmm6, xmm5
- psrldq xmm6, 1
-
- punpcklbw xmm1, xmm2
- pmaddubsw xmm1, xmm0
-
- punpcklbw xmm3, xmm4
- pmaddubsw xmm3, xmm0
-
- punpcklbw xmm5, xmm6
- pmaddubsw xmm5, xmm0
-
- movdqa xmm2, xmm7
- psrldq xmm2, 1
-
- punpcklbw xmm7, xmm2
- pmaddubsw xmm7, xmm0
-
- paddw xmm1, [GLOBAL(rd)]
- psraw xmm1, VP9_FILTER_SHIFT
-
- paddw xmm3, [GLOBAL(rd)]
- psraw xmm3, VP9_FILTER_SHIFT
-
- paddw xmm5, [GLOBAL(rd)]
- psraw xmm5, VP9_FILTER_SHIFT
-
- paddw xmm7, [GLOBAL(rd)]
- psraw xmm7, VP9_FILTER_SHIFT
-
- packuswb xmm1, xmm1
- packuswb xmm3, xmm3
-
- packuswb xmm5, xmm5
- movq [rdi], xmm1
-
- packuswb xmm7, xmm7
- movq [rdi+rdx], xmm3
-
- lea rdi, [rdi + 2*rdx]
- movq [rdi], xmm5
-
- lea rsp, [rsp + 4*16]
- movq [rdi+rdx], xmm7
-
- lea rdi, [rdi + 2*rdx]
- cmp rdi, rcx
-
- jne .next_row_fp
-
- lea rsp, [rsp + 16]
-
-.done8x8:
- ;add rsp, 144
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
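Two details of the 8x8 epilogue above are easy to misread: ALIGN_STACK pushed the original rsp before the 144-byte reservation, so once the row pointer has walked back up by the full 144 bytes the single pop rsp restores the caller's stack pointer in one step, and the commented-out add rsp, 144 is the superseded cleanup it replaced. The staging idea itself is simple; a hedged sketch, with memcpy standing in for the movdqu/movdqa pairs:

    #include <string.h>

    /* Stage nine unaligned source rows into an aligned scratch area, as the
     * 8x8 predictor above does, so the filter loops use aligned loads only. */
    static void stage_rows(const unsigned char *src, int src_pitch,
                           unsigned char scratch[9][16]) {
      for (int r = 0; r < 9; ++r)
        memcpy(scratch[r], src + r * src_pitch, 16);
    }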
-SECTION_RODATA
-align 16
-shuf1b:
- db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
-shuf2b:
- db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11
-shuf3b:
- db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10
-
-align 16
-shuf2bfrom1:
-    db  4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7, 11, 9, 13
-align 16
-shuf3bfrom1:
-    db  2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7, 11
-
-align 16
-rd:
- times 8 dw 0x40
-
-align 16
-k0_k5:
- times 8 db 0, 0 ;placeholder
- times 8 db 0, 0
- times 8 db 2, 1
- times 8 db 0, 0
- times 8 db 3, 3
- times 8 db 0, 0
- times 8 db 1, 2
- times 8 db 0, 0
-k1_k3:
- times 8 db 0, 0 ;placeholder
- times 8 db -6, 12
- times 8 db -11, 36
- times 8 db -9, 50
- times 8 db -16, 77
- times 8 db -6, 93
- times 8 db -8, 108
- times 8 db -1, 123
-k2_k4:
- times 8 db 128, 0 ;placeholder
- times 8 db 123, -1
- times 8 db 108, -8
- times 8 db 93, -6
- times 8 db 77, -16
- times 8 db 50, -9
- times 8 db 36, -11
- times 8 db 12, -6
-align 16
-bilinear_filters_ssse3:
- times 8 db 128, 0
- times 8 db 120, 8
- times 8 db 112, 16
- times 8 db 104, 24
- times 8 db 96, 32
- times 8 db 88, 40
- times 8 db 80, 48
- times 8 db 72, 56
- times 8 db 64, 64
- times 8 db 56, 72
- times 8 db 48, 80
- times 8 db 40, 88
- times 8 db 32, 96
- times 8 db 24, 104
- times 8 db 16, 112
- times 8 db 8, 120
-
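The three coefficient tables store the eight 6-tap subpel filters as interleaved signed byte pairs, one 16-byte row per position, which is why the filter index is shifted left by 4 and the sibling tables sit at fixed +128 and +256 byte offsets from k0_k5; row 0 of each table is the marked placeholder for the full-weight copy case. A sketch reconstructing a conventional taps[6] from them, with the values transcribed from the tables above (int is used because 128 does not fit in a signed char):

    /* Per-position 6-tap filters, rebuilt from the byte-pair tables above. */
    static const int k0_k5_c[8][2] = {
      {0, 0}, {0, 0}, {2, 1}, {0, 0}, {3, 3}, {0, 0}, {1, 2}, {0, 0}};
    static const int k1_k3_c[8][2] = {
      {0, 0}, {-6, 12}, {-11, 36}, {-9, 50},
      {-16, 77}, {-6, 93}, {-8, 108}, {-1, 123}};
    static const int k2_k4_c[8][2] = {
      {128, 0}, {123, -1}, {108, -8}, {93, -6},
      {77, -16}, {50, -9}, {36, -11}, {12, -6}};

    static void get_taps(int p, int taps[6]) {  /* p = 0..7 */
      taps[0] = k0_k5_c[p][0];  taps[5] = k0_k5_c[p][1];
      taps[1] = k1_k3_c[p][0];  taps[3] = k1_k3_c[p][1];
      taps[2] = k2_k4_c[p][0];  taps[4] = k2_k4_c[p][1];
    }

Each reconstructed row sums to 128, matching VP9_FILTER_WEIGHT, and the rows whose k0_k5 pair is all zero are exactly the ones the asm diverts to its 4-tap fast paths.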
diff --git a/vp9/common/x86/vp9_subpixel_x86.h b/vp9/common/x86/vp9_subpixel_x86.h
deleted file mode 100644
index 25bc26d9b..000000000
--- a/vp9/common/x86/vp9_subpixel_x86.h
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_
-#define VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_
-
-/* Note:
- *
- * This platform is commonly built for runtime CPU detection. If you modify
- * any of the function mappings present in this file, be sure to also update
- * them in the function pointer initialization code.
- */
-
-#if HAVE_MMX
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_mmx);
-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_mmx);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_mmx);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_mmx
-
-#undef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_mmx
-
-#undef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_mmx
-
-#undef vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_mmx
-
-#undef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_mmx
-
-#endif
-#endif
-
-
-#if HAVE_SSE2
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_sse2);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_sse2);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_sse2);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_sse2);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_sse2);
-
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_sse2
-
-#undef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_sse2
-
-#undef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_sse2
-
-#undef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_sse2
-
-#undef vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_sse2
-
-#endif
-#endif
-
-#if HAVE_SSSE3
-extern prototype_subpixel_predict(vp9_sixtap_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x8_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict8x4_ssse3);
-extern prototype_subpixel_predict(vp9_sixtap_predict4x4_ssse3);
-extern prototype_subpixel_predict(vp9_bilinear_predict16x16_ssse3);
-extern prototype_subpixel_predict(vp9_bilinear_predict8x8_ssse3);
-
-#if !CONFIG_RUNTIME_CPU_DETECT
-#undef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_ssse3
-
-#undef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3
-
-#undef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_ssse3
-
-#undef vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict4x4_ssse3
-
-
-#undef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_ssse3
-
-#undef vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_ssse3
-
-#endif
-#endif
-
-#endif  /* VP9_COMMON_X86_VP9_SUBPIXEL_X86_H_ */
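This header was the pre-RTCD mapping layer: under !CONFIG_RUNTIME_CPU_DETECT it hard-wired the vp9_subpix_* names to the best ISA variant at build time, a job the generated vp9_rtcd.h framework takes over. The macro's expansion is not shown in this diff, but the six-argument shape matches the prototype comments in the deleted asm above; a hedged sketch of what one mapping amounted to:

    /* Assumed expansion of prototype_subpixel_predict(), matching the
     * (src, src_pitch, xoffset, yoffset, dst, dst_pitch) prototypes in the
     * deleted asm above; treat the exact signature as illustrative. */
    void vp9_sixtap_predict8x8_ssse3(unsigned char *src_ptr,
                                     int src_pixels_per_line,
                                     int xoffset, int yoffset,
                                     unsigned char *dst_ptr, int dst_pitch);

    #if HAVE_SSSE3 && !CONFIG_RUNTIME_CPU_DETECT
    #define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_ssse3
    #endif

With runtime detection enabled, the same symbols were instead installed into a function-pointer table once the CPU's feature flags had been probed.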