summary | refs | log | tree | commit | diff
path: root/vp9/encoder
diff options
context:
space:
mode:
author: Jim Bankoski <jimbankoski@google.com> 2013-02-28 08:32:14 -0800
committer: Jim Bankoski <jimbankoski@google.com> 2013-02-28 08:46:35 -0800
commit: 714aa9f3c072624186df161589bacbb778369312 (patch)
tree: 22563b7ebd98666e4c29430196f2fdcde37e6b0d /vp9/encoder
parent: b715e371c05324c84b3a58ca19f5348caa2ff695 (diff)
download: libvpx-714aa9f3c072624186df161589bacbb778369312.tar
libvpx-714aa9f3c072624186df161589bacbb778369312.tar.gz
libvpx-714aa9f3c072624186df161589bacbb778369312.tar.bz2
libvpx-714aa9f3c072624186df161589bacbb778369312.zip
This commit converts all SAD pointers to uint32

The sse4_1 code used uint16_t for returning the SAD, but that won't work for 32x32 or 64x64. This commit fixes the assembly for those and also re-enables sse4_1 on Linux.

Change-Id: I5ce7288d581db870a148e5f7c5092826f59edd81
Diffstat (limited to 'vp9/encoder')
-rw-r--r-- vp9/encoder/vp9_mcomp.c 2
-rw-r--r-- vp9/encoder/vp9_sad_c.c 350
-rw-r--r-- vp9/encoder/vp9_variance.h 2
-rw-r--r-- vp9/encoder/x86/vp9_sad_sse4.asm 118
4 files changed, 239 insertions, 233 deletions
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 64d8d7d6c..5287f9785 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -1782,7 +1782,7 @@ int vp9_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
int col_min = ref_col - distance;
int col_max = ref_col + distance;
- DECLARE_ALIGNED_ARRAY(16, uint16_t, sad_array8, 8);
+ DECLARE_ALIGNED_ARRAY(16, uint32_t, sad_array8, 8);
unsigned int sad_array[3];
int_mv fcenter_mv;
diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad_c.c
index daff0c972..dc21f02f6 100644
--- a/vp9/encoder/vp9_sad_c.c
+++ b/vp9/encoder/vp9_sad_c.c
@@ -103,62 +103,62 @@ void vp9_sad64x64x8_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
- uint16_t *sad_array) {
- sad_array[0] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
- ref_ptr, ref_stride,
- 0x7fffffff);
- sad_array[1] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
- ref_ptr + 1, ref_stride,
- 0x7fffffff);
- sad_array[2] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
- ref_ptr + 2, ref_stride,
- 0x7fffffff);
- sad_array[3] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
- ref_ptr + 3, ref_stride,
- 0x7fffffff);
- sad_array[4] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
- ref_ptr + 4, ref_stride,
- 0x7fffffff);
- sad_array[5] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
- ref_ptr + 5, ref_stride,
- 0x7fffffff);
- sad_array[6] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
- ref_ptr + 6, ref_stride,
- 0x7fffffff);
- sad_array[7] = (uint16_t)vp9_sad64x64(src_ptr, src_stride,
- ref_ptr + 7, ref_stride,
- 0x7fffffff);
+ unsigned int *sad_array) {
+ sad_array[0] = vp9_sad64x64(src_ptr, src_stride,
+ ref_ptr, ref_stride,
+ 0x7fffffff);
+ sad_array[1] = vp9_sad64x64(src_ptr, src_stride,
+ ref_ptr + 1, ref_stride,
+ 0x7fffffff);
+ sad_array[2] = vp9_sad64x64(src_ptr, src_stride,
+ ref_ptr + 2, ref_stride,
+ 0x7fffffff);
+ sad_array[3] = vp9_sad64x64(src_ptr, src_stride,
+ ref_ptr + 3, ref_stride,
+ 0x7fffffff);
+ sad_array[4] = vp9_sad64x64(src_ptr, src_stride,
+ ref_ptr + 4, ref_stride,
+ 0x7fffffff);
+ sad_array[5] = vp9_sad64x64(src_ptr, src_stride,
+ ref_ptr + 5, ref_stride,
+ 0x7fffffff);
+ sad_array[6] = vp9_sad64x64(src_ptr, src_stride,
+ ref_ptr + 6, ref_stride,
+ 0x7fffffff);
+ sad_array[7] = vp9_sad64x64(src_ptr, src_stride,
+ ref_ptr + 7, ref_stride,
+ 0x7fffffff);
}
void vp9_sad32x32x8_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
- uint16_t *sad_array) {
- sad_array[0] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
- ref_ptr, ref_stride,
- 0x7fffffff);
- sad_array[1] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
- ref_ptr + 1, ref_stride,
- 0x7fffffff);
- sad_array[2] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
- ref_ptr + 2, ref_stride,
- 0x7fffffff);
- sad_array[3] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
- ref_ptr + 3, ref_stride,
- 0x7fffffff);
- sad_array[4] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
- ref_ptr + 4, ref_stride,
- 0x7fffffff);
- sad_array[5] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
- ref_ptr + 5, ref_stride,
- 0x7fffffff);
- sad_array[6] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
- ref_ptr + 6, ref_stride,
- 0x7fffffff);
- sad_array[7] = (uint16_t)vp9_sad32x32(src_ptr, src_stride,
- ref_ptr + 7, ref_stride,
- 0x7fffffff);
+ unsigned int *sad_array) {
+ sad_array[0] = vp9_sad32x32(src_ptr, src_stride,
+ ref_ptr, ref_stride,
+ 0x7fffffff);
+ sad_array[1] = vp9_sad32x32(src_ptr, src_stride,
+ ref_ptr + 1, ref_stride,
+ 0x7fffffff);
+ sad_array[2] = vp9_sad32x32(src_ptr, src_stride,
+ ref_ptr + 2, ref_stride,
+ 0x7fffffff);
+ sad_array[3] = vp9_sad32x32(src_ptr, src_stride,
+ ref_ptr + 3, ref_stride,
+ 0x7fffffff);
+ sad_array[4] = vp9_sad32x32(src_ptr, src_stride,
+ ref_ptr + 4, ref_stride,
+ 0x7fffffff);
+ sad_array[5] = vp9_sad32x32(src_ptr, src_stride,
+ ref_ptr + 5, ref_stride,
+ 0x7fffffff);
+ sad_array[6] = vp9_sad32x32(src_ptr, src_stride,
+ ref_ptr + 6, ref_stride,
+ 0x7fffffff);
+ sad_array[7] = vp9_sad32x32(src_ptr, src_stride,
+ ref_ptr + 7, ref_stride,
+ 0x7fffffff);
}
void vp9_sad16x16x3_c(const uint8_t *src_ptr,
@@ -178,31 +178,31 @@ void vp9_sad16x16x8_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
- uint16_t *sad_array) {
- sad_array[0] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
- ref_ptr, ref_stride,
- 0x7fffffff);
- sad_array[1] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
- ref_ptr + 1, ref_stride,
- 0x7fffffff);
- sad_array[2] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
- ref_ptr + 2, ref_stride,
- 0x7fffffff);
- sad_array[3] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
- ref_ptr + 3, ref_stride,
- 0x7fffffff);
- sad_array[4] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
- ref_ptr + 4, ref_stride,
- 0x7fffffff);
- sad_array[5] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
- ref_ptr + 5, ref_stride,
- 0x7fffffff);
- sad_array[6] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
- ref_ptr + 6, ref_stride,
- 0x7fffffff);
- sad_array[7] = (uint16_t)vp9_sad16x16(src_ptr, src_stride,
- ref_ptr + 7, ref_stride,
- 0x7fffffff);
+ uint32_t *sad_array) {
+ sad_array[0] = vp9_sad16x16(src_ptr, src_stride,
+ ref_ptr, ref_stride,
+ 0x7fffffff);
+ sad_array[1] = vp9_sad16x16(src_ptr, src_stride,
+ ref_ptr + 1, ref_stride,
+ 0x7fffffff);
+ sad_array[2] = vp9_sad16x16(src_ptr, src_stride,
+ ref_ptr + 2, ref_stride,
+ 0x7fffffff);
+ sad_array[3] = vp9_sad16x16(src_ptr, src_stride,
+ ref_ptr + 3, ref_stride,
+ 0x7fffffff);
+ sad_array[4] = vp9_sad16x16(src_ptr, src_stride,
+ ref_ptr + 4, ref_stride,
+ 0x7fffffff);
+ sad_array[5] = vp9_sad16x16(src_ptr, src_stride,
+ ref_ptr + 5, ref_stride,
+ 0x7fffffff);
+ sad_array[6] = vp9_sad16x16(src_ptr, src_stride,
+ ref_ptr + 6, ref_stride,
+ 0x7fffffff);
+ sad_array[7] = vp9_sad16x16(src_ptr, src_stride,
+ ref_ptr + 7, ref_stride,
+ 0x7fffffff);
}
void vp9_sad16x8x3_c(const uint8_t *src_ptr,
@@ -222,31 +222,31 @@ void vp9_sad16x8x8_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
- uint16_t *sad_array) {
- sad_array[0] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
- ref_ptr, ref_stride,
- 0x7fffffff);
- sad_array[1] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
- ref_ptr + 1, ref_stride,
- 0x7fffffff);
- sad_array[2] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
- ref_ptr + 2, ref_stride,
- 0x7fffffff);
- sad_array[3] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
- ref_ptr + 3, ref_stride,
- 0x7fffffff);
- sad_array[4] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
- ref_ptr + 4, ref_stride,
- 0x7fffffff);
- sad_array[5] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
- ref_ptr + 5, ref_stride,
- 0x7fffffff);
- sad_array[6] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
- ref_ptr + 6, ref_stride,
- 0x7fffffff);
- sad_array[7] = (uint16_t)vp9_sad16x8(src_ptr, src_stride,
- ref_ptr + 7, ref_stride,
- 0x7fffffff);
+ uint32_t *sad_array) {
+ sad_array[0] = vp9_sad16x8(src_ptr, src_stride,
+ ref_ptr, ref_stride,
+ 0x7fffffff);
+ sad_array[1] = vp9_sad16x8(src_ptr, src_stride,
+ ref_ptr + 1, ref_stride,
+ 0x7fffffff);
+ sad_array[2] = vp9_sad16x8(src_ptr, src_stride,
+ ref_ptr + 2, ref_stride,
+ 0x7fffffff);
+ sad_array[3] = vp9_sad16x8(src_ptr, src_stride,
+ ref_ptr + 3, ref_stride,
+ 0x7fffffff);
+ sad_array[4] = vp9_sad16x8(src_ptr, src_stride,
+ ref_ptr + 4, ref_stride,
+ 0x7fffffff);
+ sad_array[5] = vp9_sad16x8(src_ptr, src_stride,
+ ref_ptr + 5, ref_stride,
+ 0x7fffffff);
+ sad_array[6] = vp9_sad16x8(src_ptr, src_stride,
+ ref_ptr + 6, ref_stride,
+ 0x7fffffff);
+ sad_array[7] = vp9_sad16x8(src_ptr, src_stride,
+ ref_ptr + 7, ref_stride,
+ 0x7fffffff);
}
void vp9_sad8x8x3_c(const uint8_t *src_ptr,
@@ -266,31 +266,31 @@ void vp9_sad8x8x8_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
- uint16_t *sad_array) {
- sad_array[0] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
- ref_ptr, ref_stride,
- 0x7fffffff);
- sad_array[1] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
- ref_ptr + 1, ref_stride,
- 0x7fffffff);
- sad_array[2] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
- ref_ptr + 2, ref_stride,
- 0x7fffffff);
- sad_array[3] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
- ref_ptr + 3, ref_stride,
- 0x7fffffff);
- sad_array[4] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
- ref_ptr + 4, ref_stride,
- 0x7fffffff);
- sad_array[5] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
- ref_ptr + 5, ref_stride,
- 0x7fffffff);
- sad_array[6] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
- ref_ptr + 6, ref_stride,
- 0x7fffffff);
- sad_array[7] = (uint16_t)vp9_sad8x8(src_ptr, src_stride,
- ref_ptr + 7, ref_stride,
- 0x7fffffff);
+ uint32_t *sad_array) {
+ sad_array[0] = vp9_sad8x8(src_ptr, src_stride,
+ ref_ptr, ref_stride,
+ 0x7fffffff);
+ sad_array[1] = vp9_sad8x8(src_ptr, src_stride,
+ ref_ptr + 1, ref_stride,
+ 0x7fffffff);
+ sad_array[2] = vp9_sad8x8(src_ptr, src_stride,
+ ref_ptr + 2, ref_stride,
+ 0x7fffffff);
+ sad_array[3] = vp9_sad8x8(src_ptr, src_stride,
+ ref_ptr + 3, ref_stride,
+ 0x7fffffff);
+ sad_array[4] = vp9_sad8x8(src_ptr, src_stride,
+ ref_ptr + 4, ref_stride,
+ 0x7fffffff);
+ sad_array[5] = vp9_sad8x8(src_ptr, src_stride,
+ ref_ptr + 5, ref_stride,
+ 0x7fffffff);
+ sad_array[6] = vp9_sad8x8(src_ptr, src_stride,
+ ref_ptr + 6, ref_stride,
+ 0x7fffffff);
+ sad_array[7] = vp9_sad8x8(src_ptr, src_stride,
+ ref_ptr + 7, ref_stride,
+ 0x7fffffff);
}
void vp9_sad8x16x3_c(const uint8_t *src_ptr,
@@ -310,31 +310,31 @@ void vp9_sad8x16x8_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
- uint16_t *sad_array) {
- sad_array[0] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
- ref_ptr, ref_stride,
- 0x7fffffff);
- sad_array[1] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
- ref_ptr + 1, ref_stride,
- 0x7fffffff);
- sad_array[2] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
- ref_ptr + 2, ref_stride,
- 0x7fffffff);
- sad_array[3] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
- ref_ptr + 3, ref_stride,
- 0x7fffffff);
- sad_array[4] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
- ref_ptr + 4, ref_stride,
- 0x7fffffff);
- sad_array[5] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
- ref_ptr + 5, ref_stride,
- 0x7fffffff);
- sad_array[6] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
- ref_ptr + 6, ref_stride,
- 0x7fffffff);
- sad_array[7] = (uint16_t)vp9_sad8x16(src_ptr, src_stride,
- ref_ptr + 7, ref_stride,
- 0x7fffffff);
+ uint32_t *sad_array) {
+ sad_array[0] = vp9_sad8x16(src_ptr, src_stride,
+ ref_ptr, ref_stride,
+ 0x7fffffff);
+ sad_array[1] = vp9_sad8x16(src_ptr, src_stride,
+ ref_ptr + 1, ref_stride,
+ 0x7fffffff);
+ sad_array[2] = vp9_sad8x16(src_ptr, src_stride,
+ ref_ptr + 2, ref_stride,
+ 0x7fffffff);
+ sad_array[3] = vp9_sad8x16(src_ptr, src_stride,
+ ref_ptr + 3, ref_stride,
+ 0x7fffffff);
+ sad_array[4] = vp9_sad8x16(src_ptr, src_stride,
+ ref_ptr + 4, ref_stride,
+ 0x7fffffff);
+ sad_array[5] = vp9_sad8x16(src_ptr, src_stride,
+ ref_ptr + 5, ref_stride,
+ 0x7fffffff);
+ sad_array[6] = vp9_sad8x16(src_ptr, src_stride,
+ ref_ptr + 6, ref_stride,
+ 0x7fffffff);
+ sad_array[7] = vp9_sad8x16(src_ptr, src_stride,
+ ref_ptr + 7, ref_stride,
+ 0x7fffffff);
}
void vp9_sad4x4x3_c(const uint8_t *src_ptr,
@@ -354,31 +354,31 @@ void vp9_sad4x4x8_c(const uint8_t *src_ptr,
int src_stride,
const uint8_t *ref_ptr,
int ref_stride,
- uint16_t *sad_array) {
- sad_array[0] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
- ref_ptr, ref_stride,
- 0x7fffffff);
- sad_array[1] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
- ref_ptr + 1, ref_stride,
- 0x7fffffff);
- sad_array[2] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
- ref_ptr + 2, ref_stride,
- 0x7fffffff);
- sad_array[3] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
- ref_ptr + 3, ref_stride,
- 0x7fffffff);
- sad_array[4] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
- ref_ptr + 4, ref_stride,
- 0x7fffffff);
- sad_array[5] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
- ref_ptr + 5, ref_stride,
- 0x7fffffff);
- sad_array[6] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
- ref_ptr + 6, ref_stride,
- 0x7fffffff);
- sad_array[7] = (uint16_t)vp9_sad4x4(src_ptr, src_stride,
- ref_ptr + 7, ref_stride,
- 0x7fffffff);
+ uint32_t *sad_array) {
+ sad_array[0] = vp9_sad4x4(src_ptr, src_stride,
+ ref_ptr, ref_stride,
+ 0x7fffffff);
+ sad_array[1] = vp9_sad4x4(src_ptr, src_stride,
+ ref_ptr + 1, ref_stride,
+ 0x7fffffff);
+ sad_array[2] = vp9_sad4x4(src_ptr, src_stride,
+ ref_ptr + 2, ref_stride,
+ 0x7fffffff);
+ sad_array[3] = vp9_sad4x4(src_ptr, src_stride,
+ ref_ptr + 3, ref_stride,
+ 0x7fffffff);
+ sad_array[4] = vp9_sad4x4(src_ptr, src_stride,
+ ref_ptr + 4, ref_stride,
+ 0x7fffffff);
+ sad_array[5] = vp9_sad4x4(src_ptr, src_stride,
+ ref_ptr + 5, ref_stride,
+ 0x7fffffff);
+ sad_array[6] = vp9_sad4x4(src_ptr, src_stride,
+ ref_ptr + 6, ref_stride,
+ 0x7fffffff);
+ sad_array[7] = vp9_sad4x4(src_ptr, src_stride,
+ ref_ptr + 7, ref_stride,
+ 0x7fffffff);
}
void vp9_sad64x64x4d_c(const uint8_t *src_ptr,
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index eb903bf94..8b32524a2 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -29,7 +29,7 @@ typedef void (*vp9_sad_multi1_fn_t)(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
int ref_stride,
- unsigned short *sad_array);
+ unsigned int *sad_array);
typedef void (*vp9_sad_multi_d_fn_t)(const uint8_t *src_ptr,
int source_stride,
diff --git a/vp9/encoder/x86/vp9_sad_sse4.asm b/vp9/encoder/x86/vp9_sad_sse4.asm
index b42982a1f..faf1768a9 100644
--- a/vp9/encoder/x86/vp9_sad_sse4.asm
+++ b/vp9/encoder/x86/vp9_sad_sse4.asm
@@ -154,6 +154,16 @@
paddw xmm1, xmm5
%endmacro
+%macro WRITE_AS_INTS 0
+ mov rdi, arg(4) ;Results
+ pxor xmm0, xmm0
+ movdqa xmm2, xmm1
+ punpcklwd xmm1, xmm0
+ punpckhwd xmm2, xmm0
+
+ movdqa [rdi], xmm1
+ movdqa [rdi + 16], xmm2
+%endmacro
;void vp9_sad16x16x8_sse4(
; const unsigned char *src_ptr,
@@ -170,23 +180,22 @@ sym(vp9_sad16x16x8_sse4):
push rdi
; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
- PROCESS_16X2X8 1
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
+ PROCESS_16X2X8 1
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
+ WRITE_AS_INTS
; begin epilog
pop rdi
@@ -212,19 +221,18 @@ sym(vp9_sad16x8x8_sse4):
push rdi
; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
- PROCESS_16X2X8 1
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
- PROCESS_16X2X8 0
+ PROCESS_16X2X8 1
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
+ PROCESS_16X2X8 0
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
+ WRITE_AS_INTS
; begin epilog
pop rdi
@@ -250,19 +258,18 @@ sym(vp9_sad8x8x8_sse4):
push rdi
; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
- PROCESS_8X2X8 1
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
+ PROCESS_8X2X8 1
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
+ WRITE_AS_INTS
; begin epilog
pop rdi
@@ -288,22 +295,22 @@ sym(vp9_sad8x16x8_sse4):
push rdi
; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
+
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
+ PROCESS_8X2X8 1
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
+ PROCESS_8X2X8 0
- PROCESS_8X2X8 1
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- PROCESS_8X2X8 0
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
+ WRITE_AS_INTS
; begin epilog
pop rdi
@@ -329,17 +336,16 @@ sym(vp9_sad4x4x8_sse4):
push rdi
; end prolog
- mov rsi, arg(0) ;src_ptr
- mov rdi, arg(2) ;ref_ptr
+ mov rsi, arg(0) ;src_ptr
+ mov rdi, arg(2) ;ref_ptr
- movsxd rax, dword ptr arg(1) ;src_stride
- movsxd rdx, dword ptr arg(3) ;ref_stride
+ movsxd rax, dword ptr arg(1) ;src_stride
+ movsxd rdx, dword ptr arg(3) ;ref_stride
- PROCESS_4X2X8 1
- PROCESS_4X2X8 0
+ PROCESS_4X2X8 1
+ PROCESS_4X2X8 0
- mov rdi, arg(4) ;Results
- movdqa XMMWORD PTR [rdi], xmm1
+ WRITE_AS_INTS
; begin epilog
pop rdi