summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Scott LaVarnway <slavarnway@google.com> 2012-04-12 14:22:47 -0400
committer Scott LaVarnway <slavarnway@google.com> 2012-04-12 14:22:47 -0400
commit e0a80519c726f3097c3896b6fc155741f64f68b0 (patch)
tree 3b6567d77aaf4353801e3f7be6b1c112e8d1c6fa
parent d9ca52452bff9481ec1ed4684985a86fb7fc3c3e (diff)
download libvpx-e0a80519c726f3097c3896b6fc155741f64f68b0.tar
libvpx-e0a80519c726f3097c3896b6fc155741f64f68b0.tar.gz
libvpx-e0a80519c726f3097c3896b6fc155741f64f68b0.tar.bz2
libvpx-e0a80519c726f3097c3896b6fc155741f64f68b0.zip
loopfilter improvements
Local variable offsets are now consistent for the functions, removed unused parameters, reworked the assembly to eliminate stalls/instructions.

Change-Id: Iaa37668f8a9bb8754df435f6a51c3a08d547f879
-rw-r--r-- vp8/common/loopfilter.c 99
-rw-r--r-- vp8/common/x86/loopfilter_sse2.asm 750
-rw-r--r-- vp8/common/x86/loopfilter_x86.c 28
3 files changed, 402 insertions, 475 deletions
diff --git a/vp8/common/loopfilter.c b/vp8/common/loopfilter.c
index 66b280d33..3f05efe81 100644
--- a/vp8/common/loopfilter.c
+++ b/vp8/common/loopfilter.c
@@ -210,6 +210,8 @@ void vp8_loop_filter_frame
int mb_row;
int mb_col;
+ int mb_rows = cm->mb_rows;
+ int mb_cols = cm->mb_cols;
int filter_level;
@@ -217,6 +219,8 @@ void vp8_loop_filter_frame
/* Point at base of Mb MODE_INFO list */
const MODE_INFO *mode_info_context = cm->mi;
+ int post_y_stride = post->y_stride;
+ int post_uv_stride = post->uv_stride;
/* Initialize the loop filter for this frame. */
vp8_loop_filter_frame_init(cm, mbd, cm->filter_level);
@@ -227,23 +231,23 @@ void vp8_loop_filter_frame
v_ptr = post->v_buffer;
/* vp8_filter each macro block */
- for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ if (cm->filter_type == NORMAL_LOOPFILTER)
{
- for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ for (mb_row = 0; mb_row < mb_rows; mb_row++)
{
- int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
- mode_info_context->mbmi.mode != SPLITMV &&
- mode_info_context->mbmi.mb_skip_coeff);
+ for (mb_col = 0; mb_col < mb_cols; mb_col++)
+ {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
- const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
- const int seg = mode_info_context->mbmi.segment_id;
- const int ref_frame = mode_info_context->mbmi.ref_frame;
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
- filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
- if (filter_level)
- {
- if (cm->filter_type == NORMAL_LOOPFILTER)
+ if (filter_level)
{
const int hev_index = lfi_n->hev_thr_lut[frame_type][filter_level];
lfi.mblim = lfi_n->mblim[filter_level];
@@ -253,54 +257,87 @@ void vp8_loop_filter_frame
if (mb_col > 0)
vp8_loop_filter_mbv
- (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
+ (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
if (!skip_lf)
vp8_loop_filter_bv
- (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
+ (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
/* don't apply across umv border */
if (mb_row > 0)
vp8_loop_filter_mbh
- (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
+ (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
if (!skip_lf)
vp8_loop_filter_bh
- (y_ptr, u_ptr, v_ptr, post->y_stride, post->uv_stride, &lfi);
+ (y_ptr, u_ptr, v_ptr, post_y_stride, post_uv_stride, &lfi);
}
- else
+
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+
+ mode_info_context++; /* step to next MB */
+ }
+ y_ptr += post_y_stride * 16 - post->y_width;
+ u_ptr += post_uv_stride * 8 - post->uv_width;
+ v_ptr += post_uv_stride * 8 - post->uv_width;
+
+ mode_info_context++; /* Skip border mb */
+
+ }
+ }
+ else /* SIMPLE_LOOPFILTER */
+ {
+ for (mb_row = 0; mb_row < mb_rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < mb_cols; mb_col++)
+ {
+ int skip_lf = (mode_info_context->mbmi.mode != B_PRED &&
+ mode_info_context->mbmi.mode != SPLITMV &&
+ mode_info_context->mbmi.mb_skip_coeff);
+
+ const int mode_index = lfi_n->mode_lf_lut[mode_info_context->mbmi.mode];
+ const int seg = mode_info_context->mbmi.segment_id;
+ const int ref_frame = mode_info_context->mbmi.ref_frame;
+
+ filter_level = lfi_n->lvl[seg][ref_frame][mode_index];
+ if (filter_level)
{
+ const unsigned char * mblim = lfi_n->mblim[filter_level];
+ const unsigned char * blim = lfi_n->blim[filter_level];
+
if (mb_col > 0)
vp8_loop_filter_simple_mbv
- (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+ (y_ptr, post_y_stride, mblim);
if (!skip_lf)
vp8_loop_filter_simple_bv
- (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+ (y_ptr, post_y_stride, blim);
/* don't apply across umv border */
if (mb_row > 0)
vp8_loop_filter_simple_mbh
- (y_ptr, post->y_stride, lfi_n->mblim[filter_level]);
+ (y_ptr, post_y_stride, mblim);
if (!skip_lf)
vp8_loop_filter_simple_bh
- (y_ptr, post->y_stride, lfi_n->blim[filter_level]);
+ (y_ptr, post_y_stride, blim);
}
- }
- y_ptr += 16;
- u_ptr += 8;
- v_ptr += 8;
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
- mode_info_context++; /* step to next MB */
- }
+ mode_info_context++; /* step to next MB */
+ }
+ y_ptr += post_y_stride * 16 - post->y_width;
+ u_ptr += post_uv_stride * 8 - post->uv_width;
+ v_ptr += post_uv_stride * 8 - post->uv_width;
- y_ptr += post->y_stride * 16 - post->y_width;
- u_ptr += post->uv_stride * 8 - post->uv_width;
- v_ptr += post->uv_stride * 8 - post->uv_width;
+ mode_info_context++; /* Skip border mb */
- mode_info_context++; /* Skip border mb */
+ }
}
}
diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm
index 2ad010adb..9944c33fe 100644
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -10,6 +10,17 @@
%include "vpx_ports/x86_abi_support.asm"
+%define _t0 0
+%define _t1 _t0 + 16
+%define _p3 _t1 + 16
+%define _p2 _p3 + 16
+%define _p1 _p2 + 16
+%define _p0 _p1 + 16
+%define _q0 _p0 + 16
+%define _q1 _q0 + 16
+%define _q2 _q1 + 16
+%define _q3 _q2 + 16
+%define lf_var_size 160
; Use of pmaxub instead of psubusb to compute filter mask was seen
; in ffvp8
@@ -35,9 +46,10 @@
lea rsi, [rsi + rax*4]
lea rdi, [rdi + rax*4]
- movdqa XMMWORD PTR [rsp], xmm1 ; store q2
- movdqa XMMWORD PTR [rsp + 16], xmm4 ; store q1
+ movdqa [rsp+_q2], xmm1 ; store q2
+ movdqa [rsp+_q1], xmm4 ; store q1
%endif
+ movdqa xmm7, [rdx] ;limit
movdqa xmm6, xmm1 ; q2
movdqa xmm3, xmm4 ; q1
@@ -58,7 +70,7 @@
psubusb xmm3, xmm0 ; q1-=q0
por xmm5, xmm3 ; abs(q0-q1)
- movdqa t0, xmm5 ; save to t0
+ movdqa [rsp+_t0], xmm5 ; save to t0
pmaxub xmm1, xmm5
@@ -75,8 +87,8 @@
movhps xmm4, [rdi]
movhps xmm6, [rdi + rcx]
- movdqa XMMWORD PTR [rsp + 32], xmm4 ; store p2
- movdqa XMMWORD PTR [rsp + 48], xmm6 ; store p1
+ movdqa [rsp+_p2], xmm4 ; store p2
+ movdqa [rsp+_p1], xmm6 ; store p1
%endif
movdqa xmm5, xmm4 ; p2
@@ -101,7 +113,7 @@
%else
movlps xmm4, [rsi + rcx*2] ; p0
movhps xmm4, [rdi + rcx*2]
- movdqa xmm3, q1 ; q1
+ movdqa xmm3, [rsp+_q1] ; q1
%endif
movdqa xmm5, xmm4 ; p0
@@ -112,7 +124,7 @@
por xmm6, xmm4 ; abs(p1 - p0)
mov rdx, arg(2) ; get blimit
- movdqa t1, xmm6 ; save to t1
+ movdqa [rsp+_t1], xmm6 ; save to t1
movdqa xmm4, xmm3 ; q1
pmaxub xmm1, xmm6
@@ -123,30 +135,27 @@
psubusb xmm1, xmm7
por xmm2, xmm3 ; abs(p1-q1)
- movdqa xmm7, XMMWORD PTR [rdx] ; blimit
+ movdqa xmm7, [rdx] ; blimit
+ mov rdx, arg(4) ; hev get thresh
movdqa xmm3, xmm0 ; q0
pand xmm2, [GLOBAL(tfe)] ; set lsb of each byte to zero
- mov rdx, arg(4) ; hev get thresh
-
movdqa xmm6, xmm5 ; p0
psrlw xmm2, 1 ; abs(p1-q1)/2
psubusb xmm5, xmm3 ; p0-=q0
-
psubusb xmm3, xmm6 ; q0-=p0
por xmm5, xmm3 ; abs(p0 - q0)
paddusb xmm5, xmm5 ; abs(p0-q0)*2
- movdqa xmm4, t0 ; hev get abs (q1 - q0)
-
- movdqa xmm3, t1 ; get abs (p1 - p0)
+ movdqa xmm4, [rsp+_t0] ; hev get abs (q1 - q0)
+ movdqa xmm3, [rsp+_t1] ; get abs (p1 - p0)
paddusb xmm5, xmm2 ; abs (p0 - q0) *2 + abs(p1-q1)/2
- movdqa xmm2, XMMWORD PTR [rdx] ; hev
+ movdqa xmm2, [rdx] ; hev
psubusb xmm5, xmm7 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
psubusb xmm4, xmm2 ; hev
@@ -165,43 +174,37 @@
%endmacro
%macro B_FILTER 1
+ movdqa xmm3, [GLOBAL(t80)]
%if %1 == 0
- movdqa xmm2, p1 ; p1
- movdqa xmm7, q1 ; q1
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm7, [rsp+_q1] ; q1
%elif %1 == 1
movdqa xmm2, [rsi+2*rax] ; p1
movdqa xmm7, [rdi] ; q1
%elif %1 == 2
- lea rdx, srct
-
- movdqa xmm2, [rdx] ; p1
- movdqa xmm7, [rdx+48] ; q1
- movdqa xmm6, [rdx+16] ; p0
- movdqa xmm0, [rdx+32] ; q0
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm6, [rsp+_p0] ; p0
+ movdqa xmm0, [rsp+_q0] ; q0
+ movdqa xmm7, [rsp+_q1] ; q1
%endif
- pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
+ pxor xmm2, xmm3 ; p1 offset to convert to signed values
+ pxor xmm7, xmm3 ; q1 offset to convert to signed values
psubsb xmm2, xmm7 ; p1 - q1
- pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor xmm6, xmm3 ; offset to convert to signed values
pand xmm2, xmm4 ; high var mask (hvm)(p1 - q1)
- pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor xmm0, xmm3 ; offset to convert to signed values
movdqa xmm3, xmm0 ; q0
psubsb xmm0, xmm6 ; q0 - p0
-
paddsb xmm2, xmm0 ; 1 * (q0 - p0) + hvm(p1 - q1)
-
paddsb xmm2, xmm0 ; 2 * (q0 - p0) + hvm(p1 - q1)
-
paddsb xmm2, xmm0 ; 3 * (q0 - p0) + hvm(p1 - q1)
-
pand xmm1, xmm2 ; mask filter values we don't care about
movdqa xmm2, xmm1
-
paddsb xmm1, [GLOBAL(t4)] ; 3* (q0 - p0) + hvm(p1 - q1) + 4
paddsb xmm2, [GLOBAL(t3)] ; 3* (q0 - p0) + hvm(p1 - q1) + 3
@@ -221,47 +224,49 @@
movdqa xmm5, xmm0 ; save results
packsswb xmm0, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>3
- paddsw xmm5, [GLOBAL(ones)]
- paddsw xmm1, [GLOBAL(ones)]
- psraw xmm5, 1 ; partial shifted one more time for 2nd tap
+ paddsb xmm6, xmm2 ; p0+= p0 add
+ movdqa xmm2, [GLOBAL(ones)]
+ paddsw xmm5, xmm2
+ paddsw xmm1, xmm2
+ psraw xmm5, 1 ; partial shifted one more time for 2nd tap
psraw xmm1, 1 ; partial shifted one more time for 2nd tap
-
- paddsb xmm6, xmm2 ; p0+= p0 add
packsswb xmm5, xmm1 ; (3* (q0 - p0) + hvm(p1 - q1) + 4) >>4
+ movdqa xmm2, [GLOBAL(t80)]
%if %1 == 0
- movdqa xmm1, p1 ; p1
+ movdqa xmm1, [rsp+_p1] ; p1
+ lea rsi, [rsi + rcx*2]
+ lea rdi, [rdi + rcx*2]
%elif %1 == 1
movdqa xmm1, [rsi+2*rax] ; p1
%elif %1 == 2
- movdqa xmm1, [rdx] ; p1
+ movdqa xmm1, [rsp+_p1] ; p1
%endif
+
pandn xmm4, xmm5 ; high edge variance additive
- pxor xmm6, [GLOBAL(t80)] ; unoffset
+ pxor xmm6, xmm2 ; unoffset
- pxor xmm1, [GLOBAL(t80)] ; reoffset
+ pxor xmm1, xmm2 ; reoffset
psubsb xmm3, xmm0 ; q0-= q0 add
paddsb xmm1, xmm4 ; p1+= p1 add
- pxor xmm3, [GLOBAL(t80)] ; unoffset
+ pxor xmm3, xmm2 ; unoffset
- pxor xmm1, [GLOBAL(t80)] ; unoffset
+ pxor xmm1, xmm2 ; unoffset
psubsb xmm7, xmm4 ; q1-= q1 add
- pxor xmm7, [GLOBAL(t80)] ; unoffset
+ pxor xmm7, xmm2 ; unoffset
%if %1 == 0
- lea rsi, [rsi + rcx*2]
- lea rdi, [rdi + rcx*2]
- movq MMWORD PTR [rsi], xmm6 ; p0
- movhps MMWORD PTR [rdi], xmm6
- movq MMWORD PTR [rsi + rax], xmm1 ; p1
- movhps MMWORD PTR [rdi + rax], xmm1
- movq MMWORD PTR [rsi + rcx], xmm3 ; q0
- movhps MMWORD PTR [rdi + rcx], xmm3
- movq MMWORD PTR [rsi + rcx*2],xmm7 ; q1
- movhps MMWORD PTR [rdi + rcx*2],xmm7
+ movq [rsi], xmm6 ; p0
+ movhps [rdi], xmm6
+ movq [rsi + rax], xmm1 ; p1
+ movhps [rdi + rax], xmm1
+ movq [rsi + rcx], xmm3 ; q0
+ movhps [rdi + rcx], xmm3
+ movq [rsi + rcx*2], xmm7 ; q1
+ movhps [rdi + rcx*2], xmm7
%elif %1 == 1
movdqa [rsi+rax], xmm6 ; write back
movdqa [rsi+2*rax], xmm1 ; write back
@@ -280,13 +285,12 @@
; const char *blimit,
; const char *limit,
; const char *thresh,
-; int count
;)
global sym(vp8_loop_filter_horizontal_edge_sse2)
sym(vp8_loop_filter_horizontal_edge_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
+ SHADOW_ARGS_TO_STACK 5
SAVE_XMM 7
GET_GOT rbx
push rsi
@@ -294,15 +298,12 @@ sym(vp8_loop_filter_horizontal_edge_sse2):
; end prolog
ALIGN_STACK 16, rax
- sub rsp, 32 ; reserve 32 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+ sub rsp, lf_var_size
mov rsi, arg(0) ;src_ptr
movsxd rax, dword ptr arg(1) ;src_pixel_step
mov rdx, arg(3) ;limit
- movdqa xmm7, XMMWORD PTR [rdx]
lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
@@ -311,7 +312,7 @@ sym(vp8_loop_filter_horizontal_edge_sse2):
; filter and write back the result
B_FILTER 1
- add rsp, 32
+ add rsp, lf_var_size
pop rsp
; begin epilog
pop rdi
@@ -345,13 +346,7 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
; end prolog
ALIGN_STACK 16, rax
- sub rsp, 96 ; reserve 96 bytes
- %define q2 [rsp + 0] ;__declspec(align(16)) char q2[16];
- %define q1 [rsp + 16] ;__declspec(align(16)) char q1[16];
- %define p2 [rsp + 32] ;__declspec(align(16)) char p2[16];
- %define p1 [rsp + 48] ;__declspec(align(16)) char p1[16];
- %define t0 [rsp + 64] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 80] ;__declspec(align(16)) char t1[16];
+ sub rsp, lf_var_size
mov rsi, arg(0) ; u
mov rdi, arg(5) ; v
@@ -360,7 +355,6 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
neg rax ; negate pitch to deal with above border
mov rdx, arg(3) ;limit
- movdqa xmm7, XMMWORD PTR [rdx]
lea rsi, [rsi + rcx]
lea rdi, [rdi + rcx]
@@ -370,7 +364,7 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
; filter and write back the result
B_FILTER 0
- add rsp, 96
+ add rsp, lf_var_size
pop rsp
; begin epilog
pop rdi
@@ -383,9 +377,10 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
%macro MB_FILTER_AND_WRITEBACK 1
+ movdqa xmm3, [GLOBAL(t80)]
%if %1 == 0
- movdqa xmm2, p1 ; p1
- movdqa xmm7, q1 ; q1
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm7, [rsp+_q1] ; q1
%elif %1 == 1
movdqa xmm2, [rsi+2*rax] ; p1
movdqa xmm7, [rdi] ; q1
@@ -393,30 +388,24 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
mov rcx, rax
neg rcx
%elif %1 == 2
- lea rdx, srct
-
- movdqa xmm2, [rdx+32] ; p1
- movdqa xmm7, [rdx+80] ; q1
- movdqa xmm6, [rdx+48] ; p0
- movdqa xmm0, [rdx+64] ; q0
+ movdqa xmm2, [rsp+_p1] ; p1
+ movdqa xmm6, [rsp+_p0] ; p0
+ movdqa xmm0, [rsp+_q0] ; q0
+ movdqa xmm7, [rsp+_q1] ; q1
%endif
- pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
- pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values
+ pxor xmm2, xmm3 ; p1 offset to convert to signed values
+ pxor xmm7, xmm3 ; q1 offset to convert to signed values
+ pxor xmm6, xmm3 ; offset to convert to signed values
+ pxor xmm0, xmm3 ; offset to convert to signed values
psubsb xmm2, xmm7 ; p1 - q1
- movdqa xmm3, xmm0 ; q0
+ movdqa xmm3, xmm0 ; q0
psubsb xmm0, xmm6 ; q0 - p0
-
paddsb xmm2, xmm0 ; 1 * (q0 - p0) + (p1 - q1)
-
paddsb xmm2, xmm0 ; 2 * (q0 - p0)
-
paddsb xmm2, xmm0 ; 3 * (q0 - p0) + (p1 - q1)
-
pand xmm1, xmm2 ; mask filter values we don't care about
movdqa xmm2, xmm1 ; vp8_filter
@@ -428,19 +417,20 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
pxor xmm1, xmm1
punpcklbw xmm0, xmm4 ; Filter 2 (hi)
+ punpckhbw xmm1, xmm4 ; Filter 2 (lo)
+
movdqa xmm5, xmm2
- punpckhbw xmm1, xmm4 ; Filter 2 (lo)
+ movdqa xmm4, [GLOBAL(s9)]
paddsb xmm5, [GLOBAL(t3)] ; vp8_signed_char_clamp(Filter2 + 3)
+ paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
- pmulhw xmm1, [GLOBAL(s9)] ; Filter 2 (lo) * 9
-
- pmulhw xmm0, [GLOBAL(s9)] ; Filter 2 (hi) * 9
+ pmulhw xmm1, xmm4 ; Filter 2 (lo) * 9
+ pmulhw xmm0, xmm4 ; Filter 2 (hi) * 9
punpckhbw xmm7, xmm5 ; axbxcxdx
- paddsb xmm2, [GLOBAL(t4)] ; vp8_signed_char_clamp(Filter2 + 4)
-
punpcklbw xmm5, xmm5 ; exfxgxhx
+
psraw xmm7, 11 ; sign extended shift right by 3
psraw xmm5, 11 ; sign extended shift right by 3
@@ -453,18 +443,19 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
psraw xmm2, 11 ; sign extended shift right by 3
packsswb xmm2, xmm4 ; Filter1 >>=3;
- movdqa xmm7, xmm1
paddsb xmm6, xmm5 ; ps0 =ps0 + Fitler2
- movdqa xmm4, xmm1
psubsb xmm3, xmm2 ; qs0 =qs0 - Filter1
- movdqa xmm5, xmm0
+ movdqa xmm7, xmm1
+ movdqa xmm4, [GLOBAL(s63)]
+ movdqa xmm5, xmm0
movdqa xmm2, xmm5
- paddw xmm0, [GLOBAL(s63)] ; Filter 2 (hi) * 9 + 63
+ paddw xmm0, xmm4 ; Filter 2 (hi) * 9 + 63
+ paddw xmm1, xmm4 ; Filter 2 (lo) * 9 + 63
+ movdqa xmm4, xmm7
- paddw xmm1, [GLOBAL(s63)] ; Filter 2 (lo) * 9 + 63
paddw xmm5, xmm5 ; Filter 2 (hi) * 18
paddw xmm7, xmm7 ; Filter 2 (lo) * 18
@@ -472,99 +463,91 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
paddw xmm7, xmm1 ; Filter 2 (lo) * 27 + 63
paddw xmm2, xmm0 ; Filter 2 (hi) * 18 + 63
-
- paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63
psraw xmm0, 7 ; (Filter 2 (hi) * 9 + 63) >> 7
+ paddw xmm4, xmm1 ; Filter 2 (lo) * 18 + 63
psraw xmm1, 7 ; (Filter 2 (lo) * 9 + 63) >> 7
psraw xmm2, 7 ; (Filter 2 (hi) * 18 + 63) >> 7
packsswb xmm0, xmm1 ; u1 = vp8_signed_char_clamp((63 + Filter2 * 9)>>7)
- psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7
+ psraw xmm4, 7 ; (Filter 2 (lo) * 18 + 63) >> 7
psraw xmm5, 7 ; (Filter 2 (hi) * 27 + 63) >> 7
- packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
-
psraw xmm7, 7 ; (Filter 2 (lo) * 27 + 63) >> 7
packsswb xmm5, xmm7 ; u3 = vp8_signed_char_clamp((63 + Filter2 * 27)>>7)
-
- psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3)
- paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 - u3)
+ packsswb xmm2, xmm4 ; u2 = vp8_signed_char_clamp((63 + Filter2 * 18)>>7)
+ movdqa xmm7, [GLOBAL(t80)]
%if %1 == 0
- movdqa xmm5, q2 ; q2
- movdqa xmm1, q1 ; q1
- movdqa xmm4, p1 ; p1
- movdqa xmm7, p2 ; p2
+ movdqa xmm1, [rsp+_q1] ; q1
+ movdqa xmm4, [rsp+_p1] ; p1
+ lea rsi, [rsi+rcx*2]
+ lea rdi, [rdi+rcx*2]
%elif %1 == 1
- movdqa xmm5, XMMWORD PTR [rdi+rcx] ; q2
- movdqa xmm1, XMMWORD PTR [rdi] ; q1
- movdqa xmm4, XMMWORD PTR [rsi+rax*2] ; p1
- movdqa xmm7, XMMWORD PTR [rdi+rax*4] ; p2
+ movdqa xmm1, [rdi] ; q1
+ movdqa xmm4, [rsi+rax*2] ; p1
%elif %1 == 2
- movdqa xmm5, XMMWORD PTR [rdx+96] ; q2
- movdqa xmm1, XMMWORD PTR [rdx+80] ; q1
- movdqa xmm4, XMMWORD PTR [rdx+32] ; p1
- movdqa xmm7, XMMWORD PTR [rdx+16] ; p2
+ movdqa xmm4, [rsp+_p1] ; p1
+ movdqa xmm1, [rsp+_q1] ; q1
%endif
- pxor xmm3, [GLOBAL(t80)] ; *oq0 = sq^0x80
- pxor xmm6, [GLOBAL(t80)] ; *oq0 = sp^0x80
-
- pxor xmm1, [GLOBAL(t80)]
- pxor xmm4, [GLOBAL(t80)]
+ pxor xmm1, xmm7
+ pxor xmm4, xmm7
+ psubsb xmm3, xmm5 ; sq = vp8_signed_char_clamp(qs0 - u3)
+ paddsb xmm6, xmm5 ; sp = vp8_signed_char_clamp(ps0 - u3)
psubsb xmm1, xmm2 ; sq = vp8_signed_char_clamp(qs1 - u2)
paddsb xmm4, xmm2 ; sp = vp8_signed_char_clamp(ps1 - u2)
- pxor xmm1, [GLOBAL(t80)] ; *oq1 = sq^0x80;
- pxor xmm4, [GLOBAL(t80)] ; *op1 = sp^0x80;
-
- pxor xmm7, [GLOBAL(t80)]
- pxor xmm5, [GLOBAL(t80)]
+%if %1 == 1
+ movdqa xmm2, [rdi+rax*4] ; p2
+ movdqa xmm5, [rdi+rcx] ; q2
+%else
+ movdqa xmm2, [rsp+_p2] ; p2
+ movdqa xmm5, [rsp+_q2] ; q2
+%endif
- paddsb xmm7, xmm0 ; sp = vp8_signed_char_clamp(ps2 - u)
+ pxor xmm1, xmm7 ; *oq1 = sq^0x80;
+ pxor xmm4, xmm7 ; *op1 = sp^0x80;
+ pxor xmm2, xmm7
+ pxor xmm5, xmm7
+ paddsb xmm2, xmm0 ; sp = vp8_signed_char_clamp(ps2 - u)
psubsb xmm5, xmm0 ; sq = vp8_signed_char_clamp(qs2 - u)
-
- pxor xmm7, [GLOBAL(t80)] ; *op2 = sp^0x80;
- pxor xmm5, [GLOBAL(t80)] ; *oq2 = sq^0x80;
-
+ pxor xmm2, xmm7 ; *op2 = sp^0x80;
+ pxor xmm5, xmm7 ; *oq2 = sq^0x80;
+ pxor xmm3, xmm7 ; *oq0 = sq^0x80
+ pxor xmm6, xmm7 ; *oq0 = sp^0x80
%if %1 == 0
- lea rsi, [rsi+rcx*2]
- lea rdi, [rdi+rcx*2]
-
- movq MMWORD PTR [rsi], xmm6 ; p0
- movhps MMWORD PTR [rdi], xmm6
- movq MMWORD PTR [rsi + rcx], xmm3 ; q0
- movhps MMWORD PTR [rdi + rcx], xmm3
-
- movq MMWORD PTR [rsi+rcx*2], xmm1 ; q1
- movhps MMWORD PTR [rdi+rcx*2], xmm1
-
- movq MMWORD PTR [rsi + rax], xmm4 ; p1
- movhps MMWORD PTR [rdi + rax], xmm4
-
- movq MMWORD PTR [rsi+rax*2], xmm7 ; p2
- movhps MMWORD PTR [rdi+rax*2], xmm7
-
- lea rsi, [rsi + rcx]
- lea rdi, [rdi + rcx]
- movq MMWORD PTR [rsi+rcx*2], xmm5 ; q2
- movhps MMWORD PTR [rdi+rcx*2], xmm5
+ movq [rsi], xmm6 ; p0
+ movhps [rdi], xmm6
+ movq [rsi + rcx], xmm3 ; q0
+ movhps [rdi + rcx], xmm3
+ lea rdx, [rcx + rcx*2]
+ movq [rsi+rcx*2], xmm1 ; q1
+ movhps [rdi+rcx*2], xmm1
+
+ movq [rsi + rax], xmm4 ; p1
+ movhps [rdi + rax], xmm4
+
+ movq [rsi+rax*2], xmm2 ; p2
+ movhps [rdi+rax*2], xmm2
+
+ movq [rsi+rdx], xmm5 ; q2
+ movhps [rdi+rdx], xmm5
%elif %1 == 1
- movdqa XMMWORD PTR [rdi+rcx], xmm5 ; q2
- movdqa XMMWORD PTR [rdi], xmm1 ; q1
- movdqa XMMWORD PTR [rsi], xmm3 ; q0
- movdqa XMMWORD PTR [rsi+rax ],xmm6 ; p0
- movdqa XMMWORD PTR [rsi+rax*2],xmm4 ; p1
- movdqa XMMWORD PTR [rdi+rax*4],xmm7 ; p2
+ movdqa [rdi+rcx], xmm5 ; q2
+ movdqa [rdi], xmm1 ; q1
+ movdqa [rsi], xmm3 ; q0
+ movdqa [rsi+rax ], xmm6 ; p0
+ movdqa [rsi+rax*2], xmm4 ; p1
+ movdqa [rdi+rax*4], xmm2 ; p2
%elif %1 == 2
- movdqa XMMWORD PTR [rdx+80], xmm1 ; q1
- movdqa XMMWORD PTR [rdx+64], xmm3 ; q0
- movdqa XMMWORD PTR [rdx+48], xmm6 ; p0
- movdqa XMMWORD PTR [rdx+32], xmm4 ; p1
+ movdqa [rsp+_p1], xmm4 ; p1
+ movdqa [rsp+_p0], xmm6 ; p0
+ movdqa [rsp+_q0], xmm3 ; q0
+ movdqa [rsp+_q1], xmm1 ; q1
%endif
%endmacro
@@ -577,13 +560,12 @@ sym(vp8_loop_filter_horizontal_edge_uv_sse2):
; const char *blimit,
; const char *limit,
; const char *thresh,
-; int count
;)
global sym(vp8_mbloop_filter_horizontal_edge_sse2)
sym(vp8_mbloop_filter_horizontal_edge_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
+ SHADOW_ARGS_TO_STACK 5
SAVE_XMM 7
GET_GOT rbx
push rsi
@@ -591,15 +573,11 @@ sym(vp8_mbloop_filter_horizontal_edge_sse2):
; end prolog
ALIGN_STACK 16, rax
- sub rsp, 32 ; reserve 32 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
+ sub rsp, lf_var_size
mov rsi, arg(0) ;src_ptr
movsxd rax, dword ptr arg(1) ;src_pixel_step
-
mov rdx, arg(3) ;limit
- movdqa xmm7, XMMWORD PTR [rdx]
lea rdi, [rsi+rax] ; rdi points to row +1 for indirect addressing
@@ -608,7 +586,7 @@ sym(vp8_mbloop_filter_horizontal_edge_sse2):
; filter and write back the results
MB_FILTER_AND_WRITEBACK 1
- add rsp, 32
+ add rsp, lf_var_size
pop rsp
; begin epilog
pop rdi
@@ -641,22 +619,14 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
; end prolog
ALIGN_STACK 16, rax
- sub rsp, 96 ; reserve 96 bytes
- %define q2 [rsp + 0] ;__declspec(align(16)) char q2[16];
- %define q1 [rsp + 16] ;__declspec(align(16)) char q1[16];
- %define p2 [rsp + 32] ;__declspec(align(16)) char p2[16];
- %define p1 [rsp + 48] ;__declspec(align(16)) char p1[16];
- %define t0 [rsp + 64] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 80] ;__declspec(align(16)) char t1[16];
+ sub rsp, lf_var_size
mov rsi, arg(0) ; u
mov rdi, arg(5) ; v
movsxd rax, dword ptr arg(1) ; src_pixel_step
mov rcx, rax
neg rax ; negate pitch to deal with above border
-
mov rdx, arg(3) ;limit
- movdqa xmm7, XMMWORD PTR [rdx]
lea rsi, [rsi + rcx]
lea rdi, [rdi + rcx]
@@ -666,7 +636,7 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
; filter and write back the results
MB_FILTER_AND_WRITEBACK 0
- add rsp, 96
+ add rsp, lf_var_size
pop rsp
; begin epilog
pop rdi
@@ -679,46 +649,39 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
%macro TRANSPOSE_16X8 2
- movq xmm4, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
- movq xmm1, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
- movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
- movq xmm7, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
- movq xmm5, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
- movq xmm2, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
+ movq xmm4, [rsi] ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
+ movq xmm1, [rdi] ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
+ movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
+ movq xmm7, [rdi+2*rax] ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
+ movq xmm5, [rsi+4*rax] ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
+ movq xmm2, [rdi+4*rax] ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50
punpcklbw xmm4, xmm1 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
- movq xmm1, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
+ movq xmm1, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
movdqa xmm3, xmm4 ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
punpcklbw xmm0, xmm7 ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20
- movq xmm7, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
+ movq xmm7, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
punpcklbw xmm5, xmm2 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
%if %1
lea rsi, [rsi+rax*8]
+ lea rdi, [rdi+rax*8]
%else
mov rsi, arg(5) ; v_ptr
%endif
movdqa xmm6, xmm5 ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
punpcklbw xmm7, xmm1 ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
-
punpcklwd xmm5, xmm7 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
-
punpckhwd xmm6, xmm7 ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
-%if %1
- lea rdi, [rdi+rax*8]
-%else
- lea rsi, [rsi - 4]
-%endif
-
punpcklwd xmm3, xmm0 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
-%if %1
- lea rdx, srct
-%else
- lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
+
+%if %1 == 0
+ lea rdi, [rsi + rax - 4] ; rdi points to row +1 for indirect addressing
+ lea rsi, [rsi - 4]
%endif
movdqa xmm2, xmm3 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
@@ -733,24 +696,25 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
punpckldq xmm2, xmm5 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
- movdqa t0, xmm2 ; save to free XMM2
- movq xmm2, QWORD PTR [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
- movq xmm6, QWORD PTR [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
- movq xmm0, QWORD PTR [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
- movq xmm5, QWORD PTR [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
- movq xmm1, QWORD PTR [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
+ movdqa [rsp+_t0], xmm2 ; save to free XMM2
+
+ movq xmm2, [rsi] ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
+ movq xmm6, [rdi] ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
+ movq xmm0, [rsi+2*rax] ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
+ movq xmm5, [rdi+2*rax] ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
+ movq xmm1, [rsi+4*rax] ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
punpcklbw xmm2, xmm6 ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
- movq xmm6, QWORD PTR [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
+ movq xmm6, [rdi+4*rax] ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
- punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
+ punpcklbw xmm0, xmm5 ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
- movq xmm5, QWORD PTR [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
+ movq xmm5, [rsi+2*rcx] ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
punpcklbw xmm1, xmm6 ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0
- movq xmm6, QWORD PTR [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
+ movq xmm6, [rdi+2*rcx] ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
punpcklbw xmm5, xmm6 ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
@@ -778,64 +742,38 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
punpcklqdq xmm6, xmm2 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
punpckhqdq xmm7, xmm2 ; f7 e7 d7 c7 b7 a7 97 87 77 67 57 47 37 27 17 07
-%if %2
- movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
- punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
- punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-
- movdqa [rdx], xmm2 ; save 2
-
- movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
- punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-
- movdqa [rdx+16], xmm3 ; save 3
-
- punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
-
- movdqa [rdx+32], xmm4 ; save 4
- movdqa [rdx+48], xmm5 ; save 5
- movdqa xmm1, t0 ; get
-
- movdqa xmm2, xmm1 ;
- punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-
- punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
-%else
- movdqa [rdx+112], xmm7 ; save 7
-
- movdqa [rdx+96], xmm6 ; save 6
+%if %2 == 0
+ movdqa [rsp+_q3], xmm7 ; save 7
+ movdqa [rsp+_q2], xmm6 ; save 6
+%endif
movdqa xmm2, xmm3 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
punpckhqdq xmm3, xmm5 ; f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-
punpcklqdq xmm2, xmm5 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
-
- movdqa [rdx+32], xmm2 ; save 2
+ movdqa [rsp+_p1], xmm2 ; save 2
movdqa xmm5, xmm4 ; 75 65 55 45 35 25 15 05 74 64 54 44 34 24 14 04
punpcklqdq xmm4, xmm1 ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
-
- movdqa [rdx+48], xmm3 ; save 3
+ movdqa [rsp+_p0], xmm3 ; save 3
punpckhqdq xmm5, xmm1 ; f5 e5 d5 c5 b5 a5 95 85 75 65 55 45 35 25 15 05
- movdqa [rdx+64], xmm4 ; save 4
- movdqa [rdx+80], xmm5 ; save 5
- movdqa xmm1, t0 ; get
+ movdqa [rsp+_q0], xmm4 ; save 4
+ movdqa [rsp+_q1], xmm5 ; save 5
+ movdqa xmm1, [rsp+_t0]
- movdqa xmm2, xmm1
+ movdqa xmm2, xmm1 ;
punpckhqdq xmm1, xmm0 ; f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
-
punpcklqdq xmm2, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
- movdqa [rdx+16], xmm1
-
- movdqa [rdx], xmm2
+%if %2 == 0
+ movdqa [rsp+_p2], xmm1
+ movdqa [rsp+_p3], xmm2
%endif
+
%endmacro
-%macro LFV_FILTER_MASK_HEV_MASK 1
+%macro LFV_FILTER_MASK_HEV_MASK 0
movdqa xmm0, xmm6 ; q2
psubusb xmm0, xmm7 ; q2-q3
@@ -853,14 +791,11 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
psubusb xmm2, xmm1 ; p3 - p2;
por xmm0, xmm2 ; abs(p2-p3)
-%if %1
- movdqa xmm2, [rdx] ; p1
-%else
- movdqa xmm2, [rdx+32] ; p1
-%endif
- movdqa xmm5, xmm2 ; p1
+
+ movdqa xmm5, [rsp+_p1] ; p1
pmaxub xmm0, xmm7
+ movdqa xmm2, xmm5 ; p1
psubusb xmm5, xmm1 ; p1-p2
psubusb xmm1, xmm2 ; p2-p1
@@ -874,43 +809,33 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
movdqa xmm1, xmm2 ; p1
psubusb xmm2, xmm3 ; p1-p0
- lea rdx, srct
por xmm2, xmm7 ; abs(p1-p0)
- movdqa t0, xmm2 ; save abs(p1-p0)
-
pmaxub xmm0, xmm2
-%if %1
- movdqa xmm5, [rdx+32] ; q0
- movdqa xmm7, [rdx+48] ; q1
-%else
- movdqa xmm5, [rdx+64] ; q0
- movdqa xmm7, [rdx+80] ; q1
-%endif
+ movdqa xmm5, [rsp+_q0] ; q0
+ movdqa xmm7, [rsp+_q1] ; q1
+
mov rdx, arg(3) ; limit
movdqa xmm6, xmm5 ; q0
- movdqa xmm2, xmm7 ; q1
+ movdqa xmm4, xmm7 ; q1
psubusb xmm5, xmm7 ; q0-q1
psubusb xmm7, xmm6 ; q1-q0
por xmm7, xmm5 ; abs(q1-q0)
- movdqa t1, xmm7 ; save abs(q1-q0)
+ pmaxub xmm0, xmm7
- movdqa xmm4, XMMWORD PTR [rdx]; limit
+ psubusb xmm0, [rdx] ; limit
- pmaxub xmm0, xmm7
mov rdx, arg(2) ; blimit
-
- psubusb xmm0, xmm4
- movdqa xmm5, xmm2 ; q1
+ movdqa xmm5, xmm4 ; q1
psubusb xmm5, xmm1 ; q1-=p1
- psubusb xmm1, xmm2 ; p1-=q1
+ psubusb xmm1, xmm4 ; p1-=q1
por xmm5, xmm1 ; abs(p1-q1)
movdqa xmm1, xmm3 ; p0
@@ -918,39 +843,32 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
pand xmm5, [GLOBAL(tfe)] ; set lsb of each byte to zero
psubusb xmm1, xmm6 ; p0-q0
+ movdqa xmm4, [rdx] ; blimit
+ mov rdx, arg(4) ; get thresh
+
psrlw xmm5, 1 ; abs(p1-q1)/2
psubusb xmm6, xmm3 ; q0-p0
- movdqa xmm4, XMMWORD PTR [rdx]; blimit
-
- mov rdx, arg(4) ; get thresh
-
por xmm1, xmm6 ; abs(q0-p0)
-
- movdqa xmm6, t0 ; get abs (q1 - q0)
-
paddusb xmm1, xmm1 ; abs(q0-p0)*2
-
- movdqa xmm3, t1 ; get abs (p1 - p0)
-
- movdqa xmm7, XMMWORD PTR [rdx]
+ movdqa xmm3, [rdx]
paddusb xmm1, xmm5 ; abs (p0 - q0) *2 + abs(p1-q1)/2
- psubusb xmm6, xmm7 ; abs(q1 - q0) > thresh
+ psubusb xmm2, xmm3 ; abs(q1 - q0) > thresh
- psubusb xmm3, xmm7 ; abs(p1 - p0)> thresh
+ psubusb xmm7, xmm3 ; abs(p1 - p0)> thresh
psubusb xmm1, xmm4 ; abs (p0 - q0) *2 + abs(p1-q1)/2 > blimit
- por xmm6, xmm3 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
+ por xmm2, xmm7 ; abs(q1 - q0) > thresh || abs(p1 - p0) > thresh
por xmm1, xmm0 ; mask
- pcmpeqb xmm6, xmm0
+ pcmpeqb xmm2, xmm0
pxor xmm0, xmm0
pcmpeqb xmm4, xmm4
pcmpeqb xmm1, xmm0
- pxor xmm4, xmm6
+ pxor xmm4, xmm2
%endmacro
%macro BV_TRANSPOSE 0
@@ -985,25 +903,18 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
%macro BV_WRITEBACK 2
movd [rsi+2], %1
- psrldq %1, 4
-
- movd [rdi+2], %1
- psrldq %1, 4
-
- movd [rsi+2*rax+2], %1
- psrldq %1, 4
-
- movd [rdi+2*rax+2], %1
-
movd [rsi+4*rax+2], %2
+ psrldq %1, 4
psrldq %2, 4
-
+ movd [rdi+2], %1
movd [rdi+4*rax+2], %2
+ psrldq %1, 4
psrldq %2, 4
-
+ movd [rsi+2*rax+2], %1
movd [rsi+2*rcx+2], %2
+ psrldq %1, 4
psrldq %2, 4
-
+ movd [rdi+2*rax+2], %1
movd [rdi+2*rcx+2], %2
%endmacro
@@ -1016,13 +927,12 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
; const char *blimit,
; const char *limit,
; const char *thresh,
-; int count
;)
global sym(vp8_loop_filter_vertical_edge_sse2)
sym(vp8_loop_filter_vertical_edge_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
+ SHADOW_ARGS_TO_STACK 5
SAVE_XMM 7
GET_GOT rbx
push rsi
@@ -1030,10 +940,7 @@ sym(vp8_loop_filter_vertical_edge_sse2):
; end prolog
ALIGN_STACK 16, rax
- sub rsp, 96 ; reserve 96 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
- %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
+ sub rsp, lf_var_size
mov rsi, arg(0) ; src_ptr
movsxd rax, dword ptr arg(1) ; src_pixel_step
@@ -1046,7 +953,7 @@ sym(vp8_loop_filter_vertical_edge_sse2):
TRANSPOSE_16X8 1, 1
; calculate filter mask and high edge variance
- LFV_FILTER_MASK_HEV_MASK 1
+ LFV_FILTER_MASK_HEV_MASK
; start work on filters
B_FILTER 2
@@ -1064,7 +971,7 @@ sym(vp8_loop_filter_vertical_edge_sse2):
lea rdi, [rdi+rdx*8]
BV_WRITEBACK xmm2, xmm6
- add rsp, 96
+ add rsp, lf_var_size
pop rsp
; begin epilog
pop rdi
@@ -1098,10 +1005,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
; end prolog
ALIGN_STACK 16, rax
- sub rsp, 96 ; reserve 96 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
- %define srct [rsp + 32] ;__declspec(align(16)) char srct[64];
+ sub rsp, lf_var_size
mov rsi, arg(0) ; u_ptr
movsxd rax, dword ptr arg(1) ; src_pixel_step
@@ -1110,13 +1014,11 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
lea rcx, [rax+2*rax]
- lea rdx, srct
-
;transpose 16x8 to 8x16, and store the 8-line result on stack.
TRANSPOSE_16X8 0, 1
; calculate filter mask and high edge variance
- LFV_FILTER_MASK_HEV_MASK 1
+ LFV_FILTER_MASK_HEV_MASK
; start work on filters
B_FILTER 2
@@ -1134,7 +1036,7 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
BV_WRITEBACK xmm2, xmm6
- add rsp, 96
+ add rsp, lf_var_size
pop rsp
; begin epilog
pop rdi
@@ -1146,92 +1048,89 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
ret
%macro MBV_TRANSPOSE 0
- movdqa xmm0, [rdx] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
+ movdqa xmm0, [rsp+_p3] ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
movdqa xmm1, xmm0 ; f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
- punpcklbw xmm0, xmm7 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
- punpckhbw xmm1, xmm7 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+ punpcklbw xmm0, xmm2 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpckhbw xmm1, xmm2 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
- movdqa xmm2, [rdx+32] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- movdqa xmm6, xmm2 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ movdqa xmm7, [rsp+_p1] ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
+ movdqa xmm6, xmm7 ; f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- punpcklbw xmm2, [rdx+48] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
- punpckhbw xmm6, [rdx+48] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
+ punpcklbw xmm7, [rsp+_p0] ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
+ punpckhbw xmm6, [rsp+_p0] ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
movdqa xmm3, xmm0 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
- punpcklwd xmm0, xmm2 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
+ punpcklwd xmm0, xmm7 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
- punpckhwd xmm3, xmm2 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
+ punpckhwd xmm3, xmm7 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
movdqa xmm4, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
punpcklwd xmm1, xmm6 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
punpckhwd xmm4, xmm6 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
- movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- punpcklbw xmm2, [rdx+80] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+ movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpcklbw xmm7, [rsp+_q1] ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
movdqa xmm6, xmm5 ; f6 e6 d6 c6 b6 a6 96 86 76 66 56 46 36 26 16 06
- punpcklbw xmm6, [rdx+112] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
+ punpcklbw xmm6, [rsp+_q3] ; 77 76 67 66 57 56 47 46 37 36 27 26 17 16 07 06
- movdqa xmm7, xmm2 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
- punpcklwd xmm2, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
+ movdqa xmm2, xmm7 ; 75 74 65 64 55 54 45 44 35 34 25 24 15 14 05 04
+ punpcklwd xmm7, xmm6 ; 37 36 35 34 27 26 25 24 17 16 15 14 07 06 05 04
- punpckhwd xmm7, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
+ punpckhwd xmm2, xmm6 ; 77 76 75 74 67 66 65 64 57 56 55 54 47 46 45 44
movdqa xmm6, xmm0 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
- punpckldq xmm0, xmm2 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
- punpckhdq xmm6, xmm2 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
+ punpckldq xmm0, xmm7 ; 17 16 15 14 13 12 11 10 07 06 05 04 03 02 01 00
+ punpckhdq xmm6, xmm7 ; 37 36 35 34 33 32 31 30 27 26 25 24 23 22 21 20
%endmacro
%macro MBV_WRITEBACK_1 0
- movq QWORD PTR [rsi], xmm0
- movhps MMWORD PTR [rdi], xmm0
+ movq [rsi], xmm0
+ movhps [rdi], xmm0
- movq QWORD PTR [rsi+2*rax], xmm6
- movhps MMWORD PTR [rdi+2*rax], xmm6
+ movq [rsi+2*rax], xmm6
+ movhps [rdi+2*rax], xmm6
movdqa xmm0, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
- punpckldq xmm0, xmm7 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
+ punpckldq xmm0, xmm2 ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
+ punpckhdq xmm3, xmm2 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
- punpckhdq xmm3, xmm7 ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60
+ movq [rsi+4*rax], xmm0
+ movhps [rdi+4*rax], xmm0
- movq QWORD PTR [rsi+4*rax], xmm0
- movhps MMWORD PTR [rdi+4*rax], xmm0
+ movq [rsi+2*rcx], xmm3
+ movhps [rdi+2*rcx], xmm3
- movq QWORD PTR [rsi+2*rcx], xmm3
- movhps MMWORD PTR [rdi+2*rcx], xmm3
-
- movdqa xmm2, [rdx+64] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
- punpckhbw xmm2, [rdx+80] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
-
- punpckhbw xmm5, [rdx+112] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
- movdqa xmm0, xmm2
+ movdqa xmm7, [rsp+_q0] ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
+ punpckhbw xmm7, [rsp+_q1] ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
+ punpckhbw xmm5, [rsp+_q3] ; f7 f6 e7 e6 d7 d6 c7 c6 b7 b6 a7 a6 97 96 87 86
+ movdqa xmm0, xmm7
punpcklwd xmm0, xmm5 ; b7 b6 b5 b4 a7 a6 a5 a4 97 96 95 94 87 86 85 84
- punpckhwd xmm2, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
+ punpckhwd xmm7, xmm5 ; f7 f6 f5 f4 e7 e6 e5 e4 d7 d6 d5 d4 c7 c6 c5 c4
movdqa xmm5, xmm1 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
punpckldq xmm1, xmm0 ; 97 96 95 94 93 92 91 90 87 86 85 84 83 82 81 80
-
punpckhdq xmm5, xmm0 ; b7 b6 b5 b4 b3 b2 b1 b0 a7 a6 a5 a4 a3 a2 a1 a0
%endmacro
%macro MBV_WRITEBACK_2 0
- movq QWORD PTR [rsi], xmm1
- movhps MMWORD PTR [rdi], xmm1
+ movq [rsi], xmm1
+ movhps [rdi], xmm1
- movq QWORD PTR [rsi+2*rax], xmm5
- movhps MMWORD PTR [rdi+2*rax], xmm5
+ movq [rsi+2*rax], xmm5
+ movhps [rdi+2*rax], xmm5
movdqa xmm1, xmm4 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
- punpckldq xmm1, xmm2 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
- punpckhdq xmm4, xmm2 ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
+ punpckldq xmm1, xmm7 ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
+ punpckhdq xmm4, xmm7 ; f7 f6 f5 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0
- movq QWORD PTR [rsi+4*rax], xmm1
- movhps MMWORD PTR [rdi+4*rax], xmm1
+ movq [rsi+4*rax], xmm1
+ movhps [rdi+4*rax], xmm1
- movq QWORD PTR [rsi+2*rcx], xmm4
- movhps MMWORD PTR [rdi+2*rcx], xmm4
+ movq [rsi+2*rcx], xmm4
+ movhps [rdi+2*rcx], xmm4
%endmacro
@@ -1242,13 +1141,12 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
; const char *blimit,
; const char *limit,
; const char *thresh,
-; int count
;)
global sym(vp8_mbloop_filter_vertical_edge_sse2)
sym(vp8_mbloop_filter_vertical_edge_sse2):
push rbp
mov rbp, rsp
- SHADOW_ARGS_TO_STACK 6
+ SHADOW_ARGS_TO_STACK 5
SAVE_XMM 7
GET_GOT rbx
push rsi
@@ -1256,10 +1154,7 @@ sym(vp8_mbloop_filter_vertical_edge_sse2):
; end prolog
ALIGN_STACK 16, rax
- sub rsp, 160 ; reserve 160 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
- %define srct [rsp + 32] ;__declspec(align(16)) char srct[128];
+ sub rsp, lf_var_size
mov rsi, arg(0) ; src_ptr
movsxd rax, dword ptr arg(1) ; src_pixel_step
@@ -1272,7 +1167,7 @@ sym(vp8_mbloop_filter_vertical_edge_sse2):
TRANSPOSE_16X8 1, 0
; calculate filter mask and high edge variance
- LFV_FILTER_MASK_HEV_MASK 0
+ LFV_FILTER_MASK_HEV_MASK
neg rax
; start work on filters
@@ -1288,11 +1183,12 @@ sym(vp8_mbloop_filter_vertical_edge_sse2):
MBV_WRITEBACK_1
+
lea rsi, [rsi+rax*8]
lea rdi, [rdi+rax*8]
MBV_WRITEBACK_2
- add rsp, 160
+ add rsp, lf_var_size
pop rsp
; begin epilog
pop rdi
@@ -1325,10 +1221,7 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
; end prolog
ALIGN_STACK 16, rax
- sub rsp, 160 ; reserve 160 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
- %define srct [rsp + 32] ;__declspec(align(16)) char srct[128];
+ sub rsp, lf_var_size
mov rsi, arg(0) ; u_ptr
movsxd rax, dword ptr arg(1) ; src_pixel_step
@@ -1337,13 +1230,11 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
lea rdi, [rsi + rax] ; rdi points to row +1 for indirect addressing
lea rcx, [rax+2*rax]
- lea rdx, srct
-
; Transpose
TRANSPOSE_16X8 0, 0
; calculate filter mask and high edge variance
- LFV_FILTER_MASK_HEV_MASK 0
+ LFV_FILTER_MASK_HEV_MASK
; start work on filters
MB_FILTER_AND_WRITEBACK 2
@@ -1360,7 +1251,7 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
lea rdi, [rsi + rax]
MBV_WRITEBACK_2
- add rsp, 160
+ add rsp, lf_var_size
pop rsp
; begin epilog
pop rdi
@@ -1389,7 +1280,7 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
mov rcx, arg(0) ;src_ptr
movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
+ movdqa xmm6, [GLOBAL(tfe)]
lea rdx, [rcx + rax]
neg rax
@@ -1399,15 +1290,15 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
movdqa xmm1, [rcx+2*rax] ; p1
movdqa xmm2, xmm1
- movdqa xmm7, xmm0
+ movdqa xmm3, xmm0
psubusb xmm0, xmm1 ; q1-=p1
- psubusb xmm1, xmm7 ; p1-=q1
+ psubusb xmm1, xmm3 ; p1-=q1
por xmm1, xmm0 ; abs(p1-q1)
- pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
+ pand xmm1, xmm6 ; set lsb of each byte to zero
psrlw xmm1, 1 ; abs(p1-q1)/2
- movdqa xmm3, XMMWORD PTR [rdx]
+ movdqa xmm7, XMMWORD PTR [rdx]
movdqa xmm5, [rcx+rax] ; p0
movdqa xmm4, [rcx] ; q0
@@ -1421,15 +1312,15 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
paddusb xmm5, xmm5 ; abs(p0-q0)*2
paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
- psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
- pxor xmm3, xmm3
- pcmpeqb xmm5, xmm3
+ psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
+ pxor xmm7, xmm7
+ pcmpeqb xmm5, xmm7
; start work on filters
pxor xmm2, xmm4 ; p1 offset to convert to signed values
- pxor xmm7, xmm4 ; q1 offset to convert to signed values
- psubsb xmm2, xmm7 ; p1 - q1
+ pxor xmm3, xmm4 ; q1 offset to convert to signed values
+ psubsb xmm2, xmm3 ; p1 - q1
pxor xmm6, xmm4 ; offset to convert to signed values
pxor xmm0, xmm4 ; offset to convert to signed values
@@ -1440,14 +1331,14 @@ sym(vp8_loop_filter_simple_horizontal_edge_sse2):
paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0)
pand xmm5, xmm2 ; mask filter values we don't care about
- paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
movdqa xmm0, xmm5
- psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
+ paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 3
+ paddsb xmm0, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
movdqa xmm1, [GLOBAL(te0)]
movdqa xmm2, [GLOBAL(t1f)]
- pxor xmm7, xmm7
+; pxor xmm7, xmm7
pcmpgtb xmm7, xmm0 ;save sign
pand xmm7, xmm1 ;preserve the upper 3 bits
psrlw xmm0, 3
@@ -1605,29 +1496,26 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
pxor xmm3, xmm4 ; q1 offset to convert to signed values
psubsb xmm0, xmm3 ; p1 - q1
- movdqa xmm6, xmm1 ; p0
-; movdqa xmm7, xmm2 ; q0
-
- pxor xmm6, xmm4 ; offset to convert to signed values
+ pxor xmm1, xmm4 ; offset to convert to signed values
pxor xmm2, xmm4 ; offset to convert to signed values
movdqa xmm3, xmm2 ; offseted ; q0
- psubsb xmm2, xmm6 ; q0 - p0
+ psubsb xmm2, xmm1 ; q0 - p0
paddsb xmm0, xmm2 ; p1 - q1 + 1 * (q0 - p0)
paddsb xmm0, xmm2 ; p1 - q1 + 2 * (q0 - p0)
paddsb xmm0, xmm2 ; p1 - q1 + 3 * (q0 - p0)
pand xmm5, xmm0 ; mask filter values we don't care about
- paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
movdqa xmm0, xmm5
- psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
+ paddsb xmm5, [GLOBAL(t3)] ; 3* (q0 - p0) + (p1 - q1) + 3
+ paddsb xmm0, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
- movdqa xmm1, [GLOBAL(te0)]
+ movdqa xmm6, [GLOBAL(te0)]
movdqa xmm2, [GLOBAL(t1f)]
- pxor xmm7, xmm7
+; pxor xmm7, xmm7
pcmpgtb xmm7, xmm0 ;save sign
- pand xmm7, xmm1 ;preserve the upper 3 bits
+ pand xmm7, xmm6 ;preserve the upper 3 bits
psrlw xmm0, 3
pand xmm0, xmm2 ;clear out upper 3 bits
por xmm0, xmm7 ;add sign
@@ -1635,26 +1523,29 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
pxor xmm7, xmm7
pcmpgtb xmm7, xmm5 ;save sign
- pand xmm7, xmm1 ;preserve the upper 3 bits
+ pand xmm7, xmm6 ;preserve the upper 3 bits
psrlw xmm5, 3
pand xmm5, xmm2 ;clear out upper 3 bits
por xmm5, xmm7 ;add sign
- paddsb xmm6, xmm5 ; p0+= p0 add
+ paddsb xmm1, xmm5 ; p0+= p0 add
pxor xmm3, xmm4 ; unoffset q0
- pxor xmm6, xmm4 ; unoffset p0
+ pxor xmm1, xmm4 ; unoffset p0
movdqa xmm0, t0 ; p1
movdqa xmm4, t1 ; q1
+ ; write out order: xmm0 xmm2 xmm6 xmm3
+ lea rdx, [rsi + rax*4]
+
; transpose back to write out
; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
- punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
+ movdqa xmm6, xmm0
+ punpcklbw xmm0, xmm1 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
+ punpckhbw xmm6, xmm1 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
movdqa xmm5, xmm3
punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
@@ -1664,27 +1555,23 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
- movdqa xmm3, xmm1
- punpcklwd xmm1, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
+ movdqa xmm3, xmm6
+ punpcklwd xmm6, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
- ; write out order: xmm0 xmm2 xmm1 xmm3
- lea rdx, [rsi + rax*4]
-
- movd [rsi], xmm1 ; write the second 8-line result
- psrldq xmm1, 4
- movd [rdi], xmm1
- psrldq xmm1, 4
- movd [rsi + rax*2], xmm1
- psrldq xmm1, 4
- movd [rdi + rax*2], xmm1
-
+ movd [rsi], xmm6 ; write the second 8-line result
movd [rdx], xmm3
+ psrldq xmm6, 4
psrldq xmm3, 4
+ movd [rdi], xmm6
movd [rcx], xmm3
+ psrldq xmm6, 4
psrldq xmm3, 4
+ movd [rsi + rax*2], xmm6
movd [rdx + rax*2], xmm3
+ psrldq xmm6, 4
psrldq xmm3, 4
+ movd [rdi + rax*2], xmm6
movd [rcx + rax*2], xmm3
neg rax
@@ -1695,19 +1582,18 @@ sym(vp8_loop_filter_simple_vertical_edge_sse2):
lea rcx, [rdx + rax]
movd [rsi], xmm0 ; write the first 8-line result
- psrldq xmm0, 4
- movd [rdi], xmm0
- psrldq xmm0, 4
- movd [rsi + rax*2], xmm0
- psrldq xmm0, 4
- movd [rdi + rax*2], xmm0
-
movd [rdx], xmm2
+ psrldq xmm0, 4
psrldq xmm2, 4
+ movd [rdi], xmm0
movd [rcx], xmm2
+ psrldq xmm0, 4
psrldq xmm2, 4
+ movd [rsi + rax*2], xmm0
movd [rdx + rax*2], xmm2
+ psrldq xmm0, 4
psrldq xmm2, 4
+ movd [rdi + rax*2], xmm0
movd [rcx + rax*2], xmm2
add rsp, 32
diff --git a/vp8/common/x86/loopfilter_x86.c b/vp8/common/x86/loopfilter_x86.c
index 066df4352..658600460 100644
--- a/vp8/common/x86/loopfilter_x86.c
+++ b/vp8/common/x86/loopfilter_x86.c
@@ -16,6 +16,10 @@
void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
const unsigned char *limit, const unsigned char *thresh, int count)
+#define prototype_loopfilter_nc(sym) \
+ void sym(unsigned char *src, int pitch, const unsigned char *blimit,\
+ const unsigned char *limit, const unsigned char *thresh)
+
#define prototype_simple_loopfilter(sym) \
void sym(unsigned char *y, int ystride, const unsigned char *blimit)
@@ -30,11 +34,11 @@ prototype_simple_loopfilter(vp8_loop_filter_simple_vertical_edge_mmx);
prototype_loopfilter(vp8_loop_filter_bv_y_sse2);
prototype_loopfilter(vp8_loop_filter_bh_y_sse2);
#else
-prototype_loopfilter(vp8_loop_filter_vertical_edge_sse2);
-prototype_loopfilter(vp8_loop_filter_horizontal_edge_sse2);
+prototype_loopfilter_nc(vp8_loop_filter_vertical_edge_sse2);
+prototype_loopfilter_nc(vp8_loop_filter_horizontal_edge_sse2);
#endif
-prototype_loopfilter(vp8_mbloop_filter_vertical_edge_sse2);
-prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_sse2);
+prototype_loopfilter_nc(vp8_mbloop_filter_vertical_edge_sse2);
+prototype_loopfilter_nc(vp8_mbloop_filter_horizontal_edge_sse2);
extern loop_filter_uvfunction vp8_loop_filter_horizontal_edge_uv_sse2;
extern loop_filter_uvfunction vp8_loop_filter_vertical_edge_uv_sse2;
@@ -124,7 +128,7 @@ void vp8_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, const unsigned
void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
- vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+ vp8_mbloop_filter_horizontal_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr);
if (u_ptr)
vp8_mbloop_filter_horizontal_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
@@ -135,7 +139,7 @@ void vp8_loop_filter_mbh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsign
void vp8_loop_filter_mbv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
int y_stride, int uv_stride, loop_filter_info *lfi)
{
- vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr, 2);
+ vp8_mbloop_filter_vertical_edge_sse2(y_ptr, y_stride, lfi->mblim, lfi->lim, lfi->hev_thr);
if (u_ptr)
vp8_mbloop_filter_vertical_edge_uv_sse2(u_ptr, uv_stride, lfi->mblim, lfi->lim, lfi->hev_thr, v_ptr);
@@ -149,9 +153,9 @@ void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
#if ARCH_X86_64
vp8_loop_filter_bh_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
#else
- vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_horizontal_edge_sse2(y_ptr + 4 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+ vp8_loop_filter_horizontal_edge_sse2(y_ptr + 8 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+ vp8_loop_filter_horizontal_edge_sse2(y_ptr + 12 * y_stride, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
#endif
if (u_ptr)
@@ -174,9 +178,9 @@ void vp8_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigne
#if ARCH_X86_64
vp8_loop_filter_bv_y_sse2(y_ptr, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
#else
- vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
- vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr, 2);
+ vp8_loop_filter_vertical_edge_sse2(y_ptr + 4, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+ vp8_loop_filter_vertical_edge_sse2(y_ptr + 8, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
+ vp8_loop_filter_vertical_edge_sse2(y_ptr + 12, y_stride, lfi->blim, lfi->lim, lfi->hev_thr);
#endif
if (u_ptr)