Diffstat (limited to 'vp8')
-rw-r--r--  vp8/common/x86/loopfilter_sse2.asm | 205
1 file changed, 108 insertions(+), 97 deletions(-)
diff --git a/vp8/common/x86/loopfilter_sse2.asm b/vp8/common/x86/loopfilter_sse2.asm
index 5839e43bf..985d5a09d 100644
--- a/vp8/common/x86/loopfilter_sse2.asm
+++ b/vp8/common/x86/loopfilter_sse2.asm
@@ -737,29 +737,30 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):

 %macro TRANSPOSE_16X8_1 0
-        movq        xmm0, QWORD PTR [rdi+rcx*2]     ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
-        movq        xmm7, QWORD PTR [rsi+rcx*2]     ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60
+        movq        xmm4, QWORD PTR [rsi]           ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
+        movq        xmm7, QWORD PTR [rdi]           ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10

-        punpcklbw   xmm7, xmm0                      ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
-        movq        xmm0, QWORD PTR [rsi+rcx]
+        punpcklbw   xmm4, xmm7                      ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+        movq        xmm0, QWORD PTR [rsi+2*rax]     ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20

-        movq        xmm5, QWORD PTR [rsi]           ;
-        punpcklbw   xmm5, xmm0                      ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+        movdqa      xmm3, xmm4                      ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00

-        movdqa      xmm6, xmm5                      ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
-        punpcklwd   xmm5, xmm7                      ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+        movq        xmm7, QWORD PTR [rdi+2*rax]     ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
+        punpcklbw   xmm0, xmm7                      ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20

-        punpckhwd   xmm6, xmm7                      ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44
-        movq        xmm7, QWORD PTR [rsi + rax]     ; xx xx xx xx xx xx xx xx 37 36 35 34 33 32 31 30
+        movq        xmm5, QWORD PTR [rsi+4*rax]     ; xx xx xx xx xx xx xx xx 47 46 45 44 43 42 41 40
+        movq        xmm2, QWORD PTR [rdi+4*rax]     ; xx xx xx xx xx xx xx xx 57 56 55 54 53 52 51 50

-        movq        xmm0, QWORD PTR [rsi + rax*2]   ; xx xx xx xx xx xx xx xx 27 26 25 24 23 22 21 20
-        punpcklbw   xmm0, xmm7                      ; 37 27 36 36 35 25 34 24 33 23 32 22 31 21 30 20
+        punpcklbw   xmm5, xmm2                      ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40
+        movq        xmm7, QWORD PTR [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx 67 66 65 64 63 62 61 60

-        movq        xmm4, QWORD PTR [rsi + rax*4]   ; xx xx xx xx xx xx xx xx 07 06 05 04 03 02 01 00
-        movq        xmm7, QWORD PTR [rdi + rax*4]   ; xx xx xx xx xx xx xx xx 17 16 15 14 13 12 11 10
+        movq        xmm1, QWORD PTR [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx 77 76 75 74 73 72 71 70
+        movdqa      xmm6, xmm5                      ; 57 47 56 46 55 45 54 44 53 43 52 42 51 41 50 40

-        punpcklbw   xmm4, xmm7                      ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
-        movdqa      xmm3, xmm4                      ; 17 07 16 06 15 05 14 04 13 03 12 02 11 01 10 00
+        punpcklbw   xmm7, xmm1                      ; 77 67 76 66 75 65 74 64 73 63 72 62 71 61 70 60
+        punpcklwd   xmm5, xmm7                      ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
+
+        punpckhwd   xmm6, xmm7                      ; 77 67 57 47 76 66 56 46 75 65 55 45 74 64 54 44

         punpcklwd   xmm3, xmm0                      ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
         punpckhwd   xmm4, xmm0                      ; 37 27 17 07 36 26 16 06 35 25 15 05 34 24 14 04
@@ -777,28 +778,28 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
 %endmacro

 %macro TRANSPOSE_16X8_2 1
-        movq        xmm6, QWORD PTR [rdi+rcx*2]     ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
-        movq        xmm5, QWORD PTR [rsi+rcx*2]     ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
+        movq        xmm2, QWORD PTR [rsi]           ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
+        movq        xmm5, QWORD PTR [rdi]           ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90

-        punpcklbw   xmm5, xmm6                      ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0
-        movq        xmm6, QWORD PTR [rsi+rcx]       ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0
+        punpcklbw   xmm2, xmm5                      ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
+        movq        xmm0, QWORD PTR [rsi+2*rax]     ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
+
+        movq        xmm5, QWORD PTR [rdi+2*rax]     ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
+        punpcklbw   xmm0, xmm5                      ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
+
+        movq        xmm1, QWORD PTR [rsi+4*rax]     ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
+        movq        xmm6, QWORD PTR [rdi+4*rax]     ; xx xx xx xx xx xx xx xx d7 d6 d5 d4 d3 d2 d1 d0

-        movq        xmm1, QWORD PTR [rsi]           ; xx xx xx xx xx xx xx xx c7 c6 c5 c4 c3 c2 c1 c0
         punpcklbw   xmm1, xmm6                      ; d7 c7 d6 c6 d5 c5 d4 c4 d3 c3 d2 c2 d1 e1 d0 c0
+        movq        xmm5, QWORD PTR [rsi+2*rcx]     ; xx xx xx xx xx xx xx xx e7 e6 e5 e4 e3 e2 e1 e0
+
+        movq        xmm6, QWORD PTR [rdi+2*rcx]     ; xx xx xx xx xx xx xx xx f7 f6 f5 f4 f3 f2 f1 f0
+        punpcklbw   xmm5, xmm6                      ; f7 e7 f6 e6 f5 e5 f4 e4 f3 e3 f2 e2 f1 e1 f0 e0

         movdqa      xmm6, xmm1                      ;
         punpckhwd   xmm6, xmm5                      ; f7 e7 d7 c7 f6 e6 d6 c6 f5 e5 d5 c5 f4 e4 d4 c4
         punpcklwd   xmm1, xmm5                      ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0

-        movq        xmm5, QWORD PTR [rsi+rax]       ; xx xx xx xx xx xx xx xx b7 b6 b5 b4 b3 b2 b1 b0
-
-        movq        xmm0, QWORD PTR [rsi+rax*2]     ; xx xx xx xx xx xx xx xx a7 a6 a5 a4 a3 a2 a1 a0
-        punpcklbw   xmm0, xmm5                      ; b7 a7 b6 a6 b5 a5 b4 a4 b3 a3 b2 a2 b1 a1 b0 a0
-
-        movq        xmm2, QWORD PTR [rsi+rax*4]     ; xx xx xx xx xx xx xx xx 87 86 85 84 83 82 81 80
-        movq        xmm5, QWORD PTR [rdi+rax*4]     ; xx xx xx xx xx xx xx xx 97 96 95 94 93 92 91 90
-
-        punpcklbw   xmm2, xmm5                      ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80
         movdqa      xmm5, xmm2                      ; 97 87 96 86 95 85 94 84 93 83 92 82 91 81 90 80

         punpcklwd   xmm5, xmm0                      ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
@@ -995,7 +996,6 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
         lea         rdx, srct
         movdqa      xmm2, [rdx]                     ; p1
         lea         rsi, [rsi+rcx*8]
-        lea         rdi, [rsi+rcx]
         movdqa      xmm7, [rdx+48]                  ; q1
         movdqa      xmm6, [rdx+16]                  ; p0
         movdqa      xmm0, [rdx+32]                  ; q0
@@ -1103,27 +1103,27 @@ sym(vp8_mbloop_filter_horizontal_edge_uv_sse2):
 %endmacro

 %macro BV_WRITEBACK 2
-        movd        [rsi+rax*4+2], %1
+        movd        [rsi+2], %1
         psrldq      %1, 4
-        movd        [rdi+rax*4+2], %1
+        movd        [rdi+2], %1
         psrldq      %1, 4
-        movd        [rsi+rax*2+2], %1
+        movd        [rsi+2*rax+2], %1
         psrldq      %1, 4
-        movd        [rdi+rax*2+2], %1
+        movd        [rdi+2*rax+2], %1

-        movd        [rsi+2], %2
+        movd        [rsi+4*rax+2], %2
         psrldq      %2, 4
-        movd        [rdi+2], %2
+        movd        [rdi+4*rax+2], %2
         psrldq      %2, 4
-        movd        [rdi+rcx+2], %2
+        movd        [rsi+2*rcx+2], %2
         psrldq      %2, 4
-        movd        [rdi+rcx*2+2], %2
+        movd        [rdi+2*rcx+2], %2
 %endmacro
@@ -1156,16 +1156,15 @@ sym(vp8_loop_filter_vertical_edge_sse2):
         mov         rsi, arg(0)                     ; src_ptr
         movsxd      rax, dword ptr arg(1)           ; src_pixel_step

-        lea         rsi, [rsi + rax*4 - 4]
+        lea         rsi, [rsi - 4]
         lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing
-        mov         rcx, rax
-        neg         rax
+        lea         rcx, [rax*2+rax]

         ;transpose 16x8 to 8x16, and store the 8-line result on stack.
         TRANSPOSE_16X8_1

-        lea         rsi, [rsi+rcx*8]
-        lea         rdi, [rdi+rcx*8]
+        lea         rsi, [rsi+rax*8]
+        lea         rdi, [rdi+rax*8]
         lea         rdx, srct
         TRANSPOSE_16X8_2 1
@@ -1180,10 +1179,14 @@ sym(vp8_loop_filter_vertical_edge_sse2):
         ; tranpose and write back - only work on q1, q0, p0, p1
         BV_TRANSPOSE
         ; store 16-line result
+
+        lea         rdx, [rax]
+        neg         rdx
+
         BV_WRITEBACK xmm1, xmm5

-        lea         rsi, [rsi+rax*8]
-        lea         rdi, [rsi+rcx]
+        lea         rsi, [rsi+rdx*8]
+        lea         rdi, [rdi+rdx*8]
         BV_WRITEBACK xmm2, xmm6

         add         rsp, 96
@@ -1227,17 +1230,16 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
         mov         rsi, arg(0)                     ; u_ptr
         movsxd      rax, dword ptr arg(1)           ; src_pixel_step

-        lea         rsi, [rsi + rax*4 - 4]
+        lea         rsi, [rsi - 4]
         lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing
-        mov         rcx, rax
-        neg         rax
+        lea         rcx, [rax+2*rax]

         ;transpose 16x8 to 8x16, and store the 8-line result on stack.
         TRANSPOSE_16X8_1

-        mov         rsi, arg(5)                     ; v_ptr
-        lea         rsi, [rsi + rcx*4 - 4]
-        lea         rdi, [rsi + rcx]                ; rdi points to row +1 for indirect addressing
+        mov         rsi, arg(5)                     ; v_ptr
+        lea         rsi, [rsi - 4]
+        lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing

         lea         rdx, srct
         TRANSPOSE_16X8_2 1
@@ -1252,12 +1254,15 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
         ; tranpose and write back - only work on q1, q0, p0, p1
         BV_TRANSPOSE
+
+        lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing
+
         ; store 16-line result
         BV_WRITEBACK xmm1, xmm5

-        mov         rsi, arg(0)                     ;u_ptr
-        lea         rsi, [rsi + rcx*4 - 4]
-        lea         rdi, [rsi + rcx]
+        mov         rsi, arg(0)                     ; u_ptr
+        lea         rsi, [rsi - 4]
+        lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing
         BV_WRITEBACK xmm2, xmm6

         add         rsp, 96
@@ -1479,28 +1484,30 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
 %endmacro

 %macro MBV_WRITEBACK_1 0
-        movq        QWORD PTR [rsi+rax*4], xmm0
+        movq        QWORD PTR [rsi], xmm0
         psrldq      xmm0, 8
-        movq        QWORD PTR [rsi+rax*2], xmm6
+        movq        QWORD PTR [rdi], xmm0
+
+        movq        QWORD PTR [rsi+2*rax], xmm6
         psrldq      xmm6, 8
-        movq        QWORD PTR [rdi+rax*4], xmm0
-        movq        QWORD PTR [rsi+rax], xmm6
+        movq        QWORD PTR [rdi+2*rax], xmm6

         movdqa      xmm0, xmm5                      ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
         punpckldq   xmm0, xmm7                      ; 57 56 55 54 53 52 51 50 47 46 45 44 43 42 41 40
         punpckhdq   xmm5, xmm7                      ; 77 76 75 74 73 72 71 70 67 66 65 64 63 62 61 60

-        movq        QWORD PTR [rsi], xmm0
+        movq        QWORD PTR [rsi+4*rax], xmm0
         psrldq      xmm0, 8
-        movq        QWORD PTR [rsi+rcx*2], xmm5
+        movq        QWORD PTR [rdi+4*rax], xmm0
+
+        movq        QWORD PTR [rsi+2*rcx], xmm5
         psrldq      xmm5, 8
-        movq        QWORD PTR [rsi+rcx], xmm0
-        movq        QWORD PTR [rdi+rcx*2], xmm5
+        movq        QWORD PTR [rdi+2*rcx], xmm5

         movdqa      xmm2, [rdx+64]                  ; f4 e4 d4 c4 b4 a4 94 84 74 64 54 44 34 24 14 04
         punpckhbw   xmm2, [rdx+80]                  ; f5 f4 e5 e4 d5 d4 c5 c4 b5 b4 a5 a4 95 94 85 84
@@ -1518,28 +1525,30 @@ sym(vp8_loop_filter_vertical_edge_uv_sse2):
 %endmacro

 %macro MBV_WRITEBACK_2 0
-        movq        QWORD PTR [rsi+rax*4], xmm1
+        movq        QWORD PTR [rsi], xmm1
         psrldq      xmm1, 8
-        movq        QWORD PTR [rsi+rax*2], xmm3
+        movq        QWORD PTR [rdi], xmm1
+
+        movq        QWORD PTR [rsi+2*rax], xmm3
         psrldq      xmm3, 8
-        movq        QWORD PTR [rdi+rax*4], xmm1
-        movq        QWORD PTR [rsi+rax], xmm3
+        movq        QWORD PTR [rdi+2*rax], xmm3

         movdqa      xmm1, xmm4                      ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
         punpckldq   xmm1, xmm2                      ; d7 d6 d5 d4 d3 d2 d1 d0 c7 c6 c5 c4 c3 c2 c1 c0
         punpckhdq   xmm4, xmm2                      ; f7 f6 f4 f4 f3 f2 f1 f0 e7 e6 e5 e4 e3 e2 e1 e0

-        movq        QWORD PTR [rsi], xmm1
+        movq        QWORD PTR [rsi+4*rax], xmm1
         psrldq      xmm1, 8
-        movq        QWORD PTR [rsi+rcx*2], xmm4
+        movq        QWORD PTR [rdi+4*rax], xmm1
+
+        movq        QWORD PTR [rsi+2*rcx], xmm4
         psrldq      xmm4, 8
-        movq        QWORD PTR [rsi+rcx], xmm1
-        movq        QWORD PTR [rdi+rcx*2], xmm4
+        movq        QWORD PTR [rdi+2*rcx], xmm4
 %endmacro
@@ -1569,20 +1578,19 @@ sym(vp8_mbloop_filter_vertical_edge_sse2):
 %define t1 [rsp + 16]    ;__declspec(align(16)) char t1[16];
 %define srct [rsp + 32]  ;__declspec(align(16)) char srct[128];

-        mov         rsi, arg(0)                     ;src_ptr
-        movsxd      rax, dword ptr arg(1)           ;src_pixel_step
+        mov         rsi, arg(0)                     ; src_ptr
+        movsxd      rax, dword ptr arg(1)           ; src_pixel_step

-        lea         rsi, [rsi + rax*4 - 4]
-        lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing
-        mov         rcx, rax
-        neg         rax
+        lea         rsi, [rsi - 4]
+        lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing
+        lea         rcx, [rax*2+rax]

         ; Transpose
         TRANSPOSE_16X8_1

-        lea         rsi, [rsi+rcx*8]
-        lea         rdi, [rdi+rcx*8]
-        lea         rdx, srct
+        lea         rsi, [rsi+rax*8]
+        lea         rdi, [rdi+rax*8]
+        lea         rdx, srct
         TRANSPOSE_16X8_2 0

         ; calculate filter mask
@@ -1590,18 +1598,22 @@ sym(vp8_mbloop_filter_vertical_edge_sse2):
         ; calculate high edge variance
         LFV_HEV_MASK

+        neg         rax
         ; start work on filters
         MBV_FILTER

+        lea         rsi, [rsi+rax*8]
+        lea         rdi, [rdi+rax*8]
+
         ; transpose and write back
         MBV_TRANSPOSE

-        lea         rsi, [rsi+rax*8]
-        lea         rdi, [rdi+rax*8]
+        neg         rax
+
         MBV_WRITEBACK_1

-        lea         rsi, [rsi+rcx*8]
-        lea         rdi, [rdi+rcx*8]
+        lea         rsi, [rsi+rax*8]
+        lea         rdi, [rdi+rax*8]
         MBV_WRITEBACK_2

         add         rsp, 160
@@ -1642,21 +1654,20 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
 %define t1 [rsp + 16]    ;__declspec(align(16)) char t1[16];
 %define srct [rsp + 32]  ;__declspec(align(16)) char srct[128];

-        mov         rsi, arg(0)                     ;u_ptr
-        movsxd      rax, dword ptr arg(1)           ; src_pixel_step
+        mov         rsi, arg(0)                     ; u_ptr
+        movsxd      rax, dword ptr arg(1)           ; src_pixel_step

-        lea         rsi, [rsi + rax*4 - 4]
-        lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing
-        mov         rcx, rax
-        neg         rax
+        lea         rsi, [rsi - 4]
+        lea         rdi, [rsi + rax]                ; rdi points to row +1 for indirect addressing
+        lea         rcx, [rax+2*rax]

         ; Transpose
         TRANSPOSE_16X8_1

         ; XMM3 XMM4 XMM7 in use
-        mov         rsi, arg(5)                     ;v_ptr
-        lea         rsi, [rsi + rcx*4 - 4]
-        lea         rdi, [rsi + rcx]
+        mov         rsi, arg(5)                     ; v_ptr
+        lea         rsi, [rsi - 4]
+        lea         rdi, [rsi + rax]

         lea         rdx, srct
         TRANSPOSE_16X8_2 0
@@ -1672,12 +1683,12 @@ sym(vp8_mbloop_filter_vertical_edge_uv_sse2):
         MBV_TRANSPOSE

         mov         rsi, arg(0)                     ;u_ptr
-        lea         rsi, [rsi + rcx*4 - 4]
-        lea         rdi, [rsi + rcx]
+        lea         rsi, [rsi - 4]
+        lea         rdi, [rsi + rax]
         MBV_WRITEBACK_1

         mov         rsi, arg(5)                     ;v_ptr
-        lea         rsi, [rsi + rcx*4 - 4]
-        lea         rdi, [rsi + rcx]
+        lea         rsi, [rsi - 4]
+        lea         rdi, [rsi + rax]
         MBV_WRITEBACK_2

         add         rsp, 160
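
The heart of this change is the addressing scheme. The old code anchored rsi four rows down (lea rsi, [rsi + rax*4 - 4]), kept a positive copy of the stride in rcx, and negated rax so half the rows were reached through negative offsets. The new code anchors rsi at row 0, keeps rax positive, and loads rcx with 3*stride (lea rcx, [rax*2+rax]); the one remaining negation (lea rdx, [rax] / neg rdx before the second BV_WRITEBACK) merely steps the pointers back up eight rows. A hedged C sketch of the resulting row addressing follows; row_ptr is an illustrative name, not a libvpx function:

    #include <stdint.h>
    #include <stddef.h>

    /* Sketch only, not part of the commit: with rdi = rsi + stride and
     * rcx = 3*stride, every row 0..7 of the transpose window is a legal
     * x86 SIB address using only positive scale factors 1, 2 and 4, so
     * the old "mov rcx, rax / neg rax" trick is no longer needed. */
    static uint8_t *row_ptr(uint8_t *rsi, ptrdiff_t rax /* stride */, int row)
    {
        uint8_t  *rdi  = rsi + rax;             /* lea rdi, [rsi + rax]     */
        ptrdiff_t rcx  = 3 * rax;               /* lea rcx, [rax*2+rax]     */
        uint8_t  *base = (row & 1) ? rdi : rsi; /* odd rows go through rdi  */

        switch (row & ~1) {
        case 0:  return base;                   /* [rsi],       [rdi]       */
        case 2:  return base + 2 * rax;         /* [rsi+2*rax], [rdi+2*rax] */
        case 4:  return base + 4 * rax;         /* [rsi+4*rax], [rdi+4*rax] */
        default: return base + 2 * rcx;         /* [rsi+2*rcx], [rdi+2*rcx] */
        }
    }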
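For reference, the TRANSPOSE_16X8_1 / TRANSPOSE_16X8_2 pair that the rewritten loads feed: per the asm's own comment, it transposes 16x8 to 8x16 and stores the 8-line result on the stack, so that the eight pixel columns straddling a vertical edge become filterable rows. A minimal C sketch of the same data movement (transpose_16x8 is an illustrative name only):

    #include <stdint.h>

    /* Sketch only: sixteen 8-pixel rows become eight 16-pixel rows,
     * each output row holding one pixel column across the 16 lines. */
    static void transpose_16x8(const uint8_t *src, int stride,
                               uint8_t out[8][16])
    {
        for (int row = 0; row < 16; ++row)      /* 16 image lines       */
            for (int col = 0; col < 8; ++col)   /* 8 pixels per line    */
                out[col][row] = src[row * stride + col];
    }

The asm builds the same result with cascaded punpcklbw/punpcklwd-style interleaves, the usual SSE2 transpose idiom, rather than a scalar loop.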
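BV_WRITEBACK then scatters the filtered columns back. The loop filter only modifies p1, p0, q0, q1 — the middle four of the eight transposed columns — so each row receives one 4-byte store at x-offset +2; the first macro argument carries the dwords for rows 0-3, the second those for rows 4-7, and psrldq %1, 4 peels off one row's dword at a time. A rough C model under the same assumptions (bv_writeback and its parameter names are hypothetical):

    #include <stdint.h>
    #include <stddef.h>
    #include <string.h>

    /* Sketch only: write 4 filtered pixels (p1 p0 q0 q1) back to each of
     * 8 rows, at x-offset +2 inside the 8-pixel window that was transposed. */
    static void bv_writeback(uint8_t *rsi, ptrdiff_t stride,
                             const uint8_t lo[16], const uint8_t hi[16])
    {
        uint8_t *row[8];
        for (int i = 0; i < 8; ++i)             /* rows 0-7 of the window  */
            row[i] = rsi + i * stride;

        for (int i = 0; i < 4; ++i) {           /* movd ... / psrldq ..., 4 */
            memcpy(row[i]     + 2, lo + 4 * i, 4);
            memcpy(row[i + 4] + 2, hi + 4 * i, 4);
        }
    }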