diff options
author | James Zern <jzern@google.com> | 2015-11-10 22:35:12 +0000 |
---|---|---|
committer | Gerrit Code Review <noreply-gerritcodereview@google.com> | 2015-11-10 22:35:12 +0000 |
commit | e3efed7f4c188bcacdae813a3ee9e93f0f649234 (patch) | |
tree | 3994a60b10c0d6e41d8d76f0f49543ee3a9e1a24 /vpx_dsp | |
parent | f48321974bf1bc62494fd4fb983eb745b48a184e (diff) | |
parent | 40dab58941695ee52b70e0bcb4b93da668e42d6b (diff) | |
download | libvpx-e3efed7f4c188bcacdae813a3ee9e93f0f649234.tar libvpx-e3efed7f4c188bcacdae813a3ee9e93f0f649234.tar.gz libvpx-e3efed7f4c188bcacdae813a3ee9e93f0f649234.tar.bz2 libvpx-e3efed7f4c188bcacdae813a3ee9e93f0f649234.zip |
Merge "convolve_copy_sse2: replace SSE w/SSE2 code"
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/x86/vpx_convolve_copy_sse2.asm | 70 |
1 file changed, 40 insertions(+), 30 deletions(-)
diff --git a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
index 9c5b414b4..abc027065 100644
--- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -13,15 +13,21 @@ SECTION .text
 %macro convolve_fn 1-2
-INIT_XMM sse2
+%ifidn %1, avg
+%define AUX_XMM_REGS 4
+%else
+%define AUX_XMM_REGS 0
+%endif
 %ifidn %2, highbd
 %define pavg pavgw
-cglobal %2_convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
-                                 fx, fxs, fy, fys, w, h, bd
+cglobal %2_convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+                                              dst, dst_stride, \
+                                              fx, fxs, fy, fys, w, h, bd
 %else
 %define pavg pavgb
-cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
-                              fx, fxs, fy, fys, w, h
+cglobal convolve_%1, 4, 7, 4+AUX_XMM_REGS, src, src_stride, \
+                                           dst, dst_stride, \
+                                           fx, fxs, fy, fys, w, h
 %endif
   mov r4d, dword wm
 %ifidn %2, highbd
@@ -152,27 +158,30 @@ cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
   jnz .loop16
   RET
-INIT_MMX sse
 .w8:
   mov r4d, dword hm
   lea r5q, [src_strideq*3]
   lea r6q, [dst_strideq*3]
 .loop8:
-  movu m0, [srcq]
-  movu m1, [srcq+src_strideq]
-  movu m2, [srcq+src_strideq*2]
-  movu m3, [srcq+r5q]
+  movh m0, [srcq]
+  movh m1, [srcq+src_strideq]
+  movh m2, [srcq+src_strideq*2]
+  movh m3, [srcq+r5q]
   lea srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
-  pavg m0, [dstq]
-  pavg m1, [dstq+dst_strideq]
-  pavg m2, [dstq+dst_strideq*2]
-  pavg m3, [dstq+r6q]
+  movh m4, [dstq]
+  movh m5, [dstq+dst_strideq]
+  movh m6, [dstq+dst_strideq*2]
+  movh m7, [dstq+r6q]
+  pavg m0, m4
+  pavg m1, m5
+  pavg m2, m6
+  pavg m3, m7
 %endif
-  mova [dstq ], m0
-  mova [dstq+dst_strideq ], m1
-  mova [dstq+dst_strideq*2], m2
-  mova [dstq+r6q ], m3
+  movh [dstq ], m0
+  movh [dstq+dst_strideq ], m1
+  movh [dstq+dst_strideq*2], m2
+  movh [dstq+r6q ], m3
   lea dstq, [dstq+dst_strideq*4]
   sub r4d, 4
   jnz .loop8
@@ -184,25 +193,25 @@ INIT_MMX sse
   lea r5q, [src_strideq*3]
   lea r6q, [dst_strideq*3]
 .loop4:
-  movh m0, [srcq]
-  movh m1, [srcq+src_strideq]
-  movh m2, [srcq+src_strideq*2]
-  movh m3, [srcq+r5q]
+  movd m0, [srcq]
+  movd m1, [srcq+src_strideq]
+  movd m2, [srcq+src_strideq*2]
+  movd m3, [srcq+r5q]
   lea srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
-  movh m4, [dstq]
-  movh m5, [dstq+dst_strideq]
-  movh m6, [dstq+dst_strideq*2]
-  movh m7, [dstq+r6q]
+  movd m4, [dstq]
+  movd m5, [dstq+dst_strideq]
+  movd m6, [dstq+dst_strideq*2]
+  movd m7, [dstq+r6q]
   pavg m0, m4
   pavg m1, m5
   pavg m2, m6
   pavg m3, m7
 %endif
-  movh [dstq ], m0
-  movh [dstq+dst_strideq ], m1
-  movh [dstq+dst_strideq*2], m2
-  movh [dstq+r6q ], m3
+  movd [dstq ], m0
+  movd [dstq+dst_strideq ], m1
+  movd [dstq+dst_strideq*2], m2
+  movd [dstq+r6q ], m3
   lea dstq, [dstq+dst_strideq*4]
   sub r4d, 4
   jnz .loop4
@@ -210,6 +219,7 @@ INIT_MMX sse
 %endif
 %endmacro

+INIT_XMM sse2
 convolve_fn copy
 convolve_fn avg
 %if CONFIG_VP9_HIGHBITDEPTH