| author | Scott LaVarnway <slavarnway@google.com> | 2015-07-30 05:02:04 -0700 |
|---|---|---|
| committer | Scott LaVarnway <slavarnway@google.com> | 2015-07-31 14:51:51 -0700 |
| commit | a5e97d874b16ae5826b68515f1e35ffb44361cf8 (patch) | |
| tree | 90355c3fad36aee0de850ea3f1d16359cef0263c /vpx_dsp | |
| parent | 6025c6d65bacea0c72e02ee498bd3e82f92c9141 (diff) | |
VP9_COPY_CONVOLVE_SSE2 optimization
This function suffers from a couple of problems on small-core machines (tablets):
- the load for the next iteration is blocked by the store from the previous iteration
- 4K aliasing (between a future store and older loads)
- current small-core machines are in-order, so the store spins in the rehabQ until the load is finished
Fixed by:
- prefetching 2 lines ahead
- unrolling the copy to 2 rows of the block per iteration
- pre-loading all xmm registers before the loop, with the final stores issued after the loop (see the sketch after the commit message)
The function is sped up by:
copy_convolve_sse2 64x64 - 16%
copy_convolve_sse2 32x32 - 52%
copy_convolve_sse2 16x16 - 6%
copy_convolve_sse2 8x8 - 2.5%
copy_convolve_sse2 4x4 - 2.7%
Credit goes to Tom Craver (tom.r.craver@intel.com) and Ilya Albrekht (ilya.albrekht@intel.com).
Change-Id: I63d3428799c50b2bf7b5677c8268bacb9fc29671
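
The copy path in the patch is NASM assembly; as a rough illustration of the scheduling the commit message describes (pre-load rows before the loop, prefetch ahead, store the previously loaded rows and immediately reload the next ones into the same registers, issue the final stores after the loop), here is a hedged C/SSE2-intrinsics sketch for a 32-byte-wide block. It is not part of the patch: the function name `copy32xh_sse2` and the assumption that `h` is even are illustrative only, and it uses unaligned stores where the assembly can use aligned `mova` stores.

```c
#include <emmintrin.h>  /* SSE2: __m128i, _mm_loadu_si128, _mm_storeu_si128 */
#include <xmmintrin.h>  /* SSE:  _mm_prefetch, _MM_HINT_T0 */
#include <stddef.h>
#include <stdint.h>

/* Illustrative only: copy an h-row, 32-byte-wide block; h assumed even, h >= 2. */
static void copy32xh_sse2(const uint8_t *src, ptrdiff_t src_stride,
                          uint8_t *dst, ptrdiff_t dst_stride, int h) {
  /* Pre-load the first two rows before entering the loop. */
  __m128i r0a = _mm_loadu_si128((const __m128i *)(src + 0));
  __m128i r0b = _mm_loadu_si128((const __m128i *)(src + 16));
  __m128i r1a = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
  __m128i r1b = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));
  int rows_left = h - 2;  /* the last two rows are stored after the loop */

  while (rows_left > 0) {
    /* Prefetch ahead of the rows already sitting in registers
       (mirrors the prefetcht0 [srcq+64] pattern in the patch). */
    _mm_prefetch((const char *)(src + 64), _MM_HINT_T0);
    _mm_prefetch((const char *)(src + src_stride + 64), _MM_HINT_T0);
    src += 2 * src_stride;

    /* Store the rows loaded on the previous pass, then immediately reload
       the next two rows into the same registers, so the loads are not
       queued behind the stores on an in-order core. */
    _mm_storeu_si128((__m128i *)(dst + 0), r0a);
    r0a = _mm_loadu_si128((const __m128i *)(src + 0));
    _mm_storeu_si128((__m128i *)(dst + 16), r0b);
    r0b = _mm_loadu_si128((const __m128i *)(src + 16));
    _mm_storeu_si128((__m128i *)(dst + dst_stride + 0), r1a);
    r1a = _mm_loadu_si128((const __m128i *)(src + src_stride + 0));
    _mm_storeu_si128((__m128i *)(dst + dst_stride + 16), r1b);
    r1b = _mm_loadu_si128((const __m128i *)(src + src_stride + 16));

    dst += 2 * dst_stride;
    rows_left -= 2;
  }

  /* Final two rows: the data is already in registers, only the stores remain. */
  _mm_storeu_si128((__m128i *)(dst + 0), r0a);
  _mm_storeu_si128((__m128i *)(dst + 16), r0b);
  _mm_storeu_si128((__m128i *)(dst + dst_stride + 0), r1a);
  _mm_storeu_si128((__m128i *)(dst + dst_stride + 16), r1b);
}
```

The point is the ordering: each iteration's loads are issued right after the stores that free their registers, so an in-order core does not sit waiting on the just-issued stores before it can start the next rows.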
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/x86/vpx_convolve_copy_sse2.asm | 339 |
1 file changed, 244 insertions(+), 95 deletions(-)
```diff
diff --git a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
index 6cd620a59..0be9cc529 100644
--- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -18,138 +18,287 @@ SECTION .text
 INIT_XMM sse2
 cglobal convolve_%1, 4, 7, 4, src, src_stride, dst, dst_stride, \
                               fx, fxs, fy, fys, w, h
-  mov r4d, dword wm
-  cmp r4d, 4
+  mov                  r4d, dword wm
+  cmp                  r4d, 4
   je .w4
-  cmp r4d, 8
+  cmp                  r4d, 8
   je .w8
-  cmp r4d, 16
+  cmp                  r4d, 16
   je .w16
-  cmp r4d, 32
+  cmp                  r4d, 32
   je .w32
-  mov r4d, dword hm
+  ; 64xh
+  mov                  r4d, dword hm
+  shr                  r4d, 1              ; ASSUMPTION: hm is at least EVEN
+  sub                  r4d, 1
+
+  movu                 m0, [srcq]
+  movu                 m4, [srcq+src_strideq]
+  movu                 m1, [srcq+16]
+  movu                 m5, [srcq+src_strideq+16]
+  movu                 m2, [srcq+32]
+  movu                 m6, [srcq+src_strideq+32]
+  movu                 m3, [srcq+48]
+  movu                 m7, [srcq+src_strideq+48]
+
 .loop64:
-  movu m0, [srcq]
-  movu m1, [srcq+16]
-  movu m2, [srcq+32]
-  movu m3, [srcq+48]
-  add srcq, src_strideq
+  prefetcht0           [srcq+64 ]
+  prefetcht0           [srcq+src_strideq+64]
+
+  lea                  srcq, [srcq+src_strideq*2]
+
 %ifidn %1, avg
-  pavgb m0, [dstq]
-  pavgb m1, [dstq+16]
-  pavgb m2, [dstq+32]
-  pavgb m3, [dstq+48]
+  pavgb                m0, [dstq]
+  pavgb                m1, [dstq+16]
+
+  mova [dstq ], m0
+  movu                 m0, [srcq]
+
+  mova [dstq+16], m1
+  movu                 m1, [srcq+16]
+
+  pavgb                m2, [dstq+32]
+  mova [dstq+32], m2
+  movu                 m2, [srcq+32]
+  pavgb                m3, [dstq+48]
+  mova [dstq+48], m3
+  movu                 m3, [srcq+48]
+  pavgb                m4, [dstq+dst_strideq]
+
+  mova [dstq+dst_strideq], m4
+  movu                 m4, [srcq+src_strideq]
+
+  pavgb                m5, [dstq+dst_strideq+16]
+  mova [dstq+dst_strideq+16], m5
+  movu                 m5, [srcq+src_strideq+16]
+  pavgb                m6, [dstq+dst_strideq+32]
+  mova [dstq+dst_strideq+32], m6
+  movu                 m6, [srcq+src_strideq+32]
+  pavgb                m7, [dstq+dst_strideq+48]
+  mova [dstq+dst_strideq+48], m7
+  movu                 m7, [srcq+src_strideq+48]
+
+  lea                  dstq, [dstq+dst_strideq*2]
+%else
+  mova [dstq ], m0
+  movu                 m0, [srcq]
+
+  mova [dstq+16], m1
+  movu                 m1, [srcq+16]
+  mova [dstq+32], m2
+  movu                 m2, [srcq+32]
+  mova [dstq+48], m3
+  movu                 m3, [srcq+48]
+
+  mova [dstq+dst_strideq], m4
+  movu                 m4, [srcq+src_strideq]
+
+  mova [dstq+dst_strideq+16], m5
+  movu                 m5, [srcq+src_strideq+16]
+  mova [dstq+dst_strideq+32], m6
+  movu                 m6, [srcq+src_strideq+32]
+  mova [dstq+dst_strideq+48], m7
+  movu                 m7, [srcq+src_strideq+48]
+
+  lea                  dstq, [dstq+dst_strideq*2]
 %endif
-  mova [dstq ], m0
-  mova [dstq+16], m1
-  mova [dstq+32], m2
-  mova [dstq+48], m3
-  add dstq, dst_strideq
-  dec r4d
+  dec                  r4d
   jnz .loop64
+
+%ifidn %1, avg
+  pavgb                m0, [dstq]
+  pavgb                m1, [dstq+16]
+  pavgb                m2, [dstq+32]
+  pavgb                m3, [dstq+48]
+  pavgb                m4, [dstq+dst_strideq]
+  pavgb                m5, [dstq+dst_strideq+16]
+  pavgb                m6, [dstq+dst_strideq+32]
+  pavgb                m7, [dstq+dst_strideq+48]
+%endif
+  mova [dstq ], m0
+  mova [dstq+16], m1
+  mova [dstq+32], m2
+  mova [dstq+48], m3
+
+  mova [dstq+dst_strideq ], m4
+  mova [dstq+dst_strideq+16], m5
+  mova [dstq+dst_strideq+32], m6
+  mova [dstq+dst_strideq+48], m7
+
   RET
 .w32:
-  mov r4d, dword hm
+  mov                  r4d, dword hm
+  sub                  r4d, 2
+
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+16]
+  movu                 m2, [srcq+src_strideq]
+  movu                 m3, [srcq+src_strideq+16]
+
 .loop32:
-  movu m0, [srcq]
-  movu m1, [srcq+16]
-  movu m2, [srcq+src_strideq]
-  movu m3, [srcq+src_strideq+16]
-  lea srcq, [srcq+src_strideq*2]
+  prefetcht0           [srcq+64]
+  prefetcht0           [srcq+src_strideq+64]
+
+  lea                  srcq, [srcq+src_strideq*2]
 %ifidn %1, avg
-  pavgb m0, [dstq]
-  pavgb m1, [dstq +16]
-  pavgb m2, [dstq+dst_strideq]
-  pavgb m3, [dstq+dst_strideq+16]
+  pavgb                m0, [dstq]
+  pavgb                m1, [dstq+16]
+  pavgb                m2, [dstq+dst_strideq]
+  pavgb                m3, [dstq+dst_strideq+16]
 %endif
-  mova [dstq ], m0
-  mova [dstq +16], m1
-  mova [dstq+dst_strideq ], m2
-  mova [dstq+dst_strideq+16], m3
-  lea dstq, [dstq+dst_strideq*2]
-  sub r4d, 2
+  mova [dstq], m0
+  movu                 m0, [srcq]
+
+  mova [dstq+16], m1
+  movu                 m1, [srcq+16]
+
+  mova [dstq+dst_strideq], m2
+  movu                 m2, [srcq+src_strideq]
+
+  mova [dstq+dst_strideq+16], m3
+  movu                 m3, [srcq+src_strideq+16]
+
+  lea                  dstq, [dstq+dst_strideq*2]
+
+  sub                  r4d, 2
   jnz .loop32
+
+%ifidn %1, avg
+  pavgb                m0, [dstq]
+  pavgb                m1, [dstq+16]
+  pavgb                m2, [dstq+dst_strideq]
+  pavgb                m3, [dstq+dst_strideq+16]
+%endif
+  mova [dstq ], m0
+  mova [dstq+16], m1
+
+  mova [dstq+dst_strideq ], m2
+  mova [dstq+dst_strideq+16], m3
+
   RET
 .w16:
-  mov r4d, dword hm
-  lea r5q, [src_strideq*3]
-  lea r6q, [dst_strideq*3]
+  mov                  r4d, dword hm
+  sub                  r4d, 4
+
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+src_strideq]
+
 .loop16:
-  movu m0, [srcq]
-  movu m1, [srcq+src_strideq]
-  movu m2, [srcq+src_strideq*2]
-  movu m3, [srcq+r5q]
-  lea srcq, [srcq+src_strideq*4]
+  lea                  srcq, [srcq+src_strideq]
+  prefetcht0           [srcq+src_strideq*4]
+  lea                  srcq, [srcq+src_strideq]
+  prefetcht0           [srcq+src_strideq*2]
 %ifidn %1, avg
-  pavgb m0, [dstq]
-  pavgb m1, [dstq+dst_strideq]
-  pavgb m2, [dstq+dst_strideq*2]
-  pavgb m3, [dstq+r6q]
+  pavgb                m0, [dstq]
+  pavgb                m1, [dstq+dst_strideq]
 %endif
-  mova [dstq ], m0
-  mova [dstq+dst_strideq ], m1
-  mova [dstq+dst_strideq*2], m2
-  mova [dstq+r6q ], m3
-  lea dstq, [dstq+dst_strideq*4]
-  sub r4d, 4
+  mova [dstq ], m0
+  mova [dstq+dst_strideq], m1
+
+  lea                  dstq, [dstq+dst_strideq*2]
+
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+src_strideq]
+
+  sub                  r4d, 2
   jnz .loop16
+
+  lea                  srcq, [srcq+src_strideq*2]
+%ifidn %1, avg
+  pavgb                m0, [dstq]
+  pavgb                m1, [dstq+dst_strideq]
+%endif
+  mova [dstq ], m0
+  mova [dstq+dst_strideq], m1
+
+  lea                  dstq, [dstq+dst_strideq*2]
+
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+src_strideq]
+
+%ifidn %1, avg
+  pavgb                m0, [dstq]
+  pavgb                m1, [dstq+dst_strideq]
+%endif
+
+  mova [dstq ], m0
+  mova [dstq+dst_strideq], m1
+
   RET
 INIT_MMX sse
 .w8:
-  mov r4d, dword hm
-  lea r5q, [src_strideq*3]
-  lea r6q, [dst_strideq*3]
+  mov                  r4d, dword hm
+  sub                  r4d, 2
+
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+src_strideq]
+
 .loop8:
-  movu m0, [srcq]
-  movu m1, [srcq+src_strideq]
-  movu m2, [srcq+src_strideq*2]
-  movu m3, [srcq+r5q]
-  lea srcq, [srcq+src_strideq*4]
+  lea                  srcq, [srcq+src_strideq]
+  prefetcht0           [srcq+src_strideq*4]
+  lea                  srcq, [srcq+src_strideq]
+  prefetcht0           [srcq+src_strideq*2]
+
 %ifidn %1, avg
-  pavgb m0, [dstq]
-  pavgb m1, [dstq+dst_strideq]
-  pavgb m2, [dstq+dst_strideq*2]
-  pavgb m3, [dstq+r6q]
+  pavgb                m0, [dstq]
+  pavgb                m1, [dstq+dst_strideq]
 %endif
-  mova [dstq ], m0
-  mova [dstq+dst_strideq ], m1
-  mova [dstq+dst_strideq*2], m2
-  mova [dstq+r6q ], m3
-  lea dstq, [dstq+dst_strideq*4]
-  sub r4d, 4
+  mova [dstq ], m0
+  mova [dstq+dst_strideq], m1
+
+  movu                 m0, [srcq]
+  movu                 m1, [srcq+src_strideq]
+
+  lea                  dstq, [dstq+dst_strideq*2]
+
+  sub                  r4d, 2
   jnz .loop8
+
+%ifidn %1, avg
+  pavgb                m0, [dstq]
+  pavgb                m1, [dstq+dst_strideq]
+%endif
+  mova [dstq ], m0
+  mova [dstq+dst_strideq], m1
+
   RET
 .w4:
-  mov r4d, dword hm
-  lea r5q, [src_strideq*3]
-  lea r6q, [dst_strideq*3]
+  mov                  r4d, dword hm
+
+  lea                  r5q, [src_strideq*3]
+  lea                  r6q, [dst_strideq*3]
+
 .loop4:
-  movh m0, [srcq]
-  movh m1, [srcq+src_strideq]
-  movh m2, [srcq+src_strideq*2]
-  movh m3, [srcq+r5q]
-  lea srcq, [srcq+src_strideq*4]
+  movh                 m0, [srcq]
+  movh                 m1, [srcq+src_strideq]
+  movh                 m2, [srcq+src_strideq*2]
+  movh                 m3, [srcq+r5q]
+
+  lea                  srcq, [srcq+src_strideq*4]
 %ifidn %1, avg
-  movh m4, [dstq]
-  movh m5, [dstq+dst_strideq]
-  movh m6, [dstq+dst_strideq*2]
-  movh m7, [dstq+r6q]
-  pavgb m0, m4
-  pavgb m1, m5
-  pavgb m2, m6
-  pavgb m3, m7
+  movh                 m4, [dstq]
+  movh                 m5, [dstq+dst_strideq]
+  movh                 m6, [dstq+dst_strideq*2]
+  movh                 m7, [dstq+r6q]
+
+  pavgb                m0, m4
+  pavgb                m1, m5
+  pavgb                m2, m6
+  pavgb                m3, m7
 %endif
-  movh [dstq ], m0
-  movh [dstq+dst_strideq ], m1
-  movh [dstq+dst_strideq*2], m2
-  movh [dstq+r6q ], m3
-  lea dstq, [dstq+dst_strideq*4]
-  sub r4d, 4
+  movh [dstq ], m0
+  movh [dstq+dst_strideq ], m1
+  movh [dstq+dst_strideq*2], m2
+  movh [dstq+r6q ], m3
+
+  lea                  dstq, [dstq+dst_strideq*4]
+
+  sub                  r4d, 4
   jnz .loop4
   RET
 %endmacro
```