diff options
author | Scott LaVarnway <slavarnway@google.com> | 2013-02-13 09:15:38 -0800 |
---|---|---|
committer | Scott LaVarnway <slavarnway@google.com> | 2013-02-13 09:15:38 -0800 |
commit | 30f866f44b4ba2bceb4421eecc541842ff0eb972 (patch) | |
tree | 5679f3ba13e0154f2b0f91bb6aac399051476219 /vp9 | |
parent | cb00be1fa20c1a3955d62c1939646f4fd5d31224 (diff) | |
download | libvpx-30f866f44b4ba2bceb4421eecc541842ff0eb972.tar libvpx-30f866f44b4ba2bceb4421eecc541842ff0eb972.tar.gz libvpx-30f866f44b4ba2bceb4421eecc541842ff0eb972.tar.bz2 libvpx-30f866f44b4ba2bceb4421eecc541842ff0eb972.zip |
WIP: ssse3 version of convolve avg functions
Initial ssse3 convolve avg functions. This change is one step closer
to using x86inc.asm. The decoder performance improved by 8% for
the test clip used. This should be revisited later to see if
averaging outside the loop is better than having many similar
filter functions.
Change-Id: Ice3fafb423b02710b0448ffca18b296bcac649e9
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/vp9_rtcd_defs.sh | 6 | ||||
-rw-r--r-- | vp9/common/x86/vp9_asm_stubs.c | 164 | ||||
-rw-r--r-- | vp9/common/x86/vp9_subpixel_8t_ssse3.asm | 666 |
3 files changed, 624 insertions, 212 deletions
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 066989272..0487c400d 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -254,13 +254,13 @@ prototype void vp9_convolve8_vert "const uint8_t *src, int src_stride, uint8_t * specialize vp9_convolve8_vert ssse3 prototype void vp9_convolve8_avg "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_avg +specialize vp9_convolve8_avg ssse3 prototype void vp9_convolve8_avg_horiz "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_avg_horiz +specialize vp9_convolve8_avg_horiz ssse3 prototype void vp9_convolve8_avg_vert "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h" -specialize vp9_convolve8_avg_vert +specialize vp9_convolve8_avg_vert ssse3 # # dct diff --git a/vp9/common/x86/vp9_asm_stubs.c b/vp9/common/x86/vp9_asm_stubs.c index fbc95b6ce..6d3bb021a 100644 --- a/vp9/common/x86/vp9_asm_stubs.c +++ b/vp9/common/x86/vp9_asm_stubs.c @@ -79,6 +79,48 @@ void vp9_filter_block1d4_h8_ssse3(const unsigned char *src_ptr, unsigned int output_height, const short *filter); +void vp9_filter_block1d16_v8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d16_h8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d8_v8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char 
*output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d8_h8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d4_v8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + +void vp9_filter_block1d4_h8_avg_ssse3(const unsigned char *src_ptr, + const unsigned int src_pitch, + unsigned char *output_ptr, + unsigned int out_pitch, + unsigned int output_height, + const short *filter); + void vp9_convolve8_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, @@ -155,6 +197,82 @@ void vp9_convolve8_vert_ssse3(const uint8_t *src, int src_stride, } } +void vp9_convolve8_avg_horiz_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if (x_step_q4 == 16 && filter_x[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_h8_avg_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 16; + dst += 16; + w -= 16; + } + while (w >= 8) { + vp9_filter_block1d8_h8_avg_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_h8_avg_ssse3(src, src_stride, + dst, dst_stride, + h, filter_x); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } +} + +void vp9_convolve8_avg_vert_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + if 
(y_step_q4 == 16 && filter_y[3] != 128) { + while (w >= 16) { + vp9_filter_block1d16_v8_avg_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 16; + dst += 16; + w -= 16; + } + while (w >= 8) { + vp9_filter_block1d8_v8_avg_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 8; + dst += 8; + w -= 8; + } + while (w >= 4) { + vp9_filter_block1d4_v8_avg_ssse3(src - src_stride * 3, src_stride, + dst, dst_stride, + h, filter_y); + src += 4; + dst += 4; + w -= 4; + } + } + if (w) { + vp9_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); + } +} + void vp9_convolve8_ssse3(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, const int16_t *filter_x, int x_step_q4, @@ -200,4 +318,50 @@ void vp9_convolve8_ssse3(const uint8_t *src, int src_stride, filter_x, x_step_q4, filter_y, y_step_q4, w, h); } + +void vp9_convolve8_avg_ssse3(const uint8_t *src, int src_stride, + uint8_t *dst, int dst_stride, + const int16_t *filter_x, int x_step_q4, + const int16_t *filter_y, int y_step_q4, + int w, int h) { + DECLARE_ALIGNED_ARRAY(16, unsigned char, fdata2, 16*23); + + // check w/h due to fixed size fdata2 array + assert(w <= 16); + assert(h <= 16); + + if (x_step_q4 == 16 && y_step_q4 == 16 && + filter_x[3] != 128 && filter_y[3] != 128) { + if (w == 16) { + vp9_filter_block1d16_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + vp9_filter_block1d16_v8_avg_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + return; + } + if (w == 8) { + vp9_filter_block1d8_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + vp9_filter_block1d8_v8_avg_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + return; + } + if (w == 4) { + vp9_filter_block1d4_h8_ssse3(src - 3 * src_stride, src_stride, + fdata2, 16, + h + 7, filter_x); + vp9_filter_block1d4_v8_avg_ssse3(fdata2, 16, + dst, dst_stride, + h, filter_y); + 
return; + } + } + vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, + filter_x, x_step_q4, filter_y, y_step_q4, + w, h); +} #endif diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm index 5f039454a..fa24f4cd0 100644 --- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm +++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm @@ -21,34 +21,8 @@ ; ;*************************************************************************************/ -;void vp9_filter_block1d8_v8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE -sym(vp9_filter_block1d4_v8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] +%macro VERTx4 1 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr @@ -86,7 +60,7 @@ sym(vp9_filter_block1d4_v8_ssse3): lea rbx, [rdx + rdx*4] add rbx, rdx ;pitch * 6 -.vp9_filter_block1d4_v8_ssse3_loop: +.loop: movd xmm0, [rsi] ;A movd xmm1, [rsi + rdx] ;B movd xmm2, [rsi + rdx * 2] ;C @@ -117,7 +91,10 @@ sym(vp9_filter_block1d4_v8_ssse3): add rsi, rdx add rax, rdx - +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif movd [rdi], xmm0 %if ABI_IS_32BIT @@ -126,47 +103,10 @@ sym(vp9_filter_block1d4_v8_ssse3): add rdi, r8 %endif dec rcx - jnz .vp9_filter_block1d4_v8_ssse3_loop - - add rsp, 16*5 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d8_v8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global 
sym(vp9_filter_block1d8_v8_ssse3) PRIVATE -sym(vp9_filter_block1d8_v8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] + jnz .loop +%endm +%macro VERTx8 1 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr @@ -204,7 +144,7 @@ sym(vp9_filter_block1d8_v8_ssse3): lea rbx, [rdx + rdx*4] add rbx, rdx ;pitch * 6 -.vp9_filter_block1d8_v8_ssse3_loop: +.loop: movq xmm0, [rsi] ;A movq xmm1, [rsi + rdx] ;B movq xmm2, [rsi + rdx * 2] ;C @@ -235,7 +175,10 @@ sym(vp9_filter_block1d8_v8_ssse3): add rsi, rdx add rax, rdx - +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif movq [rdi], xmm0 %if ABI_IS_32BIT @@ -244,47 +187,11 @@ sym(vp9_filter_block1d8_v8_ssse3): add rdi, r8 %endif dec rcx - jnz .vp9_filter_block1d8_v8_ssse3_loop - - add rsp, 16*5 - pop rsp - pop rbx - ; begin epilog - pop rdi - pop rsi - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret + jnz .loop +%endm -;void vp9_filter_block1d16_v8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pitch, -; unsigned char *output_ptr, -; unsigned int out_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE -sym(vp9_filter_block1d16_v8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - push rsi - push rdi - push rbx - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] +%macro VERTx16 1 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr @@ -322,7 +229,7 @@ sym(vp9_filter_block1d16_v8_ssse3): lea rbx, [rdx + rdx*4] add rbx, rdx ;pitch * 6 -.vp9_filter_block1d16_v8_ssse3_loop: +.loop: movq xmm0, [rsi] ;A 
movq xmm1, [rsi + rdx] ;B movq xmm2, [rsi + rdx * 2] ;C @@ -350,7 +257,10 @@ sym(vp9_filter_block1d16_v8_ssse3): psraw xmm0, 7 packuswb xmm0, xmm0 - +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif movq [rdi], xmm0 movq xmm0, [rsi + 8] ;A @@ -385,6 +295,10 @@ sym(vp9_filter_block1d16_v8_ssse3): add rsi, rdx add rax, rdx +%if %1 + movq xmm1, [rdi+8] + pavgb xmm0, xmm1 +%endif movq [rdi+8], xmm0 @@ -394,7 +308,38 @@ sym(vp9_filter_block1d16_v8_ssse3): add rdi, r8 %endif dec rcx - jnz .vp9_filter_block1d16_v8_ssse3_loop + jnz .loop +%endm + +;void vp9_filter_block1d8_v8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE +sym(vp9_filter_block1d4_v8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx4 0 add rsp, 16*5 pop rsp @@ -407,24 +352,100 @@ sym(vp9_filter_block1d16_v8_ssse3): pop rbp ret -;void vp9_filter_block1d4_h8_ssse3 +;void vp9_filter_block1d8_v8_ssse3 ;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, ; short *filter ;) -global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE -sym(vp9_filter_block1d4_h8_ssse3): +global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE +sym(vp9_filter_block1d8_v8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 
16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx8 0 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d16_v8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pitch, +; unsigned char *output_ptr, +; unsigned int out_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE +sym(vp9_filter_block1d16_v8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx16 0 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + + +global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_v8_avg_ssse3): push rbp mov rbp, rsp SHADOW_ARGS_TO_STACK 6 SAVE_XMM 7 - GET_GOT rbx push rsi push rdi + push rbx ; end prolog ALIGN_STACK 16, rax @@ -435,6 +456,86 @@ sym(vp9_filter_block1d4_h8_ssse3): %define k6k7 [rsp + 16*3] %define krd [rsp + 16*4] + VERTx4 1 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_v8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx8 1 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + 
+global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_v8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + push rsi + push rdi + push rbx + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + VERTx16 1 + + add rsp, 16*5 + pop rsp + pop rbx + ; begin epilog + pop rdi + pop rsi + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +%macro HORIZx4 1 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr @@ -464,7 +565,7 @@ sym(vp9_filter_block1d4_h8_ssse3): movsxd rdx, dword ptr arg(3) ;output_pitch movsxd rcx, dword ptr arg(4) ;output_height -.filter_block1d4_h8_rowloop_ssse3: +.loop: movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 @@ -491,54 +592,19 @@ sym(vp9_filter_block1d4_h8_ssse3): paddsw xmm0, krd psraw xmm0, 7 packuswb xmm0, xmm0 - +%if %1 + movd xmm1, [rdi] + pavgb xmm0, xmm1 +%endif lea rsi, [rsi + rax] movd [rdi], xmm0 lea rdi, [rdi + rdx] dec rcx - jnz .filter_block1d4_h8_rowloop_ssse3 - - add rsp, 16*5 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d8_h8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE -sym(vp9_filter_block1d8_h8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] + jnz .loop +%endm +%macro HORIZx8 1 mov rdx, 
arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr @@ -568,7 +634,7 @@ sym(vp9_filter_block1d8_h8_ssse3): movsxd rdx, dword ptr arg(3) ;output_pitch movsxd rcx, dword ptr arg(4) ;output_height -.filter_block1d8_h8_rowloop_ssse3: +.loop: movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 @@ -595,54 +661,20 @@ sym(vp9_filter_block1d8_h8_ssse3): paddsw xmm0, krd psraw xmm0, 7 packuswb xmm0, xmm0 +%if %1 + movq xmm1, [rdi] + pavgb xmm0, xmm1 +%endif lea rsi, [rsi + rax] movq [rdi], xmm0 lea rdi, [rdi + rdx] dec rcx - jnz .filter_block1d8_h8_rowloop_ssse3 - - add rsp, 16*5 - pop rsp - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - -;void vp9_filter_block1d16_h8_ssse3 -;( -; unsigned char *src_ptr, -; unsigned int src_pixels_per_line, -; unsigned char *output_ptr, -; unsigned int output_pitch, -; unsigned int output_height, -; short *filter -;) -global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE -sym(vp9_filter_block1d16_h8_ssse3): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 6 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 16*5 - %define k0k1 [rsp + 16*0] - %define k2k3 [rsp + 16*1] - %define k4k5 [rsp + 16*2] - %define k6k7 [rsp + 16*3] - %define krd [rsp + 16*4] + jnz .loop +%endm +%macro HORIZx16 1 mov rdx, arg(5) ;filter ptr mov rsi, arg(0) ;src_ptr mov rdi, arg(2) ;output_ptr @@ -672,7 +704,7 @@ sym(vp9_filter_block1d16_h8_ssse3): movsxd rdx, dword ptr arg(3) ;output_pitch movsxd rcx, dword ptr arg(4) ;output_height -.filter_block1d16_h8_rowloop_ssse3: +.loop: movq xmm0, [rsi - 3] ; -3 -2 -1 0 1 2 3 4 movq xmm3, [rsi + 5] ; 5 6 7 8 9 10 11 12 @@ -727,13 +759,48 @@ sym(vp9_filter_block1d16_h8_ssse3): psraw xmm3, 7 packuswb xmm3, xmm3 punpcklqdq xmm0, xmm3 +%if %1 + movdqa xmm1, [rdi] + pavgb xmm0, xmm1 +%endif lea rsi, [rsi + rax] movdqa [rdi], xmm0 lea rdi, [rdi + rdx] dec rcx - jnz 
.filter_block1d16_h8_rowloop_ssse3 + jnz .loop +%endm + +;void vp9_filter_block1d4_h8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE +sym(vp9_filter_block1d4_h8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx4 0 add rsp, 16*5 pop rsp @@ -747,7 +814,188 @@ sym(vp9_filter_block1d16_h8_ssse3): pop rbp ret +;void vp9_filter_block1d8_h8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE +sym(vp9_filter_block1d8_h8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx8 0 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +;void vp9_filter_block1d16_h8_ssse3 +;( +; unsigned char *src_ptr, +; unsigned int src_pixels_per_line, +; unsigned char *output_ptr, +; unsigned int output_pitch, +; unsigned int output_height, +; short *filter +;) +global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE +sym(vp9_filter_block1d16_h8_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 
[rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx16 0 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d4_h8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx4 1 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d8_h8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx8 1 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret + +global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE +sym(vp9_filter_block1d16_h8_avg_ssse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 6 + SAVE_XMM 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 16*5 + %define k0k1 [rsp + 16*0] + %define k2k3 [rsp + 16*1] + %define k4k5 [rsp + 16*2] + %define k6k7 [rsp + 16*3] + %define krd [rsp + 16*4] + + HORIZx16 1 + + add rsp, 16*5 + pop rsp + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + RESTORE_XMM + UNSHADOW_ARGS + pop rbp + ret SECTION_RODATA align 16 shuf_t0t1: |