author    | Johann <johann.koenig@duck.com> | 2018-10-25 13:37:50 -0700
committer | Johann <johann.koenig@duck.com> | 2018-10-29 18:53:32 -0700
commit    | c176e6490403076105faa2a07f275d31ec61d2a3 (patch)
tree      | 977a36d51dcc5dd024d7fb3626bdef9d47c03eb6 /vpx_dsp/x86
parent    | fa0076282e62f649483bde868602aab86448a661 (diff)
vpx postproc: rewrite in intrinsics
About 10% faster on 64-bit but about 10% slower on 32-bit.
Removes the assembly usage of vpx_rv.
Change-Id: I214698fb5677f615dee0a8f5f5bb8f64daf2565e
Diffstat (limited to 'vpx_dsp/x86')
-rw-r--r-- | vpx_dsp/x86/deblock_sse2.asm | 231
-rw-r--r-- | vpx_dsp/x86/post_proc_sse2.c | 141
2 files changed, 141 insertions, 231 deletions
diff --git a/vpx_dsp/x86/deblock_sse2.asm b/vpx_dsp/x86/deblock_sse2.asm
index 97cb43b67..9d8e5e3e0 100644
--- a/vpx_dsp/x86/deblock_sse2.asm
+++ b/vpx_dsp/x86/deblock_sse2.asm
@@ -232,237 +232,6 @@ sym(vpx_post_proc_down_and_across_mb_row_sse2):
     ret
 %undef flimit
-;void vpx_mbpost_proc_down_sse2(unsigned char *dst,
-;                               int pitch, int rows, int cols,int flimit)
-extern sym(vpx_rv)
-global sym(vpx_mbpost_proc_down_sse2) PRIVATE
-sym(vpx_mbpost_proc_down_sse2):
-    push rbp
-    mov rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    GET_GOT rbx
-    push rsi
-    push rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub rsp, 128+16
-
-    ; unsigned char d[16][8] at [rsp]
-    ; create flimit2 at [rsp+128]
-    mov eax, dword ptr arg(4) ;flimit
-    mov [rsp+128], eax
-    mov [rsp+128+4], eax
-    mov [rsp+128+8], eax
-    mov [rsp+128+12], eax
-%define flimit4 [rsp+128]
-
-%if ABI_IS_32BIT=0
-    lea r8, [GLOBAL(sym(vpx_rv))]
-%endif
-
-    ;rows +=8;
-    add dword arg(2), 8
-
-    ;for(c=0; c<cols; c+=8)
-.loop_col:
-    mov rsi, arg(0) ; s
-    pxor xmm0, xmm0 ;
-
-    movsxd rax, dword ptr arg(1) ;pitch ;
-
-    ; this copies the last row down into the border 8 rows
-    mov rdi, rsi
-    mov rdx, arg(2)
-    sub rdx, 9
-    imul rdx, rax
-    lea rdi, [rdi+rdx]
-    movq xmm1, QWORD ptr[rdi] ; first row
-    mov rcx, 8
-.init_borderd: ; initialize borders
-    lea rdi, [rdi + rax]
-    movq [rdi], xmm1
-
-    dec rcx
-    jne .init_borderd
-
-    neg rax ; rax = -pitch
-
-    ; this copies the first row up into the border 8 rows
-    mov rdi, rsi
-    movq xmm1, QWORD ptr[rdi] ; first row
-    mov rcx, 8
-.init_border: ; initialize borders
-    lea rdi, [rdi + rax]
-    movq [rdi], xmm1
-
-    dec rcx
-    jne .init_border
-
-
-
-    lea rsi, [rsi + rax*8]; ; rdi = s[-pitch*8]
-    neg rax
-
-    pxor xmm5, xmm5
-    pxor xmm6, xmm6 ;
-
-    pxor xmm7, xmm7 ;
-    mov rdi, rsi
-
-    mov rcx, 15 ;
-
-.loop_initvar:
-    movq xmm1, QWORD PTR [rdi];
-    punpcklbw xmm1, xmm0 ;
-
-    paddw xmm5, xmm1 ;
-    pmullw xmm1, xmm1 ;
-
-    movdqa xmm2, xmm1 ;
-    punpcklwd xmm1, xmm0 ;
-
-    punpckhwd xmm2, xmm0 ;
-    paddd xmm6, xmm1 ;
-
-    paddd xmm7, xmm2 ;
-    lea rdi, [rdi+rax] ;
-
-    dec rcx
-    jne .loop_initvar
-    ;save the var and sum
-    xor rdx, rdx
-.loop_row:
-    movq xmm1, QWORD PTR [rsi] ; [s-pitch*8]
-    movq xmm2, QWORD PTR [rdi] ; [s+pitch*7]
-
-    punpcklbw xmm1, xmm0
-    punpcklbw xmm2, xmm0
-
-    paddw xmm5, xmm2
-    psubw xmm5, xmm1
-
-    pmullw xmm2, xmm2
-    movdqa xmm4, xmm2
-
-    punpcklwd xmm2, xmm0
-    punpckhwd xmm4, xmm0
-
-    paddd xmm6, xmm2
-    paddd xmm7, xmm4
-
-    pmullw xmm1, xmm1
-    movdqa xmm2, xmm1
-
-    punpcklwd xmm1, xmm0
-    psubd xmm6, xmm1
-
-    punpckhwd xmm2, xmm0
-    psubd xmm7, xmm2
-
-
-    movdqa xmm3, xmm6
-    pslld xmm3, 4
-
-    psubd xmm3, xmm6
-    movdqa xmm1, xmm5
-
-    movdqa xmm4, xmm5
-    pmullw xmm1, xmm1
-
-    pmulhw xmm4, xmm4
-    movdqa xmm2, xmm1
-
-    punpcklwd xmm1, xmm4
-    punpckhwd xmm2, xmm4
-
-    movdqa xmm4, xmm7
-    pslld xmm4, 4
-
-    psubd xmm4, xmm7
-
-    psubd xmm3, xmm1
-    psubd xmm4, xmm2
-
-    psubd xmm3, flimit4
-    psubd xmm4, flimit4
-
-    psrad xmm3, 31
-    psrad xmm4, 31
-
-    packssdw xmm3, xmm4
-    packsswb xmm3, xmm0
-
-    movq xmm1, QWORD PTR [rsi+rax*8]
-
-    movq xmm2, xmm1
-    punpcklbw xmm1, xmm0
-
-    paddw xmm1, xmm5
-    mov rcx, rdx
-
-    and rcx, 127
-%if ABI_IS_32BIT=1 && CONFIG_PIC=1
-    push rax
-    lea rax, [GLOBAL(sym(vpx_rv))]
-    movdqu xmm4, [rax + rcx*2] ;vpx_rv[rcx*2]
-    pop rax
-%elif ABI_IS_32BIT=0
-    movdqu xmm4, [r8 + rcx*2] ;vpx_rv[rcx*2]
-%else
-    movdqu xmm4, [sym(vpx_rv) + rcx*2]
-%endif
-
-    paddw xmm1, xmm4
-    ;paddw xmm1, eight8s
-    psraw xmm1, 4
-
-    packuswb xmm1, xmm0
-    pand xmm1, xmm3
-
-    pandn xmm3, xmm2
-    por xmm1, xmm3
-
-    and rcx, 15
-    movq QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]
-
-    cmp edx, 8
-    jl .skip_assignment
-
-    mov rcx, rdx
-    sub rcx, 8
-    and rcx, 15
-    movq mm0, [rsp + rcx*8] ;d[rcx*8]
-    movq [rsi], mm0
-
-.skip_assignment:
-    lea rsi, [rsi+rax]
-
-    lea rdi, [rdi+rax]
-    add rdx, 1
-
-    cmp edx, dword arg(2) ;rows
-    jl .loop_row
-
-    add dword arg(0), 8 ; s += 8
-    sub dword arg(3), 8 ; cols -= 8
-    cmp dword arg(3), 0
-    jg .loop_col
-
-    add rsp, 128+16
-    pop rsp
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_GOT
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop rbp
-    ret
-%undef flimit4
-
 ;void vpx_mbpost_proc_across_ip_sse2(unsigned char *src,
 ;                                    int pitch, int rows, int cols,int flimit)
diff --git a/vpx_dsp/x86/post_proc_sse2.c b/vpx_dsp/x86/post_proc_sse2.c
new file mode 100644
index 000000000..d1029afc4
--- /dev/null
+++ b/vpx_dsp/x86/post_proc_sse2.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2018 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include <stdio.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
+extern const int16_t vpx_rv[];
+
+void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows,
+                               int cols, int flimit) {
+  int col;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i f = _mm_set1_epi32(flimit);
+  DECLARE_ALIGNED(16, int16_t, above_context[8 * 8]);
+
+  // 8 columns are processed at a time.
+  // If rows is less than 8 the bottom border extension fails.
+  assert(cols % 8 == 0);
+  assert(rows >= 8);
+
+  for (col = 0; col < cols; col += 8) {
+    int row, i;
+    __m128i s = _mm_loadl_epi64((__m128i *)dst);
+    __m128i sum, sumsq_0, sumsq_1;
+    __m128i tmp_0, tmp_1;
+    __m128i below_context;
+
+    s = _mm_unpacklo_epi8(s, zero);
+
+    for (i = 0; i < 8; ++i) {
+      _mm_store_si128((__m128i *)above_context + i, s);
+    }
+
+    // sum *= 9
+    sum = _mm_slli_epi16(s, 3);
+    sum = _mm_add_epi16(s, sum);
+
+    // sum^2 * 9 == (sum * 9) * sum
+    tmp_0 = _mm_mullo_epi16(sum, s);
+    tmp_1 = _mm_mulhi_epi16(sum, s);
+
+    sumsq_0 = _mm_unpacklo_epi16(tmp_0, tmp_1);
+    sumsq_1 = _mm_unpackhi_epi16(tmp_0, tmp_1);
+
+    // Prime sum/sumsq
+    for (i = 1; i <= 6; ++i) {
+      __m128i a = _mm_loadl_epi64((__m128i *)(dst + i * pitch));
+      a = _mm_unpacklo_epi8(a, zero);
+      sum = _mm_add_epi16(sum, a);
+      a = _mm_mullo_epi16(a, a);
+      sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(a, zero));
+      sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(a, zero));
+    }
+
+    for (row = 0; row < rows + 8; row++) {
+      const __m128i above =
+          _mm_load_si128((__m128i *)above_context + (row & 7));
+      __m128i this_row = _mm_loadl_epi64((__m128i *)(dst + row * pitch));
+      __m128i above_sq, below_sq;
+      __m128i mask_0, mask_1;
+      __m128i multmp_0, multmp_1;
+      __m128i rv;
+      __m128i out;
+
+      this_row = _mm_unpacklo_epi8(this_row, zero);
+
+      if (row + 7 < rows) {
+        // Instead of copying the end context we just stop loading when we get
+        // to the last one.
+        below_context = _mm_loadl_epi64((__m128i *)(dst + (row + 7) * pitch));
+        below_context = _mm_unpacklo_epi8(below_context, zero);
+      }
+
+      sum = _mm_sub_epi16(sum, above);
+      sum = _mm_add_epi16(sum, below_context);
+
+      // context^2 fits in 16 bits. Don't need to mulhi and combine. Just zero
+      // extend. Unfortunately we can't do below_sq - above_sq in 16 bits
+      // because x86 does not have unpack with sign extension.
+      above_sq = _mm_mullo_epi16(above, above);
+      sumsq_0 = _mm_sub_epi32(sumsq_0, _mm_unpacklo_epi16(above_sq, zero));
+      sumsq_1 = _mm_sub_epi32(sumsq_1, _mm_unpackhi_epi16(above_sq, zero));
+
+      below_sq = _mm_mullo_epi16(below_context, below_context);
+      sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(below_sq, zero));
+      sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(below_sq, zero));
+
+      // sumsq * 16 - sumsq == sumsq * 15
+      mask_0 = _mm_slli_epi32(sumsq_0, 4);
+      mask_0 = _mm_sub_epi32(mask_0, sumsq_0);
+      mask_1 = _mm_slli_epi32(sumsq_1, 4);
+      mask_1 = _mm_sub_epi32(mask_1, sumsq_1);
+
+      multmp_0 = _mm_mullo_epi16(sum, sum);
+      multmp_1 = _mm_mulhi_epi16(sum, sum);
+
+      mask_0 = _mm_sub_epi32(mask_0, _mm_unpacklo_epi16(multmp_0, multmp_1));
+      mask_1 = _mm_sub_epi32(mask_1, _mm_unpackhi_epi16(multmp_0, multmp_1));
+
+      // mask - f gives a negative value when mask < f
+      mask_0 = _mm_sub_epi32(mask_0, f);
+      mask_1 = _mm_sub_epi32(mask_1, f);
+
+      // Shift the sign bit down to create a mask
+      mask_0 = _mm_srai_epi32(mask_0, 31);
+      mask_1 = _mm_srai_epi32(mask_1, 31);
+
+      mask_0 = _mm_packs_epi32(mask_0, mask_1);
+
+      rv = _mm_loadu_si128((__m128i const *)(vpx_rv + (row & 127)));
+
+      mask_1 = _mm_add_epi16(rv, sum);
+      mask_1 = _mm_add_epi16(mask_1, this_row);
+      mask_1 = _mm_srai_epi16(mask_1, 4);
+
+      mask_1 = _mm_and_si128(mask_0, mask_1);
+      mask_0 = _mm_andnot_si128(mask_0, this_row);
+      out = _mm_or_si128(mask_1, mask_0);
+
+      _mm_storel_epi64((__m128i *)(dst + row * pitch),
+                       _mm_packus_epi16(out, zero));
+
+      _mm_store_si128((__m128i *)above_context + ((row + 8) & 7), this_row);
+    }
+
+    dst += 8;
+  }
+}
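For readers comparing the new intrinsics against the removed assembly, below is a rough scalar sketch of the filter that post_proc_sse2.c vectorizes eight columns at a time. It is an illustration only, not the project's C reference implementation: the helper name mbpost_down_scalar_sketch is hypothetical, the border rows and the extra eight bottom iterations are simplified, and the vpx_rv indexing is taken from the vector code above.

```c
/* Hypothetical scalar sketch of the vpx_mbpost_proc_down_sse2 filter, derived
 * from the intrinsics in the diff above. Not part of the change. */
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

extern const int16_t vpx_rv[]; /* dither table declared in the diff above */

static void mbpost_down_scalar_sketch(unsigned char *dst, int pitch, int rows,
                                      int cols, int flimit) {
  int r, c;
  /* Work on a copy so the window always reads unfiltered pixels, mirroring
   * the above_context ring buffer of original rows in the SSE2 version. */
  unsigned char *src = (unsigned char *)malloc((size_t)rows * (size_t)pitch);
  if (!src) return;
  memcpy(src, dst, (size_t)rows * (size_t)pitch);

  for (c = 0; c < cols; ++c) {
    for (r = 0; r < rows; ++r) {
      int sum = 0, sumsq = 0, i;
      /* 15-tap vertical window centered on the current row, with the first
       * and last rows standing in for the border rows. */
      for (i = -7; i <= 7; ++i) {
        int rr = r + i;
        if (rr < 0) rr = 0;
        if (rr >= rows) rr = rows - 1;
        sum += src[rr * pitch + c];
        sumsq += src[rr * pitch + c] * src[rr * pitch + c];
      }
      /* Filter only where the local variance is small:
       * sumsq * 15 - sum^2 < flimit (the sign-bit mask built with psrad/srai
       * in the vector code). */
      if (sumsq * 15 - sum * sum < flimit) {
        dst[r * pitch + c] = (unsigned char)(
            (vpx_rv[(r & 127) + (c & 7)] + sum + src[r * pitch + c]) >> 4);
      }
    }
  }
  free(src);
}
```

The temporary copy stands in for what the removed assembly achieved with its delayed d[16][8] write-back, and for what the new code achieves by keeping the previous eight original rows in above_context.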