From c176e6490403076105faa2a07f275d31ec61d2a3 Mon Sep 17 00:00:00 2001
From: Johann
Date: Thu, 25 Oct 2018 13:37:50 -0700
Subject: vpx postproc: rewrite in intrinsics

About 10% faster on 64-bit but roughly 10% slower on 32-bit.

Removes the assembly usage of vpx_rv.

Change-Id: I214698fb5677f615dee0a8f5f5bb8f64daf2565e
---
 vpx_dsp/x86/deblock_sse2.asm | 231 -------------------------------------------
 vpx_dsp/x86/post_proc_sse2.c | 141 ++++++++++++++++++++++++++
 2 files changed, 141 insertions(+), 231 deletions(-)
 create mode 100644 vpx_dsp/x86/post_proc_sse2.c

diff --git a/vpx_dsp/x86/deblock_sse2.asm b/vpx_dsp/x86/deblock_sse2.asm
index 97cb43b67..9d8e5e3e0 100644
--- a/vpx_dsp/x86/deblock_sse2.asm
+++ b/vpx_dsp/x86/deblock_sse2.asm
@@ -232,237 +232,6 @@ sym(vpx_post_proc_down_and_across_mb_row_sse2):
     ret
 %undef flimit
 
-;void vpx_mbpost_proc_down_sse2(unsigned char *dst,
-;                               int pitch, int rows, int cols,int flimit)
-extern sym(vpx_rv)
-global sym(vpx_mbpost_proc_down_sse2) PRIVATE
-sym(vpx_mbpost_proc_down_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    GET_GOT     rbx
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    ALIGN_STACK 16, rax
-    sub         rsp, 128+16
-
-    ; unsigned char d[16][8] at [rsp]
-    ; create flimit2 at [rsp+128]
-    mov         eax, dword ptr arg(4) ;flimit
-    mov         [rsp+128], eax
-    mov         [rsp+128+4], eax
-    mov         [rsp+128+8], eax
-    mov         [rsp+128+12], eax
-%define flimit4 [rsp+128]
-
-%if ABI_IS_32BIT=0
-    lea         r8, [GLOBAL(sym(vpx_rv))]
-%endif
-
-    ;rows +=8;
-    add         dword arg(2), 8
-
-    ;for(c=0; c<cols; c+=8)
diff --git a/vpx_dsp/x86/post_proc_sse2.c b/vpx_dsp/x86/post_proc_sse2.c
new file mode 100644
--- /dev/null
+++ b/vpx_dsp/x86/post_proc_sse2.c
@@ -0,0 +1,141 @@
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/x86/mem_sse2.h"
+
+extern const int16_t vpx_rv[];
+
+void vpx_mbpost_proc_down_sse2(unsigned char *dst, int pitch, int rows,
+                               int cols, int flimit) {
+  int col;
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i f = _mm_set1_epi32(flimit);
+  DECLARE_ALIGNED(16, int16_t, above_context[8 * 8]);
+
+  // 8 columns are processed at a time.
+  // If rows is less than 8 the bottom border extension fails.
+  assert(cols % 8 == 0);
+  assert(rows >= 8);
+
+  for (col = 0; col < cols; col += 8) {
+    int row, i;
+    __m128i s = _mm_loadl_epi64((__m128i *)dst);
+    __m128i sum, sumsq_0, sumsq_1;
+    __m128i tmp_0, tmp_1;
+    __m128i below_context;
+
+    s = _mm_unpacklo_epi8(s, zero);
+
+    for (i = 0; i < 8; ++i) {
+      _mm_store_si128((__m128i *)above_context + i, s);
+    }
+
+    // sum *= 9
+    sum = _mm_slli_epi16(s, 3);
+    sum = _mm_add_epi16(s, sum);
+
+    // sum^2 * 9 == (sum * 9) * sum
+    tmp_0 = _mm_mullo_epi16(sum, s);
+    tmp_1 = _mm_mulhi_epi16(sum, s);
+
+    sumsq_0 = _mm_unpacklo_epi16(tmp_0, tmp_1);
+    sumsq_1 = _mm_unpackhi_epi16(tmp_0, tmp_1);
+
+    // Prime sum/sumsq
+    for (i = 1; i <= 6; ++i) {
+      __m128i a = _mm_loadl_epi64((__m128i *)(dst + i * pitch));
+      a = _mm_unpacklo_epi8(a, zero);
+      sum = _mm_add_epi16(sum, a);
+      a = _mm_mullo_epi16(a, a);
+      sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(a, zero));
+      sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(a, zero));
+    }
+
+    for (row = 0; row < rows + 8; row++) {
+      const __m128i above =
+          _mm_load_si128((__m128i *)above_context + (row & 7));
+      __m128i this_row = _mm_loadl_epi64((__m128i *)(dst + row * pitch));
+      __m128i above_sq, below_sq;
+      __m128i mask_0, mask_1;
+      __m128i multmp_0, multmp_1;
+      __m128i rv;
+      __m128i out;
+
+      this_row = _mm_unpacklo_epi8(this_row, zero);
+
+      if (row + 7 < rows) {
+        // Instead of copying the end context we just stop loading when we get
+        // to the last one.
+        below_context = _mm_loadl_epi64((__m128i *)(dst + (row + 7) * pitch));
+        below_context = _mm_unpacklo_epi8(below_context, zero);
+      }
+
+      sum = _mm_sub_epi16(sum, above);
+      sum = _mm_add_epi16(sum, below_context);
+
+      // context^2 fits in 16 bits. Don't need to mulhi and combine. Just zero
+      // extend. Unfortunately we can't do below_sq - above_sq in 16 bits
+      // because x86 does not have unpack with sign extension.
+      above_sq = _mm_mullo_epi16(above, above);
+      sumsq_0 = _mm_sub_epi32(sumsq_0, _mm_unpacklo_epi16(above_sq, zero));
+      sumsq_1 = _mm_sub_epi32(sumsq_1, _mm_unpackhi_epi16(above_sq, zero));
+
+      below_sq = _mm_mullo_epi16(below_context, below_context);
+      sumsq_0 = _mm_add_epi32(sumsq_0, _mm_unpacklo_epi16(below_sq, zero));
+      sumsq_1 = _mm_add_epi32(sumsq_1, _mm_unpackhi_epi16(below_sq, zero));
+
+      // sumsq * 16 - sumsq == sumsq * 15
+      mask_0 = _mm_slli_epi32(sumsq_0, 4);
+      mask_0 = _mm_sub_epi32(mask_0, sumsq_0);
+      mask_1 = _mm_slli_epi32(sumsq_1, 4);
+      mask_1 = _mm_sub_epi32(mask_1, sumsq_1);
+
+      multmp_0 = _mm_mullo_epi16(sum, sum);
+      multmp_1 = _mm_mulhi_epi16(sum, sum);
+
+      mask_0 = _mm_sub_epi32(mask_0, _mm_unpacklo_epi16(multmp_0, multmp_1));
+      mask_1 = _mm_sub_epi32(mask_1, _mm_unpackhi_epi16(multmp_0, multmp_1));
+
+      // mask - f gives a negative value when mask < f
+      mask_0 = _mm_sub_epi32(mask_0, f);
+      mask_1 = _mm_sub_epi32(mask_1, f);
+
+      // Shift the sign bit down to create a mask
+      mask_0 = _mm_srai_epi32(mask_0, 31);
+      mask_1 = _mm_srai_epi32(mask_1, 31);
+
+      mask_0 = _mm_packs_epi32(mask_0, mask_1);
+
+      rv = _mm_loadu_si128((__m128i const *)(vpx_rv + (row & 127)));
+
+      mask_1 = _mm_add_epi16(rv, sum);
+      mask_1 = _mm_add_epi16(mask_1, this_row);
+      mask_1 = _mm_srai_epi16(mask_1, 4);
+
+      mask_1 = _mm_and_si128(mask_0, mask_1);
+      mask_0 = _mm_andnot_si128(mask_0, this_row);
+      out = _mm_or_si128(mask_1, mask_0);
+
+      _mm_storel_epi64((__m128i *)(dst + row * pitch),
+                       _mm_packus_epi16(out, zero));
+
+      _mm_store_si128((__m128i *)above_context + ((row + 8) & 7), this_row);
+    }
+
+    dst += 8;
+  }
+}
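For readers comparing this against the scalar reference (vpx_mbpost_proc_down_c), the code
below is a minimal scalar model of what the intrinsics compute for a single column. It is
not part of the patch: the function name and the rv parameter are illustrative stand-ins
(the real code reads eight consecutive vpx_rv entries per row, one per column in the group),
and it runs only `rows` iterations, omitting the extra bottom-border iterations that the
SSE2 loop performs as the counterpart of the removed assembly's `rows += 8`.

/*
 * Illustrative sketch only -- not part of the patch. Assumes dst points at
 * the top pixel of one column and rv has at least 128 entries (vpx_rv in
 * the real code). The name mbpost_proc_down_column is hypothetical.
 */
#include <stdint.h>

static void mbpost_proc_down_column(unsigned char *dst, int pitch, int rows,
                                    int flimit, const int16_t *rv) {
  /* Originals of the last 8 rows, mirroring above_context[] in the SSE2
   * version: output is written in place, so the sliding sums must subtract
   * unfiltered values. */
  int above[8];
  int sum, sumsq, row, i;

  /* Top border: rows -8..-1 are treated as copies of row 0, so the first
   * window is primed with 9 copies of row 0 plus rows 1..6 (15 values). */
  for (i = 0; i < 8; ++i) above[i] = dst[0];
  sum = 9 * dst[0];
  sumsq = 9 * dst[0] * dst[0];
  for (i = 1; i <= 6; ++i) {
    sum += dst[i * pitch];
    sumsq += dst[i * pitch] * dst[i * pitch];
  }

  for (row = 0; row < rows; ++row) {
    const int cur = dst[row * pitch];
    /* Bottom border: keep re-using the last row once row + 7 runs off the
     * image (the SSE2 code simply stops reloading below_context). */
    const int below_row = (row + 7 < rows) ? row + 7 : rows - 1;
    const int below = dst[below_row * pitch];
    const int old_above = above[row & 7];

    /* Slide the 15-tap vertical window down one row: drop row - 8, add
     * row + 7. */
    sum += below - old_above;
    sumsq += below * below - old_above * old_above;

    /* Filter only where the local variance is small:
     * 15 * sumsq - sum * sum < flimit. The output is a dithered average of
     * the window and the centre pixel. */
    if (sumsq * 15 - sum * sum < flimit) {
      int v = (rv[row & 127] + sum + cur) >> 4;
      if (v < 0) v = 0; /* the SSE2 version saturates via packus */
      if (v > 255) v = 255;
      dst[row * pitch] = (unsigned char)v;
    }

    /* Remember the original value so it can be subtracted 8 rows later. */
    above[(row + 8) & 7] = cur;
  }
}

The ring buffer of original rows (above_context in the SSE2 code, above[] here) is what
lets each filtered row be written back immediately while the running sums keep subtracting
unfiltered values; the removed assembly instead staged output through its d[16][8] scratch
buffer and delayed the writes.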