Diffstat (limited to 'vp9/encoder/x86')
-rw-r--r-- | vp9/encoder/x86/vp9_dct_sse2.c                     | 36 |
-rw-r--r-- | vp9/encoder/x86/vp9_error_intrin_avx2.c            | 72 |
-rw-r--r-- | vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm | 49 |
3 files changed, 117 insertions, 40 deletions
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index 686582238..1f58d872e 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -1187,7 +1187,7 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
@@ -1513,8 +1513,8 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
      const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
-      const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
-      const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
+      const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p24_p08);
+      const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p24_p08);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
@@ -1535,8 +1535,8 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
      const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
-      const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
-      const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
+      const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p08_m24);
+      const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p08_m24);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
@@ -1554,10 +1554,10 @@ void vp9_fdct16x16_sse2(const int16_t *input, int16_t *output, int stride) {
    {
      step1_0 = _mm_add_epi16(step3_0, step2_1);
      step1_1 = _mm_sub_epi16(step3_0, step2_1);
-      step1_2 = _mm_sub_epi16(step3_3, step2_2);
-      step1_3 = _mm_add_epi16(step3_3, step2_2);
-      step1_4 = _mm_add_epi16(step3_4, step2_5);
-      step1_5 = _mm_sub_epi16(step3_4, step2_5);
+      step1_2 = _mm_add_epi16(step3_3, step2_2);
+      step1_3 = _mm_sub_epi16(step3_3, step2_2);
+      step1_4 = _mm_sub_epi16(step3_4, step2_5);
+      step1_5 = _mm_add_epi16(step3_4, step2_5);
      step1_6 = _mm_sub_epi16(step3_7, step2_6);
      step1_7 = _mm_add_epi16(step3_7, step2_6);
    }
@@ -1848,7 +1848,7 @@ void fdct16_8col(__m128i *in) {
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
+  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
@@ -2052,10 +2052,10 @@ void fdct16_8col(__m128i *in) {
    v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
    v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
-    v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
-    v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
-    v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
-    v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
+    v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
+    v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
+    v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
+    v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
    v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
    v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
@@ -2085,10 +2085,10 @@ void fdct16_8col(__m128i *in) {
  // stage 5
  s[0] = _mm_add_epi16(p[0], t[1]);
  s[1] = _mm_sub_epi16(p[0], t[1]);
-  s[2] = _mm_sub_epi16(p[3], t[2]);
-  s[3] = _mm_add_epi16(p[3], t[2]);
-  s[4] = _mm_add_epi16(p[4], t[5]);
-  s[5] = _mm_sub_epi16(p[4], t[5]);
+  s[2] = _mm_add_epi16(p[3], t[2]);
+  s[3] = _mm_sub_epi16(p[3], t[2]);
+  s[4] = _mm_sub_epi16(p[4], t[5]);
+  s[5] = _mm_add_epi16(p[4], t[5]);
  s[6] = _mm_sub_epi16(p[7], t[6]);
  s[7] = _mm_add_epi16(p[7], t[6]);
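
Note on the vp9_dct_sse2.c hunks above: _mm_madd_epi16(t, pair_set_epi16(c0, c1)) computes a*c0 + b*c1 in each 32-bit lane, so replacing the negated constants (-cospi_24_64, -cospi_8_64) with (cospi_24_64, cospi_8_64) flips the sign of the madd result, and the add/sub swaps in the following stage compensate. The two orderings are not bit-identical, because dct_const_round_shift is not symmetric around zero; presumably that is the point of the change, though the diff itself does not state the motivation. A minimal scalar sketch (madd_pair and round_shift are hypothetical helpers written for illustration, not names from the source):

#include <assert.h>
#include <stdint.h>

#define DCT_CONST_BITS 14
#define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

/* Scalar model of one 32-bit lane of _mm_madd_epi16 with a
 * pair_set_epi16(c0, c1) constant: a*c0 + b*c1. */
static int32_t madd_pair(int16_t a, int16_t b, int16_t c0, int16_t c1) {
  return (int32_t)a * c0 + (int32_t)b * c1;
}

/* dct_const_round_shift as used throughout the scalar fdct. */
static int32_t round_shift(int32_t x) {
  return (x + DCT_CONST_ROUNDING) >> DCT_CONST_BITS;
}

int main(void) {
  /* Round-shift is not odd-symmetric: rounding -(24a + 8b) is not
   * always the exact negation of rounding (24a + 8b). */
  assert(round_shift(8192) == 1);
  assert(-round_shift(-8192) == 0);
  (void)madd_pair;
  return 0;
}
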
diff --git a/vp9/encoder/x86/vp9_error_intrin_avx2.c b/vp9/encoder/x86/vp9_error_intrin_avx2.c
new file mode 100644
index 000000000..c67490fad
--- /dev/null
+++ b/vp9/encoder/x86/vp9_error_intrin_avx2.c
@@ -0,0 +1,72 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <immintrin.h>  // AVX2
+#include "vpx/vpx_integer.h"
+
+int64_t vp9_block_error_avx2(const int16_t *coeff,
+                             const int16_t *dqcoeff,
+                             intptr_t block_size,
+                             int64_t *ssz) {
+  __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
+  __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
+  __m256i sse_reg_64hi, ssz_reg_64hi;
+  __m128i sse_reg128, ssz_reg128;
+  int64_t sse;
+  int i;
+  const __m256i zero_reg = _mm256_set1_epi16(0);
+
+  // init sse and ssz registers to zero
+  sse_reg = _mm256_set1_epi16(0);
+  ssz_reg = _mm256_set1_epi16(0);
+
+  for (i = 0; i < block_size; i += 16) {
+    // load 32 bytes from coeff and dqcoeff
+    coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));
+    dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
+    // dqcoeff - coeff
+    dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
+    // madd (dqcoeff - coeff)
+    dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
+    // madd coeff
+    coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
+    // expand each double word of madd (dqcoeff - coeff) to quad word
+    exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
+    exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
+    // expand each double word of madd (coeff) to quad word
+    exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
+    exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
+    // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
+    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
+    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
+    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
+    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
+  }
+  // save the higher 64 bit of each 128 bit lane
+  sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
+  ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
+  // add the higher 64 bit to the lower 64 bit
+  sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
+  ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);
+
+  // add each 64 bit from each of the 128 bit lane of the 256 bit
+  sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
+                             _mm256_extractf128_si256(sse_reg, 1));
+
+  ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
+                             _mm256_extractf128_si256(ssz_reg, 1));
+
+  // store the results
+  _mm_storel_epi64((__m128i *)(&sse), sse_reg128);
+
+  _mm_storel_epi64((__m128i *)(ssz), ssz_reg128);
+  return sse;
+}
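
The AVX2 loop above is a 16-lane-at-a-time version of the scalar block error: it returns the sum of squared (dqcoeff - coeff) differences and writes the sum of squared coefficients to *ssz. A minimal scalar sketch of the same computation (the helper name is illustrative, patterned after the C fallback, not taken from this commit):

#include <stdint.h>

/* Scalar sketch: the AVX2 version accumulates the same two sums,
 * just 16 int16 lanes per iteration with madd + 64-bit widening. */
static int64_t block_error_scalar(const int16_t *coeff,
                                  const int16_t *dqcoeff,
                                  intptr_t block_size, int64_t *ssz) {
  int64_t sse = 0, sqcoeff = 0;
  intptr_t i;
  for (i = 0; i < block_size; i++) {
    const int diff = dqcoeff[i] - coeff[i];
    sse += (int64_t)diff * diff;              /* madd(dqcoeff - coeff) path */
    sqcoeff += (int64_t)coeff[i] * coeff[i];  /* madd(coeff) path */
  }
  *ssz = sqcoeff;
  return sse;
}
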
diff --git a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm b/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
index d2d13b383..673e0b3a6 100644
--- a/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
+++ b/vp9/encoder/x86/vp9_temporal_filter_apply_sse2.asm
@@ -15,41 +15,45 @@
 ; (unsigned char  *frame1,           | 0
 ;  unsigned int    stride,           | 1
 ;  unsigned char  *frame2,           | 2
-;  unsigned int    block_size,       | 3
-;  int             strength,         | 4
-;  int             filter_weight,    | 5
-;  unsigned int   *accumulator,      | 6
-;  unsigned short *count)            | 7
+;  unsigned int    block_width,      | 3
+;  unsigned int    block_height,     | 4
+;  int             strength,         | 5
+;  int             filter_weight,    | 6
+;  unsigned int   *accumulator,      | 7
+;  unsigned short *count)            | 8
 global sym(vp9_temporal_filter_apply_sse2) PRIVATE
 sym(vp9_temporal_filter_apply_sse2):

     push        rbp
     mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 8
+    SHADOW_ARGS_TO_STACK 9
     SAVE_XMM 7
     GET_GOT     rbx
     push        rsi
     push        rdi
     ALIGN_STACK 16, rax
-    %define block_size    0
-    %define strength      16
-    %define filter_weight 32
-    %define rounding_bit  48
-    %define rbp_backup    64
-    %define stack_size    80
+    %define block_width   0
+    %define block_height  16
+    %define strength      32
+    %define filter_weight 48
+    %define rounding_bit  64
+    %define rbp_backup    80
+    %define stack_size    96
     sub         rsp, stack_size
     mov         [rsp + rbp_backup], rbp
     ; end prolog

     mov         rdx, arg(3)
-    mov         [rsp + block_size], rdx
-    movd        xmm6, arg(4)
+    mov         [rsp + block_width], rdx
+    mov         rdx, arg(4)
+    mov         [rsp + block_height], rdx
+    movd        xmm6, arg(5)
     movdqa      [rsp + strength], xmm6 ; where strength is used, all 16 bytes are read

     ; calculate the rounding bit outside the loop
     ; 0x8000 >> (16 - strength)
     mov         rdx, 16
-    sub         rdx, arg(4) ; 16 - strength
+    sub         rdx, arg(5) ; 16 - strength
     movq        xmm4, rdx ; can't use rdx w/ shift
     movdqa      xmm5, [GLOBAL(_const_top_bit)]
     psrlw       xmm5, xmm4
@@ -57,11 +61,11 @@ sym(vp9_temporal_filter_apply_sse2):

     mov         rsi, arg(0) ; src/frame1
     mov         rdx, arg(2) ; predictor frame
-    mov         rdi, arg(6) ; accumulator
-    mov         rax, arg(7) ; count
+    mov         rdi, arg(7) ; accumulator
+    mov         rax, arg(8) ; count

     ; dup the filter weight and store for later
-    movd        xmm0, arg(5) ; filter_weight
+    movd        xmm0, arg(6) ; filter_weight
     pshuflw     xmm0, xmm0, 0
     punpcklwd   xmm0, xmm0
     movdqa      [rsp + filter_weight], xmm0
@@ -69,10 +73,11 @@ sym(vp9_temporal_filter_apply_sse2):
     mov         rbp, arg(1) ; stride
     pxor        xmm7, xmm7 ; zero for extraction

-    lea         rcx, [rdx + 16*16*1]
-    cmp         dword ptr [rsp + block_size], 8
+    mov         rcx, [rsp + block_width]
+    imul        rcx, [rsp + block_height]
+    add         rcx, rdx
+    cmp         dword ptr [rsp + block_width], 8
     jne         .temporal_filter_apply_load_16
-    lea         rcx, [rdx + 8*8*1]

.temporal_filter_apply_load_8:
     movq        xmm0, [rsi] ; first row
@@ -178,7 +183,7 @@ sym(vp9_temporal_filter_apply_sse2):
     cmp         rdx, rcx
     je          .temporal_filter_apply_epilog
     pxor        xmm7, xmm7 ; zero for extraction
-    cmp         dword ptr [rsp + block_size], 16
+    cmp         dword ptr [rsp + block_width], 16
     je          .temporal_filter_apply_load_16
     jmp         .temporal_filter_apply_load_8
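
The asm change above replaces the single square block_size argument with separate block_width and block_height, derives the end-of-block pointer from their product instead of the hard-coded 16*16 and 8*8 offsets, and renumbers the remaining arg() indices accordingly. A hedged C sketch of the two small address/constant computations involved (helper names are illustrative, not from the source):

#include <stddef.h>
#include <stdint.h>

/* End pointer over the predictor block, as the asm now computes it:
 * rcx = block_width * block_height + frame2. */
static const unsigned char *block_end(const unsigned char *frame2,
                                      unsigned int block_width,
                                      unsigned int block_height) {
  return frame2 + (size_t)block_width * block_height;
}

/* Rounding bit computed once outside the loop, matching the asm's
 * psrlw of _const_top_bit: 0x8000 >> (16 - strength). */
static uint16_t rounding_bit(int strength) {
  return (uint16_t)(0x8000u >> (16 - strength));
}
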