Optimize vp9_dc_only_idct_add_c function

Wrote an SSE2 version of the vp9_dc_only_idct_add_c function. To improve performance, the absolute diff values are clipped to [0, 255], which keeps the additions/subtractions in 8 bits. Tests showed a decoder performance increase of over 2%. Change-Id: Ie1a236d23d207e4ffcd1fc9f3d77462a9c7fe09d
author: Yunqing Wang <yunqingwang@google.com> 2013-02-26 16:27:41 -0800
committer: Yunqing Wang <yunqingwang@google.com> 2013-02-26 17:16:13 -0800
commit: 35bc02c6eb22602997d9c8aebeb46ef588266cc4 (patch)
tree: 54ab06054f3f5bcef8f214b00fe448c4da19af46 /vp9/common
parent: 9770d564f4984e6a0d3cfdfb7e5b8bc83f52dccf (diff)
download: libvpx-35bc02c6eb22602997d9c8aebeb46ef588266cc4.tar
libvpx-35bc02c6eb22602997d9c8aebeb46ef588266cc4.tar.gz
libvpx-35bc02c6eb22602997d9c8aebeb46ef588266cc4.tar.bz2
libvpx-35bc02c6eb22602997d9c8aebeb46ef588266cc4.zip
4 files changed, 84 insertions, 8 deletions
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index 430cec083..d25d0ac2a 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -13,6 +13,13 @@
 
 #include "./vpx_config.h"
 
+#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
+
+/* Disabled inline-function alternative to the ROUND_POWER_OF_TWO macro above:
+static INLINE int16_t round_power_of_two(int16_t value, int n) {
+  return (value + (1 << (n - 1))) >> n;
+}*/
+
 // Constants and Macros used by all idct/dct functions
 #define DCT_CONST_BITS 14
 #define DCT_CONST_ROUNDING  (1 << (DCT_CONST_BITS - 1))
diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c
index 67cfc9d71..19397028b 100644
--- a/vp9/common/vp9_idctllm.c
+++ b/vp9/common/vp9_idctllm.c
@@ -31,13 +31,6 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))
-
-/* If we don't want to use ROUND_POWER_OF_TWO macro
-static INLINE int16_t round_power_of_two(int16_t value, int n) {
-  return (value + (1 << (n - 1))) >> n;
-}*/
-
 typedef void (*transform_1d)(int16_t*, int16_t*);
 
 typedef struct {
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 700af7fa7..02a6711e5 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -296,7 +296,7 @@ specialize vp9_short_iht16x16
 # dct and add
 
 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
-specialize vp9_dc_only_idct_add
+specialize vp9_dc_only_idct_add sse2
 
 prototype void vp9_short_inv_walsh4x4_1_x8 "int16_t *input, int16_t *output, int pitch"
 specialize vp9_short_inv_walsh4x4_1_x8
diff --git a/vp9/common/x86/vp9_idctllm_x86.c b/vp9/common/x86/vp9_idctllm_x86.c
new file mode 100644
index 000000000..667f5c1d3
--- /dev/null
+++ b/vp9/common/x86/vp9_idctllm_x86.c
@@ -0,0 +1,76 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>  // SSE2
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_idct.h"
+
+#if HAVE_SSE2
+// Adds a DC-only inverse-transform residual to a 4x4 prediction block. The
+// residual magnitude is clipped to [0, 255] so all pixel math stays 8-bit.
+void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr,
+                               uint8_t *dst_ptr, int pitch, int stride) {
+  int a1;  // rounded DC residual applied to every pixel
+  int16_t out;
+  uint8_t abs_diff;  // |a1| clipped to 255
+  __m128i p0, p1, p2, p3;
+  unsigned int extended_diff;  // abs_diff copied into all 4 bytes
+  __m128i diff;  // abs_diff copied into all 16 bytes
+
+  out = dct_const_round_shift(input_dc * cospi_16_64);
+  out = dct_const_round_shift(out * cospi_16_64);
+  a1 = ROUND_POWER_OF_TWO(out, 4);  // (out + 8) >> 4
+
+  // Read one int (4 pixels) from each of the 4 prediction rows, pitch apart.
+  p0 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 0 * pitch));
+  p1 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 1 * pitch));
+  p2 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 2 * pitch));
+  p3 = _mm_cvtsi32_si128 (*(const int *)(pred_ptr + 3 * pitch));
+
+  // Interleave so p0 holds the 4x4 block as rows 0..3, low to high dword.
+  p0 = _mm_unpacklo_epi32(p0, p1);
+  p2 = _mm_unpacklo_epi32(p2, p3);
+  p0 = _mm_unpacklo_epi64(p0, p2);
+
+  // Clip |a1| to [0, 255], broadcast it to every byte, then add or subtract
+  // with unsigned saturation according to the sign of a1.
+  if (a1 >= 0) {
+    abs_diff = (a1 > 255) ? 255 : a1;
+    extended_diff = abs_diff * 0x01010101u;
+    diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);
+
+    p1 = _mm_adds_epu8(p0, diff);  // saturates at 255
+  } else {
+    abs_diff = (a1 < -255) ? 255 : -a1;
+    extended_diff = abs_diff * 0x01010101u;
+    diff = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_diff), 0);
+
+    p1 = _mm_subs_epu8(p0, diff);  // saturates at 0
+  }
+
+  // Store the low dword as one output row, then shift the next row down.
+  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
+  dst_ptr += stride;
+
+  p1 = _mm_srli_si128(p1, 4);
+  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
+  dst_ptr += stride;
+
+  p1 = _mm_srli_si128(p1, 4);
+  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
+  dst_ptr += stride;
+
+  p1 = _mm_srli_si128(p1, 4);
+  *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
+}
+#endif
author	Yunqing Wang <yunqingwang@google.com>	2013-02-26 16:27:41 -0800
committer	Yunqing Wang <yunqingwang@google.com>	2013-02-26 17:16:13 -0800
commit	35bc02c6eb22602997d9c8aebeb46ef588266cc4 (patch)
tree	54ab06054f3f5bcef8f214b00fe448c4da19af46 /vp9/common
parent	9770d564f4984e6a0d3cfdfb7e5b8bc83f52dccf (diff)
download	libvpx-35bc02c6eb22602997d9c8aebeb46ef588266cc4.tar libvpx-35bc02c6eb22602997d9c8aebeb46ef588266cc4.tar.gz libvpx-35bc02c6eb22602997d9c8aebeb46ef588266cc4.tar.bz2 libvpx-35bc02c6eb22602997d9c8aebeb46ef588266cc4.zip