summary | refs | log | tree | commit | diff
path: root/vp9/encoder/x86
diff options
context:
space:
mode:
author: Jingning Han <jingning@google.com>, 2013-07-03 09:05:01 -0700
committer: Jingning Han <jingning@google.com>, 2013-07-03 10:11:59 -0700
commit: 2cb75c960796f40b4dee9132e44252a5bb2a109f (patch)
tree: ee08134d404fd3ad06708d554fb64c1343b5018f /vp9/encoder/x86
parent: b0520b61ed90062fd284caa800289503002cd59e (diff)
download: libvpx-2cb75c960796f40b4dee9132e44252a5bb2a109f.tar
libvpx-2cb75c960796f40b4dee9132e44252a5bb2a109f.tar.gz
libvpx-2cb75c960796f40b4dee9132e44252a5bb2a109f.tar.bz2
libvpx-2cb75c960796f40b4dee9132e44252a5bb2a109f.zip
Refactor SSE2 8x8 functional units
These serve as building blocks for the SSE2 8x8 and 16x16 ADST/DCT hybrid transform coding.

Change-Id: I4089a754c66e0c986f67d9b8ec4dfb9627ad430d
Diffstat (limited to 'vp9/encoder/x86')
-rw-r--r-- vp9/encoder/x86/vp9_dct_sse2.c | 67
1 files changed, 36 insertions, 31 deletions
diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index 507bdd006..87a774c33 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c
@@ -555,8 +555,8 @@ static INLINE void load_buffer_8x8(int16_t *input, __m128i *in, int stride) {
in[7] = _mm_slli_epi16(in[7], 2);
}
-// write 8x8 array
-static INLINE void write_buffer_8x8(int16_t *output, __m128i *res) {
+// right shift and rounding
+static INLINE void right_shift_8x8(__m128i *res, int const bit) {
__m128i sign0 = _mm_srai_epi16(res[0], 15);
__m128i sign1 = _mm_srai_epi16(res[1], 15);
__m128i sign2 = _mm_srai_epi16(res[2], 15);
@@ -575,35 +575,40 @@ static INLINE void write_buffer_8x8(int16_t *output, __m128i *res) {
res[6] = _mm_sub_epi16(res[6], sign6);
res[7] = _mm_sub_epi16(res[7], sign7);
- res[0] = _mm_srai_epi16(res[0], 1);
- res[1] = _mm_srai_epi16(res[1], 1);
- res[2] = _mm_srai_epi16(res[2], 1);
- res[3] = _mm_srai_epi16(res[3], 1);
- res[4] = _mm_srai_epi16(res[4], 1);
- res[5] = _mm_srai_epi16(res[5], 1);
- res[6] = _mm_srai_epi16(res[6], 1);
- res[7] = _mm_srai_epi16(res[7], 1);
-
- _mm_store_si128((__m128i *)(output + 0 * 8), res[0]);
- _mm_store_si128((__m128i *)(output + 1 * 8), res[1]);
- _mm_store_si128((__m128i *)(output + 2 * 8), res[2]);
- _mm_store_si128((__m128i *)(output + 3 * 8), res[3]);
- _mm_store_si128((__m128i *)(output + 4 * 8), res[4]);
- _mm_store_si128((__m128i *)(output + 5 * 8), res[5]);
- _mm_store_si128((__m128i *)(output + 6 * 8), res[6]);
- _mm_store_si128((__m128i *)(output + 7 * 8), res[7]);
+ res[0] = _mm_srai_epi16(res[0], bit);
+ res[1] = _mm_srai_epi16(res[1], bit);
+ res[2] = _mm_srai_epi16(res[2], bit);
+ res[3] = _mm_srai_epi16(res[3], bit);
+ res[4] = _mm_srai_epi16(res[4], bit);
+ res[5] = _mm_srai_epi16(res[5], bit);
+ res[6] = _mm_srai_epi16(res[6], bit);
+ res[7] = _mm_srai_epi16(res[7], bit);
+}
+
+// write 8x8 array: shift res[0..7] in place via right_shift_8x8, then store vector k at output + k * stride
+static INLINE void write_buffer_8x8(int16_t *output, __m128i *res, int stride) {  // stride is counted in int16_t elements
+ right_shift_8x8(res, 1);  // modifies res in place; final step is an arithmetic right shift by 1 bit
+
+ _mm_store_si128((__m128i *)(output + 0 * stride), res[0]);  // aligned store: every destination address must be 16-byte aligned
+ _mm_store_si128((__m128i *)(output + 1 * stride), res[1]);  // NOTE(review): presumably requires stride to be a multiple of 8 - confirm with callers
+ _mm_store_si128((__m128i *)(output + 2 * stride), res[2]);
+ _mm_store_si128((__m128i *)(output + 3 * stride), res[3]);
+ _mm_store_si128((__m128i *)(output + 4 * stride), res[4]);
+ _mm_store_si128((__m128i *)(output + 5 * stride), res[5]);
+ _mm_store_si128((__m128i *)(output + 6 * stride), res[6]);
+ _mm_store_si128((__m128i *)(output + 7 * stride), res[7]);
}
// perform in-place transpose
-static INLINE void array_transpose_8x8(__m128i *res) {
- const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
- const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
- const __m128i tr0_2 = _mm_unpackhi_epi16(res[0], res[1]);
- const __m128i tr0_3 = _mm_unpackhi_epi16(res[2], res[3]);
- const __m128i tr0_4 = _mm_unpacklo_epi16(res[4], res[5]);
- const __m128i tr0_5 = _mm_unpacklo_epi16(res[6], res[7]);
- const __m128i tr0_6 = _mm_unpackhi_epi16(res[4], res[5]);
- const __m128i tr0_7 = _mm_unpackhi_epi16(res[6], res[7]);
+static INLINE void array_transpose_8x8(__m128i *in, __m128i *res) {
+ const __m128i tr0_0 = _mm_unpacklo_epi16(in[0], in[1]);
+ const __m128i tr0_1 = _mm_unpacklo_epi16(in[2], in[3]);
+ const __m128i tr0_2 = _mm_unpackhi_epi16(in[0], in[1]);
+ const __m128i tr0_3 = _mm_unpackhi_epi16(in[2], in[3]);
+ const __m128i tr0_4 = _mm_unpacklo_epi16(in[4], in[5]);
+ const __m128i tr0_5 = _mm_unpacklo_epi16(in[6], in[7]);
+ const __m128i tr0_6 = _mm_unpackhi_epi16(in[4], in[5]);
+ const __m128i tr0_7 = _mm_unpackhi_epi16(in[6], in[7]);
// 00 10 01 11 02 12 03 13
// 20 30 21 31 22 32 23 33
// 04 14 05 15 06 16 07 17
@@ -783,7 +788,7 @@ void fdct8_1d_sse2(__m128i *in) {
in[7] = _mm_packs_epi32(v6, v7);
// transpose
- array_transpose_8x8(in);
+ array_transpose_8x8(in, in);
}
void fadst8_1d_sse2(__m128i *in) {
@@ -1013,7 +1018,7 @@ void fadst8_1d_sse2(__m128i *in) {
in[7] = _mm_sub_epi16(k__const_0, s1);
// transpose
- array_transpose_8x8(in);
+ array_transpose_8x8(in, in);
}
void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
@@ -1041,7 +1046,7 @@ void vp9_short_fht8x8_sse2(int16_t *input, int16_t *output,
assert(0);
break;
}
- write_buffer_8x8(output, in);
+ write_buffer_8x8(output, in, 8);
}
void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {