author     Scott LaVarnway <slavarnway@google.com>   2013-05-20 13:03:17 -0400
committer  Scott LaVarnway <slavarnway@google.com>   2013-05-20 13:03:17 -0400
commit     ba48a11130aa88cf20c2c54e43585968ce49e964 (patch)
tree       6a2b02175ec2efc92b5d8a4dca42e425fa70ba3b /vp9/decoder
parent     9aa37a51b28596137ba6fbcb1411c070287d6e11 (diff)
WIP: 4x4 idct/recon merge
This patch eliminates the intermediate diff buffer by combining the short
idct and the residual add into one function. The encoder can use the same
code as well.

Change-Id: I296604bf73579c45105de0dd1adbcc91bcc53c22
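For readers skimming the diff, the sketch below shows the general shape of a
fused transform-and-reconstruct routine. It is illustrative only, not the
libvpx implementation: the transform is a plain 4-point Hadamard butterfly
rather than the VP9 idct, and clip_pixel / ihadamard4x4_add are local names
invented for the example. The structural point matches the patch: the column
pass adds its result straight into the prediction in dest, so no
caller-visible diff buffer (and no separate vp9_add_residual_4x4 pass) is
needed.

#include <stdint.h>

static uint8_t clip_pixel(int val) {
  return (uint8_t)(val < 0 ? 0 : (val > 255 ? 255 : val));
}

/* Fused 4x4 inverse transform + reconstruction (sketch): the column pass
 * writes clipped (prediction + residual) pixels directly into dest. */
void ihadamard4x4_add(const int16_t *input, uint8_t *dest, int stride) {
  int16_t tmp[16];
  int i;

  /* Row pass into a small on-stack temporary. */
  for (i = 0; i < 4; ++i) {
    const int a = input[i * 4 + 0], b = input[i * 4 + 1];
    const int c = input[i * 4 + 2], d = input[i * 4 + 3];
    tmp[i * 4 + 0] = (int16_t)(a + b + c + d);
    tmp[i * 4 + 1] = (int16_t)(a - b + c - d);
    tmp[i * 4 + 2] = (int16_t)(a + b - c - d);
    tmp[i * 4 + 3] = (int16_t)(a - b - c + d);
  }

  /* Column pass: compute each residual and immediately add it to the
   * prediction already in dest -- this is the idct/recon merge. */
  for (i = 0; i < 4; ++i) {
    const int a = tmp[0 * 4 + i], b = tmp[1 * 4 + i];
    const int c = tmp[2 * 4 + i], d = tmp[3 * 4 + i];
    const int r0 = (a + b + c + d + 8) >> 4;  /* >> 4 undoes the 16x gain */
    const int r1 = (a - b + c - d + 8) >> 4;  /* of the two butterfly passes */
    const int r2 = (a + b - c - d + 8) >> 4;
    const int r3 = (a - b - c + d + 8) >> 4;
    dest[0 * stride + i] = clip_pixel(dest[0 * stride + i] + r0);
    dest[1 * stride + i] = clip_pixel(dest[1 * stride + i] + r1);
    dest[2 * stride + i] = clip_pixel(dest[2 * stride + i] + r2);
    dest[3 * stride + i] = clip_pixel(dest[3 * stride + i] + r3);
  }
}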
Diffstat (limited to 'vp9/decoder')
-rw-r--r--  vp9/decoder/vp9_decodframe.c           |  4
-rw-r--r--  vp9/decoder/vp9_idct_blk.c             | 47
-rw-r--r--  vp9/decoder/x86/vp9_dequantize_sse2.c  | 43
3 files changed, 6 insertions(+), 88 deletions(-)
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index b200e6ccc..b866e95cf 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -998,14 +998,10 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
pc->uv_dc_delta_q == 0 &&
pc->uv_ac_delta_q == 0;
if (xd->lossless) {
- xd->inv_txm4x4_1 = vp9_short_iwalsh4x4_1;
- xd->inv_txm4x4 = vp9_short_iwalsh4x4;
xd->itxm_add = vp9_idct_add_lossless_c;
xd->itxm_add_y_block = vp9_idct_add_y_block_lossless_c;
xd->itxm_add_uv_block = vp9_idct_add_uv_block_lossless_c;
} else {
- xd->inv_txm4x4_1 = vp9_short_idct4x4_1;
- xd->inv_txm4x4 = vp9_short_idct4x4;
xd->itxm_add = vp9_idct_add;
xd->itxm_add_y_block = vp9_idct_add_y_block;
xd->itxm_add_uv_block = vp9_idct_add_uv_block;
diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c
index 10b585b3f..7726598bc 100644
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -84,23 +84,6 @@ void vp9_idct_add_uv_block_lossless_c(int16_t *q, uint8_t *dst, int stride,
}
}
-static void add_residual(const int16_t *diff, uint8_t *dest, int stride,
- int width, int height) {
- int r, c;
-
- for (r = 0; r < height; r++) {
- for (c = 0; c < width; c++)
- dest[c] = clip_pixel(diff[c] + dest[c]);
-
- dest += stride;
- diff += width;
- }
-}
-
-void vp9_add_residual_4x4_c(const int16_t *diff, uint8_t *dest, int stride) {
- add_residual(diff, dest, stride, 4, 4);
-}
-
static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
int width, int height) {
int r, c;
@@ -133,11 +116,8 @@ void vp9_iht_add_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest, int stride,
if (tx_type == DCT_DCT) {
vp9_idct_add(input, dest, stride, eob);
} else {
- DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
- vp9_short_iht4x4(input, output, 4, tx_type);
+ vp9_short_iht4x4_add(input, dest, stride, tx_type);
vpx_memset(input, 0, 32);
- vp9_add_residual_4x4(output, dest, stride);
}
}
@@ -154,13 +134,9 @@ void vp9_iht_add_8x8_c(TX_TYPE tx_type, int16_t *input, uint8_t *dest,
}
void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {
- DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
if (eob > 1) {
- // the idct halves ( >> 1) the pitch
- vp9_short_idct4x4(input, output, 4 << 1);
+ vp9_short_idct4x4_add(input, dest, stride);
vpx_memset(input, 0, 32);
- vp9_add_residual_4x4(output, dest, stride);
} else {
vp9_dc_only_idct_add(input[0], dest, dest, stride, stride);
((int *)input)[0] = 0;
@@ -168,38 +144,27 @@ void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {
}
void vp9_dc_idct_add_c(int16_t *input, uint8_t *dest, int stride, int dc) {
- DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
input[0] = dc;
-
- // the idct halves ( >> 1) the pitch
- vp9_short_idct4x4(input, output, 4 << 1);
+ vp9_short_idct4x4_add(input, dest, stride);
vpx_memset(input, 0, 32);
- vp9_add_residual_4x4(output, dest, stride);
}
void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, int stride,
int eob) {
- DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
if (eob > 1) {
- vp9_short_iwalsh4x4_c(input, output, 4 << 1);
+ vp9_short_iwalsh4x4_add(input, dest, stride);
vpx_memset(input, 0, 32);
- vp9_add_residual_4x4(output, dest, stride);
} else {
- vp9_dc_only_inv_walsh_add(input[0], dest, dest, stride, stride);
+ vp9_short_iwalsh4x4_1_add_c(input, dest, stride);
((int *)input)[0] = 0;
}
}
void vp9_dc_idct_add_lossless_c(int16_t *input, uint8_t *dest,
int stride, int dc) {
- DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
input[0] = dc;
- vp9_short_iwalsh4x4_c(input, output, 4 << 1);
+ vp9_short_iwalsh4x4_add(input, dest, stride);
vpx_memset(input, 0, 32);
- vp9_add_residual_4x4(output, dest, stride);
}
void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
diff --git a/vp9/decoder/x86/vp9_dequantize_sse2.c b/vp9/decoder/x86/vp9_dequantize_sse2.c
index 72036c2d4..54ec67f24 100644
--- a/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c
@@ -15,49 +15,6 @@
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_idct.h"
-void vp9_add_residual_4x4_sse2(const int16_t *diff, uint8_t *dest, int stride) {
- const int width = 4;
- const __m128i zero = _mm_setzero_si128();
-
- // Diff data
- const __m128i d0 = _mm_loadl_epi64((const __m128i *)(diff + 0 * width));
- const __m128i d1 = _mm_loadl_epi64((const __m128i *)(diff + 1 * width));
- const __m128i d2 = _mm_loadl_epi64((const __m128i *)(diff + 2 * width));
- const __m128i d3 = _mm_loadl_epi64((const __m128i *)(diff + 3 * width));
-
- // Prediction data.
- __m128i p0 = _mm_cvtsi32_si128(*(const int *)(dest + 0 * stride));
- __m128i p1 = _mm_cvtsi32_si128(*(const int *)(dest + 1 * stride));
- __m128i p2 = _mm_cvtsi32_si128(*(const int *)(dest + 2 * stride));
- __m128i p3 = _mm_cvtsi32_si128(*(const int *)(dest + 3 * stride));
-
- p0 = _mm_unpacklo_epi8(p0, zero);
- p1 = _mm_unpacklo_epi8(p1, zero);
- p2 = _mm_unpacklo_epi8(p2, zero);
- p3 = _mm_unpacklo_epi8(p3, zero);
-
- p0 = _mm_add_epi16(p0, d0);
- p1 = _mm_add_epi16(p1, d1);
- p2 = _mm_add_epi16(p2, d2);
- p3 = _mm_add_epi16(p3, d3);
-
- p0 = _mm_packus_epi16(p0, p1);
- p2 = _mm_packus_epi16(p2, p3);
-
- *(int *)dest = _mm_cvtsi128_si32(p0);
- dest += stride;
-
- p0 = _mm_srli_si128(p0, 8);
- *(int *)dest = _mm_cvtsi128_si32(p0);
- dest += stride;
-
- *(int *)dest = _mm_cvtsi128_si32(p2);
- dest += stride;
-
- p2 = _mm_srli_si128(p2, 8);
- *(int *)dest = _mm_cvtsi128_si32(p2);
-}
-
void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
int stride) {
uint8_t abs_diff;