summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r-- vp9/common/vp9_idct.h 6
-rw-r--r-- vp9/common/vp9_rtcd_defs.sh 3
-rw-r--r-- vp9/encoder/vp9_block.h 3
-rw-r--r-- vp9/encoder/vp9_dct.c 89
-rw-r--r-- vp9/encoder/vp9_encodeframe.c 3
-rw-r--r-- vp9/encoder/vp9_encodemb.c 5
6 files changed, 96 insertions, 13 deletions
diff --git a/vp9/common/vp9_idct.h b/vp9/common/vp9_idct.h
index af35432c4..64f14c993 100644
--- a/vp9/common/vp9_idct.h
+++ b/vp9/common/vp9_idct.h
@@ -71,12 +71,6 @@ static INLINE int dct_const_round_shift(int input) {
return rv;
}
-static INLINE int dct_32_round(int input) {
- int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
- assert(-131072 <= rv && rv <= 131071);
- return rv;
-}
-
typedef void (*transform_1d)(int16_t*, int16_t*);
typedef struct {
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 17d1e801c..a405aab8d 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -572,6 +572,9 @@ specialize vp9_short_fdct8x4 sse2
prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct32x32
+prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch"
+specialize vp9_short_fdct32x32_rd
+
prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"
specialize vp9_short_fdct16x16 sse2
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index e78f54eb4..59cc3d95c 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -139,6 +139,9 @@ struct macroblock {
int optimize;
+ // Nonzero while in the rd search loop, zero during the actual encoding process.
+ int rd_search;
+
// TODO(jingning): Need to refactor the structure arrays that buffers the
// coding mode decisions of each partition type.
PICK_MODE_CONTEXT ab4x4_context[4][4][4];
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c
index 8d4eec139..a90bcf5df 100644
--- a/vp9/encoder/vp9_dct.c
+++ b/vp9/encoder/vp9_dct.c
@@ -991,8 +991,18 @@ void vp9_short_fht16x16_c(int16_t *input, int16_t *output,
}
}
+static INLINE int dct_32_round(int input) {
+ int rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
+ assert(-131072 <= rv && rv <= 131071);
+ return rv;
+}
+
+static INLINE int half_round_shift(int input) {
+ int rv = (input + 1 + (input < 0)) >> 2;
+ return rv;
+}
-static void dct32_1d(int *input, int *output) {
+static void dct32_1d(int *input, int *output, int round) {
int step[32];
// Stage 1
step[0] = input[0] + input[(32 - 1)];
@@ -1101,6 +1111,44 @@ static void dct32_1d(int *input, int *output) {
step[30] = output[30] + output[25];
step[31] = output[31] + output[24];
+ // Damp the magnitude (rounding right shift by 2) so that the intermediate
+ // values stay within the range of 16 bits.
+ if (round) {
+ step[0] = half_round_shift(step[0]);
+ step[1] = half_round_shift(step[1]);
+ step[2] = half_round_shift(step[2]);
+ step[3] = half_round_shift(step[3]);
+ step[4] = half_round_shift(step[4]);
+ step[5] = half_round_shift(step[5]);
+ step[6] = half_round_shift(step[6]);
+ step[7] = half_round_shift(step[7]);
+ step[8] = half_round_shift(step[8]);
+ step[9] = half_round_shift(step[9]);
+ step[10] = half_round_shift(step[10]);
+ step[11] = half_round_shift(step[11]);
+ step[12] = half_round_shift(step[12]);
+ step[13] = half_round_shift(step[13]);
+ step[14] = half_round_shift(step[14]);
+ step[15] = half_round_shift(step[15]);
+
+ step[16] = half_round_shift(step[16]);
+ step[17] = half_round_shift(step[17]);
+ step[18] = half_round_shift(step[18]);
+ step[19] = half_round_shift(step[19]);
+ step[20] = half_round_shift(step[20]);
+ step[21] = half_round_shift(step[21]);
+ step[22] = half_round_shift(step[22]);
+ step[23] = half_round_shift(step[23]);
+ step[24] = half_round_shift(step[24]);
+ step[25] = half_round_shift(step[25]);
+ step[26] = half_round_shift(step[26]);
+ step[27] = half_round_shift(step[27]);
+ step[28] = half_round_shift(step[28]);
+ step[29] = half_round_shift(step[29]);
+ step[30] = half_round_shift(step[30]);
+ step[31] = half_round_shift(step[31]);
+ }
+
// Stage 4
output[0] = step[0] + step[3];
output[1] = step[1] + step[2];
@@ -1283,12 +1331,12 @@ void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
int output[32 * 32];
// Columns
- for (i = 0; i < 32; i++) {
+ for (i = 0; i < 32; ++i) {
int temp_in[32], temp_out[32];
- for (j = 0; j < 32; j++)
+ for (j = 0; j < 32; ++j)
temp_in[j] = input[j * shortpitch + i] << 2;
- dct32_1d(temp_in, temp_out);
- for (j = 0; j < 32; j++)
+ dct32_1d(temp_in, temp_out, 0);
+ for (j = 0; j < 32; ++j)
output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
}
@@ -1297,8 +1345,37 @@ void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) {
int temp_in[32], temp_out[32];
for (j = 0; j < 32; ++j)
temp_in[j] = output[j + i * 32];
- dct32_1d(temp_in, temp_out);
+ dct32_1d(temp_in, temp_out, 0);
for (j = 0; j < 32; ++j)
out[j + i * 32] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
}
}
+
+// Note that although dct32_1d uses dct_32_round in its computation flow,
+// this 2-D fdct32x32 variant for the rate-distortion optimization loop
+// operates within 16-bit precision.
+void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) {
+ int shortpitch = pitch >> 1;
+ int i, j;
+ int output[32 * 32];
+
+ // Columns
+ for (i = 0; i < 32; ++i) {
+ int temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = input[j * shortpitch + i] << 2;
+ dct32_1d(temp_in, temp_out, 0);
+ for (j = 0; j < 32; ++j)
+ output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ }
+
+ // Rows
+ for (i = 0; i < 32; ++i) {
+ int temp_in[32], temp_out[32];
+ for (j = 0; j < 32; ++j)
+ temp_in[j] = output[j + i * 32];
+ dct32_1d(temp_in, temp_out, 1);
+ for (j = 0; j < 32; ++j)
+ out[j + i * 32] = temp_out[j];
+ }
+}
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index f0f2ef5d3..54b6e2440 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -602,6 +602,8 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
+ x->rd_search = 1;
+
if (bsize < BLOCK_SIZE_SB8X8)
if (xd->ab_index != 0)
return;
@@ -1974,6 +1976,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
const int mis = cm->mode_info_stride;
const int bwl = mi_width_log2(bsize);
const int bw = 1 << bwl, bh = 1 << mi_height_log2(bsize);
+ x->rd_search = 0;
if (cm->frame_type == KEY_FRAME) {
if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index f0202450e..4f45496df 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -462,7 +462,10 @@ static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
switch (ss_txfrm_size / 2) {
case TX_32X32:
- vp9_short_fdct32x32(src_diff, coeff, bw * 2);
+ if (x->rd_search)
+ vp9_short_fdct32x32_rd(src_diff, coeff, bw * 2);
+ else
+ vp9_short_fdct32x32(src_diff, coeff, bw * 2);
break;
case TX_16X16:
tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;