diff options
Diffstat (limited to 'vp9')
-rw-r--r-- | vp9/common/vp9_blockd.h | 144 | ||||
-rw-r--r-- | vp9/common/vp9_entropymv.c | 12 | ||||
-rw-r--r-- | vp9/common/vp9_entropymv.h | 4 | ||||
-rw-r--r-- | vp9/common/vp9_invtrans.c | 70 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.sh | 9 | ||||
-rw-r--r-- | vp9/decoder/vp9_decodframe.c | 136 | ||||
-rw-r--r-- | vp9/decoder/vp9_dequantize.c | 42 | ||||
-rw-r--r-- | vp9/decoder/vp9_detokenize.c | 142 | ||||
-rw-r--r-- | vp9/decoder/x86/vp9_dequantize_x86.c | 255 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 5 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeintra.c | 9 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodemb.c | 102 | ||||
-rw-r--r-- | vp9/encoder/vp9_firstpass.c | 13 | ||||
-rw-r--r-- | vp9/encoder/vp9_mbgraph.c | 2 | ||||
-rw-r--r-- | vp9/encoder/vp9_mcomp.c | 17 | ||||
-rw-r--r-- | vp9/encoder/vp9_mcomp.h | 13 | ||||
-rw-r--r-- | vp9/encoder/vp9_quantize.c | 379 | ||||
-rw-r--r-- | vp9/encoder/vp9_quantize.h | 54 | ||||
-rw-r--r-- | vp9/encoder/vp9_rdopt.c | 575 | ||||
-rw-r--r-- | vp9/encoder/vp9_tokenize.c | 46 | ||||
-rw-r--r-- | vp9/encoder/vp9_tokenize.h | 12 |
21 files changed, 1138 insertions, 903 deletions
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index 9f978ce5e..5adfa6952 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -455,25 +455,29 @@ extern const uint8_t vp9_block2above_sb[TX_SIZE_MAX_SB][96]; extern const uint8_t vp9_block2left_sb64[TX_SIZE_MAX_SB][384]; extern const uint8_t vp9_block2above_sb64[TX_SIZE_MAX_SB][384]; -#define USE_ADST_FOR_I16X16_8X8 0 -#define USE_ADST_FOR_I16X16_4X4 0 +#define USE_ADST_FOR_I16X16_8X8 1 +#define USE_ADST_FOR_I16X16_4X4 1 #define USE_ADST_FOR_I8X8_4X4 1 #define USE_ADST_PERIPHERY_ONLY 1 +#define USE_ADST_FOR_SB 1 +#define USE_ADST_FOR_REMOTE_EDGE 0 -static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) { +static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, int ib) { // TODO(debargha): explore different patterns for ADST usage when blocksize // is smaller than the prediction size TX_TYPE tx_type = DCT_DCT; - int ib = (int)(b - xd->block); - if (ib >= 16) + const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; +#if !USE_ADST_FOR_SB + if (sb_type) + return tx_type; +#endif + if (ib >= (16 << (2 * sb_type))) // no chroma adst return tx_type; if (xd->lossless) return DCT_DCT; - // TODO(rbultje, debargha): Explore ADST usage for superblocks - if (xd->mode_info_context->mbmi.sb_type) - return tx_type; if (xd->mode_info_context->mbmi.mode == B_PRED && xd->q_index < ACTIVE_HT) { + const BLOCKD *b = &xd->block[ib]; tx_type = txfm_map( #if CONFIG_NEWBINTRAMODES b->bmi.as_mode.first == B_CONTEXT_PRED ? b->bmi.as_mode.context : @@ -481,16 +485,32 @@ static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) { b->bmi.as_mode.first); } else if (xd->mode_info_context->mbmi.mode == I8X8_PRED && xd->q_index < ACTIVE_HT) { + const BLOCKD *b = &xd->block[ib]; + const int ic = (ib & 10); #if USE_ADST_FOR_I8X8_4X4 #if USE_ADST_PERIPHERY_ONLY // Use ADST for periphery blocks only - int ic = (ib & 10); + const int inner = ib & 5; b += ic - ib; - tx_type = (ic != 10) ? - txfm_map(pred_mode_conv((MB_PREDICTION_MODE)b->bmi.as_mode.first)) : - DCT_DCT; + tx_type = txfm_map(pred_mode_conv( + (MB_PREDICTION_MODE)b->bmi.as_mode.first)); +#if USE_ADST_FOR_REMOTE_EDGE + if (inner == 5) + tx_type = DCT_DCT; +#else + if (inner == 1) { + if (tx_type == ADST_ADST) tx_type = ADST_DCT; + else if (tx_type == DCT_ADST) tx_type = DCT_DCT; + } else if (inner == 4) { + if (tx_type == ADST_ADST) tx_type = DCT_ADST; + else if (tx_type == ADST_DCT) tx_type = DCT_DCT; + } else if (inner == 5) { + tx_type = DCT_DCT; + } +#endif #else // Use ADST + b += ic - ib; tx_type = txfm_map(pred_mode_conv( (MB_PREDICTION_MODE)b->bmi.as_mode.first)); #endif @@ -502,9 +522,22 @@ static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) { xd->q_index < ACTIVE_HT) { #if USE_ADST_FOR_I16X16_4X4 #if USE_ADST_PERIPHERY_ONLY - // Use ADST for periphery blocks only - tx_type = (ib < 4 || ((ib & 3) == 0)) ? - txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode)) : DCT_DCT; + const int hmax = 4 << sb_type; + tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode)); +#if USE_ADST_FOR_REMOTE_EDGE + if ((ib & (hmax - 1)) != 0 && ib >= hmax) + tx_type = DCT_DCT; +#else + if (ib >= 1 && ib < hmax) { + if (tx_type == ADST_ADST) tx_type = ADST_DCT; + else if (tx_type == DCT_ADST) tx_type = DCT_DCT; + } else if (ib >= 1 && (ib & (hmax - 1)) == 0) { + if (tx_type == ADST_ADST) tx_type = DCT_ADST; + else if (tx_type == ADST_DCT) tx_type = DCT_DCT; + } else if (ib != 0) { + tx_type = DCT_DCT; + } +#endif #else // Use ADST tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode)); @@ -517,29 +550,44 @@ static TX_TYPE get_tx_type_4x4(const MACROBLOCKD *xd, const BLOCKD *b) { return tx_type; } -static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, const BLOCKD *b) { +static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, int ib) { // TODO(debargha): explore different patterns for ADST usage when blocksize // is smaller than the prediction size TX_TYPE tx_type = DCT_DCT; - int ib = (int)(b - xd->block); - if (ib >= 16) + const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; +#if !USE_ADST_FOR_SB + if (sb_type) return tx_type; - // TODO(rbultje, debargha): Explore ADST usage for superblocks - if (xd->mode_info_context->mbmi.sb_type) +#endif + if (ib >= (16 << (2 * sb_type))) // no chroma adst return tx_type; if (xd->mode_info_context->mbmi.mode == I8X8_PRED && xd->q_index < ACTIVE_HT8) { + const BLOCKD *b = &xd->block[ib]; // TODO(rbultje): MB_PREDICTION_MODE / B_PREDICTION_MODE should be merged // or the relationship otherwise modified to address this type conversion. tx_type = txfm_map(pred_mode_conv( (MB_PREDICTION_MODE)b->bmi.as_mode.first)); } else if (xd->mode_info_context->mbmi.mode < I8X8_PRED && xd->q_index < ACTIVE_HT8) { -#if USE_ADST_FOR_I8X8_4X4 +#if USE_ADST_FOR_I16X16_8X8 #if USE_ADST_PERIPHERY_ONLY - // Use ADST for periphery blocks only - tx_type = (ib != 10) ? - txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode)) : DCT_DCT; + const int hmax = 4 << sb_type; + tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode)); +#if USE_ADST_FOR_REMOTE_EDGE + if ((ib & (hmax - 1)) != 0 && ib >= hmax) + tx_type = DCT_DCT; +#else + if (ib >= 1 && ib < hmax) { + if (tx_type == ADST_ADST) tx_type = ADST_DCT; + else if (tx_type == DCT_ADST) tx_type = DCT_DCT; + } else if (ib >= 1 && (ib & (hmax - 1)) == 0) { + if (tx_type == ADST_ADST) tx_type = DCT_ADST; + else if (tx_type == ADST_DCT) tx_type = DCT_DCT; + } else if (ib != 0) { + tx_type = DCT_DCT; + } +#endif #else // Use ADST tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode)); @@ -552,35 +600,37 @@ static TX_TYPE get_tx_type_8x8(const MACROBLOCKD *xd, const BLOCKD *b) { return tx_type; } -static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, const BLOCKD *b) { +static TX_TYPE get_tx_type_16x16(const MACROBLOCKD *xd, int ib) { TX_TYPE tx_type = DCT_DCT; - int ib = (int)(b - xd->block); - if (ib >= 16) + const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; +#if !USE_ADST_FOR_SB + if (sb_type) return tx_type; - // TODO(rbultje, debargha): Explore ADST usage for superblocks - if (xd->mode_info_context->mbmi.sb_type) +#endif + if (ib >= (16 << (2 * sb_type))) return tx_type; if (xd->mode_info_context->mbmi.mode < I8X8_PRED && xd->q_index < ACTIVE_HT16) { tx_type = txfm_map(pred_mode_conv(xd->mode_info_context->mbmi.mode)); - } - return tx_type; -} - -static TX_TYPE get_tx_type(const MACROBLOCKD *xd, const BLOCKD *b) { - TX_TYPE tx_type = DCT_DCT; - int ib = (int)(b - xd->block); - if (ib >= 16) - return tx_type; - if (xd->mode_info_context->mbmi.txfm_size == TX_16X16) { - tx_type = get_tx_type_16x16(xd, b); - } - if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) { - ib = (ib & 8) + ((ib & 4) >> 1); - tx_type = get_tx_type_8x8(xd, &xd->block[ib]); - } - if (xd->mode_info_context->mbmi.txfm_size == TX_4X4) { - tx_type = get_tx_type_4x4(xd, b); +#if USE_ADST_PERIPHERY_ONLY + if (sb_type) { + const int hmax = 4 << sb_type; +#if USE_ADST_FOR_REMOTE_EDGE + if ((ib & (hmax - 1)) != 0 && ib >= hmax) + tx_type = DCT_DCT; +#else + if (ib >= 1 && ib < hmax) { + if (tx_type == ADST_ADST) tx_type = ADST_DCT; + else if (tx_type == DCT_ADST) tx_type = DCT_DCT; + } else if (ib >= 1 && (ib & (hmax - 1)) == 0) { + if (tx_type == ADST_ADST) tx_type = DCT_ADST; + else if (tx_type == ADST_DCT) tx_type = DCT_DCT; + } else if (ib != 0) { + tx_type = DCT_DCT; + } +#endif + } +#endif } return tx_type; } diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c index 99e3c2e8c..185cc9a09 100644 --- a/vp9/common/vp9_entropymv.c +++ b/vp9/common/vp9_entropymv.c @@ -42,7 +42,9 @@ const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = { -MV_CLASS_2, -MV_CLASS_3, 10, 12, -MV_CLASS_4, -MV_CLASS_5, + 14, 16, -MV_CLASS_6, -MV_CLASS_7, + -MV_CLASS_8, -MV_CLASS_9, }; struct vp9_token_struct vp9_mv_class_encodings[MV_CLASSES]; @@ -63,9 +65,9 @@ const nmv_context vp9_default_nmv_context = { { { /* vert component */ 128, /* sign */ - {224, 144, 192, 168, 192, 176, 192}, /* class */ + {224, 144, 192, 168, 192, 176, 192, 198, 198}, /* class */ {216}, /* class0 */ - {136, 140, 148, 160, 176, 192, 224}, /* bits */ + {136, 140, 148, 160, 176, 192, 224, 234, 234}, /* bits */ {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */ {64, 96, 64}, /* fp */ 160, /* class0_hp bit */ @@ -73,9 +75,9 @@ const nmv_context vp9_default_nmv_context = { }, { /* hor component */ 128, /* sign */ - {216, 128, 176, 160, 176, 176, 192}, /* class */ + {216, 128, 176, 160, 176, 176, 192, 198, 198}, /* class */ {208}, /* class0 */ - {136, 140, 148, 160, 176, 192, 224}, /* bits */ + {136, 140, 148, 160, 176, 192, 224, 234, 234}, /* bits */ {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */ {64, 96, 64}, /* fp */ 160, /* class0_hp bit */ @@ -103,6 +105,8 @@ MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { else if (z < CLASS0_SIZE * 256) c = MV_CLASS_5; else if (z < CLASS0_SIZE * 512) c = MV_CLASS_6; else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7; + else if (z < CLASS0_SIZE * 2048) c = MV_CLASS_8; + else if (z < CLASS0_SIZE * 4096) c = MV_CLASS_9; else assert(0); if (offset) *offset = z - mv_class_base(c); diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h index f5cfee937..33500069e 100644 --- a/vp9/common/vp9_entropymv.h +++ b/vp9/common/vp9_entropymv.h @@ -49,7 +49,7 @@ extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2]; extern struct vp9_token_struct vp9_mv_joint_encodings [MV_JOINTS]; /* Symbols for coding magnitude class of nonzero components */ -#define MV_CLASSES 8 +#define MV_CLASSES 10 typedef enum { MV_CLASS_0 = 0, /* (0, 2] integer pel */ MV_CLASS_1 = 1, /* (2, 4] integer pel */ @@ -59,6 +59,8 @@ typedef enum { MV_CLASS_5 = 5, /* (32, 64] integer pel */ MV_CLASS_6 = 6, /* (64, 128] integer pel */ MV_CLASS_7 = 7, /* (128, 256] integer pel */ + MV_CLASS_8 = 8, /* (256, 512] integer pel */ + MV_CLASS_9 = 9, /* (512, 1024] integer pel */ } MV_CLASS_TYPE; extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2]; diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c index a26415fc3..a03a66e33 100644 --- a/vp9/common/vp9_invtrans.c +++ b/vp9/common/vp9_invtrans.c @@ -24,7 +24,7 @@ void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) { int i; for (i = 0; i < 16; i++) { - TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]); + TX_TYPE tx_type = get_tx_type_4x4(xd, i); if (tx_type != DCT_DCT) { vp9_short_iht4x4(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type); } else { @@ -58,7 +58,7 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) { BLOCKD *blockd = xd->block; for (i = 0; i < 9; i += 8) { - TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]); + TX_TYPE tx_type = get_tx_type_8x8(xd, i); if (tx_type != DCT_DCT) { vp9_short_iht8x8(xd->block[i].dqcoeff, xd->block[i].diff, 16, tx_type); } else { @@ -67,7 +67,7 @@ void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) { } } for (i = 2; i < 11; i += 8) { - TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[i]); + TX_TYPE tx_type = get_tx_type_8x8(xd, i); if (tx_type != DCT_DCT) { vp9_short_iht8x8(xd->block[i + 2].dqcoeff, xd->block[i].diff, 16, tx_type); @@ -100,7 +100,7 @@ void vp9_inverse_transform_b_16x16(int16_t *input_dqcoeff, void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd) { BLOCKD *bd = &xd->block[0]; - TX_TYPE tx_type = get_tx_type_16x16(xd, bd); + TX_TYPE tx_type = get_tx_type_16x16(xd, 0); if (tx_type != DCT_DCT) { vp9_short_iht16x16(bd->dqcoeff, bd->diff, 16, tx_type); } else { @@ -123,9 +123,16 @@ void vp9_inverse_transform_sby_16x16(MACROBLOCKD *xd) { for (n = 0; n < 4; n++) { const int x_idx = n & 1, y_idx = n >> 1; + const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 8 + x_idx) * 4); - vp9_inverse_transform_b_16x16(xd->dqcoeff + n * 256, - xd->diff + x_idx * 16 + y_idx * 32 * 16, 64); + if (tx_type == DCT_DCT) { + vp9_inverse_transform_b_16x16(xd->dqcoeff + n * 256, + xd->diff + x_idx * 16 + y_idx * 32 * 16, + 64); + } else { + vp9_short_iht16x16(xd->dqcoeff + n * 256, + xd->diff + x_idx * 16 + y_idx * 32 * 16, 32, tx_type); + } } } @@ -134,9 +141,15 @@ void vp9_inverse_transform_sby_8x8(MACROBLOCKD *xd) { for (n = 0; n < 16; n++) { const int x_idx = n & 3, y_idx = n >> 2; + const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 8 + x_idx) * 2); - vp9_inverse_transform_b_8x8(xd->dqcoeff + n * 64, - xd->diff + x_idx * 8 + y_idx * 32 * 8, 64); + if (tx_type == DCT_DCT) { + vp9_inverse_transform_b_8x8(xd->dqcoeff + n * 64, + xd->diff + x_idx * 8 + y_idx * 32 * 8, 64); + } else { + vp9_short_iht8x8(xd->dqcoeff + n * 64, + xd->diff + x_idx * 8 + y_idx * 32 * 8, 32, tx_type); + } } } @@ -145,9 +158,15 @@ void vp9_inverse_transform_sby_4x4(MACROBLOCKD *xd) { for (n = 0; n < 64; n++) { const int x_idx = n & 7, y_idx = n >> 3; + const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 8 + x_idx); - vp9_inverse_transform_b_4x4(xd, xd->eobs[n], xd->dqcoeff + n * 16, - xd->diff + x_idx * 4 + y_idx * 4 * 32, 64); + if (tx_type == DCT_DCT) { + vp9_inverse_transform_b_4x4(xd, xd->eobs[n], xd->dqcoeff + n * 16, + xd->diff + x_idx * 4 + y_idx * 4 * 32, 64); + } else { + vp9_short_iht4x4(xd->dqcoeff + n * 16, + xd->diff + x_idx * 4 + y_idx * 4 * 32, 32, tx_type); + } } } @@ -206,9 +225,16 @@ void vp9_inverse_transform_sb64y_16x16(MACROBLOCKD *xd) { for (n = 0; n < 16; n++) { const int x_idx = n & 3, y_idx = n >> 2; + const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 16 + x_idx) * 4); - vp9_inverse_transform_b_16x16(xd->dqcoeff + n * 256, - xd->diff + x_idx * 16 + y_idx * 64 * 16, 128); + if (tx_type == DCT_DCT) { + vp9_inverse_transform_b_16x16(xd->dqcoeff + n * 256, + xd->diff + x_idx * 16 + y_idx * 64 * 16, + 128); + } else { + vp9_short_iht16x16(xd->dqcoeff + n * 256, + xd->diff + x_idx * 16 + y_idx * 64 * 16, 64, tx_type); + } } } @@ -217,9 +243,15 @@ void vp9_inverse_transform_sb64y_8x8(MACROBLOCKD *xd) { for (n = 0; n < 64; n++) { const int x_idx = n & 7, y_idx = n >> 3; + const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 16 + x_idx) * 2); - vp9_inverse_transform_b_8x8(xd->dqcoeff + n * 64, - xd->diff + x_idx * 8 + y_idx * 64 * 8, 128); + if (tx_type == DCT_DCT) { + vp9_inverse_transform_b_8x8(xd->dqcoeff + n * 64, + xd->diff + x_idx * 8 + y_idx * 64 * 8, 128); + } else { + vp9_short_iht8x8(xd->dqcoeff + n * 64, + xd->diff + x_idx * 8 + y_idx * 64 * 8, 64, tx_type); + } } } @@ -228,9 +260,15 @@ void vp9_inverse_transform_sb64y_4x4(MACROBLOCKD *xd) { for (n = 0; n < 256; n++) { const int x_idx = n & 15, y_idx = n >> 4; + const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 16 + x_idx); - vp9_inverse_transform_b_4x4(xd, xd->eobs[n], xd->dqcoeff + n * 16, - xd->diff + x_idx * 4 + y_idx * 4 * 64, 128); + if (tx_type == DCT_DCT) { + vp9_inverse_transform_b_4x4(xd, xd->eobs[n], xd->dqcoeff + n * 16, + xd->diff + x_idx * 4 + y_idx * 4 * 64, 128); + } else { + vp9_short_iht4x4(xd->dqcoeff + n * 16, + xd->diff + x_idx * 4 + y_idx * 4 * 64, 64, tx_type); + } } } diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 87628659b..48ae860a9 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -151,6 +151,15 @@ specialize vp9_add_residual_16x16 sse2 prototype void vp9_add_residual_32x32 "const int16_t *diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride" specialize vp9_add_residual_32x32 sse2 + +prototype void vp9_add_constant_residual_8x8 "const int16_t diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride" +specialize vp9_add_constant_residual_8x8 sse2 + +prototype void vp9_add_constant_residual_16x16 "const int16_t diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride" +specialize vp9_add_constant_residual_16x16 sse2 + +prototype void vp9_add_constant_residual_32x32 "const int16_t diff, const uint8_t *pred, int pitch, uint8_t *dest, int stride" +specialize vp9_add_constant_residual_32x32 sse2 fi # diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index 8dfb3e851..74b882c91 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -201,8 +201,7 @@ static void skip_recon_mb(VP9D_COMP *pbi, MACROBLOCKD *xd, static void decode_16x16(VP9D_COMP *pbi, MACROBLOCKD *xd, BOOL_DECODER* const bc) { - BLOCKD *bd = &xd->block[0]; - TX_TYPE tx_type = get_tx_type_16x16(xd, bd); + TX_TYPE tx_type = get_tx_type_16x16(xd, 0); #ifdef DEC_DEBUG if (dec_debug) { int i; @@ -240,7 +239,7 @@ static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd, BOOL_DECODER* const bc) { // First do Y // if the first one is DCT_DCT assume all the rest are as well - TX_TYPE tx_type = get_tx_type_8x8(xd, &xd->block[0]); + TX_TYPE tx_type = get_tx_type_8x8(xd, 0); #ifdef DEC_DEBUG if (dec_debug) { int i; @@ -267,7 +266,7 @@ static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd, int i8x8mode = b->bmi.as_mode.first; vp9_intra8x8_predict(xd, b, i8x8mode, b->predictor); } - tx_type = get_tx_type_8x8(xd, &xd->block[ib]); + tx_type = get_tx_type_8x8(xd, ib); if (tx_type != DCT_DCT) { vp9_ht_dequant_idct_add_8x8_c(tx_type, q, dq, pre, dst, 16, stride, xd->eobs[idx]); @@ -341,7 +340,7 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd, vp9_intra8x8_predict(xd, b, i8x8mode, b->predictor); for (j = 0; j < 4; j++) { b = &xd->block[ib + iblock[j]]; - tx_type = get_tx_type_4x4(xd, b); + tx_type = get_tx_type_4x4(xd, ib + iblock[j]); if (tx_type != DCT_DCT) { vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff, b->dequant, b->predictor, @@ -375,7 +374,7 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd, eobtotal += vp9_decode_coefs_4x4(pbi, xd, bc, PLANE_TYPE_Y_WITH_DC, i); vp9_intra4x4_predict(xd, b, b_mode, b->predictor); - tx_type = get_tx_type_4x4(xd, b); + tx_type = get_tx_type_4x4(xd, i); if (tx_type != DCT_DCT) { vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff, b->dequant, b->predictor, @@ -397,7 +396,7 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd, xd->dst.v_buffer, xd->dst.uv_stride, xd); - } else if (mode == SPLITMV || get_tx_type_4x4(xd, &xd->block[0]) == DCT_DCT) { + } else if (mode == SPLITMV || get_tx_type_4x4(xd, 0) == DCT_DCT) { xd->itxm_add_y_block(xd->qcoeff, xd->block[0].dequant, xd->predictor, @@ -431,7 +430,7 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd, #endif for (i = 0; i < 16; i++) { BLOCKD *b = &xd->block[i]; - tx_type = get_tx_type_4x4(xd, b); + tx_type = get_tx_type_4x4(xd, i); if (tx_type != DCT_DCT) { vp9_ht_dequant_idct_add_c(tx_type, b->qcoeff, b->dequant, b->predictor, @@ -517,13 +516,24 @@ static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd, xd->block[20].dequant, xd->dst.v_buffer, xd->dst.v_buffer, xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320]); break; - case TX_16X16: // FIXME(rbultje): adst + case TX_16X16: for (n = 0; n < 16; n++) { const int x_idx = n & 3, y_idx = n >> 2; - vp9_dequant_idct_add_16x16(xd->qcoeff + n * 256, xd->block[0].dequant, - xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16, - xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16, - xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]); + const TX_TYPE tx_type = get_tx_type_16x16(xd, + (y_idx * 16 + x_idx) * 4); + if (tx_type == DCT_DCT) { + vp9_dequant_idct_add_16x16(xd->qcoeff + n * 256, + xd->block[0].dequant, + xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16, + xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]); + } else { + vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff + n * 256, + xd->block[0].dequant, + xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16, + xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]); + } } for (n = 0; n < 4; n++) { const int x_idx = n & 1, y_idx = n >> 1; @@ -539,13 +549,23 @@ static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd, xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320 + n * 16]); } break; - case TX_8X8: // FIXME(rbultje): adst + case TX_8X8: for (n = 0; n < 64; n++) { const int x_idx = n & 7, y_idx = n >> 3; - vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64, xd->block[0].dequant, - xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8, - xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8, - xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]); + const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 16 + x_idx) * 2); + if (tx_type == DCT_DCT) { + vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64, + xd->block[0].dequant, + xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8, + xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]); + } else { + vp9_ht_dequant_idct_add_8x8_c(tx_type, xd->qcoeff + n * 64, + xd->block[0].dequant, + xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8, + xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]); + } } for (n = 0; n < 16; n++) { const int x_idx = n & 3, y_idx = n >> 2; @@ -561,13 +581,22 @@ static void decode_superblock64(VP9D_COMP *pbi, MACROBLOCKD *xd, xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[320 + n * 4]); } break; - case TX_4X4: // FIXME(rbultje): adst + case TX_4X4: for (n = 0; n < 256; n++) { const int x_idx = n & 15, y_idx = n >> 4; - xd->itxm_add(xd->qcoeff + n * 16, xd->block[0].dequant, - xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4, - xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4, - xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]); + const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 16 + x_idx); + if (tx_type == DCT_DCT) { + xd->itxm_add(xd->qcoeff + n * 16, xd->block[0].dequant, + xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4, + xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]); + } else { + vp9_ht_dequant_idct_add_c(tx_type, xd->qcoeff + n * 16, + xd->block[0].dequant, + xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4, + xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]); + } } for (n = 0; n < 64; n++) { const int x_idx = n & 7, y_idx = n >> 3; @@ -649,14 +678,24 @@ static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd, xd->dst.v_buffer, xd->dst.uv_stride, xd); break; - case TX_16X16: // FIXME(rbultje): adst + case TX_16X16: for (n = 0; n < 4; n++) { const int x_idx = n & 1, y_idx = n >> 1; - vp9_dequant_idct_add_16x16( - xd->qcoeff + n * 256, xd->block[0].dequant, - xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16, - xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16, - xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]); + const TX_TYPE tx_type = get_tx_type_16x16(xd, + (y_idx * 8 + x_idx) * 4); + if (tx_type == DCT_DCT) { + vp9_dequant_idct_add_16x16( + xd->qcoeff + n * 256, xd->block[0].dequant, + xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16, + xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]); + } else { + vp9_ht_dequant_idct_add_16x16_c(tx_type, xd->qcoeff + n * 256, + xd->block[0].dequant, + xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16, + xd->dst.y_buffer + y_idx * 16 * xd->dst.y_stride + x_idx * 16, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 16]); + } } vp9_dequant_idct_add_uv_block_16x16_c(xd->qcoeff + 1024, xd->block[16].dequant, @@ -664,13 +703,23 @@ static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd, xd->dst.v_buffer, xd->dst.uv_stride, xd); break; - case TX_8X8: // FIXME(rbultje): adst + case TX_8X8: for (n = 0; n < 16; n++) { const int x_idx = n & 3, y_idx = n >> 2; - vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64, xd->block[0].dequant, - xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8, - xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8, - xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]); + const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 8 + x_idx) * 2); + if (tx_type == DCT_DCT) { + vp9_dequant_idct_add_8x8_c(xd->qcoeff + n * 64, + xd->block[0].dequant, + xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8, + xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]); + } else { + vp9_ht_dequant_idct_add_8x8_c(tx_type, xd->qcoeff + n * 64, + xd->block[0].dequant, + xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8, + xd->dst.y_buffer + y_idx * 8 * xd->dst.y_stride + x_idx * 8, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n * 4]); + } } for (n = 0; n < 4; n++) { const int x_idx = n & 1, y_idx = n >> 1; @@ -686,13 +735,22 @@ static void decode_superblock32(VP9D_COMP *pbi, MACROBLOCKD *xd, xd->dst.uv_stride, xd->dst.uv_stride, xd->eobs[80 + n * 4]); } break; - case TX_4X4: // FIXME(rbultje): adst + case TX_4X4: for (n = 0; n < 64; n++) { const int x_idx = n & 7, y_idx = n >> 3; - xd->itxm_add(xd->qcoeff + n * 16, xd->block[0].dequant, - xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4, - xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4, - xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]); + const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 8 + x_idx); + if (tx_type == DCT_DCT) { + xd->itxm_add(xd->qcoeff + n * 16, xd->block[0].dequant, + xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4, + xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]); + } else { + vp9_ht_dequant_idct_add_c(tx_type, xd->qcoeff + n * 16, + xd->block[0].dequant, + xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4, + xd->dst.y_buffer + y_idx * 4 * xd->dst.y_stride + x_idx * 4, + xd->dst.y_stride, xd->dst.y_stride, xd->eobs[n]); + } } for (n = 0; n < 16; n++) { const int x_idx = n & 3, y_idx = n >> 2; diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c index dade2aff5..92b78ed19 100644 --- a/vp9/decoder/vp9_dequantize.c +++ b/vp9/decoder/vp9_dequantize.c @@ -64,12 +64,27 @@ static void add_constant_residual(const int16_t diff, const uint8_t *pred, } } +void vp9_add_constant_residual_8x8_c(const int16_t diff, const uint8_t *pred, + int pitch, uint8_t *dest, int stride) { + add_constant_residual(diff, pred, pitch, dest, stride, 8, 8); +} + +void vp9_add_constant_residual_16x16_c(const int16_t diff, const uint8_t *pred, + int pitch, uint8_t *dest, int stride) { + add_constant_residual(diff, pred, pitch, dest, stride, 16, 16); +} + +void vp9_add_constant_residual_32x32_c(const int16_t diff, const uint8_t *pred, + int pitch, uint8_t *dest, int stride) { + add_constant_residual(diff, pred, pitch, dest, stride, 32, 32); +} + void vp9_ht_dequant_idct_add_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob) { int i; - int16_t output[16]; + DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16); for (i = 0; i < 16; i++) input[i] *= dq[i]; @@ -83,7 +98,7 @@ void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob) { - int16_t output[64]; + DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64); if (eob == 0) { // All 0 DCT coefficients @@ -104,7 +119,7 @@ void vp9_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, int16_t *input, void vp9_dequant_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob) { int i; - int16_t output[16]; + DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16); if (eob > 1) { for (i = 0; i < 16; i++) @@ -125,7 +140,7 @@ void vp9_dequant_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred, void vp9_dequant_dc_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int dc) { int i; - int16_t output[16]; + DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16); input[0] = dc; @@ -142,7 +157,7 @@ void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob) { int i; - int16_t output[16]; + DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16); if (eob > 1) { for (i = 0; i < 16; i++) @@ -164,7 +179,7 @@ void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq, uint8_t *dest, int pitch, int stride, int dc) { int i; - int16_t output[16]; + DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16); input[0] = dc; @@ -179,8 +194,7 @@ void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq, void vp9_dequant_idct_add_8x8_c(int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob) { - int16_t output[64]; - + DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64); // If dc is 1, then input[0] is the reconstructed value, do not need // dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1. @@ -203,7 +217,7 @@ void vp9_dequant_idct_add_8x8_c(int16_t *input, const int16_t *dq, vp9_short_idct1_8x8_c(&in, &out); input[0] = 0; - add_constant_residual(out, pred, pitch, dest, stride, 8, 8); + vp9_add_constant_residual_8x8(out, pred, pitch, dest, stride); } else if (eob <= 10) { input[1] *= dq[1]; input[2] *= dq[1]; @@ -241,7 +255,7 @@ void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob) { - int16_t output[256]; + DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256); if (eob == 0) { // All 0 DCT coefficients @@ -270,7 +284,7 @@ void vp9_ht_dequant_idct_add_16x16_c(TX_TYPE tx_type, int16_t *input, void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob) { - int16_t output[256]; + DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256); /* The calculation can be simplified if there are not many non-zero dct * coefficients. Use eobs to separate different cases. */ @@ -286,7 +300,7 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq, vp9_short_idct1_16x16_c(&in, &out); input[0] = 0; - add_constant_residual(out, pred, pitch, dest, stride, 16, 16); + vp9_add_constant_residual_16x16(out, pred, pitch, dest, stride); } else if (eob <= 10) { input[0] *= dq[0]; @@ -330,13 +344,13 @@ void vp9_dequant_idct_add_16x16_c(int16_t *input, const int16_t *dq, void vp9_dequant_idct_add_32x32_c(int16_t *input, const int16_t *dq, uint8_t *pred, uint8_t *dest, int pitch, int stride, int eob) { - int16_t output[1024]; + DECLARE_ALIGNED_ARRAY(16, int16_t, output, 1024); if (eob) { input[0] = input[0] * dq[0] / 2; if (eob == 1) { vp9_short_idct1_32x32(input, output); - add_constant_residual(output[0], pred, pitch, dest, stride, 32, 32); + vp9_add_constant_residual_32x32(output[0], pred, pitch, dest, stride); input[0] = 0; } else if (eob <= 10) { input[1] = input[1] * dq[1] / 2; diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index a53edfc3c..0a584d79d 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -96,9 +96,8 @@ static int get_signed(BOOL_DECODER *br, int value_to_sign) { static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd, BOOL_DECODER* const br, int block_idx, - PLANE_TYPE type, TX_TYPE tx_type, - int seg_eob, int16_t *qcoeff_ptr, - const int *const scan, TX_SIZE txfm_size) { + PLANE_TYPE type, int seg_eob, int16_t *qcoeff_ptr, + TX_SIZE txfm_size) { ENTROPY_CONTEXT* const A0 = (ENTROPY_CONTEXT *) xd->above_context; ENTROPY_CONTEXT* const L0 = (ENTROPY_CONTEXT *) xd->left_context; int aidx, lidx; @@ -114,6 +113,7 @@ static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd, uint16_t nzc = 0; uint16_t nzc_expected = xd->mode_info_context->mbmi.nzcs[block_idx]; #endif + const int *scan; if (xd->mode_info_context->mbmi.sb_type == BLOCK_SIZE_SB64X64) { aidx = vp9_block2above_sb64[txfm_size][block_idx]; @@ -128,19 +128,34 @@ static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd, switch (txfm_size) { default: - case TX_4X4: + case TX_4X4: { + const TX_TYPE tx_type = get_tx_type_4x4(xd, block_idx); + switch (tx_type) { + default: + scan = vp9_default_zig_zag1d_4x4; + break; + case ADST_DCT: + scan = vp9_row_scan_4x4; + break; + case DCT_ADST: + scan = vp9_col_scan_4x4; + break; + } above_ec = A0[aidx] != 0; left_ec = L0[lidx] != 0; coef_probs = fc->coef_probs_4x4; coef_counts = fc->coef_counts_4x4; break; + } case TX_8X8: + scan = vp9_default_zig_zag1d_8x8; coef_probs = fc->coef_probs_8x8; coef_counts = fc->coef_counts_8x8; above_ec = (A0[aidx] + A0[aidx + 1]) != 0; left_ec = (L0[lidx] + L0[lidx + 1]) != 0; break; case TX_16X16: + scan = vp9_default_zig_zag1d_16x16; coef_probs = fc->coef_probs_16x16; coef_counts = fc->coef_counts_16x16; if (type == PLANE_TYPE_UV) { @@ -154,6 +169,7 @@ static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd, } break; case TX_32X32: + scan = vp9_default_zig_zag1d_32x32; coef_probs = fc->coef_probs_32x32; coef_counts = fc->coef_counts_32x32; if (type == PLANE_TYPE_UV) { @@ -318,17 +334,15 @@ int vp9_decode_sb_tokens(VP9D_COMP* const pbi, case TX_32X32: // Luma block c = decode_coefs(pbi, xd, bc, 0, PLANE_TYPE_Y_WITH_DC, - DCT_DCT, get_eob(xd, segment_id, 1024), - xd->qcoeff, vp9_default_zig_zag1d_32x32, TX_32X32); + get_eob(xd, segment_id, 1024), xd->qcoeff, TX_32X32); xd->eobs[0] = c; eobtotal += c; // 16x16 chroma blocks seg_eob = get_eob(xd, segment_id, 256); for (i = 64; i < 96; i += 16) { - c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob, - xd->qcoeff + i * 16, - vp9_default_zig_zag1d_16x16, TX_16X16); + c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob, + xd->qcoeff + i * 16, TX_16X16); xd->eobs[i] = c; eobtotal += c; } @@ -338,17 +352,15 @@ int vp9_decode_sb_tokens(VP9D_COMP* const pbi, seg_eob = get_eob(xd, segment_id, 256); for (i = 0; i < 64; i += 16) { c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC, - DCT_DCT, seg_eob, xd->qcoeff + i * 16, - vp9_default_zig_zag1d_16x16, TX_16X16); + seg_eob, xd->qcoeff + i * 16, TX_16X16); xd->eobs[i] = c; eobtotal += c; } // 16x16 chroma blocks for (i = 64; i < 96; i += 16) { - c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob, - xd->qcoeff + i * 16, - vp9_default_zig_zag1d_16x16, TX_16X16); + c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob, + xd->qcoeff + i * 16, TX_16X16); xd->eobs[i] = c; eobtotal += c; } @@ -358,17 +370,15 @@ int vp9_decode_sb_tokens(VP9D_COMP* const pbi, seg_eob = get_eob(xd, segment_id, 64); for (i = 0; i < 64; i += 4) { c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC, - DCT_DCT, seg_eob, xd->qcoeff + i * 16, - vp9_default_zig_zag1d_8x8, TX_8X8); + seg_eob, xd->qcoeff + i * 16, TX_8X8); xd->eobs[i] = c; eobtotal += c; } // 8x8 chroma blocks for (i = 64; i < 96; i += 4) { - c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob, - xd->qcoeff + i * 16, - vp9_default_zig_zag1d_8x8, TX_8X8); + c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob, + xd->qcoeff + i * 16, TX_8X8); xd->eobs[i] = c; eobtotal += c; } @@ -378,17 +388,15 @@ int vp9_decode_sb_tokens(VP9D_COMP* const pbi, seg_eob = get_eob(xd, segment_id, 16); for (i = 0; i < 64; i++) { c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC, - DCT_DCT, seg_eob, xd->qcoeff + i * 16, - vp9_default_zig_zag1d_4x4, TX_4X4); + seg_eob, xd->qcoeff + i * 16, TX_4X4); xd->eobs[i] = c; eobtotal += c; } // 4x4 chroma blocks for (i = 64; i < 96; i++) { - c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob, - xd->qcoeff + i * 16, - vp9_default_zig_zag1d_4x4, TX_4X4); + c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob, + xd->qcoeff + i * 16, TX_4X4); xd->eobs[i] = c; eobtotal += c; } @@ -411,17 +419,15 @@ int vp9_decode_sb64_tokens(VP9D_COMP* const pbi, seg_eob = get_eob(xd, segment_id, 1024); for (i = 0; i < 256; i += 64) { c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC, - DCT_DCT, seg_eob, xd->qcoeff + i * 16, - vp9_default_zig_zag1d_32x32, TX_32X32); + seg_eob, xd->qcoeff + i * 16, TX_32X32); xd->eobs[i] = c; eobtotal += c; } // 32x32 chroma blocks for (i = 256; i < 384; i += 64) { - c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob, - xd->qcoeff + i * 16, - vp9_default_zig_zag1d_32x32, TX_32X32); + c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob, + xd->qcoeff + i * 16, TX_32X32); xd->eobs[i] = c; eobtotal += c; } @@ -431,17 +437,15 @@ int vp9_decode_sb64_tokens(VP9D_COMP* const pbi, seg_eob = get_eob(xd, segment_id, 256); for (i = 0; i < 256; i += 16) { c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC, - DCT_DCT, seg_eob, xd->qcoeff + i * 16, - vp9_default_zig_zag1d_16x16, TX_16X16); + seg_eob, xd->qcoeff + i * 16, TX_16X16); xd->eobs[i] = c; eobtotal += c; } // 16x16 chroma blocks for (i = 256; i < 384; i += 16) { - c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob, - xd->qcoeff + i * 16, - vp9_default_zig_zag1d_16x16, TX_16X16); + c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob, + xd->qcoeff + i * 16, TX_16X16); xd->eobs[i] = c; eobtotal += c; } @@ -451,17 +455,15 @@ int vp9_decode_sb64_tokens(VP9D_COMP* const pbi, seg_eob = get_eob(xd, segment_id, 64); for (i = 0; i < 256; i += 4) { c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC, - DCT_DCT, seg_eob, xd->qcoeff + i * 16, - vp9_default_zig_zag1d_8x8, TX_8X8); + seg_eob, xd->qcoeff + i * 16, TX_8X8); xd->eobs[i] = c; eobtotal += c; } // 8x8 chroma blocks for (i = 256; i < 384; i += 4) { - c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob, - xd->qcoeff + i * 16, - vp9_default_zig_zag1d_8x8, TX_8X8); + c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob, + xd->qcoeff + i * 16, TX_8X8); xd->eobs[i] = c; eobtotal += c; } @@ -471,17 +473,15 @@ int vp9_decode_sb64_tokens(VP9D_COMP* const pbi, seg_eob = get_eob(xd, segment_id, 16); for (i = 0; i < 256; i++) { c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC, - DCT_DCT, seg_eob, xd->qcoeff + i * 16, - vp9_default_zig_zag1d_4x4, TX_4X4); + seg_eob, xd->qcoeff + i * 16, TX_4X4); xd->eobs[i] = c; eobtotal += c; } // 4x4 chroma blocks for (i = 256; i < 384; i++) { - c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, DCT_DCT, seg_eob, - xd->qcoeff + i * 16, - vp9_default_zig_zag1d_4x4, TX_4X4); + c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, seg_eob, + xd->qcoeff + i * 16, TX_4X4); xd->eobs[i] = c; eobtotal += c; } @@ -500,9 +500,7 @@ static int vp9_decode_mb_tokens_16x16(VP9D_COMP* const pbi, // Luma block int c = decode_coefs(pbi, xd, bc, 0, PLANE_TYPE_Y_WITH_DC, - get_tx_type(xd, &xd->block[0]), - get_eob(xd, segment_id, 256), - xd->qcoeff, vp9_default_zig_zag1d_16x16, TX_16X16); + get_eob(xd, segment_id, 256), xd->qcoeff, TX_16X16); xd->eobs[0] = c; eobtotal += c; @@ -510,8 +508,7 @@ static int vp9_decode_mb_tokens_16x16(VP9D_COMP* const pbi, seg_eob = get_eob(xd, segment_id, 64); for (i = 16; i < 24; i += 4) { c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, - DCT_DCT, seg_eob, xd->block[i].qcoeff, - vp9_default_zig_zag1d_8x8, TX_8X8); + seg_eob, xd->block[i].qcoeff, TX_8X8); xd->eobs[i] = c; eobtotal += c; } @@ -528,9 +525,7 @@ static int vp9_decode_mb_tokens_8x8(VP9D_COMP* const pbi, seg_eob = get_eob(xd, segment_id, 64); for (i = 0; i < 16; i += 4) { c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_Y_WITH_DC, - get_tx_type(xd, xd->block + i), - seg_eob, xd->block[i].qcoeff, - vp9_default_zig_zag1d_8x8, TX_8X8); + seg_eob, xd->block[i].qcoeff, TX_8X8); xd->eobs[i] = c; eobtotal += c; } @@ -542,16 +537,14 @@ static int vp9_decode_mb_tokens_8x8(VP9D_COMP* const pbi, seg_eob = get_eob(xd, segment_id, 16); for (i = 16; i < 24; i++) { c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, - DCT_DCT, seg_eob, xd->block[i].qcoeff, - vp9_default_zig_zag1d_4x4, TX_4X4); + seg_eob, xd->block[i].qcoeff, TX_4X4); xd->eobs[i] = c; eobtotal += c; } } else { for (i = 16; i < 24; i += 4) { c = decode_coefs(pbi, xd, bc, i, PLANE_TYPE_UV, - DCT_DCT, seg_eob, xd->block[i].qcoeff, - vp9_default_zig_zag1d_8x8, TX_8X8); + seg_eob, xd->block[i].qcoeff, TX_8X8); xd->eobs[i] = c; eobtotal += c; } @@ -562,43 +555,20 @@ static int vp9_decode_mb_tokens_8x8(VP9D_COMP* const pbi, static int decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd, BOOL_DECODER* const bc, - PLANE_TYPE type, int i, int seg_eob, - TX_TYPE tx_type, const int *scan) { - int c = decode_coefs(dx, xd, bc, i, type, tx_type, seg_eob, - xd->block[i].qcoeff, scan, TX_4X4); + PLANE_TYPE type, int i, int seg_eob) { + int c = decode_coefs(dx, xd, bc, i, type, seg_eob, + xd->block[i].qcoeff, TX_4X4); xd->eobs[i] = c; return c; } -static int decode_coefs_4x4_y(VP9D_COMP *dx, MACROBLOCKD *xd, - BOOL_DECODER* const bc, - PLANE_TYPE type, int i, int seg_eob) { - const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? - get_tx_type(xd, &xd->block[i]) : DCT_DCT; - const int *scan; - - switch (tx_type) { - case ADST_DCT: - scan = vp9_row_scan_4x4; - break; - case DCT_ADST: - scan = vp9_col_scan_4x4; - break; - default: - scan = vp9_default_zig_zag1d_4x4; - break; - } - - return decode_coefs_4x4(dx, xd, bc, type, i, seg_eob, tx_type, scan); -} - int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd, BOOL_DECODER* const bc, PLANE_TYPE type, int i) { const int segment_id = xd->mode_info_context->mbmi.segment_id; const int seg_eob = get_eob(xd, segment_id, 16); - return decode_coefs_4x4_y(dx, xd, bc, type, i, seg_eob); + return decode_coefs_4x4(dx, xd, bc, type, i, seg_eob); } static int decode_mb_tokens_4x4_uv(VP9D_COMP* const dx, @@ -609,8 +579,7 @@ static int decode_mb_tokens_4x4_uv(VP9D_COMP* const dx, // chroma blocks for (i = 16; i < 24; i++) { - eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_UV, i, seg_eob, - DCT_DCT, vp9_default_zig_zag1d_4x4); + eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_UV, i, seg_eob); } return eobtotal; @@ -634,8 +603,7 @@ static int vp9_decode_mb_tokens_4x4(VP9D_COMP* const dx, // luma blocks for (i = 0; i < 16; ++i) { - eobtotal += decode_coefs_4x4_y(dx, xd, bc, - PLANE_TYPE_Y_WITH_DC, i, seg_eob); + eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_Y_WITH_DC, i, seg_eob); } // chroma blocks diff --git a/vp9/decoder/x86/vp9_dequantize_x86.c b/vp9/decoder/x86/vp9_dequantize_x86.c index c225deca5..acfae2a27 100644 --- a/vp9/decoder/x86/vp9_dequantize_x86.c +++ b/vp9/decoder/x86/vp9_dequantize_x86.c @@ -67,14 +67,14 @@ void vp9_add_residual_8x8_sse2(const int16_t *diff, const uint8_t *pred, const __m128i zero = _mm_setzero_si128(); // Diff data - const __m128i d0 = _mm_loadu_si128((const __m128i *)(diff + 0 * width)); - const __m128i d1 = _mm_loadu_si128((const __m128i *)(diff + 1 * width)); - const __m128i d2 = _mm_loadu_si128((const __m128i *)(diff + 2 * width)); - const __m128i d3 = _mm_loadu_si128((const __m128i *)(diff + 3 * width)); - const __m128i d4 = _mm_loadu_si128((const __m128i *)(diff + 4 * width)); - const __m128i d5 = _mm_loadu_si128((const __m128i *)(diff + 5 * width)); - const __m128i d6 = _mm_loadu_si128((const __m128i *)(diff + 6 * width)); - const __m128i d7 = _mm_loadu_si128((const __m128i *)(diff + 7 * width)); + const __m128i d0 = _mm_load_si128((const __m128i *)(diff + 0 * width)); + const __m128i d1 = _mm_load_si128((const __m128i *)(diff + 1 * width)); + const __m128i d2 = _mm_load_si128((const __m128i *)(diff + 2 * width)); + const __m128i d3 = _mm_load_si128((const __m128i *)(diff + 3 * width)); + const __m128i d4 = _mm_load_si128((const __m128i *)(diff + 4 * width)); + const __m128i d5 = _mm_load_si128((const __m128i *)(diff + 5 * width)); + const __m128i d6 = _mm_load_si128((const __m128i *)(diff + 6 * width)); + const __m128i d7 = _mm_load_si128((const __m128i *)(diff + 7 * width)); // Prediction data. __m128i p0 = _mm_loadl_epi64((const __m128i *)(pred + 0 * pitch)); @@ -137,14 +137,14 @@ void vp9_add_residual_16x16_sse2(const int16_t *diff, const uint8_t *pred, __m128i p0, p1, p2, p3, p4, p5, p6, p7; do { - d0 = _mm_loadu_si128((const __m128i *)(diff + 0 * width)); - d1 = _mm_loadu_si128((const __m128i *)(diff + 0 * width + 8)); - d2 = _mm_loadu_si128((const __m128i *)(diff + 1 * width)); - d3 = _mm_loadu_si128((const __m128i *)(diff + 1 * width + 8)); - d4 = _mm_loadu_si128((const __m128i *)(diff + 2 * width)); - d5 = _mm_loadu_si128((const __m128i *)(diff + 2 * width + 8)); - d6 = _mm_loadu_si128((const __m128i *)(diff + 3 * width)); - d7 = _mm_loadu_si128((const __m128i *)(diff + 3 * width + 8)); + d0 = _mm_load_si128((const __m128i *)(diff + 0 * width)); + d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8)); + d2 = _mm_load_si128((const __m128i *)(diff + 1 * width)); + d3 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8)); + d4 = _mm_load_si128((const __m128i *)(diff + 2 * width)); + d5 = _mm_load_si128((const __m128i *)(diff + 2 * width + 8)); + d6 = _mm_load_si128((const __m128i *)(diff + 3 * width)); + d7 = _mm_load_si128((const __m128i *)(diff + 3 * width + 8)); // Prediction data. p1 = _mm_load_si128((const __m128i *)(pred + 0 * pitch)); @@ -197,14 +197,14 @@ void vp9_add_residual_32x32_sse2(const int16_t *diff, const uint8_t *pred, __m128i p0, p1, p2, p3, p4, p5, p6, p7; do { - d0 = _mm_loadu_si128((const __m128i *)(diff + 0 * width)); - d1 = _mm_loadu_si128((const __m128i *)(diff + 0 * width + 8)); - d2 = _mm_loadu_si128((const __m128i *)(diff + 0 * width + 16)); - d3 = _mm_loadu_si128((const __m128i *)(diff + 0 * width + 24)); - d4 = _mm_loadu_si128((const __m128i *)(diff + 1 * width)); - d5 = _mm_loadu_si128((const __m128i *)(diff + 1 * width + 8)); - d6 = _mm_loadu_si128((const __m128i *)(diff + 1 * width + 16)); - d7 = _mm_loadu_si128((const __m128i *)(diff + 1 * width + 24)); + d0 = _mm_load_si128((const __m128i *)(diff + 0 * width)); + d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8)); + d2 = _mm_load_si128((const __m128i *)(diff + 0 * width + 16)); + d3 = _mm_load_si128((const __m128i *)(diff + 0 * width + 24)); + d4 = _mm_load_si128((const __m128i *)(diff + 1 * width)); + d5 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8)); + d6 = _mm_load_si128((const __m128i *)(diff + 1 * width + 16)); + d7 = _mm_load_si128((const __m128i *)(diff + 1 * width + 24)); // Prediction data. p1 = _mm_load_si128((const __m128i *)(pred + 0 * pitch)); @@ -245,4 +245,211 @@ void vp9_add_residual_32x32_sse2(const int16_t *diff, const uint8_t *pred, dest += 2 * stride; } while (--i); } + +void vp9_add_constant_residual_8x8_sse2(const int16_t diff, const uint8_t *pred, + int pitch, uint8_t *dest, int stride) { + uint8_t abs_diff; + __m128i d; + + // Prediction data. + __m128i p0 = _mm_loadl_epi64((const __m128i *)(pred + 0 * pitch)); + __m128i p1 = _mm_loadl_epi64((const __m128i *)(pred + 1 * pitch)); + __m128i p2 = _mm_loadl_epi64((const __m128i *)(pred + 2 * pitch)); + __m128i p3 = _mm_loadl_epi64((const __m128i *)(pred + 3 * pitch)); + __m128i p4 = _mm_loadl_epi64((const __m128i *)(pred + 4 * pitch)); + __m128i p5 = _mm_loadl_epi64((const __m128i *)(pred + 5 * pitch)); + __m128i p6 = _mm_loadl_epi64((const __m128i *)(pred + 6 * pitch)); + __m128i p7 = _mm_loadl_epi64((const __m128i *)(pred + 7 * pitch)); + + p0 = _mm_unpacklo_epi64(p0, p1); + p2 = _mm_unpacklo_epi64(p2, p3); + p4 = _mm_unpacklo_epi64(p4, p5); + p6 = _mm_unpacklo_epi64(p6, p7); + + // Clip diff value to [0, 255] range. Then, do addition or subtraction + // according to its sign. + if (diff >= 0) { + abs_diff = (diff > 255) ? 255 : diff; + d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0); + + p0 = _mm_adds_epu8(p0, d); + p2 = _mm_adds_epu8(p2, d); + p4 = _mm_adds_epu8(p4, d); + p6 = _mm_adds_epu8(p6, d); + } else { + abs_diff = (diff < -255) ? 255 : -diff; + d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0); + + p0 = _mm_subs_epu8(p0, d); + p2 = _mm_subs_epu8(p2, d); + p4 = _mm_subs_epu8(p4, d); + p6 = _mm_subs_epu8(p6, d); + } + + _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0); + p0 = _mm_srli_si128(p0, 8); + _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0); + + _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2); + p2 = _mm_srli_si128(p2, 8); + _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2); + + _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4); + p4 = _mm_srli_si128(p4, 8); + _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4); + + _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6); + p6 = _mm_srli_si128(p6, 8); + _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6); +} + +void vp9_add_constant_residual_16x16_sse2(const int16_t diff, + const uint8_t *pred, int pitch, + uint8_t *dest, int stride) { + uint8_t abs_diff; + __m128i d; + + // Prediction data. + __m128i p0 = _mm_load_si128((const __m128i *)(pred + 0 * pitch)); + __m128i p1 = _mm_load_si128((const __m128i *)(pred + 1 * pitch)); + __m128i p2 = _mm_load_si128((const __m128i *)(pred + 2 * pitch)); + __m128i p3 = _mm_load_si128((const __m128i *)(pred + 3 * pitch)); + __m128i p4 = _mm_load_si128((const __m128i *)(pred + 4 * pitch)); + __m128i p5 = _mm_load_si128((const __m128i *)(pred + 5 * pitch)); + __m128i p6 = _mm_load_si128((const __m128i *)(pred + 6 * pitch)); + __m128i p7 = _mm_load_si128((const __m128i *)(pred + 7 * pitch)); + __m128i p8 = _mm_load_si128((const __m128i *)(pred + 8 * pitch)); + __m128i p9 = _mm_load_si128((const __m128i *)(pred + 9 * pitch)); + __m128i p10 = _mm_load_si128((const __m128i *)(pred + 10 * pitch)); + __m128i p11 = _mm_load_si128((const __m128i *)(pred + 11 * pitch)); + __m128i p12 = _mm_load_si128((const __m128i *)(pred + 12 * pitch)); + __m128i p13 = _mm_load_si128((const __m128i *)(pred + 13 * pitch)); + __m128i p14 = _mm_load_si128((const __m128i *)(pred + 14 * pitch)); + __m128i p15 = _mm_load_si128((const __m128i *)(pred + 15 * pitch)); + + // Clip diff value to [0, 255] range. Then, do addition or subtraction + // according to its sign. + if (diff >= 0) { + abs_diff = (diff > 255) ? 255 : diff; + d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0); + + p0 = _mm_adds_epu8(p0, d); + p1 = _mm_adds_epu8(p1, d); + p2 = _mm_adds_epu8(p2, d); + p3 = _mm_adds_epu8(p3, d); + p4 = _mm_adds_epu8(p4, d); + p5 = _mm_adds_epu8(p5, d); + p6 = _mm_adds_epu8(p6, d); + p7 = _mm_adds_epu8(p7, d); + p8 = _mm_adds_epu8(p8, d); + p9 = _mm_adds_epu8(p9, d); + p10 = _mm_adds_epu8(p10, d); + p11 = _mm_adds_epu8(p11, d); + p12 = _mm_adds_epu8(p12, d); + p13 = _mm_adds_epu8(p13, d); + p14 = _mm_adds_epu8(p14, d); + p15 = _mm_adds_epu8(p15, d); + } else { + abs_diff = (diff < -255) ? 255 : -diff; + d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0); + + p0 = _mm_subs_epu8(p0, d); + p1 = _mm_subs_epu8(p1, d); + p2 = _mm_subs_epu8(p2, d); + p3 = _mm_subs_epu8(p3, d); + p4 = _mm_subs_epu8(p4, d); + p5 = _mm_subs_epu8(p5, d); + p6 = _mm_subs_epu8(p6, d); + p7 = _mm_subs_epu8(p7, d); + p8 = _mm_subs_epu8(p8, d); + p9 = _mm_subs_epu8(p9, d); + p10 = _mm_subs_epu8(p10, d); + p11 = _mm_subs_epu8(p11, d); + p12 = _mm_subs_epu8(p12, d); + p13 = _mm_subs_epu8(p13, d); + p14 = _mm_subs_epu8(p14, d); + p15 = _mm_subs_epu8(p15, d); + } + + // Store results + _mm_store_si128((__m128i *)(dest + 0 * stride), p0); + _mm_store_si128((__m128i *)(dest + 1 * stride), p1); + _mm_store_si128((__m128i *)(dest + 2 * stride), p2); + _mm_store_si128((__m128i *)(dest + 3 * stride), p3); + _mm_store_si128((__m128i *)(dest + 4 * stride), p4); + _mm_store_si128((__m128i *)(dest + 5 * stride), p5); + _mm_store_si128((__m128i *)(dest + 6 * stride), p6); + _mm_store_si128((__m128i *)(dest + 7 * stride), p7); + _mm_store_si128((__m128i *)(dest + 8 * stride), p8); + _mm_store_si128((__m128i *)(dest + 9 * stride), p9); + _mm_store_si128((__m128i *)(dest + 10 * stride), p10); + _mm_store_si128((__m128i *)(dest + 11 * stride), p11); + _mm_store_si128((__m128i *)(dest + 12 * stride), p12); + _mm_store_si128((__m128i *)(dest + 13 * stride), p13); + _mm_store_si128((__m128i *)(dest + 14 * stride), p14); + _mm_store_si128((__m128i *)(dest + 15 * stride), p15); +} + +void vp9_add_constant_residual_32x32_sse2(const int16_t diff, + const uint8_t *pred, int pitch, + uint8_t *dest, int stride) { + uint8_t abs_diff; + __m128i d; + int i = 8; + + if (diff >= 0) { + abs_diff = (diff > 255) ? 255 : diff; + d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0); + } else { + abs_diff = (diff < -255) ? 255 : -diff; + d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0); + } + + do { + // Prediction data. + __m128i p0 = _mm_load_si128((const __m128i *)(pred + 0 * pitch)); + __m128i p1 = _mm_load_si128((const __m128i *)(pred + 0 * pitch + 16)); + __m128i p2 = _mm_load_si128((const __m128i *)(pred + 1 * pitch)); + __m128i p3 = _mm_load_si128((const __m128i *)(pred + 1 * pitch + 16)); + __m128i p4 = _mm_load_si128((const __m128i *)(pred + 2 * pitch)); + __m128i p5 = _mm_load_si128((const __m128i *)(pred + 2 * pitch + 16)); + __m128i p6 = _mm_load_si128((const __m128i *)(pred + 3 * pitch)); + __m128i p7 = _mm_load_si128((const __m128i *)(pred + 3 * pitch + 16)); + + // Clip diff value to [0, 255] range. Then, do addition or subtraction + // according to its sign. + if (diff >= 0) { + p0 = _mm_adds_epu8(p0, d); + p1 = _mm_adds_epu8(p1, d); + p2 = _mm_adds_epu8(p2, d); + p3 = _mm_adds_epu8(p3, d); + p4 = _mm_adds_epu8(p4, d); + p5 = _mm_adds_epu8(p5, d); + p6 = _mm_adds_epu8(p6, d); + p7 = _mm_adds_epu8(p7, d); + } else { + p0 = _mm_subs_epu8(p0, d); + p1 = _mm_subs_epu8(p1, d); + p2 = _mm_subs_epu8(p2, d); + p3 = _mm_subs_epu8(p3, d); + p4 = _mm_subs_epu8(p4, d); + p5 = _mm_subs_epu8(p5, d); + p6 = _mm_subs_epu8(p6, d); + p7 = _mm_subs_epu8(p7, d); + } + + // Store results + _mm_store_si128((__m128i *)(dest + 0 * stride), p0); + _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1); + _mm_store_si128((__m128i *)(dest + 1 * stride), p2); + _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3); + _mm_store_si128((__m128i *)(dest + 2 * stride), p4); + _mm_store_si128((__m128i *)(dest + 2 * stride + 16), p5); + _mm_store_si128((__m128i *)(dest + 3 * stride), p6); + _mm_store_si128((__m128i *)(dest + 3 * stride + 16), p7); + + pred += 4 * pitch; + dest += 4 * stride; + } while (--i); +} #endif diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index f2be96dd7..9ca7677a3 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -1241,8 +1241,9 @@ static void encode_frame_internal(VP9_COMP *cpi) { MACROBLOCKD *const xd = &x->e_mbd; int totalrate; -// fprintf(stderr, "encode_frame_internal frame %d (%d)\n", -// cpi->common.current_video_frame, cpi->common.show_frame); +// fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n", +// cpi->common.current_video_frame, cpi->common.show_frame, +// cm->frame_type); // Compute a modified set of reference frame probabilities to use when // prediction fails. These are based on the current general estimates for diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c index d3b595bd8..3c98d4aa6 100644 --- a/vp9/encoder/vp9_encodeintra.c +++ b/vp9/encoder/vp9_encodeintra.c @@ -50,7 +50,7 @@ void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) { vp9_intra4x4_predict(&x->e_mbd, b, b->bmi.as_mode.first, b->predictor); vp9_subtract_b(be, b, 16); - tx_type = get_tx_type_4x4(&x->e_mbd, b); + tx_type = get_tx_type_4x4(&x->e_mbd, ib); if (tx_type != DCT_DCT) { vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type); vp9_ht_quantize_b_4x4(x, ib, tx_type); @@ -152,7 +152,7 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) { if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) { int idx = (ib & 0x02) ? (ib + 2) : ib; - tx_type = get_tx_type_8x8(xd, &xd->block[ib]); + tx_type = get_tx_type_8x8(xd, ib); if (tx_type != DCT_DCT) { vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type); x->quantize_b_8x8(x, idx); @@ -167,12 +167,13 @@ void vp9_encode_intra8x8(MACROBLOCK *x, int ib) { for (i = 0; i < 4; i++) { b = &xd->block[ib + iblock[i]]; be = &x->block[ib + iblock[i]]; - tx_type = get_tx_type_4x4(xd, b); + tx_type = get_tx_type_4x4(xd, ib + iblock[i]); if (tx_type != DCT_DCT) { vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type); vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type); vp9_short_iht4x4(b->dqcoeff, b->diff, 16, tx_type); - } else if (!(i & 1) && get_tx_type_4x4(xd, b + 1) == DCT_DCT) { + } else if (!(i & 1) && + get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) { x->fwd_txm8x4(be->src_diff, be->coeff, 32); x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1); vp9_inverse_transform_b_4x4(xd, xd->eobs[ib + iblock[i]], diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index c0386459d..dae177a3c 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c @@ -210,10 +210,10 @@ void vp9_transform_mby_4x4(MACROBLOCK *x) { for (i = 0; i < 16; i++) { BLOCK *b = &x->block[i]; - TX_TYPE tx_type = get_tx_type_4x4(xd, &xd->block[i]); + TX_TYPE tx_type = get_tx_type_4x4(xd, i); if (tx_type != DCT_DCT) { vp9_short_fht4x4(b->src_diff, b->coeff, 16, tx_type); - } else if (!(i & 1) && get_tx_type_4x4(xd, &xd->block[i + 1]) == DCT_DCT) { + } else if (!(i & 1) && get_tx_type_4x4(xd, i + 1) == DCT_DCT) { x->fwd_txm8x4(x->block[i].src_diff, x->block[i].coeff, 32); i++; } else { @@ -241,7 +241,7 @@ void vp9_transform_mby_8x8(MACROBLOCK *x) { for (i = 0; i < 9; i += 8) { BLOCK *b = &x->block[i]; - tx_type = get_tx_type_8x8(xd, &xd->block[i]); + tx_type = get_tx_type_8x8(xd, i); if (tx_type != DCT_DCT) { vp9_short_fht8x8(b->src_diff, b->coeff, 16, tx_type); } else { @@ -250,7 +250,7 @@ void vp9_transform_mby_8x8(MACROBLOCK *x) { } for (i = 2; i < 11; i += 8) { BLOCK *b = &x->block[i]; - tx_type = get_tx_type_8x8(xd, &xd->block[i]); + tx_type = get_tx_type_8x8(xd, i); if (tx_type != DCT_DCT) { vp9_short_fht8x8(b->src_diff, (b + 2)->coeff, 16, tx_type); } else { @@ -274,7 +274,7 @@ void vp9_transform_mb_8x8(MACROBLOCK *x) { void vp9_transform_mby_16x16(MACROBLOCK *x) { MACROBLOCKD *xd = &x->e_mbd; BLOCK *b = &x->block[0]; - TX_TYPE tx_type = get_tx_type_16x16(xd, &xd->block[0]); + TX_TYPE tx_type = get_tx_type_16x16(xd, 0); vp9_clear_system_state(); if (tx_type != DCT_DCT) { vp9_short_fht16x16(b->src_diff, b->coeff, 16, tx_type); @@ -293,35 +293,56 @@ void vp9_transform_sby_32x32(MACROBLOCK *x) { } void vp9_transform_sby_16x16(MACROBLOCK *x) { + MACROBLOCKD *const xd = &x->e_mbd; int n; for (n = 0; n < 4; n++) { const int x_idx = n & 1, y_idx = n >> 1; + const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 8 + x_idx) * 4); - x->fwd_txm16x16(x->src_diff + y_idx * 32 * 16 + x_idx * 16, - x->coeff + n * 256, 64); + if (tx_type != DCT_DCT) { + vp9_short_fht16x16(x->src_diff + y_idx * 32 * 16 + x_idx * 16, + x->coeff + n * 256, 32, tx_type); + } else { + x->fwd_txm16x16(x->src_diff + y_idx * 32 * 16 + x_idx * 16, + x->coeff + n * 256, 64); + } } } void vp9_transform_sby_8x8(MACROBLOCK *x) { + MACROBLOCKD *const xd = &x->e_mbd; int n; for (n = 0; n < 16; n++) { const int x_idx = n & 3, y_idx = n >> 2; + const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 8 + x_idx) * 2); - x->fwd_txm8x8(x->src_diff + y_idx * 32 * 8 + x_idx * 8, - x->coeff + n * 64, 64); + if (tx_type != DCT_DCT) { + vp9_short_fht8x8(x->src_diff + y_idx * 32 * 8 + x_idx * 8, + x->coeff + n * 64, 32, tx_type); + } else { + x->fwd_txm8x8(x->src_diff + y_idx * 32 * 8 + x_idx * 8, + x->coeff + n * 64, 64); + } } } void vp9_transform_sby_4x4(MACROBLOCK *x) { + MACROBLOCKD *const xd = &x->e_mbd; int n; for (n = 0; n < 64; n++) { const int x_idx = n & 7, y_idx = n >> 3; + const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 8 + x_idx); - x->fwd_txm4x4(x->src_diff + y_idx * 32 * 4 + x_idx * 4, - x->coeff + n * 16, 64); + if (tx_type != DCT_DCT) { + vp9_short_fht4x4(x->src_diff + y_idx * 32 * 4 + x_idx * 4, + x->coeff + n * 16, 32, tx_type); + } else { + x->fwd_txm4x4(x->src_diff + y_idx * 32 * 4 + x_idx * 4, + x->coeff + n * 16, 64); + } } } @@ -371,35 +392,56 @@ void vp9_transform_sb64y_32x32(MACROBLOCK *x) { } void vp9_transform_sb64y_16x16(MACROBLOCK *x) { + MACROBLOCKD *const xd = &x->e_mbd; int n; for (n = 0; n < 16; n++) { const int x_idx = n & 3, y_idx = n >> 2; + const TX_TYPE tx_type = get_tx_type_16x16(xd, (y_idx * 16 + x_idx) * 4); - x->fwd_txm16x16(x->src_diff + y_idx * 64 * 16 + x_idx * 16, - x->coeff + n * 256, 128); + if (tx_type != DCT_DCT) { + vp9_short_fht16x16(x->src_diff + y_idx * 64 * 16 + x_idx * 16, + x->coeff + n * 256, 64, tx_type); + } else { + x->fwd_txm16x16(x->src_diff + y_idx * 64 * 16 + x_idx * 16, + x->coeff + n * 256, 128); + } } } void vp9_transform_sb64y_8x8(MACROBLOCK *x) { + MACROBLOCKD *const xd = &x->e_mbd; int n; for (n = 0; n < 64; n++) { const int x_idx = n & 7, y_idx = n >> 3; + const TX_TYPE tx_type = get_tx_type_8x8(xd, (y_idx * 16 + x_idx) * 2); - x->fwd_txm8x8(x->src_diff + y_idx * 64 * 8 + x_idx * 8, - x->coeff + n * 64, 128); + if (tx_type != DCT_DCT) { + vp9_short_fht8x8(x->src_diff + y_idx * 64 * 8 + x_idx * 8, + x->coeff + n * 64, 64, tx_type); + } else { + x->fwd_txm8x8(x->src_diff + y_idx * 64 * 8 + x_idx * 8, + x->coeff + n * 64, 128); + } } } void vp9_transform_sb64y_4x4(MACROBLOCK *x) { + MACROBLOCKD *const xd = &x->e_mbd; int n; for (n = 0; n < 256; n++) { const int x_idx = n & 15, y_idx = n >> 4; + const TX_TYPE tx_type = get_tx_type_4x4(xd, y_idx * 16 + x_idx); - x->fwd_txm4x4(x->src_diff + y_idx * 64 * 4 + x_idx * 4, - x->coeff + n * 16, 128); + if (tx_type != DCT_DCT) { + vp9_short_fht8x8(x->src_diff + y_idx * 64 * 4 + x_idx * 4, + x->coeff + n * 16, 64, tx_type); + } else { + x->fwd_txm4x4(x->src_diff + y_idx * 64 * 4 + x_idx * 4, + x->coeff + n * 16, 128); + } } } @@ -513,7 +555,6 @@ static void optimize_b(VP9_COMMON *const cm, int default_eob; int const *scan; const int mul = 1 + (tx_size == TX_32X32); - TX_TYPE tx_type; #if CONFIG_CODE_NONZEROCOUNT // TODO(debargha): the dynamic programming approach used in this function // is not compatible with the true rate cost when nzcs are used. Note @@ -534,32 +575,21 @@ static void optimize_b(VP9_COMMON *const cm, switch (tx_size) { default: - case TX_4X4: + case TX_4X4: { + const TX_TYPE tx_type = get_tx_type_4x4(xd, ib); default_eob = 16; #if CONFIG_CODE_NONZEROCOUNT nzc_cost = mb->nzc_costs_4x4[nzc_context][ref][type]; #endif - // NOTE: this isn't called (for intra4x4 modes), but will be left in - // since it could be used later - tx_type = get_tx_type_4x4(&mb->e_mbd, &xd->block[ib]); - if (tx_type != DCT_DCT) { - switch (tx_type) { - case ADST_DCT: - scan = vp9_row_scan_4x4; - break; - - case DCT_ADST: - scan = vp9_col_scan_4x4; - break; - - default: - scan = vp9_default_zig_zag1d_4x4; - break; - } + if (tx_type == DCT_ADST) { + scan = vp9_col_scan_4x4; + } else if (tx_type == ADST_DCT) { + scan = vp9_row_scan_4x4; } else { scan = vp9_default_zig_zag1d_4x4; } break; + } case TX_8X8: scan = vp9_default_zig_zag1d_8x8; default_eob = 64; diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 337276d59..5e2f323a2 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -378,6 +378,19 @@ static void first_pass_motion_search(VP9_COMP *cpi, MACROBLOCK *x, vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16]; int new_mv_mode_penalty = 256; + int sr = 0; + int quart_frm = MIN(cpi->common.Width, cpi->common.Height); + + // refine the motion search range accroding to the frame dimension + // for first pass test + while ((quart_frm << sr) < MAX_FULL_PEL_VAL) + sr++; + if (sr) + sr--; + + step_param += sr; + further_steps -= sr; + // override the default variance function to use MSE v_fn_ptr.vf = vp9_mse16x16; diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c index 121de653f..715d68377 100644 --- a/vp9/encoder/vp9_mbgraph.c +++ b/vp9/encoder/vp9_mbgraph.c @@ -420,7 +420,7 @@ static void separate_arf_mbs(VP9_COMP *cpi) { cpi->static_mb_pct = (ncnt[1] * 100) / cm->MBs; // This error case should not be reachable as this function should - // never be called with the common data structure unititialized. + // never be called with the common data structure uninitialized. else cpi->static_mb_pct = 0; diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 300d9f85c..5fd1e83cd 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -21,9 +21,9 @@ void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) { int col_min = (ref_mv->as_mv.col >> 3) - MAX_FULL_PEL_VAL + - ((ref_mv->as_mv.col & 7) ? 1 : 0); + ((ref_mv->as_mv.col & 7) ? 1 : 0); int row_min = (ref_mv->as_mv.row >> 3) - MAX_FULL_PEL_VAL + - ((ref_mv->as_mv.row & 7) ? 1 : 0); + ((ref_mv->as_mv.row & 7) ? 1 : 0); int col_max = (ref_mv->as_mv.col >> 3) + MAX_FULL_PEL_VAL; int row_max = (ref_mv->as_mv.row >> 3) + MAX_FULL_PEL_VAL; @@ -38,6 +38,19 @@ void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv) { x->mv_row_max = row_max; } +int vp9_init_search_range(int width, int height) { + int sr = 0; + int frm = MIN(width, height); + + while ((frm << sr) < MAX_FULL_PEL_VAL) + sr++; + + if (sr) + sr--; + + return sr; +} + int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2], int weight, int ishp) { MV v; diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index 2479d7235..d5c7032a9 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -19,12 +19,17 @@ void print_mode_context(VP9_COMMON *pc); #endif - -#define MAX_MVSEARCH_STEPS 8 // The maximum number of steps in a step search given the largest allowed initial step -#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1) // Max full pel mv specified in 1 pel units -#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1)) // Maximum size of the first step in full pel units +// The maximum number of steps in a step search given the largest +// allowed initial step +#define MAX_MVSEARCH_STEPS 10 +// Max full pel mv specified in 1 pel units +#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1) +// Maximum size of the first step in full pel units +#define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1)) void vp9_clamp_mv_min_max(MACROBLOCK *x, int_mv *ref_mv); +int vp9_init_search_range(int width, int height); + int vp9_mv_bit_cost(int_mv *mv, int_mv *ref, int *mvjcost, int *mvcost[2], int weight, int ishp); void vp9_init_dsmotion_compensation(MACROBLOCK *x, int stride); diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c index 9f6531c25..9ac2c8460 100644 --- a/vp9/encoder/vp9_quantize.c +++ b/vp9/encoder/vp9_quantize.c @@ -21,38 +21,46 @@ extern int enc_debug; #endif +static INLINE int plane_idx(MACROBLOCKD *xd, int b_idx) { + const BLOCK_SIZE_TYPE sb_type = xd->mode_info_context->mbmi.sb_type; + if (b_idx < (16 << (sb_type * 2))) + return 0; // Y + else if (b_idx < (20 << (sb_type * 2))) + return 16; // U + assert(b_idx < (24 << (sb_type * 2))); + return 20; // V +} + void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) { MACROBLOCKD *const xd = &mb->e_mbd; - BLOCK *const b = &mb->block[b_idx]; - BLOCKD *const d = &xd->block[b_idx]; + BLOCK *const b = &mb->block[0]; + BLOCKD *const d = &xd->block[0]; int i, rc, eob; int zbin; int x, y, z, sz; + int16_t *coeff_ptr = mb->coeff + b_idx * 16; + int16_t *qcoeff_ptr = xd->qcoeff + b_idx * 16; + int16_t *dqcoeff_ptr = xd->dqcoeff + b_idx * 16; int16_t *zbin_boost_ptr = b->zrun_zbin_boost; - int16_t *coeff_ptr = b->coeff; int16_t *zbin_ptr = b->zbin; int16_t *round_ptr = b->round; int16_t *quant_ptr = b->quant; uint8_t *quant_shift_ptr = b->quant_shift; - int16_t *qcoeff_ptr = d->qcoeff; - int16_t *dqcoeff_ptr = d->dqcoeff; int16_t *dequant_ptr = d->dequant; int zbin_oq_value = b->zbin_extra; - - int const *pt_scan ; + const int *pt_scan; #if CONFIG_CODE_NONZEROCOUNT int nzc = 0; #endif + assert(plane_idx(xd, b_idx) == 0); switch (tx_type) { case ADST_DCT: pt_scan = vp9_row_scan_4x4; break; - case DCT_ADST: pt_scan = vp9_col_scan_4x4; break; - default: pt_scan = vp9_default_zig_zag1d_4x4; break; @@ -101,19 +109,20 @@ void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type) { void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx) { MACROBLOCKD *const xd = &mb->e_mbd; - BLOCK *const b = &mb->block[b_idx]; - BLOCKD *const d = &xd->block[b_idx]; + const int c_idx = plane_idx(xd, b_idx); + BLOCK *const b = &mb->block[c_idx]; + BLOCKD *const d = &xd->block[c_idx]; int i, rc, eob; int zbin; int x, y, z, sz; + int16_t *coeff_ptr = mb->coeff + b_idx * 16; + int16_t *qcoeff_ptr = xd->qcoeff + b_idx * 16; + int16_t *dqcoeff_ptr = xd->dqcoeff + b_idx * 16; int16_t *zbin_boost_ptr = b->zrun_zbin_boost; - int16_t *coeff_ptr = b->coeff; int16_t *zbin_ptr = b->zbin; int16_t *round_ptr = b->round; int16_t *quant_ptr = b->quant; uint8_t *quant_shift_ptr = b->quant_shift; - int16_t *qcoeff_ptr = d->qcoeff; - int16_t *dqcoeff_ptr = d->dqcoeff; int16_t *dequant_ptr = d->dequant; int zbin_oq_value = b->zbin_extra; #if CONFIG_CODE_NONZEROCOUNT @@ -162,11 +171,11 @@ void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx) { #endif } -void vp9_quantize_mby_4x4_c(MACROBLOCK *x) { +void vp9_quantize_mby_4x4(MACROBLOCK *x) { int i; for (i = 0; i < 16; i++) { - TX_TYPE tx_type = get_tx_type_4x4(&x->e_mbd, &x->e_mbd.block[i]); + TX_TYPE tx_type = get_tx_type_4x4(&x->e_mbd, i); if (tx_type != DCT_DCT) { vp9_ht_quantize_b_4x4(x, i, tx_type); } else { @@ -175,24 +184,25 @@ void vp9_quantize_mby_4x4_c(MACROBLOCK *x) { } } -void vp9_quantize_mbuv_4x4_c(MACROBLOCK *x) { +void vp9_quantize_mbuv_4x4(MACROBLOCK *x) { int i; for (i = 16; i < 24; i++) x->quantize_b_4x4(x, i); } -void vp9_quantize_mb_4x4_c(MACROBLOCK *x) { - vp9_quantize_mby_4x4_c(x); - vp9_quantize_mbuv_4x4_c(x); +void vp9_quantize_mb_4x4(MACROBLOCK *x) { + vp9_quantize_mby_4x4(x); + vp9_quantize_mbuv_4x4(x); } void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx) { MACROBLOCKD *const xd = &mb->e_mbd; - BLOCK *const b = &mb->block[b_idx]; - BLOCKD *const d = &xd->block[b_idx]; - int16_t *qcoeff_ptr = d->qcoeff; - int16_t *dqcoeff_ptr = d->dqcoeff; + int16_t *qcoeff_ptr = xd->qcoeff + 16 * b_idx; + int16_t *dqcoeff_ptr = xd->dqcoeff + 16 * b_idx; + const int c_idx = plane_idx(xd, b_idx); + BLOCK *const b = &mb->block[c_idx]; + BLOCKD *const d = &xd->block[c_idx]; vpx_memset(qcoeff_ptr, 0, 64 * sizeof(int16_t)); vpx_memset(dqcoeff_ptr, 0, 64 * sizeof(int16_t)); @@ -203,7 +213,7 @@ void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx) { int x, y, z, sz; int zero_run; int16_t *zbin_boost_ptr = b->zrun_zbin_boost; - int16_t *coeff_ptr = b->coeff; + int16_t *coeff_ptr = mb->coeff + 16 * b_idx; int16_t *zbin_ptr = b->zbin; int16_t *round_ptr = b->round; int16_t *quant_ptr = b->quant; @@ -392,14 +402,16 @@ static void quantize(int16_t *zbin_boost_orig_ptr, void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx) { MACROBLOCKD *const xd = &mb->e_mbd; - BLOCK *const b = &mb->block[b_idx]; - BLOCKD *const d = &xd->block[b_idx]; + const int c_idx = plane_idx(xd, b_idx); + BLOCK *const b = &mb->block[c_idx]; + BLOCKD *const d = &xd->block[c_idx]; + quantize(b->zrun_zbin_boost, - b->coeff, + mb->coeff + 16 * b_idx, 256, b->skip_block, b->zbin, b->round, b->quant, b->quant_shift, - d->qcoeff, - d->dqcoeff, + xd->qcoeff + 16 * b_idx, + xd->dqcoeff + 16 * b_idx, d->dequant, b->zbin_extra, &xd->eobs[b_idx], @@ -409,347 +421,138 @@ void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx) { vp9_default_zig_zag1d_16x16, 1); } -void vp9_quantize_sby_32x32(MACROBLOCK *x) { - MACROBLOCKD *const xd = &x->e_mbd; - BLOCK *const b = &x->block[0]; - BLOCKD *const d = &xd->block[0]; +void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx) { + MACROBLOCKD *const xd = &mb->e_mbd; + const int c_idx = plane_idx(xd, b_idx); + BLOCK *const b = &mb->block[c_idx]; + BLOCKD *const d = &xd->block[c_idx]; quantize(b->zrun_zbin_boost, - x->coeff, + mb->coeff + b_idx * 16, 1024, b->skip_block, b->zbin, b->round, b->quant, b->quant_shift, - xd->qcoeff, - xd->dqcoeff, + xd->qcoeff + b_idx * 16, + xd->dqcoeff + b_idx * 16, d->dequant, b->zbin_extra, - &xd->eobs[0], + &xd->eobs[b_idx], #if CONFIG_CODE_NONZEROCOUNT - &xd->nzcs[0], + &xd->nzcs[b_idx], #endif vp9_default_zig_zag1d_32x32, 2); } +void vp9_quantize_sby_32x32(MACROBLOCK *x) { + vp9_regular_quantize_b_32x32(x, 0); +} + void vp9_quantize_sby_16x16(MACROBLOCK *x) { - MACROBLOCKD *const xd = &x->e_mbd; - BLOCK *const b = &x->block[0]; - BLOCKD *const d = &xd->block[0]; int n; for (n = 0; n < 4; n++) - quantize(b->zrun_zbin_boost, - x->coeff + n * 256, - 256, b->skip_block, - b->zbin, - b->round, b->quant, b->quant_shift, - xd->qcoeff + n * 256, - xd->dqcoeff + n * 256, - d->dequant, - b->zbin_extra, - &xd->eobs[n * 16], -#if CONFIG_CODE_NONZEROCOUNT - &xd->nzcs[n * 16], -#endif - vp9_default_zig_zag1d_16x16, 1); + x->quantize_b_16x16(x, n * 16); } void vp9_quantize_sby_8x8(MACROBLOCK *x) { - MACROBLOCKD *const xd = &x->e_mbd; - BLOCK *const b = &x->block[0]; - BLOCKD *const d = &xd->block[0]; int n; for (n = 0; n < 16; n++) - quantize(b->zrun_zbin_boost, - x->coeff + n * 64, - 64, b->skip_block, - b->zbin, - b->round, b->quant, b->quant_shift, - xd->qcoeff + n * 64, - xd->dqcoeff + n * 64, - d->dequant, - b->zbin_extra, - &xd->eobs[n * 4], -#if CONFIG_CODE_NONZEROCOUNT - &xd->nzcs[n * 4], -#endif - vp9_default_zig_zag1d_8x8, 1); + x->quantize_b_8x8(x, n * 4); } void vp9_quantize_sby_4x4(MACROBLOCK *x) { MACROBLOCKD *const xd = &x->e_mbd; - BLOCK *const b = &x->block[0]; - BLOCKD *const d = &xd->block[0]; int n; - for (n = 0; n < 64; n++) - quantize(b->zrun_zbin_boost, - x->coeff + n * 16, - 16, b->skip_block, - b->zbin, - b->round, b->quant, b->quant_shift, - xd->qcoeff + n * 16, - xd->dqcoeff + n * 16, - d->dequant, - b->zbin_extra, - &xd->eobs[n], -#if CONFIG_CODE_NONZEROCOUNT - &xd->nzcs[n], -#endif - vp9_default_zig_zag1d_4x4, 1); + for (n = 0; n < 64; n++) { + const TX_TYPE tx_type = get_tx_type_4x4(xd, n); + if (tx_type != DCT_DCT) { + vp9_ht_quantize_b_4x4(x, n, tx_type); + } else { + x->quantize_b_4x4(x, n); + } + } } void vp9_quantize_sbuv_16x16(MACROBLOCK *x) { - int i; - MACROBLOCKD *const xd = &x->e_mbd; - - for (i = 64; i < 96; i += 16) { - int cidx = i < 80 ? 16 : 20; - quantize(x->block[cidx].zrun_zbin_boost, - x->coeff + i * 16, - 256, x->block[cidx].skip_block, - x->block[cidx].zbin, x->block[cidx].round, - x->block[cidx].quant, x->block[cidx].quant_shift, - xd->qcoeff + i * 16, - xd->dqcoeff + i * 16, - xd->block[cidx].dequant, - x->block[cidx].zbin_extra, - &xd->eobs[i], -#if CONFIG_CODE_NONZEROCOUNT - &xd->nzcs[i], -#endif - vp9_default_zig_zag1d_16x16, 1); - } + x->quantize_b_16x16(x, 64); + x->quantize_b_16x16(x, 80); } void vp9_quantize_sbuv_8x8(MACROBLOCK *x) { int i; - MACROBLOCKD *const xd = &x->e_mbd; - for (i = 64; i < 96; i += 4) { - int cidx = i < 80 ? 16 : 20; - quantize(x->block[cidx].zrun_zbin_boost, - x->coeff + i * 16, - 64, x->block[cidx].skip_block, - x->block[cidx].zbin, x->block[cidx].round, - x->block[cidx].quant, x->block[cidx].quant_shift, - xd->qcoeff + i * 16, - xd->dqcoeff + i * 16, - xd->block[cidx].dequant, - x->block[cidx].zbin_extra, - &xd->eobs[i], -#if CONFIG_CODE_NONZEROCOUNT - &xd->nzcs[i], -#endif - vp9_default_zig_zag1d_8x8, 1); - } + for (i = 64; i < 96; i += 4) + x->quantize_b_8x8(x, i); } void vp9_quantize_sbuv_4x4(MACROBLOCK *x) { int i; - MACROBLOCKD *const xd = &x->e_mbd; - for (i = 64; i < 96; i++) { - int cidx = i < 80 ? 16 : 20; - quantize(x->block[cidx].zrun_zbin_boost, - x->coeff + i * 16, - 16, x->block[cidx].skip_block, - x->block[cidx].zbin, x->block[cidx].round, - x->block[cidx].quant, x->block[cidx].quant_shift, - xd->qcoeff + i * 16, - xd->dqcoeff + i * 16, - xd->block[cidx].dequant, - x->block[cidx].zbin_extra, - &xd->eobs[i], -#if CONFIG_CODE_NONZEROCOUNT - &xd->nzcs[i], -#endif - vp9_default_zig_zag1d_4x4, 1); - } + for (i = 64; i < 96; i++) + x->quantize_b_4x4(x, i); } void vp9_quantize_sb64y_32x32(MACROBLOCK *x) { - MACROBLOCKD *const xd = &x->e_mbd; - BLOCK *const b = &x->block[0]; - BLOCKD *const d = &xd->block[0]; int n; for (n = 0; n < 4; n++) - quantize(b->zrun_zbin_boost, - x->coeff + n * 1024, - 1024, b->skip_block, - b->zbin, - b->round, b->quant, b->quant_shift, - xd->qcoeff + n * 1024, - xd->dqcoeff + n * 1024, - d->dequant, - b->zbin_extra, - &xd->eobs[n * 64], -#if CONFIG_CODE_NONZEROCOUNT - &xd->nzcs[n * 64], -#endif - vp9_default_zig_zag1d_32x32, 2); + vp9_regular_quantize_b_32x32(x, n * 64); } void vp9_quantize_sb64y_16x16(MACROBLOCK *x) { - MACROBLOCKD *const xd = &x->e_mbd; - BLOCK *const b = &x->block[0]; - BLOCKD *const d = &xd->block[0]; int n; for (n = 0; n < 16; n++) - quantize(b->zrun_zbin_boost, - x->coeff + n * 256, - 256, b->skip_block, - b->zbin, - b->round, b->quant, b->quant_shift, - xd->qcoeff + n * 256, - xd->dqcoeff + n * 256, - d->dequant, - b->zbin_extra, - &xd->eobs[n * 16], -#if CONFIG_CODE_NONZEROCOUNT - &xd->nzcs[n * 16], -#endif - vp9_default_zig_zag1d_16x16, 1); + x->quantize_b_16x16(x, n * 16); } void vp9_quantize_sb64y_8x8(MACROBLOCK *x) { - MACROBLOCKD *const xd = &x->e_mbd; - BLOCK *const b = &x->block[0]; - BLOCKD *const d = &xd->block[0]; int n; for (n = 0; n < 64; n++) - quantize(b->zrun_zbin_boost, - x->coeff + n * 64, - 64, b->skip_block, - b->zbin, - b->round, b->quant, b->quant_shift, - xd->qcoeff + n * 64, - xd->dqcoeff + n * 64, - d->dequant, - b->zbin_extra, - &xd->eobs[n * 4], -#if CONFIG_CODE_NONZEROCOUNT - &xd->nzcs[n * 4], -#endif - vp9_default_zig_zag1d_8x8, 1); + x->quantize_b_8x8(x, n * 4); } void vp9_quantize_sb64y_4x4(MACROBLOCK *x) { MACROBLOCKD *const xd = &x->e_mbd; - BLOCK *const b = &x->block[0]; - BLOCKD *const d = &xd->block[0]; int n; - for (n = 0; n < 256; n++) - quantize(b->zrun_zbin_boost, - x->coeff + n * 16, - 16, b->skip_block, - b->zbin, - b->round, b->quant, b->quant_shift, - xd->qcoeff + n * 16, - xd->dqcoeff + n * 16, - d->dequant, - b->zbin_extra, - &xd->eobs[n], -#if CONFIG_CODE_NONZEROCOUNT - &xd->nzcs[n], -#endif - vp9_default_zig_zag1d_4x4, 1); + for (n = 0; n < 256; n++) { + const TX_TYPE tx_type = get_tx_type_4x4(xd, n); + if (tx_type != DCT_DCT) { + vp9_ht_quantize_b_4x4(x, n, tx_type); + } else { + x->quantize_b_4x4(x, n); + } + } } void vp9_quantize_sb64uv_32x32(MACROBLOCK *x) { - int i; - MACROBLOCKD *const xd = &x->e_mbd; - - for (i = 256; i < 384; i += 64) { - int cidx = i < 320 ? 16 : 20; - quantize(x->block[cidx].zrun_zbin_boost, - x->coeff + i * 16, - 1024, x->block[cidx].skip_block, - x->block[cidx].zbin, x->block[cidx].round, - x->block[cidx].quant, x->block[cidx].quant_shift, - xd->qcoeff + i * 16, - xd->dqcoeff + i * 16, - xd->block[cidx].dequant, - x->block[cidx].zbin_extra, - &xd->eobs[i], -#if CONFIG_CODE_NONZEROCOUNT - &xd->nzcs[i], -#endif - vp9_default_zig_zag1d_32x32, 2); - } + vp9_regular_quantize_b_32x32(x, 256); + vp9_regular_quantize_b_32x32(x, 320); } void vp9_quantize_sb64uv_16x16(MACROBLOCK *x) { int i; - MACROBLOCKD *const xd = &x->e_mbd; - for (i = 256; i < 384; i += 16) { - int cidx = i < 320 ? 16 : 20; - quantize(x->block[cidx].zrun_zbin_boost, - x->coeff + i * 16, - 256, x->block[cidx].skip_block, - x->block[cidx].zbin, x->block[cidx].round, - x->block[cidx].quant, x->block[cidx].quant_shift, - xd->qcoeff + i * 16, - xd->dqcoeff + i * 16, - xd->block[cidx].dequant, - x->block[cidx].zbin_extra, - &xd->eobs[i], -#if CONFIG_CODE_NONZEROCOUNT - &xd->nzcs[i], -#endif - vp9_default_zig_zag1d_16x16, 1); - } + for (i = 256; i < 384; i += 16) + x->quantize_b_16x16(x, i); } void vp9_quantize_sb64uv_8x8(MACROBLOCK *x) { int i; - MACROBLOCKD *const xd = &x->e_mbd; - for (i = 256; i < 384; i += 4) { - int cidx = i < 320 ? 16 : 20; - quantize(x->block[cidx].zrun_zbin_boost, - x->coeff + i * 16, - 64, x->block[cidx].skip_block, - x->block[cidx].zbin, x->block[cidx].round, - x->block[cidx].quant, x->block[cidx].quant_shift, - xd->qcoeff + i * 16, - xd->dqcoeff + i * 16, - xd->block[cidx].dequant, - x->block[cidx].zbin_extra, - &xd->eobs[i], -#if CONFIG_CODE_NONZEROCOUNT - &xd->nzcs[i], -#endif - vp9_default_zig_zag1d_8x8, 1); - } + for (i = 256; i < 384; i += 4) + x->quantize_b_8x8(x, i); } void vp9_quantize_sb64uv_4x4(MACROBLOCK *x) { int i; - MACROBLOCKD *const xd = &x->e_mbd; - for (i = 256; i < 384; i++) { - int cidx = i < 320 ? 16 : 20; - quantize(x->block[cidx].zrun_zbin_boost, - x->coeff + i * 16, - 16, x->block[cidx].skip_block, - x->block[cidx].zbin, x->block[cidx].round, - x->block[cidx].quant, x->block[cidx].quant_shift, - xd->qcoeff + i * 16, - xd->dqcoeff + i * 16, - xd->block[cidx].dequant, - x->block[cidx].zbin_extra, - &xd->eobs[i], -#if CONFIG_CODE_NONZEROCOUNT - &xd->nzcs[i], -#endif - vp9_default_zig_zag1d_4x4, 1); - } + for (i = 256; i < 384; i++) + x->quantize_b_4x4(x, i); } /* quantize_b_pair function pointer in MACROBLOCK structure is set to one of diff --git a/vp9/encoder/vp9_quantize.h b/vp9/encoder/vp9_quantize.h index 32eb05a11..739254025 100644 --- a/vp9/encoder/vp9_quantize.h +++ b/vp9/encoder/vp9_quantize.h @@ -26,52 +26,24 @@ #include "x86/vp9_quantize_x86.h" #endif -#define prototype_quantize_block_type(sym) \ - void (sym)(MACROBLOCK *mb, int b_ix, TX_TYPE type) -extern prototype_quantize_block_type(vp9_ht_quantize_b_4x4); - -#ifndef vp9_quantize_quantb_4x4 -#define vp9_quantize_quantb_4x4 vp9_regular_quantize_b_4x4 -#endif -extern prototype_quantize_block(vp9_quantize_quantb_4x4); - -#ifndef vp9_quantize_quantb_4x4_pair -#define vp9_quantize_quantb_4x4_pair vp9_regular_quantize_b_4x4_pair -#endif -extern prototype_quantize_block_pair(vp9_quantize_quantb_4x4_pair); - -#ifndef vp9_quantize_quantb_8x8 -#define vp9_quantize_quantb_8x8 vp9_regular_quantize_b_8x8 -#endif -extern prototype_quantize_block(vp9_quantize_quantb_8x8); - -#ifndef vp9_quantize_quantb_16x16 -#define vp9_quantize_quantb_16x16 vp9_regular_quantize_b_16x16 -#endif -extern prototype_quantize_block(vp9_quantize_quantb_16x16); - -#ifndef vp9_quantize_mb_4x4 -#define vp9_quantize_mb_4x4 vp9_quantize_mb_4x4_c -#endif -extern prototype_quantize_mb(vp9_quantize_mb_4x4); +void vp9_ht_quantize_b_4x4(MACROBLOCK *mb, int b_ix, TX_TYPE type); +void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx); +void vp9_regular_quantize_b_4x4_pair(MACROBLOCK *mb, int b_idx1, int b_idx2); +void vp9_regular_quantize_b_8x8(MACROBLOCK *mb, int b_idx); +void vp9_regular_quantize_b_16x16(MACROBLOCK *mb, int b_idx); +void vp9_regular_quantize_b_32x32(MACROBLOCK *mb, int b_idx); + +void vp9_quantize_mb_4x4(MACROBLOCK *x); void vp9_quantize_mb_8x8(MACROBLOCK *x); -#ifndef vp9_quantize_mbuv_4x4 -#define vp9_quantize_mbuv_4x4 vp9_quantize_mbuv_4x4_c -#endif -extern prototype_quantize_mb(vp9_quantize_mbuv_4x4); - -#ifndef vp9_quantize_mby_4x4 -#define vp9_quantize_mby_4x4 vp9_quantize_mby_4x4_c -#endif -extern prototype_quantize_mb(vp9_quantize_mby_4x4); +void vp9_quantize_mbuv_4x4(MACROBLOCK *x); +void vp9_quantize_mby_4x4(MACROBLOCK *x); -extern prototype_quantize_mb(vp9_quantize_mby_8x8); -extern prototype_quantize_mb(vp9_quantize_mbuv_8x8); +void vp9_quantize_mby_8x8(MACROBLOCK *x); +void vp9_quantize_mbuv_8x8(MACROBLOCK *x); void vp9_quantize_mb_16x16(MACROBLOCK *x); -extern prototype_quantize_block(vp9_quantize_quantb_16x16); -extern prototype_quantize_mb(vp9_quantize_mby_16x16); +void vp9_quantize_mby_16x16(MACROBLOCK *x); void vp9_quantize_sby_32x32(MACROBLOCK *x); void vp9_quantize_sby_16x16(MACROBLOCK *x); diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 1b83091b3..cdc8edd03 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -453,7 +453,6 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, TX_SIZE tx_size) { MACROBLOCKD *const xd = &mb->e_mbd; MB_MODE_INFO *mbmi = &xd->mode_info_context->mbmi; - const BLOCK_SIZE_TYPE sb_type = mbmi->sb_type; int pt; const int eob = xd->eobs[ib]; int c = 0; @@ -461,9 +460,6 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, const int *scan; const int16_t *qcoeff_ptr = xd->qcoeff + ib * 16; const int ref = mbmi->ref_frame != INTRA_FRAME; - const TX_TYPE tx_type = (sb_type == BLOCK_SIZE_MB16X16 && - type == PLANE_TYPE_Y_WITH_DC) ? - get_tx_type(xd, &xd->block[ib]) : DCT_DCT; unsigned int (*token_costs)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS] = mb->token_costs[tx_size][type][ref]; ENTROPY_CONTEXT a_ec, l_ec; @@ -489,23 +485,25 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, } switch (tx_size) { - case TX_4X4: + case TX_4X4: { + const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? + get_tx_type_4x4(xd, ib) : DCT_DCT; a_ec = *a; l_ec = *l; - scan = vp9_default_zig_zag1d_4x4; #if CONFIG_CODE_NONZEROCOUNT nzc_cost = mb->nzc_costs_4x4[nzc_context][ref][type]; #else seg_eob = 16; #endif - if (type == PLANE_TYPE_Y_WITH_DC) { - if (tx_type == ADST_DCT) { - scan = vp9_row_scan_4x4; - } else if (tx_type == DCT_ADST) { - scan = vp9_col_scan_4x4; - } + if (tx_type == ADST_DCT) { + scan = vp9_row_scan_4x4; + } else if (tx_type == DCT_ADST) { + scan = vp9_col_scan_4x4; + } else { + scan = vp9_default_zig_zag1d_4x4; } break; + } case TX_8X8: a_ec = (a[0] + a[1]) != 0; l_ec = (l[0] + l[1]) != 0; @@ -611,24 +609,16 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb, return cost; } -static int rdcost_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) { +static int rdcost_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *mb) { int cost = 0; int b; MACROBLOCKD *xd = &mb->e_mbd; ENTROPY_CONTEXT_PLANES t_above, t_left; - ENTROPY_CONTEXT *ta; - ENTROPY_CONTEXT *tl; - - if (backup) { - vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES)); - vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above; + ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left; - ta = (ENTROPY_CONTEXT *)&t_above; - tl = (ENTROPY_CONTEXT *)&t_left; - } else { - ta = (ENTROPY_CONTEXT *)xd->above_context; - tl = (ENTROPY_CONTEXT *)xd->left_context; - } + vpx_memcpy(&t_above, xd->above_context, sizeof(t_above)); + vpx_memcpy(&t_left, xd->left_context, sizeof(t_left)); for (b = 0; b < 16; b++) cost += cost_coeffs(cm, mb, b, PLANE_TYPE_Y_WITH_DC, @@ -641,38 +631,30 @@ static int rdcost_mby_4x4(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) { static void macro_block_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *mb, - int *Rate, - int *Distortion, - int *skippable, int backup) { + int *rate, + int *distortion, + int *skippable) { MACROBLOCKD *const xd = &mb->e_mbd; xd->mode_info_context->mbmi.txfm_size = TX_4X4; vp9_transform_mby_4x4(mb); vp9_quantize_mby_4x4(mb); - *Distortion = vp9_mbblock_error(mb) >> 2; - *Rate = rdcost_mby_4x4(cm, mb, backup); + *distortion = vp9_mbblock_error(mb) >> 2; + *rate = rdcost_mby_4x4(cm, mb); *skippable = vp9_mby_is_skippable_4x4(xd); } -static int rdcost_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) { +static int rdcost_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *mb) { int cost = 0; int b; MACROBLOCKD *xd = &mb->e_mbd; ENTROPY_CONTEXT_PLANES t_above, t_left; - ENTROPY_CONTEXT *ta; - ENTROPY_CONTEXT *tl; - - if (backup) { - vpx_memcpy(&t_above,xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES)); - vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above; + ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left; - ta = (ENTROPY_CONTEXT *)&t_above; - tl = (ENTROPY_CONTEXT *)&t_left; - } else { - ta = (ENTROPY_CONTEXT *)mb->e_mbd.above_context; - tl = (ENTROPY_CONTEXT *)mb->e_mbd.left_context; - } + vpx_memcpy(&t_above, xd->above_context, sizeof(t_above)); + vpx_memcpy(&t_left, xd->left_context, sizeof(t_left)); for (b = 0; b < 16; b += 4) cost += cost_coeffs(cm, mb, b, PLANE_TYPE_Y_WITH_DC, @@ -685,45 +667,35 @@ static int rdcost_mby_8x8(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) { static void macro_block_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *mb, - int *Rate, - int *Distortion, - int *skippable, int backup) { + int *rate, + int *distortion, + int *skippable) { MACROBLOCKD *const xd = &mb->e_mbd; xd->mode_info_context->mbmi.txfm_size = TX_8X8; vp9_transform_mby_8x8(mb); vp9_quantize_mby_8x8(mb); - *Distortion = vp9_mbblock_error(mb) >> 2; - *Rate = rdcost_mby_8x8(cm, mb, backup); + *distortion = vp9_mbblock_error(mb) >> 2; + *rate = rdcost_mby_8x8(cm, mb); *skippable = vp9_mby_is_skippable_8x8(xd); } -static int rdcost_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *mb, int backup) { - int cost; - MACROBLOCKD *xd = &mb->e_mbd; +static int rdcost_mby_16x16(VP9_COMMON *const cm, MACROBLOCK *mb) { + MACROBLOCKD *const xd = &mb->e_mbd; ENTROPY_CONTEXT_PLANES t_above, t_left; - ENTROPY_CONTEXT *ta, *tl; - - if (backup) { - vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES)); - vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES)); + ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *)&t_above; + ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *)&t_left; - ta = (ENTROPY_CONTEXT *)&t_above; - tl = (ENTROPY_CONTEXT *)&t_left; - } else { - ta = (ENTROPY_CONTEXT *)xd->above_context; - tl = (ENTROPY_CONTEXT *)xd->left_context; - } + vpx_memcpy(&t_above, xd->above_context, sizeof(t_above)); + vpx_memcpy(&t_left, xd->left_context, sizeof(t_left)); - cost = cost_coeffs(cm, mb, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16); - return cost; + return cost_coeffs(cm, mb, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_16X16); } static void macro_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *mb, - int *Rate, int *Distortion, - int *skippable, int backup) { - MACROBLOCKD *xd = &mb->e_mbd; + int *rate, int *distortion, int *skippable) { + MACROBLOCKD *const xd = &mb->e_mbd; xd->mode_info_context->mbmi.txfm_size = TX_16X16; vp9_transform_mby_16x16(mb); @@ -735,8 +707,8 @@ static void macro_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *mb, xd->mode_info_context->mbmi.mode < I8X8_PRED) vp9_optimize_mby_16x16(cm, mb); - *Distortion = vp9_mbblock_error(mb) >> 2; - *Rate = rdcost_mby_16x16(cm, mb, backup); + *distortion = vp9_mbblock_error(mb) >> 2; + *rate = rdcost_mby_16x16(cm, mb); *skippable = vp9_mby_is_skippable_16x16(xd); } @@ -835,9 +807,9 @@ static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, vp9_subtract_mby(x->src_diff, *(x->block[0].base_src), xd->predictor, x->block[0].src_stride); - macro_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16], 1); - macro_block_yrd_8x8(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8], 1); - macro_block_yrd_4x4(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4], 1); + macro_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]); + macro_block_yrd_8x8(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]); + macro_block_yrd_4x4(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]); choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skippable, txfm_cache, TX_16X16); @@ -852,27 +824,8 @@ static void copy_predictor(uint8_t *dst, const uint8_t *predictor) { d[12] = p[12]; } -static int rdcost_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x, int backup) { - MACROBLOCKD * xd = &x->e_mbd; - ENTROPY_CONTEXT_PLANES t_above[2], t_left[2]; - ENTROPY_CONTEXT *ta, *tl; - - if (backup) { - ta = (ENTROPY_CONTEXT *) &t_above, - tl = (ENTROPY_CONTEXT *) &t_left; - - vpx_memcpy(&t_above, xd->above_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2); - vpx_memcpy(&t_left, xd->left_context, sizeof(ENTROPY_CONTEXT_PLANES) * 2); - } else { - ta = (ENTROPY_CONTEXT *) xd->above_context; - tl = (ENTROPY_CONTEXT *) xd->left_context; - } - - return cost_coeffs(cm, x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_32X32); -} - static int vp9_sb_block_error_c(int16_t *coeff, int16_t *dqcoeff, - int block_size) { + int block_size, int shift) { int i; int64_t error = 0; @@ -880,33 +833,126 @@ static int vp9_sb_block_error_c(int16_t *coeff, int16_t *dqcoeff, unsigned int this_diff = coeff[i] - dqcoeff[i]; error += this_diff * this_diff; } + error >>= shift; return error > INT_MAX ? INT_MAX : (int)error; } -#define DEBUG_ERROR 0 +static int rdcost_sby_4x4(VP9_COMMON *const cm, MACROBLOCK *x) { + int cost = 0, b; + MACROBLOCKD *const xd = &x->e_mbd; + ENTROPY_CONTEXT_PLANES t_above[2], t_left[2]; + ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above; + ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left; + + vpx_memcpy(&t_above, xd->above_context, sizeof(t_above)); + vpx_memcpy(&t_left, xd->left_context, sizeof(t_left)); + + for (b = 0; b < 64; b++) + cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC, + ta + vp9_block2above_sb[TX_4X4][b], + tl + vp9_block2left_sb[TX_4X4][b], TX_4X4); + + return cost; +} + +static void super_block_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x, + int *rate, int *distortion, int *skippable) { + MACROBLOCKD *const xd = &x->e_mbd; + + xd->mode_info_context->mbmi.txfm_size = TX_4X4; + vp9_transform_sby_4x4(x); + vp9_quantize_sby_4x4(x); + + *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2); + *rate = rdcost_sby_4x4(cm, x); + *skippable = vp9_sby_is_skippable_4x4(xd); +} + +static int rdcost_sby_8x8(VP9_COMMON *const cm, MACROBLOCK *x) { + int cost = 0, b; + MACROBLOCKD *const xd = &x->e_mbd; + ENTROPY_CONTEXT_PLANES t_above[2], t_left[2]; + ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above; + ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left; + + vpx_memcpy(&t_above, xd->above_context, sizeof(t_above)); + vpx_memcpy(&t_left, xd->left_context, sizeof(t_left)); + + for (b = 0; b < 64; b += 4) + cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC, + ta + vp9_block2above_sb[TX_8X8][b], + tl + vp9_block2left_sb[TX_8X8][b], TX_8X8); + + return cost; +} + +static void super_block_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x, + int *rate, int *distortion, int *skippable) { + MACROBLOCKD *const xd = &x->e_mbd; + + xd->mode_info_context->mbmi.txfm_size = TX_8X8; + vp9_transform_sby_8x8(x); + vp9_quantize_sby_8x8(x); + + *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2); + *rate = rdcost_sby_8x8(cm, x); + *skippable = vp9_sby_is_skippable_8x8(xd); +} + +static int rdcost_sby_16x16(VP9_COMMON *const cm, MACROBLOCK *x) { + int cost = 0, b; + MACROBLOCKD *const xd = &x->e_mbd; + ENTROPY_CONTEXT_PLANES t_above[2], t_left[2]; + ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above; + ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left; + + vpx_memcpy(&t_above, xd->above_context, sizeof(t_above)); + vpx_memcpy(&t_left, xd->left_context, sizeof(t_left)); + + for (b = 0; b < 64; b += 16) + cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC, + ta + vp9_block2above_sb[TX_16X16][b], + tl + vp9_block2left_sb[TX_16X16][b], TX_16X16); + + return cost; +} + +static void super_block_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x, + int *rate, int *distortion, int *skippable) { + MACROBLOCKD *const xd = &x->e_mbd; + + xd->mode_info_context->mbmi.txfm_size = TX_16X16; + vp9_transform_sby_16x16(x); + vp9_quantize_sby_16x16(x); + + *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 2); + *rate = rdcost_sby_16x16(cm, x); + *skippable = vp9_sby_is_skippable_16x16(xd); +} + +static int rdcost_sby_32x32(VP9_COMMON *const cm, MACROBLOCK *x) { + MACROBLOCKD * const xd = &x->e_mbd; + ENTROPY_CONTEXT_PLANES t_above[2], t_left[2]; + ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above; + ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left; + + vpx_memcpy(&t_above, xd->above_context, sizeof(t_above)); + vpx_memcpy(&t_left, xd->left_context, sizeof(t_left)); + + return cost_coeffs(cm, x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, TX_32X32); +} + static void super_block_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x, - int *rate, int *distortion, int *skippable, - int backup) { + int *rate, int *distortion, int *skippable) { MACROBLOCKD *const xd = &x->e_mbd; -#if DEBUG_ERROR - int16_t out[1024]; -#endif - xd->mode_info_context->mbmi.txfm_size = TX_32X32; + xd->mode_info_context->mbmi.txfm_size = TX_32X32; vp9_transform_sby_32x32(x); vp9_quantize_sby_32x32(x); -#if DEBUG_ERROR - vp9_short_idct32x32(xd->dqcoeff, out, 64); -#endif - - *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024); -#if DEBUG_ERROR - printf("IDCT/FDCT error 32x32: %d (d: %d)\n", - vp9_block_error_c(x->src_diff, out, 1024), *distortion); -#endif - *rate = rdcost_sby_32x32(cm, x, backup); + *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 1024, 0); + *rate = rdcost_sby_32x32(cm, x); *skippable = vp9_sby_is_skippable_32x32(xd); } @@ -914,180 +960,166 @@ static void super_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *distortion, int *skip, int64_t txfm_cache[NB_TXFM_MODES]) { + VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB], n; + int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB]; const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer; int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride; - ENTROPY_CONTEXT_PLANES t_above[TX_SIZE_MAX_MB][2], - *orig_above = xd->above_context; - ENTROPY_CONTEXT_PLANES t_left[TX_SIZE_MAX_MB][2], - *orig_left = xd->left_context; - - for (n = TX_4X4; n < TX_SIZE_MAX_MB; n++) { - vpx_memcpy(t_above[n], xd->above_context, sizeof(t_above[n])); - vpx_memcpy(t_left[n], xd->left_context, sizeof(t_left[n])); - r[n][0] = 0; - d[n] = 0; - s[n] = 1; - } - vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride, - dst, dst_y_stride); - super_block_yrd_32x32(&cpi->common, x, - &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32], 1); + vp9_subtract_sby_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride); + super_block_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]); + super_block_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]); + super_block_yrd_8x8(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]); + super_block_yrd_4x4(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]); -#if DEBUG_ERROR - int err[3] = { 0, 0, 0 }; -#endif - for (n = 0; n < 4; n++) { - int x_idx = n & 1, y_idx = n >> 1; - int r_tmp, d_tmp, s_tmp; - - vp9_subtract_mby_s_c(x->src_diff, - src + x_idx * 16 + y_idx * 16 * src_y_stride, - src_y_stride, - dst + x_idx * 16 + y_idx * 16 * dst_y_stride, - dst_y_stride); - - xd->above_context = &t_above[TX_16X16][x_idx]; - xd->left_context = &t_left[TX_16X16][y_idx]; - macro_block_yrd_16x16(&cpi->common, x, &r_tmp, &d_tmp, &s_tmp, 0); - d[TX_16X16] += d_tmp; - r[TX_16X16][0] += r_tmp; - s[TX_16X16] = s[TX_16X16] && s_tmp; -#if DEBUG_ERROR - vp9_inverse_transform_mby_16x16(xd); - err[2] += vp9_block_error_c(xd->diff, x->src_diff, 256); -#endif - - xd->above_context = &t_above[TX_4X4][x_idx]; - xd->left_context = &t_left[TX_4X4][y_idx]; - macro_block_yrd_4x4(&cpi->common, x, &r_tmp, &d_tmp, &s_tmp, 0); - d[TX_4X4] += d_tmp; - r[TX_4X4][0] += r_tmp; - s[TX_4X4] = s[TX_4X4] && s_tmp; -#if DEBUG_ERROR - vp9_inverse_transform_mby_4x4(xd); - err[0] += vp9_block_error_c(xd->diff, x->src_diff, 256); -#endif - - xd->above_context = &t_above[TX_8X8][x_idx]; - xd->left_context = &t_left[TX_8X8][y_idx]; - macro_block_yrd_8x8(&cpi->common, x, &r_tmp, &d_tmp, &s_tmp, 0); - d[TX_8X8] += d_tmp; - r[TX_8X8][0] += r_tmp; - s[TX_8X8] = s[TX_8X8] && s_tmp; -#if DEBUG_ERROR - vp9_inverse_transform_mby_8x8(xd); - err[1] += vp9_block_error_c(xd->diff, x->src_diff, 256); -#endif - } -#if DEBUG_ERROR - printf("IDCT/FDCT error 16x16: %d (d: %d)\n", err[2], d[2]); - printf("IDCT/FDCT error 8x8: %d (d: %d)\n", err[1], d[1]); - printf("IDCT/FDCT error 4x4: %d (d: %d)\n", err[0], d[0]); -#endif choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache, TX_SIZE_MAX_SB - 1); +} + +static int rdcost_sb64y_4x4(VP9_COMMON *const cm, MACROBLOCK *x) { + int cost = 0, b; + MACROBLOCKD *const xd = &x->e_mbd; + ENTROPY_CONTEXT_PLANES t_above[4], t_left[4]; + ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above; + ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left; + + vpx_memcpy(&t_above, xd->above_context, sizeof(t_above)); + vpx_memcpy(&t_left, xd->left_context, sizeof(t_left)); + + for (b = 0; b < 256; b++) + cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC, + ta + vp9_block2above_sb64[TX_4X4][b], + tl + vp9_block2left_sb64[TX_4X4][b], TX_4X4); + + return cost; +} + +static void super_block64_yrd_4x4(VP9_COMMON *const cm, MACROBLOCK *x, + int *rate, int *distortion, int *skippable) { + MACROBLOCKD *const xd = &x->e_mbd; + + xd->mode_info_context->mbmi.txfm_size = TX_4X4; + vp9_transform_sb64y_4x4(x); + vp9_quantize_sb64y_4x4(x); + + *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2); + *rate = rdcost_sb64y_4x4(cm, x); + *skippable = vp9_sb64y_is_skippable_4x4(xd); +} + +static int rdcost_sb64y_8x8(VP9_COMMON *const cm, MACROBLOCK *x) { + int cost = 0, b; + MACROBLOCKD *const xd = &x->e_mbd; + ENTROPY_CONTEXT_PLANES t_above[4], t_left[4]; + ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above; + ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left; + + vpx_memcpy(&t_above, xd->above_context, sizeof(t_above)); + vpx_memcpy(&t_left, xd->left_context, sizeof(t_left)); - xd->above_context = orig_above; - xd->left_context = orig_left; + for (b = 0; b < 256; b += 4) + cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC, + ta + vp9_block2above_sb64[TX_8X8][b], + tl + vp9_block2left_sb64[TX_8X8][b], TX_8X8); + + return cost; +} + +static void super_block64_yrd_8x8(VP9_COMMON *const cm, MACROBLOCK *x, + int *rate, int *distortion, int *skippable) { + MACROBLOCKD *const xd = &x->e_mbd; + + xd->mode_info_context->mbmi.txfm_size = TX_8X8; + vp9_transform_sb64y_8x8(x); + vp9_quantize_sb64y_8x8(x); + + *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2); + *rate = rdcost_sb64y_8x8(cm, x); + *skippable = vp9_sb64y_is_skippable_8x8(xd); +} + +static int rdcost_sb64y_16x16(VP9_COMMON *const cm, MACROBLOCK *x) { + int cost = 0, b; + MACROBLOCKD *const xd = &x->e_mbd; + ENTROPY_CONTEXT_PLANES t_above[4], t_left[4]; + ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above; + ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left; + + vpx_memcpy(&t_above, xd->above_context, sizeof(t_above)); + vpx_memcpy(&t_left, xd->left_context, sizeof(t_left)); + + for (b = 0; b < 256; b += 16) + cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC, + ta + vp9_block2above_sb64[TX_16X16][b], + tl + vp9_block2left_sb64[TX_16X16][b], TX_16X16); + + return cost; +} + +static void super_block64_yrd_16x16(VP9_COMMON *const cm, MACROBLOCK *x, + int *rate, int *distortion, + int *skippable) { + MACROBLOCKD *const xd = &x->e_mbd; + + xd->mode_info_context->mbmi.txfm_size = TX_16X16; + vp9_transform_sb64y_16x16(x); + vp9_quantize_sb64y_16x16(x); + + *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 2); + *rate = rdcost_sb64y_16x16(cm, x); + *skippable = vp9_sb64y_is_skippable_16x16(xd); +} + +static int rdcost_sb64y_32x32(VP9_COMMON *const cm, MACROBLOCK *x) { + int cost = 0, b; + MACROBLOCKD * const xd = &x->e_mbd; + ENTROPY_CONTEXT_PLANES t_above[4], t_left[4]; + ENTROPY_CONTEXT *ta = (ENTROPY_CONTEXT *) &t_above; + ENTROPY_CONTEXT *tl = (ENTROPY_CONTEXT *) &t_left; + + vpx_memcpy(&t_above, xd->above_context, sizeof(t_above)); + vpx_memcpy(&t_left, xd->left_context, sizeof(t_left)); + + for (b = 0; b < 256; b += 64) + cost += cost_coeffs(cm, x, b, PLANE_TYPE_Y_WITH_DC, + ta + vp9_block2above_sb64[TX_32X32][b], + tl + vp9_block2left_sb64[TX_32X32][b], TX_32X32); + + return cost; +} + +static void super_block64_yrd_32x32(VP9_COMMON *const cm, MACROBLOCK *x, + int *rate, int *distortion, + int *skippable) { + MACROBLOCKD *const xd = &x->e_mbd; + + xd->mode_info_context->mbmi.txfm_size = TX_32X32; + vp9_transform_sb64y_32x32(x); + vp9_quantize_sb64y_32x32(x); + + *distortion = vp9_sb_block_error_c(x->coeff, xd->dqcoeff, 4096, 0); + *rate = rdcost_sb64y_32x32(cm, x); + *skippable = vp9_sb64y_is_skippable_32x32(xd); } static void super_block_64_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int *distortion, int *skip, int64_t txfm_cache[NB_TXFM_MODES]) { + VP9_COMMON *const cm = &cpi->common; MACROBLOCKD *const xd = &x->e_mbd; - int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB], n; + int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB]; const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer; int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride; - ENTROPY_CONTEXT_PLANES t_above[TX_SIZE_MAX_SB][4], - *orig_above = xd->above_context; - ENTROPY_CONTEXT_PLANES t_left[TX_SIZE_MAX_SB][4], - *orig_left = xd->left_context; - - for (n = TX_4X4; n < TX_SIZE_MAX_SB; n++) { - vpx_memcpy(t_above[n], xd->above_context, sizeof(t_above[n])); - vpx_memcpy(t_left[n], xd->left_context, sizeof(t_left[n])); - r[n][0] = 0; - d[n] = 0; - s[n] = 1; - } - for (n = 0; n < 4; n++) { - int x_idx = n & 1, y_idx = n >> 1; - int r_tmp, d_tmp, s_tmp; - - xd->above_context = &t_above[TX_32X32][x_idx << 1]; - xd->left_context = &t_left[TX_32X32][y_idx << 1]; - vp9_subtract_sby_s_c(x->src_diff, - src + 32 * x_idx + 32 * y_idx * src_y_stride, - src_y_stride, - dst + 32 * x_idx + 32 * y_idx * dst_y_stride, - dst_y_stride); - super_block_yrd_32x32(&cpi->common, x, &r_tmp, &d_tmp, &s_tmp, 0); - r[TX_32X32][0] += r_tmp; - d[TX_32X32] += d_tmp; - s[TX_32X32] = s[TX_32X32] && s_tmp; - } - -#if DEBUG_ERROR - int err[3] = { 0, 0, 0 }; -#endif - for (n = 0; n < 16; n++) { - int x_idx = n & 3, y_idx = n >> 2; - int r_tmp, d_tmp, s_tmp; - - vp9_subtract_mby_s_c(x->src_diff, - src + x_idx * 16 + y_idx * 16 * src_y_stride, - src_y_stride, - dst + x_idx * 16 + y_idx * 16 * dst_y_stride, - dst_y_stride); - - xd->above_context = &t_above[TX_16X16][x_idx]; - xd->left_context = &t_left[TX_16X16][y_idx]; - macro_block_yrd_16x16(&cpi->common, x, &r_tmp, &d_tmp, &s_tmp, 0); - d[TX_16X16] += d_tmp; - r[TX_16X16][0] += r_tmp; - s[TX_16X16] = s[TX_16X16] && s_tmp; -#if DEBUG_ERROR - vp9_inverse_transform_mby_16x16(xd); - err[2] += vp9_block_error_c(xd->diff, x->src_diff, 256); -#endif - - xd->above_context = &t_above[TX_4X4][x_idx]; - xd->left_context = &t_left[TX_4X4][y_idx]; - macro_block_yrd_4x4(&cpi->common, x, &r_tmp, &d_tmp, &s_tmp, 0); - d[TX_4X4] += d_tmp; - r[TX_4X4][0] += r_tmp; - s[TX_4X4] = s[TX_4X4] && s_tmp; -#if DEBUG_ERROR - vp9_inverse_transform_mby_4x4(xd); - err[0] += vp9_block_error_c(xd->diff, x->src_diff, 256); -#endif + vp9_subtract_sb64y_s_c(x->src_diff, src, src_y_stride, dst, dst_y_stride); + super_block64_yrd_32x32(cm, x, &r[TX_32X32][0], &d[TX_32X32], &s[TX_32X32]); + super_block64_yrd_16x16(cm, x, &r[TX_16X16][0], &d[TX_16X16], &s[TX_16X16]); + super_block64_yrd_8x8(cm, x, &r[TX_8X8][0], &d[TX_8X8], &s[TX_8X8]); + super_block64_yrd_4x4(cm, x, &r[TX_4X4][0], &d[TX_4X4], &s[TX_4X4]); - xd->above_context = &t_above[TX_8X8][x_idx]; - xd->left_context = &t_left[TX_8X8][y_idx]; - macro_block_yrd_8x8(&cpi->common, x, &r_tmp, &d_tmp, &s_tmp, 0); - d[TX_8X8] += d_tmp; - r[TX_8X8][0] += r_tmp; - s[TX_8X8] = s[TX_8X8] && s_tmp; -#if DEBUG_ERROR - vp9_inverse_transform_mby_8x8(xd); - err[1] += vp9_block_error_c(xd->diff, x->src_diff, 256); -#endif - } -#if DEBUG_ERROR - printf("IDCT/FDCT error 16x16: %d (d: %d)\n", err[2], d[2]); - printf("IDCT/FDCT error 8x8: %d (d: %d)\n", err[1], d[1]); - printf("IDCT/FDCT error 4x4: %d (d: %d)\n", err[0], d[0]); -#endif choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skip, txfm_cache, TX_SIZE_MAX_SB - 1); - - xd->above_context = orig_above; - xd->left_context = orig_left; } static void copy_predictor_8x8(uint8_t *dst, const uint8_t *predictor) { @@ -1166,7 +1198,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be, vp9_subtract_b(be, b, 16); b->bmi.as_mode.first = mode; - tx_type = get_tx_type_4x4(xd, b); + tx_type = get_tx_type_4x4(xd, be - x->block); if (tx_type != DCT_DCT) { vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type); vp9_ht_quantize_b_4x4(x, be - x->block, tx_type); @@ -1465,7 +1497,7 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib, vp9_subtract_4b_c(be, b, 16); if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) { - TX_TYPE tx_type = get_tx_type_8x8(xd, b); + TX_TYPE tx_type = get_tx_type_8x8(xd, ib); if (tx_type != DCT_DCT) vp9_short_fht8x8(be->src_diff, (x->block + idx)->coeff, 16, tx_type); else @@ -1504,11 +1536,12 @@ static int64_t rd_pick_intra8x8block(VP9_COMP *cpi, MACROBLOCK *x, int ib, int do_two = 0; b = &xd->block[ib + iblock[i]]; be = &x->block[ib + iblock[i]]; - tx_type = get_tx_type_4x4(xd, b); + tx_type = get_tx_type_4x4(xd, ib + iblock[i]); if (tx_type != DCT_DCT) { vp9_short_fht4x4(be->src_diff, be->coeff, 16, tx_type); vp9_ht_quantize_b_4x4(x, ib + iblock[i], tx_type); - } else if (!(i & 1) && get_tx_type_4x4(xd, b + 1) == DCT_DCT) { + } else if (!(i & 1) && + get_tx_type_4x4(xd, ib + iblock[i] + 1) == DCT_DCT) { x->fwd_txm8x4(be->src_diff, be->coeff, 32); x->quantize_b_4x4_pair(x, ib + iblock[i], ib + iblock[i] + 1); do_two = 1; @@ -1725,8 +1758,8 @@ static void rd_inter32x32_uv_16x16(VP9_COMMON *const cm, MACROBLOCK *x, vp9_quantize_sbuv_16x16(x); *rate = rd_cost_sbuv_16x16(cm, x, backup); - *distortion = vp9_block_error_c(x->coeff + 1024, - xd->dqcoeff + 1024, 512) >> 2; + *distortion = vp9_sb_block_error_c(x->coeff + 1024, + xd->dqcoeff + 1024, 512, 2); *skip = vp9_sbuv_is_skippable_16x16(xd); } @@ -2001,8 +2034,8 @@ static void rd_inter64x64_uv_32x32(VP9_COMMON *const cm, MACROBLOCK *x, vp9_quantize_sb64uv_32x32(x); *rate = rd_cost_sb64uv_32x32(cm, x, backup); - *distortion = vp9_block_error_c(x->coeff + 4096, - xd->dqcoeff + 4096, 2048); + *distortion = vp9_sb_block_error_c(x->coeff + 4096, + xd->dqcoeff + 4096, 2048, 0); *skip = vp9_sb64uv_is_skippable_32x32(xd); } @@ -3472,6 +3505,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, vp9_clamp_mv_min_max(x, &ref_mv[0]); + sr = vp9_init_search_range(cpi->common.Width, cpi->common.Height); + // mvp_full.as_int = ref_mv[0].as_int; mvp_full.as_int = mbmi->ref_mvs[refs[0]][x->mv_best_ref_index[refs[0]]].as_int; diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c index 0fad9b032..370d53fc6 100644 --- a/vp9/encoder/vp9_tokenize.c +++ b/vp9/encoder/vp9_tokenize.c @@ -114,9 +114,6 @@ static void tokenize_b(VP9_COMP *cpi, const int *scan; vp9_coeff_count *counts; vp9_coeff_probs *probs; - const TX_TYPE tx_type = (sb_type == BLOCK_SIZE_MB16X16 && - type == PLANE_TYPE_Y_WITH_DC) ? - get_tx_type(xd, &xd->block[ib]) : DCT_DCT; const int ref = mbmi->ref_frame != INTRA_FRAME; ENTROPY_CONTEXT *a, *l, *a1, *l1, *a2, *l2, *a3, *l3, a_ec, l_ec; #if CONFIG_CODE_NONZEROCOUNT @@ -149,7 +146,9 @@ static void tokenize_b(VP9_COMP *cpi, switch (tx_size) { default: - case TX_4X4: + case TX_4X4: { + const TX_TYPE tx_type = (type == PLANE_TYPE_Y_WITH_DC) ? + get_tx_type_4x4(xd, ib) : DCT_DCT; a_ec = *a; l_ec = *l; seg_eob = 16; @@ -164,6 +163,7 @@ static void tokenize_b(VP9_COMP *cpi, counts = cpi->coef_counts_4x4; probs = cpi->common.fc.coef_probs_4x4; break; + } case TX_8X8: a_ec = (a[0] + a[1]) != 0; l_ec = (l[0] + l[1]) != 0; @@ -354,7 +354,7 @@ static int sb_is_skippable_32x32(MACROBLOCKD *xd) { vp9_sbuv_is_skippable_16x16(xd); } -static int sby_is_skippable_16x16(MACROBLOCKD *xd) { +int vp9_sby_is_skippable_16x16(MACROBLOCKD *xd) { int skip = 1; int i = 0; @@ -365,10 +365,10 @@ static int sby_is_skippable_16x16(MACROBLOCKD *xd) { } static int sb_is_skippable_16x16(MACROBLOCKD *xd) { - return sby_is_skippable_16x16(xd) & vp9_sbuv_is_skippable_16x16(xd); + return vp9_sby_is_skippable_16x16(xd) & vp9_sbuv_is_skippable_16x16(xd); } -static int sby_is_skippable_8x8(MACROBLOCKD *xd) { +int vp9_sby_is_skippable_8x8(MACROBLOCKD *xd) { int skip = 1; int i = 0; @@ -378,7 +378,7 @@ static int sby_is_skippable_8x8(MACROBLOCKD *xd) { return skip; } -static int sbuv_is_skippable_8x8(MACROBLOCKD *xd) { +int vp9_sbuv_is_skippable_8x8(MACROBLOCKD *xd) { int skip = 1; int i = 0; @@ -389,10 +389,10 @@ static int sbuv_is_skippable_8x8(MACROBLOCKD *xd) { } static int sb_is_skippable_8x8(MACROBLOCKD *xd) { - return sby_is_skippable_8x8(xd) & sbuv_is_skippable_8x8(xd); + return vp9_sby_is_skippable_8x8(xd) & vp9_sbuv_is_skippable_8x8(xd); } -static int sby_is_skippable_4x4(MACROBLOCKD *xd) { +int vp9_sby_is_skippable_4x4(MACROBLOCKD *xd) { int skip = 1; int i = 0; @@ -402,7 +402,7 @@ static int sby_is_skippable_4x4(MACROBLOCKD *xd) { return skip; } -static int sbuv_is_skippable_4x4(MACROBLOCKD *xd) { +int vp9_sbuv_is_skippable_4x4(MACROBLOCKD *xd) { int skip = 1; int i = 0; @@ -413,7 +413,7 @@ static int sbuv_is_skippable_4x4(MACROBLOCKD *xd) { } static int sb_is_skippable_4x4(MACROBLOCKD *xd) { - return sby_is_skippable_4x4(xd) & sbuv_is_skippable_4x4(xd); + return vp9_sby_is_skippable_4x4(xd) & vp9_sbuv_is_skippable_4x4(xd); } void vp9_tokenize_sb(VP9_COMP *cpi, @@ -499,7 +499,7 @@ void vp9_tokenize_sb(VP9_COMP *cpi, *t = t_backup; } -static int sb64y_is_skippable_32x32(MACROBLOCKD *xd) { +int vp9_sb64y_is_skippable_32x32(MACROBLOCKD *xd) { int skip = 1; int i = 0; @@ -514,10 +514,10 @@ int vp9_sb64uv_is_skippable_32x32(MACROBLOCKD *xd) { } static int sb64_is_skippable_32x32(MACROBLOCKD *xd) { - return sb64y_is_skippable_32x32(xd) & vp9_sb64uv_is_skippable_32x32(xd); + return vp9_sb64y_is_skippable_32x32(xd) & vp9_sb64uv_is_skippable_32x32(xd); } -static int sb64y_is_skippable_16x16(MACROBLOCKD *xd) { +int vp9_sb64y_is_skippable_16x16(MACROBLOCKD *xd) { int skip = 1; int i = 0; @@ -527,7 +527,7 @@ static int sb64y_is_skippable_16x16(MACROBLOCKD *xd) { return skip; } -static int sb64uv_is_skippable_16x16(MACROBLOCKD *xd) { +int vp9_sb64uv_is_skippable_16x16(MACROBLOCKD *xd) { int skip = 1; int i = 0; @@ -538,10 +538,10 @@ static int sb64uv_is_skippable_16x16(MACROBLOCKD *xd) { } static int sb64_is_skippable_16x16(MACROBLOCKD *xd) { - return sb64y_is_skippable_16x16(xd) & sb64uv_is_skippable_16x16(xd); + return vp9_sb64y_is_skippable_16x16(xd) & vp9_sb64uv_is_skippable_16x16(xd); } -static int sb64y_is_skippable_8x8(MACROBLOCKD *xd) { +int vp9_sb64y_is_skippable_8x8(MACROBLOCKD *xd) { int skip = 1; int i = 0; @@ -551,7 +551,7 @@ static int sb64y_is_skippable_8x8(MACROBLOCKD *xd) { return skip; } -static int sb64uv_is_skippable_8x8(MACROBLOCKD *xd) { +int vp9_sb64uv_is_skippable_8x8(MACROBLOCKD *xd) { int skip = 1; int i = 0; @@ -562,10 +562,10 @@ static int sb64uv_is_skippable_8x8(MACROBLOCKD *xd) { } static int sb64_is_skippable_8x8(MACROBLOCKD *xd) { - return sb64y_is_skippable_8x8(xd) & sb64uv_is_skippable_8x8(xd); + return vp9_sb64y_is_skippable_8x8(xd) & vp9_sb64uv_is_skippable_8x8(xd); } -static int sb64y_is_skippable_4x4(MACROBLOCKD *xd) { +int vp9_sb64y_is_skippable_4x4(MACROBLOCKD *xd) { int skip = 1; int i = 0; @@ -575,7 +575,7 @@ static int sb64y_is_skippable_4x4(MACROBLOCKD *xd) { return skip; } -static int sb64uv_is_skippable_4x4(MACROBLOCKD *xd) { +int vp9_sb64uv_is_skippable_4x4(MACROBLOCKD *xd) { int skip = 1; int i = 0; @@ -586,7 +586,7 @@ static int sb64uv_is_skippable_4x4(MACROBLOCKD *xd) { } static int sb64_is_skippable_4x4(MACROBLOCKD *xd) { - return sb64y_is_skippable_4x4(xd) & sb64uv_is_skippable_4x4(xd); + return vp9_sb64y_is_skippable_4x4(xd) & vp9_sb64uv_is_skippable_4x4(xd); } void vp9_tokenize_sb64(VP9_COMP *cpi, diff --git a/vp9/encoder/vp9_tokenize.h b/vp9/encoder/vp9_tokenize.h index 4d6fe6343..464d7caf6 100644 --- a/vp9/encoder/vp9_tokenize.h +++ b/vp9/encoder/vp9_tokenize.h @@ -37,8 +37,20 @@ int vp9_mby_is_skippable_8x8(MACROBLOCKD *xd); int vp9_mbuv_is_skippable_8x8(MACROBLOCKD *xd); int vp9_mby_is_skippable_16x16(MACROBLOCKD *xd); int vp9_sby_is_skippable_32x32(MACROBLOCKD *xd); +int vp9_sby_is_skippable_16x16(MACROBLOCKD *xd); +int vp9_sby_is_skippable_8x8(MACROBLOCKD *xd); +int vp9_sby_is_skippable_4x4(MACROBLOCKD *xd); int vp9_sbuv_is_skippable_16x16(MACROBLOCKD *xd); +int vp9_sbuv_is_skippable_8x8(MACROBLOCKD *xd); +int vp9_sbuv_is_skippable_4x4(MACROBLOCKD *xd); +int vp9_sb64y_is_skippable_32x32(MACROBLOCKD *xd); +int vp9_sb64y_is_skippable_16x16(MACROBLOCKD *xd); +int vp9_sb64y_is_skippable_8x8(MACROBLOCKD *xd); +int vp9_sb64y_is_skippable_4x4(MACROBLOCKD *xd); int vp9_sb64uv_is_skippable_32x32(MACROBLOCKD *xd); +int vp9_sb64uv_is_skippable_16x16(MACROBLOCKD *xd); +int vp9_sb64uv_is_skippable_8x8(MACROBLOCKD *xd); +int vp9_sb64uv_is_skippable_4x4(MACROBLOCKD *xd); struct VP9_COMP; |