diff options
Diffstat (limited to 'vp9')
24 files changed, 1179 insertions, 383 deletions
diff --git a/vp9/common/ppc/vp9_idctllm_altivec.asm b/vp9/common/ppc/vp9_idct_altivec.asm index 117d9cfc8..b87aa4200 100644 --- a/vp9/common/ppc/vp9_idctllm_altivec.asm +++ b/vp9/common/ppc/vp9_idct_altivec.asm @@ -9,7 +9,7 @@ ; - .globl short_idct4x4llm_ppc + .globl short_idct4x4_ppc .macro load_c V, LABEL, OFF, R0, R1 lis \R0, \LABEL@ha @@ -21,7 +21,7 @@ ;# r4 short *output ;# r5 int pitch .align 2 -short_idct4x4llm_ppc: +short_idct4x4_ppc: mfspr r11, 256 ;# get old VRSAVE oris r12, r11, 0xfff8 mtspr 256, r12 ;# set VRSAVE diff --git a/vp9/common/ppc/vp9_systemdependent.c b/vp9/common/ppc/vp9_systemdependent.c index 02035191f..ac13722d4 100644 --- a/vp9/common/ppc/vp9_systemdependent.c +++ b/vp9/common/ppc/vp9_systemdependent.c @@ -63,7 +63,7 @@ void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_pt void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); -extern void short_idct4x4llm_ppc(short *input, short *output, int pitch); +extern void short_idct4x4_ppc(short *input, short *output, int pitch); // Generic C extern subpixel_predict_function vp9_sixtap_predict_c; @@ -83,8 +83,8 @@ void vp9_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ void vp9_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); void vp9_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride); -extern void vp9_short_idct4x4llm_1_c(short *input, short *output, int pitch); -extern void vp9_short_idct4x4llm_c(short *input, short *output, int pitch); +extern void vp9_short_idct4x4_1_c(short *input, short *output, int pitch); +extern void vp9_short_idct4x4_c(short *input, short *output, int pitch); extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch); // PPC @@ -139,8 +139,8 @@ void vp9_machine_specific_config(void) { 
vp9_sixtap_predict8x4 = sixtap_predict8x4_ppc; vp9_sixtap_predict = sixtap_predict_ppc; - vp8_short_idct4x4_1 = vp9_short_idct4x4llm_1_c; - vp8_short_idct4x4 = short_idct4x4llm_ppc; + vp8_short_idct4x4_1 = vp9_short_idct4x4_1_c; + vp8_short_idct4x4 = short_idct4x4_ppc; vp8_dc_only_idct = vp8_dc_only_idct_c; vp8_lf_mbvfull = loop_filter_mbv_ppc; diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c index 9a7be4578..89dea4edc 100644 --- a/vp9/common/vp9_entropymv.c +++ b/vp9/common/vp9_entropymv.c @@ -42,9 +42,10 @@ const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2] = { -MV_CLASS_2, -MV_CLASS_3, 10, 12, -MV_CLASS_4, -MV_CLASS_5, - 14, 16, - -MV_CLASS_6, -MV_CLASS_7, - -MV_CLASS_8, -MV_CLASS_9, + -MV_CLASS_6, 14, + 16, 18, + -MV_CLASS_7, -MV_CLASS_8, + -MV_CLASS_9, -MV_CLASS_10, }; struct vp9_token_struct vp9_mv_class_encodings[MV_CLASSES]; @@ -64,24 +65,24 @@ const nmv_context vp9_default_nmv_context = { {32, 64, 96}, { { /* vert component */ - 128, /* sign */ - {224, 144, 192, 168, 192, 176, 192, 198, 198}, /* class */ - {216}, /* class0 */ - {136, 140, 148, 160, 176, 192, 224, 234, 234}, /* bits */ - {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */ - {64, 96, 64}, /* fp */ - 160, /* class0_hp bit */ - 128, /* hp */ + 128, /* sign */ + {224, 144, 192, 168, 192, 176, 192, 198, 198, 245}, /* class */ + {216}, /* class0 */ + {136, 140, 148, 160, 176, 192, 224, 234, 234, 240}, /* bits */ + {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */ + {64, 96, 64}, /* fp */ + 160, /* class0_hp bit */ + 128, /* hp */ }, { /* hor component */ - 128, /* sign */ - {216, 128, 176, 160, 176, 176, 192, 198, 198}, /* class */ - {208}, /* class0 */ - {136, 140, 148, 160, 176, 192, 224, 234, 234}, /* bits */ - {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */ - {64, 96, 64}, /* fp */ - 160, /* class0_hp bit */ - 128, /* hp */ + 128, /* sign */ + {216, 128, 176, 160, 176, 176, 192, 198, 198, 208}, /* class */ + {208}, /* class0 */ + {136, 140, 148, 160, 176, 192, 224, 
234, 234, 240}, /* bits */ + {{128, 128, 64}, {96, 112, 64}}, /* class0_fp */ + {64, 96, 64}, /* fp */ + 160, /* class0_hp bit */ + 128, /* hp */ } }, }; @@ -107,6 +108,7 @@ MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) { else if (z < CLASS0_SIZE * 1024) c = MV_CLASS_7; else if (z < CLASS0_SIZE * 2048) c = MV_CLASS_8; else if (z < CLASS0_SIZE * 4096) c = MV_CLASS_9; + else if (z < CLASS0_SIZE * 8192) c = MV_CLASS_10; else assert(0); if (offset) *offset = z - mv_class_base(c); diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h index 33500069e..162d2b44f 100644 --- a/vp9/common/vp9_entropymv.h +++ b/vp9/common/vp9_entropymv.h @@ -49,7 +49,7 @@ extern const vp9_tree_index vp9_mv_joint_tree[2 * MV_JOINTS - 2]; extern struct vp9_token_struct vp9_mv_joint_encodings [MV_JOINTS]; /* Symbols for coding magnitude class of nonzero components */ -#define MV_CLASSES 10 +#define MV_CLASSES 11 typedef enum { MV_CLASS_0 = 0, /* (0, 2] integer pel */ MV_CLASS_1 = 1, /* (2, 4] integer pel */ @@ -61,6 +61,7 @@ typedef enum { MV_CLASS_7 = 7, /* (128, 256] integer pel */ MV_CLASS_8 = 8, /* (256, 512] integer pel */ MV_CLASS_9 = 9, /* (512, 1024] integer pel */ + MV_CLASS_10 = 10, /* (1024,2048] integer pel */ } MV_CLASS_TYPE; extern const vp9_tree_index vp9_mv_class_tree[2 * MV_CLASSES - 2]; diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c index 8409885a0..f6d6932cc 100644 --- a/vp9/common/vp9_findnearmv.c +++ b/vp9/common/vp9_findnearmv.c @@ -118,10 +118,12 @@ unsigned int vp9_sub_pixel_variance2x16_c(const uint8_t *src_ptr, return vp9_variance2x16_c(temp2, 2, dst_ptr, dst_pixels_per_line, sse); } +#if CONFIG_USESELECTREFMV /* check a list of motion vectors by sad score using a number rows of pixels * above and a number cols of pixels in the left to select the one with best * score to use as ref motion vector */ + void vp9_find_best_ref_mvs(MACROBLOCKD *xd, uint8_t *ref_y_buffer, int ref_y_stride, @@ -298,3 +300,20 @@ void 
vp9_find_best_ref_mvs(MACROBLOCKD *xd, // Copy back the re-ordered mv list vpx_memcpy(mvlist, sorted_mvs, sizeof(sorted_mvs)); } +#else +void vp9_find_best_ref_mvs(MACROBLOCKD *xd, + uint8_t *ref_y_buffer, + int ref_y_stride, + int_mv *mvlist, + int_mv *nearest, + int_mv *near) { + int i; + // Make sure all the candidates are properly clamped etc + for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i) { + lower_mv_precision(&mvlist[i], xd->allow_high_precision_mv); + clamp_mv2(&mvlist[i], xd); + } + *nearest = mvlist[0]; + *near = mvlist[1]; +} +#endif diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idct.c index e2106250f..3ec093f73 100644 --- a/vp9/common/vp9_idctllm.c +++ b/vp9/common/vp9_idct.c @@ -8,20 +8,6 @@ * be found in the AUTHORS file in the root of the source tree. */ - -/**************************************************************************** - * Notes: - * - * This implementation makes use of 16 bit fixed point verio of two multiply - * constants: - * 1. sqrt(2) * cos (pi/8) - * 2. sqrt(2) * sin (pi/8) - * Becuase the first constant is bigger than 1, to maintain the same 16 bit - * fixed point precision as the second one, we use a trick of - * x * a = x + x*(a-1) - * so - * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). 
- **************************************************************************/ #include <assert.h> #include <math.h> @@ -32,7 +18,7 @@ #include "vp9/common/vp9_common.h" #include "vp9/common/vp9_idct.h" -void vp9_short_inv_walsh4x4_x8_c(int16_t *input, int16_t *output, int pitch) { +void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) { int i; int a1, b1, c1, d1; int16_t *ip = input; @@ -73,7 +59,7 @@ void vp9_short_inv_walsh4x4_x8_c(int16_t *input, int16_t *output, int pitch) { } } -void vp9_short_inv_walsh4x4_1_x8_c(int16_t *in, int16_t *out, int pitch) { +void vp9_short_iwalsh4x4_1_c(int16_t *in, int16_t *out, int pitch) { int i; int16_t tmp[4]; int16_t *ip = in; @@ -99,7 +85,7 @@ void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr, int r, c; int16_t dc = input_dc; int16_t tmp[4 * 4]; - vp9_short_inv_walsh4x4_1_x8_c(&dc, tmp, 4 << 1); + vp9_short_iwalsh4x4_1_c(&dc, tmp, 4 << 1); for (r = 0; r < 4; r++) { for (c = 0; c < 4; c++) @@ -130,7 +116,7 @@ void vp9_idct4_1d_c(int16_t *input, int16_t *output) { output[3] = step[0] - step[3]; } -void vp9_short_idct4x4llm_c(int16_t *input, int16_t *output, int pitch) { +void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) { int16_t out[4 * 4]; int16_t *outptr = out; const int half_pitch = pitch >> 1; @@ -156,7 +142,7 @@ void vp9_short_idct4x4llm_c(int16_t *input, int16_t *output, int pitch) { } } -void vp9_short_idct4x4llm_1_c(int16_t *input, int16_t *output, int pitch) { +void vp9_short_idct4x4_1_c(int16_t *input, int16_t *output, int pitch) { int i; int a1; int16_t *op = output; diff --git a/vp9/common/vp9_quant_common.c b/vp9/common/vp9_quant_common.c index 119038121..90eb88ed3 100644 --- a/vp9/common/vp9_quant_common.c +++ b/vp9/common/vp9_quant_common.c @@ -52,21 +52,6 @@ int vp9_dc_quant(int QIndex, int Delta) { return retval; } -int vp9_dc2quant(int QIndex, int Delta) { - int retval; - - QIndex = QIndex + Delta; - - if (QIndex > MAXQ) - QIndex = MAXQ; - else if (QIndex < 
0) - QIndex = 0; - - retval = dc_qlookup[ QIndex ]; - - return retval; - -} int vp9_dc_uv_quant(int QIndex, int Delta) { int retval; @@ -94,22 +79,6 @@ int vp9_ac_yquant(int QIndex) { return retval; } -int vp9_ac2quant(int QIndex, int Delta) { - int retval; - - QIndex = QIndex + Delta; - - if (QIndex > MAXQ) - QIndex = MAXQ; - else if (QIndex < 0) - QIndex = 0; - - retval = (ac_qlookup[ QIndex ] * 775) / 1000; - if (retval < 4) - retval = 4; - - return retval; -} int vp9_ac_uv_quant(int QIndex, int Delta) { int retval; diff --git a/vp9/common/vp9_reconintra.h b/vp9/common/vp9_reconintra.h index 3031fb699..b97b6089d 100644 --- a/vp9/common/vp9_reconintra.h +++ b/vp9/common/vp9_reconintra.h @@ -17,9 +17,10 @@ void vp9_recon_intra_mbuv(MACROBLOCKD *xd); B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr, - int stride, int n); + int stride, int n, + int tx, int ty); -B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x); +B_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, BLOCKD *x); #if CONFIG_COMP_INTERINTRA_PRED void vp9_build_interintra_16x16_predictors_mb(MACROBLOCKD *xd, diff --git a/vp9/common/vp9_reconintra4x4.c b/vp9/common/vp9_reconintra4x4.c index 7fbee7c32..eab5ab495 100644 --- a/vp9/common/vp9_reconintra4x4.c +++ b/vp9/common/vp9_reconintra4x4.c @@ -15,17 +15,17 @@ #include "vp9_rtcd.h" #if CONFIG_NEWBINTRAMODES -static int find_grad_measure(uint8_t *x, int stride, int n, int t, +static int find_grad_measure(uint8_t *x, int stride, int n, int tx, int ty, int dx, int dy) { int i, j; int count = 0, gsum = 0, gdiv; /* TODO: Make this code more efficient by breaking up into two loops */ - for (i = -t; i < n; ++i) - for (j = -t; j < n; ++j) { + for (i = -ty; i < n; ++i) + for (j = -tx; j < n; ++j) { int g; if (i >= 0 && j >= 0) continue; if (i + dy >= 0 && j + dx >= 0) continue; - if (i + dy < -t || i + dy >= n || j + dx < -t || j + dx >= n) continue; + if (i + dy < -ty || i + dy >= n || j + dx < -tx || j + dx >= n) continue; g = abs(x[(i + dy) * 
stride + j + dx] - x[i * stride + j]); gsum += g * g; count++; @@ -36,14 +36,15 @@ static int find_grad_measure(uint8_t *x, int stride, int n, int t, #if CONTEXT_PRED_REPLACEMENTS == 6 B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr, - int stride, int n) { + int stride, int n, + int tx, int ty) { int g[8], i, imin, imax; - g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1); - g[2] = find_grad_measure(ptr, stride, n, 4, 1, 1); - g[3] = find_grad_measure(ptr, stride, n, 4, 1, 2); - g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2); - g[6] = find_grad_measure(ptr, stride, n, 4, -1, 1); - g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1); + g[1] = find_grad_measure(ptr, stride, n, tx, ty, 2, 1); + g[2] = find_grad_measure(ptr, stride, n, tx, ty, 1, 1); + g[3] = find_grad_measure(ptr, stride, n, tx, ty, 1, 2); + g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2); + g[6] = find_grad_measure(ptr, stride, n, tx, ty, -1, 1); + g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1); imin = 1; for (i = 2; i < 8; i += 1 + (i == 3)) imin = (g[i] < g[imin] ? i : imin); @@ -73,12 +74,13 @@ B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr, } #elif CONTEXT_PRED_REPLACEMENTS == 4 B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr, - int stride, int n) { + int stride, int n, + int tx, int ty) { int g[8], i, imin, imax; - g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1); - g[3] = find_grad_measure(ptr, stride, n, 4, 1, 2); - g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2); - g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1); + g[1] = find_grad_measure(ptr, stride, n, tx, ty, 2, 1); + g[3] = find_grad_measure(ptr, stride, n, tx, ty, 1, 2); + g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2); + g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1); imin = 1; for (i = 3; i < 8; i+=2) imin = (g[i] < g[imin] ? 
i : imin); @@ -104,16 +106,17 @@ B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr, } #elif CONTEXT_PRED_REPLACEMENTS == 0 B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr, - int stride, int n) { + int stride, int n, + int tx, int ty) { int g[8], i, imin, imax; - g[0] = find_grad_measure(ptr, stride, n, 4, 1, 0); - g[1] = find_grad_measure(ptr, stride, n, 4, 2, 1); - g[2] = find_grad_measure(ptr, stride, n, 4, 1, 1); - g[3] = find_grad_measure(ptr, stride, n, 4, 1, 2); - g[4] = find_grad_measure(ptr, stride, n, 4, 0, 1); - g[5] = find_grad_measure(ptr, stride, n, 4, -1, 2); - g[6] = find_grad_measure(ptr, stride, n, 4, -1, 1); - g[7] = find_grad_measure(ptr, stride, n, 4, -2, 1); + g[0] = find_grad_measure(ptr, stride, n, tx, ty, 1, 0); + g[1] = find_grad_measure(ptr, stride, n, tx, ty, 2, 1); + g[2] = find_grad_measure(ptr, stride, n, tx, ty, 1, 1); + g[3] = find_grad_measure(ptr, stride, n, tx, ty, 1, 2); + g[4] = find_grad_measure(ptr, stride, n, tx, ty, 0, 1); + g[5] = find_grad_measure(ptr, stride, n, tx, ty, -1, 2); + g[6] = find_grad_measure(ptr, stride, n, tx, ty, -1, 1); + g[7] = find_grad_measure(ptr, stride, n, tx, ty, -2, 1); imax = 0; for (i = 1; i < 8; i++) imax = (g[i] > g[imax] ? i : imax); @@ -144,10 +147,17 @@ B_PREDICTION_MODE vp9_find_dominant_direction(uint8_t *ptr, } #endif -B_PREDICTION_MODE vp9_find_bpred_context(BLOCKD *x) { +B_PREDICTION_MODE vp9_find_bpred_context(MACROBLOCKD *xd, BLOCKD *x) { + const int block_idx = x - xd->block; + const int have_top = (block_idx >> 2) || xd->up_available; + const int have_left = (block_idx & 3) || xd->left_available; uint8_t *ptr = *(x->base_dst) + x->dst; int stride = x->dst_stride; - return vp9_find_dominant_direction(ptr, stride, 4); + int tx = have_left ? 4 : 0; + int ty = have_top ? 
4 : 0; + if (!have_left && !have_top) + return B_DC_PRED; + return vp9_find_dominant_direction(ptr, stride, 4, tx, ty); } #endif diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 04b67b925..0c2a5c94a 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -279,11 +279,11 @@ specialize vp9_convolve8_avg_vert ssse3 # # dct # -prototype void vp9_short_idct4x4llm_1 "int16_t *input, int16_t *output, int pitch" -specialize vp9_short_idct4x4llm_1 +prototype void vp9_short_idct4x4_1 "int16_t *input, int16_t *output, int pitch" +specialize vp9_short_idct4x4_1 -prototype void vp9_short_idct4x4llm "int16_t *input, int16_t *output, int pitch" -specialize vp9_short_idct4x4llm sse2 +prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch" +specialize vp9_short_idct4x4 sse2 prototype void vp9_short_idct8x8 "int16_t *input, int16_t *output, int pitch" specialize vp9_short_idct8x8 @@ -330,10 +330,10 @@ specialize vp9_idct4_1d sse2 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride" specialize vp9_dc_only_idct_add sse2 -prototype void vp9_short_inv_walsh4x4_1_x8 "int16_t *input, int16_t *output, int pitch" -specialize vp9_short_inv_walsh4x4_1_x8 -prototype void vp9_short_inv_walsh4x4_x8 "int16_t *input, int16_t *output, int pitch" -specialize vp9_short_inv_walsh4x4_x8 +prototype void vp9_short_iwalsh4x4_1 "int16_t *input, int16_t *output, int pitch" +specialize vp9_short_iwalsh4x4_1 +prototype void vp9_short_iwalsh4x4 "int16_t *input, int16_t *output, int pitch" +specialize vp9_short_iwalsh4x4 prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride" specialize vp9_dc_only_inv_walsh_add @@ -598,13 +598,13 @@ prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int specialize vp9_short_fdct32x32 prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int 
pitch" -specialize vp9_short_fdct16x16 +specialize vp9_short_fdct16x16 sse2 -prototype void vp9_short_walsh4x4_x8 "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_walsh4x4_x8 +prototype void vp9_short_walsh4x4 "int16_t *InputData, int16_t *OutputData, int pitch" +specialize vp9_short_walsh4x4 -prototype void vp9_short_walsh8x4_x8 "int16_t *InputData, int16_t *OutputData, int pitch" -specialize vp9_short_walsh8x4_x8 +prototype void vp9_short_walsh8x4 "int16_t *InputData, int16_t *OutputData, int pitch" +specialize vp9_short_walsh8x4 # # Motion search diff --git a/vp9/common/x86/vp9_idctllm_sse2.asm b/vp9/common/x86/vp9_idct_sse2.asm index 8f3c6dfc3..8f3c6dfc3 100644 --- a/vp9/common/x86/vp9_idctllm_sse2.asm +++ b/vp9/common/x86/vp9_idct_sse2.asm diff --git a/vp9/common/x86/vp9_idctllm_x86.c b/vp9/common/x86/vp9_idct_x86.c index 3d7a1481c..6a35823bd 100644 --- a/vp9/common/x86/vp9_idctllm_x86.c +++ b/vp9/common/x86/vp9_idct_x86.c @@ -74,7 +74,7 @@ void vp9_dc_only_idct_add_sse2(int input_dc, uint8_t *pred_ptr, *(int *)dst_ptr = _mm_cvtsi128_si32(p1); } -void vp9_short_idct4x4llm_sse2(int16_t *input, int16_t *output, int pitch) { +void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) { const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64, diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index a3324731e..b53e419b5 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -44,6 +44,21 @@ int dec_debug = 0; #endif + +static int read_le16(const uint8_t *p) { + return (p[1] << 8) | p[0]; +} + +static int read_le32(const uint8_t *p) { + return (p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]; +} + +// len == 0 is not allowed +static int read_is_valid(const unsigned char *start, size_t len, + const unsigned char *end) { + return start + len > start && start + len <= end; +} + 
static int merge_index(int v, int n, int modulus) { int max1 = (n - 1 - modulus / 2) / modulus + 1; if (v < max1) v = v * modulus + modulus / 2; @@ -61,14 +76,13 @@ static int merge_index(int v, int n, int modulus) { static int inv_remap_prob(int v, int m) { const int n = 256; const int modulus = MODULUS_PARAM; - int i; + v = merge_index(v, n - 1, modulus); if ((m << 1) <= n) { - i = vp9_inv_recenter_nonneg(v + 1, m); + return vp9_inv_recenter_nonneg(v + 1, m); } else { - i = n - 1 - vp9_inv_recenter_nonneg(v + 1, n - 1 - m); + return n - 1 - vp9_inv_recenter_nonneg(v + 1, n - 1 - m); } - return i; } static vp9_prob read_prob_diff_update(vp9_reader *const bc, int oldp) { @@ -112,8 +126,8 @@ static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *mb) { int i; VP9_COMMON *const pc = &pbi->common; - int segment_id = mb->mode_info_context->mbmi.segment_id; - int qindex = get_qindex(mb, segment_id, pc->base_qindex); + const int segment_id = mb->mode_info_context->mbmi.segment_id; + const int qindex = get_qindex(mb, segment_id, pc->base_qindex); mb->q_index = qindex; for (i = 0; i < 16; i++) @@ -124,14 +138,14 @@ static void mb_init_dequantizer(VP9D_COMP *pbi, MACROBLOCKD *mb) { if (mb->lossless) { assert(qindex == 0); - mb->inv_txm4x4_1 = vp9_short_inv_walsh4x4_1_x8; - mb->inv_txm4x4 = vp9_short_inv_walsh4x4_x8; + mb->inv_txm4x4_1 = vp9_short_iwalsh4x4_1; + mb->inv_txm4x4 = vp9_short_iwalsh4x4; mb->itxm_add = vp9_dequant_idct_add_lossless_c; mb->itxm_add_y_block = vp9_dequant_idct_add_y_block_lossless_c; mb->itxm_add_uv_block = vp9_dequant_idct_add_uv_block_lossless_c; } else { - mb->inv_txm4x4_1 = vp9_short_idct4x4llm_1; - mb->inv_txm4x4 = vp9_short_idct4x4llm; + mb->inv_txm4x4_1 = vp9_short_idct4x4_1; + mb->inv_txm4x4 = vp9_short_idct4x4; mb->itxm_add = vp9_dequant_idct_add; mb->itxm_add_y_block = vp9_dequant_idct_add_y_block; mb->itxm_add_uv_block = vp9_dequant_idct_add_uv_block; @@ -287,12 +301,14 @@ static void decode_8x8(VP9D_COMP *pbi, MACROBLOCKD *xd, int 
ib = vp9_i8x8_block[i]; BLOCKD *b = &xd->block[ib]; int i8x8mode = b->bmi.as_mode.first; + b = &xd->block[16 + i]; - vp9_intra_uv4x4_predict(xd, &xd->block[16 + i], i8x8mode, b->predictor); + vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor); xd->itxm_add(b->qcoeff, b->dequant, b->predictor, *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[16 + i]); + b = &xd->block[20 + i]; - vp9_intra_uv4x4_predict(xd, &xd->block[20 + i], i8x8mode, b->predictor); + vp9_intra_uv4x4_predict(xd, b, i8x8mode, b->predictor); xd->itxm_add(b->qcoeff, b->dequant, b->predictor, *(b->base_dst) + b->dst, 8, b->dst_stride, xd->eobs[20 + i]); } @@ -361,7 +377,7 @@ static void decode_4x4(VP9D_COMP *pbi, MACROBLOCKD *xd, int b_mode = xd->mode_info_context->bmi[i].as_mode.first; #if CONFIG_NEWBINTRAMODES xd->mode_info_context->bmi[i].as_mode.context = b->bmi.as_mode.context = - vp9_find_bpred_context(b); + vp9_find_bpred_context(xd, b); #endif if (!xd->mode_info_context->mbmi.mb_skip_coeff) eobtotal += vp9_decode_coefs_4x4(pbi, xd, bc, PLANE_TYPE_Y_WITH_DC, i); @@ -798,9 +814,8 @@ static void decode_macroblock(VP9D_COMP *pbi, MACROBLOCKD *xd, if (xd->mode_info_context->mbmi.mb_skip_coeff) { vp9_reset_mb_tokens_context(xd); } else if (!bool_error(bc)) { - if (mode != B_PRED) { + if (mode != B_PRED) eobtotal = vp9_decode_mb_tokens(pbi, xd, bc); - } } //mode = xd->mode_info_context->mbmi.mode; @@ -923,10 +938,9 @@ static void set_offsets(VP9D_COMP *pbi, int block_size, xd->above_context = cm->above_context + mb_col; xd->left_context = cm->left_context + (mb_row & 3); - /* Distance of Mb to the various image edges. - * These are specified to 8th pel as they are always compared to - * values that are in 1/8th pel units - */ + // Distance of Mb to the various image edges. 
+ // These are specified to 8th pel as they are always compared to + // values that are in 1/8th pel units block_size >>= 4; // in mb units set_mb_row(cm, xd, mb_row, block_size); @@ -937,37 +951,31 @@ static void set_offsets(VP9D_COMP *pbi, int block_size, xd->dst.v_buffer = cm->yv12_fb[dst_fb_idx].v_buffer + recon_uvoffset; } -static void set_refs(VP9D_COMP *pbi, int block_size, - int mb_row, int mb_col) { +static void set_refs(VP9D_COMP *pbi, int block_size, int mb_row, int mb_col) { VP9_COMMON *const cm = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - MODE_INFO *mi = xd->mode_info_context; - MB_MODE_INFO *const mbmi = &mi->mbmi; + MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi; if (mbmi->ref_frame > INTRA_FRAME) { - int ref_fb_idx; - - /* Select the appropriate reference frame for this MB */ - ref_fb_idx = cm->active_ref_idx[mbmi->ref_frame - 1]; + // Select the appropriate reference frame for this MB + int ref_fb_idx = cm->active_ref_idx[mbmi->ref_frame - 1]; xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame - 1]; xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame - 1]; setup_pred_block(&xd->pre, &cm->yv12_fb[ref_fb_idx], mb_row, mb_col, &xd->scale_factor[0], &xd->scale_factor_uv[0]); - /* propagate errors from reference frames */ + // propagate errors from reference frames xd->corrupted |= cm->yv12_fb[ref_fb_idx].corrupted; if (mbmi->second_ref_frame > INTRA_FRAME) { - int second_ref_fb_idx; - - /* Select the appropriate reference frame for this MB */ - second_ref_fb_idx = cm->active_ref_idx[mbmi->second_ref_frame - 1]; + // Select the appropriate reference frame for this MB + int second_ref_fb_idx = cm->active_ref_idx[mbmi->second_ref_frame - 1]; setup_pred_block(&xd->second_pre, &cm->yv12_fb[second_ref_fb_idx], mb_row, mb_col, &xd->scale_factor[1], &xd->scale_factor_uv[1]); - /* propagate errors from reference frames */ + // propagate errors from reference frames xd->corrupted |= cm->yv12_fb[second_ref_fb_idx].corrupted; } } 
@@ -1054,15 +1062,6 @@ static void decode_sb_row(VP9D_COMP *pbi, VP9_COMMON *pc, } } -static unsigned int read_partition_size(const unsigned char *cx_size) { - return cx_size[0] + (cx_size[1] << 8) + (cx_size[2] << 16); -} - -static int read_is_valid(const unsigned char *start, size_t len, - const unsigned char *end) { - return start + len > start && start + len <= end; -} - static void setup_token_decoder(VP9D_COMP *pbi, const unsigned char *cx_data, @@ -1090,7 +1089,7 @@ static void setup_token_decoder(VP9D_COMP *pbi, static void init_frame(VP9D_COMP *pbi) { VP9_COMMON *const pc = &pbi->common; - MACROBLOCKD *const xd = &pbi->mb; + MACROBLOCKD *const xd = &pbi->mb; if (pc->frame_type == KEY_FRAME) { vp9_setup_past_independence(pc, xd); @@ -1113,7 +1112,6 @@ static void init_frame(VP9D_COMP *pbi) { xd->mode_info_context->mbmi.mode = DC_PRED; xd->mode_info_stride = pc->mode_info_stride; xd->corrupted = 0; - xd->fullpixel_mask = pc->full_pixel ? 0xfffffff8 : 0xffffffff; } @@ -1241,8 +1239,8 @@ static void update_frame_size(VP9D_COMP *pbi) { VP9_COMMON *cm = &pbi->common; /* our internal buffers are always multiples of 16 */ - int width = (cm->Width + 15) & ~15; - int height = (cm->Height + 15) & ~15; + const int width = (cm->Width + 15) & ~15; + const int height = (cm->Height + 15) & ~15; cm->mb_rows = height >> 4; cm->mb_cols = width >> 4; @@ -1261,8 +1259,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { BOOL_DECODER header_bc, residual_bc; VP9_COMMON *const pc = &pbi->common; MACROBLOCKD *const xd = &pbi->mb; - const unsigned char *data = (const unsigned char *)pbi->Source; - const unsigned char *data_end = data + pbi->source_sz; + const uint8_t *data = (const uint8_t *)pbi->Source; + const uint8_t *data_end = data + pbi->source_sz; ptrdiff_t first_partition_length_in_bytes = 0; int mb_row; @@ -1284,8 +1282,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { first_partition_length_in_bytes = (data[0] | 
(data[1] << 8) | (data[2] << 16)) >> 5; - if ((data + first_partition_length_in_bytes > data_end - || data + first_partition_length_in_bytes < data)) + if (!read_is_valid(data, first_partition_length_in_bytes, data_end)) vpx_internal_error(&pc->error, VPX_CODEC_CORRUPT_FRAME, "Truncated packet or corrupt partition 0 length"); @@ -1314,8 +1311,8 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { * size. */ if (data + 5 < data_end) { - pc->Width = (data[0] | (data[1] << 8)); - pc->Height = (data[2] | (data[3] << 8)); + pc->Width = read_le16(data); + pc->Height = read_le16(data + 2); pc->horiz_scale = data[4] >> 4; pc->vert_scale = data[4] & 0x0F; @@ -1467,7 +1464,6 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { pc->ref_pred_probs[0] = 120; pc->ref_pred_probs[1] = 80; pc->ref_pred_probs[2] = 40; - } else { for (i = 0; i < PREDICTION_PROBS; i++) { if (vp9_read_bit(&header_bc)) @@ -1481,10 +1477,11 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { if (xd->lossless) { pc->txfm_mode = ONLY_4X4; } else { - /* Read the loop filter level and type */ + // Read the loop filter level and type pc->txfm_mode = vp9_read_literal(&header_bc, 2); if (pc->txfm_mode == 3) pc->txfm_mode += vp9_read_bit(&header_bc); + if (pc->txfm_mode == TX_MODE_SELECT) { pc->prob_tx[0] = vp9_read_literal(&header_bc, 8); pc->prob_tx[1] = vp9_read_literal(&header_bc, 8); @@ -1511,7 +1508,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { xd->ref_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6); if (vp9_read_bit(&header_bc)) /* Apply sign */ - xd->ref_lf_deltas[i] = xd->ref_lf_deltas[i] * -1; + xd->ref_lf_deltas[i] = -xd->ref_lf_deltas[i]; } } @@ -1522,7 +1519,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { xd->mode_lf_deltas[i] = (signed char)vp9_read_literal(&header_bc, 6); if (vp9_read_bit(&header_bc)) /* Apply sign */ - xd->mode_lf_deltas[i] = 
xd->mode_lf_deltas[i] * -1; + xd->mode_lf_deltas[i] = -xd->mode_lf_deltas[i]; } } } @@ -1570,14 +1567,13 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { pc->ref_frame_sign_bias[GOLDEN_FRAME] = vp9_read_bit(&header_bc); pc->ref_frame_sign_bias[ALTREF_FRAME] = vp9_read_bit(&header_bc); - /* Is high precision mv allowed */ + // Is high precision mv allowed xd->allow_high_precision_mv = (unsigned char)vp9_read_bit(&header_bc); + // Read the type of subpel filter to use - if (vp9_read_bit(&header_bc)) { - pc->mcomp_filter_type = SWITCHABLE; - } else { - pc->mcomp_filter_type = vp9_read_literal(&header_bc, 2); - } + pc->mcomp_filter_type = vp9_read_bit(&header_bc) ? SWITCHABLE : + vp9_read_literal(&header_bc, 2); + #if CONFIG_COMP_INTERINTRA_PRED pc->use_interintra = vp9_read_bit(&header_bc); #endif @@ -1731,7 +1727,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { /* tile info */ { - const unsigned char *data_ptr = data + first_partition_length_in_bytes; + const uint8_t *data_ptr = data + first_partition_length_in_bytes; int tile_row, tile_col, delta_log2_tiles; vp9_get_tile_n_bits(pc, &pc->log2_tile_columns, &delta_log2_tiles); @@ -1753,26 +1749,20 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { if (pbi->oxcf.inv_tile_order) { const int n_cols = pc->tile_columns; - const unsigned char *data_ptr2[4][1 << 6]; + const uint8_t *data_ptr2[4][1 << 6]; BOOL_DECODER UNINITIALIZED_IS_SAFE(bc_bak); // pre-initialize the offsets, we're going to read in inverse order data_ptr2[0][0] = data_ptr; for (tile_row = 0; tile_row < pc->tile_rows; tile_row++) { if (tile_row) { - int size = data_ptr2[tile_row - 1][n_cols - 1][0] | - (data_ptr2[tile_row - 1][n_cols - 1][1] << 8) | - (data_ptr2[tile_row - 1][n_cols - 1][2] << 16) | - (data_ptr2[tile_row - 1][n_cols - 1][3] << 24); + const int size = read_le32(data_ptr2[tile_row - 1][n_cols - 1]); data_ptr2[tile_row - 1][n_cols - 1] += 4; data_ptr2[tile_row][0] 
= data_ptr2[tile_row - 1][n_cols - 1] + size; } for (tile_col = 1; tile_col < n_cols; tile_col++) { - int size = data_ptr2[tile_row][tile_col - 1][0] | - (data_ptr2[tile_row][tile_col - 1][1] << 8) | - (data_ptr2[tile_row][tile_col - 1][2] << 16) | - (data_ptr2[tile_row][tile_col - 1][3] << 24); + const int size = read_le32(data_ptr2[tile_row][tile_col - 1]); data_ptr2[tile_row][tile_col - 1] += 4; data_ptr2[tile_row][tile_col] = data_ptr2[tile_row][tile_col - 1] + size; @@ -1813,10 +1803,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { } if (tile_col < pc->tile_columns - 1 || tile_row < pc->tile_rows - 1) { - int size = data_ptr[0] | - (data_ptr[1] << 8) | - (data_ptr[2] << 16) | - (data_ptr[3] << 24); + int size = read_le32(data_ptr); data_ptr += 4 + size; } } @@ -1829,31 +1816,29 @@ int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) { pc->last_width = pc->Width; pc->last_height = pc->Height; - /* Collect information about decoder corruption. */ - /* 1. Check first boolean decoder for errors. */ - pc->yv12_fb[pc->new_fb_idx].corrupted = bool_error(&header_bc); - /* 2. Check the macroblock information */ - pc->yv12_fb[pc->new_fb_idx].corrupted |= corrupt_tokens; + // Collect information about decoder corruption. + // 1. Check first boolean decoder for errors. + // 2. 
Check the macroblock information + pc->yv12_fb[pc->new_fb_idx].corrupted = bool_error(&header_bc) | + corrupt_tokens; if (!pbi->decoded_key_frame) { - if (pc->frame_type == KEY_FRAME && - !pc->yv12_fb[pc->new_fb_idx].corrupted) + if (pc->frame_type == KEY_FRAME && !pc->yv12_fb[pc->new_fb_idx].corrupted) pbi->decoded_key_frame = 1; else vpx_internal_error(&pbi->common.error, VPX_CODEC_CORRUPT_FRAME, "A stream must start with a complete key frame"); } - if (!pc->error_resilient_mode && - !pc->frame_parallel_decoding_mode) { + if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) { vp9_adapt_coef_probs(pc); #if CONFIG_CODE_NONZEROCOUNT vp9_adapt_nzc_probs(pc); #endif } + if (pc->frame_type != KEY_FRAME) { - if (!pc->error_resilient_mode && - !pc->frame_parallel_decoding_mode) { + if (!pc->error_resilient_mode && !pc->frame_parallel_decoding_mode) { vp9_adapt_mode_probs(pc); vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv); vp9_adapt_mode_context(&pbi->common); diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c index 92b78ed19..cb4601a15 100644 --- a/vp9/decoder/vp9_dequantize.c +++ b/vp9/decoder/vp9_dequantize.c @@ -126,7 +126,7 @@ void vp9_dequant_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred, input[i] *= dq[i]; // the idct halves ( >> 1) the pitch - vp9_short_idct4x4llm(input, output, 4 << 1); + vp9_short_idct4x4(input, output, 4 << 1); vpx_memset(input, 0, 32); @@ -148,7 +148,7 @@ void vp9_dequant_dc_idct_add_c(int16_t *input, const int16_t *dq, uint8_t *pred, input[i] *= dq[i]; // the idct halves ( >> 1) the pitch - vp9_short_idct4x4llm(input, output, 4 << 1); + vp9_short_idct4x4(input, output, 4 << 1); vpx_memset(input, 0, 32); vp9_add_residual_4x4(output, pred, pitch, dest, stride); } @@ -163,7 +163,7 @@ void vp9_dequant_idct_add_lossless_c(int16_t *input, const int16_t *dq, for (i = 0; i < 16; i++) input[i] *= dq[i]; - vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1); + vp9_short_iwalsh4x4_c(input, output, 
4 << 1); vpx_memset(input, 0, 32); @@ -186,7 +186,7 @@ void vp9_dequant_dc_idct_add_lossless_c(int16_t *input, const int16_t *dq, for (i = 1; i < 16; i++) input[i] *= dq[i]; - vp9_short_inv_walsh4x4_x8_c(input, output, 4 << 1); + vp9_short_iwalsh4x4_c(input, output, 4 << 1); vpx_memset(input, 0, 32); vp9_add_residual_4x4(output, pred, pitch, dest, stride); } diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index e4ac2ce36..6365ed9a2 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c @@ -269,6 +269,185 @@ void vp9_short_fdct8x8_c(int16_t *input, int16_t *final_output, int pitch) { } } +void vp9_short_fdct16x16_c(int16_t *input, int16_t *output, int pitch) { + // The 2D transform is done with two passes which are actually pretty + // similar. In the first one, we transform the columns and transpose + // the results. In the second one, we transform the rows. To achieve that, + // as the first pass results are transposed, we tranpose the columns (that + // is the transposed rows) and transpose the results (so that it goes back + // in normal/row positions). + const int stride = pitch >> 1; + int pass; + // We need an intermediate buffer between passes. + int16_t intermediate[256]; + int16_t *in = input; + int16_t *out = intermediate; + // Do the two transform/transpose passes + for (pass = 0; pass < 2; ++pass) { + /*canbe16*/ int step1[8]; + /*canbe16*/ int step2[8]; + /*canbe16*/ int step3[8]; + /*canbe16*/ int input[8]; + /*needs32*/ int temp1, temp2; + int i; + for (i = 0; i < 16; i++) { + if (0 == pass) { + // Calculate input for the first 8 results. 
+ input[0] = (in[0 * stride] + in[15 * stride]) << 2; + input[1] = (in[1 * stride] + in[14 * stride]) << 2; + input[2] = (in[2 * stride] + in[13 * stride]) << 2; + input[3] = (in[3 * stride] + in[12 * stride]) << 2; + input[4] = (in[4 * stride] + in[11 * stride]) << 2; + input[5] = (in[5 * stride] + in[10 * stride]) << 2; + input[6] = (in[6 * stride] + in[ 9 * stride]) << 2; + input[7] = (in[7 * stride] + in[ 8 * stride]) << 2; + // Calculate input for the next 8 results. + step1[0] = (in[7 * stride] - in[ 8 * stride]) << 2; + step1[1] = (in[6 * stride] - in[ 9 * stride]) << 2; + step1[2] = (in[5 * stride] - in[10 * stride]) << 2; + step1[3] = (in[4 * stride] - in[11 * stride]) << 2; + step1[4] = (in[3 * stride] - in[12 * stride]) << 2; + step1[5] = (in[2 * stride] - in[13 * stride]) << 2; + step1[6] = (in[1 * stride] - in[14 * stride]) << 2; + step1[7] = (in[0 * stride] - in[15 * stride]) << 2; + } else { + // Calculate input for the first 8 results. + input[0] = ((in[0 * 16] + 1) >> 2) + ((in[15 * 16] + 1) >> 2); + input[1] = ((in[1 * 16] + 1) >> 2) + ((in[14 * 16] + 1) >> 2); + input[2] = ((in[2 * 16] + 1) >> 2) + ((in[13 * 16] + 1) >> 2); + input[3] = ((in[3 * 16] + 1) >> 2) + ((in[12 * 16] + 1) >> 2); + input[4] = ((in[4 * 16] + 1) >> 2) + ((in[11 * 16] + 1) >> 2); + input[5] = ((in[5 * 16] + 1) >> 2) + ((in[10 * 16] + 1) >> 2); + input[6] = ((in[6 * 16] + 1) >> 2) + ((in[ 9 * 16] + 1) >> 2); + input[7] = ((in[7 * 16] + 1) >> 2) + ((in[ 8 * 16] + 1) >> 2); + // Calculate input for the next 8 results. 
+ step1[0] = ((in[7 * 16] + 1) >> 2) - ((in[ 8 * 16] + 1) >> 2); + step1[1] = ((in[6 * 16] + 1) >> 2) - ((in[ 9 * 16] + 1) >> 2); + step1[2] = ((in[5 * 16] + 1) >> 2) - ((in[10 * 16] + 1) >> 2); + step1[3] = ((in[4 * 16] + 1) >> 2) - ((in[11 * 16] + 1) >> 2); + step1[4] = ((in[3 * 16] + 1) >> 2) - ((in[12 * 16] + 1) >> 2); + step1[5] = ((in[2 * 16] + 1) >> 2) - ((in[13 * 16] + 1) >> 2); + step1[6] = ((in[1 * 16] + 1) >> 2) - ((in[14 * 16] + 1) >> 2); + step1[7] = ((in[0 * 16] + 1) >> 2) - ((in[15 * 16] + 1) >> 2); + } + // Work on the first eight values; fdct8_1d(input, even_results); + { + /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; + /*needs32*/ int t0, t1, t2, t3; + /*canbe16*/ int x0, x1, x2, x3; + + // stage 1 + s0 = input[0] + input[7]; + s1 = input[1] + input[6]; + s2 = input[2] + input[5]; + s3 = input[3] + input[4]; + s4 = input[3] - input[4]; + s5 = input[2] - input[5]; + s6 = input[1] - input[6]; + s7 = input[0] - input[7]; + + // fdct4_1d(step, step); + x0 = s0 + s3; + x1 = s1 + s2; + x2 = s1 - s2; + x3 = s0 - s3; + t0 = (x0 + x1) * cospi_16_64; + t1 = (x0 - x1) * cospi_16_64; + t2 = x3 * cospi_8_64 + x2 * cospi_24_64; + t3 = x3 * cospi_24_64 - x2 * cospi_8_64; + out[0] = dct_const_round_shift(t0); + out[4] = dct_const_round_shift(t2); + out[8] = dct_const_round_shift(t1); + out[12] = dct_const_round_shift(t3); + + // Stage 2 + t0 = (s6 - s5) * cospi_16_64; + t1 = (s6 + s5) * cospi_16_64; + t2 = dct_const_round_shift(t0); + t3 = dct_const_round_shift(t1); + + // Stage 3 + x0 = s4 + t2; + x1 = s4 - t2; + x2 = s7 - t3; + x3 = s7 + t3; + + // Stage 4 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + out[2] = dct_const_round_shift(t0); + out[6] = dct_const_round_shift(t2); + out[10] = dct_const_round_shift(t1); + out[14] = dct_const_round_shift(t3); + } + // Work on the next eight values; step1 -> odd_results + { + // step 2 + 
temp1 = (step1[5] - step1[2]) * cospi_16_64; + temp2 = (step1[4] - step1[3]) * cospi_16_64; + step2[2] = dct_const_round_shift(temp1); + step2[3] = dct_const_round_shift(temp2); + temp1 = (step1[4] + step1[3]) * cospi_16_64; + temp2 = (step1[5] + step1[2]) * cospi_16_64; + step2[4] = dct_const_round_shift(temp1); + step2[5] = dct_const_round_shift(temp2); + // step 3 + step3[0] = step1[0] + step2[3]; + step3[1] = step1[1] + step2[2]; + step3[2] = step1[1] - step2[2]; + step3[3] = step1[0] - step2[3]; + step3[4] = step1[7] - step2[4]; + step3[5] = step1[6] - step2[5]; + step3[6] = step1[6] + step2[5]; + step3[7] = step1[7] + step2[4]; + // step 4 + temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; + temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64; + step2[1] = dct_const_round_shift(temp1); + step2[2] = dct_const_round_shift(temp2); + temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64; + temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; + step2[5] = dct_const_round_shift(temp1); + step2[6] = dct_const_round_shift(temp2); + // step 5 + step1[0] = step3[0] + step2[1]; + step1[1] = step3[0] - step2[1]; + step1[2] = step3[3] - step2[2]; + step1[3] = step3[3] + step2[2]; + step1[4] = step3[4] + step2[5]; + step1[5] = step3[4] - step2[5]; + step1[6] = step3[7] - step2[6]; + step1[7] = step3[7] + step2[6]; + // step 6 + temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; + temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; + out[1] = dct_const_round_shift(temp1); + out[9] = dct_const_round_shift(temp2); + temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; + temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; + out[5] = dct_const_round_shift(temp1); + out[13] = dct_const_round_shift(temp2); + temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; + temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; + out[3] = dct_const_round_shift(temp1); + out[11] = dct_const_round_shift(temp2); + temp1 = step1[1] * -cospi_18_64 + 
step1[6] * cospi_14_64; + temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; + out[7] = dct_const_round_shift(temp1); + out[15] = dct_const_round_shift(temp2); + } + // Do next column (which is a transposed row in second/horizontal pass) + in++; + out += 16; + } + // Setup in/out for next pass. + in = intermediate; + out = output; + } +} + static void fadst8_1d(int16_t *input, int16_t *output) { int s0, s1, s2, s3, s4, s5, s6, s7; @@ -374,7 +553,7 @@ void vp9_short_fht8x8_c(int16_t *input, int16_t *output, } } -void vp9_short_walsh4x4_x8_c(short *input, short *output, int pitch) { +void vp9_short_walsh4x4_c(short *input, short *output, int pitch) { int i; int a1, b1, c1, d1; short *ip = input; @@ -414,139 +593,152 @@ void vp9_short_walsh4x4_x8_c(short *input, short *output, int pitch) { } } -void vp9_short_walsh8x4_x8_c(short *input, short *output, int pitch) { - vp9_short_walsh4x4_x8_c(input, output, pitch); - vp9_short_walsh4x4_x8_c(input + 4, output + 16, pitch); +void vp9_short_walsh8x4_c(short *input, short *output, int pitch) { + vp9_short_walsh4x4_c(input, output, pitch); + vp9_short_walsh4x4_c(input + 4, output + 16, pitch); } // Rewrote to use same algorithm as others. 
-static void fdct16_1d(int16_t input[16], int16_t output[16]) { - int16_t step[16]; - int temp1, temp2; +static void fdct16_1d(int16_t in[16], int16_t out[16]) { + /*canbe16*/ int step1[8]; + /*canbe16*/ int step2[8]; + /*canbe16*/ int step3[8]; + /*canbe16*/ int input[8]; + /*needs32*/ int temp1, temp2; // step 1 - step[ 0] = input[0] + input[15]; - step[ 1] = input[1] + input[14]; - step[ 2] = input[2] + input[13]; - step[ 3] = input[3] + input[12]; - step[ 4] = input[4] + input[11]; - step[ 5] = input[5] + input[10]; - step[ 6] = input[6] + input[ 9]; - step[ 7] = input[7] + input[ 8]; - step[ 8] = input[7] - input[ 8]; - step[ 9] = input[6] - input[ 9]; - step[10] = input[5] - input[10]; - step[11] = input[4] - input[11]; - step[12] = input[3] - input[12]; - step[13] = input[2] - input[13]; - step[14] = input[1] - input[14]; - step[15] = input[0] - input[15]; - - fdct8_1d(step, step); + input[0] = in[0] + in[15]; + input[1] = in[1] + in[14]; + input[2] = in[2] + in[13]; + input[3] = in[3] + in[12]; + input[4] = in[4] + in[11]; + input[5] = in[5] + in[10]; + input[6] = in[6] + in[ 9]; + input[7] = in[7] + in[ 8]; + + step1[0] = in[7] - in[ 8]; + step1[1] = in[6] - in[ 9]; + step1[2] = in[5] - in[10]; + step1[3] = in[4] - in[11]; + step1[4] = in[3] - in[12]; + step1[5] = in[2] - in[13]; + step1[6] = in[1] - in[14]; + step1[7] = in[0] - in[15]; + + // fdct8_1d(step, step); + { + /*canbe16*/ int s0, s1, s2, s3, s4, s5, s6, s7; + /*needs32*/ int t0, t1, t2, t3; + /*canbe16*/ int x0, x1, x2, x3; + + // stage 1 + s0 = input[0] + input[7]; + s1 = input[1] + input[6]; + s2 = input[2] + input[5]; + s3 = input[3] + input[4]; + s4 = input[3] - input[4]; + s5 = input[2] - input[5]; + s6 = input[1] - input[6]; + s7 = input[0] - input[7]; + + // fdct4_1d(step, step); + x0 = s0 + s3; + x1 = s1 + s2; + x2 = s1 - s2; + x3 = s0 - s3; + t0 = (x0 + x1) * cospi_16_64; + t1 = (x0 - x1) * cospi_16_64; + t2 = x3 * cospi_8_64 + x2 * cospi_24_64; + t3 = x3 * cospi_24_64 - x2 * 
cospi_8_64; + out[0] = dct_const_round_shift(t0); + out[4] = dct_const_round_shift(t2); + out[8] = dct_const_round_shift(t1); + out[12] = dct_const_round_shift(t3); + + // Stage 2 + t0 = (s6 - s5) * cospi_16_64; + t1 = (s6 + s5) * cospi_16_64; + t2 = dct_const_round_shift(t0); + t3 = dct_const_round_shift(t1); + + // Stage 3 + x0 = s4 + t2; + x1 = s4 - t2; + x2 = s7 - t3; + x3 = s7 + t3; + + // Stage 4 + t0 = x0 * cospi_28_64 + x3 * cospi_4_64; + t1 = x1 * cospi_12_64 + x2 * cospi_20_64; + t2 = x2 * cospi_12_64 + x1 * -cospi_20_64; + t3 = x3 * cospi_28_64 + x0 * -cospi_4_64; + out[2] = dct_const_round_shift(t0); + out[6] = dct_const_round_shift(t2); + out[10] = dct_const_round_shift(t1); + out[14] = dct_const_round_shift(t3); + } // step 2 - output[8] = step[8]; - output[9] = step[9]; - temp1 = (-step[10] + step[13]) * cospi_16_64; - temp2 = (-step[11] + step[12]) * cospi_16_64; - output[10] = dct_const_round_shift(temp1); - output[11] = dct_const_round_shift(temp2); - temp1 = (step[11] + step[12]) * cospi_16_64; - temp2 = (step[10] + step[13]) * cospi_16_64; - output[12] = dct_const_round_shift(temp1); - output[13] = dct_const_round_shift(temp2); - output[14] = step[14]; - output[15] = step[15]; + temp1 = (step1[5] - step1[2]) * cospi_16_64; + temp2 = (step1[4] - step1[3]) * cospi_16_64; + step2[2] = dct_const_round_shift(temp1); + step2[3] = dct_const_round_shift(temp2); + temp1 = (step1[4] + step1[3]) * cospi_16_64; + temp2 = (step1[5] + step1[2]) * cospi_16_64; + step2[4] = dct_const_round_shift(temp1); + step2[5] = dct_const_round_shift(temp2); // step 3 - step[ 8] = output[8] + output[11]; - step[ 9] = output[9] + output[10]; - step[ 10] = output[9] - output[10]; - step[ 11] = output[8] - output[11]; - step[ 12] = -output[12] + output[15]; - step[ 13] = -output[13] + output[14]; - step[ 14] = output[13] + output[14]; - step[ 15] = output[12] + output[15]; + step3[0] = step1[0] + step2[3]; + step3[1] = step1[1] + step2[2]; + step3[2] = step1[1] - step2[2]; + 
step3[3] = step1[0] - step2[3]; + step3[4] = step1[7] - step2[4]; + step3[5] = step1[6] - step2[5]; + step3[6] = step1[6] + step2[5]; + step3[7] = step1[7] + step2[4]; // step 4 - output[8] = step[8]; - temp1 = -step[9] * cospi_8_64 + step[14] * cospi_24_64; - temp2 = -step[10] * cospi_24_64 - step[13] * cospi_8_64; - output[9] = dct_const_round_shift(temp1); - output[10] = dct_const_round_shift(temp2); - output[11] = step[11]; - output[12] = step[12]; - temp1 = -step[10] * cospi_8_64 + step[13] * cospi_24_64; - temp2 = step[9] * cospi_24_64 + step[14] * cospi_8_64; - output[13] = dct_const_round_shift(temp1); - output[14] = dct_const_round_shift(temp2); - output[15] = step[15]; + temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64; + temp2 = step3[2] * -cospi_24_64 - step3[5] * cospi_8_64; + step2[1] = dct_const_round_shift(temp1); + step2[2] = dct_const_round_shift(temp2); + temp1 = step3[2] * -cospi_8_64 + step3[5] * cospi_24_64; + temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64; + step2[5] = dct_const_round_shift(temp1); + step2[6] = dct_const_round_shift(temp2); // step 5 - step[8] = output[8] + output[9]; - step[9] = output[8] - output[9]; - step[10] = -output[10] + output[11]; - step[11] = output[10] + output[11]; - step[12] = output[12] + output[13]; - step[13] = output[12] - output[13]; - step[14] = -output[14] + output[15]; - step[15] = output[14] + output[15]; + step1[0] = step3[0] + step2[1]; + step1[1] = step3[0] - step2[1]; + step1[2] = step3[3] - step2[2]; + step1[3] = step3[3] + step2[2]; + step1[4] = step3[4] + step2[5]; + step1[5] = step3[4] - step2[5]; + step1[6] = step3[7] - step2[6]; + step1[7] = step3[7] + step2[6]; // step 6 - output[0] = step[0]; - output[8] = step[4]; - output[4] = step[2]; - output[12] = step[6]; - output[2] = step[1]; - output[10] = step[5]; - output[6] = step[3]; - output[14] = step[7]; - - temp1 = step[8] * cospi_30_64 + step[15] * cospi_2_64; - temp2 = step[9] * cospi_14_64 + step[14] * cospi_18_64; - 
output[1] = dct_const_round_shift(temp1); - output[9] = dct_const_round_shift(temp2); - - temp1 = step[10] * cospi_22_64 + step[13] * cospi_10_64; - temp2 = step[11] * cospi_6_64 + step[12] * cospi_26_64; - output[5] = dct_const_round_shift(temp1); - output[13] = dct_const_round_shift(temp2); - - temp1 = -step[11] * cospi_26_64 + step[12] * cospi_6_64; - temp2 = -step[10] * cospi_10_64 + step[13] * cospi_22_64; - output[3] = dct_const_round_shift(temp1); - output[11] = dct_const_round_shift(temp2); - - temp1 = -step[9] * cospi_18_64 + step[14] * cospi_14_64; - temp2 = -step[8] * cospi_2_64 + step[15] * cospi_30_64; - output[7] = dct_const_round_shift(temp1); - output[15] = dct_const_round_shift(temp2); -} - -void vp9_short_fdct16x16_c(int16_t *input, int16_t *out, int pitch) { - int shortpitch = pitch >> 1; - int i, j; - int16_t output[256]; - int16_t temp_in[16], temp_out[16]; - - // First transform columns - for (i = 0; i < 16; i++) { - for (j = 0; j < 16; j++) - temp_in[j] = input[j * shortpitch + i] << 2; - fdct16_1d(temp_in, temp_out); - for (j = 0; j < 16; j++) - output[j * 16 + i] = (temp_out[j] + 1) >> 2; - } - - // Then transform rows - for (i = 0; i < 16; ++i) { - for (j = 0; j < 16; ++j) - temp_in[j] = output[j + i * 16]; - fdct16_1d(temp_in, temp_out); - for (j = 0; j < 16; ++j) - out[j + i * 16] = temp_out[j]; - } + temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64; + temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64; + out[1] = dct_const_round_shift(temp1); + out[9] = dct_const_round_shift(temp2); + + temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64; + temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64; + out[5] = dct_const_round_shift(temp1); + out[13] = dct_const_round_shift(temp2); + + temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64; + temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64; + out[3] = dct_const_round_shift(temp1); + out[11] = dct_const_round_shift(temp2); + + temp1 = step1[1] * -cospi_18_64 + 
step1[6] * cospi_14_64; + temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64; + out[7] = dct_const_round_shift(temp1); + out[15] = dct_const_round_shift(temp2); } void fadst16_1d(int16_t *input, int16_t *output) { diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 87d456dd4..428e585e1 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -594,9 +594,6 @@ static void update_state(VP9_COMP *cpi, [vp9_switchable_interp_map[mbmi->interp_filter]]; } - cpi->prediction_error += ctx->distortion; - cpi->intra_error += ctx->intra_error; - cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff; cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff; cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff; @@ -1217,10 +1214,10 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) { static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { if (lossless) { - cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4_x8; - cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4_x8; - cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_inv_walsh4x4_1_x8; - cpi->mb.e_mbd.inv_txm4x4 = vp9_short_inv_walsh4x4_x8; + cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4; + cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4; + cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_iwalsh4x4_1; + cpi->mb.e_mbd.inv_txm4x4 = vp9_short_iwalsh4x4; cpi->mb.optimize = 0; cpi->common.filter_level = 0; cpi->zbin_mode_boost_enabled = FALSE; @@ -1228,8 +1225,8 @@ static void switch_lossless_mode(VP9_COMP *cpi, int lossless) { } else { cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4; cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; - cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4llm_1; - cpi->mb.e_mbd.inv_txm4x4 = vp9_short_idct4x4llm; + cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4_1; + cpi->mb.e_mbd.inv_txm4x4 = vp9_short_idct4x4; } } @@ -1265,8 +1262,6 @@ static void encode_frame_internal(VP9_COMP *cpi) { // Reset frame count of inter 0,0 motion vector usage. 
cpi->inter_zz_count = 0; - cpi->prediction_error = 0; - cpi->intra_error = 0; cpi->skip_true_count[0] = cpi->skip_true_count[1] = cpi->skip_true_count[2] = 0; cpi->skip_false_count[0] = cpi->skip_false_count[1] = cpi->skip_false_count[2] = 0; @@ -1292,8 +1287,9 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_zero(cpi->mb_mv_ref_count); #endif - - // force lossless mode when Q0 is selected + // force lossless mode + if (cm->base_qindex <= 4) + cm->base_qindex = 0; cpi->mb.e_mbd.lossless = (cm->base_qindex == 0 && cm->y1dc_delta_q == 0 && cm->uvdc_delta_q == 0 && diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c index 3c98d4aa6..9e5bcea16 100644 --- a/vp9/encoder/vp9_encodeintra.c +++ b/vp9/encoder/vp9_encodeintra.c @@ -44,7 +44,7 @@ void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) { TX_TYPE tx_type; #if CONFIG_NEWBINTRAMODES - b->bmi.as_mode.context = vp9_find_bpred_context(b); + b->bmi.as_mode.context = vp9_find_bpred_context(&x->e_mbd, b); #endif vp9_intra4x4_predict(&x->e_mbd, b, b->bmi.as_mode.first, b->predictor); diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index 5c2067b00..d646fe222 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -859,6 +859,8 @@ static double calc_correction_factor(double err_per_mb, power_term = (power_term > pt_high) ? pt_high : power_term; // Calculate correction factor + if (power_term < 1.0) + assert(error_term >= 0.0); correction_factor = pow(error_term, power_term); // Clip range @@ -920,15 +922,19 @@ static int estimate_max_q(VP9_COMP *cpi, // Look at the drop in prediction quality between the last frame // and the GF buffer (which contained an older frame). 
- sr_err_diff = - (fpstats->sr_coded_error - fpstats->coded_error) / - (fpstats->count * cpi->common.MBs); - sr_correction = (sr_err_diff / 32.0); - sr_correction = pow(sr_correction, 0.25); - if (sr_correction < 0.75) + if (fpstats->sr_coded_error > fpstats->coded_error) { + sr_err_diff = + (fpstats->sr_coded_error - fpstats->coded_error) / + (fpstats->count * cpi->common.MBs); + sr_correction = (sr_err_diff / 32.0); + sr_correction = pow(sr_correction, 0.25); + if (sr_correction < 0.75) + sr_correction = 0.75; + else if (sr_correction > 1.25) + sr_correction = 1.25; + } else { sr_correction = 0.75; - else if (sr_correction > 1.25) - sr_correction = 1.25; + } // Calculate a corrective factor based on a rolling ratio of bits spent // vs target bits @@ -1031,15 +1037,19 @@ static int estimate_cq(VP9_COMP *cpi, // Look at the drop in prediction quality between the last frame // and the GF buffer (which contained an older frame). - sr_err_diff = - (fpstats->sr_coded_error - fpstats->coded_error) / - (fpstats->count * cpi->common.MBs); - sr_correction = (sr_err_diff / 32.0); - sr_correction = pow(sr_correction, 0.25); - if (sr_correction < 0.75) + if (fpstats->sr_coded_error > fpstats->coded_error) { + sr_err_diff = + (fpstats->sr_coded_error - fpstats->coded_error) / + (fpstats->count * cpi->common.MBs); + sr_correction = (sr_err_diff / 32.0); + sr_correction = pow(sr_correction, 0.25); + if (sr_correction < 0.75) + sr_correction = 0.75; + else if (sr_correction > 1.25) + sr_correction = 1.25; + } else { sr_correction = 0.75; - else if (sr_correction > 1.25) - sr_correction = 1.25; + } // II ratio correction factor for clip as a whole clip_iiratio = cpi->twopass.total_stats->intra_error / @@ -1178,12 +1188,16 @@ static double get_prediction_decay_rate(VP9_COMP *cpi, mb_sr_err_diff = (next_frame->sr_coded_error - next_frame->coded_error) / (cpi->common.MBs); - second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0); - second_ref_decay = pow(second_ref_decay, 0.5); - if 
(second_ref_decay < 0.85) + if (mb_sr_err_diff <= 512.0) { + second_ref_decay = 1.0 - (mb_sr_err_diff / 512.0); + second_ref_decay = pow(second_ref_decay, 0.5); + if (second_ref_decay < 0.85) + second_ref_decay = 0.85; + else if (second_ref_decay > 1.0) + second_ref_decay = 1.0; + } else { second_ref_decay = 0.85; - else if (second_ref_decay > 1.0) - second_ref_decay = 1.0; + } if (second_ref_decay < prediction_decay_rate) prediction_decay_rate = second_ref_decay; diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index d5c7032a9..fd1bb2b4e 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -21,7 +21,7 @@ void print_mode_context(VP9_COMMON *pc); // The maximum number of steps in a step search given the largest // allowed initial step -#define MAX_MVSEARCH_STEPS 10 +#define MAX_MVSEARCH_STEPS 11 // Max full pel mv specified in 1 pel units #define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1) // Maximum size of the first step in full pel units diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index 5e4b4915b..57ebfa1b1 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -843,8 +843,8 @@ void vp9_set_speed_features(VP9_COMP *cpi) { cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4; cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4; if (cpi->oxcf.lossless || cpi->mb.e_mbd.lossless) { - cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4_x8; - cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4_x8; + cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4; + cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4; } cpi->mb.quantize_b_4x4 = vp9_regular_quantize_b_4x4; @@ -1204,11 +1204,11 @@ void vp9_change_config(VP9_PTR ptr, VP9_CONFIG *oxcf) { cpi->oxcf.lossless = oxcf->lossless; if (cpi->oxcf.lossless) { - cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_inv_walsh4x4_1_x8; - cpi->mb.e_mbd.inv_txm4x4 = vp9_short_inv_walsh4x4_x8; + cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_iwalsh4x4_1; + cpi->mb.e_mbd.inv_txm4x4 = vp9_short_iwalsh4x4; } else { - cpi->mb.e_mbd.inv_txm4x4_1 = 
vp9_short_idct4x4llm_1; - cpi->mb.e_mbd.inv_txm4x4 = vp9_short_idct4x4llm; + cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4_1; + cpi->mb.e_mbd.inv_txm4x4 = vp9_short_idct4x4; } cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL; diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h index a7a07b393..160722a12 100644 --- a/vp9/encoder/vp9_onyx_int.h +++ b/vp9/encoder/vp9_onyx_int.h @@ -390,11 +390,6 @@ typedef struct VP9_COMP { CODING_CONTEXT coding_context; // Rate targetting variables - int64_t prediction_error; - int64_t last_prediction_error; - int64_t intra_error; - int64_t last_intra_error; - int this_frame_target; int projected_frame_size; int last_q[2]; // Separate values for Intra/Inter diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 387e32583..6e8be6b15 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1168,7 +1168,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, BLOCK *be, DECLARE_ALIGNED_ARRAY(16, int16_t, best_dqcoeff, 16); #if CONFIG_NEWBINTRAMODES - b->bmi.as_mode.context = vp9_find_bpred_context(b); + b->bmi.as_mode.context = vp9_find_bpred_context(xd, b); #endif xd->mode_info_context->mbmi.txfm_size = TX_4X4; for (mode = B_DC_PRED; mode < LEFT4X4; mode++) { @@ -1279,7 +1279,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb, bmode_costs = mb->bmode_costs[A][L]; } #if CONFIG_NEWBINTRAMODES - mic->bmi[i].as_mode.context = vp9_find_bpred_context(xd->block + i); + mic->bmi[i].as_mode.context = vp9_find_bpred_context(xd, xd->block + i); #endif total_rd += rd_pick_intra4x4block( diff --git a/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c b/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c index ff884d999..28c4c754e 100644 --- a/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c +++ b/vp9/encoder/x86/vp9_dct_sse2_intrinsics.c @@ -270,3 +270,629 @@ void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) { _mm_storeu_si128 ((__m128i *)(output + 7 * 8), in7); } } + 
+void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) { + // The 2D transform is done with two passes which are actually pretty + // similar. In the first one, we transform the columns and transpose + // the results. In the second one, we transform the rows. To achieve that, + // as the first pass results are transposed, we tranpose the columns (that + // is the transposed rows) and transpose the results (so that it goes back + // in normal/row positions). + const int stride = pitch >> 1; + int pass; + // We need an intermediate buffer between passes. + int16_t intermediate[256]; + int16_t *in = input; + int16_t *out = intermediate; + // Constants + // When we use them, in one case, they are all the same. In all others + // it's a pair of them that we need to repeat four times. This is done + // by constructing the 32 bit constant corresponding to that pair. + const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64); + const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64); + const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64); + const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64); + const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64); + const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64); + const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64); + const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64); + const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64); + const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64); + const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64); + const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64); + const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64); + const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64); + const __m128i k__cospi_p06_p26 = 
pair_set_epi16(cospi_6_64, cospi_26_64); + const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64); + const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64); + const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING); + const __m128i kOne = _mm_set1_epi16(1); + // Do the two transform/transpose passes + for (pass = 0; pass < 2; ++pass) { + // We process eight columns (transposed rows in second pass) at a time. + int column_start; + for (column_start = 0; column_start < 16; column_start += 8) { + __m128i in00, in01, in02, in03, in04, in05, in06, in07; + __m128i in08, in09, in10, in11, in12, in13, in14, in15; + __m128i input0, input1, input2, input3, input4, input5, input6, input7; + __m128i step1_0, step1_1, step1_2, step1_3; + __m128i step1_4, step1_5, step1_6, step1_7; + __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6; + __m128i step3_0, step3_1, step3_2, step3_3; + __m128i step3_4, step3_5, step3_6, step3_7; + __m128i res00, res01, res02, res03, res04, res05, res06, res07; + __m128i res08, res09, res10, res11, res12, res13, res14, res15; + // Load and pre-condition input. 
+ if (0 == pass) { + in00 = _mm_loadu_si128((const __m128i *)(in + 0 * stride)); + in01 = _mm_loadu_si128((const __m128i *)(in + 1 * stride)); + in02 = _mm_loadu_si128((const __m128i *)(in + 2 * stride)); + in03 = _mm_loadu_si128((const __m128i *)(in + 3 * stride)); + in04 = _mm_loadu_si128((const __m128i *)(in + 4 * stride)); + in05 = _mm_loadu_si128((const __m128i *)(in + 5 * stride)); + in06 = _mm_loadu_si128((const __m128i *)(in + 6 * stride)); + in07 = _mm_loadu_si128((const __m128i *)(in + 7 * stride)); + in08 = _mm_loadu_si128((const __m128i *)(in + 8 * stride)); + in09 = _mm_loadu_si128((const __m128i *)(in + 9 * stride)); + in10 = _mm_loadu_si128((const __m128i *)(in + 10 * stride)); + in11 = _mm_loadu_si128((const __m128i *)(in + 11 * stride)); + in12 = _mm_loadu_si128((const __m128i *)(in + 12 * stride)); + in13 = _mm_loadu_si128((const __m128i *)(in + 13 * stride)); + in14 = _mm_loadu_si128((const __m128i *)(in + 14 * stride)); + in15 = _mm_loadu_si128((const __m128i *)(in + 15 * stride)); + // x = x << 2 + in00 = _mm_slli_epi16(in00, 2); + in01 = _mm_slli_epi16(in01, 2); + in02 = _mm_slli_epi16(in02, 2); + in03 = _mm_slli_epi16(in03, 2); + in04 = _mm_slli_epi16(in04, 2); + in05 = _mm_slli_epi16(in05, 2); + in06 = _mm_slli_epi16(in06, 2); + in07 = _mm_slli_epi16(in07, 2); + in08 = _mm_slli_epi16(in08, 2); + in09 = _mm_slli_epi16(in09, 2); + in10 = _mm_slli_epi16(in10, 2); + in11 = _mm_slli_epi16(in11, 2); + in12 = _mm_slli_epi16(in12, 2); + in13 = _mm_slli_epi16(in13, 2); + in14 = _mm_slli_epi16(in14, 2); + in15 = _mm_slli_epi16(in15, 2); + } else { + in00 = _mm_loadu_si128((const __m128i *)(in + 0 * 16)); + in01 = _mm_loadu_si128((const __m128i *)(in + 1 * 16)); + in02 = _mm_loadu_si128((const __m128i *)(in + 2 * 16)); + in03 = _mm_loadu_si128((const __m128i *)(in + 3 * 16)); + in04 = _mm_loadu_si128((const __m128i *)(in + 4 * 16)); + in05 = _mm_loadu_si128((const __m128i *)(in + 5 * 16)); + in06 = _mm_loadu_si128((const __m128i *)(in + 6 * 16)); + 
in07 = _mm_loadu_si128((const __m128i *)(in + 7 * 16)); + in08 = _mm_loadu_si128((const __m128i *)(in + 8 * 16)); + in09 = _mm_loadu_si128((const __m128i *)(in + 9 * 16)); + in10 = _mm_loadu_si128((const __m128i *)(in + 10 * 16)); + in11 = _mm_loadu_si128((const __m128i *)(in + 11 * 16)); + in12 = _mm_loadu_si128((const __m128i *)(in + 12 * 16)); + in13 = _mm_loadu_si128((const __m128i *)(in + 13 * 16)); + in14 = _mm_loadu_si128((const __m128i *)(in + 14 * 16)); + in15 = _mm_loadu_si128((const __m128i *)(in + 15 * 16)); + // x = (x + 1) >> 2 + in00 = _mm_add_epi16(in00, kOne); + in01 = _mm_add_epi16(in01, kOne); + in02 = _mm_add_epi16(in02, kOne); + in03 = _mm_add_epi16(in03, kOne); + in04 = _mm_add_epi16(in04, kOne); + in05 = _mm_add_epi16(in05, kOne); + in06 = _mm_add_epi16(in06, kOne); + in07 = _mm_add_epi16(in07, kOne); + in08 = _mm_add_epi16(in08, kOne); + in09 = _mm_add_epi16(in09, kOne); + in10 = _mm_add_epi16(in10, kOne); + in11 = _mm_add_epi16(in11, kOne); + in12 = _mm_add_epi16(in12, kOne); + in13 = _mm_add_epi16(in13, kOne); + in14 = _mm_add_epi16(in14, kOne); + in15 = _mm_add_epi16(in15, kOne); + in00 = _mm_srai_epi16(in00, 2); + in01 = _mm_srai_epi16(in01, 2); + in02 = _mm_srai_epi16(in02, 2); + in03 = _mm_srai_epi16(in03, 2); + in04 = _mm_srai_epi16(in04, 2); + in05 = _mm_srai_epi16(in05, 2); + in06 = _mm_srai_epi16(in06, 2); + in07 = _mm_srai_epi16(in07, 2); + in08 = _mm_srai_epi16(in08, 2); + in09 = _mm_srai_epi16(in09, 2); + in10 = _mm_srai_epi16(in10, 2); + in11 = _mm_srai_epi16(in11, 2); + in12 = _mm_srai_epi16(in12, 2); + in13 = _mm_srai_epi16(in13, 2); + in14 = _mm_srai_epi16(in14, 2); + in15 = _mm_srai_epi16(in15, 2); + } + in += 8; + // Calculate input for the first 8 results. 
+ { + input0 = _mm_add_epi16(in00, in15); + input1 = _mm_add_epi16(in01, in14); + input2 = _mm_add_epi16(in02, in13); + input3 = _mm_add_epi16(in03, in12); + input4 = _mm_add_epi16(in04, in11); + input5 = _mm_add_epi16(in05, in10); + input6 = _mm_add_epi16(in06, in09); + input7 = _mm_add_epi16(in07, in08); + } + // Calculate input for the next 8 results. + { + step1_0 = _mm_sub_epi16(in07, in08); + step1_1 = _mm_sub_epi16(in06, in09); + step1_2 = _mm_sub_epi16(in05, in10); + step1_3 = _mm_sub_epi16(in04, in11); + step1_4 = _mm_sub_epi16(in03, in12); + step1_5 = _mm_sub_epi16(in02, in13); + step1_6 = _mm_sub_epi16(in01, in14); + step1_7 = _mm_sub_epi16(in00, in15); + } + // Work on the first eight values; fdct8_1d(input, even_results); + { + // Add/substract + const __m128i q0 = _mm_add_epi16(input0, input7); + const __m128i q1 = _mm_add_epi16(input1, input6); + const __m128i q2 = _mm_add_epi16(input2, input5); + const __m128i q3 = _mm_add_epi16(input3, input4); + const __m128i q4 = _mm_sub_epi16(input3, input4); + const __m128i q5 = _mm_sub_epi16(input2, input5); + const __m128i q6 = _mm_sub_epi16(input1, input6); + const __m128i q7 = _mm_sub_epi16(input0, input7); + // Work on first four results + { + // Add/substract + const __m128i r0 = _mm_add_epi16(q0, q3); + const __m128i r1 = _mm_add_epi16(q1, q2); + const __m128i r2 = _mm_sub_epi16(q1, q2); + const __m128i r3 = _mm_sub_epi16(q0, q3); + // Interleave to do the multiply by constants which gets us + // into 32 bits. 
+ const __m128i t0 = _mm_unpacklo_epi16(r0, r1); + const __m128i t1 = _mm_unpackhi_epi16(r0, r1); + const __m128i t2 = _mm_unpacklo_epi16(r2, r3); + const __m128i t3 = _mm_unpackhi_epi16(r2, r3); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res00 = _mm_packs_epi32(w0, w1); + res08 = _mm_packs_epi32(w2, w3); + res04 = _mm_packs_epi32(w4, w5); + res12 = _mm_packs_epi32(w6, w7); + } + // Work on next four results + { + // Interleave to do the multiply by constants which gets us + // into 32 bits. 
+ const __m128i d0 = _mm_unpacklo_epi16(q6, q5); + const __m128i d1 = _mm_unpackhi_epi16(q6, q5); + const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16); + const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16); + const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16); + const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING); + const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING); + const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING); + const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING); + const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS); + const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS); + const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS); + const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS); + // Combine + const __m128i r0 = _mm_packs_epi32(s0, s1); + const __m128i r1 = _mm_packs_epi32(s2, s3); + // Add/substract + const __m128i x0 = _mm_add_epi16(q4, r0); + const __m128i x1 = _mm_sub_epi16(q4, r0); + const __m128i x2 = _mm_sub_epi16(q7, r1); + const __m128i x3 = _mm_add_epi16(q7, r1); + // Interleave to do the multiply by constants which gets us + // into 32 bits. 
+ const __m128i t0 = _mm_unpacklo_epi16(x0, x3); + const __m128i t1 = _mm_unpackhi_epi16(x0, x3); + const __m128i t2 = _mm_unpacklo_epi16(x1, x2); + const __m128i t3 = _mm_unpackhi_epi16(x1, x2); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04); + const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28); + const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28); + const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20); + const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20); + const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12); + const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING); + const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING); + const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING); + const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS); + const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS); + const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS); + const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS); + // Combine + res02 = _mm_packs_epi32(w0, w1); + res14 = _mm_packs_epi32(w2, w3); + res10 = _mm_packs_epi32(w4, w5); + res06 = _mm_packs_epi32(w6, w7); + } + } + // Work on the next eight values; step1 -> odd_results + { + // step 2 + { + const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); + const __m128i t1 = 
_mm_unpackhi_epi16(step1_5, step1_2); + const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); + const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // Combine + step2_2 = _mm_packs_epi32(w0, w1); + step2_3 = _mm_packs_epi32(w2, w3); + } + { + const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2); + const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2); + const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3); + const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // Combine + step2_5 = 
_mm_packs_epi32(w0, w1); + step2_4 = _mm_packs_epi32(w2, w3); + } + // step 3 + { + step3_0 = _mm_add_epi16(step1_0, step2_3); + step3_1 = _mm_add_epi16(step1_1, step2_2); + step3_2 = _mm_sub_epi16(step1_1, step2_2); + step3_3 = _mm_sub_epi16(step1_0, step2_3); + step3_4 = _mm_sub_epi16(step1_7, step2_4); + step3_5 = _mm_sub_epi16(step1_6, step2_5); + step3_6 = _mm_add_epi16(step1_6, step2_5); + step3_7 = _mm_add_epi16(step1_7, step2_4); + } + // step 4 + { + const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); + const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); + const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); + const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // Combine + step2_1 = _mm_packs_epi32(w0, w1); + step2_2 = _mm_packs_epi32(w2, w3); + } + { + const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6); + const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6); + const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5); + const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24); + const __m128i u3 = _mm_madd_epi16(t3, 
k__cospi_m08_p24); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // Combine + step2_6 = _mm_packs_epi32(w0, w1); + step2_5 = _mm_packs_epi32(w2, w3); + } + // step 5 + { + step1_0 = _mm_add_epi16(step3_0, step2_1); + step1_1 = _mm_sub_epi16(step3_0, step2_1); + step1_2 = _mm_sub_epi16(step3_3, step2_2); + step1_3 = _mm_add_epi16(step3_3, step2_2); + step1_4 = _mm_add_epi16(step3_4, step2_5); + step1_5 = _mm_sub_epi16(step3_4, step2_5); + step1_6 = _mm_sub_epi16(step3_7, step2_6); + step1_7 = _mm_add_epi16(step3_7, step2_6); + } + // step 6 + { + const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); + const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); + const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); + const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // 
Combine + res01 = _mm_packs_epi32(w0, w1); + res09 = _mm_packs_epi32(w2, w3); + } + { + const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); + const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); + const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); + const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // Combine + res05 = _mm_packs_epi32(w0, w1); + res13 = _mm_packs_epi32(w2, w3); + } + { + const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5); + const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5); + const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4); + const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = 
_mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // Combine + res11 = _mm_packs_epi32(w0, w1); + res03 = _mm_packs_epi32(w2, w3); + } + { + const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7); + const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7); + const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6); + const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6); + const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30); + const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30); + const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14); + const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14); + // dct_const_round_shift + const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING); + const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING); + const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING); + const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING); + const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS); + const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS); + const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS); + const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS); + // Combine + res15 = _mm_packs_epi32(w0, w1); + res07 = _mm_packs_epi32(w2, w3); + } + } + // Transpose the results, do it as two 8x8 transposes. 
+ { + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01); + const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03); + const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01); + const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03); + const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05); + const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07); + const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05); + const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 44 54 45 55 46 56 47 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 25 35 + // 44 54 64 74 45 55 65 75 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + const __m128i tr2_6 =
_mm_unpacklo_epi64(tr1_3, tr1_7); + const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + _mm_storeu_si128 ((__m128i *)(out + 0 * 16), tr2_0); + _mm_storeu_si128 ((__m128i *)(out + 1 * 16), tr2_1); + _mm_storeu_si128 ((__m128i *)(out + 2 * 16), tr2_2); + _mm_storeu_si128 ((__m128i *)(out + 3 * 16), tr2_3); + _mm_storeu_si128 ((__m128i *)(out + 4 * 16), tr2_4); + _mm_storeu_si128 ((__m128i *)(out + 5 * 16), tr2_5); + _mm_storeu_si128 ((__m128i *)(out + 6 * 16), tr2_6); + _mm_storeu_si128 ((__m128i *)(out + 7 * 16), tr2_7); + } + { + // 00 01 02 03 04 05 06 07 + // 10 11 12 13 14 15 16 17 + // 20 21 22 23 24 25 26 27 + // 30 31 32 33 34 35 36 37 + // 40 41 42 43 44 45 46 47 + // 50 51 52 53 54 55 56 57 + // 60 61 62 63 64 65 66 67 + // 70 71 72 73 74 75 76 77 + const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09); + const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11); + const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09); + const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11); + const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13); + const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15); + const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13); + const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + // 04 14 05 15 06 16 07 17 + // 24 34 25 35 26 36 27 37 + // 40 50 41 51 42 52 43 53 + // 60 70 61 71 62 72 63 73 + // 44 54 45 55 46 56 47 57 + // 64 74 65 75 66 76 67 77 + const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1); + const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3); + const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1); + const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3); + const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5); + const
__m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7); + const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5); + const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7); + // 00 10 20 30 01 11 21 31 + // 40 50 60 70 41 51 61 71 + // 02 12 22 32 03 13 23 33 + // 42 52 62 72 43 53 63 73 + // 04 14 24 34 05 15 25 35 + // 44 54 64 74 45 55 65 75 + // 06 16 26 36 07 17 27 37 + // 46 56 66 76 47 57 67 77 + const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4); + const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4); + const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6); + const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6); + const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5); + const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5); + const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7); + const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7); + // 00 10 20 30 40 50 60 70 + // 01 11 21 31 41 51 61 71 + // 02 12 22 32 42 52 62 72 + // 03 13 23 33 43 53 63 73 + // 04 14 24 34 44 54 64 74 + // 05 15 25 35 45 55 65 75 + // 06 16 26 36 46 56 66 76 + // 07 17 27 37 47 57 67 77 + // Store results + _mm_storeu_si128 ((__m128i *)(out + 8 + 0 * 16), tr2_0); + _mm_storeu_si128 ((__m128i *)(out + 8 + 1 * 16), tr2_1); + _mm_storeu_si128 ((__m128i *)(out + 8 + 2 * 16), tr2_2); + _mm_storeu_si128 ((__m128i *)(out + 8 + 3 * 16), tr2_3); + _mm_storeu_si128 ((__m128i *)(out + 8 + 4 * 16), tr2_4); + _mm_storeu_si128 ((__m128i *)(out + 8 + 5 * 16), tr2_5); + _mm_storeu_si128 ((__m128i *)(out + 8 + 6 * 16), tr2_6); + _mm_storeu_si128 ((__m128i *)(out + 8 + 7 * 16), tr2_7); + } + out += 8*16; + } + // Setup in/out for next pass.
+ in = intermediate; + out = output; + } +} diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk index f1b157317..ea8631711 100644 --- a/vp9/vp9_common.mk +++ b/vp9/vp9_common.mk @@ -29,7 +29,7 @@ VP9_COMMON_SRCS-yes += common/vp9_filter.c VP9_COMMON_SRCS-yes += common/vp9_filter.h VP9_COMMON_SRCS-yes += common/vp9_findnearmv.c VP9_COMMON_SRCS-yes += common/generic/vp9_systemdependent.c -VP9_COMMON_SRCS-yes += common/vp9_idctllm.c +VP9_COMMON_SRCS-yes += common/vp9_idct.c VP9_COMMON_SRCS-yes += common/vp9_alloccommon.h VP9_COMMON_SRCS-yes += common/vp9_blockd.h VP9_COMMON_SRCS-yes += common/vp9_common.h @@ -92,7 +92,7 @@ VP9_COMMON_SRCS-$(CONFIG_POSTPROC) += common/vp9_postproc.c VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_iwalsh_mmx.asm VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_recon_mmx.asm VP9_COMMON_SRCS-$(HAVE_MMX) += common/x86/vp9_loopfilter_mmx.asm -VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idctllm_sse2.asm +VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_iwalsh_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_loopfilter_sse2.asm VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_recon_sse2.asm @@ -111,13 +111,13 @@ VP9_COMMON_SRCS-yes += common/vp9_maskingmv.c VP9_COMMON_SRCS-$(HAVE_SSE3) += common/x86/vp9_mask_sse3.asm endif -VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idctllm_x86.c +VP9_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp9_idct_x86.c VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_sadmxn_x86.c ifeq ($(HAVE_SSE2),yes) -vp9/common/x86/vp9_idctllm_x86.c.o: CFLAGS += -msse2 +vp9/common/x86/vp9_idct_x86.c.o: CFLAGS += -msse2 vp9/common/x86/vp9_loopfilter_x86.c.o: CFLAGS += -msse2 vp9/common/x86/vp9_sadmxn_x86.c.o: CFLAGS += -msse2 -vp9/common/x86/vp9_idctllm_x86.c.d: CFLAGS += -msse2 +vp9/common/x86/vp9_idct_x86.c.d: CFLAGS += -msse2 vp9/common/x86/vp9_loopfilter_x86.c.d: CFLAGS += -msse2 vp9/common/x86/vp9_sadmxn_x86.c.d: CFLAGS += -msse2 endif |