summary refs log tree commit diff
path: root/vp8
diff options
context:
space:
mode:
author John Koleszar <jkoleszar@google.com> 2011-04-05 00:05:07 -0400
committer John Koleszar <jkoleszar@google.com> 2011-04-05 00:05:07 -0400
commit 89bdcc211e717b8f7407dd4af225ffd945dd9fe3 (patch)
tree 4212fe21393937302ab1ab93b158a5e4146a76fe /vp8
parent 50643fcfd1c82d338a75ada1b26ab3a3a22f8910 (diff)
parent 91036996ac3871dccf4c6cfe47504c7b99f8555c (diff)
download libvpx-89bdcc211e717b8f7407dd4af225ffd945dd9fe3.tar
libvpx-89bdcc211e717b8f7407dd4af225ffd945dd9fe3.tar.gz
libvpx-89bdcc211e717b8f7407dd4af225ffd945dd9fe3.tar.bz2
libvpx-89bdcc211e717b8f7407dd4af225ffd945dd9fe3.zip
Merge remote branch 'internal/upstream' into HEAD
Diffstat (limited to 'vp8')
-rw-r--r-- vp8/common/entropymv.h 2
-rwxr-xr-x[-rw-r--r--] vp8/decoder/detokenize.c 8
-rw-r--r-- vp8/encoder/block.h 2
-rw-r--r-- vp8/encoder/encodeframe.c 2
-rw-r--r-- vp8/encoder/encodemv.c 31
-rw-r--r-- vp8/encoder/encodemv.h 2
-rw-r--r-- vp8/encoder/ethreading.c 4
-rw-r--r-- vp8/encoder/firstpass.c 6
-rw-r--r-- vp8/encoder/mcomp.c 111
-rw-r--r-- vp8/encoder/mcomp.h 2
-rw-r--r-- vp8/encoder/onyx_if.c 8
-rw-r--r-- vp8/encoder/pickinter.c 4
-rw-r--r-- vp8/encoder/ratectrl.c 2
-rw-r--r-- vp8/encoder/rdopt.c 12
-rw-r--r-- vp8/encoder/temporal_filter.c 65
-rw-r--r-- vp8/encoder/x86/quantize_sse2.asm 94
16 files changed, 149 insertions, 206 deletions
diff --git a/vp8/common/entropymv.h b/vp8/common/entropymv.h
index 911507ddc..2db1e385b 100644
--- a/vp8/common/entropymv.h
+++ b/vp8/common/entropymv.h
@@ -18,6 +18,8 @@ enum
{
mv_max = 1023, /* max absolute value of a MV component */
MVvals = (2 * mv_max) + 1, /* # possible values "" */
+ mvfp_max = 255, /* max absolute value of a full pixel MV component */
+ MVfpvals = (2 * mvfp_max) +1, /* # possible full pixel MV values */
mvlong_width = 10, /* Large MVs have 9 bit magnitudes */
mvnum_short = 8, /* magnitudes 0 through 7 */
diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c
index 1d056bbb2..2d050512a 100644..100755
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c
@@ -181,7 +181,7 @@ int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
{
ENTROPY_CONTEXT *A = (ENTROPY_CONTEXT *)x->above_context;
ENTROPY_CONTEXT *L = (ENTROPY_CONTEXT *)x->left_context;
- const VP8_COMMON *const oc = & dx->common;
+ const FRAME_CONTEXT * const fc = &dx->common.fc;
BOOL_DECODER *bc = x->current_bc;
@@ -236,7 +236,7 @@ int vp8_decode_mb_tokens(VP8D_COMP *dx, MACROBLOCKD *x)
range = bc->range;
- coef_probs = oc->fc.coef_probs [type] [ 0 ] [0];
+ coef_probs = fc->coef_probs [type] [ 0 ] [0];
BLOCK_LOOP:
a = A + vp8_block2above[i];
@@ -348,7 +348,7 @@ BLOCK_FINISHED:
type = 0;
i = 0;
stop = 16;
- coef_probs = oc->fc.coef_probs [type] [ 0 ] [0];
+ coef_probs = fc->coef_probs [type] [ 0 ] [0];
qcoeff_ptr -= (24*16 + 16);
goto BLOCK_LOOP;
}
@@ -356,7 +356,7 @@ BLOCK_FINISHED:
if (i == 16)
{
type = 2;
- coef_probs = oc->fc.coef_probs [type] [ 0 ] [0];
+ coef_probs = fc->coef_probs [type] [ 0 ] [0];
stop = 24;
goto BLOCK_LOOP;
}
diff --git a/vp8/encoder/block.h b/vp8/encoder/block.h
index 2fd67822b..5a2568dde 100644
--- a/vp8/encoder/block.h
+++ b/vp8/encoder/block.h
@@ -86,7 +86,7 @@ typedef struct
int mvcosts[2][MVvals+1];
int *mvcost[2];
- int mvsadcosts[2][MVvals+1];
+ int mvsadcosts[2][MVfpvals+1];
int *mvsadcost[2];
int mbmode_cost[2][MB_MODE_COUNT];
int intra_uv_mode_cost[2][MB_MODE_COUNT];
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 3d280005d..ac0f93790 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -1117,7 +1117,7 @@ void vp8_encode_frame(VP8_COMP *cpi)
}
if (flag[0] || flag[1])
- vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
+ vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
}
#endif
diff --git a/vp8/encoder/encodemv.c b/vp8/encoder/encodemv.c
index 6b1e6f965..a4849c654 100644
--- a/vp8/encoder/encodemv.c
+++ b/vp8/encoder/encodemv.c
@@ -134,31 +134,14 @@ static unsigned int cost_mvcomponent(const int v, const struct mv_context *mvc)
return cost; // + vp8_cost_bit( p [MVPsign], v < 0);
}
-//#define M_LOG2_E 0.693147180559945309417
-//#define log2f(x) (log (x) / (float) M_LOG2_E)
-void vp8_build_component_cost_table(int *mvcost[2], int *mvsadcost[2], const MV_CONTEXT *mvc, int mvc_flag[2])
+void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, int mvc_flag[2])
{
int i = 1; //-mv_max;
unsigned int cost0 = 0;
unsigned int cost1 = 0;
vp8_clear_system_state();
-#if 0
- mvsadcost [0] [0] = 300;
- mvsadcost [1] [0] = 300;
-
- do
- {
- double z = 256 * (2 * (log2f(2 * i) + .6));
- mvsadcost [0][i] = (int) z;
- mvsadcost [1][i] = (int) z;
- mvsadcost [0][-i] = (int) z;
- mvsadcost [1][-i] = (int) z;
- }
- while (++i <= mv_max);
-
-#endif
i = 1;
@@ -193,16 +176,6 @@ void vp8_build_component_cost_table(int *mvcost[2], int *mvsadcost[2], const MV_
}
while (++i <= mv_max);
}
-
- /*
- i=-mv_max;
- do
- {
- mvcost [0] [i] = cost_mvcomponent( i, mvc[0]);
- mvcost [1] [i] = cost_mvcomponent( i, mvc[1]);
- }
- while( ++i <= mv_max);
- */
}
@@ -436,7 +409,7 @@ void vp8_write_mvprobs(VP8_COMP *cpi)
);
if (flags[0] || flags[1])
- vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flags);
+ vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flags);
#ifdef ENTROPY_STATS
active_section = 5;
diff --git a/vp8/encoder/encodemv.h b/vp8/encoder/encodemv.h
index e4481bff0..a6116c133 100644
--- a/vp8/encoder/encodemv.h
+++ b/vp8/encoder/encodemv.h
@@ -16,6 +16,6 @@
void vp8_write_mvprobs(VP8_COMP *);
void vp8_encode_motion_vector(vp8_writer *, const MV *, const MV_CONTEXT *);
-void vp8_build_component_cost_table(int *mvcost[2], int *mvsadcost[2], const MV_CONTEXT *mvc, int mvc_flag[2]);
+void vp8_build_component_cost_table(int *mvcost[2], const MV_CONTEXT *mvc, int mvc_flag[2]);
#endif
diff --git a/vp8/encoder/ethreading.c b/vp8/encoder/ethreading.c
index d73542226..8aef915b8 100644
--- a/vp8/encoder/ethreading.c
+++ b/vp8/encoder/ethreading.c
@@ -319,8 +319,8 @@ static void setup_mbby_copy(MACROBLOCK *mbdst, MACROBLOCK *mbsrc)
vpx_memcpy(z->mvcosts, x->mvcosts, sizeof(x->mvcosts));
z->mvcost[0] = &z->mvcosts[0][mv_max+1];
z->mvcost[1] = &z->mvcosts[1][mv_max+1];
- z->mvsadcost[0] = &z->mvsadcosts[0][mv_max+1];
- z->mvsadcost[1] = &z->mvsadcosts[1][mv_max+1];
+ z->mvsadcost[0] = &z->mvsadcosts[0][mvfp_max+1];
+ z->mvsadcost[1] = &z->mvsadcosts[1][mvfp_max+1];
vpx_memcpy(z->token_costs, x->token_costs, sizeof(x->token_costs));
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 3ebbba4d6..9a7774863 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -446,7 +446,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, M
xd->pre.y_buffer = recon_buffer->y_buffer + recon_yoffset;
// Initial step/diamond search centred on best mv
- tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost, ref_mv);
+ tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param, x->errorperbit, &num00, &v_fn_ptr, x->mvcost, ref_mv);
if ( tmp_err < INT_MAX-new_mv_mode_penalty )
tmp_err += new_mv_mode_penalty;
@@ -469,7 +469,7 @@ static void first_pass_motion_search(VP8_COMP *cpi, MACROBLOCK *x, MV *ref_mv, M
num00--;
else
{
- tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param + n, x->errorperbit, &num00, &v_fn_ptr, x->mvsadcost, x->mvcost, ref_mv);
+ tmp_err = cpi->diamond_search_sad(x, b, d, ref_mv, &tmp_mv, step_param + n, x->errorperbit, &num00, &v_fn_ptr, x->mvcost, ref_mv);
if ( tmp_err < INT_MAX-new_mv_mode_penalty )
tmp_err += new_mv_mode_penalty;
@@ -540,7 +540,7 @@ void vp8_first_pass(VP8_COMP *cpi)
int flag[2] = {1, 1};
vp8_initialize_rd_consts(cpi, cm->base_qindex+cm->y1dc_delta_q);
vpx_memcpy(cm->fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
- vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
+ vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cm->fc.mvc, flag);
}
// for each macroblock row in image
diff --git a/vp8/encoder/mcomp.c b/vp8/encoder/mcomp.c
index de6642b75..37c30da14 100644
--- a/vp8/encoder/mcomp.c
+++ b/vp8/encoder/mcomp.c
@@ -54,6 +54,11 @@ static int mv_err_cost(MV *mv, MV *ref, int *mvcost[2], int error_per_bit)
//return (vp8_mv_bit_cost(mv, ref, mvcost, 128) * error_per_bit + 128) >> 8;
}
+static int mvsad_err_cost(MV *mv, MV *ref, int *mvsadcost[2], int error_per_bit)
+{
+ /* Calculate sad error cost on full pixel basis. */
+ return ((mvsadcost[0][(mv->row - ref->row)] + mvsadcost[1][(mv->col - ref->col)]) * error_per_bit + 128) >> 8;
+}
static int mv_bits(MV *mv, MV *ref, int *mvcost[2])
{
@@ -753,7 +758,7 @@ int vp8_find_best_half_pixel_step(MACROBLOCK *mb, BLOCK *b, BLOCKD *d, MV *bestm
}
-#define MVC(r,c) (((mvsadcost[0][((r)<<2)-rr] + mvsadcost[1][((c)<<2) - rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
+#define MVC(r,c) (((mvsadcost[0][r-rr] + mvsadcost[1][c-rc]) * error_per_bit + 128 )>>8 ) // estimated cost of a motion vector (r,c)
#define PRE(r,c) (*(d->base_pre) + d->pre + (r) * d->pre_stride + (c)) // pointer to predictor base of a motionvector
#define DIST(r,c,v) vfp->sdf( src,src_stride,PRE(r,c),d->pre_stride, v) // returns sad error score.
#define ERR(r,c,v) (MVC(r,c)+DIST(r,c,v)) // returns distortion + motion vector cost
@@ -801,8 +806,8 @@ int vp8_hex_search
if (br > x->mv_row_max) br = x->mv_row_max;
- rr >>= 1;
- rc >>= 1;
+ rr >>= 3;
+ rc >>= 3;
besterr = ERR(br, bc, thiserr);
@@ -915,7 +920,6 @@ int vp8_diamond_search_sad
int error_per_bit,
int *num00,
vp8_variance_fn_ptr_t *fn_ptr,
- int *mvsadcost[2],
int *mvcost[2],
MV *center_mv
)
@@ -944,8 +948,16 @@ int vp8_diamond_search_sad
unsigned char *check_here;
int thissad;
+ int *mvsadcost[2] = {x->mvsadcost[0], x->mvsadcost[1]};
+ MV fcenter_mv;
+ fcenter_mv.row = center_mv->row >> 3;
+ fcenter_mv.col = center_mv->col >> 3;
+
*num00 = 0;
+ best_mv->row = ref_row;
+ best_mv->col = ref_col;
+
// Work out the start point for the search
in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
best_address = in_what;
@@ -955,7 +967,7 @@ int vp8_diamond_search_sad
(ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
{
// Check the starting position
- bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, error_per_bit);
}
// search_param determines the length of the initial step and hence the number of iterations
@@ -964,8 +976,6 @@ int vp8_diamond_search_sad
tot_steps = (x->ss_count / x->searches_per_step) - search_param;
i = 1;
- best_mv->row = ref_row;
- best_mv->col = ref_col;
for (step = 0; step < tot_steps ; step++)
{
@@ -984,9 +994,9 @@ int vp8_diamond_search_sad
if (thissad < bestsad)
{
- this_mv.row = this_row_offset << 3;
- this_mv.col = this_col_offset << 3;
- thissad += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+ this_mv.row = this_row_offset;
+ this_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
@@ -1031,7 +1041,6 @@ int vp8_diamond_search_sadx4
int error_per_bit,
int *num00,
vp8_variance_fn_ptr_t *fn_ptr,
- int *mvsadcost[2],
int *mvcost[2],
MV *center_mv
)
@@ -1060,7 +1069,14 @@ int vp8_diamond_search_sadx4
unsigned char *check_here;
unsigned int thissad;
+ int *mvsadcost[2] = {x->mvsadcost[0], x->mvsadcost[1]};
+ MV fcenter_mv;
+ fcenter_mv.row = center_mv->row >> 3;
+ fcenter_mv.col = center_mv->col >> 3;
+
*num00 = 0;
+ best_mv->row = ref_row;
+ best_mv->col = ref_col;
// Work out the start point for the search
in_what = (unsigned char *)(*(d->base_pre) + d->pre + (ref_row * (d->pre_stride)) + ref_col);
@@ -1071,7 +1087,7 @@ int vp8_diamond_search_sadx4
(ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
{
// Check the starting position
- bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
+ bestsad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride, 0x7fffffff) + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, error_per_bit);
}
// search_param determines the length of the initial step and hence the number of iterations
@@ -1080,8 +1096,6 @@ int vp8_diamond_search_sadx4
tot_steps = (x->ss_count / x->searches_per_step) - search_param;
i = 1;
- best_mv->row = ref_row;
- best_mv->col = ref_col;
for (step = 0; step < tot_steps ; step++)
{
@@ -1111,9 +1125,9 @@ int vp8_diamond_search_sadx4
{
if (sad_array[t] < bestsad)
{
- this_mv.row = (best_mv->row + ss[i].mv.row) << 3;
- this_mv.col = (best_mv->col + ss[i].mv.col) << 3;
- sad_array[t] += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+ this_mv.row = best_mv->row + ss[i].mv.row;
+ this_mv.col = best_mv->col + ss[i].mv.col;
+ sad_array[t] += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);
if (sad_array[t] < bestsad)
{
@@ -1140,9 +1154,9 @@ int vp8_diamond_search_sadx4
if (thissad < bestsad)
{
- this_mv.row = this_row_offset << 3;
- this_mv.col = this_col_offset << 3;
- thissad += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+ this_mv.row = this_row_offset;
+ this_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
@@ -1178,7 +1192,7 @@ int vp8_diamond_search_sadx4
#if !(CONFIG_REALTIME_ONLY)
-int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)
+int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], MV *center_mv)
{
unsigned char *what = (*(b->base_src) + b->src);
int what_stride = b->src_stride;
@@ -1202,6 +1216,11 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro
int col_min = ref_col - distance;
int col_max = ref_col + distance;
+ int *mvsadcost[2] = {x->mvsadcost[0], x->mvsadcost[1]};
+ MV fcenter_mv;
+ fcenter_mv.row = center_mv->row >> 3;
+ fcenter_mv.col = center_mv->col >> 3;
+
// Work out the mid point for the search
in_what = *(d->base_pre) + d->pre;
bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
@@ -1216,7 +1235,7 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro
// Baseline value at the centre
//bestsad = fn_ptr->sf( what,what_stride,bestaddress,in_what_stride) + (int)sqrt(mv_err_cost(ref_mv,ref_mv, mvcost,error_per_bit*14));
- bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, error_per_bit);
}
// Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
@@ -1234,17 +1253,17 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro
for (r = row_min; r < row_max ; r++)
{
- this_mv.row = r << 3;
+ this_mv.row = r;
check_here = r * mv_stride + in_what + col_min;
for (c = col_min; c < col_max; c++)
{
thissad = fn_ptr->sdf(what, what_stride, check_here , in_what_stride, bestsad);
- this_mv.col = c << 3;
+ this_mv.col = c;
//thissad += (int)sqrt(mv_err_cost(&this_mv,ref_mv, mvcost,error_per_bit*14));
//thissad += error_per_bit * mv_bits_sadcost[mv_bits(&this_mv, ref_mv, mvcost)];
- thissad += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit); //mv_bits(error_per_bit, &this_mv, ref_mv, mvsadcost);
if (thissad < bestsad)
{
@@ -1268,7 +1287,7 @@ int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int erro
return INT_MAX;
}
-int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)
+int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], MV *center_mv)
{
unsigned char *what = (*(b->base_src) + b->src);
int what_stride = b->src_stride;
@@ -1294,6 +1313,11 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
unsigned int sad_array[3];
+ int *mvsadcost[2] = {x->mvsadcost[0], x->mvsadcost[1]};
+ MV fcenter_mv;
+ fcenter_mv.row = center_mv->row >> 3;
+ fcenter_mv.col = center_mv->col >> 3;
+
// Work out the mid point for the search
in_what = *(d->base_pre) + d->pre;
bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
@@ -1306,7 +1330,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
(ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
{
// Baseline value at the centre
- bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, error_per_bit);
}
// Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
@@ -1324,7 +1348,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
for (r = row_min; r < row_max ; r++)
{
- this_mv.row = r << 3;
+ this_mv.row = r;
check_here = r * mv_stride + in_what + col_min;
c = col_min;
@@ -1340,8 +1364,8 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
if (thissad < bestsad)
{
- this_mv.col = c << 3;
- thissad += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+ this_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
@@ -1363,8 +1387,8 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
if (thissad < bestsad)
{
- this_mv.col = c << 3;
- thissad += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+ this_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
@@ -1391,7 +1415,7 @@ int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
return INT_MAX;
}
-int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], int *mvsadcost[2], MV *center_mv)
+int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int error_per_bit, int distance, vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2], MV *center_mv)
{
unsigned char *what = (*(b->base_src) + b->src);
int what_stride = b->src_stride;
@@ -1418,6 +1442,11 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
DECLARE_ALIGNED_ARRAY(16, unsigned short, sad_array8, 8);
unsigned int sad_array[3];
+ int *mvsadcost[2] = {x->mvsadcost[0], x->mvsadcost[1]};
+ MV fcenter_mv;
+ fcenter_mv.row = center_mv->row >> 3;
+ fcenter_mv.col = center_mv->col >> 3;
+
// Work out the mid point for the search
in_what = *(d->base_pre) + d->pre;
bestaddress = in_what + (ref_row * d->pre_stride) + ref_col;
@@ -1430,7 +1459,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
(ref_row > x->mv_row_min) && (ref_row < x->mv_row_max))
{
// Baseline value at the centre
- bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + mv_err_cost(ref_mv, center_mv, mvsadcost, error_per_bit);
+ bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride, 0x7fffffff) + mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, error_per_bit);
}
// Apply further limits to prevent us looking using vectors that stretch beyiond the UMV border
@@ -1448,7 +1477,7 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
for (r = row_min; r < row_max ; r++)
{
- this_mv.row = r << 3;
+ this_mv.row = r;
check_here = r * mv_stride + in_what + col_min;
c = col_min;
@@ -1464,8 +1493,8 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
if (thissad < bestsad)
{
- this_mv.col = c << 3;
- thissad += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+ this_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
@@ -1493,8 +1522,8 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
if (thissad < bestsad)
{
- this_mv.col = c << 3;
- thissad += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+ this_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
@@ -1516,8 +1545,8 @@ int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, MV *ref_mv, int er
if (thissad < bestsad)
{
- this_mv.col = c << 3;
- thissad += mv_err_cost(&this_mv, center_mv, mvsadcost, error_per_bit);
+ this_mv.col = c;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, error_per_bit);
if (thissad < bestsad)
{
diff --git a/vp8/encoder/mcomp.h b/vp8/encoder/mcomp.h
index 83f95c6e0..5efcec296 100644
--- a/vp8/encoder/mcomp.h
+++ b/vp8/encoder/mcomp.h
@@ -66,7 +66,6 @@ extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
int distance, \
vp8_variance_fn_ptr_t *fn_ptr, \
int *mvcost[2], \
- int *mvsadcost[2], \
MV *center_mv \
)
@@ -82,7 +81,6 @@ extern fractional_mv_step_fp vp8_skip_fractional_mv_step;
int error_per_bit, \
int *num00, \
vp8_variance_fn_ptr_t *fn_ptr, \
- int *mvsadcost[2], \
int *mvcost[2], \
MV *center_mv \
)
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 67c6f61d0..87f0f1853 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -1863,13 +1863,13 @@ static void cal_mvsadcosts(int *mvsadcost[2])
do
{
- double z = 256 * (2 * (log2f(2 * i) + .6));
+ double z = 256 * (2 * (log2f(8 * i) + .6));
mvsadcost [0][i] = (int) z;
mvsadcost [1][i] = (int) z;
mvsadcost [0][-i] = (int) z;
mvsadcost [1][-i] = (int) z;
}
- while (++i <= mv_max);
+ while (++i <= mvfp_max);
}
VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
@@ -2065,8 +2065,8 @@ VP8_PTR vp8_create_compressor(VP8_CONFIG *oxcf)
cpi->mb.mvcost[0] = &cpi->mb.mvcosts[0][mv_max+1];
cpi->mb.mvcost[1] = &cpi->mb.mvcosts[1][mv_max+1];
- cpi->mb.mvsadcost[0] = &cpi->mb.mvsadcosts[0][mv_max+1];
- cpi->mb.mvsadcost[1] = &cpi->mb.mvsadcosts[1][mv_max+1];
+ cpi->mb.mvsadcost[0] = &cpi->mb.mvsadcosts[0][mvfp_max+1];
+ cpi->mb.mvsadcost[1] = &cpi->mb.mvsadcosts[1][mvfp_max+1];
cal_mvsadcosts(cpi->mb.mvsadcost);
diff --git a/vp8/encoder/pickinter.c b/vp8/encoder/pickinter.c
index 0790d3517..0edd806a2 100644
--- a/vp8/encoder/pickinter.c
+++ b/vp8/encoder/pickinter.c
@@ -738,7 +738,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
}
else
{
- bestsme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv); //sadpb < 9
+ bestsme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &best_ref_mv); //sadpb < 9
mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
@@ -757,7 +757,7 @@ int vp8_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int rec
num00--;
else
{
- thissme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv); //sadpb = 9
+ thissme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &best_ref_mv); //sadpb = 9
if (thissme < bestsme)
{
diff --git a/vp8/encoder/ratectrl.c b/vp8/encoder/ratectrl.c
index 9c3dcdb27..f3bcf9921 100644
--- a/vp8/encoder/ratectrl.c
+++ b/vp8/encoder/ratectrl.c
@@ -355,7 +355,7 @@ void vp8_setup_key_frame(VP8_COMP *cpi)
vpx_memcpy(cpi->common.fc.mvc, vp8_default_mv_context, sizeof(vp8_default_mv_context));
{
int flag[2] = {1, 1};
- vp8_build_component_cost_table(cpi->mb.mvcost, cpi->mb.mvsadcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flag);
+ vp8_build_component_cost_table(cpi->mb.mvcost, (const MV_CONTEXT *) cpi->common.fc.mvc, flag);
}
vpx_memset(cpi->common.fc.pre_mvc, 0, sizeof(cpi->common.fc.pre_mvc)); //initialize pre_mvc to all zero.
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 6d9e33a69..f125e952a 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1224,7 +1224,7 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
{
bestsme = cpi->diamond_search_sad(x, c, e, bsi->mvp,
&mode_mv[NEW4X4], step_param,
- sadpb / 2, &num00, v_fn_ptr, x->mvsadcost, x->mvcost, bsi->ref_mv);
+ sadpb / 2, &num00, v_fn_ptr, x->mvcost, bsi->ref_mv);
n = num00;
num00 = 0;
@@ -1239,7 +1239,7 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
{
thissme = cpi->diamond_search_sad(x, c, e, bsi->mvp,
&temp_mv, step_param + n,
- sadpb / 2, &num00, v_fn_ptr, x->mvsadcost, x->mvcost, bsi->ref_mv);
+ sadpb / 2, &num00, v_fn_ptr, x->mvcost, bsi->ref_mv);
if (thissme < bestsme)
{
@@ -1257,7 +1257,7 @@ static void rd_check_segment(VP8_COMP *cpi, MACROBLOCK *x,
if ((cpi->compressor_speed == 0) && (bestsme >> sseshift) > 4000)
{
thissme = cpi->full_search_sad(x, c, e, bsi->mvp,
- sadpb / 4, 16, v_fn_ptr, x->mvcost, x->mvsadcost,bsi->ref_mv);
+ sadpb / 4, 16, v_fn_ptr, x->mvcost, bsi->ref_mv);
if (thissme < bestsme)
{
@@ -2167,7 +2167,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
}
else
{
- bestsme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv); //sadpb < 9
+ bestsme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param, sadpb / 2/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &best_ref_mv); //sadpb < 9
mode_mv[NEWMV].row = d->bmi.mv.as_mv.row;
mode_mv[NEWMV].col = d->bmi.mv.as_mv.col;
@@ -2186,7 +2186,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
num00--;
else
{
- thissme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvsadcost, x->mvcost, &best_ref_mv); //sadpb = 9
+ thissme = cpi->diamond_search_sad(x, b, d, &mvp, &d->bmi.mv.as_mv, step_param + n, sadpb / 4/*x->errorperbit*/, &num00, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &best_ref_mv); //sadpb = 9
if (thissme < bestsme)
{
@@ -2232,7 +2232,7 @@ int vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
{
int sadpb = x->sadperbit16 >> 2;
- thissme = cpi->full_search_sad(x, b, d, &full_mvp, sadpb, search_range, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, x->mvsadcost,&best_ref_mv);
+ thissme = cpi->full_search_sad(x, b, d, &full_mvp, sadpb, search_range, &cpi->fn_ptr[BLOCK_16X16], x->mvcost, &best_ref_mv);
}
// Barrier threshold to initiating full search
diff --git a/vp8/encoder/temporal_filter.c b/vp8/encoder/temporal_filter.c
index fd36b22eb..cec951897 100644
--- a/vp8/encoder/temporal_filter.c
+++ b/vp8/encoder/temporal_filter.c
@@ -195,63 +195,14 @@ static int vp8_temporal_filter_find_matching_mb_c
further_steps = 0;
}
- if (1/*cpi->sf.search_method == HEX*/)
- {
- // TODO Check that the 16x16 vf & sdf are selected here
- bestsme = vp8_hex_search(x, b, d,
- &best_ref_mv1, &d->bmi.mv.as_mv,
- step_param,
- sadpb/*x->errorperbit*/,
- &num00, &cpi->fn_ptr[BLOCK_16X16],
- mvsadcost, mvcost, &best_ref_mv1);
- }
- else
- {
- int mv_x, mv_y;
-
- bestsme = cpi->diamond_search_sad(x, b, d,
- &best_ref_mv1, &d->bmi.mv.as_mv,
- step_param,
- sadpb / 2/*x->errorperbit*/,
- &num00, &cpi->fn_ptr[BLOCK_16X16],
- mvsadcost, mvcost, &best_ref_mv1); //sadpb < 9
-
- // Further step/diamond searches as necessary
- n = 0;
- //further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
-
- n = num00;
- num00 = 0;
-
- while (n < further_steps)
- {
- n++;
-
- if (num00)
- num00--;
- else
- {
- thissme = cpi->diamond_search_sad(x, b, d,
- &best_ref_mv1, &d->bmi.mv.as_mv,
- step_param + n,
- sadpb / 4/*x->errorperbit*/,
- &num00, &cpi->fn_ptr[BLOCK_16X16],
- mvsadcost, mvcost, &best_ref_mv1); //sadpb = 9
-
- if (thissme < bestsme)
- {
- bestsme = thissme;
- mv_y = d->bmi.mv.as_mv.row;
- mv_x = d->bmi.mv.as_mv.col;
- }
- else
- {
- d->bmi.mv.as_mv.row = mv_y;
- d->bmi.mv.as_mv.col = mv_x;
- }
- }
- }
- }
+ /*cpi->sf.search_method == HEX*/
+ // TODO Check that the 16x16 vf & sdf are selected here
+ bestsme = vp8_hex_search(x, b, d,
+ &best_ref_mv1, &d->bmi.mv.as_mv,
+ step_param,
+ sadpb/*x->errorperbit*/,
+ &num00, &cpi->fn_ptr[BLOCK_16X16],
+ mvsadcost, mvcost, &best_ref_mv1);
#if ALT_REF_SUBPEL_ENABLED
// Try sub-pixel MC?
diff --git a/vp8/encoder/x86/quantize_sse2.asm b/vp8/encoder/x86/quantize_sse2.asm
index 9a1584024..e00faebd1 100644
--- a/vp8/encoder/x86/quantize_sse2.asm
+++ b/vp8/encoder/x86/quantize_sse2.asm
@@ -22,35 +22,36 @@ sym(vp8_regular_quantize_b_sse2):
mov rbp, rsp
SAVE_XMM
GET_GOT rbx
- push rsi
%if ABI_IS_32BIT
push rdi
+ push rsi
%else
%ifidn __OUTPUT_FORMAT__,x64
push rdi
+ push rsi
%endif
%endif
ALIGN_STACK 16, rax
- %define BLOCKD_d 0 ; 8
- %define zrun_zbin_boost 8 ; 8
- %define abs_minus_zbin 16 ; 32
- %define temp_qcoeff 48 ; 32
- %define qcoeff 80 ; 32
- %define stack_size 112
+ %define zrun_zbin_boost 0 ; 8
+ %define abs_minus_zbin 8 ; 32
+ %define temp_qcoeff 40 ; 32
+ %define qcoeff 72 ; 32
+ %define stack_size 104
sub rsp, stack_size
; end prolog
%if ABI_IS_32BIT
- mov rdi, arg(0)
+ mov rdi, arg(0) ; BLOCK *b
+ mov rsi, arg(1) ; BLOCKD *d
%else
%ifidn __OUTPUT_FORMAT__,x64
mov rdi, rcx ; BLOCK *b
- mov [rsp + BLOCKD_d], rdx
+ mov rsi, rdx ; BLOCKD *d
%else
;mov rdi, rdi ; BLOCK *b
- mov [rsp + BLOCKD_d], rsi
+ ;mov rsi, rsi ; BLOCKD *d
%endif
%endif
@@ -125,59 +126,52 @@ sym(vp8_regular_quantize_b_sse2):
movdqa [rsp + qcoeff], xmm6
movdqa [rsp + qcoeff + 16], xmm6
- mov rsi, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
+ mov rdx, [rdi + vp8_block_zrun_zbin_boost] ; zbin_boost_ptr
mov rax, [rdi + vp8_block_quant_shift] ; quant_shift_ptr
- mov [rsp + zrun_zbin_boost], rsi
+ mov [rsp + zrun_zbin_boost], rdx
%macro ZIGZAG_LOOP 1
- movsx edx, WORD PTR[GLOBAL(zig_zag + (%1 * 2))] ; rc
-
; x
- movsx ecx, WORD PTR[rsp + abs_minus_zbin + rdx *2]
+ movsx ecx, WORD PTR[rsp + abs_minus_zbin + %1 * 2]
; if (x >= zbin)
- sub cx, WORD PTR[rsi] ; x - zbin
- lea rsi, [rsi + 2] ; zbin_boost_ptr++
+ sub cx, WORD PTR[rdx] ; x - zbin
+ lea rdx, [rdx + 2] ; zbin_boost_ptr++
jl rq_zigzag_loop_%1 ; x < zbin
- movsx edi, WORD PTR[rsp + temp_qcoeff + rdx *2]
+ movsx edi, WORD PTR[rsp + temp_qcoeff + %1 * 2]
- ; downshift by quant_shift[rdx]
- movsx ecx, WORD PTR[rax + rdx*2] ; quant_shift_ptr[rc]
+ ; downshift by quant_shift[rc]
+ movsx ecx, WORD PTR[rax + %1 * 2] ; quant_shift_ptr[rc]
sar edi, cl ; also sets Z bit
je rq_zigzag_loop_%1 ; !y
- mov WORD PTR[rsp + qcoeff + rdx*2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
- mov rsi, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
+ mov WORD PTR[rsp + qcoeff + %1 * 2], di ;qcoeff_ptr[rc] = temp_qcoeff[rc]
+ mov rdx, [rsp + zrun_zbin_boost] ; reset to b->zrun_zbin_boost
rq_zigzag_loop_%1:
%endmacro
-ZIGZAG_LOOP 0
-ZIGZAG_LOOP 1
-ZIGZAG_LOOP 2
-ZIGZAG_LOOP 3
-ZIGZAG_LOOP 4
-ZIGZAG_LOOP 5
-ZIGZAG_LOOP 6
-ZIGZAG_LOOP 7
-ZIGZAG_LOOP 8
-ZIGZAG_LOOP 9
-ZIGZAG_LOOP 10
-ZIGZAG_LOOP 11
+; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
+ZIGZAG_LOOP 0
+ZIGZAG_LOOP 1
+ZIGZAG_LOOP 4
+ZIGZAG_LOOP 8
+ZIGZAG_LOOP 5
+ZIGZAG_LOOP 2
+ZIGZAG_LOOP 3
+ZIGZAG_LOOP 6
+ZIGZAG_LOOP 9
ZIGZAG_LOOP 12
ZIGZAG_LOOP 13
+ZIGZAG_LOOP 10
+ZIGZAG_LOOP 7
+ZIGZAG_LOOP 11
ZIGZAG_LOOP 14
ZIGZAG_LOOP 15
movdqa xmm2, [rsp + qcoeff]
movdqa xmm3, [rsp + qcoeff + 16]
-%if ABI_IS_32BIT
- mov rdi, arg(1)
-%else
- mov rdi, [rsp + BLOCKD_d]
-%endif
-
- mov rcx, [rdi + vp8_blockd_dequant] ; dequant_ptr
- mov rsi, [rdi + vp8_blockd_dqcoeff] ; dqcoeff_ptr
+ mov rcx, [rsi + vp8_blockd_dequant] ; dequant_ptr
+ mov rdi, [rsi + vp8_blockd_dqcoeff] ; dqcoeff_ptr
; y ^ sz
pxor xmm2, xmm0
@@ -190,15 +184,15 @@ ZIGZAG_LOOP 15
movdqa xmm0, [rcx]
movdqa xmm1, [rcx + 16]
- mov rcx, [rdi + vp8_blockd_qcoeff] ; qcoeff_ptr
+ mov rcx, [rsi + vp8_blockd_qcoeff] ; qcoeff_ptr
pmullw xmm0, xmm2
pmullw xmm1, xmm3
movdqa [rcx], xmm2 ; store qcoeff
movdqa [rcx + 16], xmm3
- movdqa [rsi], xmm0 ; store dqcoeff
- movdqa [rsi + 16], xmm1
+ movdqa [rdi], xmm0 ; store dqcoeff
+ movdqa [rdi + 16], xmm1
; select the last value (in zig_zag order) for EOB
pcmpeqw xmm2, xmm6
@@ -220,19 +214,20 @@ ZIGZAG_LOOP 15
pmaxsw xmm2, xmm3
movd eax, xmm2
and eax, 0xff
- mov [rdi + vp8_blockd_eob], eax
+ mov [rsi + vp8_blockd_eob], eax
; begin epilog
add rsp, stack_size
pop rsp
%if ABI_IS_32BIT
+ pop rsi
pop rdi
%else
%ifidn __OUTPUT_FORMAT__,x64
+ pop rsi
pop rdi
%endif
%endif
- pop rsi
RESTORE_GOT
RESTORE_XMM
pop rbp
@@ -347,11 +342,6 @@ sym(vp8_fast_quantize_b_impl_sse2):
SECTION_RODATA
align 16
-zig_zag:
- dw 0x0000, 0x0001, 0x0004, 0x0008
- dw 0x0005, 0x0002, 0x0003, 0x0006
- dw 0x0009, 0x000c, 0x000d, 0x000a
- dw 0x0007, 0x000b, 0x000e, 0x000f
inv_zig_zag:
dw 0x0001, 0x0002, 0x0006, 0x0007
dw 0x0003, 0x0005, 0x0008, 0x000d