summaryrefslogtreecommitdiff
path: root/vp8
diff options
context:
space:
mode:
Diffstat (limited to 'vp8')
-rw-r--r--vp8/common/blockd.h23
-rw-r--r--vp8/common/findnearmv.c137
-rw-r--r--vp8/common/mvref_common.c303
-rw-r--r--vp8/common/mvref_common.h37
-rw-r--r--vp8/common/recon.h8
-rw-r--r--vp8/common/reconinter.h9
-rw-r--r--vp8/common/reconintra.c246
-rw-r--r--vp8/common/rtcd_defs.sh8
-rw-r--r--vp8/common/x86/filter_sse2.c289
-rw-r--r--vp8/common/x86/filter_sse4.c3
-rw-r--r--vp8/decoder/decodemv.c35
-rw-r--r--vp8/decoder/dequantize.h11
-rw-r--r--vp8/decoder/detokenize.c3
-rw-r--r--vp8/encoder/bitstream.c22
-rw-r--r--vp8/encoder/encodeframe.c35
-rw-r--r--vp8/encoder/encodemb.c16
-rw-r--r--vp8/encoder/encodemb.h10
-rw-r--r--vp8/encoder/onyx_if.c35
-rw-r--r--vp8/encoder/onyx_int.h13
-rw-r--r--vp8/encoder/rdopt.c78
-rw-r--r--vp8/vp8_common.mk7
21 files changed, 1085 insertions, 243 deletions
diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h
index 4e5d9e813..46d002af9 100644
--- a/vp8/common/blockd.h
+++ b/vp8/common/blockd.h
@@ -44,6 +44,9 @@ void vpx_log(const char *format, ...);
/* Segment Feature Masks */
#define SEGMENT_DELTADATA 0
#define SEGMENT_ABSDATA 1
+#if CONFIG_NEW_MVREF
+#define MAX_MV_REFS 10
+#endif
typedef struct {
int r, c;
@@ -179,6 +182,14 @@ typedef enum {
B_MODE_COUNT
} B_PREDICTION_MODE;
+#if CONFIG_NEW_MVREF
+// Index (first or second reference) into the mbmi.mv[] array.
+typedef enum {
+  FIRST_REF  = 0,
+  SECOND_REF = 1
+} MV_REF_TYPE;
+#endif
+
#if CONFIG_HYBRIDTRANSFORM8X8
// convert MB_PREDICTION_MODE to B_PREDICTION_MODE
static B_PREDICTION_MODE pred_mode_conv(MB_PREDICTION_MODE mode) {
@@ -268,9 +279,14 @@ typedef struct {
MV_REFERENCE_FRAME ref_frame, second_ref_frame;
TX_SIZE txfm_size;
int_mv mv[2]; // for each reference frame used
-#if CONFIG_NEWBESTREFMV
+#if CONFIG_NEWBESTREFMV || CONFIG_NEW_MVREF
int_mv ref_mv, second_ref_mv;
#endif
+#if CONFIG_NEW_MVREF
+ int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REFS];
+ int mv_ref_index[MAX_REF_FRAMES];
+#endif
+
unsigned char partitioning;
unsigned char mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */
unsigned char need_to_clamp_mvs;
@@ -432,9 +448,14 @@ typedef struct MacroBlockD {
#endif
int mb_index; // Index of the MB in the SB (0..3)
+
#if CONFIG_NEWBESTREFMV
+#if CONFIG_NEW_MVREF
+ int_mv ref_mv[MAX_MV_REFS];
+#else
int_mv ref_mv[4];
#endif
+#endif
#if CONFIG_HYBRIDTRANSFORM
int q_index;
diff --git a/vp8/common/findnearmv.c b/vp8/common/findnearmv.c
index 6f7361dd0..694f4cc32 100644
--- a/vp8/common/findnearmv.c
+++ b/vp8/common/findnearmv.c
@@ -200,6 +200,139 @@ vp8_prob *vp8_mv_ref_probs(VP8_COMMON *pc,
* above and a number cols of pixels in the left to select the one with best
* score to use as ref motion vector
*/
+
+#if CONFIG_NEW_MVREF
+
+void vp8_find_best_ref_mvs(MACROBLOCKD *xd,
+ unsigned char *ref_y_buffer,
+ int ref_y_stride,
+ int_mv *best_mv,
+ int_mv *nearest,
+ int_mv *near) {
+ int_mv *ref_mv = xd->ref_mv;
+ int i, j;
+ unsigned char *above_src;
+ unsigned char *left_src;
+ unsigned char *above_ref;
+ unsigned char *left_ref;
+ int sad;
+ int sad_scores[MAX_MV_REFS];
+ int_mv sorted_mvs[MAX_MV_REFS];
+ int zero_seen = FALSE;
+
+ // Default all to 0,0 if nothing else available
+ best_mv->as_int = nearest->as_int = near->as_int = 0;
+ vpx_memset(sorted_mvs, 0, sizeof(sorted_mvs));
+
+ above_src = xd->dst.y_buffer - xd->dst.y_stride * 2;
+ left_src = xd->dst.y_buffer - 2;
+ above_ref = ref_y_buffer - ref_y_stride * 2;
+ left_ref = ref_y_buffer - 2;
+
+ for(i = 0; i < MAX_MV_REFS; ++i) {
+ int_mv this_mv;
+ int offset=0;
+ int row_offset, col_offset;
+
+ this_mv.as_int = ref_mv[i].as_int;
+
+ // If we see a 0,0 vector for a second time we have reached the end of
+ // the list of valid candidate vectors.
+ if (!this_mv.as_int)
+ if (zero_seen)
+ break;
+ else
+ zero_seen = TRUE;
+
+ vp8_clamp_mv(&this_mv,
+ xd->mb_to_left_edge - LEFT_TOP_MARGIN + 16,
+ xd->mb_to_right_edge + RIGHT_BOTTOM_MARGIN,
+ xd->mb_to_top_edge - LEFT_TOP_MARGIN + 16,
+ xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
+
+ row_offset = (this_mv.as_mv.row > 0) ?
+ ((this_mv.as_mv.row + 3) >> 3):((this_mv.as_mv.row + 4) >> 3);
+ col_offset = (this_mv.as_mv.col > 0) ?
+ ((this_mv.as_mv.col + 3) >> 3):((this_mv.as_mv.col + 4) >> 3);
+ offset = ref_y_stride * row_offset + col_offset;
+
+ sad = vp8_sad16x2_c(above_src, xd->dst.y_stride,
+ above_ref + offset, ref_y_stride, INT_MAX);
+
+ sad += vp8_sad2x16_c(left_src, xd->dst.y_stride,
+ left_ref + offset, ref_y_stride, INT_MAX);
+
+ // Add the entry to our list and then resort the list on score.
+ sad_scores[i] = sad;
+ sorted_mvs[i].as_int = this_mv.as_int;
+ j = i;
+ while (j > 0) {
+ if (sad_scores[j] < sad_scores[j-1]) {
+ sad_scores[j] = sad_scores[j-1];
+ sorted_mvs[j].as_int = sorted_mvs[j-1].as_int;
+ sad_scores[j-1] = sad;
+ sorted_mvs[j-1].as_int = this_mv.as_int;
+ j--;
+ } else
+ break;
+ }
+ }
+
+ // If 0,0 was not seen, add it as a possibility
+ /*if ( (i < MAX_MV_REFS) && !zero_seen ) {
+
+ sad = vp8_sad16x2_c(above_src, xd->dst.y_stride,
+ above_ref, ref_y_stride,
+ INT_MAX);
+ sad += vp8_sad2x16_c(left_src, xd->dst.y_stride,
+ left_ref, ref_y_stride,
+ INT_MAX);
+ this_mv.as_int = 0;
+
+ // Add the entry to our list and then resort the list on score.
+ sad_scores[i] = sad;
+ sorted_mvs[i].as_int = this_mv.as_int;
+ j = i;
+ while (j > 0) {
+ if (sad_scores[j] < sad_scores[j-1]) {
+ sad_scores[j] = sad_scores[j-1];
+ sorted_mvs[j].as_int = sorted_mvs[j-1].as_int;
+ sad_scores[j-1] = sad;
+ sorted_mvs[j-1].as_int = this_mv.as_int;
+ j--;
+ } else
+ break;
+ }
+ }*/
+
+ // Set the best mv to the first entry in the sorted list
+ best_mv->as_int = sorted_mvs[0].as_int;
+
+ // Provided that there are non zero vectors available there will not
+ // be more than one 0,0 entry in the sorted list.
+ // The best ref mv is always set to the first entry (which gave the best
+ // results). The nearest is set to the first non-zero vector if available and
+ // near to the second non-zero vector if available.
+ // We do not use 0,0 as a nearest or near as 0,0 has its own mode.
+ if ( sorted_mvs[0].as_int ) {
+ nearest->as_int = sorted_mvs[0].as_int;
+ if ( sorted_mvs[1].as_int )
+ near->as_int = sorted_mvs[1].as_int;
+ else
+ near->as_int = sorted_mvs[2].as_int;
+ } else {
+ nearest->as_int = sorted_mvs[1].as_int;
+ near->as_int = sorted_mvs[2].as_int;
+ }
+
+ if (!xd->allow_high_precision_mv)
+ lower_mv_precision(best_mv);
+
+ vp8_clamp_mv2(best_mv, xd);
+}
+
+#else // !CONFIG_NEW_MVREF
+
void vp8_find_best_ref_mvs(MACROBLOCKD *xd,
unsigned char *ref_y_buffer,
int ref_y_stride,
@@ -270,5 +403,5 @@ void vp8_find_best_ref_mvs(MACROBLOCKD *xd,
nearest->as_int = best_mv->as_int;
}
}
-
-#endif
+#endif // CONFIG_NEW_MVREF
+#endif // CONFIG_NEWBESTREFMV
diff --git a/vp8/common/mvref_common.c b/vp8/common/mvref_common.c
new file mode 100644
index 000000000..1c345dba5
--- /dev/null
+++ b/vp8/common/mvref_common.c
@@ -0,0 +1,303 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "mvref_common.h"
+
+#if CONFIG_NEW_MVREF
+
+#define MVREF_NEIGHBOURS 8
+static int mv_ref_search[MVREF_NEIGHBOURS][2] =
+ { {0,-1},{-1,0},{-1,-1},{0,-2},{-2,0},{-1,-2},{-2,-1},{-2,-2} };
+static int ref_distance_weight[MVREF_NEIGHBOURS] =
+ { 3,3,2,1,1,1,1,1 };
+ //{ 4,4,2,1,1,1,1,1 };
+
+// clamp_mv
+#define MV_BORDER (16 << 3) // Allow 16 pels in 1/8th pel units
+static void clamp_mv(const MACROBLOCKD *xd, int_mv *mv) {
+
+ if (mv->as_mv.col < (xd->mb_to_left_edge - MV_BORDER))
+ mv->as_mv.col = xd->mb_to_left_edge - MV_BORDER;
+ else if (mv->as_mv.col > xd->mb_to_right_edge + MV_BORDER)
+ mv->as_mv.col = xd->mb_to_right_edge + MV_BORDER;
+
+ if (mv->as_mv.row < (xd->mb_to_top_edge - MV_BORDER))
+ mv->as_mv.row = xd->mb_to_top_edge - MV_BORDER;
+ else if (mv->as_mv.row > xd->mb_to_bottom_edge + MV_BORDER)
+ mv->as_mv.row = xd->mb_to_bottom_edge + MV_BORDER;
+}
+
+// Code for selecting / building and entropy coding a motion vector reference
+// Returns a separation value for two vectors.
+// This is taken as the sum of the abs x and y difference.
+unsigned int mv_distance(int_mv *mv1, int_mv *mv2) {
+ return (abs(mv1->as_mv.row - mv2->as_mv.row) +
+ abs(mv1->as_mv.col - mv2->as_mv.col));
+}
+
+// Gets a best matching candidate reference motion vector
+// from the given mode info structure (if available)
+int get_candidate_mvref(
+ const MODE_INFO *candidate_mi,
+ MV_REFERENCE_FRAME ref_frame,
+ MV_REFERENCE_FRAME *candidate_ref_frame,
+ int_mv *candidate_mv
+) {
+
+ int ret_val = FALSE;
+
+ if (ref_frame == candidate_mi->mbmi.ref_frame) {
+ candidate_mv->as_int = candidate_mi->mbmi.mv[FIRST_REF].as_int;
+ *candidate_ref_frame = ref_frame;
+ ret_val = TRUE;
+
+ } else if (ref_frame == candidate_mi->mbmi.second_ref_frame) {
+ candidate_mv->as_int = candidate_mi->mbmi.mv[SECOND_REF].as_int;
+ *candidate_ref_frame = ref_frame;
+ ret_val = TRUE;
+
+ } else if (candidate_mi->mbmi.ref_frame != INTRA_FRAME) {
+ candidate_mv->as_int = candidate_mi->mbmi.mv[FIRST_REF].as_int;
+ *candidate_ref_frame = candidate_mi->mbmi.ref_frame;
+ ret_val = TRUE;
+
+ } else if (candidate_mi->mbmi.second_ref_frame != INTRA_FRAME) {
+ candidate_mv->as_int = candidate_mi->mbmi.mv[SECOND_REF].as_int;
+ *candidate_ref_frame = candidate_mi->mbmi.second_ref_frame;
+ ret_val = TRUE;
+ }
+
+ return ret_val;
+}
+
+// Performs mv adjustment based on reference frame and clamps the MV
+// if it goes off the edge of the buffer.
+void scale_mv(
+ MACROBLOCKD *xd,
+ MV_REFERENCE_FRAME this_ref_frame,
+ MV_REFERENCE_FRAME candidate_ref_frame,
+ int_mv *candidate_mv,
+ int *ref_sign_bias
+) {
+
+ if (candidate_ref_frame != this_ref_frame) {
+
+ //int frame_distances[MAX_REF_FRAMES];
+ //int last_distance = 1;
+ //int gf_distance = xd->frames_since_golden;
+ //int arf_distance = xd->frames_till_alt_ref_frame;
+
+ // Sign inversion where appropriate.
+ if (ref_sign_bias[candidate_ref_frame] != ref_sign_bias[this_ref_frame]) {
+ candidate_mv->as_mv.row = -candidate_mv->as_mv.row;
+ candidate_mv->as_mv.col = -candidate_mv->as_mv.col;
+ }
+
+ // Scale based on frame distance if the reference frames not the same.
+ /*frame_distances[INTRA_FRAME] = 1; // should never be used
+ frame_distances[LAST_FRAME] = 1;
+ frame_distances[GOLDEN_FRAME] =
+ (xd->frames_since_golden) ? xd->frames_since_golden : 1;
+ frame_distances[ALTREF_FRAME] =
+ (xd->frames_till_alt_ref_frame) ? xd->frames_till_alt_ref_frame : 1;
+
+ if (frame_distances[this_ref_frame] &&
+ frame_distances[candidate_ref_frame]) {
+ candidate_mv->as_mv.row =
+ (short)(((int)(candidate_mv->as_mv.row) *
+ frame_distances[this_ref_frame]) /
+ frame_distances[candidate_ref_frame]);
+
+ candidate_mv->as_mv.col =
+ (short)(((int)(candidate_mv->as_mv.col) *
+ frame_distances[this_ref_frame]) /
+ frame_distances[candidate_ref_frame]);
+ }
+ */
+ }
+
+ // Clamp the MV so it does not point out of the frame buffer
+ clamp_mv(xd, candidate_mv);
+}
+
+// Adds a new candidate reference vector to the list if indeed it is new.
+// If it is not new then the score of the existing candidate that it matches
+// is increased and the list is resorted.
+void addmv_and_shuffle(
+ int_mv *mv_list,
+ int *mv_scores,
+ int *index,
+ int_mv candidate_mv,
+ int weight
+) {
+
+ int i = *index;
+ int duplicate_found = FALSE;
+
+ // Check for duplicates. If there is one increment its score.
+ while (i > 0) {
+ i--;
+ if (candidate_mv.as_int == mv_list[i].as_int) {
+ duplicate_found = TRUE;
+ mv_scores[i] += weight;
+ break;
+ }
+ }
+
+ // If no duplicate was found add the new vector and give it a weight
+ if (!duplicate_found) {
+ mv_list[*index].as_int = candidate_mv.as_int;
+ mv_scores[*index] = weight;
+ i = *index;
+ (*index)++;
+ }
+
+ // Reshuffle the list so that highest scoring mvs at the top.
+ while (i > 0) {
+ if (mv_scores[i] > mv_scores[i-1]) {
+ int tmp_score = mv_scores[i-1];
+ int_mv tmp_mv = mv_list[i-1];
+
+ mv_scores[i-1] = mv_scores[i];
+ mv_list[i-1] = mv_list[i];
+ mv_scores[i] = tmp_score;
+ mv_list[i] = tmp_mv;
+ i--;
+ } else
+ break;
+ }
+}
+
+
+// Measure the distance of each reference candidate from the actual
+// residual vector and return the nearest
+unsigned int pick_best_mv_ref( int_mv target_mv,
+ int_mv * mv_ref_list,
+ int_mv * best_ref ) {
+
+ int i;
+ int best_index = 0;
+ unsigned int distance, distance2;
+
+ distance = mv_distance(&target_mv, &mv_ref_list[0]);
+
+ for (i = 1; i < MAX_MV_REFS; ++i ) {
+ distance2 =
+ mv_distance(&target_mv, &mv_ref_list[i]);
+ if (distance2 < distance) {
+ distance = distance2;
+ best_index = i;
+ }
+ }
+
+ (*best_ref).as_int = mv_ref_list[best_index].as_int;
+
+ return best_index;
+}
+
+// This function searches the neighbourhood of a given MB/SB and populates a
+// list of candidate reference vectors.
+//
+void find_mv_refs(
+ MACROBLOCKD *xd,
+ MODE_INFO *here,
+ MODE_INFO *lf_here,
+ MV_REFERENCE_FRAME ref_frame,
+ int_mv *mv_ref_list,
+ int *ref_sign_bias
+) {
+
+ int i;
+ MODE_INFO *candidate_mi;
+ int_mv candidate_mvs[MAX_MV_REFS];
+ int_mv c_refmv;
+ MV_REFERENCE_FRAME c_ref_frame;
+ int candidate_scores[MAX_MV_REFS];
+ int index = 0;
+ int ref_weight = 0;
+ int valid_mv_ref;
+
+ // Blank the reference vector lists and other local structures.
+ vpx_memset(mv_ref_list, 0, sizeof(int_mv) * MAX_MV_REFS);
+ vpx_memset(candidate_mvs, 0, sizeof(int_mv) * MAX_MV_REFS);
+ vpx_memset(candidate_scores, 0, sizeof(candidate_scores));
+
+ // Populate a list with candidate reference vectors from the
+ // spatial neighbours.
+ for (i = 0; i < 2; ++i) {
+ if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
+ ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
+
+ candidate_mi = here + mv_ref_search[i][0] +
+ (mv_ref_search[i][1] * xd->mode_info_stride);
+
+ valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
+ &c_ref_frame, &c_refmv);
+
+ if (valid_mv_ref) {
+ scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
+ ref_weight = ref_distance_weight[i] +
+ ((c_ref_frame == ref_frame) << 3);
+
+ addmv_and_shuffle(candidate_mvs, candidate_scores,
+ &index, c_refmv, ref_weight);
+ }
+ }
+ }
+
+ // Look at the corresponding vector in the last frame
+ candidate_mi = lf_here;
+ valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
+ &c_ref_frame, &c_refmv);
+ if (valid_mv_ref) {
+ scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
+ ref_weight = 2 + ((c_ref_frame == ref_frame) << 3);
+ addmv_and_shuffle(candidate_mvs, candidate_scores,
+ &index, c_refmv, ref_weight);
+ }
+
+ // Populate a list with candidate reference vectors from the
+ // spatial neighbours.
+ for (i = 2; i < MVREF_NEIGHBOURS; ++i) {
+ if (((mv_ref_search[i][0] << 7) >= xd->mb_to_left_edge) &&
+ ((mv_ref_search[i][1] << 7) >= xd->mb_to_top_edge)) {
+
+ candidate_mi = here + mv_ref_search[i][0] +
+ (mv_ref_search[i][1] * xd->mode_info_stride);
+
+ valid_mv_ref = get_candidate_mvref(candidate_mi, ref_frame,
+ &c_ref_frame, &c_refmv);
+
+ if (valid_mv_ref) {
+ scale_mv(xd, ref_frame, c_ref_frame, &c_refmv, ref_sign_bias );
+ ref_weight = ref_distance_weight[i] +
+ ((c_ref_frame == ref_frame) << 3);
+
+ addmv_and_shuffle(candidate_mvs, candidate_scores,
+ &index, c_refmv, ref_weight);
+ }
+ }
+ }
+
+ // 0,0 is always a valid reference.
+ for (i = 0; i < index; ++i)
+ if (candidate_mvs[i].as_int == 0)
+ break;
+ if (i == index) {
+ c_refmv.as_int = 0;
+ addmv_and_shuffle(candidate_mvs, candidate_scores,
+ &index, c_refmv, 1);
+ }
+
+ // Copy over the candidate list.
+ vpx_memcpy(mv_ref_list, candidate_mvs, sizeof(candidate_mvs));
+}
+
+#endif
diff --git a/vp8/common/mvref_common.h b/vp8/common/mvref_common.h
new file mode 100644
index 000000000..9be408894
--- /dev/null
+++ b/vp8/common/mvref_common.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "onyxc_int.h"
+#include "blockd.h"
+
+// MV reference entropy header file.
+#if CONFIG_NEW_MVREF
+
+#ifndef __INC_MVREF_COMMON_H
+#define __INC_MVREF_COMMON_H
+
+unsigned int mv_distance(int_mv *mv1, int_mv *mv2);
+
+unsigned int pick_best_mv_ref( int_mv target_mv,
+ int_mv * mv_ref_list,
+ int_mv * best_ref );
+
+void find_mv_refs(
+ MACROBLOCKD *xd,
+ MODE_INFO *here,
+ MODE_INFO *lf_here,
+ MV_REFERENCE_FRAME ref_frame,
+ int_mv * mv_ref_list,
+ int *ref_sign_bias
+);
+
+#endif
+
+#endif
diff --git a/vp8/common/recon.h b/vp8/common/recon.h
index 3527fc14d..0bb5c8863 100644
--- a/vp8/common/recon.h
+++ b/vp8/common/recon.h
@@ -262,4 +262,12 @@ typedef struct vp8_recon_rtcd_vtable {
void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd,
MACROBLOCKD *xd);
+
+#if CONFIG_SUPERBLOCKS
+extern void vp8_recon_mby_s_c(const vp8_recon_rtcd_vtable_t *rtcd,
+ MACROBLOCKD *xd, uint8_t *dst);
+extern void vp8_recon_mbuv_s_c(const vp8_recon_rtcd_vtable_t *rtcd,
+ MACROBLOCKD *xd, uint8_t *udst, uint8_t *vdst);
+#endif
+
#endif
diff --git a/vp8/common/reconinter.h b/vp8/common/reconinter.h
index 7ad0adbd4..37e34b5e1 100644
--- a/vp8/common/reconinter.h
+++ b/vp8/common/reconinter.h
@@ -45,6 +45,15 @@ extern void vp8_build_2nd_inter16x16_predictors_mb(MACROBLOCKD *xd,
int dst_ystride,
int dst_uvstride);
+#if CONFIG_SUPERBLOCKS
+extern void vp8_build_inter32x32_predictors_sb(MACROBLOCKD *x,
+ unsigned char *dst_y,
+ unsigned char *dst_u,
+ unsigned char *dst_v,
+ int dst_ystride,
+ int dst_uvstride);
+#endif
+
extern void vp8_build_inter_predictors_mb(MACROBLOCKD *xd);
extern void vp8_build_inter_predictors_b(BLOCKD *d, int pitch,
diff --git a/vp8/common/reconintra.c b/vp8/common/reconintra.c
index e391fa9be..cad9652b7 100644
--- a/vp8/common/reconintra.c
+++ b/vp8/common/reconintra.c
@@ -207,10 +207,10 @@ void vp8_recon_intra_mbuv(const vp8_recon_rtcd_vtable_t *rtcd,
}
}
-void vp8_build_intra_predictors_internal(MACROBLOCKD *xd,
- unsigned char *src, int src_stride,
+void vp8_build_intra_predictors_internal(unsigned char *src, int src_stride,
unsigned char *ypred_ptr,
- int y_stride, int mode, int bsize) {
+ int y_stride, int mode, int bsize,
+ int up_available, int left_available) {
unsigned char *yabove_row = src - src_stride;
unsigned char yleft_col[32];
@@ -218,7 +218,7 @@ void vp8_build_intra_predictors_internal(MACROBLOCKD *xd,
int r, c, i;
for (i = 0; i < bsize; i++) {
- yleft_col[i] = xd->dst.y_buffer [i * src_stride - 1];
+ yleft_col[i] = src[i * src_stride - 1];
}
/* for Y */
@@ -230,8 +230,10 @@ void vp8_build_intra_predictors_internal(MACROBLOCKD *xd,
int average = 0;
int log2_bsize_minus_1;
- assert(bsize == 8 || bsize == 16 || bsize == 32);
- if (bsize == 8) {
+ assert(bsize == 4 || bsize == 8 || bsize == 16 || bsize == 32);
+ if (bsize == 4) {
+ log2_bsize_minus_1 = 1;
+ } else if (bsize == 8) {
log2_bsize_minus_1 = 2;
} else if (bsize == 16) {
log2_bsize_minus_1 = 3;
@@ -239,19 +241,19 @@ void vp8_build_intra_predictors_internal(MACROBLOCKD *xd,
log2_bsize_minus_1 = 4;
}
- if (xd->up_available || xd->left_available) {
- if (xd->up_available) {
+ if (up_available || left_available) {
+ if (up_available) {
for (i = 0; i < bsize; i++) {
average += yabove_row[i];
}
}
- if (xd->left_available) {
+ if (left_available) {
for (i = 0; i < bsize; i++) {
average += yleft_col[i];
}
}
- shift = log2_bsize_minus_1 + xd->up_available + xd->left_available;
+ shift = log2_bsize_minus_1 + up_available + left_available;
expected_dc = (average + (1 << (shift - 1))) >> shift;
} else {
expected_dc = 128;
@@ -332,22 +334,25 @@ void vp8_build_intra_predictors_internal(MACROBLOCKD *xd,
}
void vp8_build_intra_predictors_mby(MACROBLOCKD *xd) {
- vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride,
+ vp8_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
xd->predictor, 16,
- xd->mode_info_context->mbmi.mode, 16);
+ xd->mode_info_context->mbmi.mode, 16,
+ xd->up_available, xd->left_available);
}
void vp8_build_intra_predictors_mby_s(MACROBLOCKD *xd) {
- vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride,
+ vp8_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
xd->dst.y_buffer, xd->dst.y_stride,
- xd->mode_info_context->mbmi.mode, 16);
+ xd->mode_info_context->mbmi.mode, 16,
+ xd->up_available, xd->left_available);
}
#if CONFIG_SUPERBLOCKS
-void vp8_build_intra_predictors_sby_s(MACROBLOCKD *x) {
- vp8_build_intra_predictors_internal(x, x->dst.y_buffer, x->dst.y_stride,
- x->dst.y_buffer, x->dst.y_stride,
- x->mode_info_context->mbmi.mode, 32);
+void vp8_build_intra_predictors_sby_s(MACROBLOCKD *xd) {
+ vp8_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
+ xd->dst.y_buffer, xd->dst.y_stride,
+ xd->mode_info_context->mbmi.mode, 32,
+ xd->up_available, xd->left_available);
}
#endif
@@ -356,14 +361,16 @@ void vp8_build_comp_intra_predictors_mby(MACROBLOCKD *xd) {
unsigned char predictor[2][256];
int i;
- vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride,
+ vp8_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
predictor[0], 16,
xd->mode_info_context->mbmi.mode,
- 16);
- vp8_build_intra_predictors_internal(xd, xd->dst.y_buffer, xd->dst.y_stride,
+ 16, xd->up_available,
+ xd->left_available);
+ vp8_build_intra_predictors_internal(xd->dst.y_buffer, xd->dst.y_stride,
predictor[1], 16,
xd->mode_info_context->mbmi.second_mode,
- 16);
+ 16, xd->up_available,
+ xd->left_available);
for (i = 0; i < 256; i++) {
xd->predictor[i] = (predictor[0][i] + predictor[1][i] + 1) >> 1;
@@ -376,10 +383,12 @@ void vp8_build_intra_predictors_mbuv_internal(MACROBLOCKD *xd,
unsigned char *vpred_ptr,
int uv_stride,
int mode, int bsize) {
- vp8_build_intra_predictors_internal(xd, xd->dst.u_buffer, xd->dst.uv_stride,
- upred_ptr, uv_stride, mode, bsize);
- vp8_build_intra_predictors_internal(xd, xd->dst.v_buffer, xd->dst.uv_stride,
- vpred_ptr, uv_stride, mode, bsize);
+ vp8_build_intra_predictors_internal(xd->dst.u_buffer, xd->dst.uv_stride,
+ upred_ptr, uv_stride, mode, bsize,
+ xd->up_available, xd->left_available);
+ vp8_build_intra_predictors_internal(xd->dst.v_buffer, xd->dst.uv_stride,
+ vpred_ptr, uv_stride, mode, bsize,
+ xd->up_available, xd->left_available);
}
void vp8_build_intra_predictors_mbuv(MACROBLOCKD *xd) {
@@ -428,95 +437,9 @@ void vp8_build_comp_intra_predictors_mbuv(MACROBLOCKD *xd) {
void vp8_intra8x8_predict(BLOCKD *xd,
int mode,
unsigned char *predictor) {
-
- unsigned char *yabove_row = *(xd->base_dst) + xd->dst - xd->dst_stride;
- unsigned char yleft_col[8];
- unsigned char ytop_left = yabove_row[-1];
- int r, c, i;
-
- for (i = 0; i < 8; i++) {
- yleft_col[i] = (*(xd->base_dst))[xd->dst - 1 + i * xd->dst_stride];
- }
- switch (mode) {
- case DC_PRED: {
- int expected_dc = 0;
-
- for (i = 0; i < 8; i++) {
- expected_dc += yabove_row[i];
- expected_dc += yleft_col[i];
- }
- expected_dc = (expected_dc + 8) >> 4;
-
- for (r = 0; r < 8; r++) {
- for (c = 0; c < 8; c++) {
- predictor[c] = expected_dc;
- }
- predictor += 16;
- }
- }
- break;
- case V_PRED: {
- for (r = 0; r < 8; r++) {
- for (c = 0; c < 8; c++) {
- predictor[c] = yabove_row[c];
- }
- predictor += 16;
- }
-
- }
- break;
- case H_PRED: {
-
- for (r = 0; r < 8; r++) {
- for (c = 0; c < 8; c++) {
- predictor[c] = yleft_col[r];
- }
- predictor += 16;
- }
- }
- break;
- case TM_PRED: {
- /* prediction similar to true_motion prediction */
- for (r = 0; r < 8; r++) {
- for (c = 0; c < 8; c++) {
- int pred = yabove_row[c] - ytop_left + yleft_col[r];
- if (pred < 0)
- pred = 0;
-
- if (pred > 255)
- pred = 255;
- predictor[c] = pred;
- }
-
- predictor += 16;
- }
- }
- break;
- case D45_PRED: {
- d45_predictor(predictor, 16, 8, yabove_row, yleft_col);
- }
- break;
- case D135_PRED: {
- d135_predictor(predictor, 16, 8, yabove_row, yleft_col);
- }
- break;
- case D117_PRED: {
- d117_predictor(predictor, 16, 8, yabove_row, yleft_col);
- }
- break;
- case D153_PRED: {
- d153_predictor(predictor, 16, 8, yabove_row, yleft_col);
- }
- break;
- case D27_PRED: {
- d27_predictor(predictor, 16, 8, yabove_row, yleft_col);
- }
- break;
- case D63_PRED: {
- d63_predictor(predictor, 16, 8, yabove_row, yleft_col);
- }
- break;
- }
+ vp8_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
+ xd->dst_stride, predictor, 16,
+ mode, 8, 1, 1);
}
#if CONFIG_COMP_INTRA_PRED
@@ -540,96 +463,9 @@ void vp8_comp_intra8x8_predict(BLOCKD *xd,
void vp8_intra_uv4x4_predict(BLOCKD *xd,
int mode,
unsigned char *predictor) {
-
- unsigned char *above_row = *(xd->base_dst) + xd->dst - xd->dst_stride;
- unsigned char left_col[4];
- unsigned char top_left = above_row[-1];
- int r, c, i;
-
- for (i = 0; i < 4; i++) {
- left_col[i] = (*(xd->base_dst))[xd->dst - 1 + i * xd->dst_stride];
- }
- switch (mode) {
- case DC_PRED: {
- int expected_dc = 0;
-
- for (i = 0; i < 4; i++) {
- expected_dc += above_row[i];
- expected_dc += left_col[i];
- }
- expected_dc = (expected_dc + 4) >> 3;
-
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- predictor[c] = expected_dc;
- }
- predictor += 8;
- }
- }
- break;
- case V_PRED: {
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
-
- predictor[c] = above_row[c];
- }
- predictor += 8;
- }
-
- }
- break;
- case H_PRED: {
-
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- predictor[c] = left_col[r];
- }
- predictor += 8;
- }
- }
- break;
- case TM_PRED: {
- /* prediction similar to true_motion prediction */
- for (r = 0; r < 4; r++) {
- for (c = 0; c < 4; c++) {
- int pred = above_row[c] - top_left + left_col[r];
- if (pred < 0)
- pred = 0;
-
- if (pred > 255)
- pred = 255;
- predictor[c] = pred;
- }
-
- predictor += 8;
- }
- }
- break;
- case D45_PRED: {
- d45_predictor(predictor, 8, 4, above_row, left_col);
- }
- break;
- case D135_PRED: {
- d135_predictor(predictor, 8, 4, above_row, left_col);
- }
- break;
- case D117_PRED: {
- d117_predictor(predictor, 8, 4, above_row, left_col);
- }
- break;
- case D153_PRED: {
- d153_predictor(predictor, 8, 4, above_row, left_col);
- }
- break;
- case D27_PRED: {
- d27_predictor(predictor, 8, 4, above_row, left_col);
- }
- break;
- case D63_PRED: {
- d63_predictor(predictor, 8, 4, above_row, left_col);
- }
- break;
- }
+ vp8_build_intra_predictors_internal(*(xd->base_dst) + xd->dst,
+ xd->dst_stride, predictor, 8,
+ mode, 4, 1, 1);
}
#if CONFIG_COMP_INTRA_PRED
diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh
index 1cb5de311..66029f88e 100644
--- a/vp8/common/rtcd_defs.sh
+++ b/vp8/common/rtcd_defs.sh
@@ -14,8 +14,8 @@ prototype void vp8_filter_block2d_16x16_8 "const unsigned char *src_ptr, const u
# compiles warning free but a dissassembly of generated code show bugs. To be
# on the safe side, only enabled when compiled with 'gcc'.
if [ "$CONFIG_GCC" = "yes" ]; then
- specialize vp8_filter_block2d_4x4_8 sse4_1
- specialize vp8_filter_block2d_8x4_8 sse4_1
- specialize vp8_filter_block2d_8x8_8 sse4_1
- specialize vp8_filter_block2d_16x16_8 sse4_1
+ specialize vp8_filter_block2d_4x4_8 sse4_1 sse2
+ specialize vp8_filter_block2d_8x4_8 sse4_1 sse2
+ specialize vp8_filter_block2d_8x8_8 sse4_1 sse2
+ specialize vp8_filter_block2d_16x16_8 sse4_1 sse2
fi
diff --git a/vp8/common/x86/filter_sse2.c b/vp8/common/x86/filter_sse2.c
new file mode 100644
index 000000000..fe57b4e0b
--- /dev/null
+++ b/vp8/common/x86/filter_sse2.c
@@ -0,0 +1,289 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h> // for alignment checks
+#include <emmintrin.h> // SSE2
+#include "vp8/common/filter.h"
+#include "vpx_ports/mem.h" // for DECLARE_ALIGNED
+#include "vpx_rtcd.h"
+
+// TODO(cd): After cleanup, commit faster versions for non 4x4 size. This is
+// just a quick partial snapshot so that others can already use some
+// speedup.
+// TODO(cd): Use vectorized 8 tap filtering code as speedup to pure C 6 tap
+// filtering.
+// TODO(cd): Add some comments, better variable naming.
+// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coefficients (no sum
+// of positive above 128), or have higher precision filter
+// coefficients.
+
+DECLARE_ALIGNED(16, static const unsigned int, rounding_c[4]) = {
+ VP8_FILTER_WEIGHT >> 1,
+ VP8_FILTER_WEIGHT >> 1,
+ VP8_FILTER_WEIGHT >> 1,
+ VP8_FILTER_WEIGHT >> 1,
+};
+
+// Creating a macro to do more than four pixels at once to hide instruction
+// latency is actually slower :-(
+#define DO_FOUR_PIXELS(result, src_ptr, offset) \
+ { \
+    /* Do shifted load to achieve required shuffles through unpacking */ \
+ const __m128i src0 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 0)); \
+ const __m128i src1 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 1)); \
+ const __m128i src2 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 2)); \
+ const __m128i src3 = _mm_loadu_si128((const __m128i *)(src_ptr + offset + 3)); \
+ const __m128i src01 = _mm_unpacklo_epi8(src0, src1); \
+ const __m128i src01_16 = _mm_unpacklo_epi8(src01, zero); \
+ const __m128i src23 = _mm_unpacklo_epi8(src2, src3); \
+ const __m128i src23_16 = _mm_unpacklo_epi8(src23, zero); \
+    /* Shift by 4 bytes through shuffle to get additional shifted loads */ \
+ const __m128i src4 = _mm_shuffle_epi32(src0, _MM_SHUFFLE(3, 3, 2, 1)); \
+ const __m128i src5 = _mm_shuffle_epi32(src1, _MM_SHUFFLE(3, 3, 2, 1)); \
+ const __m128i src6 = _mm_shuffle_epi32(src2, _MM_SHUFFLE(3, 3, 2, 1)); \
+ const __m128i src7 = _mm_shuffle_epi32(src3, _MM_SHUFFLE(3, 3, 2, 1)); \
+ const __m128i src45 = _mm_unpacklo_epi8(src4, src5); \
+ const __m128i src45_16 = _mm_unpacklo_epi8(src45, zero); \
+ const __m128i src67 = _mm_unpacklo_epi8(src6, src7); \
+ const __m128i src67_16 = _mm_unpacklo_epi8(src67, zero); \
+ /* multiply accumulate them */ \
+ const __m128i mad01 = _mm_madd_epi16(src01_16, fil01); \
+ const __m128i mad23 = _mm_madd_epi16(src23_16, fil23); \
+ const __m128i mad45 = _mm_madd_epi16(src45_16, fil45); \
+ const __m128i mad67 = _mm_madd_epi16(src67_16, fil67); \
+ const __m128i mad0123 = _mm_add_epi32(mad01, mad23); \
+ const __m128i mad4567 = _mm_add_epi32(mad45, mad67); \
+ __m128i mad_all = _mm_add_epi32(mad0123, mad4567); \
+ mad_all = _mm_add_epi32(mad_all, rounding); \
+ result = _mm_srai_epi32(mad_all, VP8_FILTER_SHIFT); \
+ }
+
+void vp8_filter_block2d_4x4_8_sse2
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+ __m128i intermediateA, intermediateB, intermediateC;
+
+ const int kInterp_Extend = 4;
+
+ const __m128i zero = _mm_set1_epi16(0);
+ const __m128i rounding = _mm_load_si128((const __m128i *)rounding_c);
+
+ // check alignment
+ assert(0 == ((long)HFilter_aligned16)%16);
+ assert(0 == ((long)VFilter_aligned16)%16);
+
+ {
+ __m128i transpose3_0;
+ __m128i transpose3_1;
+ __m128i transpose3_2;
+ __m128i transpose3_3;
+
+ // Horizontal pass (src -> intermediate).
+ {
+ const __m128i HFilter = _mm_load_si128((const __m128i *)HFilter_aligned16);
+ // get first two columns filter coefficients
+ __m128i fil01 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(0, 0, 0, 0));
+ __m128i fil23 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(1, 1, 1, 1));
+ __m128i fil45 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(2, 2, 2, 2));
+ __m128i fil67 = _mm_shuffle_epi32(HFilter, _MM_SHUFFLE(3, 3, 3, 3));
+ src_ptr -= (kInterp_Extend - 1) * src_stride + (kInterp_Extend - 1);
+
+ {
+ __m128i mad_all0;
+ __m128i mad_all1;
+ __m128i mad_all2;
+ __m128i mad_all3;
+ DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
+ DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
+ DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
+ DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
+ mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+ mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
+ intermediateA = _mm_packus_epi16(mad_all0, mad_all2);
+ // --
+ src_ptr += src_stride*4;
+ // --
+ DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
+ DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
+ DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
+ DO_FOUR_PIXELS(mad_all3, src_ptr, 3*src_stride)
+ mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+ mad_all2 = _mm_packs_epi32(mad_all2, mad_all3);
+ intermediateB = _mm_packus_epi16(mad_all0, mad_all2);
+ // --
+ src_ptr += src_stride*4;
+ // --
+ DO_FOUR_PIXELS(mad_all0, src_ptr, 0*src_stride)
+ DO_FOUR_PIXELS(mad_all1, src_ptr, 1*src_stride)
+ DO_FOUR_PIXELS(mad_all2, src_ptr, 2*src_stride)
+ mad_all0 = _mm_packs_epi32(mad_all0, mad_all1);
+ mad_all2 = _mm_packs_epi32(mad_all2, mad_all2);
+ intermediateC = _mm_packus_epi16(mad_all0, mad_all2);
+ }
+ }
+
+ // Transpose result (intermediate -> transpose3_x)
+ {
+ // 00 01 02 03 10 11 12 13 20 21 22 23 30 31 32 33
+ // 40 41 42 43 50 51 52 53 60 61 62 63 70 71 72 73
+ // 80 81 82 83 90 91 92 93 A0 A1 A2 A3 xx xx xx xx
+ const __m128i transpose0_0 = _mm_unpacklo_epi8(intermediateA, intermediateB);
+ const __m128i transpose0_1 = _mm_unpackhi_epi8(intermediateA, intermediateB);
+ const __m128i transpose0_2 = _mm_unpacklo_epi8(intermediateC, intermediateC);
+ const __m128i transpose0_3 = _mm_unpackhi_epi8(intermediateC, intermediateC);
+ // 00 40 01 41 02 42 03 43 10 50 11 51 12 52 13 53
+ // 20 60 21 61 22 62 23 63 30 70 31 71 32 72 33 73
+ // 80 xx 81 xx 82 xx 83 xx 90 xx 91 xx 92 xx 93 xx
+ // A0 xx A1 xx A2 xx A3 xx xx xx xx xx xx xx xx xx
+ const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
+ const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
+ const __m128i transpose1_2 = _mm_unpacklo_epi8(transpose0_2, transpose0_3);
+ const __m128i transpose1_3 = _mm_unpackhi_epi8(transpose0_2, transpose0_3);
+ // 00 20 40 60 01 21 41 61 02 22 42 62 03 23 43 63
+ // 10 30 50 70 11 31 51 71 12 32 52 72 13 33 53 73
+ // 80 A0 xx xx 81 A1 xx xx 82 A2 xx xx 83 A3 xx xx
+ // 90 xx xx xx 91 xx xx xx 92 xx xx xx 93 xx xx xx
+ const __m128i transpose2_0 = _mm_unpacklo_epi8(transpose1_0, transpose1_1);
+ const __m128i transpose2_1 = _mm_unpackhi_epi8(transpose1_0, transpose1_1);
+ const __m128i transpose2_2 = _mm_unpacklo_epi8(transpose1_2, transpose1_3);
+ const __m128i transpose2_3 = _mm_unpackhi_epi8(transpose1_2, transpose1_3);
+ // 00 10 20 30 40 50 60 70 01 11 21 31 41 51 61 71
+ // 02 12 22 32 42 52 62 72 03 13 23 33 43 53 63 73
+ // 80 90 A0 xx xx xx xx xx 81 91 A1 xx xx xx xx xx
+ // 82 92 A2 xx xx xx xx xx 83 93 A3 xx xx xx xx xx
+ transpose3_0 = _mm_castps_si128(
+ _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
+ _mm_castsi128_ps(transpose2_2),
+ _MM_SHUFFLE(1, 0, 1, 0)));
+ transpose3_1 = _mm_castps_si128(
+ _mm_shuffle_ps(_mm_castsi128_ps(transpose2_0),
+ _mm_castsi128_ps(transpose2_2),
+ _MM_SHUFFLE(3, 2, 3, 2)));
+ transpose3_2 = _mm_castps_si128(
+ _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
+ _mm_castsi128_ps(transpose2_3),
+ _MM_SHUFFLE(1, 0, 1, 0)));
+ transpose3_3 = _mm_castps_si128(
+ _mm_shuffle_ps(_mm_castsi128_ps(transpose2_1),
+ _mm_castsi128_ps(transpose2_3),
+ _MM_SHUFFLE(3, 2, 3, 2)));
+ // 00 10 20 30 40 50 60 70 80 90 A0 xx xx xx xx xx
+ // 01 11 21 31 41 51 61 71 81 91 A1 xx xx xx xx xx
+ // 02 12 22 32 42 52 62 72 82 92 A2 xx xx xx xx xx
+ // 03 13 23 33 43 53 63 73 83 93 A3 xx xx xx xx xx
+ }
+
+ // Vertical pass (transpose3_x -> dst).
+ {
+ const __m128i VFilter = _mm_load_si128((const __m128i *)VFilter_aligned16);
+ // get first two columns filter coefficients
+ __m128i fil01 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(0, 0, 0, 0));
+ __m128i fil23 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(1, 1, 1, 1));
+ __m128i fil45 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(2, 2, 2, 2));
+ __m128i fil67 = _mm_shuffle_epi32(VFilter, _MM_SHUFFLE(3, 3, 3, 3));
+ __m128i col0, col1, col2, col3;
+ DECLARE_ALIGNED(16, unsigned char, temp[32]);
+ {
+ _mm_store_si128((__m128i *)temp, transpose3_0);
+ DO_FOUR_PIXELS(col0, temp, 0);
+ }
+ {
+ _mm_store_si128((__m128i *)temp, transpose3_1);
+ DO_FOUR_PIXELS(col1, temp, 0);
+ }
+ {
+ _mm_store_si128((__m128i *)temp, transpose3_2);
+ DO_FOUR_PIXELS(col2, temp, 0);
+ }
+ {
+ _mm_store_si128((__m128i *)temp, transpose3_3);
+ DO_FOUR_PIXELS(col3, temp, 0);
+ }
+ // transpose
+ {
+ __m128i T0 = _mm_unpacklo_epi32(col0, col1);
+ __m128i T1 = _mm_unpacklo_epi32(col2, col3);
+ __m128i T2 = _mm_unpackhi_epi32(col0, col1);
+ __m128i T3 = _mm_unpackhi_epi32(col2, col3);
+ col0 = _mm_unpacklo_epi64(T0, T1);
+ col1 = _mm_unpackhi_epi64(T0, T1);
+ col2 = _mm_unpacklo_epi64(T2, T3);
+ col3 = _mm_unpackhi_epi64(T2, T3);
+ }
+ // saturate to 8 bit
+ {
+ col0 = _mm_packs_epi32(col0, col0);
+ col0 = _mm_packus_epi16(col0, col0);
+ col1 = _mm_packs_epi32(col1, col1);
+ col1 = _mm_packus_epi16(col1, col1);
+ col2 = _mm_packs_epi32 (col2, col2);
+ col2 = _mm_packus_epi16(col2, col2);
+ col3 = _mm_packs_epi32 (col3, col3);
+ col3 = _mm_packus_epi16(col3, col3);
+ }
+ // store
+ {
+ *((unsigned int *)&dst_ptr[dst_stride * 0]) = _mm_cvtsi128_si32(col0);
+ *((unsigned int *)&dst_ptr[dst_stride * 1]) = _mm_cvtsi128_si32(col1);
+ *((unsigned int *)&dst_ptr[dst_stride * 2]) = _mm_cvtsi128_si32(col2);
+ *((unsigned int *)&dst_ptr[dst_stride * 3]) = _mm_cvtsi128_si32(col3);
+ }
+ }
+ }
+}
+
+void vp8_filter_block2d_8x4_8_sse2
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+ int j;
+ for (j=0; j<8; j+=4) {
+ vp8_filter_block2d_4x4_8_sse2(src_ptr + j, src_stride,
+ HFilter_aligned16, VFilter_aligned16,
+ dst_ptr + j, dst_stride);
+ }
+}
+
+void vp8_filter_block2d_8x8_8_sse2
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+ int i, j;
+ for (i=0; i<8; i+=4) {
+ for (j=0; j<8; j+=4) {
+ vp8_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
+ HFilter_aligned16, VFilter_aligned16,
+ dst_ptr + j + i*dst_stride, dst_stride);
+ }
+ }
+}
+
+void vp8_filter_block2d_16x16_8_sse2
+(
+ const unsigned char *src_ptr, const unsigned int src_stride,
+ const short *HFilter_aligned16, const short *VFilter_aligned16,
+ unsigned char *dst_ptr, unsigned int dst_stride
+) {
+ int i, j;
+ for (i=0; i<16; i+=4) {
+ for (j=0; j<16; j+=4) {
+ vp8_filter_block2d_4x4_8_sse2(src_ptr + j + i*src_stride, src_stride,
+ HFilter_aligned16, VFilter_aligned16,
+ dst_ptr + j + i*dst_stride, dst_stride);
+ }
+ }
+}
diff --git a/vp8/common/x86/filter_sse4.c b/vp8/common/x86/filter_sse4.c
index a037622e1..c461db173 100644
--- a/vp8/common/x86/filter_sse4.c
+++ b/vp8/common/x86/filter_sse4.c
@@ -25,9 +25,6 @@
// TODO(cd): Maybe use _mm_maddubs_epi16 if smaller filter coeficients (no sum
// of positive above 128), or have higher precision filter
// coefficients.
-// TODO(cd): Remove use of _mm_extract_epi32 and _mm_extract_epi64, to not
-// require SSE4.1
-// TODO(cd): Remove use of _mm_shuffle_epi8 to not require SSSE3
DECLARE_ALIGNED(16, static const unsigned char, mask0123_c[16]) = {
0x00, 0x01,
diff --git a/vp8/decoder/decodemv.c b/vp8/decoder/decodemv.c
index 5e0600c2d..069d073d4 100644
--- a/vp8/decoder/decodemv.c
+++ b/vp8/decoder/decodemv.c
@@ -657,6 +657,7 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
prev_mi,
&nearest, &nearby, &best_mv, rct,
mbmi->ref_frame, cm->ref_frame_sign_bias);
+
#if CONFIG_NEWBESTREFMV
{
int ref_fb_idx;
@@ -679,6 +680,23 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
xd->pre.u_buffer = cm->yv12_fb[ref_fb_idx].u_buffer + recon_uvoffset;
xd->pre.v_buffer = cm->yv12_fb[ref_fb_idx].v_buffer + recon_uvoffset;
+#if CONFIG_NEW_MVREF
+ // Update stats on relative distance of chosen vector to the
+ // possible best reference vectors.
+ {
+ int i;
+ MV_REFERENCE_FRAME ref_frame = mbmi->ref_frame;
+
+ find_mv_refs(xd, mi, prev_mi,
+ ref_frame, mbmi->ref_mvs[ref_frame],
+ cm->ref_frame_sign_bias );
+
+ // Copy over the candidates.
+ vpx_memcpy(xd->ref_mv, mbmi->ref_mvs[ref_frame],
+ (MAX_MV_REFS * sizeof(int_mv)) );
+ }
+#endif
+
vp8_find_best_ref_mvs(xd,
xd->pre.y_buffer,
recon_y_stride,
@@ -763,6 +781,23 @@ static void read_mb_modes_mv(VP8D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
rct,
mbmi->second_ref_frame,
cm->ref_frame_sign_bias);
+
+#if CONFIG_NEW_MVREF
+ // Update stats on relative distance of chosen vector to the
+ // possible best reference vectors.
+ {
+ MV_REFERENCE_FRAME ref_frame = mbmi->second_ref_frame;
+
+ find_mv_refs(xd, mi, prev_mi,
+ ref_frame, mbmi->ref_mvs[ref_frame],
+ cm->ref_frame_sign_bias );
+
+ // Copy over the mv candidates
+ vpx_memcpy(xd->ref_mv, mbmi->ref_mvs[ref_frame],
+ (MAX_MV_REFS * sizeof(int_mv)) );
+ }
+#endif
+
vp8_find_best_ref_mvs(xd,
xd->second_pre.y_buffer,
recon_y_stride,
diff --git a/vp8/decoder/dequantize.h b/vp8/decoder/dequantize.h
index c4c8d4a06..2326e467d 100644
--- a/vp8/decoder/dequantize.h
+++ b/vp8/decoder/dequantize.h
@@ -201,5 +201,16 @@ void vp8_ht_dequant_idct_add_8x8_c(TX_TYPE tx_type, short *input, short *dq,
int pitch, int stride);
#endif
+#if CONFIG_SUPERBLOCKS
+void vp8_dequant_dc_idct_add_y_block_8x8_inplace_c(short *q, short *dq,
+ unsigned char *dst,
+ int stride, char *eobs,
+ short *dc, MACROBLOCKD *xd);
+void vp8_dequant_idct_add_uv_block_8x8_inplace_c(short *q, short *dq,
+ unsigned char *dstu,
+ unsigned char *dstv,
+ int stride, char *eobs,
+ MACROBLOCKD *xd);
+#endif
#endif
diff --git a/vp8/decoder/detokenize.c b/vp8/decoder/detokenize.c
index adff88a59..a6c837084 100644
--- a/vp8/decoder/detokenize.c
+++ b/vp8/decoder/detokenize.c
@@ -295,6 +295,7 @@ static int vp8_decode_coefs(VP8D_COMP *dx, const MACROBLOCKD *xd,
const vp8_prob *prob, *coef_probs;
switch (block_type) {
+ default:
case TX_4X4:
coef_probs = fc->coef_probs[type][0][0];
break;
@@ -302,7 +303,7 @@ static int vp8_decode_coefs(VP8D_COMP *dx, const MACROBLOCKD *xd,
coef_probs = fc->coef_probs_8x8[type][0][0];
break;
#if CONFIG_TX16X16
- default:
+ case TX_16X16:
coef_probs = fc->coef_probs_16x16[type][0][0];
break;
#endif
diff --git a/vp8/encoder/bitstream.c b/vp8/encoder/bitstream.c
index 90bc8e987..7e667aa63 100644
--- a/vp8/encoder/bitstream.c
+++ b/vp8/encoder/bitstream.c
@@ -28,6 +28,10 @@
#include "vp8/common/pred_common.h"
#include "vp8/common/entropy.h"
+#if CONFIG_NEW_MVREF
+#include "vp8/common/mvref_common.h"
+#endif
+
#if defined(SECTIONBITS_OUTPUT)
unsigned __int64 Sectionbits[500];
#endif
@@ -1043,12 +1047,30 @@ static void pack_inter_mode_mvs(VP8_COMP *const cpi) {
active_section = 5;
#endif
+#if 0 //CONFIG_NEW_MVREF
+ find_mv_refs(xd, m, prev_m,
+ m->mbmi.ref_frame,
+ mi->ref_mvs[rf],
+ cpi->common.ref_frame_sign_bias );
+
+ pick_best_mv_ref( mi->mv[0], mi->ref_mvs[rf], &best_mv);
+#endif
if (xd->allow_high_precision_mv)
write_mv_hp(w, &mi->mv[0].as_mv, &best_mv, mvc_hp);
else
write_mv(w, &mi->mv[0].as_mv, &best_mv, mvc);
if (mi->second_ref_frame) {
+#if 0 //CONFIG_NEW_MVREF
+ find_mv_refs(xd, m, prev_m,
+ m->mbmi.second_ref_frame,
+ mi->ref_mvs[mi->second_ref_frame],
+ cpi->common.ref_frame_sign_bias );
+
+ pick_best_mv_ref( mi->mv[1],
+ mi->ref_mvs[mi->second_ref_frame],
+ &best_second_mv);
+#endif
if (xd->allow_high_precision_mv)
write_mv_hp(w, &mi->mv[1].as_mv, &best_second_mv, mvc_hp);
else
diff --git a/vp8/encoder/encodeframe.c b/vp8/encoder/encodeframe.c
index 4472497e0..6ade0aa78 100644
--- a/vp8/encoder/encodeframe.c
+++ b/vp8/encoder/encodeframe.c
@@ -21,6 +21,7 @@
#include "vp8/common/setupintrarecon.h"
#include "encodeintra.h"
#include "vp8/common/reconinter.h"
+#include "vp8/common/invtrans.h"
#include "rdopt.h"
#include "vp8/common/findnearmv.h"
#include "vp8/common/reconintra.h"
@@ -33,6 +34,10 @@
#include "vp8/common/pred_common.h"
#define DBG_PRNT_SEGMAP 0
+#if CONFIG_NEW_MVREF
+#include "vp8/common/mvref_common.h"
+#endif
+
#if CONFIG_RUNTIME_CPU_DETECT
#define RTCD(x) &cpi->common.rtcd.x
@@ -76,7 +81,8 @@ void vp8cx_encode_intra_super_block(VP8_COMP *cpi,
MACROBLOCK *x,
TOKENEXTRA **t, int mb_col);
static void adjust_act_zbin(VP8_COMP *cpi, MACROBLOCK *x);
-
+extern void vp8_stuff_mb_8x8(VP8_COMP *cpi,
+ MACROBLOCKD *xd, TOKENEXTRA **t, int dry_run);
#ifdef MODE_STATS
unsigned int inter_y_modes[MB_MODE_COUNT];
@@ -852,7 +858,6 @@ static void encode_sb(VP8_COMP *cpi,
MACROBLOCK *x,
MACROBLOCKD *xd,
TOKENEXTRA **tp) {
- VP8_COMMON *pc = cm;
int i;
int map_index;
int mb_row, mb_col;
@@ -1300,6 +1305,12 @@ static void encode_frame_internal(VP8_COMP *cpi) {
// this frame which may be updated with each iteration of the recode loop.
compute_mod_refprobs(cm);
+#if CONFIG_NEW_MVREF
+ // temp stats reset
+ vp8_zero( cpi->mv_ref_sum_distance );
+ vp8_zero( cpi->best_ref_index_counts );
+#endif
+
// debug output
#if DBG_PRNT_SEGMAP
{
@@ -1693,7 +1704,6 @@ static void update_sb_skip_coeff_state(VP8_COMP *cpi,
// reset pointer, stuff EOBs where necessary
*tp = t[0];
for (n = 0; n < 4; n++) {
- TOKENEXTRA *tbak = *tp;
if (skip[n]) {
x->e_mbd.above_context = &ta[n];
x->e_mbd.left_context = &tl[n];
@@ -1715,9 +1725,12 @@ void vp8cx_encode_intra_super_block(VP8_COMP *cpi,
int n;
MACROBLOCKD *xd = &x->e_mbd;
VP8_COMMON *cm = &cpi->common;
- const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
- const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
- const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+ const uint8_t *src = x->src.y_buffer;
+ uint8_t *dst = xd->dst.y_buffer;
+ const uint8_t *usrc = x->src.u_buffer;
+ uint8_t *udst = xd->dst.u_buffer;
+ const uint8_t *vsrc = x->src.v_buffer;
+ uint8_t *vdst = xd->dst.v_buffer;
int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
const VP8_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
@@ -2041,13 +2054,15 @@ void vp8cx_encode_inter_superblock(VP8_COMP *cpi, MACROBLOCK *x, TOKENEXTRA **t,
const int output_enabled = 1;
VP8_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
- const uint8_t *src = x->src.y_buffer, *dst = xd->dst.y_buffer;
- const uint8_t *usrc = x->src.u_buffer, *udst = xd->dst.u_buffer;
- const uint8_t *vsrc = x->src.v_buffer, *vdst = xd->dst.v_buffer;
+ const uint8_t *src = x->src.y_buffer;
+ uint8_t *dst = xd->dst.y_buffer;
+ const uint8_t *usrc = x->src.u_buffer;
+ uint8_t *udst = xd->dst.u_buffer;
+ const uint8_t *vsrc = x->src.v_buffer;
+ uint8_t *vdst = xd->dst.v_buffer;
int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
const VP8_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
- int mis = xd->mode_info_stride;
unsigned int segment_id = xd->mode_info_context->mbmi.segment_id;
int seg_ref_active;
unsigned char ref_pred_flag;
diff --git a/vp8/encoder/encodemb.c b/vp8/encoder/encodemb.c
index 8c48b0d83..a26350552 100644
--- a/vp8/encoder/encodemb.c
+++ b/vp8/encoder/encodemb.c
@@ -67,8 +67,10 @@ void vp8_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch) {
}
}
-void vp8_subtract_mbuv_s_c(short *diff, unsigned char *usrc, unsigned char *vsrc, int src_stride,
- unsigned char *upred, unsigned char *vpred, int dst_stride) {
+void vp8_subtract_mbuv_s_c(short *diff, const unsigned char *usrc,
+ const unsigned char *vsrc, int src_stride,
+ const unsigned char *upred,
+ const unsigned char *vpred, int dst_stride) {
short *udiff = diff + 256;
short *vdiff = diff + 320;
@@ -95,14 +97,16 @@ void vp8_subtract_mbuv_s_c(short *diff, unsigned char *usrc, unsigned char *vsrc
}
}
-void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride) {
+void vp8_subtract_mbuv_c(short *diff, unsigned char *usrc,
+ unsigned char *vsrc, unsigned char *pred, int stride) {
unsigned char *upred = pred + 256;
unsigned char *vpred = pred + 320;
vp8_subtract_mbuv_s_c(diff, usrc, vsrc, stride, upred, vpred, 8);
}
-void vp8_subtract_mby_s_c(short *diff, unsigned char *src, int src_stride, unsigned char *pred, int dst_stride) {
+void vp8_subtract_mby_s_c(short *diff, const unsigned char *src, int src_stride,
+ const unsigned char *pred, int dst_stride) {
int r, c;
for (r = 0; r < 16; r++) {
@@ -116,8 +120,8 @@ void vp8_subtract_mby_s_c(short *diff, unsigned char *src, int src_stride, unsig
}
}
-void vp8_subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride)
-{
+void vp8_subtract_mby_c(short *diff, unsigned char *src,
+ unsigned char *pred, int stride) {
vp8_subtract_mby_s_c(diff, src, stride, pred, 16);
}
diff --git a/vp8/encoder/encodemb.h b/vp8/encoder/encodemb.h
index 13ddcf115..653774aaf 100644
--- a/vp8/encoder/encodemb.h
+++ b/vp8/encoder/encodemb.h
@@ -132,4 +132,14 @@ void vp8_optimize_mby_16x16(MACROBLOCK *x, const struct VP8_ENCODER_RTCD *rtcd);
void vp8_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
+#if CONFIG_SUPERBLOCKS
+void vp8_subtract_mbuv_s_c(short *diff, const unsigned char *usrc,
+ const unsigned char *vsrc, int src_stride,
+ const unsigned char *upred,
+ const unsigned char *vpred, int dst_stride);
+void vp8_subtract_mby_s_c(short *diff, const unsigned char *src,
+ int src_stride, const unsigned char *pred,
+ int dst_stride);
+#endif
+
#endif
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 256c70386..85a3c5402 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -40,6 +40,10 @@
#include "bitstream.h"
#include "ratectrl.h"
+#if CONFIG_NEW_MVREF
+#include "vp8/common/mvref_common.h"
+#endif
+
#if ARCH_ARM
#include "vpx_ports/arm.h"
#endif
@@ -630,7 +634,6 @@ static void update_reference_segmentation_map(VP8_COMP *cpi) {
for (row = 0; row < sb_rows; row++) {
for (col = 0; col < sb_cols; col++) {
MODE_INFO *miptr = mi + col * 2;
- uint8_t *seg = segmap + col * 2;
uint8_t *cache = segcache + col * 2;
#if CONFIG_SUPERBLOCKS
if (miptr->mbmi.encoded_as_sb) {
@@ -3791,6 +3794,36 @@ static void encode_frame_to_data_rate
// in this frame.
update_base_skip_probs(cpi);
+
+#if CONFIG_NEW_MVREF
+#if 0 && CONFIG_INTERNAL_STATS
+ {
+ FILE *f = fopen("mv_ref_dist.stt", "a");
+ unsigned int i;
+ //fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d %10d\n",
+ fprintf(f, "%10d %10d %10d %10d %10d %10d %10d %10d %10d %10d",
+ cpi->common.current_video_frame,
+ cpi->mv_ref_sum_distance[1][0],
+ cpi->mv_ref_sum_distance[1][1],
+ cpi->mv_ref_sum_distance[1][2],
+ cpi->mv_ref_sum_distance[2][0],
+ cpi->mv_ref_sum_distance[2][1],
+ cpi->mv_ref_sum_distance[2][2],
+ cpi->mv_ref_sum_distance[3][0],
+ cpi->mv_ref_sum_distance[3][1],
+ cpi->mv_ref_sum_distance[3][2] );
+
+ for (i = 0; i < MAX_MV_REFS; ++i) {
+ fprintf(f, "%10d", cpi->best_ref_index_counts[i] );
+ }
+ fprintf(f, "\n" );
+
+ fclose(f);
+ }
+#endif
+#endif
+
+
#if 0// 1 && CONFIG_INTERNAL_STATS
{
FILE *f = fopen("tmp.stt", "a");
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 7fb7dd2ff..bff3cdf6c 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -59,6 +59,13 @@
#define VP8_TEMPORAL_ALT_REF 1
+#if CONFIG_NEW_MVREF
+// temp. relates to mv_ref_sum_distance stats
+#define CUR_BEST 0
+#define NEW_BEST 1
+#define BEST_SELECTED 2
+#endif
+
typedef struct {
MV_CONTEXT mvc[2];
int mvcosts[2][MVvals + 1];
@@ -752,6 +759,12 @@ typedef struct VP8_COMP {
[VP8_SWITCHABLE_FILTERS];
#endif
+#if CONFIG_NEW_MVREF
+  // temp stats [REF_FRAME][REF_METHOD]
+ unsigned int mv_ref_sum_distance[4][3];
+ unsigned int best_ref_index_counts[17];
+#endif
+
} VP8_COMP;
void control_data_rate(VP8_COMP *cpi);
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index d9b49bfb6..d07c2383e 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -41,6 +41,10 @@
#include "vp8/common/seg_common.h"
#include "vp8/common/pred_common.h"
+#if CONFIG_NEW_MVREF
+#include "vp8/common/mvref_common.h"
+#endif
+
#if CONFIG_RUNTIME_CPU_DETECT
#define IF_RTCD(x) (x)
#else
@@ -2892,9 +2896,10 @@ void setup_buffer_inter(VP8_COMP *cpi, MACROBLOCK *x, int idx, int frame_type,
unsigned char *y_buffer[4], unsigned char *u_buffer[4],
unsigned char *v_buffer[4]) {
YV12_BUFFER_CONFIG *yv12 = &cpi->common.yv12_fb[idx];
+ MACROBLOCKD *xd = &x->e_mbd;
- vp8_find_near_mvs(&x->e_mbd, x->e_mbd.mode_info_context,
- x->e_mbd.prev_mode_info_context,
+ vp8_find_near_mvs(xd, xd->mode_info_context,
+ xd->prev_mode_info_context,
&frame_nearest_mv[frame_type], &frame_near_mv[frame_type],
&frame_best_ref_mv[frame_type], frame_mdcounts[frame_type],
frame_type, cpi->common.ref_frame_sign_bias);
@@ -2902,8 +2907,27 @@ void setup_buffer_inter(VP8_COMP *cpi, MACROBLOCK *x, int idx, int frame_type,
y_buffer[frame_type] = yv12->y_buffer + recon_yoffset;
u_buffer[frame_type] = yv12->u_buffer + recon_uvoffset;
v_buffer[frame_type] = yv12->v_buffer + recon_uvoffset;
+
#if CONFIG_NEWBESTREFMV
- vp8_find_best_ref_mvs(&x->e_mbd, y_buffer[frame_type],
+#if CONFIG_NEW_MVREF
+ // Update stats on relative distance of chosen vector to the
+ // possible best reference vectors.
+ {
+ MB_MODE_INFO * mbmi = &xd->mode_info_context->mbmi;
+
+ find_mv_refs(xd, xd->mode_info_context,
+ xd->prev_mode_info_context,
+ frame_type,
+ mbmi->ref_mvs[frame_type],
+ cpi->common.ref_frame_sign_bias );
+
+ // Copy over the mv candidates
+ vpx_memcpy(xd->ref_mv, mbmi->ref_mvs[frame_type],
+ (MAX_MV_REFS * sizeof(int_mv)) );
+ }
+#endif
+
+ vp8_find_best_ref_mvs(xd, y_buffer[frame_type],
yv12->y_stride,
&frame_best_ref_mv[frame_type],
&frame_nearest_mv[frame_type],
@@ -2943,7 +2967,7 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
int uv_intra_rate_8x8 = 0, uv_intra_distortion_8x8 = 0, uv_intra_rate_tokenonly_8x8 = 0;
int uv_intra_skippable_8x8 = 0;
int rate_y, UNINITIALIZED_IS_SAFE(rate_uv);
- int distortion_uv;
+ int distortion_uv = INT_MAX;
int64_t best_yrd = INT64_MAX;
#if CONFIG_PRED_FILTER
int best_filter_state;
@@ -3407,6 +3431,43 @@ void vp8_rd_pick_inter_mode(VP8_COMP *cpi, MACROBLOCK *x, int recon_yoffset, int
d->bmi.as_mv.first.as_int = tmp_mv.as_int;
frame_mv[NEWMV][refs[0]].as_int = d->bmi.as_mv.first.as_int;
+#if CONFIG_NEW_MVREF
+ // Update stats on relative distance of chosen vector to the
+ // possible best reference vectors.
+ {
+ unsigned int distance;
+ MV_REFERENCE_FRAME ref = mbmi->ref_frame;
+ int_mv selected_best_ref;
+ unsigned int best_index = 0;
+
+ find_mv_refs(xd, xd->mode_info_context,
+ xd->prev_mode_info_context,
+ ref,
+ mbmi->ref_mvs[ref],
+ cpi->common.ref_frame_sign_bias );
+
+ distance = mv_distance(&tmp_mv, &best_ref_mv);
+ cpi->mv_ref_sum_distance[ref][CUR_BEST] += distance;
+
+ distance =
+ mv_distance(&tmp_mv,
+ &mbmi->ref_mvs[ref][0]);
+ cpi->mv_ref_sum_distance[ref][NEW_BEST] += distance;
+
+ best_index = pick_best_mv_ref(tmp_mv, mbmi->ref_mvs[ref],
+ &selected_best_ref);
+
+ distance = mv_distance(&tmp_mv, &selected_best_ref);
+ mbmi->mv_ref_index[ref] = best_index;
+ cpi->mv_ref_sum_distance[ref][BEST_SELECTED] += distance;
+ cpi->best_ref_index_counts[best_index]++;
+
+ // Temp
+ //mbmi->mv_ref_index[ref] = 0;
+ //mbmi->ref_mvs[ref][0].as_int = best_ref_mv.as_int;
+ }
+#endif
+
// Add the new motion vector cost to our rolling cost variable
rate2 += vp8_mv_bit_cost(&tmp_mv, &best_ref_mv,
XMVCOST, 96,
@@ -3856,7 +3917,6 @@ void vp8_rd_pick_intra_mode(VP8_COMP *cpi, MACROBLOCK *x,
int mode16x16;
int mode8x8[2][4];
int dist;
- int rateuv8, rateuv_tokenonly8, distuv8;
mbmi->ref_frame = INTRA_FRAME;
rd_pick_intra_mbuv_mode(cpi, x, &rateuv, &rateuv_tokenonly, &distuv);
@@ -3961,7 +4021,6 @@ int64_t vp8_rd_pick_inter_mode_sb(VP8_COMP *cpi, MACROBLOCK *x,
BLOCKD *d = &xd->block[0];
MB_PREDICTION_MODE this_mode;
MV_REFERENCE_FRAME ref_frame;
- int mis = xd->mode_info_stride;
unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
int comp_pred;
int_mv best_ref_mv, second_best_ref_mv;
@@ -4313,11 +4372,11 @@ int64_t vp8_rd_pick_inter_mode_sb(VP8_COMP *cpi, MACROBLOCK *x,
if ((sse - var < q2dc *q2dc >> 4) ||
(sse / 2 > var && sse - var < 64)) {
// Check u and v to make sure skip is ok
- int sse2, sse3;
- int var2 = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
+ unsigned int sse2, sse3;
+ var += VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
(x->src.u_buffer, x->src.uv_stride,
xd->dst.u_buffer, xd->dst.uv_stride, &sse2);
- int var3 = VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
+ var += VARIANCE_INVOKE(&cpi->rtcd.variance, var16x16)
(x->src.v_buffer, x->src.uv_stride,
xd->dst.v_buffer, xd->dst.uv_stride, &sse3);
sse2 += sse3;
@@ -4658,7 +4717,6 @@ void vp8cx_pick_mode_inter_macroblock(VP8_COMP *cpi, MACROBLOCK *x,
int recon_yoffset,
int recon_uvoffset,
int *totalrate, int *totaldist) {
- VP8_COMMON *cm = &cpi->common;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
int rate, distortion;
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index 9f708ac4c..f04bc3497 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -67,6 +67,8 @@ VP8_COMMON_SRCS-yes += common/loopfilter_filters.c
VP8_COMMON_SRCS-yes += common/mbpitch.c
VP8_COMMON_SRCS-yes += common/modecont.c
VP8_COMMON_SRCS-yes += common/modecontext.c
+VP8_COMMON_SRCS-yes += common/mvref_common.c
+VP8_COMMON_SRCS-yes += common/mvref_common.h
VP8_COMMON_SRCS-yes += common/quant_common.c
VP8_COMMON_SRCS-yes += common/recon.c
VP8_COMMON_SRCS-yes += common/reconinter.c
@@ -116,6 +118,11 @@ ifeq ($(HAVE_SSE4_1),yes)
vp8/common/x86/filter_sse4.c.o: CFLAGS += -msse4
endif
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/filter_sse2.c
+ifeq ($(HAVE_SSE2),yes)
+vp8/common/x86/filter_sse2.c.o: CFLAGS += -msse2
+endif
+
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/arm_systemdependent.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.c
VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/bilinearfilter_arm.h