Diffstat (limited to 'vp9')
-rw-r--r--  vp9/common/vp9_debugmodes.c                          140
-rw-r--r--  vp9/common/vp9_entropymv.c                            46
-rw-r--r--  vp9/common/vp9_entropymv.h                             4
-rw-r--r--  vp9/common/vp9_findnearmv.c                            2
-rw-r--r--  vp9/common/vp9_rtcd_defs.sh                           76
-rw-r--r--  vp9/decoder/vp9_decodemv.c                           279
-rw-r--r--  vp9/decoder/vp9_decodframe.c                          60
-rw-r--r--  vp9/encoder/vp9_encodeframe.c                       1248
-rw-r--r--  vp9/encoder/vp9_encodemb.c                             8
-rw-r--r--  vp9/encoder/vp9_encodemb.h                             4
-rw-r--r--  vp9/encoder/vp9_encodemv.c                             2
-rw-r--r--  vp9/encoder/vp9_firstpass.c                            6
-rw-r--r--  vp9/encoder/vp9_mcomp.c                                6
-rw-r--r--  vp9/encoder/vp9_onyx_if.c                             81
-rw-r--r--  vp9/encoder/vp9_onyx_int.h                            13
-rw-r--r--  vp9/encoder/vp9_quantize.c                           195
-rw-r--r--  vp9/encoder/vp9_rdopt.c                              322
-rw-r--r--  vp9/encoder/vp9_rdopt.h                                4
-rw-r--r--  vp9/encoder/x86/vp9_encodeopt.asm                    125
-rw-r--r--  vp9/encoder/x86/vp9_error_sse2.asm                    57
-rw-r--r--  vp9/encoder/x86/vp9_subpel_variance.asm             1288
-rw-r--r--  vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm    308
-rw-r--r--  vp9/encoder/x86/vp9_subtract_mmx.asm                 432
-rw-r--r--  vp9/encoder/x86/vp9_subtract_sse2.asm                463
-rw-r--r--  vp9/encoder/x86/vp9_variance_impl_mmx.asm            341
-rw-r--r--  vp9/encoder/x86/vp9_variance_impl_sse2.asm            27
-rw-r--r--  vp9/encoder/x86/vp9_variance_impl_ssse3.asm          372
-rw-r--r--  vp9/encoder/x86/vp9_variance_mmx.c                   235
-rw-r--r--  vp9/encoder/x86/vp9_variance_sse2.c                  523
-rw-r--r--  vp9/encoder/x86/vp9_variance_ssse3.c                 142
-rw-r--r--  vp9/encoder/x86/vp9_x86_csystemdependent.c            55
-rw-r--r--  vp9/vp9cx.mk                                           7
32 files changed, 3057 insertions, 3814 deletions
diff --git a/vp9/common/vp9_debugmodes.c b/vp9/common/vp9_debugmodes.c
index 5841f8091..370ebe8f8 100644
--- a/vp9/common/vp9_debugmodes.c
+++ b/vp9/common/vp9_debugmodes.c
@@ -11,126 +11,68 @@
#include <stdio.h>
#include "vp9/common/vp9_blockd.h"
+#include "vp9/common/vp9_onyxc_int.h"
-void vp9_print_modes_and_motion_vectors(MODE_INFO *mi, int rows, int cols,
- int frame, char *file) {
+static void log_frame_info(VP9_COMMON *cm, const char *str, FILE *f) {
+ fprintf(f, "%s", str);
+ fprintf(f, "(Frame %d, Show:%d, Q:%d): \n", cm->current_video_frame,
+ cm->show_frame, cm->base_qindex);
+}
+/* Using the passed-in member offset, this function prints the value of the
+ * selected integer member of the mbmi structure for every mode-info unit
+ * in the mi array.
+ */
+static void print_mi_data(VP9_COMMON *common, FILE *file, char *descriptor,
+ size_t member_offset) {
int mi_row;
int mi_col;
int mi_index = 0;
- FILE *mvs = fopen(file, "a");
-
- // Print out the macroblock Y modes
- fprintf(mvs, "SB Types for Frame %d\n", frame);
-
- for (mi_row = 0; mi_row < rows; mi_row++) {
- for (mi_col = 0; mi_col < cols; mi_col++) {
- fprintf(mvs, "%2d ", mi[mi_index].mbmi.sb_type);
-
- mi_index++;
- }
-
- fprintf(mvs, "\n");
- mi_index += 8;
- }
+ MODE_INFO *mi = common->mi;
+ int rows = common->mi_rows;
+ int cols = common->mi_cols;
+ char prefix = descriptor[0];
- // Print out the macroblock Y modes
- fprintf(mvs, "Mb Modes for Frame %d\n", frame);
+ log_frame_info(common, descriptor, file);
mi_index = 0;
for (mi_row = 0; mi_row < rows; mi_row++) {
+ fprintf(file, "%c ", prefix);
for (mi_col = 0; mi_col < cols; mi_col++) {
- fprintf(mvs, "%2d ", mi[mi_index].mbmi.mode);
-
+ fprintf(file, "%2d ",
+ *((int*) ((char *) (&mi[mi_index].mbmi) + member_offset)));
mi_index++;
}
-
- fprintf(mvs, "\n");
+ fprintf(file, "\n");
mi_index += 8;
}
-
- fprintf(mvs, "\n");
-
- mi_index = 0;
- fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
-
- for (mi_row = 0; mi_row < rows; mi_row++) {
- for (mi_col = 0; mi_col < cols; mi_col++) {
- fprintf(mvs, "%2d ", mi[mi_index].mbmi.ref_frame[0]);
-
- mi_index++;
- }
-
- fprintf(mvs, "\n");
- mi_index += 8;
- }
- fprintf(mvs, "\n");
-
- mi_index = 0;
- fprintf(mvs, "Mb mv ref for Frame %d\n", frame);
-
+ fprintf(file, "\n");
+}
+void vp9_print_modes_and_motion_vectors(VP9_COMMON *cm, char *file) {
+ int mi_row;
+ int mi_col;
+ int mi_index = 0;
+ FILE *mvs = fopen(file, "a");
+ MODE_INFO *mi = cm->mi;
+ int rows = cm->mi_rows;
+ int cols = cm->mi_cols;
+
+ print_mi_data(cm, mvs, "Partitions:", offsetof(MB_MODE_INFO, sb_type));
+ print_mi_data(cm, mvs, "Modes:", offsetof(MB_MODE_INFO, mode));
+ print_mi_data(cm, mvs, "Skips:", offsetof(MB_MODE_INFO, mb_skip_coeff));
+ print_mi_data(cm, mvs, "Ref frame:", offsetof(MB_MODE_INFO, ref_frame[0]));
+ print_mi_data(cm, mvs, "Transform:", offsetof(MB_MODE_INFO, txfm_size));
+ print_mi_data(cm, mvs, "UV Modes:", offsetof(MB_MODE_INFO, uv_mode));
+
+ log_frame_info(cm, "Vectors ",mvs);
for (mi_row = 0; mi_row < rows; mi_row++) {
+ fprintf(mvs,"V ");
for (mi_col = 0; mi_col < cols; mi_col++) {
fprintf(mvs, "%4d:%4d ", mi[mi_index].mbmi.mv[0].as_mv.row,
mi[mi_index].mbmi.mv[0].as_mv.col);
-
mi_index++;
}
-
fprintf(mvs, "\n");
mi_index += 8;
}
-
- fprintf(mvs, "\n");
-
- /* print out the macroblock txform sizes */
- mi_index = 0;
- fprintf(mvs, "TXFM size for Frame %d\n", frame);
-
- for (mi_row = 0; mi_row < rows; mi_row++) {
- for (mi_col = 0; mi_col < cols; mi_col++) {
- fprintf(mvs, "%2d ", mi[mi_index].mbmi.txfm_size);
-
- mi_index++;
- }
-
- mi_index += 8;
- fprintf(mvs, "\n");
- }
-
- fprintf(mvs, "\n");
-
- /* print out the macroblock UV modes */
- mi_index = 0;
- fprintf(mvs, "UV Modes for Frame %d\n", frame);
-
- for (mi_row = 0; mi_row < rows; mi_row++) {
- for (mi_col = 0; mi_col < cols; mi_col++) {
- fprintf(mvs, "%2d ", mi[mi_index].mbmi.uv_mode);
-
- mi_index++;
- }
-
- mi_index += 8;
- fprintf(mvs, "\n");
- }
-
- fprintf(mvs, "\n");
-
- /* print out the macroblock mvs */
- mi_index = 0;
- fprintf(mvs, "MVs for Frame %d\n", frame);
-
- for (mi_row = 0; mi_row < rows; mi_row++) {
- for (mi_col = 0; mi_col < cols; mi_col++) {
- fprintf(mvs, "%5d:%-5d", mi[mi_index].mbmi.mv[0].as_mv.row / 2,
- mi[mi_index].mbmi.mv[0].as_mv.col / 2);
-
- mi_index++;
- }
-
- mi_index += 8;
- fprintf(mvs, "\n");
- }
-
fprintf(mvs, "\n");
fclose(mvs);
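
(For reference: the offsetof() trick behind the new print_mi_data() can be
exercised standalone. Below is a minimal C sketch using a hypothetical toy
struct rather than the real MB_MODE_INFO; like the patch, it assumes the
selected member is a plain int.)

#include <stddef.h>
#include <stdio.h>

/* Hypothetical stand-in for MB_MODE_INFO; any struct with int members works. */
typedef struct {
  int sb_type;
  int mode;
  int uv_mode;
} toy_mbmi;

/* Print one int member, selected by its byte offset, for every element. */
static void print_member(const toy_mbmi *arr, int n, size_t member_offset) {
  int i;
  for (i = 0; i < n; i++)
    printf("%2d ", *(const int *)((const char *)&arr[i] + member_offset));
  printf("\n");
}

int main(void) {
  toy_mbmi mi[3] = {{1, 4, 0}, {2, 9, 1}, {0, 3, 2}};
  print_member(mi, 3, offsetof(toy_mbmi, mode));    /* prints " 4  9  3" */
  print_member(mi, 3, offsetof(toy_mbmi, uv_mode)); /* prints " 0  1  2" */
  return 0;
}

(The same caveat applies to the patch itself: the int* cast is only valid for
members whose storage really is int-sized, which the enum-typed fields such
as txfm_size are relying on here.)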
diff --git a/vp9/common/vp9_entropymv.c b/vp9/common/vp9_entropymv.c
index e07e43c8b..000e284ee 100644
--- a/vp9/common/vp9_entropymv.c
+++ b/vp9/common/vp9_entropymv.c
@@ -114,7 +114,7 @@ MV_CLASS_TYPE vp9_get_mv_class(int z, int *offset) {
return c;
}
-int vp9_use_nmv_hp(const MV *ref) {
+int vp9_use_mv_hp(const MV *ref) {
return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH &&
(abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH;
}
@@ -123,54 +123,50 @@ int vp9_get_mv_mag(MV_CLASS_TYPE c, int offset) {
return mv_class_base(c) + offset;
}
-static void increment_nmv_component_count(int v,
- nmv_component_counts *mvcomp,
- int incr,
- int usehp) {
- assert (v != 0); /* should not be zero */
- mvcomp->mvcount[MV_MAX + v] += incr;
+static void inc_mv_component_count(int v, nmv_component_counts *comp_counts,
+ int incr) {
+ assert (v != 0);
+ comp_counts->mvcount[MV_MAX + v] += incr;
}
-static void increment_nmv_component(int v,
- nmv_component_counts *mvcomp,
- int incr,
- int usehp) {
+static void inc_mv_component(int v, nmv_component_counts *comp_counts,
+ int incr, int usehp) {
int s, z, c, o, d, e, f;
if (!incr)
return;
assert (v != 0); /* should not be zero */
s = v < 0;
- mvcomp->sign[s] += incr;
+ comp_counts->sign[s] += incr;
z = (s ? -v : v) - 1; /* magnitude - 1 */
c = vp9_get_mv_class(z, &o);
- mvcomp->classes[c] += incr;
+ comp_counts->classes[c] += incr;
d = (o >> 3); /* int mv data */
f = (o >> 1) & 3; /* fractional pel mv data */
e = (o & 1); /* high precision mv data */
if (c == MV_CLASS_0) {
- mvcomp->class0[d] += incr;
+ comp_counts->class0[d] += incr;
} else {
int i;
int b = c + CLASS0_BITS - 1; // number of bits
for (i = 0; i < b; ++i)
- mvcomp->bits[i][((d >> i) & 1)] += incr;
+ comp_counts->bits[i][((d >> i) & 1)] += incr;
}
/* Code the fractional pel bits */
if (c == MV_CLASS_0) {
- mvcomp->class0_fp[d][f] += incr;
+ comp_counts->class0_fp[d][f] += incr;
} else {
- mvcomp->fp[f] += incr;
+ comp_counts->fp[f] += incr;
}
/* Code the high precision bit */
if (usehp) {
if (c == MV_CLASS_0) {
- mvcomp->class0_hp[e] += incr;
+ comp_counts->class0_hp[e] += incr;
} else {
- mvcomp->hp[e] += incr;
+ comp_counts->hp[e] += incr;
}
}
}
@@ -197,8 +193,8 @@ static void counts_to_context(nmv_component_counts *mvcomp, int usehp) {
int v;
vpx_memset(mvcomp->sign, 0, sizeof(nmv_component_counts) - sizeof(mvcomp->mvcount));
for (v = 1; v <= MV_MAX; v++) {
- increment_nmv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp);
- increment_nmv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp);
+ inc_mv_component(-v, mvcomp, mvcomp->mvcount[MV_MAX - v], usehp);
+ inc_mv_component( v, mvcomp, mvcomp->mvcount[MV_MAX + v], usehp);
}
}
@@ -206,12 +202,12 @@ void vp9_increment_nmv(const MV *mv, const MV *ref, nmv_context_counts *mvctx,
int usehp) {
const MV_JOINT_TYPE j = vp9_get_mv_joint(mv);
mvctx->joints[j]++;
- usehp = usehp && vp9_use_nmv_hp(ref);
+ usehp = usehp && vp9_use_mv_hp(ref);
if (mv_joint_vertical(j))
- increment_nmv_component_count(mv->row, &mvctx->comps[0], 1, usehp);
+ inc_mv_component_count(mv->row, &mvctx->comps[0], 1);
if (mv_joint_horizontal(j))
- increment_nmv_component_count(mv->col, &mvctx->comps[1], 1, usehp);
+ inc_mv_component_count(mv->col, &mvctx->comps[1], 1);
}
static void adapt_prob(vp9_prob *dest, vp9_prob prep, unsigned int ct[2]) {
@@ -332,7 +328,7 @@ static unsigned int adapt_probs(unsigned int i,
}
-void vp9_adapt_nmv_probs(VP9_COMMON *cm, int usehp) {
+void vp9_adapt_mv_probs(VP9_COMMON *cm, int usehp) {
int i, j;
#ifdef MV_COUNT_TESTING
printf("joints count: ");
diff --git a/vp9/common/vp9_entropymv.h b/vp9/common/vp9_entropymv.h
index 15994a6ae..0fc20dbfc 100644
--- a/vp9/common/vp9_entropymv.h
+++ b/vp9/common/vp9_entropymv.h
@@ -21,8 +21,8 @@ struct VP9Common;
void vp9_entropy_mv_init();
void vp9_init_mv_probs(struct VP9Common *cm);
-void vp9_adapt_nmv_probs(struct VP9Common *cm, int usehp);
-int vp9_use_nmv_hp(const MV *ref);
+void vp9_adapt_mv_probs(struct VP9Common *cm, int usehp);
+int vp9_use_mv_hp(const MV *ref);
#define VP9_NMV_UPDATE_PROB 252
diff --git a/vp9/common/vp9_findnearmv.c b/vp9/common/vp9_findnearmv.c
index a6922715e..643b229a6 100644
--- a/vp9/common/vp9_findnearmv.c
+++ b/vp9/common/vp9_findnearmv.c
@@ -15,7 +15,7 @@
#include "vp9/common/vp9_sadmxn.h"
static void lower_mv_precision(int_mv *mv, int usehp) {
- if (!usehp || !vp9_use_nmv_hp(&mv->as_mv)) {
+ if (!usehp || !vp9_use_mv_hp(&mv->as_mv)) {
if (mv->as_mv.row & 1)
mv->as_mv.row += (mv->as_mv.row > 0 ? -1 : 1);
if (mv->as_mv.col & 1)
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index a405aab8d..892ad5615 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -266,88 +266,84 @@ prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_strid
specialize vp9_variance4x4 mmx sse2
prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance64x64 sse2
+specialize vp9_sub_pixel_variance64x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance64x64
+specialize vp9_sub_pixel_avg_variance64x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance32x64
+specialize vp9_sub_pixel_variance32x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance32x64
+specialize vp9_sub_pixel_avg_variance32x64 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance64x32
+specialize vp9_sub_pixel_variance64x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance64x32
+specialize vp9_sub_pixel_avg_variance64x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance32x16
+specialize vp9_sub_pixel_variance32x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance32x16
+specialize vp9_sub_pixel_avg_variance32x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance16x32
+specialize vp9_sub_pixel_variance16x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance16x32
+specialize vp9_sub_pixel_avg_variance16x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance32x32 sse2
+specialize vp9_sub_pixel_variance32x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance32x32
+specialize vp9_sub_pixel_avg_variance32x32 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
+specialize vp9_sub_pixel_variance16x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance16x16
+specialize vp9_sub_pixel_avg_variance16x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance8x16 sse2 mmx
-vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
+specialize vp9_sub_pixel_variance8x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance8x16
+specialize vp9_sub_pixel_avg_variance8x16 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
-vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
-vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
+specialize vp9_sub_pixel_variance16x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance16x8
+specialize vp9_sub_pixel_avg_variance16x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance8x8 sse2 mmx
-vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
+specialize vp9_sub_pixel_variance8x8 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance8x8
+specialize vp9_sub_pixel_avg_variance8x8 sse2 ssse3
# TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form
prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance8x4
+specialize vp9_sub_pixel_variance8x4 sse2 ssse3
prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance8x4
+specialize vp9_sub_pixel_avg_variance8x4 sse2 ssse3
prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance4x8
+specialize vp9_sub_pixel_variance4x8 sse ssse3
prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance4x8
+specialize vp9_sub_pixel_avg_variance4x8 sse ssse3
prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_sub_pixel_variance4x4 sse2 mmx
-vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
+specialize vp9_sub_pixel_variance4x4 sse ssse3
+#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
-specialize vp9_sub_pixel_avg_variance4x4
+specialize vp9_sub_pixel_avg_variance4x4 sse ssse3
prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad64x64 sse2
@@ -390,15 +386,15 @@ prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, co
specialize vp9_sad4x4 mmx sse
prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_h mmx sse2
+specialize vp9_variance_halfpixvar16x16_h sse2
vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt
prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_v mmx sse2
+specialize vp9_variance_halfpixvar16x16_v sse2
vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt
prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
-specialize vp9_variance_halfpixvar16x16_hv mmx sse2
+specialize vp9_variance_halfpixvar16x16_hv sse2
vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt
prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
@@ -507,8 +503,8 @@ specialize vp9_sad4x8x4d sse
prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array"
specialize vp9_sad4x4x4d sse
-prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
-specialize vp9_sub_pixel_mse16x16 sse2 mmx
+#prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse"
+#specialize vp9_sub_pixel_mse16x16 sse2 mmx
prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"
specialize vp9_mse16x16 mmx sse2
@@ -533,9 +529,11 @@ prototype unsigned int vp9_get_mb_ss "const int16_t *"
specialize vp9_get_mb_ss mmx sse2
# ENCODEMB INVOKE
-prototype int vp9_block_error "int16_t *coeff, int16_t *dqcoeff, int block_size"
-specialize vp9_block_error mmx sse2
-vp9_block_error_sse2=vp9_block_error_xmm
+prototype int64_t vp9_block_error "int16_t *coeff, int16_t *dqcoeff, intptr_t block_size"
+specialize vp9_block_error sse2
+
+prototype void vp9_subtract_block "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride"
+specialize vp9_subtract_block sse2
#
# Structured Similarity (SSIM)
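
(For context on the two new ENCODEMB prototypes: the plain-C sketches below
match the declared signatures and show what the SSE2 versions are expected
to compute — the sum of squared coefficient error, and the src-minus-
prediction residual. These are reference sketches, not code from the tree.
The widening of the return type to int64_t matters because a 32x32 block of
large coefficients can overflow a 32-bit accumulator.)

#include <stddef.h>
#include <stdint.h>

/* Sum of squared differences between original and dequantized coefficients. */
int64_t block_error_c(const int16_t *coeff, const int16_t *dqcoeff,
                      intptr_t block_size) {
  int64_t error = 0;
  intptr_t i;
  for (i = 0; i < block_size; i++) {
    const int diff = coeff[i] - dqcoeff[i];
    error += (int64_t)diff * diff;
  }
  return error;
}

/* Residual for an arbitrary rows x cols block: diff = src - pred. */
void subtract_block_c(int rows, int cols,
                      int16_t *diff_ptr, ptrdiff_t diff_stride,
                      const uint8_t *src_ptr, ptrdiff_t src_stride,
                      const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
  int r, c;
  for (r = 0; r < rows; r++) {
    for (c = 0; c < cols; c++)
      diff_ptr[c] = (int16_t)(src_ptr[c] - pred_ptr[c]);
    diff_ptr += diff_stride;
    src_ptr += src_stride;
    pred_ptr += pred_stride;
  }
}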
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index b3d41bed7..d8836c962 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -8,20 +8,21 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
-#include "vp9/decoder/vp9_treereader.h"
-#include "vp9/common/vp9_entropymv.h"
+#include "vp9/common/vp9_common.h"
+#include "vp9/common/vp9_entropy.h"
#include "vp9/common/vp9_entropymode.h"
-#include "vp9/common/vp9_reconinter.h"
-#include "vp9/decoder/vp9_onyxd_int.h"
+#include "vp9/common/vp9_entropymv.h"
#include "vp9/common/vp9_findnearmv.h"
-#include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_seg_common.h"
+#include "vp9/common/vp9_mvref_common.h"
#include "vp9/common/vp9_pred_common.h"
-#include "vp9/common/vp9_entropy.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_seg_common.h"
+
#include "vp9/decoder/vp9_decodemv.h"
#include "vp9/decoder/vp9_decodframe.h"
-#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/decoder/vp9_onyxd_int.h"
+#include "vp9/decoder/vp9_treereader.h"
+
#if CONFIG_DEBUG
#include <assert.h>
#endif
@@ -37,14 +38,52 @@ extern int dec_debug;
#endif
static MB_PREDICTION_MODE read_intra_mode(vp9_reader *r, const vp9_prob *p) {
- MB_PREDICTION_MODE m = treed_read(r, vp9_intra_mode_tree, p);
- return m;
+ return treed_read(r, vp9_intra_mode_tree, p);
}
static int read_mb_segid(vp9_reader *r, MACROBLOCKD *xd) {
return treed_read(r, vp9_segment_tree, xd->mb_segment_tree_probs);
}
+static TX_SIZE select_txfm_size(VP9_COMMON *cm, MACROBLOCKD *xd,
+ vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
+ const int context = vp9_get_pred_context(cm, xd, PRED_TX_SIZE);
+ const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE);
+ TX_SIZE txfm_size = vp9_read(r, tx_probs[0]);
+ if (txfm_size != TX_4X4 && bsize >= BLOCK_SIZE_MB16X16) {
+ txfm_size += vp9_read(r, tx_probs[1]);
+ if (txfm_size != TX_8X8 && bsize >= BLOCK_SIZE_SB32X32)
+ txfm_size += vp9_read(r, tx_probs[2]);
+ }
+
+ if (bsize >= BLOCK_SIZE_SB32X32)
+ cm->fc.tx_count_32x32p[context][txfm_size]++;
+ else if (bsize >= BLOCK_SIZE_MB16X16)
+ cm->fc.tx_count_16x16p[context][txfm_size]++;
+ else
+ cm->fc.tx_count_8x8p[context][txfm_size]++;
+
+ return txfm_size;
+}
+
+static TX_SIZE get_txfm_size(VP9D_COMP *pbi, TXFM_MODE txfm_mode,
+ BLOCK_SIZE_TYPE bsize, int select_cond,
+ vp9_reader *r) {
+ VP9_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+
+ if (txfm_mode == TX_MODE_SELECT && bsize >= BLOCK_SIZE_SB8X8 && select_cond)
+ return select_txfm_size(cm, xd, r, bsize);
+ else if (txfm_mode >= ALLOW_32X32 && bsize >= BLOCK_SIZE_SB32X32)
+ return TX_32X32;
+ else if (txfm_mode >= ALLOW_16X16 && bsize >= BLOCK_SIZE_MB16X16)
+ return TX_16X16;
+ else if (txfm_mode >= ALLOW_8X8 && bsize >= BLOCK_SIZE_SB8X8)
+ return TX_8X8;
+ else
+ return TX_4X4;
+}
+
static void set_segment_id(VP9_COMMON *cm, MB_MODE_INFO *mbmi,
int mi_row, int mi_col, int segment_id) {
const int mi_index = mi_row * cm->mi_cols + mi_col;
@@ -63,27 +102,6 @@ static void set_segment_id(VP9_COMMON *cm, MB_MODE_INFO *mbmi,
}
}
-static TX_SIZE select_txfm_size(VP9_COMMON *cm, MACROBLOCKD *xd,
- vp9_reader *r, BLOCK_SIZE_TYPE bsize) {
- const int context = vp9_get_pred_context(cm, xd, PRED_TX_SIZE);
- const vp9_prob *tx_probs = vp9_get_pred_probs(cm, xd, PRED_TX_SIZE);
- TX_SIZE txfm_size = vp9_read(r, tx_probs[0]);
- if (txfm_size != TX_4X4 && bsize >= BLOCK_SIZE_MB16X16) {
- txfm_size += vp9_read(r, tx_probs[1]);
- if (txfm_size != TX_8X8 && bsize >= BLOCK_SIZE_SB32X32)
- txfm_size += vp9_read(r, tx_probs[2]);
- }
- if (bsize >= BLOCK_SIZE_SB32X32) {
- cm->fc.tx_count_32x32p[context][txfm_size]++;
- } else if (bsize >= BLOCK_SIZE_MB16X16) {
- cm->fc.tx_count_16x16p[context][txfm_size]++;
- } else {
- cm->fc.tx_count_8x8p[context][txfm_size]++;
- }
- return txfm_size;
-}
-
-
static void kfread_modes(VP9D_COMP *pbi, MODE_INFO *m,
int mi_row, int mi_col,
vp9_reader *r) {
@@ -106,21 +124,8 @@ static void kfread_modes(VP9D_COMP *pbi, MODE_INFO *m,
[m->mbmi.mb_skip_coeff]++;
}
- if (cm->txfm_mode == TX_MODE_SELECT &&
- m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
- m->mbmi.txfm_size = select_txfm_size(cm, xd, r, m->mbmi.sb_type);
- } else if (cm->txfm_mode >= ALLOW_32X32 &&
- m->mbmi.sb_type >= BLOCK_SIZE_SB32X32) {
- m->mbmi.txfm_size = TX_32X32;
- } else if (cm->txfm_mode >= ALLOW_16X16 &&
- m->mbmi.sb_type >= BLOCK_SIZE_MB16X16) {
- m->mbmi.txfm_size = TX_16X16;
- } else if (cm->txfm_mode >= ALLOW_8X8 &&
- m->mbmi.sb_type >= BLOCK_SIZE_SB8X8) {
- m->mbmi.txfm_size = TX_8X8;
- } else {
- m->mbmi.txfm_size = TX_4X4;
- }
+ m->mbmi.txfm_size = get_txfm_size(pbi, cm->txfm_mode, m->mbmi.sb_type,
+ 1, r);
// luma mode
m->mbmi.ref_frame[0] = INTRA_FRAME;
@@ -303,28 +308,22 @@ unsigned int vp9_mv_cont_count[5][4] = {
};
#endif
-static void read_switchable_interp_probs(VP9_COMMON* const cm, vp9_reader *r) {
+static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
int i, j;
- for (j = 0; j <= VP9_SWITCHABLE_FILTERS; ++j)
- for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i) {
- if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
- cm->fc.switchable_interp_prob[j][i] =
- // vp9_read_prob(r);
- vp9_read_prob_diff_update(r, cm->fc.switchable_interp_prob[j][i]);
- }
- }
+ for (j = 0; j < VP9_SWITCHABLE_FILTERS + 1; ++j)
+ for (i = 0; i < VP9_SWITCHABLE_FILTERS - 1; ++i)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ fc->switchable_interp_prob[j][i] = vp9_read_prob_diff_update(r,
+ fc->switchable_interp_prob[j][i]);
}
-static void read_inter_mode_probs(VP9_COMMON *const cm, vp9_reader *r) {
+static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp9_reader *r) {
int i, j;
for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
- for (j = 0; j < VP9_INTER_MODES - 1; ++j) {
- if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
- // cm->fc.inter_mode_probs[i][j] = vp9_read_prob(r);
- cm->fc.inter_mode_probs[i][j] =
- vp9_read_prob_diff_update(r, cm->fc.inter_mode_probs[i][j]);
- }
- }
+ for (j = 0; j < VP9_INTER_MODES - 1; ++j)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
+ fc->inter_mode_probs[i][j] = vp9_read_prob_diff_update(r,
+ fc->inter_mode_probs[i][j]);
}
static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
@@ -337,21 +336,20 @@ static INLINE COMPPREDMODE_TYPE read_comp_pred_mode(vp9_reader *r) {
static void mb_mode_mv_init(VP9D_COMP *pbi, vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
- if ((cm->frame_type != KEY_FRAME) && (!cm->intra_only)) {
+ if (cm->frame_type != KEY_FRAME && !cm->intra_only) {
nmv_context *const nmvc = &pbi->common.fc.nmvc;
MACROBLOCKD *const xd = &pbi->mb;
int i, j;
- read_inter_mode_probs(cm, r);
+ read_inter_mode_probs(&cm->fc, r);
if (cm->mcomp_filter_type == SWITCHABLE)
- read_switchable_interp_probs(cm, r);
+ read_switchable_interp_probs(&cm->fc, r);
- for (i = 0; i < INTRA_INTER_CONTEXTS; i++) {
+ for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
if (vp9_read(r, VP9_MODE_UPDATE_PROB))
cm->fc.intra_inter_prob[i] =
vp9_read_prob_diff_update(r, cm->fc.intra_inter_prob[i]);
- }
if (cm->allow_comp_inter_inter) {
cm->comp_pred_mode = read_comp_pred_mode(r);
@@ -461,7 +459,7 @@ static INLINE void decode_mv(vp9_reader *r, MV *mv, const MV *ref,
const MV_JOINT_TYPE j = treed_read(r, vp9_mv_joint_tree, ctx->joints);
MV diff = {0, 0};
- usehp = usehp && vp9_use_nmv_hp(ref);
+ usehp = usehp && vp9_use_mv_hp(ref);
if (mv_joint_vertical(j))
diff.row = read_mv_component(r, &ctx->comps[0], usehp);
@@ -476,27 +474,80 @@ static INLINE void decode_mv(vp9_reader *r, MV *mv, const MV *ref,
static INLINE INTERPOLATIONFILTERTYPE read_switchable_filter_type(
VP9D_COMP *pbi, vp9_reader *r) {
- const int index = treed_read(r, vp9_switchable_interp_tree,
- vp9_get_pred_probs(&pbi->common, &pbi->mb,
- PRED_SWITCHABLE_INTERP));
- ++pbi->common.fc.switchable_interp_count
- [vp9_get_pred_context(
- &pbi->common, &pbi->mb, PRED_SWITCHABLE_INTERP)][index];
+ VP9_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+ const vp9_prob *probs = vp9_get_pred_probs(cm, xd, PRED_SWITCHABLE_INTERP);
+ const int index = treed_read(r, vp9_switchable_interp_tree, probs);
+ const int ctx = vp9_get_pred_context(cm, xd, PRED_SWITCHABLE_INTERP);
+ ++cm->fc.switchable_interp_count[ctx][index];
return vp9_switchable_interp[index];
}
+static void read_intra_block_modes(VP9D_COMP *pbi, MODE_INFO *mi,
+ MB_MODE_INFO *mbmi, vp9_reader *r) {
+ VP9_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+ const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+ const int bw = 1 << b_width_log2(bsize);
+ const int bh = 1 << b_height_log2(bsize);
+
+ if (bsize >= BLOCK_SIZE_SB8X8) {
+ const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
+ const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
+ const int bsl = MIN(bwl, bhl);
+ mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[MIN(3, bsl)]);
+ cm->fc.y_mode_counts[MIN(3, bsl)][mbmi->mode]++;
+ } else {
+ int idx, idy;
+ for (idy = 0; idy < 2; idy += bh) {
+ for (idx = 0; idx < 2; idx += bw) {
+ int ib = idy * 2 + idx, k;
+ int m = read_intra_mode(r, cm->fc.y_mode_prob[0]);
+ mi->bmi[ib].as_mode.first = m;
+ cm->fc.y_mode_counts[0][m]++;
+ for (k = 1; k < bh; ++k)
+ mi->bmi[ib + k * 2].as_mode.first = m;
+ for (k = 1; k < bw; ++k)
+ mi->bmi[ib + k].as_mode.first = m;
+ }
+ }
+ mbmi->mode = mi->bmi[3].as_mode.first;
+ }
+
+ mbmi->uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[mbmi->mode]);
+ cm->fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
+}
+
+static MV_REFERENCE_FRAME read_reference_frame(VP9D_COMP *pbi, int segment_id,
+ vp9_reader *r) {
+ VP9_COMMON *const cm = &pbi->common;
+ MACROBLOCKD *const xd = &pbi->mb;
+
+ MV_REFERENCE_FRAME ref;
+ if (!vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME)) {
+ const int ctx = vp9_get_pred_context(cm, xd, PRED_INTRA_INTER);
+ ref = (MV_REFERENCE_FRAME)
+ vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_INTRA_INTER));
+ cm->fc.intra_inter_count[ctx][ref != INTRA_FRAME]++;
+ } else {
+ ref = (MV_REFERENCE_FRAME)
+ vp9_get_segdata(xd, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME;
+ }
+ return ref;
+}
+
static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
int mi_row, int mi_col,
vp9_reader *r) {
VP9_COMMON *const cm = &pbi->common;
- nmv_context *const nmvc = &cm->fc.nmvc;
MACROBLOCKD *const xd = &pbi->mb;
+ nmv_context *const nmvc = &cm->fc.nmvc;
int_mv *const mv0 = &mbmi->mv[0];
int_mv *const mv1 = &mbmi->mv[1];
- BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
- int bw = 1 << b_width_log2(bsize);
- int bh = 1 << b_height_log2(bsize);
+ const BLOCK_SIZE_TYPE bsize = mi->mbmi.sb_type;
+ const int bw = 1 << b_width_log2(bsize);
+ const int bh = 1 << b_height_log2(bsize);
int mb_to_left_edge, mb_to_right_edge, mb_to_top_edge, mb_to_bottom_edge;
int j, idx, idy;
@@ -529,32 +580,9 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
[mbmi->mb_skip_coeff]++;
}
- // Read the reference frame
- if (!vp9_segfeature_active(xd, mbmi->segment_id, SEG_LVL_REF_FRAME)) {
- mbmi->ref_frame[0] =
- vp9_read(r, vp9_get_pred_prob(cm, xd, PRED_INTRA_INTER));
- cm->fc.intra_inter_count[vp9_get_pred_context(cm, xd, PRED_INTRA_INTER)]
- [mbmi->ref_frame[0] != INTRA_FRAME]++;
- } else {
- mbmi->ref_frame[0] =
- vp9_get_segdata(xd, mbmi->segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME;
- }
-
- if (cm->txfm_mode == TX_MODE_SELECT &&
- (mbmi->mb_skip_coeff == 0 || mbmi->ref_frame[0] == INTRA_FRAME) &&
- bsize >= BLOCK_SIZE_SB8X8) {
- mbmi->txfm_size = select_txfm_size(cm, xd, r, bsize);
- } else if (bsize >= BLOCK_SIZE_SB32X32 &&
- cm->txfm_mode >= ALLOW_32X32) {
- mbmi->txfm_size = TX_32X32;
- } else if (cm->txfm_mode >= ALLOW_16X16 &&
- bsize >= BLOCK_SIZE_MB16X16) {
- mbmi->txfm_size = TX_16X16;
- } else if (cm->txfm_mode >= ALLOW_8X8 && (bsize >= BLOCK_SIZE_SB8X8)) {
- mbmi->txfm_size = TX_8X8;
- } else {
- mbmi->txfm_size = TX_4X4;
- }
+ mbmi->ref_frame[0] = read_reference_frame(pbi, mbmi->segment_id, r);
+ mbmi->txfm_size = get_txfm_size(pbi, cm->txfm_mode, bsize,
+ (mbmi->mb_skip_coeff == 0 || mbmi->ref_frame[0] == INTRA_FRAME), r);
// If reference frame is an Inter frame
if (mbmi->ref_frame[0] != INTRA_FRAME) {
@@ -745,34 +773,8 @@ static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
}
}
} else {
- // required for left and above block mv
- mv0->as_int = 0;
-
- if (bsize >= BLOCK_SIZE_SB8X8) {
- const BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
- const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
- const int bsl = MIN(bwl, bhl);
- mbmi->mode = read_intra_mode(r, cm->fc.y_mode_prob[MIN(3, bsl)]);
- cm->fc.y_mode_counts[MIN(3, bsl)][mbmi->mode]++;
- } else {
- int idx, idy;
- for (idy = 0; idy < 2; idy += bh) {
- for (idx = 0; idx < 2; idx += bw) {
- int ib = idy * 2 + idx, k;
- int m = read_intra_mode(r, cm->fc.y_mode_prob[0]);
- mi->bmi[ib].as_mode.first = m;
- cm->fc.y_mode_counts[0][m]++;
- for (k = 1; k < bh; ++k)
- mi->bmi[ib + k * 2].as_mode.first = m;
- for (k = 1; k < bw; ++k)
- mi->bmi[ib + k].as_mode.first = m;
- }
- }
- mbmi->mode = mi->bmi[3].as_mode.first;
- }
-
- mbmi->uv_mode = read_intra_mode(r, cm->fc.uv_mode_prob[mbmi->mode]);
- cm->fc.uv_mode_counts[mbmi->mode][mbmi->uv_mode]++;
+ mv0->as_int = 0; // required for left and above block mv
+ read_intra_block_modes(pbi, mi, mbmi, r);
}
}
@@ -782,13 +784,10 @@ void vp9_decode_mode_mvs_init(VP9D_COMP* const pbi, vp9_reader *r) {
// TODO(jkoleszar): does this clear more than MBSKIP_CONTEXTS? Maybe remove.
// vpx_memset(cm->fc.mbskip_probs, 0, sizeof(cm->fc.mbskip_probs));
- for (k = 0; k < MBSKIP_CONTEXTS; ++k) {
- if (vp9_read(r, VP9_MODE_UPDATE_PROB)) {
+ for (k = 0; k < MBSKIP_CONTEXTS; ++k)
+ if (vp9_read(r, VP9_MODE_UPDATE_PROB))
cm->fc.mbskip_probs[k] =
vp9_read_prob_diff_update(r, cm->fc.mbskip_probs[k]);
- }
- // cm->fc.mbskip_probs[k] = vp9_read_prob(r);
- }
mb_mode_mv_init(pbi, r);
}
@@ -802,7 +801,7 @@ void vp9_decode_mb_mode_mv(VP9D_COMP* const pbi,
MODE_INFO *mi = xd->mode_info_context;
MB_MODE_INFO *const mbmi = &mi->mbmi;
- if ((cm->frame_type == KEY_FRAME) || cm->intra_only) {
+ if (cm->frame_type == KEY_FRAME || cm->intra_only) {
kfread_modes(pbi, mi, mi_row, mi_col, r);
} else {
read_mb_modes_mv(pbi, mi, &mi->mbmi, mi_row, mi_col, r);
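
(Summarizing the refactor above: both call sites now funnel through
get_txfm_size(), and the behavior they encode is the following precedence,
reproduced here as a comment for readability.)

/* Call sites after this change:
 *   kfread_modes:      get_txfm_size(pbi, cm->txfm_mode, sb_type, 1, r);
 *   read_mb_modes_mv:  get_txfm_size(pbi, cm->txfm_mode, bsize,
 *                          mbmi->mb_skip_coeff == 0 ||
 *                          mbmi->ref_frame[0] == INTRA_FRAME, r);
 *
 * An explicit per-block TX size is read from the bitstream only when
 * txfm_mode == TX_MODE_SELECT, the block is at least 8x8 and, on inter
 * frames, the block is not a skipped inter block. Otherwise the largest
 * transform allowed by both txfm_mode and the block size is used
 * (TX_32X32 -> TX_16X16 -> TX_8X8 -> TX_4X4). */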
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 49b181d69..078d09b0d 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -276,9 +276,7 @@ static void decode_atom(VP9D_COMP *pbi, MACROBLOCKD *xd,
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
assert(mbmi->ref_frame[0] != INTRA_FRAME);
-
- if ((pbi->common.frame_type != KEY_FRAME) && (!pbi->common.intra_only))
- vp9_setup_interp_filters(xd, mbmi->interp_filter, &pbi->common);
+ vp9_setup_interp_filters(xd, mbmi->interp_filter, &pbi->common);
// prediction
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
@@ -327,8 +325,7 @@ static void decode_sb(VP9D_COMP *pbi, MACROBLOCKD *xd, int mi_row, int mi_col,
assert(mbmi->sb_type == bsize);
assert(mbmi->ref_frame[0] != INTRA_FRAME);
- if (pbi->common.frame_type != KEY_FRAME)
- vp9_setup_interp_filters(xd, mbmi->interp_filter, pc);
+ vp9_setup_interp_filters(xd, mbmi->interp_filter, pc);
// generate prediction
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
@@ -392,26 +389,24 @@ static void set_refs(VP9D_COMP *pbi, int mi_row, int mi_col) {
MACROBLOCKD *const xd = &pbi->mb;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
- if (mbmi->ref_frame[0] > INTRA_FRAME) {
+ // Select the appropriate reference frame for this MB
+ const int fb_idx = cm->active_ref_idx[mbmi->ref_frame[0] - 1];
+ const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[fb_idx];
+ xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1];
+ xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1];
+ setup_pre_planes(xd, cfg, NULL, mi_row, mi_col, xd->scale_factor,
+ xd->scale_factor_uv);
+ xd->corrupted |= cfg->corrupted;
+
+ if (mbmi->ref_frame[1] > INTRA_FRAME) {
// Select the appropriate reference frame for this MB
- const int fb_idx = cm->active_ref_idx[mbmi->ref_frame[0] - 1];
- const YV12_BUFFER_CONFIG *cfg = &cm->yv12_fb[fb_idx];
- xd->scale_factor[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1];
- xd->scale_factor_uv[0] = cm->active_ref_scale[mbmi->ref_frame[0] - 1];
- setup_pre_planes(xd, cfg, NULL, mi_row, mi_col,
- xd->scale_factor, xd->scale_factor_uv);
- xd->corrupted |= cfg->corrupted;
-
- if (mbmi->ref_frame[1] > INTRA_FRAME) {
- // Select the appropriate reference frame for this MB
- const int second_fb_idx = cm->active_ref_idx[mbmi->ref_frame[1] - 1];
- const YV12_BUFFER_CONFIG *second_cfg = &cm->yv12_fb[second_fb_idx];
- xd->scale_factor[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1];
- xd->scale_factor_uv[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1];
- setup_pre_planes(xd, NULL, second_cfg, mi_row, mi_col,
- xd->scale_factor, xd->scale_factor_uv);
- xd->corrupted |= second_cfg->corrupted;
- }
+ const int second_fb_idx = cm->active_ref_idx[mbmi->ref_frame[1] - 1];
+ const YV12_BUFFER_CONFIG *second_cfg = &cm->yv12_fb[second_fb_idx];
+ xd->scale_factor[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1];
+ xd->scale_factor_uv[1] = cm->active_ref_scale[mbmi->ref_frame[1] - 1];
+ setup_pre_planes(xd, NULL, second_cfg, mi_row, mi_col, xd->scale_factor,
+ xd->scale_factor_uv);
+ xd->corrupted |= second_cfg->corrupted;
}
}
@@ -424,16 +419,17 @@ static void decode_modes_b(VP9D_COMP *pbi, int mi_row, int mi_col,
return;
set_offsets(pbi, bsize, mi_row, mi_col);
vp9_decode_mb_mode_mv(pbi, xd, mi_row, mi_col, r);
- set_refs(pbi, mi_row, mi_col);
- if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME)
+ if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) {
decode_sb_intra(pbi, xd, mi_row, mi_col, r, (bsize < BLOCK_SIZE_SB8X8) ?
BLOCK_SIZE_SB8X8 : bsize);
- else if (bsize < BLOCK_SIZE_SB8X8)
- decode_atom(pbi, xd, mi_row, mi_col, r, BLOCK_SIZE_SB8X8);
- else
- decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
-
+ } else {
+ set_refs(pbi, mi_row, mi_col);
+ if (bsize < BLOCK_SIZE_SB8X8)
+ decode_atom(pbi, xd, mi_row, mi_col, r, BLOCK_SIZE_SB8X8);
+ else
+ decode_sb(pbi, xd, mi_row, mi_col, r, bsize);
+ }
xd->corrupted |= vp9_reader_has_error(r);
}
@@ -1187,7 +1183,7 @@ int vp9_decode_frame(VP9D_COMP *pbi, const uint8_t **p_data_end) {
if ((!keyframe) && (!pc->intra_only)) {
vp9_adapt_mode_probs(pc);
vp9_adapt_mode_context(pc);
- vp9_adapt_nmv_probs(pc, xd->allow_high_precision_mv);
+ vp9_adapt_mv_probs(pc, xd->allow_high_precision_mv);
}
}
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 54b6e2440..f655d456b 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -8,7 +8,6 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-
#include "./vpx_config.h"
#include "./vp9_rtcd.h"
#include "vp9/encoder/vp9_encodeframe.h"
@@ -46,9 +45,8 @@ int enc_debug = 0;
void vp9_select_interp_filter_type(VP9_COMP *cpi);
-static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
- int output_enabled, int mi_row, int mi_col,
- BLOCK_SIZE_TYPE bsize);
+static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
+ int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize);
static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
@@ -64,10 +62,8 @@ static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x);
* Eventually this should be replaced by custom no-reference routines,
* which will be faster.
*/
-static const uint8_t VP9_VAR_OFFS[16] = {
- 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128
-};
-
+static const uint8_t VP9_VAR_OFFS[16] = {128, 128, 128, 128, 128, 128, 128, 128,
+ 128, 128, 128, 128, 128, 128, 128, 128};
// Original activity measure from Tim T's code.
static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) {
@@ -92,13 +88,11 @@ static unsigned int tt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x) {
}
// Stub for alternative experimental activity measures.
-static unsigned int alt_activity_measure(VP9_COMP *cpi,
- MACROBLOCK *x, int use_dc_pred) {
+static unsigned int alt_activity_measure(VP9_COMP *cpi, MACROBLOCK *x,
+ int use_dc_pred) {
return vp9_encode_intra(cpi, x, use_dc_pred);
}
-
-DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = { 0 };
-
+DECLARE_ALIGNED(16, static const uint8_t, vp9_64x64_zeros[64*64]) = {0};
// Measure the activity of the current macroblock
// What we measure here is TBD so abstracted to this function
@@ -136,13 +130,12 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) {
// Create a list to sort to
CHECK_MEM_ERROR(sortlist,
- vpx_calloc(sizeof(unsigned int),
- cpi->common.MBs));
+ vpx_calloc(sizeof(unsigned int),
+ cpi->common.MBs));
// Copy map to sort list
vpx_memcpy(sortlist, cpi->mb_activity_map,
- sizeof(unsigned int) * cpi->common.MBs);
-
+ sizeof(unsigned int) * cpi->common.MBs);
// Ripple each value down to its correct position
for (i = 1; i < cpi->common.MBs; i ++) {
@@ -153,13 +146,13 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) {
sortlist[j - 1] = sortlist[j];
sortlist[j] = tmp;
} else
- break;
+ break;
}
}
// Even number MBs so estimate median as mean of two either side.
median = (1 + sortlist[cpi->common.MBs >> 1] +
- sortlist[(cpi->common.MBs >> 1) + 1]) >> 1;
+ sortlist[(cpi->common.MBs >> 1) + 1]) >> 1;
cpi->activity_avg = median;
@@ -167,7 +160,7 @@ static void calc_av_activity(VP9_COMP *cpi, int64_t activity_sum) {
}
#else
// Simple mean for now
- cpi->activity_avg = (unsigned int)(activity_sum / cpi->common.MBs);
+ cpi->activity_avg = (unsigned int) (activity_sum / cpi->common.MBs);
#endif
if (cpi->activity_avg < VP9_ACTIVITY_AVG_MIN)
@@ -211,9 +204,9 @@ static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) {
b = 4 * act + cpi->activity_avg;
if (b >= a)
- *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1;
+ *(x->activity_ptr) = (int)((b + (a >> 1)) / a) - 1;
else
- *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b);
+ *(x->activity_ptr) = 1 - (int)((a + (b >> 1)) / b);
#if OUTPUT_NORM_ACT_STATS
fprintf(f, " %6d", *(x->mb_activity_ptr));
@@ -238,9 +231,9 @@ static void calc_activity_index(VP9_COMP *cpi, MACROBLOCK *x) {
// Loop through all MBs. Note activity of each, average activity and
// calculate a normalized activity for each
static void build_activity_map(VP9_COMP *cpi) {
- MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCK * const x = &cpi->mb;
MACROBLOCKD *xd = &x->e_mbd;
- VP9_COMMON *const cm = &cpi->common;
+ VP9_COMMON * const cm = &cpi->common;
#if ALT_ACT_MEASURE
YV12_BUFFER_CONFIG *new_yv12 = &cm->yv12_fb[cm->new_fb_idx];
@@ -285,7 +278,6 @@ static void build_activity_map(VP9_COMP *cpi) {
x->plane[0].src.buf += 16;
}
-
// adjust to the next row of mbs
x->plane[0].src.buf += 16 * x->plane[0].src.stride - 16 * cm->mb_cols;
}
@@ -315,7 +307,7 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) {
a = act + (2 * cpi->activity_avg);
b = (2 * act) + cpi->activity_avg;
- x->rdmult = (unsigned int)(((int64_t)x->rdmult * b + (a >> 1)) / a);
+ x->rdmult = (unsigned int) (((int64_t) x->rdmult * b + (a >> 1)) / a);
x->errorperbit = x->rdmult * 100 / (110 * x->rddiv);
x->errorperbit += (x->errorperbit == 0);
#endif
@@ -324,15 +316,13 @@ void vp9_activity_masking(VP9_COMP *cpi, MACROBLOCK *x) {
adjust_act_zbin(cpi, x);
}
-static void update_state(VP9_COMP *cpi,
- PICK_MODE_CONTEXT *ctx,
- BLOCK_SIZE_TYPE bsize,
- int output_enabled) {
+static void update_state(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
+ BLOCK_SIZE_TYPE bsize, int output_enabled) {
int i, x_idx, y;
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
+ MACROBLOCK * const x = &cpi->mb;
+ MACROBLOCKD * const xd = &x->e_mbd;
MODE_INFO *mi = &ctx->mic;
- MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
+ MB_MODE_INFO * const mbmi = &xd->mode_info_context->mbmi;
#if CONFIG_DEBUG || CONFIG_INTERNAL_STATS
MB_PREDICTION_MODE mb_mode = mi->mbmi.mode;
#endif
@@ -352,8 +342,8 @@ static void update_state(VP9_COMP *cpi,
// when the mode was picked for it
for (y = 0; y < bh; y++) {
for (x_idx = 0; x_idx < bw; x_idx++) {
- if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > x_idx &&
- (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > y) {
+ if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > x_idx
+ && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > y) {
MODE_INFO *mi_addr = xd->mode_info_context + x_idx + y * mis;
*mi_addr = *mi;
}
@@ -408,27 +398,27 @@ static void update_state(VP9_COMP *cpi,
#endif
} else {
/*
- // Reduce the activation RD thresholds for the best choice mode
- if ((cpi->rd_baseline_thresh[mb_mode_index] > 0) &&
- (cpi->rd_baseline_thresh[mb_mode_index] < (INT_MAX >> 2)))
- {
- int best_adjustment = (cpi->rd_thresh_mult[mb_mode_index] >> 2);
-
- cpi->rd_thresh_mult[mb_mode_index] =
- (cpi->rd_thresh_mult[mb_mode_index]
- >= (MIN_THRESHMULT + best_adjustment)) ?
- cpi->rd_thresh_mult[mb_mode_index] - best_adjustment :
- MIN_THRESHMULT;
- cpi->rd_threshes[mb_mode_index] =
- (cpi->rd_baseline_thresh[mb_mode_index] >> 7)
- * cpi->rd_thresh_mult[mb_mode_index];
-
- }
- */
+ // Reduce the activation RD thresholds for the best choice mode
+ if ((cpi->rd_baseline_thresh[mb_mode_index] > 0) &&
+ (cpi->rd_baseline_thresh[mb_mode_index] < (INT_MAX >> 2)))
+ {
+ int best_adjustment = (cpi->rd_thresh_mult[mb_mode_index] >> 2);
+
+ cpi->rd_thresh_mult[mb_mode_index] =
+ (cpi->rd_thresh_mult[mb_mode_index]
+ >= (MIN_THRESHMULT + best_adjustment)) ?
+ cpi->rd_thresh_mult[mb_mode_index] - best_adjustment :
+ MIN_THRESHMULT;
+ cpi->rd_threshes[mb_mode_index] =
+ (cpi->rd_baseline_thresh[mb_mode_index] >> 7)
+ * cpi->rd_thresh_mult[mb_mode_index];
+
+ }
+ */
// Note how often each mode chosen as best
cpi->mode_chosen_counts[mb_mode_index]++;
- if (mbmi->ref_frame[0] != INTRA_FRAME &&
- (mbmi->sb_type < BLOCK_SIZE_SB8X8 || mbmi->mode == NEWMV)) {
+ if (mbmi->ref_frame[0] != INTRA_FRAME
+ && (mbmi->sb_type < BLOCK_SIZE_SB8X8 || mbmi->mode == NEWMV)) {
int_mv best_mv, best_second_mv;
const MV_REFERENCE_FRAME rf1 = mbmi->ref_frame[0];
const MV_REFERENCE_FRAME rf2 = mbmi->ref_frame[1];
@@ -447,21 +437,21 @@ static void update_state(VP9_COMP *cpi,
int i, j;
for (j = 0; j < bh; ++j)
for (i = 0; i < bw; ++i)
- if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > i &&
- (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > j)
+ if ((xd->mb_to_right_edge >> (3 + LOG2_MI_SIZE)) + bw > i
+ && (xd->mb_to_bottom_edge >> (3 + LOG2_MI_SIZE)) + bh > j)
xd->mode_info_context[mis * j + i].mbmi = *mbmi;
}
- if (cpi->common.mcomp_filter_type == SWITCHABLE &&
- is_inter_mode(mbmi->mode)) {
- ++cpi->common.fc.switchable_interp_count
- [vp9_get_pred_context(&cpi->common, xd, PRED_SWITCHABLE_INTERP)]
- [vp9_switchable_interp_map[mbmi->interp_filter]];
+ if (cpi->common.mcomp_filter_type == SWITCHABLE
+ && is_inter_mode(mbmi->mode)) {
+ ++cpi->common.fc.switchable_interp_count[vp9_get_pred_context(
+ &cpi->common, xd, PRED_SWITCHABLE_INTERP)][vp9_switchable_interp_map[mbmi
+ ->interp_filter]];
}
cpi->rd_comp_pred_diff[SINGLE_PREDICTION_ONLY] += ctx->single_pred_diff;
- cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff;
- cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff;
+ cpi->rd_comp_pred_diff[COMP_PREDICTION_ONLY] += ctx->comp_pred_diff;
+ cpi->rd_comp_pred_diff[HYBRID_PREDICTION] += ctx->hybrid_pred_diff;
}
}
@@ -484,29 +474,26 @@ static unsigned find_seg_id(VP9_COMMON *cm, uint8_t *buf, BLOCK_SIZE_TYPE bsize,
return seg_id;
}
-void vp9_setup_src_planes(MACROBLOCK *x,
- const YV12_BUFFER_CONFIG *src,
+void vp9_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
int mb_row, int mb_col) {
- uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
- src->alpha_buffer};
- int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
- src->alpha_stride};
+ uint8_t *buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer, src
+ ->alpha_buffer};
+ int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride, src
+ ->alpha_stride};
int i;
for (i = 0; i < MAX_MB_PLANE; i++) {
- setup_pred_plane(&x->plane[i].src,
- buffers[i], strides[i],
- mb_row, mb_col, NULL,
- x->e_mbd.plane[i].subsampling_x,
+ setup_pred_plane(&x->plane[i].src, buffers[i], strides[i], mb_row, mb_col,
+ NULL, x->e_mbd.plane[i].subsampling_x,
x->e_mbd.plane[i].subsampling_y);
}
}
-static void set_offsets(VP9_COMP *cpi,
- int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) {
- MACROBLOCK *const x = &cpi->mb;
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &x->e_mbd;
+static void set_offsets(VP9_COMP *cpi, int mi_row, int mi_col,
+ BLOCK_SIZE_TYPE bsize) {
+ MACROBLOCK * const x = &cpi->mb;
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCKD * const xd = &x->e_mbd;
MB_MODE_INFO *mbmi;
const int dst_fb_idx = cm->new_fb_idx;
const int idx_str = xd->mode_info_stride * mi_row + mi_col;
@@ -518,10 +505,10 @@ static void set_offsets(VP9_COMP *cpi,
// entropy context structures
for (i = 0; i < MAX_MB_PLANE; i++) {
- xd->plane[i].above_context = cm->above_context[i] +
- (mi_col * 2 >> xd->plane[i].subsampling_x);
- xd->plane[i].left_context = cm->left_context[i] +
- (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y);
+ xd->plane[i].above_context = cm->above_context[i]
+ + (mi_col * 2 >> xd->plane[i].subsampling_x);
+ xd->plane[i].left_context = cm->left_context[i]
+ + (((mi_row * 2) & 15) >> xd->plane[i].subsampling_y);
}
// partition contexts
@@ -532,25 +519,24 @@ static void set_offsets(VP9_COMP *cpi,
x->active_ptr = cpi->active_map + idx_map;
/* pointers to mode info contexts */
- x->partition_info = x->pi + idx_str;
- xd->mode_info_context = cm->mi + idx_str;
+ x->partition_info = x->pi + idx_str;
+ xd->mode_info_context = cm->mi + idx_str;
mbmi = &xd->mode_info_context->mbmi;
// Special case: if prev_mi is NULL, the previous mode info context
// cannot be used.
- xd->prev_mode_info_context = cm->prev_mi ?
- cm->prev_mi + idx_str : NULL;
+ xd->prev_mode_info_context = cm->prev_mi ? cm->prev_mi + idx_str : NULL;
// Set up destination pointers
setup_dst_planes(xd, &cm->yv12_fb[dst_fb_idx], mi_row, mi_col);
/* Set up limit values for MV components to prevent them from
* extending beyond the UMV borders assuming 16x16 block size */
- x->mv_row_min = -((mi_row * MI_SIZE) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
- x->mv_col_min = -((mi_col * MI_SIZE) + VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
- x->mv_row_max = ((cm->mi_rows - mi_row) * MI_SIZE +
- (VP9BORDERINPIXELS - MI_SIZE * bh - VP9_INTERP_EXTEND));
- x->mv_col_max = ((cm->mi_cols - mi_col) * MI_SIZE +
- (VP9BORDERINPIXELS - MI_SIZE * bw - VP9_INTERP_EXTEND));
+ x->mv_row_min = -((mi_row * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
+ x->mv_col_min = -((mi_col * MI_SIZE)+ VP9BORDERINPIXELS - VP9_INTERP_EXTEND);
+ x->mv_row_max = ((cm->mi_rows - mi_row) * MI_SIZE
+ + (VP9BORDERINPIXELS - MI_SIZE * bh - VP9_INTERP_EXTEND));
+ x->mv_col_max = ((cm->mi_cols - mi_col) * MI_SIZE
+ + (VP9BORDERINPIXELS - MI_SIZE * bw - VP9_INTERP_EXTEND));
// Set up distance of MB to edge of frame in 1/8th pel units
assert(!(mi_col & (bw - 1)) && !(mi_row & (bh - 1)));
@@ -565,30 +551,30 @@ static void set_offsets(VP9_COMP *cpi,
/* segment ID */
if (xd->segmentation_enabled) {
- uint8_t *map = xd->update_mb_segmentation_map ? cpi->segmentation_map
- : cm->last_frame_seg_map;
- mbmi->segment_id = find_seg_id(cm, map, bsize, mi_row,
- cm->mi_rows, mi_col, cm->mi_cols);
+ uint8_t *map =
+ xd->update_mb_segmentation_map ?
+ cpi->segmentation_map : cm->last_frame_seg_map;
+ mbmi->segment_id = find_seg_id(cm, map, bsize, mi_row, cm->mi_rows, mi_col,
+ cm->mi_cols);
assert(mbmi->segment_id <= (MAX_MB_SEGMENTS-1));
vp9_mb_init_quantizer(cpi, x);
- if (xd->segmentation_enabled && cpi->seg0_cnt > 0 &&
- !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME) &&
- vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) {
+ if (xd->segmentation_enabled && cpi->seg0_cnt > 0
+ && !vp9_segfeature_active(xd, 0, SEG_LVL_REF_FRAME)
+ && vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) {
cpi->seg0_progress = (cpi->seg0_idx << 16) / cpi->seg0_cnt;
} else {
const int y = mb_row & ~3;
const int x = mb_col & ~3;
- const int p16 = ((mb_row & 1) << 1) + (mb_col & 1);
+ const int p16 = ((mb_row & 1) << 1) + (mb_col & 1);
const int p32 = ((mb_row & 2) << 2) + ((mb_col & 2) << 1);
- const int tile_progress =
- cm->cur_tile_mi_col_start * cm->mb_rows >> 1;
- const int mb_cols =
- (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start) >> 1;
+ const int tile_progress = cm->cur_tile_mi_col_start * cm->mb_rows >> 1;
+ const int mb_cols = (cm->cur_tile_mi_col_end - cm->cur_tile_mi_col_start)
+ >> 1;
- cpi->seg0_progress =
- ((y * mb_cols + x * 4 + p32 + p16 + tile_progress) << 16) / cm->MBs;
+ cpi->seg0_progress = ((y * mb_cols + x * 4 + p32 + p16 + tile_progress)
+ << 16) / cm->MBs;
}
} else {
mbmi->segment_id = 0;
@@ -596,11 +582,11 @@ static void set_offsets(VP9_COMP *cpi,
}
static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
- TOKENEXTRA **tp, int *totalrate, int *totaldist,
+ TOKENEXTRA **tp, int *totalrate, int64_t *totaldist,
BLOCK_SIZE_TYPE bsize, PICK_MODE_CONTEXT *ctx) {
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCK * const x = &cpi->mb;
+ MACROBLOCKD * const xd = &x->e_mbd;
x->rd_search = 1;
@@ -624,22 +610,21 @@ static void pick_sb_modes(VP9_COMP *cpi, int mi_row, int mi_col,
}
static void update_stats(VP9_COMP *cpi, int mi_row, int mi_col) {
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCK * const x = &cpi->mb;
+ MACROBLOCKD * const xd = &x->e_mbd;
MODE_INFO *mi = xd->mode_info_context;
- MB_MODE_INFO *const mbmi = &mi->mbmi;
+ MB_MODE_INFO * const mbmi = &mi->mbmi;
if (cm->frame_type != KEY_FRAME) {
int segment_id, seg_ref_active;
segment_id = mbmi->segment_id;
- seg_ref_active = vp9_segfeature_active(xd, segment_id,
- SEG_LVL_REF_FRAME);
+ seg_ref_active = vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME);
if (!seg_ref_active)
- cpi->intra_inter_count[vp9_get_pred_context(cm, xd, PRED_INTRA_INTER)]
- [mbmi->ref_frame[0] > INTRA_FRAME]++;
+ cpi->intra_inter_count[vp9_get_pred_context(cm, xd, PRED_INTRA_INTER)][mbmi
+ ->ref_frame[0] > INTRA_FRAME]++;
// If the segment reference feature is enabled we have only a single
// reference frame allowed for the segment so exclude it from
@@ -647,19 +632,18 @@ static void update_stats(VP9_COMP *cpi, int mi_row, int mi_col) {
if ((mbmi->ref_frame[0] > INTRA_FRAME) && !seg_ref_active) {
if (cm->comp_pred_mode == HYBRID_PREDICTION)
cpi->comp_inter_count[vp9_get_pred_context(cm, xd,
- PRED_COMP_INTER_INTER)]
- [mbmi->ref_frame[1] > INTRA_FRAME]++;
+ PRED_COMP_INTER_INTER)][mbmi
+ ->ref_frame[1] > INTRA_FRAME]++;
if (mbmi->ref_frame[1] > INTRA_FRAME) {
- cpi->comp_ref_count[vp9_get_pred_context(cm, xd, PRED_COMP_REF_P)]
- [mbmi->ref_frame[0] == GOLDEN_FRAME]++;
+        cpi->comp_ref_count[vp9_get_pred_context(cm, xd, PRED_COMP_REF_P)]
+                           [mbmi->ref_frame[0] == GOLDEN_FRAME]++;
} else {
- cpi->single_ref_count[vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P1)]
- [0][mbmi->ref_frame[0] != LAST_FRAME]++;
+        cpi->single_ref_count[vp9_get_pred_context(cm, xd, PRED_SINGLE_REF_P1)]
+                             [0][mbmi->ref_frame[0] != LAST_FRAME]++;
if (mbmi->ref_frame[0] != LAST_FRAME)
- cpi->single_ref_count[vp9_get_pred_context(cm, xd,
- PRED_SINGLE_REF_P2)]
- [1][mbmi->ref_frame[0] != GOLDEN_FRAME]++;
+          cpi->single_ref_count[vp9_get_pred_context(cm, xd,
+                                                     PRED_SINGLE_REF_P2)]
+                               [1][mbmi->ref_frame[0] != GOLDEN_FRAME]++;
}
}
// Count of last ref frame 0,0 usage
@@ -673,7 +657,7 @@ static void update_stats(VP9_COMP *cpi, int mi_row, int mi_col) {
// partition down to 4x4 block size is enabled.
static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
BLOCK_SIZE_TYPE bsize) {
- MACROBLOCKD *const xd = &x->e_mbd;
+ MACROBLOCKD * const xd = &x->e_mbd;
switch (bsize) {
case BLOCK_SIZE_SB64X64:
@@ -704,7 +688,7 @@ static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
return &x->ab4x4_context[xd->sb_index][xd->mb_index][xd->b_index];
default:
assert(0);
- return NULL;
+      return NULL;
}
}
@@ -722,48 +706,45 @@ static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x,
return &x->b_partitioning[xd->sb_index][xd->mb_index][xd->b_index];
default:
assert(0);
- return NULL;
+      return NULL;
}
}
static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
- PARTITION_CONTEXT sa[8],
- PARTITION_CONTEXT sl[8],
+ PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
BLOCK_SIZE_TYPE bsize) {
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCK * const x = &cpi->mb;
+ MACROBLOCKD * const xd = &x->e_mbd;
int p;
int bwl = b_width_log2(bsize), bw = 1 << bwl;
int bhl = b_height_log2(bsize), bh = 1 << bhl;
int mwl = mi_width_log2(bsize), mw = 1 << mwl;
int mhl = mi_height_log2(bsize), mh = 1 << mhl;
for (p = 0; p < MAX_MB_PLANE; p++) {
- vpx_memcpy(cm->above_context[p] +
- ((mi_col * 2) >> xd->plane[p].subsampling_x),
- a + bw * p,
- sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
- vpx_memcpy(cm->left_context[p] +
- ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
- l + bh * p,
- sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
- }
+ vpx_memcpy(
+ cm->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
+ a + bw * p, sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
+ vpx_memcpy(
+ cm->left_context[p]
+            + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+        l + bh * p,
+ sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
+ }
vpx_memcpy(cm->above_seg_context + mi_col, sa,
sizeof(PARTITION_CONTEXT) * mw);
vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl,
- sizeof(PARTITION_CONTEXT) * mh);
-}
+             sizeof(PARTITION_CONTEXT) * mh);
+}
static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
- ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
- ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
- PARTITION_CONTEXT sa[8],
- PARTITION_CONTEXT sl[8],
- BLOCK_SIZE_TYPE bsize) {
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
+ ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
+ ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
+ PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
+ BLOCK_SIZE_TYPE bsize) {
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCK * const x = &cpi->mb;
+ MACROBLOCKD * const xd = &x->e_mbd;
int p;
int bwl = b_width_log2(bsize), bw = 1 << bwl;
int bhl = b_height_log2(bsize), bh = 1 << bhl;
@@ -772,25 +753,26 @@ static void save_context(VP9_COMP *cpi, int mi_row, int mi_col,
// buffer the above/left context information of the block in search.
for (p = 0; p < MAX_MB_PLANE; ++p) {
- vpx_memcpy(a + bw * p, cm->above_context[p] +
- (mi_col * 2 >> xd->plane[p].subsampling_x),
- sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
- vpx_memcpy(l + bh * p, cm->left_context[p] +
- ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
- sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
- }
+ vpx_memcpy(
+ a + bw * p,
+ cm->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
+ sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
+ vpx_memcpy(
+ l + bh * p,
+ cm->left_context[p]
+            + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+        sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
+ }
vpx_memcpy(sa, cm->above_seg_context + mi_col,
sizeof(PARTITION_CONTEXT) * mw);
vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK),
- sizeof(PARTITION_CONTEXT) * mh);
-}
+             sizeof(PARTITION_CONTEXT) * mh);
+}
-static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
- int mi_row, int mi_col, int output_enabled,
- BLOCK_SIZE_TYPE bsize, int sub_index) {
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
+static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
+ int output_enabled, BLOCK_SIZE_TYPE bsize, int sub_index) {
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCK * const x = &cpi->mb;
+ MACROBLOCKD * const xd = &x->e_mbd;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
@@ -813,12 +795,11 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
}
}
-static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
- int mi_row, int mi_col, int output_enabled,
- BLOCK_SIZE_TYPE bsize) {
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
+static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col,
+ int output_enabled, BLOCK_SIZE_TYPE bsize) {
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCK * const x = &cpi->mb;
+ MACROBLOCKD * const xd = &x->e_mbd;
BLOCK_SIZE_TYPE c1 = BLOCK_SIZE_SB8X8;
const int bsl = b_width_log2(bsize), bs = (1 << bsl) / 4;
int bwl, bhl;
@@ -838,17 +819,17 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
if (bsl == bwl && bsl == bhl) {
if (output_enabled && bsize >= BLOCK_SIZE_SB8X8)
- cpi->partition_count[pl][PARTITION_NONE]++;
+ cpi->partition_count[pl][PARTITION_NONE]++;
encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1);
} else if (bsl == bhl && bsl > bwl) {
if (output_enabled)
cpi->partition_count[pl][PARTITION_VERT]++;
- encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0);
+ encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0);
encode_b(cpi, tp, mi_row, mi_col + bs, output_enabled, c1, 1);
} else if (bsl == bwl && bsl > bhl) {
if (output_enabled)
cpi->partition_count[pl][PARTITION_HORZ]++;
- encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0);
+ encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, 0);
encode_b(cpi, tp, mi_row + bs, mi_col, output_enabled, c1, 1);
} else {
BLOCK_SIZE_TYPE subsize;
@@ -869,8 +850,8 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
}
}
- if (bsize >= BLOCK_SIZE_SB8X8 &&
- (bsize == BLOCK_SIZE_SB8X8 || bsl == bwl || bsl == bhl)) {
+ if (bsize >= BLOCK_SIZE_SB8X8
+ && (bsize == BLOCK_SIZE_SB8X8 || bsl == bwl || bsl == bhl)) {
set_partition_seg_context(cm, xd, mi_row, mi_col);
update_partition_context(xd, c1, bsize);
}
@@ -880,26 +861,28 @@ static void set_partitioning(VP9_COMP *cpi, MODE_INFO *m,
BLOCK_SIZE_TYPE bsize) {
VP9_COMMON *const cm = &cpi->common;
const int mis = cm->mode_info_stride;
- int bsl = b_width_log2(bsize);
- int bs = (1 << bsl) / 2; //
int block_row, block_col;
- int row, col;
-
- // this test function sets the entire macroblock to the same bsize
- for (block_row = 0; block_row < 8; block_row += bs) {
- for (block_col = 0; block_col < 8; block_col += bs) {
- for (row = 0; row < bs; row++) {
- for (col = 0; col < bs; col++) {
- m[(block_row+row)*mis + block_col+col].mbmi.sb_type = bsize;
- }
- }
+ for (block_row = 0; block_row < 8; ++block_row) {
+ for (block_col = 0; block_col < 8; ++block_col) {
+ m[block_row * mis + block_col].mbmi.sb_type = bsize;
+ }
+ }
+}
+static void copy_partitioning(VP9_COMP *cpi, MODE_INFO *m, MODE_INFO *p) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int mis = cm->mode_info_stride;
+ int block_row, block_col;
+ for (block_row = 0; block_row < 8; ++block_row) {
+ for (block_col = 0; block_col < 8; ++block_col) {
+ m[block_row * mis + block_col].mbmi.sb_type =
+ p[block_row * mis + block_col].mbmi.sb_type;
}
}
}
-static void set_block_size(VP9_COMMON *const cm,
- MODE_INFO *m, BLOCK_SIZE_TYPE bsize, int mis,
- int mi_row, int mi_col) {
+static void set_block_size(VP9_COMMON * const cm, MODE_INFO *m,
+ BLOCK_SIZE_TYPE bsize, int mis, int mi_row,
+ int mi_col) {
int row, col;
int bwl = b_width_log2(bsize);
int bhl = b_height_log2(bsize);
@@ -911,10 +894,11 @@ static void set_block_size(VP9_COMMON *const cm,
for (col = 0; col < bs; col++) {
if (mi_row + row >= cm->mi_rows || mi_col + col >= cm->mi_cols)
continue;
- m2[row*mis+col].mbmi.sb_type = bsize;
+ m2[row * mis + col].mbmi.sb_type = bsize;
}
}
}
+
typedef struct {
int64_t sum_square_error;
int64_t sum_error;
@@ -922,11 +906,15 @@ typedef struct {
int variance;
} var;
+typedef struct {
+ var none;
+ var horz[2];
+ var vert[2];
+} partition_variance;
+
#define VT(TYPE, BLOCKSIZE) \
typedef struct { \
- var none; \
- var horz[2]; \
- var vert[2]; \
+ partition_variance vt; \
BLOCKSIZE split[4]; } TYPE;
VT(v8x8, var)
@@ -934,20 +922,67 @@ VT(v16x16, v8x8)
VT(v32x32, v16x16)
VT(v64x64, v32x32)
+typedef struct {
+ partition_variance *vt;
+ var *split[4];
+} vt_node;
+
typedef enum {
V16X16,
V32X32,
V64X64,
} TREE_LEVEL;
+static void tree_to_node(void *data, BLOCK_SIZE_TYPE block_size,
+                         vt_node *node) {
+ int i;
+ switch (block_size) {
+ case BLOCK_SIZE_SB64X64: {
+ v64x64 *vt = (v64x64 *) data;
+ node->vt = &vt->vt;
+ for (i = 0; i < 4; i++)
+ node->split[i] = &vt->split[i].vt.none;
+ break;
+ }
+ case BLOCK_SIZE_SB32X32: {
+ v32x32 *vt = (v32x32 *) data;
+ node->vt = &vt->vt;
+ for (i = 0; i < 4; i++)
+ node->split[i] = &vt->split[i].vt.none;
+ break;
+ }
+ case BLOCK_SIZE_MB16X16: {
+ v16x16 *vt = (v16x16 *) data;
+ node->vt = &vt->vt;
+ for (i = 0; i < 4; i++)
+ node->split[i] = &vt->split[i].vt.none;
+ break;
+ }
+ case BLOCK_SIZE_SB8X8: {
+ v8x8 *vt = (v8x8 *) data;
+ node->vt = &vt->vt;
+ for (i = 0; i < 4; i++)
+ node->split[i] = &vt->split[i];
+ break;
+ }
+ default:
+ node->vt = 0;
+ for (i = 0; i < 4; i++)
+ node->split[i] = 0;
+      assert(0);
+ }
+}
+
// Set variance values given sum square error, sum error, count.
static void fill_variance(var *v, int64_t s2, int64_t s, int c) {
v->sum_square_error = s2;
v->sum_error = s;
v->count = c;
- v->variance = 256
- * (v->sum_square_error - v->sum_error * v->sum_error / v->count)
- / v->count;
+ if (c > 0)
+ v->variance = 256
+ * (v->sum_square_error - v->sum_error * v->sum_error / v->count)
+ / v->count;
+ else
+ v->variance = 0;
}
// Combine 2 variance structures by summing the sum_error, sum_square_error,
@@ -956,31 +991,95 @@ void sum_2_variances(var *r, var *a, var*b) {
fill_variance(r, a->sum_square_error + b->sum_square_error,
a->sum_error + b->sum_error, a->count + b->count);
}
-// Fill one level of our variance tree, by summing the split sums into each of
-// the horizontal, vertical and none from split and recalculating variance.
-#define fill_variance_tree(VT) \
- sum_2_variances(VT.horz[0], VT.split[0].none, VT.split[1].none); \
- sum_2_variances(VT.horz[1], VT.split[2].none, VT.split[3].none); \
- sum_2_variances(VT.vert[0], VT.split[0].none, VT.split[2].none); \
- sum_2_variances(VT.vert[1], VT.split[1].none, VT.split[3].none); \
- sum_2_variances(VT.none, VT.vert[0], VT.vert[1]);
-
-// Set the blocksize in the macroblock info structure if the variance is less
-// than our threshold to one of none, horz, vert.
-#define set_vt_size(VT, BLOCKSIZE, R, C, ACTION) \
- if (VT.none.variance < threshold) { \
- set_block_size(cm, m, BLOCKSIZE, mis, R, C); \
- ACTION; \
- } \
- if (VT.horz[0].variance < threshold && VT.horz[1].variance < threshold ) { \
- set_block_size(cm, m, get_subsize(BLOCKSIZE, PARTITION_HORZ), mis, R, C); \
- ACTION; \
- } \
- if (VT.vert[0].variance < threshold && VT.vert[1].variance < threshold ) { \
- set_block_size(cm, m, get_subsize(BLOCKSIZE, PARTITION_VERT), mis, R, C); \
- ACTION; \
+
+static void fill_variance_tree(void *data, BLOCK_SIZE_TYPE block_size) {
+ vt_node node;
+ tree_to_node(data, block_size, &node);
+ sum_2_variances(&node.vt->horz[0], node.split[0], node.split[1]);
+ sum_2_variances(&node.vt->horz[1], node.split[2], node.split[3]);
+ sum_2_variances(&node.vt->vert[0], node.split[0], node.split[2]);
+ sum_2_variances(&node.vt->vert[1], node.split[1], node.split[3]);
+ sum_2_variances(&node.vt->none, &node.vt->vert[0], &node.vt->vert[1]);
+}
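
A standalone model of the two helpers above: fill_variance computes a 256-scaled integer variance, and sum_2_variances merges child statistics so parent variances are recomputed without re-reading pixels. The struct, names, and numbers below are illustrative only:

#include <stdint.h>
#include <stdio.h>

typedef struct { int64_t sse, sum; int count, variance; } var_t;

static void fill(var_t *v, int64_t s2, int64_t s, int c) {
  v->sse = s2; v->sum = s; v->count = c;
  /* 256 * (E[x^2] - E[x]^2), integer division as in the encoder */
  v->variance = c > 0 ? (int)(256 * (s2 - s * s / c) / c) : 0;
}

static void sum2(var_t *r, const var_t *a, const var_t *b) {
  fill(r, a->sse + b->sse, a->sum + b->sum, a->count + b->count);
}

int main(void) {
  var_t left, right, none;
  fill(&left, 4096, 128, 64);  /* 8x8 leaf: 256*(4096-256)/64 = 15360 */
  fill(&right, 1024, 64, 64);  /* 8x8 leaf: 256*(1024-64)/64  =  3840 */
  sum2(&none, &left, &right);  /* merged region, count 128, variance 9664 */
  printf("%d %d %d\n", left.variance, right.variance, none.variance);
  return 0;
}
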
+
+#if PERFORM_RANDOM_PARTITIONING
+static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m,
+ BLOCK_SIZE_TYPE block_size, int mi_row,
+ int mi_col, int mi_size) {
+ VP9_COMMON * const cm = &cpi->common;
+ vt_node vt;
+ const int mis = cm->mode_info_stride;
+ int64_t threshold = 4 * cpi->common.base_qindex * cpi->common.base_qindex;
+
+ tree_to_node(data, block_size, &vt);
+
+  // the no-split (PARTITION_NONE) choice is available only if more than half
+  // the block, in both width and height, lies inside the visible image
+ if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows &&
+ (rand() & 3) < 1) {
+ set_block_size(cm, m, block_size, mis, mi_row, mi_col);
+ return 1;
+ }
+
+ // vertical split is available on all but the bottom border
+ if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold
+ && (rand() & 3) < 1) {
+ set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row,
+ mi_col);
+ return 1;
}
+ // horizontal split is available on all but the right border
+ if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold
+ && (rand() & 3) < 1) {
+ set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), mis, mi_row,
+ mi_col);
+ return 1;
+ }
+
+ return 0;
+}
+
+#else
+
+static int set_vt_partitioning(VP9_COMP *cpi, void *data, MODE_INFO *m,
+ BLOCK_SIZE_TYPE block_size, int mi_row,
+ int mi_col, int mi_size) {
+ VP9_COMMON * const cm = &cpi->common;
+ vt_node vt;
+ const int mis = cm->mode_info_stride;
+ int64_t threshold = 50 * cpi->common.base_qindex;
+
+ tree_to_node(data, block_size, &vt);
+
+  // the no-split (PARTITION_NONE) choice is available only if more than half
+  // the block, in both width and height, lies inside the visible image
+ if (mi_col + mi_size < cm->mi_cols && mi_row + mi_size < cm->mi_rows
+ && vt.vt->none.variance < threshold) {
+ set_block_size(cm, m, block_size, mis, mi_row, mi_col);
+ return 1;
+ }
+
+ // vertical split is available on all but the bottom border
+ if (mi_row + mi_size < cm->mi_rows && vt.vt->vert[0].variance < threshold
+ && vt.vt->vert[1].variance < threshold) {
+ set_block_size(cm, m, get_subsize(block_size, PARTITION_VERT), mis, mi_row,
+ mi_col);
+ return 1;
+ }
+
+ // horizontal split is available on all but the right border
+ if (mi_col + mi_size < cm->mi_cols && vt.vt->horz[0].variance < threshold
+ && vt.vt->horz[1].variance < threshold) {
+ set_block_size(cm, m, get_subsize(block_size, PARTITION_HORZ), mis, mi_row,
+ mi_col);
+ return 1;
+ }
+
+ return 0;
+}
+#endif
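
Note that the two variants gate on differently shaped thresholds: 4 * q^2 under PERFORM_RANDOM_PARTITIONING versus 50 * q otherwise, so which variant is more permissive flips with the quantizer. A quick comparison over hypothetical base_qindex values:

#include <stdio.h>

int main(void) {
  for (int q = 10; q <= 80; q += 35)  /* hypothetical base_qindex values */
    printf("q=%2d  random: %5d  deterministic: %4d\n", q, 4 * q * q, 50 * q);
  /* q=10:   400 vs  500 -- the deterministic threshold is looser
   * q=45:  8100 vs 2250 -- the quadratic (random) threshold is looser
   * q=80: 25600 vs 4000 */
  return 0;
}
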
+
static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row,
int mi_col) {
VP9_COMMON * const cm = &cpi->common;
@@ -993,8 +1092,8 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row,
v64x64 vt;
unsigned char * s;
int sp;
- const unsigned char * d = xd->plane[0].pre->buf;
- int dp = xd->plane[0].pre->stride;
+ const unsigned char * d;
+ int dp;
int pixels_wide = 64, pixels_high = 64;
vpx_memset(&vt, 0, sizeof(vt));
@@ -1014,81 +1113,89 @@ static void choose_partitioning(VP9_COMP *cpi, MODE_INFO *m, int mi_row,
// but this needs more experimentation.
threshold = threshold * cpi->common.base_qindex * cpi->common.base_qindex;
- // if ( cm->frame_type == KEY_FRAME ) {
d = vp9_64x64_zeros;
dp = 64;
- // }
+ if (cm->frame_type != KEY_FRAME) {
+ int_mv nearest_mv, near_mv;
+ YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[0];
+ YV12_BUFFER_CONFIG *second_ref_fb = NULL;
+
+ setup_pre_planes(xd, ref_fb, second_ref_fb, mi_row, mi_col,
+ xd->scale_factor, xd->scale_factor_uv);
+ xd->mode_info_context->mbmi.ref_frame[0] = LAST_FRAME;
+ xd->mode_info_context->mbmi.sb_type = BLOCK_SIZE_SB64X64;
+ vp9_find_best_ref_mvs(xd, m->mbmi.ref_mvs[m->mbmi.ref_frame[0]],
+ &nearest_mv, &near_mv);
+
+ xd->mode_info_context->mbmi.mv[0] = nearest_mv;
+ vp9_build_inter_predictors_sby(xd, mi_row, mi_col, BLOCK_SIZE_SB64X64);
+ d = xd->plane[0].dst.buf;
+ dp = xd->plane[0].dst.stride;
+
+ }
// Fill in the entire tree of 8x8 variances for splits.
for (i = 0; i < 4; i++) {
const int x32_idx = ((i & 1) << 5);
const int y32_idx = ((i >> 1) << 5);
for (j = 0; j < 4; j++) {
- const int x_idx = x32_idx + ((j & 1) << 4);
- const int y_idx = y32_idx + ((j >> 1) << 4);
- const uint8_t *st = s + y_idx * sp + x_idx;
- const uint8_t *dt = d + y_idx * dp + x_idx;
- unsigned int sse = 0;
- int sum = 0;
+ const int x16_idx = x32_idx + ((j & 1) << 4);
+ const int y16_idx = y32_idx + ((j >> 1) << 4);
v16x16 *vst = &vt.split[i].split[j];
- sse = sum = 0;
- if (x_idx < pixels_wide && y_idx < pixels_high)
- vp9_get_sse_sum_8x8(st, sp, dt, dp, &sse, &sum);
- fill_variance(&vst->split[0].none, sse, sum, 64);
- sse = sum = 0;
- if (x_idx + 8 < pixels_wide && y_idx < pixels_high)
- vp9_get_sse_sum_8x8(st + 8, sp, dt + 8, dp, &sse, &sum);
- fill_variance(&vst->split[1].none, sse, sum, 64);
- sse = sum = 0;
- if (x_idx < pixels_wide && y_idx + 8 < pixels_high)
- vp9_get_sse_sum_8x8(st + 8 * sp, sp, dt + 8 * dp, dp, &sse, &sum);
- fill_variance(&vst->split[2].none, sse, sum, 64);
- sse = sum = 0;
- if (x_idx + 8 < pixels_wide && y_idx + 8 < pixels_high)
- vp9_get_sse_sum_8x8(st + 8 * sp + 8, sp, dt + 8 + 8 * dp, dp, &sse,
- &sum);
- fill_variance(&vst->split[3].none, sse, sum, 64);
+ for (k = 0; k < 4; k++) {
+ int x_idx = x16_idx + ((k & 1) << 3);
+ int y_idx = y16_idx + ((k >> 1) << 3);
+ unsigned int sse = 0;
+ int sum = 0;
+ if (x_idx < pixels_wide && y_idx < pixels_high)
+ vp9_get_sse_sum_8x8(s + y_idx * sp + x_idx, sp,
+ d + y_idx * dp + x_idx, dp, &sse, &sum);
+ fill_variance(&vst->split[k].vt.none, sse, sum, 64);
+ }
}
}
// Fill the rest of the variance tree by summing the split partition
// values.
for (i = 0; i < 4; i++) {
for (j = 0; j < 4; j++) {
- fill_variance_tree(&vt.split[i].split[j])
+ fill_variance_tree(&vt.split[i].split[j], BLOCK_SIZE_MB16X16);
}
- fill_variance_tree(&vt.split[i])
+ fill_variance_tree(&vt.split[i], BLOCK_SIZE_SB32X32);
}
- fill_variance_tree(&vt)
-
- // Now go through the entire structure, splitting every blocksize until
+ fill_variance_tree(&vt, BLOCK_SIZE_SB64X64);
+ // Now go through the entire structure, splitting every block size until
// we get to one that's got a variance lower than our threshold, or we
// hit 8x8.
- set_vt_size( vt, BLOCK_SIZE_SB64X64, mi_row, mi_col, return);
- for (i = 0; i < 4; ++i) {
- const int x32_idx = ((i & 1) << 2);
- const int y32_idx = ((i >> 1) << 2);
- set_vt_size(vt, BLOCK_SIZE_SB32X32, mi_row + y32_idx, mi_col + x32_idx,
- continue);
-
- for (j = 0; j < 4; ++j) {
- const int x16_idx = ((j & 1) << 1);
- const int y16_idx = ((j >> 1) << 1);
- set_vt_size(vt, BLOCK_SIZE_MB16X16, mi_row + y32_idx + y16_idx,
- mi_col+x32_idx+x16_idx, continue);
-
- for (k = 0; k < 4; ++k) {
- const int x8_idx = (k & 1);
- const int y8_idx = (k >> 1);
- set_block_size(cm, m, BLOCK_SIZE_SB8X8, mis,
- mi_row + y32_idx + y16_idx + y8_idx,
- mi_col + x32_idx + x16_idx + x8_idx);
+ if (!set_vt_partitioning(cpi, &vt, m, BLOCK_SIZE_SB64X64, mi_row, mi_col,
+ 4)) {
+ for (i = 0; i < 4; ++i) {
+ const int x32_idx = ((i & 1) << 2);
+ const int y32_idx = ((i >> 1) << 2);
+ if (!set_vt_partitioning(cpi, &vt.split[i], m, BLOCK_SIZE_SB32X32,
+ (mi_row + y32_idx), (mi_col + x32_idx), 2)) {
+ for (j = 0; j < 4; ++j) {
+ const int x16_idx = ((j & 1) << 1);
+ const int y16_idx = ((j >> 1) << 1);
+ if (!set_vt_partitioning(cpi, &vt.split[i].split[j], m,
+ BLOCK_SIZE_MB16X16,
+ (mi_row + y32_idx + y16_idx),
+ (mi_col + x32_idx + x16_idx), 1)) {
+ for (k = 0; k < 4; ++k) {
+ const int x8_idx = (k & 1);
+ const int y8_idx = (k >> 1);
+ set_block_size(cm, m, BLOCK_SIZE_SB8X8, mis,
+ (mi_row + y32_idx + y16_idx + y8_idx),
+ (mi_col + x32_idx + x16_idx + x8_idx));
+ }
+ }
+ }
}
}
}
}
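
The i/j/k bit arithmetic in choose_partitioning decomposes the 64x64 superblock twice: shifts of 5/4/3 give pixel offsets for the SSE scan, and shifts of 2/1/0 give the matching mi-unit offsets for the descent (one mi unit is 8 pixels). A standalone consistency check for one arbitrary index combination:

#include <stdio.h>

int main(void) {
  const int i = 1, j = 2, k = 3;  /* arbitrary indices for illustration */
  /* pixel offsets, as in the 8x8 SSE/sum fill loop */
  const int x = ((i & 1) << 5) + ((j & 1) << 4) + ((k & 1) << 3);
  const int y = ((i >> 1) << 5) + ((j >> 1) << 4) + ((k >> 1) << 3);
  /* mi-unit offsets, as in the set_vt_partitioning descent */
  const int mi_x = ((i & 1) << 2) + ((j & 1) << 1) + (k & 1);
  const int mi_y = ((i >> 1) << 2) + ((j >> 1) << 1) + (k >> 1);
  printf("pixels (%d,%d)  mi units (%d,%d)\n", x, y, mi_x, mi_y);
  /* pixels (40,24), mi units (5,3): consistent, since 1 mi unit = 8 pixels */
  return 0;
}
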
static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize,
- int *rate, int *dist) {
+ int *rate, int64_t *dist) {
VP9_COMMON * const cm = &cpi->common;
MACROBLOCK * const x = &cpi->mb;
MACROBLOCKD *xd = &cpi->mb.e_mbd;
@@ -1098,18 +1205,18 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
int bsl = b_width_log2(bsize);
int bh = (1 << bhl);
int bs = (1 << bsl);
- int bss = (1 << bsl)/4;
+ int bss = (1 << bsl) / 4;
int i, pl;
PARTITION_TYPE partition;
BLOCK_SIZE_TYPE subsize;
ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
PARTITION_CONTEXT sl[8], sa[8];
- int r = 0, d = 0;
+ int r = 0;
+ int64_t d = 0;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
-
// parse the partition type
if ((bwl == bsl) && (bhl == bsl))
partition = PARTITION_NONE;
@@ -1124,18 +1231,15 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
subsize = get_subsize(bsize, partition);
- // TODO(JBB): this restriction is here because pick_sb_modes can return
- // r's that are INT_MAX meaning we can't select a mode / mv for this block.
- // when the code is made to work for less than sb8x8 we need to come up with
- // a solution to this problem.
- assert(subsize >= BLOCK_SIZE_SB8X8);
-
- if (bsize >= BLOCK_SIZE_SB8X8) {
- xd->left_seg_context = cm->left_seg_context + (mi_row & MI_MASK);
- xd->above_seg_context = cm->above_seg_context + mi_col;
+ if (bsize < BLOCK_SIZE_SB8X8) {
+ if (xd->ab_index != 0) {
+ *rate = 0;
+ *dist = 0;
+ return;
+ }
+ } else {
*(get_sb_partitioning(x, bsize)) = subsize;
}
-
pl = partition_plane_context(xd, bsize);
save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
switch (partition) {
@@ -1149,7 +1253,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize,
get_block_context(x, subsize));
if (mi_row + (bh >> 1) <= cm->mi_rows) {
- int rt, dt;
+ int rt;
+ int64_t dt;
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
*(get_sb_index(xd, subsize)) = 1;
@@ -1167,7 +1272,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, subsize,
get_block_context(x, subsize));
if (mi_col + (bs >> 1) <= cm->mi_cols) {
- int rt, dt;
+ int rt;
+ int64_t dt;
update_state(cpi, get_block_context(x, subsize), subsize, 0);
encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
*(get_sb_index(xd, subsize)) = 1;
@@ -1186,7 +1292,8 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
int x_idx = (i & 1) * (bs >> 2);
int y_idx = (i >> 1) * (bs >> 2);
int jj = i >> 1, ii = i & 0x01;
- int rt, dt;
+ int rt;
+ int64_t dt;
if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
continue;
@@ -1206,17 +1313,6 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
assert(0);
}
- // update partition context
-#if CONFIG_AB4X4
- if (bsize >= BLOCK_SIZE_SB8X8 &&
- (bsize == BLOCK_SIZE_SB8X8 || partition != PARTITION_SPLIT)) {
-#else
- if (bsize > BLOCK_SIZE_SB8X8
- && (bsize == BLOCK_SIZE_MB16X16 || partition != PARTITION_SPLIT)) {
-#endif
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- update_partition_context(xd, subsize, bsize);
- }
restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
if (r < INT_MAX && d < INT_MAX)
@@ -1229,21 +1325,21 @@ static void rd_use_partition(VP9_COMP *cpi, MODE_INFO *m, TOKENEXTRA **tp,
// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
// unlikely to be selected depending on previously rate-distortion optimization
// results, for encoding speed-up.
-static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
- int mi_row, int mi_col,
- BLOCK_SIZE_TYPE bsize,
- int *rate, int *dist) {
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
+static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row,
+ int mi_col, BLOCK_SIZE_TYPE bsize, int *rate,
+ int64_t *dist) {
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCK * const x = &cpi->mb;
+ MACROBLOCKD * const xd = &x->e_mbd;
int bsl = b_width_log2(bsize), bs = 1 << bsl;
int ms = bs / 2;
- ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+ ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
PARTITION_CONTEXT sl[8], sa[8];
TOKENEXTRA *tp_orig = *tp;
int i, pl;
BLOCK_SIZE_TYPE subsize;
- int srate = INT_MAX, sdist = INT_MAX;
+ int srate = INT_MAX;
+ int64_t sdist = INT_MAX;
if (bsize < BLOCK_SIZE_SB8X8)
if (xd->ab_index != 0) {
@@ -1256,121 +1352,132 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
save_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
// PARTITION_SPLIT
- if (bsize >= BLOCK_SIZE_SB8X8) {
- int r4 = 0, d4 = 0;
- subsize = get_subsize(bsize, PARTITION_SPLIT);
- *(get_sb_partitioning(x, bsize)) = subsize;
+ if (!cpi->sf.use_partitions_greater_than
+ || (cpi->sf.use_partitions_greater_than
+ && bsize > cpi->sf.greater_than_block_size)) {
+ if (bsize >= BLOCK_SIZE_SB8X8) {
+ int r4 = 0;
+ int64_t d4 = 0;
+ subsize = get_subsize(bsize, PARTITION_SPLIT);
+ *(get_sb_partitioning(x, bsize)) = subsize;
- for (i = 0; i < 4; ++i) {
- int x_idx = (i & 1) * (ms >> 1);
- int y_idx = (i >> 1) * (ms >> 1);
- int r = 0, d = 0;
+ for (i = 0; i < 4; ++i) {
+ int x_idx = (i & 1) * (ms >> 1);
+ int y_idx = (i >> 1) * (ms >> 1);
+ int r = 0;
+ int64_t d = 0;
- if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
- continue;
+ if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
+ continue;
- *(get_sb_index(xd, subsize)) = i;
- rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize,
- &r, &d);
+ *(get_sb_index(xd, subsize)) = i;
+ rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, &r,
+ &d);
- r4 += r;
- d4 += d;
+ r4 += r;
+ d4 += d;
+ }
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ if (r4 < INT_MAX)
+ r4 += x->partition_cost[pl][PARTITION_SPLIT];
+ assert(r4 >= 0);
+ assert(d4 >= 0);
+ srate = r4;
+ sdist = d4;
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
}
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
- if (r4 < INT_MAX)
- r4 += x->partition_cost[pl][PARTITION_SPLIT];
- assert(r4 >= 0);
- assert(d4 >= 0);
- srate = r4;
- sdist = d4;
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
}
-
- // PARTITION_HORZ
- if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
- int r2, d2;
- int r = 0, d = 0;
- subsize = get_subsize(bsize, PARTITION_HORZ);
- *(get_sb_index(xd, subsize)) = 0;
- pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
- get_block_context(x, subsize));
-
- if (mi_row + (ms >> 1) < cm->mi_rows) {
- update_state(cpi, get_block_context(x, subsize), subsize, 0);
- encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
-
- *(get_sb_index(xd, subsize)) = 1;
- pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &r, &d, subsize,
+ if (!cpi->sf.use_partitions_less_than
+ || (cpi->sf.use_partitions_less_than
+ && bsize <= cpi->sf.less_than_block_size)) {
+ // PARTITION_HORZ
+ if (bsize >= BLOCK_SIZE_SB8X8 && mi_col + (ms >> 1) < cm->mi_cols) {
+ int r2, r = 0;
+ int64_t d2, d = 0;
+ subsize = get_subsize(bsize, PARTITION_HORZ);
+ *(get_sb_index(xd, subsize)) = 0;
+ pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
get_block_context(x, subsize));
- r2 += r;
- d2 += d;
- }
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
- if (r2 < INT_MAX)
- r2 += x->partition_cost[pl][PARTITION_HORZ];
- if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
- RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
- srate = r2;
- sdist = d2;
- *(get_sb_partitioning(x, bsize)) = subsize;
+
+ if (mi_row + (ms >> 1) < cm->mi_rows) {
+ update_state(cpi, get_block_context(x, subsize), subsize, 0);
+ encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+
+ *(get_sb_index(xd, subsize)) = 1;
+ pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &r, &d, subsize,
+ get_block_context(x, subsize));
+ r2 += r;
+ d2 += d;
+ }
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ if (r2 < INT_MAX)
+ r2 += x->partition_cost[pl][PARTITION_HORZ];
+ if (RDCOST(x->rdmult, x->rddiv, r2, d2)
+ < RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+ srate = r2;
+ sdist = d2;
+ *(get_sb_partitioning(x, bsize)) = subsize;
+ }
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
}
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
- }
- // PARTITION_VERT
- if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) {
- int r2, d2;
- subsize = get_subsize(bsize, PARTITION_VERT);
- *(get_sb_index(xd, subsize)) = 0;
- pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
- get_block_context(x, subsize));
- if (mi_col + (ms >> 1) < cm->mi_cols) {
- int r = 0, d = 0;
- update_state(cpi, get_block_context(x, subsize), subsize, 0);
- encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
-
- *(get_sb_index(xd, subsize)) = 1;
- pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &r, &d, subsize,
+ // PARTITION_VERT
+ if (bsize >= BLOCK_SIZE_SB8X8 && mi_row + (ms >> 1) < cm->mi_rows) {
+ int r2;
+ int64_t d2;
+ subsize = get_subsize(bsize, PARTITION_VERT);
+ *(get_sb_index(xd, subsize)) = 0;
+ pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
get_block_context(x, subsize));
- r2 += r;
- d2 += d;
- }
- set_partition_seg_context(cm, xd, mi_row, mi_col);
- pl = partition_plane_context(xd, bsize);
- if (r2 < INT_MAX)
- r2 += x->partition_cost[pl][PARTITION_VERT];
- if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
- RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
- srate = r2;
- sdist = d2;
- *(get_sb_partitioning(x, bsize)) = subsize;
- }
- restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
- }
+ if (mi_col + (ms >> 1) < cm->mi_cols) {
+ int r = 0;
+ int64_t d = 0;
+ update_state(cpi, get_block_context(x, subsize), subsize, 0);
+ encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
- // PARTITION_NONE
- if ((mi_row + (ms >> 1) < cm->mi_rows) &&
- (mi_col + (ms >> 1) < cm->mi_cols)) {
- int r, d;
- pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize,
- get_block_context(x, bsize));
- if (bsize >= BLOCK_SIZE_SB8X8) {
+ *(get_sb_index(xd, subsize)) = 1;
+ pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &r, &d, subsize,
+ get_block_context(x, subsize));
+ r2 += r;
+ d2 += d;
+ }
set_partition_seg_context(cm, xd, mi_row, mi_col);
pl = partition_plane_context(xd, bsize);
- r += x->partition_cost[pl][PARTITION_NONE];
+ if (r2 < INT_MAX)
+ r2 += x->partition_cost[pl][PARTITION_VERT];
+ if (RDCOST(x->rdmult, x->rddiv, r2, d2)
+ < RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+ srate = r2;
+ sdist = d2;
+ *(get_sb_partitioning(x, bsize)) = subsize;
+ }
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
}
- if (RDCOST(x->rdmult, x->rddiv, r, d) <
- RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
- srate = r;
- sdist = d;
- if (bsize >= BLOCK_SIZE_SB8X8)
- *(get_sb_partitioning(x, bsize)) = bsize;
+ // PARTITION_NONE
+ if ((mi_row + (ms >> 1) < cm->mi_rows) &&
+ (mi_col + (ms >> 1) < cm->mi_cols)) {
+ int r;
+ int64_t d;
+ pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize,
+ get_block_context(x, bsize));
+ if (bsize >= BLOCK_SIZE_SB8X8) {
+ set_partition_seg_context(cm, xd, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ r += x->partition_cost[pl][PARTITION_NONE];
+ }
+
+ if (RDCOST(x->rdmult, x->rddiv, r, d)
+ < RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+ srate = r;
+ sdist = d;
+ if (bsize >= BLOCK_SIZE_SB8X8)
+ *(get_sb_partitioning(x, bsize)) = bsize;
+ }
}
}
-
*rate = srate;
*dist = sdist;
@@ -1388,9 +1495,9 @@ static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
}
}
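
Every PARTITION_* candidate above competes through RDCOST(x->rdmult, x->rddiv, rate, dist). As a hedged sketch only: the helper below models a typical lambda-weighted rate term plus a shifted distortion term; rdcost_sketch, its rounding, and the parameter values are illustrative assumptions, not the macro defined in vp9_rdopt.h.

#include <stdint.h>
#include <stdio.h>

/* illustrative stand-in: cost = rate scaled by the rate multiplier,
 * plus distortion scaled by a shift */
static int64_t rdcost_sketch(int rdmult, int rddiv, int rate, int64_t dist) {
  return ((128 + (int64_t)rate * rdmult) >> 8) + (dist << rddiv);
}

int main(void) {
  /* hypothetical numbers: the horizontal split saves distortion but costs
   * extra rate; the comparison picks whichever total is smaller */
  const int rdmult = 300, rddiv = 2;  /* assumed lambda parameters */
  int64_t none = rdcost_sketch(rdmult, rddiv, 900, 5000);   /* 21055 */
  int64_t horz = rdcost_sketch(rdmult, rddiv, 1400, 3200);  /* 14441 */
  printf("none=%lld horz=%lld -> %s\n", (long long)none, (long long)horz,
         horz < none ? "PARTITION_HORZ" : "PARTITION_NONE");
  return 0;
}
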
-static void encode_sb_row(VP9_COMP *cpi, int mi_row,
- TOKENEXTRA **tp, int *totalrate) {
- VP9_COMMON *const cm = &cpi->common;
+static void encode_sb_row(VP9_COMP *cpi, int mi_row, TOKENEXTRA **tp,
+ int *totalrate) {
+ VP9_COMMON * const cm = &cpi->common;
int mi_col;
// Initialize the left context for the new SB row
@@ -1398,27 +1505,49 @@ static void encode_sb_row(VP9_COMP *cpi, int mi_row,
vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context));
// Code each SB in the row
- for (mi_col = cm->cur_tile_mi_col_start;
- mi_col < cm->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE) {
- int dummy_rate, dummy_dist;
- if (cpi->speed < 5) {
- rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
- &dummy_rate, &dummy_dist);
- } else {
+ for (mi_col = cm->cur_tile_mi_col_start; mi_col < cm->cur_tile_mi_col_end;
+ mi_col += 64 / MI_SIZE) {
+ int dummy_rate;
+ int64_t dummy_dist;
+ if (cpi->sf.partition_by_variance || cpi->sf.use_lastframe_partitioning ||
+        cpi->sf.use_one_partition_size_always) {
const int idx_str = cm->mode_info_stride * mi_row + mi_col;
MODE_INFO *m = cm->mi + idx_str;
- // set_partitioning(cpi, m, BLOCK_SIZE_SB64X64);
- choose_partitioning(cpi, cm->mi, mi_row, mi_col);
- rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
- &dummy_rate, &dummy_dist);
+ MODE_INFO *p = cm->prev_mi + idx_str;
+
+ if (cpi->sf.use_one_partition_size_always) {
+ set_offsets(cpi, mi_row, mi_col, BLOCK_SIZE_SB64X64);
+ set_partitioning(cpi, m, cpi->sf.always_this_block_size);
+ rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ &dummy_rate, &dummy_dist);
+ } else if (cpi->sf.partition_by_variance) {
+ choose_partitioning(cpi, cm->mi, mi_row, mi_col);
+ rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ &dummy_rate, &dummy_dist);
+ } else {
+ if ((cpi->common.current_video_frame & 1) == 0 || cm->prev_mi == 0
+ || cpi->common.show_frame == 0
+ || cpi->common.frame_type == KEY_FRAME
+ || cpi->is_src_frame_alt_ref) {
+ rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ &dummy_rate, &dummy_dist);
+ } else {
+ copy_partitioning(cpi, m, p);
+ rd_use_partition(cpi, m, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ &dummy_rate, &dummy_dist);
+ }
+ }
+ } else {
+ rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ &dummy_rate, &dummy_dist);
}
}
}
static void init_encode_frame_mb_context(VP9_COMP *cpi) {
- MACROBLOCK *const x = &cpi->mb;
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &x->e_mbd;
+ MACROBLOCK * const x = &cpi->mb;
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCKD * const xd = &x->e_mbd;
x->act_zbin_adj = 0;
cpi->seg0_idx = 0;
@@ -1438,7 +1567,7 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
// TODO(jkoleszar): are these initializations required?
setup_pre_planes(xd, &cm->yv12_fb[cm->ref_frame_map[cpi->lst_fb_idx]], NULL,
- 0, 0, NULL, NULL);
+                   0, 0, NULL, NULL);
setup_dst_planes(xd, &cm->yv12_fb[cm->new_fb_idx], 0, 0);
vp9_build_block_offsets(x);
@@ -1463,36 +1592,36 @@ static void init_encode_frame_mb_context(VP9_COMP *cpi) {
// Note: this memset assumes above_context[0], [1] and [2]
// are allocated as part of the same buffer.
- vpx_memset(cm->above_context[0], 0, sizeof(ENTROPY_CONTEXT) * 2 *
- MAX_MB_PLANE * mi_cols_aligned_to_sb(cm));
- vpx_memset(cm->above_seg_context, 0, sizeof(PARTITION_CONTEXT) *
- mi_cols_aligned_to_sb(cm));
+ vpx_memset(
+ cm->above_context[0], 0,
+ sizeof(ENTROPY_CONTEXT) * 2 * MAX_MB_PLANE * mi_cols_aligned_to_sb(cm));
+ vpx_memset(cm->above_seg_context, 0,
+ sizeof(PARTITION_CONTEXT) * mi_cols_aligned_to_sb(cm));
}
static void switch_lossless_mode(VP9_COMP *cpi, int lossless) {
if (lossless) {
- cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4;
- cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
- cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
- cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;
- cpi->mb.optimize = 0;
- cpi->common.filter_level = 0;
- cpi->zbin_mode_boost_enabled = 0;
- cpi->common.txfm_mode = ONLY_4X4;
+ cpi->mb.fwd_txm8x4 = vp9_short_walsh8x4;
+ cpi->mb.fwd_txm4x4 = vp9_short_walsh4x4;
+ cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_iwalsh4x4_1_add;
+ cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_iwalsh4x4_add;
+ cpi->mb.optimize = 0;
+ cpi->common.filter_level = 0;
+ cpi->zbin_mode_boost_enabled = 0;
+ cpi->common.txfm_mode = ONLY_4X4;
} else {
- cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4;
- cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
- cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
- cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add;
+ cpi->mb.fwd_txm8x4 = vp9_short_fdct8x4;
+ cpi->mb.fwd_txm4x4 = vp9_short_fdct4x4;
+ cpi->mb.e_mbd.inv_txm4x4_1_add = vp9_short_idct4x4_1_add;
+ cpi->mb.e_mbd.inv_txm4x4_add = vp9_short_idct4x4_add;
}
}
-
static void encode_frame_internal(VP9_COMP *cpi) {
int mi_row;
- MACROBLOCK *const x = &cpi->mb;
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCKD *const xd = &x->e_mbd;
+ MACROBLOCK * const x = &cpi->mb;
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCKD * const xd = &x->e_mbd;
int totalrate;
// fprintf(stderr, "encode_frame_internal frame %d (%d) type %d\n",
@@ -1524,10 +1653,8 @@ static void encode_frame_internal(VP9_COMP *cpi) {
vp9_zero(cpi->coef_counts);
vp9_zero(cm->fc.eob_branch_counts);
- cpi->mb.e_mbd.lossless = cm->base_qindex == 0 &&
- cm->y_dc_delta_q == 0 &&
- cm->uv_dc_delta_q == 0 &&
- cm->uv_ac_delta_q == 0;
+ cpi->mb.e_mbd.lossless = cm->base_qindex == 0 && cm->y_dc_delta_q == 0
+ && cm->uv_dc_delta_q == 0 && cm->uv_ac_delta_q == 0;
switch_lossless_mode(cpi, cpi->mb.e_mbd.lossless);
vp9_frame_init_quantizer(cpi);
@@ -1553,7 +1680,7 @@ static void encode_frame_internal(VP9_COMP *cpi) {
set_prev_mi(cm);
{
- struct vpx_usec_timer emr_timer;
+ struct vpx_usec_timer emr_timer;
vpx_usec_timer_start(&emr_timer);
{
@@ -1570,9 +1697,9 @@ static void encode_frame_internal(VP9_COMP *cpi) {
// For each row of SBs in the frame
vp9_get_tile_col_offsets(cm, tile_col);
for (mi_row = cm->cur_tile_mi_row_start;
- mi_row < cm->cur_tile_mi_row_end;
- mi_row += 8)
+ mi_row < cm->cur_tile_mi_row_end; mi_row += 8)
encode_sb_row(cpi, mi_row, &tp, &totalrate);
+
cpi->tok_count[tile_row][tile_col] = (unsigned int)(tp - tp_old);
assert(tp - cpi->tok <=
get_token_alloc(cm->mb_rows, cm->mb_cols));
@@ -1602,9 +1729,8 @@ static int check_dual_ref_flags(VP9_COMP *cpi) {
if (vp9_segfeature_active(xd, 1, SEG_LVL_REF_FRAME)) {
return 0;
} else {
- return (!!(ref_flags & VP9_GOLD_FLAG) +
- !!(ref_flags & VP9_LAST_FLAG) +
- !!(ref_flags & VP9_ALT_FLAG)) >= 2;
+ return (!!(ref_flags & VP9_GOLD_FLAG) + !!(ref_flags & VP9_LAST_FLAG)
+ + !!(ref_flags & VP9_ALT_FLAG)) >= 2;
}
}
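
check_dual_ref_flags above reduces to counting set reference flags. A minimal sketch, assuming one-bit-per-reference flag values (GOLD_FLAG/LAST_FLAG/ALT_FLAG below are illustrative stand-ins, not the VP9_* constants):

#include <stdio.h>

/* assumed flag values for illustration */
#define GOLD_FLAG 1
#define LAST_FLAG 2
#define ALT_FLAG  4

int main(void) {
  const int ref_flags = GOLD_FLAG | LAST_FLAG;
  const int n = !!(ref_flags & GOLD_FLAG) + !!(ref_flags & LAST_FLAG) +
                !!(ref_flags & ALT_FLAG);
  printf("%d active refs -> dual_ref=%d\n", n, n >= 2);  /* 2 -> dual_ref=1 */
  return 0;
}
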
@@ -1631,35 +1757,33 @@ static void set_txfm_flag(MODE_INFO *mi, int mis, int ymbs, int xmbs,
}
}
-static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi,
- int mis, TX_SIZE txfm_max,
- int bw, int bh, int mi_row, int mi_col,
- BLOCK_SIZE_TYPE bsize) {
- VP9_COMMON *const cm = &cpi->common;
- MB_MODE_INFO *const mbmi = &mi->mbmi;
+static void reset_skip_txfm_size_b(VP9_COMP *cpi, MODE_INFO *mi, int mis,
+ TX_SIZE txfm_max, int bw, int bh, int mi_row,
+ int mi_col, BLOCK_SIZE_TYPE bsize) {
+ VP9_COMMON * const cm = &cpi->common;
+ MB_MODE_INFO * const mbmi = &mi->mbmi;
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
if (mbmi->txfm_size > txfm_max) {
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
+ MACROBLOCK * const x = &cpi->mb;
+ MACROBLOCKD * const xd = &x->e_mbd;
const int segment_id = mbmi->segment_id;
const int ymbs = MIN(bh, cm->mi_rows - mi_row);
const int xmbs = MIN(bw, cm->mi_cols - mi_col);
xd->mode_info_context = mi;
- assert(vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) ||
- get_skip_flag(mi, mis, ymbs, xmbs));
+    assert(vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP) ||
+           get_skip_flag(mi, mis, ymbs, xmbs));
set_txfm_flag(mi, mis, ymbs, xmbs, txfm_max);
}
}
static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi,
- TX_SIZE txfm_max,
- int mi_row, int mi_col,
+ TX_SIZE txfm_max, int mi_row, int mi_col,
BLOCK_SIZE_TYPE bsize) {
- VP9_COMMON *const cm = &cpi->common;
+ VP9_COMMON * const cm = &cpi->common;
const int mis = cm->mode_info_stride;
int bwl, bhl;
const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1);
@@ -1671,18 +1795,18 @@ static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi,
bhl = mi_height_log2(mi->mbmi.sb_type);
if (bwl == bsl && bhl == bsl) {
- reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl,
- mi_row, mi_col, bsize);
+ reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, 1 << bsl, mi_row,
+ mi_col, bsize);
} else if (bwl == bsl && bhl < bsl) {
- reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs,
- mi_row, mi_col, bsize);
+ reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, 1 << bsl, bs, mi_row, mi_col,
+ bsize);
reset_skip_txfm_size_b(cpi, mi + bs * mis, mis, txfm_max, 1 << bsl, bs,
mi_row + bs, mi_col, bsize);
} else if (bwl < bsl && bhl == bsl) {
- reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl,
- mi_row, mi_col, bsize);
- reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl,
- mi_row, mi_col + bs, bsize);
+ reset_skip_txfm_size_b(cpi, mi, mis, txfm_max, bs, 1 << bsl, mi_row, mi_col,
+ bsize);
+ reset_skip_txfm_size_b(cpi, mi + bs, mis, txfm_max, bs, 1 << bsl, mi_row,
+ mi_col + bs, bsize);
} else {
BLOCK_SIZE_TYPE subsize;
int n;
@@ -1700,32 +1824,30 @@ static void reset_skip_txfm_size_sb(VP9_COMP *cpi, MODE_INFO *mi,
for (n = 0; n < 4; n++) {
const int y_idx = n >> 1, x_idx = n & 0x01;
- reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs,
- txfm_max, mi_row + y_idx * bs,
- mi_col + x_idx * bs, subsize);
+ reset_skip_txfm_size_sb(cpi, mi + y_idx * bs * mis + x_idx * bs, txfm_max,
+ mi_row + y_idx * bs, mi_col + x_idx * bs,
+ subsize);
}
}
}
static void reset_skip_txfm_size(VP9_COMP *cpi, TX_SIZE txfm_max) {
- VP9_COMMON *const cm = &cpi->common;
+ VP9_COMMON * const cm = &cpi->common;
int mi_row, mi_col;
const int mis = cm->mode_info_stride;
MODE_INFO *mi, *mi_ptr = cm->mi;
- for (mi_row = 0; mi_row < cm->mi_rows;
- mi_row += 8, mi_ptr += 8 * mis) {
+ for (mi_row = 0; mi_row < cm->mi_rows; mi_row += 8, mi_ptr += 8 * mis) {
mi = mi_ptr;
- for (mi_col = 0; mi_col < cm->mi_cols;
- mi_col += 8, mi += 8) {
- reset_skip_txfm_size_sb(cpi, mi, txfm_max,
- mi_row, mi_col, BLOCK_SIZE_SB64X64);
+ for (mi_col = 0; mi_col < cm->mi_cols; mi_col += 8, mi += 8) {
+ reset_skip_txfm_size_sb(cpi, mi, txfm_max, mi_row, mi_col,
+ BLOCK_SIZE_SB64X64);
}
}
}
void vp9_encode_frame(VP9_COMP *cpi) {
- VP9_COMMON *const cm = &cpi->common;
+ VP9_COMMON * const cm = &cpi->common;
// In the longer term the encoder should be generalized to match the
// decoder such that we allow compound where one of the 3 buffers has a
@@ -1733,10 +1855,10 @@ void vp9_encode_frame(VP9_COMP *cpi) {
// requires further work in the rd loop. For now the only supported encoder
 // side behaviour is where the ALT ref buffer has opposite sign bias to
// the other two.
- if ((cm->ref_frame_sign_bias[ALTREF_FRAME] ==
- cm->ref_frame_sign_bias[GOLDEN_FRAME]) ||
- (cm->ref_frame_sign_bias[ALTREF_FRAME] ==
- cm->ref_frame_sign_bias[LAST_FRAME])) {
+ if ((cm->ref_frame_sign_bias[ALTREF_FRAME]
+ == cm->ref_frame_sign_bias[GOLDEN_FRAME])
+ || (cm->ref_frame_sign_bias[ALTREF_FRAME]
+ == cm->ref_frame_sign_bias[LAST_FRAME])) {
cm->allow_comp_inter_inter = 0;
} else {
cm->allow_comp_inter_inter = 1;
@@ -1770,14 +1892,14 @@ void vp9_encode_frame(VP9_COMP *cpi) {
/* prediction (compound, single or hybrid) mode selection */
if (frame_type == 3 || !cm->allow_comp_inter_inter)
pred_type = SINGLE_PREDICTION_ONLY;
- else if (cpi->rd_prediction_type_threshes[frame_type][1] >
- cpi->rd_prediction_type_threshes[frame_type][0] &&
- cpi->rd_prediction_type_threshes[frame_type][1] >
- cpi->rd_prediction_type_threshes[frame_type][2] &&
- check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
+ else if (cpi->rd_prediction_type_threshes[frame_type][1]
+ > cpi->rd_prediction_type_threshes[frame_type][0]
+ && cpi->rd_prediction_type_threshes[frame_type][1]
+ > cpi->rd_prediction_type_threshes[frame_type][2]
+ && check_dual_ref_flags(cpi) && cpi->static_mb_pct == 100)
pred_type = COMP_PREDICTION_ONLY;
- else if (cpi->rd_prediction_type_threshes[frame_type][0] >
- cpi->rd_prediction_type_threshes[frame_type][2])
+ else if (cpi->rd_prediction_type_threshes[frame_type][0]
+ > cpi->rd_prediction_type_threshes[frame_type][2])
pred_type = SINGLE_PREDICTION_ONLY;
else
pred_type = HYBRID_PREDICTION;
@@ -1790,43 +1912,44 @@ void vp9_encode_frame(VP9_COMP *cpi) {
cpi->mb.e_mbd.lossless = 1;
} else
#if 0
- /* FIXME (rbultje): this code is disabled until we support cost updates
- * while a frame is being encoded; the problem is that each time we
- * "revert" to 4x4 only (or even 8x8 only), the coefficient probabilities
- * for 16x16 (and 8x8) start lagging behind, thus leading to them lagging
- * further behind and not being chosen for subsequent frames either. This
- * is essentially a local minimum problem that we can probably fix by
- * estimating real costs more closely within a frame, perhaps by re-
- * calculating costs on-the-fly as frame encoding progresses. */
- if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
- cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] &&
- cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
- cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] &&
- cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
- cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
- txfm_type = TX_MODE_SELECT;
- } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
- cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]
- && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
- cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16]
- ) {
- txfm_type = ONLY_4X4;
- } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
- cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
- txfm_type = ALLOW_16X16;
- } else
+ /* FIXME (rbultje): this code is disabled until we support cost updates
+ * while a frame is being encoded; the problem is that each time we
+ * "revert" to 4x4 only (or even 8x8 only), the coefficient probabilities
+ * for 16x16 (and 8x8) start lagging behind, thus leading to them lagging
+ * further behind and not being chosen for subsequent frames either. This
+ * is essentially a local minimum problem that we can probably fix by
+ * estimating real costs more closely within a frame, perhaps by re-
+ * calculating costs on-the-fly as frame encoding progresses. */
+ if (cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
+ cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] &&
+ cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
+ cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] &&
+ cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] >
+ cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
+ txfm_type = TX_MODE_SELECT;
+ } else if (cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
+ cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]
+ && cpi->rd_tx_select_threshes[frame_type][ONLY_4X4] >
+ cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16]
+ ) {
+ txfm_type = ONLY_4X4;
+ } else if (cpi->rd_tx_select_threshes[frame_type][ALLOW_16X16] >=
+ cpi->rd_tx_select_threshes[frame_type][ALLOW_8X8]) {
+ txfm_type = ALLOW_16X16;
+ } else
txfm_type = ALLOW_8X8;
#else
- txfm_type = cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32] >
- cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
- ALLOW_32X32 : TX_MODE_SELECT;
+ txfm_type =
+ cpi->rd_tx_select_threshes[frame_type][ALLOW_32X32]
+ > cpi->rd_tx_select_threshes[frame_type][TX_MODE_SELECT] ?
+ ALLOW_32X32 : TX_MODE_SELECT;
#endif
cpi->common.txfm_mode = txfm_type;
cpi->common.comp_pred_mode = pred_type;
encode_frame_internal(cpi);
for (i = 0; i < NB_PREDICTION_TYPES; ++i) {
- const int diff = (int)(cpi->rd_comp_pred_diff[i] / cpi->common.MBs);
+ const int diff = (int) (cpi->rd_comp_pred_diff[i] / cpi->common.MBs);
cpi->rd_prediction_type_threshes[frame_type][i] += diff;
cpi->rd_prediction_type_threshes[frame_type][i] >>= 1;
}
@@ -1836,8 +1959,8 @@ void vp9_encode_frame(VP9_COMP *cpi) {
int diff;
if (i == TX_MODE_SELECT)
pd -= RDCOST(cpi->mb.rdmult, cpi->mb.rddiv,
- 2048 * (TX_SIZE_MAX_SB - 1), 0);
- diff = (int)(pd / cpi->common.MBs);
+ 2048 * (TX_SIZE_MAX_SB - 1), 0);
+ diff = (int) (pd / cpi->common.MBs);
cpi->rd_tx_select_threshes[frame_type][i] += diff;
cpi->rd_tx_select_threshes[frame_type][i] /= 2;
}
@@ -1890,12 +2013,12 @@ void vp9_encode_frame(VP9_COMP *cpi) {
for (i = 0; i < TX_SIZE_CONTEXTS; i++)
count32x32 += cm->fc.tx_count_32x32p[i][TX_32X32];
- if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
- count32x32 == 0) {
+ if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0
+ && count32x32 == 0) {
cpi->common.txfm_mode = ALLOW_8X8;
reset_skip_txfm_size(cpi, TX_8X8);
- } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
- count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
+ } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0
+ && count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
cpi->common.txfm_mode = ONLY_4X4;
reset_skip_txfm_size(cpi, TX_4X4);
} else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
@@ -1957,18 +2080,17 @@ static void adjust_act_zbin(VP9_COMP *cpi, MACROBLOCK *x) {
b = 4 * act + cpi->activity_avg;
if (act > cpi->activity_avg)
- x->act_zbin_adj = (int)(((int64_t)b + (a >> 1)) / a) - 1;
+ x->act_zbin_adj = (int) (((int64_t) b + (a >> 1)) / a) - 1;
else
- x->act_zbin_adj = 1 - (int)(((int64_t)a + (b >> 1)) / b);
+ x->act_zbin_adj = 1 - (int) (((int64_t) a + (b >> 1)) / b);
#endif
}
-static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
- int output_enabled, int mi_row, int mi_col,
- BLOCK_SIZE_TYPE bsize) {
- VP9_COMMON *const cm = &cpi->common;
- MACROBLOCK *const x = &cpi->mb;
- MACROBLOCKD *const xd = &x->e_mbd;
+static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t, int output_enabled,
+ int mi_row, int mi_col, BLOCK_SIZE_TYPE bsize) {
+ VP9_COMMON * const cm = &cpi->common;
+ MACROBLOCK * const x = &cpi->mb;
+ MACROBLOCKD * const xd = &x->e_mbd;
int n;
MODE_INFO *mi = xd->mode_info_context;
MB_MODE_INFO *mbmi = &mi->mbmi;
@@ -2015,10 +2137,10 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
}
if (mbmi->ref_frame[0] == INTRA_FRAME) {
- vp9_encode_intra_block_y(cm, x, (bsize < BLOCK_SIZE_SB8X8) ?
- BLOCK_SIZE_SB8X8 : bsize);
- vp9_encode_intra_block_uv(cm, x, (bsize < BLOCK_SIZE_SB8X8) ?
- BLOCK_SIZE_SB8X8 : bsize);
+ vp9_encode_intra_block_y(
+ cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
+ vp9_encode_intra_block_uv(
+ cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
if (output_enabled)
sum_intra_stats(cpi, x);
} else {
@@ -2032,12 +2154,12 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
assert(cm->frame_type != KEY_FRAME);
- setup_pre_planes(xd, ref_fb, second_ref_fb,
- mi_row, mi_col, xd->scale_factor, xd->scale_factor_uv);
+ setup_pre_planes(xd, ref_fb, second_ref_fb, mi_row, mi_col,
+ xd->scale_factor, xd->scale_factor_uv);
- vp9_build_inter_predictors_sb(xd, mi_row, mi_col,
- bsize < BLOCK_SIZE_SB8X8 ? BLOCK_SIZE_SB8X8
- : bsize);
+ vp9_build_inter_predictors_sb(
+ xd, mi_row, mi_col,
+ bsize < BLOCK_SIZE_SB8X8 ? BLOCK_SIZE_SB8X8 : bsize);
}
if (xd->mode_info_context->mbmi.ref_frame[0] == INTRA_FRAME) {
@@ -2049,14 +2171,14 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
(bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
} else {
// FIXME(rbultje): not tile-aware (mi - 1)
- int mb_skip_context =
- (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff;
+ int mb_skip_context = (mi - 1)->mbmi.mb_skip_coeff
+ + (mi - mis)->mbmi.mb_skip_coeff;
mbmi->mb_skip_coeff = 1;
if (output_enabled)
cm->fc.mbskip_count[mb_skip_context][1]++;
- vp9_reset_sb_tokens_context(xd,
- (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
+ vp9_reset_sb_tokens_context(
+ xd, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
}
// copy skip flag on all mb_mode_info contexts in this SB
@@ -2068,10 +2190,10 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
}
if (output_enabled) {
- if (cm->txfm_mode == TX_MODE_SELECT &&
- mbmi->sb_type >= BLOCK_SIZE_SB8X8 &&
- !(mbmi->ref_frame[0] != INTRA_FRAME && (mbmi->mb_skip_coeff ||
- vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
+ if (cm->txfm_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_SIZE_SB8X8
+ && !(mbmi->ref_frame[0] != INTRA_FRAME
+ && (mbmi->mb_skip_coeff
+ || vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP)))) {
const int context = vp9_get_pred_context(cm, xd, PRED_TX_SIZE);
if (bsize >= BLOCK_SIZE_SB32X32) {
cm->fc.tx_count_32x32p[context][mbmi->txfm_size]++;
@@ -2083,7 +2205,7 @@ static void encode_superblock(VP9_COMP *cpi, TOKENEXTRA **t,
} else {
int x, y;
TX_SIZE sz = (cm->txfm_mode == TX_MODE_SELECT) ? TX_32X32 : cm->txfm_mode;
- // The new intra coding scheme requires no change of transform size
+ // The new intra coding scheme requires no change of transform size
if (mi->mbmi.ref_frame[0] != INTRA_FRAME) {
if (sz == TX_32X32 && bsize < BLOCK_SIZE_SB32X32)
sz = TX_16X16;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 4f45496df..2f133ccbc 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -22,10 +22,10 @@
DECLARE_ALIGNED(16, extern const uint8_t,
vp9_pt_energy_class[MAX_ENTROPY_TOKENS]);
-void vp9_subtract_block(int rows, int cols,
- int16_t *diff_ptr, int diff_stride,
- const uint8_t *src_ptr, int src_stride,
- const uint8_t *pred_ptr, int pred_stride) {
+void vp9_subtract_block_c(int rows, int cols,
+ int16_t *diff_ptr, ptrdiff_t diff_stride,
+ const uint8_t *src_ptr, ptrdiff_t src_stride,
+ const uint8_t *pred_ptr, ptrdiff_t pred_stride) {
int r, c;
for (r = 0; r < rows; r++) {
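
The hunk ends before the loop body, but the renamed vp9_subtract_block_c computes the plain per-pixel residual src - pred into an int16_t buffer. A self-contained sketch under that assumption, using the new ptrdiff_t strides (buffer sizes below are illustrative):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static void subtract_block(int rows, int cols,
                           int16_t *diff, ptrdiff_t diff_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           const uint8_t *pred, ptrdiff_t pred_stride) {
  for (int r = 0; r < rows; r++)
    for (int c = 0; c < cols; c++)
      diff[r * diff_stride + c] =
          (int16_t)(src[r * src_stride + c] - pred[r * pred_stride + c]);
}

int main(void) {
  const uint8_t src[4] = {10, 20, 30, 40}, pred[4] = {12, 18, 30, 45};
  int16_t diff[4];
  subtract_block(2, 2, diff, 2, src, 2, pred, 2);
  printf("%d %d %d %d\n", diff[0], diff[1], diff[2], diff[3]); /* -2 2 0 -5 */
  return 0;
}
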
diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 579690346..3042c9f7f 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h
@@ -42,10 +42,6 @@ void vp9_encode_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
void vp9_xform_quant_sby(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
void vp9_xform_quant_sbuv(VP9_COMMON *cm, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
-void vp9_subtract_block(int rows, int cols,
- int16_t *diff_ptr, int diff_stride,
- const uint8_t *src_ptr, int src_stride,
- const uint8_t *pred_ptr, int pred_stride);
void vp9_subtract_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize);
void vp9_subtract_sb(MACROBLOCK *xd, BLOCK_SIZE_TYPE bsize);
diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index a582d183d..ea6aa296a 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c
@@ -541,7 +541,7 @@ void vp9_encode_mv(vp9_writer* w, const MV* mv, const MV* ref,
const MV diff = {mv->row - ref->row,
mv->col - ref->col};
const MV_JOINT_TYPE j = vp9_get_mv_joint(&diff);
- usehp = usehp && vp9_use_nmv_hp(ref);
+ usehp = usehp && vp9_use_mv_hp(ref);
write_token(w, vp9_mv_joint_tree, mvctx->joints, &vp9_mv_joint_encodings[j]);
if (mv_joint_vertical(j))
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 5e26cd82a..522f89982 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -986,9 +986,11 @@ static int estimate_max_q(VP9_COMP *cpi,
// Corrections for higher compression speed settings
// (reduced compression expected)
+ // FIXME(jimbankoski): Once we settle on vp9 speed features we need to
+ // change this code.
if (cpi->compressor_speed == 1)
speed_correction = cpi->oxcf.cpu_used <= 5 ?
- 1.04 + (cpi->oxcf.cpu_used * 0.04) :
+ 1.04 + (/*cpi->oxcf.cpu_used*/0 * 0.04) :
1.25;
// Try and pick a max Q that will be high enough to encode the
@@ -1051,7 +1053,7 @@ static int estimate_cq(VP9_COMP *cpi,
// (reduced compression expected)
if (cpi->compressor_speed == 1) {
if (cpi->oxcf.cpu_used <= 5)
- speed_correction = 1.04 + (cpi->oxcf.cpu_used * 0.04);
+ speed_correction = 1.04 + (/*cpi->oxcf.cpu_used*/ 0 * 0.04);
else
speed_correction = 1.25;
}
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 2e99736ce..0f1062313 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -366,7 +366,7 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x,
}
if (xd->allow_high_precision_mv) {
- usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+ usehp = vp9_use_mv_hp(&ref_mv->as_mv);
} else {
usehp = 0;
}
@@ -556,7 +556,7 @@ int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
}
if (xd->allow_high_precision_mv) {
- usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+ usehp = vp9_use_mv_hp(&ref_mv->as_mv);
} else {
usehp = 0;
}
@@ -930,7 +930,7 @@ int vp9_find_best_sub_pixel_step(MACROBLOCK *x,
}
if (x->e_mbd.allow_high_precision_mv) {
- usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+ usehp = vp9_use_mv_hp(&ref_mv->as_mv);
} else {
usehp = 0;
}
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index 6a14df471..e02e73232 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -591,22 +591,25 @@ static void set_rd_speed_thresholds(VP9_COMP *cpi, int mode, int speed) {
sf->thresh_mult[THR_COMP_SPLITLA ] += speed_multiplier * 4500;
sf->thresh_mult[THR_COMP_SPLITGA ] += speed_multiplier * 4500;
- if (speed > 4) {
+ if (cpi->sf.skip_lots_of_modes) {
for (i = 0; i < MAX_MODES; ++i)
sf->thresh_mult[i] = INT_MAX;
- sf->thresh_mult[THR_DC ] = 0;
- sf->thresh_mult[THR_TM ] = 0;
- sf->thresh_mult[THR_NEWMV ] = 4000;
- sf->thresh_mult[THR_NEWG ] = 4000;
- sf->thresh_mult[THR_NEWA ] = 4000;
+ sf->thresh_mult[THR_DC] = 0;
+ sf->thresh_mult[THR_TM] = 0;
+ sf->thresh_mult[THR_NEWMV] = 4000;
+ sf->thresh_mult[THR_NEWG] = 4000;
+ sf->thresh_mult[THR_NEWA] = 4000;
sf->thresh_mult[THR_NEARESTMV] = 0;
- sf->thresh_mult[THR_NEARESTG ] = 0;
- sf->thresh_mult[THR_NEARESTA ] = 0;
- sf->thresh_mult[THR_NEARMV ] = 2000;
- sf->thresh_mult[THR_NEARG ] = 2000;
- sf->thresh_mult[THR_NEARA ] = 2000;
+ sf->thresh_mult[THR_NEARESTG] = 0;
+ sf->thresh_mult[THR_NEARESTA] = 0;
+ sf->thresh_mult[THR_NEARMV] = 2000;
+ sf->thresh_mult[THR_NEARG] = 2000;
+ sf->thresh_mult[THR_NEARA] = 2000;
sf->thresh_mult[THR_COMP_NEARESTLA] = 2000;
+ sf->thresh_mult[THR_SPLITMV] = 2500;
+ sf->thresh_mult[THR_SPLITG] = 2500;
+ sf->thresh_mult[THR_SPLITA] = 2500;
sf->recode_loop = 0;
}
@@ -681,6 +684,18 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
sf->max_step_search_steps = MAX_MVSEARCH_STEPS;
sf->comp_inter_joint_search_thresh = BLOCK_SIZE_AB4X4;
sf->adpative_rd_thresh = 0;
+ sf->use_lastframe_partitioning = 0;
+ sf->use_largest_txform = 0;
+ sf->use_8tap_always = 0;
+ sf->use_avoid_tested_higherror = 0;
+ sf->skip_lots_of_modes = 0;
+ sf->adjust_thresholds_by_speed = 0;
+ sf->partition_by_variance = 0;
+ sf->use_one_partition_size_always = 0;
+ sf->use_partitions_less_than = 0;
+ sf->less_than_block_size = BLOCK_SIZE_MB16X16;
+ sf->use_partitions_greater_than = 0;
+ sf->greater_than_block_size = BLOCK_SIZE_SB8X8;
#if CONFIG_MULTIPLE_ARF
// Switch segmentation off.
@@ -703,17 +718,51 @@ void vp9_set_speed_features(VP9_COMP *cpi) {
#endif
sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
sf->adpative_rd_thresh = 1;
- if (speed > 0) {
+ if (speed == 1) {
sf->comp_inter_joint_search_thresh = BLOCK_SIZE_TYPES;
sf->optimize_coefficients = 0;
sf->first_step = 1;
+ sf->use_avoid_tested_higherror = 1;
+ sf->adjust_thresholds_by_speed = 1;
}
- break;
+ if (speed == 2) {
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+ sf->use_lastframe_partitioning = 1;
+ sf->first_step = 0;
+ }
+ if (speed == 3) {
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+ sf->partition_by_variance = 1;
+ sf->first_step = 0;
+ }
+ if (speed == 4) {
+ sf->first_step = 0;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+ sf->use_one_partition_size_always = 1;
+ sf->always_this_block_size = BLOCK_SIZE_MB16X16;
+ }
+ if (speed == 5) {
+ sf->first_step = 0;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+ sf->use_partitions_less_than = 1;
+ sf->less_than_block_size = BLOCK_SIZE_MB16X16;
+ }
+ if (speed == 6) {
+ sf->first_step = 0;
+ sf->comp_inter_joint_search_thresh = BLOCK_SIZE_SB8X8;
+ sf->use_partitions_greater_than = 1;
+ sf->greater_than_block_size = BLOCK_SIZE_SB8X8;
+ }
+
+ break;
}; /* switch */
// Set rd thresholds based on mode and speed setting
- set_rd_speed_thresholds(cpi, mode, speed);
+ if (cpi->sf.adjust_thresholds_by_speed)
+ set_rd_speed_thresholds(cpi, mode, speed);
+ else
+ set_rd_speed_thresholds(cpi, mode, 0);
// Slow quant, dct and trellis not worthwhile for first pass
// so make sure they are always turned off.
@@ -2993,7 +3042,7 @@ static void encode_frame_to_data_rate(VP9_COMP *cpi,
!cpi->common.frame_parallel_decoding_mode) {
vp9_adapt_mode_probs(&cpi->common);
vp9_adapt_mode_context(&cpi->common);
- vp9_adapt_nmv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);
+ vp9_adapt_mv_probs(&cpi->common, cpi->mb.e_mbd.allow_high_precision_mv);
}
}
@@ -3327,7 +3376,7 @@ static void Pass2Encode(VP9_COMP *cpi, unsigned long *size,
vp9_second_pass(cpi);
encode_frame_to_data_rate(cpi, size, dest, frame_flags);
-
+ //vp9_print_modes_and_motion_vectors(&cpi->common, "encode.stt");
#ifdef DISABLE_RC_LONG_TERM_MEM
cpi->twopass.bits_left -= cpi->this_frame_target;
#else
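
Setting thresh_mult[i] = INT_MAX is how skip_lots_of_modes disables a mode outright; only the handful re-enabled with small multipliers survive the RD loop. A hypothetical sketch of the consuming side (the exact comparison lives in vp9_rdopt.c; base_rd_thresh and best_rd_so_far are illustrative names):

for (mode_index = 0; mode_index < MAX_MODES; ++mode_index) {
  // Per-mode threshold scaled from a base RD threshold; INT_MAX makes it
  // unreachable, so the mode is skipped without ever being evaluated.
  const int64_t mode_threshold =
      (int64_t)base_rd_thresh * sf->thresh_mult[mode_index] / 100;
  if (mode_threshold > best_rd_so_far)
    continue;
  /* full rate-distortion evaluation of this mode */
}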
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index f5f1c0772..0811976d0 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -216,6 +216,19 @@ typedef struct {
int static_segmentation;
int comp_inter_joint_search_thresh;
int adpative_rd_thresh;
+ int use_lastframe_partitioning;
+ int use_largest_txform;
+ int use_8tap_always;
+ int use_avoid_tested_higherror;
+ int skip_lots_of_modes;
+ int adjust_thresholds_by_speed;
+ int partition_by_variance;
+ int use_one_partition_size_always;
+ BLOCK_SIZE_TYPE always_this_block_size;
+ int use_partitions_greater_than;
+ BLOCK_SIZE_TYPE greater_than_block_size;
+ int use_partitions_less_than;
+ BLOCK_SIZE_TYPE less_than_block_size;
} SPEED_FEATURES;
enum BlockSize {
diff --git a/vp9/encoder/vp9_quantize.c b/vp9/encoder/vp9_quantize.c
index 53d8be775..ccbb624b0 100644
--- a/vp9/encoder/vp9_quantize.c
+++ b/vp9/encoder/vp9_quantize.c
@@ -35,6 +35,153 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
uint16_t *eob_ptr,
const int *scan, int mul) {
int i, rc, eob;
+ int zbins[2], nzbins[2], zbin;
+ int x, y, z, sz;
+ int zero_run = 0;
+ int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
+ int zero_flag = n_coeffs;
+
+ vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
+ vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
+
+ eob = -1;
+
+ // Base ZBIN
+ zbins[0] = zbin_ptr[0] + zbin_oq_value;
+ zbins[1] = zbin_ptr[1] + zbin_oq_value;
+ nzbins[0] = zbins[0] * -1;
+ nzbins[1] = zbins[1] * -1;
+
+ if (!skip_block) {
+ // Pre-scan pass
+ for (i = n_coeffs - 1; i >= 0; i--) {
+ rc = scan[i];
+ z = coeff_ptr[rc] * mul;
+
+ if (z < zbins[rc != 0] && z > nzbins[rc != 0]) {
+ zero_flag--;
+ } else {
+ break;
+ }
+ }
+
+ // Quantization pass: All coefficients with index >= zero_flag are
+ // skippable. Note: zero_flag can be zero.
+ for (i = 0; i < zero_flag; i++) {
+ rc = scan[i];
+ z = coeff_ptr[rc] * mul;
+
+ zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
+ zero_run += (zero_run < 15);
+
+ sz = (z >> 31); // sign of z
+ x = (z ^ sz) - sz;
+
+ if (x >= zbin) {
+ x += (round_ptr[rc != 0]);
+ y = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
+ >> quant_shift_ptr[rc != 0]; // quantize (x)
+ x = (y ^ sz) - sz; // get the sign back
+ qcoeff_ptr[rc] = x; // write to destination
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul; // dequantized value
+
+ if (y) {
+ eob = i; // last nonzero coeffs
+ zero_run = 0; // set zero_run
+ }
+ }
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+
+// This function works well for large transform sizes.
+static void quantize_sparse(int16_t *zbin_boost_orig_ptr,
+ int16_t *coeff_ptr, int n_coeffs, int skip_block,
+ int16_t *zbin_ptr, int16_t *round_ptr,
+ int16_t *quant_ptr, uint8_t *quant_shift_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ int16_t *dequant_ptr, int zbin_oq_value,
+ uint16_t *eob_ptr, const int *scan, int mul,
+ int *idx_arr) {
+ int i, rc, eob;
+ int zbins[2], pzbins[2], nzbins[2], zbin;
+ int x, y, z, sz;
+ int zero_run = 0;
+ int16_t *zbin_boost_ptr = zbin_boost_orig_ptr;
+ int idx = 0;
+ int pre_idx = 0;
+
+ vpx_memset(qcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
+ vpx_memset(dqcoeff_ptr, 0, n_coeffs*sizeof(int16_t));
+
+ eob = -1;
+
+ // Base ZBIN
+ zbins[0] = zbin_ptr[0] + zbin_oq_value;
+ zbins[1] = zbin_ptr[1] + zbin_oq_value;
+ // Positive and negative ZBIN
+ pzbins[0] = zbins[0]/mul;
+ pzbins[1] = zbins[1]/mul;
+ nzbins[0] = pzbins[0] * -1;
+ nzbins[1] = pzbins[1] * -1;
+
+ if (!skip_block) {
+ // Pre-scan pass
+ for (i = 0; i < n_coeffs; i++) {
+ rc = scan[i];
+ z = coeff_ptr[rc];
+
+ // If the coefficient is out of the base ZBIN range, keep it for
+ // quantization.
+ if (z >= pzbins[rc != 0] || z <= nzbins[rc != 0])
+ idx_arr[idx++] = i;
+ }
+
+ // Quantization pass: only process the coefficients selected in
+ // pre-scan pass. Note: idx can be zero.
+ for (i = 0; i < idx; i++) {
+ rc = scan[idx_arr[i]];
+
+ // Calculate ZBIN
+ zero_run += idx_arr[i] - pre_idx;
+ if (zero_run > 15) zero_run = 15;
+ zbin = (zbins[rc != 0] + zbin_boost_ptr[zero_run]);
+
+ pre_idx = idx_arr[i];
+ z = coeff_ptr[rc] * mul;
+ sz = (z >> 31); // sign of z
+ x = (z ^ sz) - sz; // x = abs(z)
+
+ if (x >= zbin) {
+ x += (round_ptr[rc != 0]);
+ y = ((int)(((int)(x * quant_ptr[rc != 0]) >> 16) + x))
+ >> quant_shift_ptr[rc != 0]; // quantize (x)
+
+ x = (y ^ sz) - sz; // get the sign back
+ qcoeff_ptr[rc] = x; // write to destination
+ dqcoeff_ptr[rc] = x * dequant_ptr[rc != 0] / mul; // dequantized value
+
+ if (y) {
+ eob = idx_arr[i]; // last nonzero coeffs
+ zero_run = -1; // set zero_run
+ }
+ }
+ }
+ }
+ *eob_ptr = eob + 1;
+}
+#if 0
+// Original quantize function
+static void quantize(int16_t *zbin_boost_orig_ptr,
+ int16_t *coeff_ptr, int n_coeffs, int skip_block,
+ int16_t *zbin_ptr, int16_t *round_ptr, int16_t *quant_ptr,
+ uint8_t *quant_shift_ptr,
+ int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+ int16_t *dequant_ptr, int zbin_oq_value,
+ uint16_t *eob_ptr,
+ const int *scan, int mul) {
+ int i, rc, eob;
int zbin;
int x, y, z, sz;
int zero_run = 0;
@@ -74,6 +221,7 @@ static void quantize(int16_t *zbin_boost_orig_ptr,
*eob_ptr = eob + 1;
}
+#endif
void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
TX_TYPE tx_type) {
@@ -97,19 +245,40 @@ void vp9_quantize(MACROBLOCK *mb, int plane, int block, int n_coeffs,
break;
}
- quantize(mb->plane[plane].zrun_zbin_boost,
- BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
- n_coeffs, mb->skip_block,
- mb->plane[plane].zbin,
- mb->plane[plane].round,
- mb->plane[plane].quant,
- mb->plane[plane].quant_shift,
- BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
- BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
- xd->plane[plane].dequant,
- mb->plane[plane].zbin_extra,
- &xd->plane[plane].eobs[block],
- scan, mul);
+ // Call a different quantize function depending on the transform size.
+ if (n_coeffs >= 1024) {
+ // Stores the indices of coefficients picked in the pre-scan pass.
+ int idx_arr[1024];
+
+ quantize_sparse(mb->plane[plane].zrun_zbin_boost,
+ BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+ n_coeffs, mb->skip_block,
+ mb->plane[plane].zbin,
+ mb->plane[plane].round,
+ mb->plane[plane].quant,
+ mb->plane[plane].quant_shift,
+ BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
+ BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+ xd->plane[plane].dequant,
+ mb->plane[plane].zbin_extra,
+ &xd->plane[plane].eobs[block],
+ scan, mul, idx_arr);
+ } else {
+ quantize(mb->plane[plane].zrun_zbin_boost,
+ BLOCK_OFFSET(mb->plane[plane].coeff, block, 16),
+ n_coeffs, mb->skip_block,
+ mb->plane[plane].zbin,
+ mb->plane[plane].round,
+ mb->plane[plane].quant,
+ mb->plane[plane].quant_shift,
+ BLOCK_OFFSET(xd->plane[plane].qcoeff, block, 16),
+ BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+ xd->plane[plane].dequant,
+ mb->plane[plane].zbin_extra,
+ &xd->plane[plane].eobs[block],
+ scan, mul);
+ }
}
void vp9_regular_quantize_b_4x4(MACROBLOCK *mb, int b_idx, TX_TYPE tx_type,
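
Both paths share the same core step once a coefficient survives its zero-bin test: round, multiply by the 16-bit fixed-point quant factor, add back, and shift. Pulled out as a standalone helper purely for illustration (hypothetical name; the real code inlines it):

// x is abs(coeff), pre-scaled by mul for the 32x32 case.
static int quantize_coeff(int x, int round, int quant, int quant_shift) {
  x += round;
  return (((x * quant) >> 16) + x) >> quant_shift;
}

The sparse variant pays off at n_coeffs >= 1024 because its pre-scan records in idx_arr only the indices that can quantize to nonzero values, so the expensive loop walks a short list instead of all 1024 positions.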
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 9cb7ab0e1..a48e7dbb3 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -274,12 +274,14 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex) {
}
}
-int vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff, int block_size) {
- int i, error = 0;
+int64_t vp9_block_error_c(int16_t *coeff, int16_t *dqcoeff,
+ intptr_t block_size) {
+ int i;
+ int64_t error = 0;
for (i = 0; i < block_size; i++) {
int this_diff = coeff[i] - dqcoeff[i];
- error += this_diff * this_diff;
+ error += (unsigned)this_diff * this_diff;
}
return error;
@@ -417,7 +419,7 @@ static INLINE int cost_coeffs(VP9_COMMON *const cm, MACROBLOCK *mb,
static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
int (*r)[2], int *rate,
- int *d, int *distortion,
+ int64_t *d, int64_t *distortion,
int *s, int *skip,
int64_t txfm_cache[NB_TXFM_MODES],
TX_SIZE max_txfm_size) {
@@ -496,27 +498,15 @@ static void choose_txfm_size_from_rd(VP9_COMP *cpi, MACROBLOCK *x,
rd[TX_4X4][1] : rd[TX_8X8][1];
}
-static int block_error(int16_t *coeff, int16_t *dqcoeff,
- int block_size, int shift) {
- int i;
- int64_t error = 0;
-
- for (i = 0; i < block_size; i++) {
- int this_diff = coeff[i] - dqcoeff[i];
- error += (unsigned)this_diff * this_diff;
- }
- error >>= shift;
-
- return error > INT_MAX ? INT_MAX : (int)error;
-}
-
-static int block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
+static int64_t block_error_sby(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
+ int shift) {
const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
- return block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
- 16 << (bwl + bhl), shift);
+ return vp9_block_error(x->plane[0].coeff, x->e_mbd.plane[0].dqcoeff,
+ 16 << (bwl + bhl)) >> shift;
}
-static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
+static int64_t block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize,
+ int shift) {
const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
int64_t sum = 0;
int plane;
@@ -524,11 +514,10 @@ static int block_error_sbuv(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, int shift) {
for (plane = 1; plane < MAX_MB_PLANE; plane++) {
const int subsampling = x->e_mbd.plane[plane].subsampling_x +
x->e_mbd.plane[plane].subsampling_y;
- sum += block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
- 16 << (bwl + bhl - subsampling), 0);
+ sum += vp9_block_error(x->plane[plane].coeff, x->e_mbd.plane[plane].dqcoeff,
+ 16 << (bwl + bhl - subsampling));
}
- sum >>= shift;
- return sum > INT_MAX ? INT_MAX : (int)sum;
+ return sum >> shift;
}
struct rdcost_block_args {
@@ -586,7 +575,8 @@ static int rdcost_uv(VP9_COMMON *const cm, MACROBLOCK *x,
}
static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
- int *rate, int *distortion, int *skippable,
+ int *rate, int64_t *distortion,
+ int *skippable,
BLOCK_SIZE_TYPE bsize, TX_SIZE tx_size) {
MACROBLOCKD *const xd = &x->e_mbd;
xd->mode_info_context->mbmi.txfm_size = tx_size;
@@ -602,11 +592,12 @@ static void super_block_yrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
}
static void super_block_yrd(VP9_COMP *cpi,
- MACROBLOCK *x, int *rate, int *distortion,
+ MACROBLOCK *x, int *rate, int64_t *distortion,
int *skip, BLOCK_SIZE_TYPE bs,
int64_t txfm_cache[NB_TXFM_MODES]) {
VP9_COMMON *const cm = &cpi->common;
- int r[TX_SIZE_MAX_SB][2], d[TX_SIZE_MAX_SB], s[TX_SIZE_MAX_SB];
+ int r[TX_SIZE_MAX_SB][2], s[TX_SIZE_MAX_SB];
+ int64_t d[TX_SIZE_MAX_SB];
MACROBLOCKD *xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
@@ -614,7 +605,7 @@ static void super_block_yrd(VP9_COMP *cpi,
if (mbmi->ref_frame[0] > INTRA_FRAME)
vp9_subtract_sby(x, bs);
- if (cpi->speed > 4) {
+ if (cpi->sf.use_largest_txform) {
if (bs >= BLOCK_SIZE_SB32X32) {
mbmi->txfm_size = TX_32X32;
} else if (bs >= BLOCK_SIZE_MB16X16) {
@@ -651,13 +642,13 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
int *bmode_costs,
ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
int *bestrate, int *bestratey,
- int *bestdistortion,
+ int64_t *bestdistortion,
BLOCK_SIZE_TYPE bsize) {
MB_PREDICTION_MODE mode;
MACROBLOCKD *xd = &x->e_mbd;
int64_t best_rd = INT64_MAX;
int rate = 0;
- int distortion;
+ int64_t distortion;
VP9_COMMON *const cm = &cpi->common;
const int src_stride = x->plane[0].src.stride;
uint8_t *src, *dst;
@@ -777,7 +768,7 @@ static int64_t rd_pick_intra4x4block(VP9_COMP *cpi, MACROBLOCK *x, int ib,
static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
int *Rate, int *rate_y,
- int *Distortion, int64_t best_rd) {
+ int64_t *Distortion, int64_t best_rd) {
int i, j;
MACROBLOCKD *const xd = &mb->e_mbd;
BLOCK_SIZE_TYPE bsize = xd->mode_info_context->mbmi.sb_type;
@@ -785,7 +776,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
int bh = 1 << b_height_log2(bsize);
int idx, idy;
int cost = 0;
- int distortion = 0;
+ int64_t distortion = 0;
int tot_rate_y = 0;
int64_t total_rd = 0;
ENTROPY_CONTEXT t_above[4], t_left[4];
@@ -802,7 +793,7 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
const int mis = xd->mode_info_stride;
MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(best_mode);
int UNINITIALIZED_IS_SAFE(r), UNINITIALIZED_IS_SAFE(ry);
- int UNINITIALIZED_IS_SAFE(d);
+ int64_t UNINITIALIZED_IS_SAFE(d);
i = idy * 2 + idx;
if (xd->frame_type == KEY_FRAME) {
@@ -844,14 +835,14 @@ static int64_t rd_pick_intra4x4mby_modes(VP9_COMP *cpi, MACROBLOCK *mb,
static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
int *rate, int *rate_tokenonly,
- int *distortion, int *skippable,
+ int64_t *distortion, int *skippable,
BLOCK_SIZE_TYPE bsize,
int64_t txfm_cache[NB_TXFM_MODES]) {
MB_PREDICTION_MODE mode;
MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
MACROBLOCKD *const xd = &x->e_mbd;
- int this_rate, this_rate_tokenonly;
- int this_distortion, s;
+ int this_rate, this_rate_tokenonly, s;
+ int64_t this_distortion;
int64_t best_rd = INT64_MAX, this_rd;
TX_SIZE UNINITIALIZED_IS_SAFE(best_tx);
int i;
@@ -912,7 +903,7 @@ static int64_t rd_pick_intra_sby_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
- int *rate, int *distortion,
+ int *rate, int64_t *distortion,
int *skippable, BLOCK_SIZE_TYPE bsize,
TX_SIZE uv_tx_size) {
MACROBLOCKD *const xd = &x->e_mbd;
@@ -927,7 +918,7 @@ static void super_block_uvrd_for_txfm(VP9_COMMON *const cm, MACROBLOCK *x,
}
static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
- int *rate, int *distortion, int *skippable,
+ int *rate, int64_t *distortion, int *skippable,
BLOCK_SIZE_TYPE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mode_info_context->mbmi;
@@ -952,13 +943,13 @@ static void super_block_uvrd(VP9_COMMON *const cm, MACROBLOCK *x,
static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
int *rate, int *rate_tokenonly,
- int *distortion, int *skippable,
+ int64_t *distortion, int *skippable,
BLOCK_SIZE_TYPE bsize) {
MB_PREDICTION_MODE mode;
MB_PREDICTION_MODE UNINITIALIZED_IS_SAFE(mode_selected);
int64_t best_rd = INT64_MAX, this_rd;
- int this_rate_tokenonly, this_rate;
- int this_distortion, s;
+ int this_rate_tokenonly, this_rate, s;
+ int64_t this_distortion;
for (mode = DC_PRED; mode <= TM_PRED; mode++) {
x->e_mbd.mode_info_context->mbmi.uv_mode = mode;
@@ -1101,7 +1092,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
MACROBLOCK *x,
int i,
int *labelyrate,
- int *distortion,
+ int64_t *distortion,
ENTROPY_CONTEXT *ta,
ENTROPY_CONTEXT *tl) {
int k;
@@ -1126,7 +1117,7 @@ static int64_t encode_inter_mb_segment(VP9_COMMON *const cm,
raster_block_offset_uint8(xd, BLOCK_SIZE_SB8X8, 0, i,
xd->plane[0].dst.buf,
xd->plane[0].dst.stride);
- int thisdistortion = 0;
+ int64_t thisdistortion = 0;
int thisrate = 0;
*labelyrate = 0;
@@ -1189,7 +1180,7 @@ typedef struct {
int64_t segment_rd;
int r;
- int d;
+ int64_t d;
int segment_yrate;
MB_PREDICTION_MODE modes[4];
int_mv mvs[4], second_mvs[4];
@@ -1281,21 +1272,18 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
BEST_SEG_INFO *bsi,
int_mv seg_mvs[4][MAX_REF_FRAMES],
int mi_row, int mi_col) {
- int i, j;
- int br = 0, bd = 0;
+ int i, j, br = 0, rate = 0, sbr = 0, idx, idy;
+ int64_t bd = 0, sbd = 0;
MB_PREDICTION_MODE this_mode;
MB_MODE_INFO * mbmi = &x->e_mbd.mode_info_context->mbmi;
const int label_count = 4;
int64_t this_segment_rd = 0, other_segment_rd;
int label_mv_thresh;
- int rate = 0;
- int sbr = 0, sbd = 0;
int segmentyrate = 0;
int best_eobs[4] = { 0 };
BLOCK_SIZE_TYPE bsize = mbmi->sb_type;
int bwl = b_width_log2(bsize), bw = 1 << bwl;
int bhl = b_height_log2(bsize), bh = 1 << bhl;
- int idx, idy;
vp9_variance_fn_ptr_t *v_fn_ptr;
ENTROPY_CONTEXT t_above[4], t_left[4];
ENTROPY_CONTEXT t_above_b[4], t_left_b[4];
@@ -1340,7 +1328,7 @@ static void rd_check_segment_txsize(VP9_COMP *cpi, MACROBLOCK *x,
// search for the best motion vector on this segment
for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
int64_t this_rd;
- int distortion;
+ int64_t distortion;
int labelyrate;
ENTROPY_CONTEXT t_above_s[4], t_left_s[4];
const struct buf_2d orig_src = x->plane[0].src;
@@ -1527,7 +1515,7 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
int64_t best_rd,
int *returntotrate,
int *returnyrate,
- int *returndistortion,
+ int64_t *returndistortion,
int *skippable, int mvthresh,
int_mv seg_mvs[4][MAX_REF_FRAMES],
int mi_row, int mi_col) {
@@ -1800,18 +1788,133 @@ static YV12_BUFFER_CONFIG *get_scaled_ref_frame(VP9_COMP *cpi, int ref_frame) {
return scaled_ref_frame;
}
-static void model_rd_from_var_lapndz(int var, int n, int qstep,
- int *rate, int *dist) {
- // This function models the rate and distortion for a Laplacian
+static double linear_interpolate(double x, int ntab, double step,
+ const double *tab) {
+ double y = x / step;
+ int d = (int) y;
+ double a = y - d;
+ if (d >= ntab - 1)
+ return tab[ntab - 1];
+ else
+ return tab[d] * (1 - a) + tab[d + 1] * a;
+}
+
+static double model_rate_norm(double x) {
+ // Normalized rate
+ // This function models the rate for a Laplacian
// source with given variance when quantized with a uniform quantizer
// with given stepsize. The closed form expressions are in:
// Hang and Chen, "Source Model for transform video coder and its
// application - Part I: Fundamental Theory", IEEE Trans. Circ.
// Sys. for Video Tech., April 1997.
- // The function is implemented as piecewise approximation to the
- // exact computation.
- // TODO(debargha): Implement the functions by interpolating from a
- // look-up table
+ static const double rate_tab_step = 0.125;
+ static const double rate_tab[] = {
+ 256.0000, 4.944453, 3.949276, 3.371593,
+ 2.965771, 2.654550, 2.403348, 2.193612,
+ 2.014208, 1.857921, 1.719813, 1.596364,
+ 1.484979, 1.383702, 1.291025, 1.205767,
+ 1.126990, 1.053937, 0.985991, 0.922644,
+ 0.863472, 0.808114, 0.756265, 0.707661,
+ 0.662070, 0.619287, 0.579129, 0.541431,
+ 0.506043, 0.472828, 0.441656, 0.412411,
+ 0.384980, 0.359260, 0.335152, 0.312563,
+ 0.291407, 0.271600, 0.253064, 0.235723,
+ 0.219508, 0.204351, 0.190189, 0.176961,
+ 0.164611, 0.153083, 0.142329, 0.132298,
+ 0.122945, 0.114228, 0.106106, 0.098541,
+ 0.091496, 0.084937, 0.078833, 0.073154,
+ 0.067872, 0.062959, 0.058392, 0.054147,
+ 0.050202, 0.046537, 0.043133, 0.039971,
+ 0.037036, 0.034312, 0.031783, 0.029436,
+ 0.027259, 0.025240, 0.023367, 0.021631,
+ 0.020021, 0.018528, 0.017145, 0.015863,
+ 0.014676, 0.013575, 0.012556, 0.011612,
+ 0.010738, 0.009929, 0.009180, 0.008487,
+ 0.007845, 0.007251, 0.006701, 0.006193,
+ 0.005722, 0.005287, 0.004884, 0.004512,
+ 0.004168, 0.003850, 0.003556, 0.003284,
+ 0.003032, 0.002800, 0.002585, 0.002386,
+ 0.002203, 0.002034, 0.001877, 0.001732,
+ 0.001599, 0.001476, 0.001362, 0.001256,
+ 0.001159, 0.001069, 0.000987, 0.000910,
+ 0.000840, 0.000774, 0.000714, 0.000659,
+ 0.000608, 0.000560, 0.000517, 0.000476,
+ 0.000439, 0.000405, 0.000373, 0.000344,
+ 0.000317, 0.000292, 0.000270, 0.000248,
+ 0.000229, 0.000211, 0.000195, 0.000179,
+ 0.000165, 0.000152, 0.000140, 0.000129,
+ 0.000119, 0.000110, 0.000101, 0.000093,
+ 0.000086, 0.000079, 0.000073, 0.000067,
+ 0.000062, 0.000057, 0.000052, 0.000048,
+ 0.000044, 0.000041, 0.000038, 0.000035,
+ 0.000032, 0.000029, 0.000027, 0.000025,
+ 0.000023, 0.000021, 0.000019, 0.000018,
+ 0.000016, 0.000015, 0.000014, 0.000013,
+ 0.000012, 0.000011, 0.000010, 0.000009,
+ 0.000008, 0.000008, 0.000007, 0.000007,
+ 0.000006, 0.000006, 0.000005, 0.000005,
+ 0.000004, 0.000004, 0.000004, 0.000003,
+ 0.000003, 0.000003, 0.000003, 0.000002,
+ 0.000002, 0.000002, 0.000002, 0.000002,
+ 0.000002, 0.000001, 0.000001, 0.000001,
+ 0.000001, 0.000001, 0.000001, 0.000001,
+ 0.000001, 0.000001, 0.000001, 0.000001,
+ 0.000001, 0.000001, 0.000000, 0.000000,
+ };
+ const int rate_tab_num = sizeof(rate_tab)/sizeof(rate_tab[0]);
+ assert(x >= 0.0);
+ return linear_interpolate(x, rate_tab_num, rate_tab_step, rate_tab);
+}
+
+static double model_dist_norm(double x) {
+ // Normalized distortion
+ // This function models the normalized distortion for a Laplacian
+ // source with given variance when quantized with a uniform quantizer
+ // with given stepsize. The closed form expression is:
+ // Dn(x) = 1 - 1/sqrt(2) * x / sinh(x/sqrt(2))
+ // where x = qpstep / sqrt(variance)
+ // Note the actual distortion is Dn * variance.
+ static const double dist_tab_step = 0.25;
+ static const double dist_tab[] = {
+ 0.000000, 0.005189, 0.020533, 0.045381,
+ 0.078716, 0.119246, 0.165508, 0.215979,
+ 0.269166, 0.323686, 0.378318, 0.432034,
+ 0.484006, 0.533607, 0.580389, 0.624063,
+ 0.664475, 0.701581, 0.735418, 0.766092,
+ 0.793751, 0.818575, 0.840761, 0.860515,
+ 0.878045, 0.893554, 0.907238, 0.919281,
+ 0.929857, 0.939124, 0.947229, 0.954306,
+ 0.960475, 0.965845, 0.970512, 0.974563,
+ 0.978076, 0.981118, 0.983750, 0.986024,
+ 0.987989, 0.989683, 0.991144, 0.992402,
+ 0.993485, 0.994417, 0.995218, 0.995905,
+ 0.996496, 0.997002, 0.997437, 0.997809,
+ 0.998128, 0.998401, 0.998635, 0.998835,
+ 0.999006, 0.999152, 0.999277, 0.999384,
+ 0.999475, 0.999553, 0.999619, 0.999676,
+ 0.999724, 0.999765, 0.999800, 0.999830,
+ 0.999855, 0.999877, 0.999895, 0.999911,
+ 0.999924, 0.999936, 0.999945, 0.999954,
+ 0.999961, 0.999967, 0.999972, 0.999976,
+ 0.999980, 0.999983, 0.999985, 0.999988,
+ 0.999989, 0.999991, 0.999992, 0.999994,
+ 0.999995, 0.999995, 0.999996, 0.999997,
+ 0.999997, 0.999998, 0.999998, 0.999998,
+ 0.999999, 0.999999, 0.999999, 0.999999,
+ 0.999999, 0.999999, 0.999999, 1.000000,
+ };
+ const int dist_tab_num = sizeof(dist_tab)/sizeof(dist_tab[0]);
+ assert(x >= 0.0);
+ return linear_interpolate(x, dist_tab_num, dist_tab_step, dist_tab);
+}
+
+static void model_rd_from_var_lapndz(int var, int n, int qstep,
+ int *rate, int64_t *dist) {
+ // This function models the rate and distortion for a Laplacian
+ // source with given variance when quantized with a uniform quantizer
+ // with given stepsize. The closed form expression is:
+ // Rn(x) = H(sqrt(r)) + sqrt(r)*[1 + H(r)/(1 - r)],
+ // where r = exp(-sqrt(2) * x) and x = qpstep / sqrt(variance)
vp9_clear_system_state();
if (var == 0 || n == 0) {
*rate = 0;
@@ -1819,29 +1922,18 @@ static void model_rd_from_var_lapndz(int var, int n, int qstep,
} else {
double D, R;
double s2 = (double) var / n;
- double s = sqrt(s2);
- double x = qstep / s;
- if (x > 1.0) {
- double y = exp(-x / 2);
- double y2 = y * y;
- D = 2.069981728764738 * y2 - 2.764286806516079 * y + 1.003956960819275;
- R = 0.924056758535089 * y2 + 2.738636469814024 * y - 0.005169662030017;
- } else {
- double x2 = x * x;
- D = 0.075303187668830 * x2 + 0.004296954321112 * x - 0.000413209252807;
- if (x > 0.125)
- R = 1 / (-0.03459733614226 * x2 + 0.36561675733603 * x +
- 0.1626989668625);
- else
- R = -1.442252874826093 * log(x) + 1.944647760719664;
- }
+ double x = qstep / sqrt(s2);
+ // TODO(debargha): Make the modeling functions take (qstep^2 / s2)
+ // as argument rather than qstep / sqrt(s2) to obviate the need for
+ // the sqrt() operation.
+ D = model_dist_norm(x);
+ R = model_rate_norm(x);
if (R < 0) {
- *rate = 0;
- *dist = var;
- } else {
- *rate = (n * R * 256 + 0.5);
- *dist = (n * D * s2 + 0.5);
+ R = 0;
+ D = var;
}
+ *rate = (n * R * 256 + 0.5);
+ *dist = (n * D * s2 + 0.5);
}
vp9_clear_system_state();
}
@@ -1854,12 +1946,13 @@ static enum BlockSize get_plane_block_size(BLOCK_SIZE_TYPE bsize,
static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
MACROBLOCK *x, MACROBLOCKD *xd,
- int *out_rate_sum, int *out_dist_sum) {
+ int *out_rate_sum, int64_t *out_dist_sum) {
// Note our transform coeffs are 8 times an orthogonal transform.
// Hence quantizer step is also 8 times. To get effective quantizer
// we need to divide by 8 before sending to modeling function.
- unsigned int sse, var;
- int i, rate_sum = 0, dist_sum = 0;
+ unsigned int sse;
+ int i, rate_sum = 0;
+ int64_t dist_sum = 0;
for (i = 0; i < MAX_MB_PLANE; ++i) {
struct macroblock_plane *const p = &x->plane[i];
@@ -1869,17 +1962,18 @@ static void model_rd_for_sb(VP9_COMP *cpi, BLOCK_SIZE_TYPE bsize,
const int bw = plane_block_width(bsize, pd);
const int bh = plane_block_height(bsize, pd);
const enum BlockSize bs = get_block_size(bw, bh);
- int rate, dist;
- var = cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
- pd->dst.buf, pd->dst.stride, &sse);
- model_rd_from_var_lapndz(var, bw * bh, pd->dequant[1] >> 3, &rate, &dist);
+ int rate;
+ int64_t dist;
+ cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride, &sse);
+ model_rd_from_var_lapndz(sse, bw * bh, pd->dequant[1] >> 3, &rate, &dist);
rate_sum += rate;
dist_sum += dist;
}
*out_rate_sum = rate_sum;
- *out_dist_sum = dist_sum;
+ *out_dist_sum = dist_sum << 4;
}
static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) {
@@ -2134,9 +2228,10 @@ static void joint_motion_search(VP9_COMP *cpi, MACROBLOCK *x,
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE_TYPE bsize,
int64_t txfm_cache[],
- int *rate2, int *distortion, int *skippable,
- int *rate_y, int *distortion_y,
- int *rate_uv, int *distortion_uv,
+ int *rate2, int64_t *distortion,
+ int *skippable,
+ int *rate_y, int64_t *distortion_y,
+ int *rate_uv, int64_t *distortion_uv,
int *mode_excluded, int *disable_skip,
INTERPOLATIONFILTERTYPE *best_filter,
int_mv *frame_mv,
@@ -2236,11 +2331,12 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
(mbmi->mv[1].as_mv.col & 15) == 0;
// Search for best switchable filter by checking the variance of
// pred error irrespective of whether the filter will be used
- if (cpi->speed > 4) {
+ if (cpi->sf.use_8tap_always) {
*best_filter = EIGHTTAP;
} else {
int i, newbest;
- int tmp_rate_sum = 0, tmp_dist_sum = 0;
+ int tmp_rate_sum = 0;
+ int64_t tmp_dist_sum = 0;
for (i = 0; i < VP9_SWITCHABLE_FILTERS; ++i) {
int rs = 0;
const INTERPOLATIONFILTERTYPE filter = vp9_switchable_interp[i];
@@ -2255,7 +2351,8 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
if (interpolating_intpel_seen && is_intpel_interp) {
rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_sum, tmp_dist_sum);
} else {
- int rate_sum = 0, dist_sum = 0;
+ int rate_sum = 0;
+ int64_t dist_sum = 0;
vp9_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum);
rd = RDCOST(x->rdmult, x->rddiv, rs + rate_sum, dist_sum);
@@ -2399,19 +2496,20 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
- int *returnrate, int *returndist,
+ int *returnrate, int64_t *returndist,
BLOCK_SIZE_TYPE bsize,
PICK_MODE_CONTEXT *ctx) {
VP9_COMMON *cm = &cpi->common;
MACROBLOCKD *xd = &x->e_mbd;
- int rate_y = 0, rate_uv;
- int rate_y_tokenonly = 0, rate_uv_tokenonly;
- int dist_y = 0, dist_uv;
- int y_skip = 0, uv_skip;
+ int rate_y = 0, rate_uv = 0;
+ int rate_y_tokenonly = 0, rate_uv_tokenonly = 0;
+ int64_t dist_y = 0, dist_uv = 0;
+ int y_skip = 0, uv_skip = 0;
int64_t txfm_cache[NB_TXFM_MODES], err;
MB_PREDICTION_MODE mode;
TX_SIZE txfm_size;
- int rate4x4_y, rate4x4_y_tokenonly, dist4x4_y;
+ int rate4x4_y, rate4x4_y_tokenonly;
+ int64_t dist4x4_y;
int64_t err4x4 = INT64_MAX;
int i;
@@ -2462,7 +2560,7 @@ void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int mi_row, int mi_col,
int *returnrate,
- int *returndistortion,
+ int64_t *returndistortion,
BLOCK_SIZE_TYPE bsize,
PICK_MODE_CONTEXT *ctx) {
VP9_COMMON *cm = &cpi->common;
@@ -2497,7 +2595,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
INTERPOLATIONFILTERTYPE best_filter = SWITCHABLE;
INTERPOLATIONFILTERTYPE tmp_best_filter = SWITCHABLE;
int rate_uv_intra[TX_SIZE_MAX_SB], rate_uv_tokenonly[TX_SIZE_MAX_SB];
- int dist_uv[TX_SIZE_MAX_SB], skip_uv[TX_SIZE_MAX_SB];
+ int64_t dist_uv[TX_SIZE_MAX_SB];
+ int skip_uv[TX_SIZE_MAX_SB];
MB_PREDICTION_MODE mode_uv[TX_SIZE_MAX_SB];
struct scale_factors scale_factor[4];
unsigned int ref_frame_mask = 0;
@@ -2536,7 +2635,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
best_txfm_rd[i] = INT64_MAX;
// Create a mask set to 1 for each frame used by a smaller resolution.
- if (cpi->speed > 0) {
+ if (cpi->sf.use_avoid_tested_higherror) {
switch (block_size) {
case BLOCK_64X64:
for (i = 0; i < 4; i++) {
@@ -2576,8 +2675,9 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
frame_mv[ZEROMV][ref_frame].as_int = 0;
}
- if (cpi->speed == 0
- || (cpi->speed > 0 && (ref_frame_mask & (1 << INTRA_FRAME)))) {
+ if (!cpi->sf.use_avoid_tested_higherror
+ || (cpi->sf.use_avoid_tested_higherror
+ && (ref_frame_mask & (1 << INTRA_FRAME)))) {
mbmi->mode = DC_PRED;
mbmi->ref_frame[0] = INTRA_FRAME;
for (i = 0; i <= (bsize < BLOCK_SIZE_MB16X16 ? TX_4X4 :
@@ -2599,7 +2699,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int disable_skip = 0;
int compmode_cost = 0;
int rate2 = 0, rate_y = 0, rate_uv = 0;
- int distortion2 = 0, distortion_y = 0, distortion_uv = 0;
+ int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
int skippable;
int64_t txfm_cache[NB_TXFM_MODES];
int i;
@@ -2623,7 +2723,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
this_mode = vp9_mode_order[mode_index].mode;
ref_frame = vp9_mode_order[mode_index].ref_frame;
- if (cpi->speed > 0 && bsize >= BLOCK_SIZE_SB8X8) {
+ if (cpi->sf.use_avoid_tested_higherror && bsize >= BLOCK_SIZE_SB8X8) {
if (!(ref_frame_mask & (1 << ref_frame))) {
continue;
}
@@ -2786,11 +2886,13 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
distortion2 = distortion_y + distortion_uv;
} else if (this_mode == SPLITMV) {
const int is_comp_pred = mbmi->ref_frame[1] > 0;
- int rate, distortion;
+ int rate;
+ int64_t distortion;
int64_t this_rd_thresh;
int64_t tmp_rd, tmp_best_rd = INT64_MAX, tmp_best_rdu = INT64_MAX;
int tmp_best_rate = INT_MAX, tmp_best_ratey = INT_MAX;
- int tmp_best_distortion = INT_MAX, tmp_best_skippable = 0;
+ int64_t tmp_best_distortion = INT_MAX;
+ int tmp_best_skippable = 0;
int switchable_filter_index;
int_mv *second_ref = is_comp_pred ?
&mbmi->ref_mvs[mbmi->ref_frame[1]][0] : NULL;
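
Collecting the closed forms that the two lookup tables discretize (rate_tab at step 0.125, dist_tab at step 0.25), with per-sample variance \sigma^2 = var/n, x = qstep/\sigma, and H(.) read as the binary entropy function (an inference from the cited Hang and Chen paper; the comments leave H undefined):

  R_n(x) = H(\sqrt{r}) + \sqrt{r}\left[1 + \frac{H(r)}{1 - r}\right], \qquad r = e^{-\sqrt{2}\,x}
  D_n(x) = 1 - \frac{x/\sqrt{2}}{\sinh(x/\sqrt{2})}

with *rate = n * R_n(x) * 256 + 0.5 and *dist = n * D_n(x) * \sigma^2 + 0.5, exactly as the code computes after interpolating the tables.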
diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index dcf5d00e9..67ef73db7 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h
@@ -20,12 +20,12 @@ void vp9_initialize_rd_consts(VP9_COMP *cpi, int qindex);
void vp9_initialize_me_consts(VP9_COMP *cpi, int qindex);
void vp9_rd_pick_intra_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
- int *r, int *d, BLOCK_SIZE_TYPE bsize,
+ int *r, int64_t *d, BLOCK_SIZE_TYPE bsize,
PICK_MODE_CONTEXT *ctx);
int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
int mi_row, int mi_col,
- int *r, int *d, BLOCK_SIZE_TYPE bsize,
+ int *r, int64_t *d, BLOCK_SIZE_TYPE bsize,
PICK_MODE_CONTEXT *ctx);
void vp9_init_me_luts();
diff --git a/vp9/encoder/x86/vp9_encodeopt.asm b/vp9/encoder/x86/vp9_encodeopt.asm
deleted file mode 100644
index 734cb61ca..000000000
--- a/vp9/encoder/x86/vp9_encodeopt.asm
+++ /dev/null
@@ -1,125 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;int vp9_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
-global sym(vp9_block_error_xmm) PRIVATE
-sym(vp9_block_error_xmm):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- push rsi
- push rdi
- ; end prologue
-
- mov rsi, arg(0) ;coeff_ptr
- mov rdi, arg(1) ;dcoef_ptr
-
- movdqa xmm0, [rsi]
- movdqa xmm1, [rdi]
-
- movdqa xmm2, [rsi+16]
- movdqa xmm3, [rdi+16]
-
- psubw xmm0, xmm1
- psubw xmm2, xmm3
-
- pmaddwd xmm0, xmm0
- pmaddwd xmm2, xmm2
-
- paddd xmm0, xmm2
-
- pxor xmm5, xmm5
- movdqa xmm1, xmm0
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
-
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- psrldq xmm0, 8
- paddd xmm0, xmm1
-
- movq rax, xmm0
-
- pop rdi
- pop rsi
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;int vp9_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
-global sym(vp9_block_error_mmx) PRIVATE
-sym(vp9_block_error_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 2
- push rsi
- push rdi
- ; end prolog
-
-
- mov rsi, arg(0) ;coeff_ptr
- pxor mm7, mm7
-
- mov rdi, arg(1) ;dcoef_ptr
- movq mm3, [rsi]
-
- movq mm4, [rdi]
- movq mm5, [rsi+8]
-
- movq mm6, [rdi+8]
- pxor mm1, mm1 ; from movd mm1, dc ; dc =0
-
- movq mm2, mm7
- psubw mm5, mm6
-
- por mm1, mm2
- pmaddwd mm5, mm5
-
- pcmpeqw mm1, mm7
- psubw mm3, mm4
-
- pand mm1, mm3
- pmaddwd mm1, mm1
-
- paddd mm1, mm5
- movq mm3, [rsi+16]
-
- movq mm4, [rdi+16]
- movq mm5, [rsi+24]
-
- movq mm6, [rdi+24]
- psubw mm5, mm6
-
- pmaddwd mm5, mm5
- psubw mm3, mm4
-
- pmaddwd mm3, mm3
- paddd mm3, mm5
-
- paddd mm1, mm3
- movq mm0, mm1
-
- psrlq mm1, 32
- paddd mm0, mm1
-
- movq rax, mm0
-
- pop rdi
- pop rsi
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/vp9/encoder/x86/vp9_error_sse2.asm b/vp9/encoder/x86/vp9_error_sse2.asm
new file mode 100644
index 000000000..bb1ea71b9
--- /dev/null
+++ b/vp9/encoder/x86/vp9_error_sse2.asm
@@ -0,0 +1,57 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; int64_t vp9_block_error(int16_t *coeff, int16_t *dqcoeff, intptr_t block_size)
+
+INIT_XMM sse2
+cglobal block_error, 3, 3, 6, uqc, dqc, size
+ pxor m4, m4 ; accumulator
+ pxor m5, m5 ; dedicated zero register
+ lea uqcq, [uqcq+sizeq*2]
+ lea dqcq, [dqcq+sizeq*2]
+ neg sizeq
+.loop:
+ mova m0, [uqcq+sizeq*2]
+ mova m2, [dqcq+sizeq*2]
+ mova m1, [uqcq+sizeq*2+mmsize]
+ mova m3, [dqcq+sizeq*2+mmsize]
+ psubw m0, m2
+ psubw m1, m3
+ ; individual errors are max. 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ ; accumulate in 64bit
+ punpckldq m2, m0, m5
+ punpckhdq m0, m5
+ punpckldq m3, m1, m5
+ punpckhdq m1, m5
+ paddq m4, m2
+ paddq m4, m0
+ paddq m4, m3
+ paddq m4, m1
+ add sizeq, mmsize
+ jl .loop
+
+ ; accumulate horizontally and store in return value
+ movhlps m5, m4
+ paddq m4, m5
+%if ARCH_X86_64
+ movq rax, m4
+%else
+ pshufd m5, m4, 0x1
+ movd eax, m4
+ movd edx, m5
+%endif
+ RET
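
The new kernel replaces the deleted MMX/XMM block_error, which truncated its result to a 32-bit int; this one accumulates in 64 bits, matching the vp9_block_error_c signature change in vp9_rdopt.c. Per iteration it squares word differences with pmaddwd (each dword then holds the sum of two squares, bounded as the inline comment notes) and widens before accumulating. In scalar terms it computes:

int64_t block_error_ref(const int16_t *coeff, const int16_t *dqcoeff,
                        intptr_t block_size) {
  int64_t sse = 0;
  intptr_t i;
  for (i = 0; i < block_size; i++) {
    const int diff = coeff[i] - dqcoeff[i];
    sse += (int64_t)diff * diff;  // 64-bit accumulation, no overflow
  }
  return sse;  // the x86-32 path returns this in edx:eax
}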
diff --git a/vp9/encoder/x86/vp9_subpel_variance.asm b/vp9/encoder/x86/vp9_subpel_variance.asm
new file mode 100644
index 000000000..19e2feb57
--- /dev/null
+++ b/vp9/encoder/x86/vp9_subpel_variance.asm
@@ -0,0 +1,1288 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION_RODATA
+pw_8: times 8 dw 8
+bilin_filter_m_sse2: times 8 dw 16
+ times 8 dw 0
+ times 8 dw 15
+ times 8 dw 1
+ times 8 dw 14
+ times 8 dw 2
+ times 8 dw 13
+ times 8 dw 3
+ times 8 dw 12
+ times 8 dw 4
+ times 8 dw 11
+ times 8 dw 5
+ times 8 dw 10
+ times 8 dw 6
+ times 8 dw 9
+ times 8 dw 7
+ times 16 dw 8
+ times 8 dw 7
+ times 8 dw 9
+ times 8 dw 6
+ times 8 dw 10
+ times 8 dw 5
+ times 8 dw 11
+ times 8 dw 4
+ times 8 dw 12
+ times 8 dw 3
+ times 8 dw 13
+ times 8 dw 2
+ times 8 dw 14
+ times 8 dw 1
+ times 8 dw 15
+
+bilin_filter_m_ssse3: times 8 db 16, 0
+ times 8 db 15, 1
+ times 8 db 14, 2
+ times 8 db 13, 3
+ times 8 db 12, 4
+ times 8 db 11, 5
+ times 8 db 10, 6
+ times 8 db 9, 7
+ times 16 db 8
+ times 8 db 7, 9
+ times 8 db 6, 10
+ times 8 db 5, 11
+ times 8 db 4, 12
+ times 8 db 3, 13
+ times 8 db 2, 14
+ times 8 db 1, 15
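+; Note on the tables above: each entry holds the two bilinear taps
+; (16 - o, o) for one of the 16 sub-pel phases o = 0..15. The taps sum to
+; 16, which is why pw_8 is the rounding constant and filtered results are
+; shifted right by 4 below. The ssse3 layout interleaves the taps byte-wise
+; so pmaddubsw applies both in one instruction (hence the smaller
+; filter_idx_shift).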
+
+SECTION .text
+
+; int vp9_sub_pixel_varianceNxh(const uint8_t *src, ptrdiff_t src_stride,
+; int x_offset, int y_offset,
+; const uint8_t *dst, ptrdiff_t dst_stride,
+; int height, unsigned int *sse);
+;
+; This function returns the sum of errors (SE) and stores the SSE in the
+; given pointer.
+
+%macro SUM_SSE 6 ; src1, dst1, src2, dst2, sum, sse
+ psubw %3, %4
+ psubw %1, %2
+ paddw %5, %3
+ pmaddwd %3, %3
+ paddw %5, %1
+ pmaddwd %1, %1
+ paddd %6, %3
+ paddd %6, %1
+%endmacro
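+; In scalar terms, per 16-bit lane: diff = src - dst; sum += diff;
+; sse += diff * diff. pmaddwd squares and pair-adds adjacent words, so sum
+; stays a signed word per lane while sse accumulates in dword lanes;
+; STORE_AND_RET below sign-extends sum and reduces both horizontally.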
+
+%macro STORE_AND_RET 0
+%if mmsize == 16
+ ; if H=64 and W=16, we have 8 words of each 2(1bit)x64(6bit)x9bit=16bit
+ ; in m6, i.e. it _exactly_ fits in a signed word per word in the xmm reg.
+ ; We have to sign-extend it before adding the words within the register
+ ; and outputting to a dword.
+ pcmpgtw m5, m6 ; mask for 0 > x
+ movhlps m3, m7
+ punpcklwd m4, m6, m5
+ punpckhwd m6, m5 ; sign-extend m6 word->dword
+ paddd m7, m3
+ paddd m6, m4
+ pshufd m3, m7, 0x1
+ movhlps m4, m6
+ paddd m7, m3
+ paddd m6, m4
+ mov r1, ssem ; r1 = unsigned int *sse
+ pshufd m4, m6, 0x1
+ movd [r1], m7 ; store sse
+ paddd m6, m4
+ movd rax, m6 ; store sum as return value
+%else ; mmsize == 8
+ pshufw m4, m6, 0xe
+ pshufw m3, m7, 0xe
+ paddw m6, m4
+ paddd m7, m3
+ pcmpgtw m5, m6 ; mask for 0 > x
+ mov r1, ssem ; r1 = unsigned int *sse
+ punpcklwd m6, m5 ; sign-extend m6 word->dword
+ movd [r1], m7 ; store sse
+ pshufw m4, m6, 0xe
+ paddd m6, m4
+ movd rax, m6 ; store sum as return value
+%endif
+ RET
+%endmacro
+
+%macro SUBPEL_VARIANCE 1-2 0 ; W
+%if cpuflag(ssse3)
+%define bilin_filter_m bilin_filter_m_ssse3
+%define filter_idx_shift 4
+%else
+%define bilin_filter_m bilin_filter_m_sse2
+%define filter_idx_shift 5
+%endif
+; FIXME(rbultje) only bilinear filters use >8 registers, and ssse3 only uses
+; 11, not 13, if the registers are ordered correctly. May make a minor speed
+; difference on Win64
+%ifdef PIC
+%if %2 == 1 ; avg
+cglobal sub_pixel_avg_variance%1xh, 9, 10, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, height, sse
+%define sec_str sec_strideq
+%else
+cglobal sub_pixel_variance%1xh, 7, 8, 13, src, src_stride, x_offset, y_offset, \
+ dst, dst_stride, height, sse
+%endif
+%define h heightd
+%define bilin_filter sseq
+%else
+%if %2 == 1 ; avg
+cglobal sub_pixel_avg_variance%1xh, 7 + 2 * ARCH_X86_64, \
+ 7 + 2 * ARCH_X86_64, 13, src, src_stride, \
+ x_offset, y_offset, \
+ dst, dst_stride, \
+ sec, sec_stride, \
+ height, sse
+%if ARCH_X86_64
+%define h heightd
+%define sec_str sec_strideq
+%else
+%define h dword heightm
+%define sec_str sec_stridemp
+%endif
+%else
+cglobal sub_pixel_variance%1xh, 7, 7, 13, src, src_stride, x_offset, y_offset, \
+ dst, dst_stride, height, sse
+%define h heightd
+%endif
+%define bilin_filter bilin_filter_m
+%endif
+ ASSERT %1 <= 16 ; m6 overflows if w > 16
+ pxor m6, m6 ; sum
+ pxor m7, m7 ; sse
+ ; FIXME(rbultje) if both filters are bilinear, we don't actually use m5; we
+ ; could perhaps use it for something more productive then
+ pxor m5, m5 ; dedicated zero register
+%if %1 < 16
+ sar h, 1
+%if %2 == 1 ; avg
+ shl sec_str, 1
+%endif
+%endif
+
+ ; FIXME(rbultje) replace by jumptable?
+ test x_offsetd, x_offsetd
+ jnz .x_nonzero
+ ; x_offset == 0
+ test y_offsetd, y_offsetd
+ jnz .x_zero_y_nonzero
+
+ ; x_offset == 0 && y_offset == 0
+.x_zero_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ mova m1, [dstq]
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%if %2 == 0 ; !avg
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+%if %2 == 1 ; avg
+%if mmsize == 16
+ movhps m0, [srcq+src_strideq]
+%else ; mmsize == 8
+ punpckldq m0, [srcq+src_strideq]
+%endif
+%else ; !avg
+ movh m2, [srcq+src_strideq]
+%endif
+ movh m1, [dstq]
+ movh m3, [dstq+dst_strideq]
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; !avg
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_zero_y_zero_loop
+ STORE_AND_RET
+
+.x_zero_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_zero_y_nonhalf
+
+ ; x_offset == 0 && y_offset == 0.5
+.x_zero_y_half_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [dstq]
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m2, [srcq+src_strideq]
+%if %2 == 1 ; avg
+%if mmsize == 16
+ movhps m2, [srcq+src_strideq*2]
+%else ; mmsize == 8
+ punpckldq m2, [srcq+src_strideq*2]
+%endif
+ movh m1, [dstq]
+%if mmsize == 16
+ movlhps m0, m2
+%else ; mmsize == 8
+ punpckldq m0, m2
+%endif
+ movh m3, [dstq+dst_strideq]
+ pavgb m0, m2
+ punpcklbw m1, m5
+ pavgb m0, [secq]
+ punpcklbw m3, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; !avg
+ movh m4, [srcq+src_strideq*2]
+ movh m1, [dstq]
+ pavgb m0, m2
+ movh m3, [dstq+dst_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_zero_y_half_loop
+ STORE_AND_RET
+
+.x_zero_y_nonhalf:
+ ; x_offset == 0 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else ; x86-32 or mmx
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+.x_zero_y_other_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+src_strideq]
+ mova m1, [dstq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ ; FIXME(rbultje) instead of out=((num-x)*in1+x*in2+rnd)>>log2(num), we can
+ ; also do out=in1+(((num-x)*(in2-in1)+rnd)>>log2(num)). Total number of
+ ; instructions is the same (5), but it is 1 mul instead of 2, so might be
+ ; slightly faster because of pmullw latency. It would also cut our rodata
+ ; tables in half for this function, and save 1-2 registers on x86-64.
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m2, [srcq+src_strideq]
+ movh m4, [srcq+src_strideq*2]
+ movh m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ movh m1, [dstq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m4, filter_y_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movh m1, [dstq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_zero_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonzero:
+ cmp x_offsetd, 8
+ jne .x_nonhalf
+ ; x_offset == 0.5
+ test y_offsetd, y_offsetd
+ jnz .x_half_y_nonzero
+
+ ; x_offset == 0.5 && y_offset == 0
+.x_half_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1]
+ mova m1, [dstq]
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+%if %2 == 1 ; avg
+ pavgb m0, [secq]
+%endif
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m4, [srcq+1]
+%if %2 == 1 ; avg
+%if mmsize == 16
+ movhps m0, [srcq+src_strideq]
+ movhps m4, [srcq+src_strideq+1]
+%else ; mmsize == 8
+ punpckldq m0, [srcq+src_strideq]
+ punpckldq m4, [srcq+src_strideq+1]
+%endif
+ movh m1, [dstq]
+ movh m3, [dstq+dst_strideq]
+ pavgb m0, m4
+ punpcklbw m3, m5
+ pavgb m0, [secq]
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; !avg
+ movh m2, [srcq+src_strideq]
+ movh m1, [dstq]
+ pavgb m0, m4
+ movh m4, [srcq+src_strideq+1]
+ movh m3, [dstq+dst_strideq]
+ pavgb m2, m4
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_half_y_zero_loop
+ STORE_AND_RET
+
+.x_half_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_half_y_nonhalf
+
+ ; x_offset == 0.5 && y_offset == 0.5
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [dstq]
+ pavgb m4, m3
+ punpckhbw m3, m1, m5
+ pavgb m0, m4
+%if %2 == 1 ; avg
+ punpcklbw m1, m5
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_half_loop:
+ movh m2, [srcq]
+ movh m3, [srcq+1]
+%if %2 == 1 ; avg
+%if mmsize == 16
+ movhps m2, [srcq+src_strideq]
+ movhps m3, [srcq+src_strideq+1]
+%else
+ punpckldq m2, [srcq+src_strideq]
+ punpckldq m3, [srcq+src_strideq+1]
+%endif
+ pavgb m2, m3
+%if mmsize == 16
+ movlhps m0, m2
+ movhlps m4, m2
+%else ; mmsize == 8
+ punpckldq m0, m2
+ pshufw m4, m2, 0xe
+%endif
+ movh m1, [dstq]
+ pavgb m0, m2
+ movh m3, [dstq+dst_strideq]
+ pavgb m0, [secq]
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%else ; !avg
+ movh m4, [srcq+src_strideq]
+ movh m1, [srcq+src_strideq+1]
+ pavgb m2, m3
+ pavgb m4, m1
+ pavgb m0, m2
+ pavgb m2, m4
+ movh m1, [dstq]
+ movh m3, [dstq+dst_strideq]
+ punpcklbw m0, m5
+ punpcklbw m2, m5
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_half_y_half_loop
+ STORE_AND_RET
+
+.x_half_y_nonhalf:
+ ; x_offset == 0.5 && y_offset == bilin interpolation
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+y_offsetq+16]
+%endif
+ mova m10, [pw_8]
+%define filter_y_a m8
+%define filter_y_b m9
+%define filter_rnd m10
+%else
+ add y_offsetq, bilin_filter
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%if %1 == 16
+ movu m0, [srcq]
+ movu m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+.x_half_y_other_loop:
+ movu m4, [srcq]
+ movu m2, [srcq+1]
+ mova m1, [dstq]
+ pavgb m4, m2
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ pmullw m2, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, filter_rnd
+ punpcklbw m0, m5
+ paddw m2, m3
+ punpcklbw m3, m4, m5
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+%endif
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m3, [srcq+1]
+ add srcq, src_strideq
+ pavgb m0, m3
+%if notcpuflag(ssse3)
+ punpcklbw m0, m5
+%endif
+.x_half_y_other_loop:
+ movh m2, [srcq]
+ movh m1, [srcq+1]
+ movh m4, [srcq+src_strideq]
+ movh m3, [srcq+src_strideq+1]
+ pavgb m2, m1
+ pavgb m4, m3
+ movh m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ movh m1, [dstq]
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_y_a
+ pmullw m1, m2, filter_y_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ paddw m0, m1
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m2, m1
+ movh m1, [dstq]
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_half_y_other_loop
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf:
+ test y_offsetd, y_offsetd
+ jnz .x_nonhalf_y_nonzero
+
+ ; x_offset == bilin interpolation && y_offset == 0
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+.x_other_y_zero_loop:
+%if %1 == 16
+ movu m0, [srcq]
+ movu m4, [srcq+1]
+ mova m1, [dstq]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m4, m5
+ punpcklbw m0, m5
+ punpcklbw m4, m5
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, filter_rnd
+ paddw m2, m3
+ paddw m0, m4
+%endif
+ psraw m2, 4
+ psraw m0, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m1, [srcq+1]
+ movh m2, [srcq+src_strideq]
+ movh m4, [srcq+src_strideq+1]
+ movh m3, [dstq+dst_strideq]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ movh m1, [dstq]
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_x_a
+ pmaddubsw m2, filter_x_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ punpcklbw m2, m5
+ punpcklbw m4, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m0, m1
+ paddw m2, filter_rnd
+ movh m1, [dstq]
+ paddw m2, m4
+%endif
+ psraw m0, 4
+ psraw m2, 4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_other_y_zero_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonzero:
+ cmp y_offsetd, 8
+ jne .x_nonhalf_y_nonhalf
+
+ ; x_offset == bilin interpolation && y_offset == 0.5
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_rnd m10
+%else
+ add x_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+ add srcq, src_strideq
+ packuswb m0, m2
+.x_other_y_half_loop:
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+%if cpuflag(ssse3)
+ mova m1, [dstq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ pavgb m0, m4
+ punpckhbw m3, m1, m5
+ punpcklbw m1, m5
+%else
+ punpckhbw m2, m4, m5
+ punpckhbw m1, m3, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ paddw m4, m3
+ paddw m2, m1
+ mova m1, [dstq]
+ psraw m4, 4
+ psraw m2, 4
+ punpckhbw m3, m1, m5
+ ; FIXME(rbultje) the repeated pack/unpack around m0/m2 happens because we
+ ; are one register short of keeping the bilin-filtered second line as
+ ; words for reuse on the next iteration. Packing it into bytes costs one
+ ; pack and two unpacks, but saves a register.
+ packuswb m4, m2
+ punpcklbw m1, m5
+ pavgb m0, m4
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ pavgb m0, [secq]
+%endif
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ add srcq, src_strideq
+ psraw m0, 4
+.x_other_y_half_loop:
+ movh m2, [srcq]
+ movh m1, [srcq+1]
+ movh m4, [srcq+src_strideq]
+ movh m3, [srcq+src_strideq+1]
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movh m1, [dstq]
+ movh m3, [dstq+dst_strideq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ movh m1, [dstq]
+ paddw m4, m3
+ movh m3, [dstq+dst_strideq]
+%endif
+ psraw m2, 4
+ psraw m4, 4
+ pavgw m0, m2
+ pavgw m2, m4
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline - also consider going to bytes here
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_other_y_half_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_rnd
+ STORE_AND_RET
+
+.x_nonhalf_y_nonhalf:
+%ifdef PIC
+ lea bilin_filter, [bilin_filter_m]
+%endif
+ shl x_offsetd, filter_idx_shift
+ shl y_offsetd, filter_idx_shift
+%if ARCH_X86_64 && mmsize == 16
+ mova m8, [bilin_filter+x_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m9, [bilin_filter+x_offsetq+16]
+%endif
+ mova m10, [bilin_filter+y_offsetq]
+%if notcpuflag(ssse3) ; FIXME(rbultje) don't scatter registers on x86-64
+ mova m11, [bilin_filter+y_offsetq+16]
+%endif
+ mova m12, [pw_8]
+%define filter_x_a m8
+%define filter_x_b m9
+%define filter_y_a m10
+%define filter_y_b m11
+%define filter_rnd m12
+%else
+ add x_offsetq, bilin_filter
+ add y_offsetq, bilin_filter
+%define filter_x_a [x_offsetq]
+%define filter_x_b [x_offsetq+16]
+%define filter_y_a [y_offsetq]
+%define filter_y_b [y_offsetq+16]
+%define filter_rnd [pw_8]
+%endif
+ ; x_offset == bilin interpolation && y_offset == bilin interpolation
+%if %1 == 16
+ movu m0, [srcq]
+ movu m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpckhbw m2, m0, m1
+ punpcklbw m0, m1
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m0, filter_x_a
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+%else
+ punpckhbw m2, m0, m5
+ punpckhbw m3, m1, m5
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m2, filter_rnd
+ paddw m0, m1
+ paddw m2, m3
+%endif
+ psraw m0, 4
+ psraw m2, 4
+ add srcq, src_strideq
+ packuswb m0, m2
+.x_other_y_other_loop:
+%if cpuflag(ssse3)
+ movu m4, [srcq]
+ movu m3, [srcq+1]
+ mova m1, [dstq]
+ punpckhbw m2, m4, m3
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ punpckhbw m3, m1, m5
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m4, m2
+ punpckhbw m2, m0, m4
+ punpcklbw m0, m4
+ pmaddubsw m2, filter_y_a
+ pmaddubsw m0, filter_y_a
+ punpcklbw m1, m5
+ paddw m2, filter_rnd
+ paddw m0, filter_rnd
+ psraw m2, 4
+ psraw m0, 4
+%else
+ movu m3, [srcq]
+ movu m4, [srcq+1]
+ punpckhbw m1, m3, m5
+ punpckhbw m2, m4, m5
+ punpcklbw m3, m5
+ punpcklbw m4, m5
+ pmullw m3, filter_x_a
+ pmullw m4, filter_x_b
+ paddw m3, filter_rnd
+ pmullw m1, filter_x_a
+ pmullw m2, filter_x_b
+ paddw m1, filter_rnd
+ paddw m3, m4
+ paddw m1, m2
+ psraw m3, 4
+ psraw m1, 4
+ packuswb m4, m3, m1
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+ pmullw m2, filter_y_a
+ pmullw m1, filter_y_b
+ paddw m2, filter_rnd
+ pmullw m0, filter_y_a
+ pmullw m3, filter_y_b
+ paddw m2, m1
+ mova m1, [dstq]
+ paddw m0, filter_rnd
+ psraw m2, 4
+ paddw m0, m3
+ punpckhbw m3, m1, m5
+ psraw m0, 4
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ add srcq, src_strideq
+ add dstq, dst_strideq
+%else ; %1 < 16
+ movh m0, [srcq]
+ movh m1, [srcq+1]
+%if cpuflag(ssse3)
+ punpcklbw m0, m1
+ pmaddubsw m0, filter_x_a
+ paddw m0, filter_rnd
+%else
+ punpcklbw m0, m5
+ punpcklbw m1, m5
+ pmullw m0, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m0, filter_rnd
+ paddw m0, m1
+%endif
+ psraw m0, 4
+%if cpuflag(ssse3)
+ packuswb m0, m0
+%endif
+ add srcq, src_strideq
+.x_other_y_other_loop:
+ movh m2, [srcq]
+ movh m1, [srcq+1]
+ movh m4, [srcq+src_strideq]
+ movh m3, [srcq+src_strideq+1]
+%if cpuflag(ssse3)
+ punpcklbw m2, m1
+ punpcklbw m4, m3
+ pmaddubsw m2, filter_x_a
+ pmaddubsw m4, filter_x_a
+ movh m3, [dstq+dst_strideq]
+ movh m1, [dstq]
+ paddw m2, filter_rnd
+ paddw m4, filter_rnd
+ psraw m2, 4
+ psraw m4, 4
+ packuswb m2, m2
+ packuswb m4, m4
+ punpcklbw m0, m2
+ punpcklbw m2, m4
+ pmaddubsw m0, filter_y_a
+ pmaddubsw m2, filter_y_a
+ punpcklbw m3, m5
+ paddw m0, filter_rnd
+ paddw m2, filter_rnd
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m1, m5
+%else
+ punpcklbw m2, m5
+ punpcklbw m1, m5
+ punpcklbw m4, m5
+ punpcklbw m3, m5
+ pmullw m2, filter_x_a
+ pmullw m1, filter_x_b
+ paddw m2, filter_rnd
+ pmullw m4, filter_x_a
+ pmullw m3, filter_x_b
+ paddw m4, filter_rnd
+ paddw m2, m1
+ paddw m4, m3
+ psraw m2, 4
+ psraw m4, 4
+ pmullw m0, filter_y_a
+ pmullw m3, m2, filter_y_b
+ paddw m0, filter_rnd
+ pmullw m2, filter_y_a
+ pmullw m1, m4, filter_y_b
+ paddw m2, filter_rnd
+ paddw m0, m3
+ movh m3, [dstq+dst_strideq]
+ paddw m2, m1
+ movh m1, [dstq]
+ psraw m0, 4
+ psraw m2, 4
+ punpcklbw m3, m5
+ punpcklbw m1, m5
+%endif
+%if %2 == 1 ; avg
+ ; FIXME(rbultje) pipeline
+ packuswb m0, m2
+ pavgb m0, [secq]
+ punpckhbw m2, m0, m5
+ punpcklbw m0, m5
+%endif
+ SUM_SSE m0, m1, m2, m3, m6, m7
+ mova m0, m4
+
+ lea srcq, [srcq+src_strideq*2]
+ lea dstq, [dstq+dst_strideq*2]
+%endif
+%if %2 == 1 ; avg
+ add secq, sec_str
+%endif
+ dec h
+ jg .x_other_y_other_loop
+%undef filter_x_a
+%undef filter_x_b
+%undef filter_y_a
+%undef filter_y_b
+%undef filter_rnd
+ STORE_AND_RET
+%endmacro
+
+; FIXME(rbultje) the non-bilinear versions (i.e. x=0,8 && y=0,8) are
+; identical between the ssse3 and non-ssse3 versions. It may make sense to
+; merge them by having the ssse3 version jump to the matching location in
+; the sse2 version instead of duplicating that code in the binary.
+
+INIT_MMX sse
+SUBPEL_VARIANCE 4
+INIT_XMM sse2
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_MMX ssse3
+SUBPEL_VARIANCE 4
+INIT_XMM ssse3
+SUBPEL_VARIANCE 8
+SUBPEL_VARIANCE 16
+
+INIT_MMX sse
+SUBPEL_VARIANCE 4, 1
+INIT_XMM sse2
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
+
+INIT_MMX ssse3
+SUBPEL_VARIANCE 4, 1
+INIT_XMM ssse3
+SUBPEL_VARIANCE 8, 1
+SUBPEL_VARIANCE 16, 1
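
The macro is stamped out once per block width (4 using MMX registers, 8 and
16 using XMM), and a second time with the trailing "1" argument for the
compound-average variant, which pavgb's the filtered prediction with a second
predictor (secq, advanced by sec_str) before accumulating. As a scalar model,
the whole family computes the following sketch; the function names and the
(16 - f, f) tap layout are assumptions read off the rounding constant 8 and
shift 4 in the asm, not taken from the filter table itself:

/* Minimal scalar sketch of SUBPEL_VARIANCE w (non-avg form). */
static unsigned char bilin(int a, int b, int f) {
  return (unsigned char)((a * (16 - f) + b * f + 8) >> 4);
}

static unsigned int subpel_variance_ref(const unsigned char *src,
                                        int src_stride, int x_off, int y_off,
                                        const unsigned char *dst,
                                        int dst_stride, int w, int h,
                                        unsigned int *sse) {
  int r, c, sum = 0;
  unsigned int sse_acc = 0;
  for (r = 0; r < h; r++) {
    for (c = 0; c < w; c++) {
      /* horizontal pass on two adjacent rows, then vertical pass */
      int t0 = bilin(src[r * src_stride + c],
                     src[r * src_stride + c + 1], x_off);
      int t1 = bilin(src[(r + 1) * src_stride + c],
                     src[(r + 1) * src_stride + c + 1], x_off);
      int d = bilin(t0, t1, y_off) - dst[r * dst_stride + c];
      sum += d;
      sse_acc += d * d;
    }
  }
  *sse = sse_acc;
  return sse_acc - (unsigned int)(((long long)sum * sum) / (w * h));
}

The dedicated half-pel paths use pavgb, which is exactly the f == 8 case of
this formula ((a + b + 1) >> 1), and the >> 4 / >> 6 / >> 7 / >> 8 shifts in
the C wrappers removed further down encode the sum*sum / (w*h) division for
each block size.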
diff --git a/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm b/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm
index 8a2a471f5..2ecc23e55 100644
--- a/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm
+++ b/vp9/encoder/x86/vp9_subpel_variance_impl_sse2.asm
@@ -8,292 +8,8 @@
; be found in the AUTHORS file in the root of the source tree.
;
-
%include "vpx_ports/x86_abi_support.asm"
-%define xmm_filter_shift 7
-
-;void vp9_filter_block2d_bil_var_sse2
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int xoffset,
-; int yoffset,
-; int *sum,
-; unsigned int *sumsquared
-;
-;)
-global sym(vp9_filter_block2d_bil_var_sse2) PRIVATE
-sym(vp9_filter_block2d_bil_var_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- push rbx
- ; end prolog
-
- pxor xmm6, xmm6 ;
- pxor xmm7, xmm7 ;
-
- lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
- movdqa xmm4, XMMWORD PTR [rsi]
-
- lea rcx, [GLOBAL(bilinear_filters_sse2)]
- movsxd rax, dword ptr arg(5) ; xoffset
-
- cmp rax, 0 ; skip first_pass filter if xoffset=0
- je filter_block2d_bil_var_sse2_sp_only
-
- shl rax, 5 ; point to filter coeff with xoffset
- lea rax, [rax + rcx] ; HFilter
-
- movsxd rdx, dword ptr arg(6) ; yoffset
-
- cmp rdx, 0 ; skip second_pass filter if yoffset=0
- je filter_block2d_bil_var_sse2_fp_only
-
- shl rdx, 5
- lea rdx, [rdx + rcx] ; VFilter
-
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
-
- pxor xmm0, xmm0 ;
- movq xmm1, QWORD PTR [rsi] ;
- movq xmm3, QWORD PTR [rsi+1] ;
-
- punpcklbw xmm1, xmm0 ;
- pmullw xmm1, [rax] ;
- punpcklbw xmm3, xmm0
- pmullw xmm3, [rax+16] ;
-
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
- movdqa xmm5, xmm1
-
- movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
- lea rsi, [rsi + rbx]
-%if ABI_IS_32BIT=0
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-filter_block2d_bil_var_sse2_loop:
- movq xmm1, QWORD PTR [rsi] ;
- movq xmm3, QWORD PTR [rsi+1] ;
-
- punpcklbw xmm1, xmm0 ;
- pmullw xmm1, [rax] ;
- punpcklbw xmm3, xmm0 ;
- pmullw xmm3, [rax+16] ;
-
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
-
- movdqa xmm3, xmm5 ;
- movdqa xmm5, xmm1 ;
-
- pmullw xmm3, [rdx] ;
- pmullw xmm1, [rdx+16] ;
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
-
- movq xmm3, QWORD PTR [rdi] ;
- punpcklbw xmm3, xmm0 ;
-
- psubw xmm1, xmm3 ;
- paddw xmm6, xmm1 ;
-
- pmaddwd xmm1, xmm1 ;
- paddd xmm7, xmm1 ;
-
- lea rsi, [rsi + rbx] ;ref_pixels_per_line
-%if ABI_IS_32BIT
- add rdi, dword ptr arg(3) ;src_pixels_per_line
-%else
- lea rdi, [rdi + r9]
-%endif
-
- sub rcx, 1 ;
- jnz filter_block2d_bil_var_sse2_loop ;
-
- jmp filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_sp_only:
- movsxd rdx, dword ptr arg(6) ; yoffset
-
- cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
- je filter_block2d_bil_var_sse2_full_pixel
-
- shl rdx, 5
- lea rdx, [rdx + rcx] ; VFilter
-
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0 ;
- movq xmm1, QWORD PTR [rsi] ;
- punpcklbw xmm1, xmm0 ;
-
- movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
- lea rsi, [rsi + rax]
-
-filter_block2d_bil_sp_only_loop:
- movq xmm3, QWORD PTR [rsi] ;
- punpcklbw xmm3, xmm0 ;
- movdqa xmm5, xmm3
-
- pmullw xmm1, [rdx] ;
- pmullw xmm3, [rdx+16] ;
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
-
- movq xmm3, QWORD PTR [rdi] ;
- punpcklbw xmm3, xmm0 ;
-
- psubw xmm1, xmm3 ;
- paddw xmm6, xmm1 ;
-
- pmaddwd xmm1, xmm1 ;
- paddd xmm7, xmm1 ;
-
- movdqa xmm1, xmm5 ;
- lea rsi, [rsi + rax] ;ref_pixels_per_line
- lea rdi, [rdi + rbx] ;src_pixels_per_line
-
- sub rcx, 1 ;
- jnz filter_block2d_bil_sp_only_loop ;
-
- jmp filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_full_pixel:
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
- movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
- pxor xmm0, xmm0 ;
-
-filter_block2d_bil_full_pixel_loop:
- movq xmm1, QWORD PTR [rsi] ;
- punpcklbw xmm1, xmm0 ;
-
- movq xmm2, QWORD PTR [rdi] ;
- punpcklbw xmm2, xmm0 ;
-
- psubw xmm1, xmm2 ;
- paddw xmm6, xmm1 ;
-
- pmaddwd xmm1, xmm1 ;
- paddd xmm7, xmm1 ;
-
- lea rsi, [rsi + rax] ;ref_pixels_per_line
- lea rdi, [rdi + rbx] ;src_pixels_per_line
-
- sub rcx, 1 ;
- jnz filter_block2d_bil_full_pixel_loop ;
-
- jmp filter_block2d_bil_variance
-
-filter_block2d_bil_var_sse2_fp_only:
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0 ;
- movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
-
-filter_block2d_bil_fp_only_loop:
- movq xmm1, QWORD PTR [rsi] ;
- movq xmm3, QWORD PTR [rsi+1] ;
-
- punpcklbw xmm1, xmm0 ;
- pmullw xmm1, [rax] ;
- punpcklbw xmm3, xmm0 ;
- pmullw xmm3, [rax+16] ;
-
- paddw xmm1, xmm3 ;
- paddw xmm1, xmm4 ;
- psraw xmm1, xmm_filter_shift ;
-
- movq xmm3, QWORD PTR [rdi] ;
- punpcklbw xmm3, xmm0 ;
-
- psubw xmm1, xmm3 ;
- paddw xmm6, xmm1 ;
-
- pmaddwd xmm1, xmm1 ;
- paddd xmm7, xmm1 ;
- lea rsi, [rsi + rdx]
- lea rdi, [rdi + rbx] ;src_pixels_per_line
-
- sub rcx, 1 ;
- jnz filter_block2d_bil_fp_only_loop ;
-
- jmp filter_block2d_bil_variance
-
-filter_block2d_bil_variance:
- movdq2q mm6, xmm6 ;
- movdq2q mm7, xmm7 ;
-
- psrldq xmm6, 8
- psrldq xmm7, 8
-
- movdq2q mm2, xmm6
- movdq2q mm3, xmm7
-
- paddw mm6, mm2
- paddd mm7, mm3
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rsi, arg(7) ; sum
- mov rdi, arg(8) ; sumsquared
-
- movd [rsi], mm2 ; xsum
- movd [rdi], mm4 ; xxsum
-
- ; begin epilog
- pop rbx
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
;void vp9_half_horiz_vert_variance16x_h_sse2
;(
; unsigned char *ref_ptr,
@@ -619,27 +335,3 @@ sym(vp9_half_horiz_variance16x_h_sse2):
UNSHADOW_ARGS
pop rbp
ret
-
-SECTION_RODATA
-; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
-align 16
-xmm_bi_rd:
- times 8 dw 64
-align 16
-bilinear_filters_sse2:
- dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
- dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8
- dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
- dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
- dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
- dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
- dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
- dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
- dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
- dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
- dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
- dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
- dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
- dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
- dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120
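
The removed SSE2 path kept 7-bit taps: each row above splats the pair
(128 - 8*off, 8*off) across eight words, applied with the rounding constant
64 (xmm_bi_rd) and shift 7 (xmm_filter_shift). Per pixel, as a sketch with an
illustrative name:

/* Per-pixel math of the deleted SSE2 bilinear filter; off is the subpel
 * step 0..15 selecting one table row. */
static int old_bilin_sse2(int a, int b, int off) {
  int f1 = off * 8;                      /* second tap: 0, 8, ..., 120 */
  return (a * (128 - f1) + b * f1 + 64) >> 7;
}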
diff --git a/vp9/encoder/x86/vp9_subtract_mmx.asm b/vp9/encoder/x86/vp9_subtract_mmx.asm
deleted file mode 100644
index e9eda4fed..000000000
--- a/vp9/encoder/x86/vp9_subtract_mmx.asm
+++ /dev/null
@@ -1,432 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_subtract_b_mmx_impl(unsigned char *z, int src_stride,
-; short *diff, unsigned char *Predictor,
-; int pitch);
-global sym(vp9_subtract_b_mmx_impl) PRIVATE
-sym(vp9_subtract_b_mmx_impl):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
-
- mov rdi, arg(2) ;diff
- mov rax, arg(3) ;Predictor
- mov rsi, arg(0) ;z
- movsxd rdx, dword ptr arg(1);src_stride;
- movsxd rcx, dword ptr arg(4);pitch
- pxor mm7, mm7
-
- movd mm0, [rsi]
- movd mm1, [rax]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq [rdi], mm0
-
-
- movd mm0, [rsi+rdx]
- movd mm1, [rax+rcx]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq [rdi+rcx*2],mm0
-
-
- movd mm0, [rsi+rdx*2]
- movd mm1, [rax+rcx*2]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq [rdi+rcx*4], mm0
-
- lea rsi, [rsi+rdx*2]
- lea rcx, [rcx+rcx*2]
-
-
-
- movd mm0, [rsi+rdx]
- movd mm1, [rax+rcx]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq [rdi+rcx*2], mm0
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
-
-;void vp9_subtract_mby_mmx(short *diff, unsigned char *src, unsigned char *pred, int stride)
-global sym(vp9_subtract_mby_mmx) PRIVATE
-sym(vp9_subtract_mby_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- push rsi
- push rdi
- ; end prolog
-
-
- mov rsi, arg(1) ;src
- mov rdi, arg(0) ;diff
-
- mov rax, arg(2) ;pred
- movsxd rdx, dword ptr arg(3) ;stride
-
- mov rcx, 16
- pxor mm0, mm0
-
-.submby_loop:
-
- movq mm1, [rsi]
- movq mm3, [rax]
-
- movq mm2, mm1
- movq mm4, mm3
-
- punpcklbw mm1, mm0
- punpcklbw mm3, mm0
-
- punpckhbw mm2, mm0
- punpckhbw mm4, mm0
-
- psubw mm1, mm3
- psubw mm2, mm4
-
- movq [rdi], mm1
- movq [rdi+8], mm2
-
-
- movq mm1, [rsi+8]
- movq mm3, [rax+8]
-
- movq mm2, mm1
- movq mm4, mm3
-
- punpcklbw mm1, mm0
- punpcklbw mm3, mm0
-
- punpckhbw mm2, mm0
- punpckhbw mm4, mm0
-
- psubw mm1, mm3
- psubw mm2, mm4
-
- movq [rdi+16], mm1
- movq [rdi+24], mm2
-
-
- add rdi, 32
- add rax, 16
-
- lea rsi, [rsi+rdx]
-
- sub rcx, 1
- jnz .submby_loop
-
- pop rdi
- pop rsi
- ; begin epilog
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
-global sym(vp9_subtract_mbuv_mmx) PRIVATE
-sym(vp9_subtract_mbuv_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- push rsi
- push rdi
- ; end prolog
-
- ;short *udiff = diff + 256;
- ;short *vdiff = diff + 320;
- ;unsigned char *upred = pred + 256;
- ;unsigned char *vpred = pred + 320;
-
- ;unsigned char *z = usrc;
- ;unsigned short *diff = udiff;
- ;unsigned char *Predictor= upred;
-
- mov rdi, arg(0) ;diff
- mov rax, arg(3) ;pred
- mov rsi, arg(1) ;z = usrc
- add rdi, 256*2 ;diff = diff + 256 (shorts)
- add rax, 256 ;Predictor = pred + 256
- movsxd rdx, dword ptr arg(4) ;stride;
- pxor mm7, mm7
-
- movq mm0, [rsi]
- movq mm1, [rax]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi], mm0
- movq [rdi+8], mm3
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+8]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+16], mm0
- movq [rdi+24], mm3
-
- movq mm0, [rsi+rdx*2]
- movq mm1, [rax+16]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+32], mm0
- movq [rdi+40], mm3
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+24]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
-
- movq [rdi+48], mm0
- movq [rdi+56], mm3
-
-
- add rdi, 64
- add rax, 32
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi]
- movq mm1, [rax]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi], mm0
- movq [rdi+8], mm3
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+8]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+16], mm0
- movq [rdi+24], mm3
-
- movq mm0, [rsi+rdx*2]
- movq mm1, [rax+16]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+32], mm0
- movq [rdi+40], mm3
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+24]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
-
- movq [rdi+48], mm0
- movq [rdi+56], mm3
-
- ;unsigned char *z = vsrc;
- ;unsigned short *diff = vdiff;
- ;unsigned char *Predictor= vpred;
-
- mov rdi, arg(0) ;diff
- mov rax, arg(3) ;pred
- mov rsi, arg(2) ;z = usrc
- add rdi, 320*2 ;diff = diff + 320 (shorts)
- add rax, 320 ;Predictor = pred + 320
- movsxd rdx, dword ptr arg(4) ;stride;
- pxor mm7, mm7
-
- movq mm0, [rsi]
- movq mm1, [rax]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi], mm0
- movq [rdi+8], mm3
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+8]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+16], mm0
- movq [rdi+24], mm3
-
- movq mm0, [rsi+rdx*2]
- movq mm1, [rax+16]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+32], mm0
- movq [rdi+40], mm3
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+24]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
-
- movq [rdi+48], mm0
- movq [rdi+56], mm3
-
-
- add rdi, 64
- add rax, 32
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi]
- movq mm1, [rax]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi], mm0
- movq [rdi+8], mm3
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+8]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+16], mm0
- movq [rdi+24], mm3
-
- movq mm0, [rsi+rdx*2]
- movq mm1, [rax+16]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
- movq [rdi+32], mm0
- movq [rdi+40], mm3
- lea rsi, [rsi+rdx*2]
-
-
- movq mm0, [rsi+rdx]
- movq mm1, [rax+24]
- movq mm3, mm0
- movq mm4, mm1
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- punpckhbw mm3, mm7
- punpckhbw mm4, mm7
- psubw mm0, mm1
- psubw mm3, mm4
-
- movq [rdi+48], mm0
- movq [rdi+56], mm3
-
- ; begin epilog
- pop rdi
- pop rsi
- UNSHADOW_ARGS
- pop rbp
- ret
diff --git a/vp9/encoder/x86/vp9_subtract_sse2.asm b/vp9/encoder/x86/vp9_subtract_sse2.asm
index 739d9487e..982408083 100644
--- a/vp9/encoder/x86/vp9_subtract_sse2.asm
+++ b/vp9/encoder/x86/vp9_subtract_sse2.asm
@@ -8,349 +8,120 @@
; be found in the AUTHORS file in the root of the source tree.
;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-;void vp9_subtract_b_sse2_impl(unsigned char *z, int src_stride,
-; short *diff, unsigned char *Predictor,
-; int pitch);
-global sym(vp9_subtract_b_sse2_impl) PRIVATE
-sym(vp9_subtract_b_sse2_impl):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdi, arg(2) ;diff
- mov rax, arg(3) ;Predictor
- mov rsi, arg(0) ;z
- movsxd rdx, dword ptr arg(1);src_stride;
- movsxd rcx, dword ptr arg(4);pitch
- pxor mm7, mm7
-
- movd mm0, [rsi]
- movd mm1, [rax]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq MMWORD PTR [rdi], mm0
-
- movd mm0, [rsi+rdx]
- movd mm1, [rax+rcx]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq MMWORD PTR [rdi+rcx*2], mm0
-
- movd mm0, [rsi+rdx*2]
- movd mm1, [rax+rcx*2]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq MMWORD PTR [rdi+rcx*4], mm0
-
- lea rsi, [rsi+rdx*2]
- lea rcx, [rcx+rcx*2]
-
- movd mm0, [rsi+rdx]
- movd mm1, [rax+rcx]
- punpcklbw mm0, mm7
- punpcklbw mm1, mm7
- psubw mm0, mm1
- movq MMWORD PTR [rdi+rcx*2], mm0
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_subtract_mby_sse2(short *diff, unsigned char *src, unsigned char *pred, int stride)
-global sym(vp9_subtract_mby_sse2) PRIVATE
-sym(vp9_subtract_mby_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 4
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(1) ;src
- mov rdi, arg(0) ;diff
-
- mov rax, arg(2) ;pred
- movsxd rdx, dword ptr arg(3) ;stride
-
- mov rcx, 8 ; do two lines at one time
-
-.submby_loop:
- movdqa xmm0, XMMWORD PTR [rsi] ; src
- movdqa xmm1, XMMWORD PTR [rax] ; pred
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi +16], xmm2
-
- movdqa xmm4, XMMWORD PTR [rsi + rdx]
- movdqa xmm5, XMMWORD PTR [rax + 16]
-
- movdqa xmm6, xmm4
- psubb xmm4, xmm5
-
- pxor xmm5, [GLOBAL(t80)] ;convert to signed values
- pxor xmm6, [GLOBAL(t80)]
- pcmpgtb xmm5, xmm6 ; obtain sign information
-
- movdqa xmm6, xmm4
- movdqa xmm7, xmm5
- punpcklbw xmm4, xmm5 ; put sign back to subtraction
- punpckhbw xmm6, xmm7 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi +32], xmm4
- movdqa XMMWORD PTR [rdi +48], xmm6
-
- add rdi, 64
- add rax, 32
- lea rsi, [rsi+rdx*2]
-
- sub rcx, 1
- jnz .submby_loop
-
- pop rdi
- pop rsi
- ; begin epilog
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride)
-global sym(vp9_subtract_mbuv_sse2) PRIVATE
-sym(vp9_subtract_mbuv_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 5
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rdi, arg(0) ;diff
- mov rax, arg(3) ;pred
- mov rsi, arg(1) ;z = usrc
- add rdi, 256*2 ;diff = diff + 256 (shorts)
- add rax, 256 ;Predictor = pred + 256
- movsxd rdx, dword ptr arg(4) ;stride;
- lea rcx, [rdx + rdx*2]
-
- ;u
- ;line 0 1
- movq xmm0, MMWORD PTR [rsi] ; src
- movq xmm2, MMWORD PTR [rsi+rdx]
- movdqa xmm1, XMMWORD PTR [rax] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi +16], xmm2
-
- ;line 2 3
- movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
- movq xmm2, MMWORD PTR [rsi+rcx]
- movdqa xmm1, XMMWORD PTR [rax+16] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 32], xmm0
- movdqa XMMWORD PTR [rdi + 48], xmm2
-
- ;line 4 5
- lea rsi, [rsi + rdx*4]
-
- movq xmm0, MMWORD PTR [rsi] ; src
- movq xmm2, MMWORD PTR [rsi+rdx]
- movdqa xmm1, XMMWORD PTR [rax + 32] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 64], xmm0
- movdqa XMMWORD PTR [rdi + 80], xmm2
-
- ;line 6 7
- movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
- movq xmm2, MMWORD PTR [rsi+rcx]
- movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 96], xmm0
- movdqa XMMWORD PTR [rdi + 112], xmm2
-
- ;v
- mov rsi, arg(2) ;z = vsrc
- add rdi, 64*2 ;diff = diff + 320 (shorts)
- add rax, 64 ;Predictor = pred + 320
-
- ;line 0 1
- movq xmm0, MMWORD PTR [rsi] ; src
- movq xmm2, MMWORD PTR [rsi+rdx]
- movdqa xmm1, XMMWORD PTR [rax] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi], xmm0
- movdqa XMMWORD PTR [rdi +16], xmm2
-
- ;line 2 3
- movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
- movq xmm2, MMWORD PTR [rsi+rcx]
- movdqa xmm1, XMMWORD PTR [rax+16] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 32], xmm0
- movdqa XMMWORD PTR [rdi + 48], xmm2
-
- ;line 4 5
- lea rsi, [rsi + rdx*4]
-
- movq xmm0, MMWORD PTR [rsi] ; src
- movq xmm2, MMWORD PTR [rsi+rdx]
- movdqa xmm1, XMMWORD PTR [rax + 32] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 64], xmm0
- movdqa XMMWORD PTR [rdi + 80], xmm2
-
- ;line 6 7
- movq xmm0, MMWORD PTR [rsi+rdx*2] ; src
- movq xmm2, MMWORD PTR [rsi+rcx]
- movdqa xmm1, XMMWORD PTR [rax+ 48] ; pred
- punpcklqdq xmm0, xmm2
-
- movdqa xmm2, xmm0
- psubb xmm0, xmm1 ; subtraction with sign missed
-
- pxor xmm1, [GLOBAL(t80)] ;convert to signed values
- pxor xmm2, [GLOBAL(t80)]
- pcmpgtb xmm1, xmm2 ; obtain sign information
-
- movdqa xmm2, xmm0
- movdqa xmm3, xmm1
- punpcklbw xmm0, xmm1 ; put sign back to subtraction
- punpckhbw xmm2, xmm3 ; put sign back to subtraction
-
- movdqa XMMWORD PTR [rdi + 96], xmm0
- movdqa XMMWORD PTR [rdi + 112], xmm2
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-SECTION_RODATA
-align 16
-t80:
- times 16 db 0x80
+%include "third_party/x86inc/x86inc.asm"
+
+SECTION .text
+
+; void vp9_subtract_block(int rows, int cols,
+; int16_t *diff, ptrdiff_t diff_stride,
+; const uint8_t *src, ptrdiff_t src_stride,
+; const uint8_t *pred, ptrdiff_t pred_stride)
+
+INIT_XMM sse2
+cglobal subtract_block, 7, 7, 8, \
+ rows, cols, diff, diff_stride, src, src_stride, \
+ pred, pred_stride
+%define pred_str colsq
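+; colsq is dead once the width dispatch below has run, so it is reused for
+; pred_stride, which x86inc leaves in memory (pred_stridemp) since only
+; seven arguments are loaded into registers here.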
+ pxor m7, m7 ; dedicated zero register
+ cmp colsd, 4
+ je .case_4
+ cmp colsd, 8
+ je .case_8
+ cmp colsd, 16
+ je .case_16
+ cmp colsd, 32
+ je .case_32
+
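+; loop16 subtracts two 16-pixel groups per invocation: %1/%2 are source
+; offsets, %3/%4 prediction offsets, %5/%6 offsets into the int16 diff
+; buffer (each group widens into two mmsize-sized word stores).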
+%macro loop16 6
+ mova m0, [srcq+%1]
+ mova m4, [srcq+%2]
+ mova m1, [predq+%3]
+ mova m5, [predq+%4]
+ punpckhbw m2, m0, m7
+ punpckhbw m3, m1, m7
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ psubw m2, m3
+ psubw m0, m1
+ punpckhbw m1, m4, m7
+ punpckhbw m3, m5, m7
+ punpcklbw m4, m7
+ punpcklbw m5, m7
+ psubw m1, m3
+ psubw m4, m5
+ mova [diffq+mmsize*0+%5], m0
+ mova [diffq+mmsize*1+%5], m2
+ mova [diffq+mmsize*0+%6], m4
+ mova [diffq+mmsize*1+%6], m1
+%endmacro
+
+ mov pred_str, pred_stridemp
+.loop_64:
+ loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
+ loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize, 4*mmsize, 6*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ dec rowsd
+ jg .loop_64
+ RET
+
+.case_32:
+ mov pred_str, pred_stridemp
+.loop_32:
+ loop16 0, mmsize, 0, mmsize, 0, 2*mmsize
+ lea diffq, [diffq+diff_strideq*2]
+ add predq, pred_str
+ add srcq, src_strideq
+ dec rowsd
+ jg .loop_32
+ RET
+
+.case_16:
+ mov pred_str, pred_stridemp
+.loop_16:
+ loop16 0, src_strideq, 0, pred_str, 0, diff_strideq*2
+ lea diffq, [diffq+diff_strideq*4]
+ lea predq, [predq+pred_str*2]
+ lea srcq, [srcq+src_strideq*2]
+ sub rowsd, 2
+ jg .loop_16
+ RET
+
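+; loop_h subtracts two rows per invocation and serves the narrow cases:
+; 8-wide blocks with XMM registers, and 4-wide blocks once INIT_MMX below
+; switches to MMX registers.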
+%macro loop_h 0
+ movh m0, [srcq]
+ movh m2, [srcq+src_strideq]
+ movh m1, [predq]
+ movh m3, [predq+pred_str]
+ punpcklbw m0, m7
+ punpcklbw m1, m7
+ punpcklbw m2, m7
+ punpcklbw m3, m7
+ psubw m0, m1
+ psubw m2, m3
+ mova [diffq], m0
+ mova [diffq+diff_strideq*2], m2
+%endmacro
+
+.case_8:
+ mov pred_str, pred_stridemp
+.loop_8:
+ loop_h
+ lea diffq, [diffq+diff_strideq*4]
+ lea srcq, [srcq+src_strideq*2]
+ lea predq, [predq+pred_str*2]
+ sub rowsd, 2
+ jg .loop_8
+ RET
+
+INIT_MMX
+.case_4:
+ mov pred_str, pred_stridemp
+.loop_4:
+ loop_h
+ lea diffq, [diffq+diff_strideq*4]
+ lea srcq, [srcq+src_strideq*2]
+ lea predq, [predq+pred_str*2]
+ sub rowsd, 2
+ jg .loop_4
+ RET
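
For reference, the prototype quoted at the top of this file corresponds to
the following scalar model, a minimal sketch with an illustrative name rather
than the encoder's actual C fallback:

#include <stddef.h>
#include <stdint.h>

/* diff = src - pred, row by row, widening the unsigned pixels to signed
 * 16-bit residuals. */
static void subtract_block_ref(int rows, int cols,
                               int16_t *diff, ptrdiff_t diff_stride,
                               const uint8_t *src, ptrdiff_t src_stride,
                               const uint8_t *pred, ptrdiff_t pred_stride) {
  int r, c;
  for (r = 0; r < rows; r++) {
    for (c = 0; c < cols; c++)
      diff[c] = (int16_t)(src[c] - pred[c]);
    diff += diff_stride;
    src += src_stride;
    pred += pred_stride;
  }
}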
diff --git a/vp9/encoder/x86/vp9_variance_impl_mmx.asm b/vp9/encoder/x86/vp9_variance_impl_mmx.asm
index 9f140c96b..d3dbefed8 100644
--- a/vp9/encoder/x86/vp9_variance_impl_mmx.asm
+++ b/vp9/encoder/x86/vp9_variance_impl_mmx.asm
@@ -508,344 +508,3 @@ sym(vp9_get4x4sse_cs_mmx):
UNSHADOW_ARGS
pop rbp
ret
-
-%define mmx_filter_shift 7
-
-;void vp9_filter_block2d_bil4x4_var_mmx
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned short *HFilter,
-; unsigned short *VFilter,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp9_filter_block2d_bil4x4_var_mmx) PRIVATE
-sym(vp9_filter_block2d_bil4x4_var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 8
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
-
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
-
- mov rax, arg(4) ;HFilter ;
- mov rdx, arg(5) ;VFilter ;
-
- mov rsi, arg(0) ;ref_ptr ;
- mov rdi, arg(2) ;src_ptr ;
-
- mov rcx, 4 ;
- pxor mm0, mm0 ;
-
- movd mm1, [rsi] ;
- movd mm3, [rsi+1] ;
-
- punpcklbw mm1, mm0 ;
- pmullw mm1, [rax] ;
-
- punpcklbw mm3, mm0 ;
- pmullw mm3, [rax+8] ;
-
- paddw mm1, mm3 ;
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- movq mm5, mm1
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
- add rsi, r8
-%endif
-
-.filter_block2d_bil4x4_var_mmx_loop:
-
- movd mm1, [rsi] ;
- movd mm3, [rsi+1] ;
-
- punpcklbw mm1, mm0 ;
- pmullw mm1, [rax] ;
-
- punpcklbw mm3, mm0 ;
- pmullw mm3, [rax+8] ;
-
- paddw mm1, mm3 ;
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- movq mm3, mm5 ;
-
- movq mm5, mm1 ;
- pmullw mm3, [rdx] ;
-
- pmullw mm1, [rdx+8] ;
- paddw mm1, mm3 ;
-
-
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
- psraw mm1, mmx_filter_shift ;
-
- movd mm3, [rdi] ;
- punpcklbw mm3, mm0 ;
-
- psubw mm1, mm3 ;
- paddw mm6, mm1 ;
-
- pmaddwd mm1, mm1 ;
- paddd mm7, mm1 ;
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
- add rdi, dword ptr arg(3) ;src_pixels_per_line ;
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
- add rsi, r8
- add rdi, r9
-%endif
- sub rcx, 1 ;
- jnz .filter_block2d_bil4x4_var_mmx_loop ;
-
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rdi, arg(6) ;sum
- mov rsi, arg(7) ;sumsquared
-
- movd dword ptr [rdi], mm2 ;
- movd dword ptr [rsi], mm4 ;
-
-
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-
-;void vp9_filter_block2d_bil_var_mmx
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; unsigned short *HFilter,
-; unsigned short *VFilter,
-; int *sum,
-; unsigned int *sumsquared
-;)
-global sym(vp9_filter_block2d_bil_var_mmx) PRIVATE
-sym(vp9_filter_block2d_bil_var_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- GET_GOT rbx
- push rsi
- push rdi
- sub rsp, 16
- ; end prolog
-
- pxor mm6, mm6 ;
- pxor mm7, mm7 ;
- mov rax, arg(5) ;HFilter ;
-
- mov rdx, arg(6) ;VFilter ;
- mov rsi, arg(0) ;ref_ptr ;
-
- mov rdi, arg(2) ;src_ptr ;
- movsxd rcx, dword ptr arg(4) ;Height ;
-
- pxor mm0, mm0 ;
- movq mm1, [rsi] ;
-
- movq mm3, [rsi+1] ;
- movq mm2, mm1 ;
-
- movq mm4, mm3 ;
- punpcklbw mm1, mm0 ;
-
- punpckhbw mm2, mm0 ;
- pmullw mm1, [rax] ;
-
- pmullw mm2, [rax] ;
- punpcklbw mm3, mm0 ;
-
- punpckhbw mm4, mm0 ;
- pmullw mm3, [rax+8] ;
-
- pmullw mm4, [rax+8] ;
- paddw mm1, mm3 ;
-
- paddw mm2, mm4 ;
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- paddw mm2, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm2, mmx_filter_shift ;
- movq mm5, mm1
-
- packuswb mm5, mm2 ;
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- add rsi, r8
-%endif
-
-.filter_block2d_bil_var_mmx_loop:
-
- movq mm1, [rsi] ;
- movq mm3, [rsi+1] ;
-
- movq mm2, mm1 ;
- movq mm4, mm3 ;
-
- punpcklbw mm1, mm0 ;
- punpckhbw mm2, mm0 ;
-
- pmullw mm1, [rax] ;
- pmullw mm2, [rax] ;
-
- punpcklbw mm3, mm0 ;
- punpckhbw mm4, mm0 ;
-
- pmullw mm3, [rax+8] ;
- pmullw mm4, [rax+8] ;
-
- paddw mm1, mm3 ;
- paddw mm2, mm4 ;
-
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
- psraw mm1, mmx_filter_shift ;
-
- paddw mm2, [GLOBAL(mmx_bi_rd)] ;
- psraw mm2, mmx_filter_shift ;
-
- movq mm3, mm5 ;
- movq mm4, mm5 ;
-
- punpcklbw mm3, mm0 ;
- punpckhbw mm4, mm0 ;
-
- movq mm5, mm1 ;
- packuswb mm5, mm2 ;
-
- pmullw mm3, [rdx] ;
- pmullw mm4, [rdx] ;
-
- pmullw mm1, [rdx+8] ;
- pmullw mm2, [rdx+8] ;
-
- paddw mm1, mm3 ;
- paddw mm2, mm4 ;
-
- paddw mm1, [GLOBAL(mmx_bi_rd)] ;
- paddw mm2, [GLOBAL(mmx_bi_rd)] ;
-
- psraw mm1, mmx_filter_shift ;
- psraw mm2, mmx_filter_shift ;
-
- movq mm3, [rdi] ;
- movq mm4, mm3 ;
-
- punpcklbw mm3, mm0 ;
- punpckhbw mm4, mm0 ;
-
- psubw mm1, mm3 ;
- psubw mm2, mm4 ;
-
- paddw mm6, mm1 ;
- pmaddwd mm1, mm1 ;
-
- paddw mm6, mm2 ;
- pmaddwd mm2, mm2 ;
-
- paddd mm7, mm1 ;
- paddd mm7, mm2 ;
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line ;
- add rdi, dword ptr arg(3) ;src_pixels_per_line ;
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ;
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line ;
- add rsi, r8
- add rdi, r9
-%endif
- sub rcx, 1 ;
- jnz .filter_block2d_bil_var_mmx_loop ;
-
-
- pxor mm3, mm3 ;
- pxor mm2, mm2 ;
-
- punpcklwd mm2, mm6 ;
- punpckhwd mm3, mm6 ;
-
- paddd mm2, mm3 ;
- movq mm6, mm2 ;
-
- psrlq mm6, 32 ;
- paddd mm2, mm6 ;
-
- psrad mm2, 16 ;
- movq mm4, mm7 ;
-
- psrlq mm4, 32 ;
- paddd mm4, mm7 ;
-
- mov rdi, arg(7) ;sum
- mov rsi, arg(8) ;sumsquared
-
- movd dword ptr [rdi], mm2 ;
- movd dword ptr [rsi], mm4 ;
-
- ; begin epilog
- add rsp, 16
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-SECTION_RODATA
-;short mmx_bi_rd[4] = { 64, 64, 64, 64};
-align 16
-mmx_bi_rd:
- times 4 dw 64
diff --git a/vp9/encoder/x86/vp9_variance_impl_sse2.asm b/vp9/encoder/x86/vp9_variance_impl_sse2.asm
index 896dd185d..2c5088134 100644
--- a/vp9/encoder/x86/vp9_variance_impl_sse2.asm
+++ b/vp9/encoder/x86/vp9_variance_impl_sse2.asm
@@ -11,8 +11,6 @@
%include "vpx_ports/x86_abi_support.asm"
-%define xmm_filter_shift 7
-
;unsigned int vp9_get_mb_ss_sse2
;(
; short *src_ptr
@@ -734,28 +732,3 @@ sym(vp9_half_horiz_variance8x_h_sse2):
UNSHADOW_ARGS
pop rbp
ret
-
-
-SECTION_RODATA
-; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
-align 16
-xmm_bi_rd:
- times 8 dw 64
-align 16
-bilinear_filters_sse2:
- dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
- dw 120, 120, 120, 120, 120, 120, 120, 120, 8, 8, 8, 8, 8, 8, 8, 8
- dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
- dw 104, 104, 104, 104, 104, 104, 104, 104, 24, 24, 24, 24, 24, 24, 24, 24
- dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
- dw 88, 88, 88, 88, 88, 88, 88, 88, 40, 40, 40, 40, 40, 40, 40, 40
- dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
- dw 72, 72, 72, 72, 72, 72, 72, 72, 56, 56, 56, 56, 56, 56, 56, 56
- dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- dw 56, 56, 56, 56, 56, 56, 56, 56, 72, 72, 72, 72, 72, 72, 72, 72
- dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
- dw 40, 40, 40, 40, 40, 40, 40, 40, 88, 88, 88, 88, 88, 88, 88, 88
- dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
- dw 24, 24, 24, 24, 24, 24, 24, 24, 104, 104, 104, 104, 104, 104, 104, 104
- dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112
- dw 8, 8, 8, 8, 8, 8, 8, 8, 120, 120, 120, 120, 120, 120, 120, 120
diff --git a/vp9/encoder/x86/vp9_variance_impl_ssse3.asm b/vp9/encoder/x86/vp9_variance_impl_ssse3.asm
deleted file mode 100644
index 98a4a16f6..000000000
--- a/vp9/encoder/x86/vp9_variance_impl_ssse3.asm
+++ /dev/null
@@ -1,372 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%define xmm_filter_shift 7
-
-
-;void vp9_filter_block2d_bil_var_ssse3
-;(
-; unsigned char *ref_ptr,
-; int ref_pixels_per_line,
-; unsigned char *src_ptr,
-; int src_pixels_per_line,
-; unsigned int Height,
-; int xoffset,
-; int yoffset,
-; int *sum,
-; unsigned int *sumsquared
-;
-;)
-;Note: The filter coefficient at offset=0 is 128. Since the second register
-;for Pmaddubsw is signed bytes, we must calculate the zero offset separately.
-global sym(vp9_filter_block2d_bil_var_ssse3) PRIVATE
-sym(vp9_filter_block2d_bil_var_ssse3):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 9
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- pxor xmm6, xmm6
- pxor xmm7, xmm7
-
- lea rcx, [GLOBAL(bilinear_filters_ssse3)]
- movsxd rax, dword ptr arg(5) ; xoffset
-
- cmp rax, 0 ; skip first_pass filter if xoffset=0
- je .filter_block2d_bil_var_ssse3_sp_only
-
- shl rax, 4 ; point to filter coeff with xoffset
- lea rax, [rax + rcx] ; HFilter
-
- movsxd rdx, dword ptr arg(6) ; yoffset
-
- cmp rdx, 0 ; skip second_pass filter if yoffset=0
- je .filter_block2d_bil_var_ssse3_fp_only
-
- shl rdx, 4
- lea rdx, [rdx + rcx] ; VFilter
-
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
-
- movdqu xmm0, XMMWORD PTR [rsi]
- movdqu xmm1, XMMWORD PTR [rsi+1]
- movdqa xmm2, xmm0
-
- punpcklbw xmm0, xmm1
- punpckhbw xmm2, xmm1
- pmaddubsw xmm0, [rax]
- pmaddubsw xmm2, [rax]
-
- paddw xmm0, [GLOBAL(xmm_bi_rd)]
- paddw xmm2, [GLOBAL(xmm_bi_rd)]
- psraw xmm0, xmm_filter_shift
- psraw xmm2, xmm_filter_shift
-
- packuswb xmm0, xmm2
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line
-%else
- movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
- lea rsi, [rsi + r8]
-%endif
-
-.filter_block2d_bil_var_ssse3_loop:
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqu xmm2, XMMWORD PTR [rsi+1]
- movdqa xmm3, xmm1
-
- punpcklbw xmm1, xmm2
- punpckhbw xmm3, xmm2
- pmaddubsw xmm1, [rax]
- pmaddubsw xmm3, [rax]
-
- paddw xmm1, [GLOBAL(xmm_bi_rd)]
- paddw xmm3, [GLOBAL(xmm_bi_rd)]
- psraw xmm1, xmm_filter_shift
- psraw xmm3, xmm_filter_shift
- packuswb xmm1, xmm3
-
- movdqa xmm2, xmm0
- movdqa xmm0, xmm1
- movdqa xmm3, xmm2
-
- punpcklbw xmm2, xmm1
- punpckhbw xmm3, xmm1
- pmaddubsw xmm2, [rdx]
- pmaddubsw xmm3, [rdx]
-
- paddw xmm2, [GLOBAL(xmm_bi_rd)]
- paddw xmm3, [GLOBAL(xmm_bi_rd)]
- psraw xmm2, xmm_filter_shift
- psraw xmm3, xmm_filter_shift
-
- movq xmm1, QWORD PTR [rdi]
- pxor xmm4, xmm4
- punpcklbw xmm1, xmm4
- movq xmm5, QWORD PTR [rdi+8]
- punpcklbw xmm5, xmm4
-
- psubw xmm2, xmm1
- psubw xmm3, xmm5
- paddw xmm6, xmm2
- paddw xmm6, xmm3
- pmaddwd xmm2, xmm2
- pmaddwd xmm3, xmm3
- paddd xmm7, xmm2
- paddd xmm7, xmm3
-
-%if ABI_IS_32BIT
- add rsi, dword ptr arg(1) ;ref_pixels_per_line
- add rdi, dword ptr arg(3) ;src_pixels_per_line
-%else
- lea rsi, [rsi + r8]
- lea rdi, [rdi + r9]
-%endif
-
- sub rcx, 1
- jnz .filter_block2d_bil_var_ssse3_loop
-
- jmp .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_sp_only:
- movsxd rdx, dword ptr arg(6) ; yoffset
-
- cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
- je .filter_block2d_bil_var_ssse3_full_pixel
-
- shl rdx, 4
- lea rdx, [rdx + rcx] ; VFilter
-
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
-
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqa xmm0, xmm1
-
-%if ABI_IS_32BIT=0
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
- lea rsi, [rsi + rax]
-
-.filter_block2d_bil_sp_only_loop:
- movdqu xmm3, XMMWORD PTR [rsi]
- movdqa xmm2, xmm1
- movdqa xmm0, xmm3
-
- punpcklbw xmm1, xmm3
- punpckhbw xmm2, xmm3
- pmaddubsw xmm1, [rdx]
- pmaddubsw xmm2, [rdx]
-
- paddw xmm1, [GLOBAL(xmm_bi_rd)]
- paddw xmm2, [GLOBAL(xmm_bi_rd)]
- psraw xmm1, xmm_filter_shift
- psraw xmm2, xmm_filter_shift
-
- movq xmm3, QWORD PTR [rdi]
- pxor xmm4, xmm4
- punpcklbw xmm3, xmm4
- movq xmm5, QWORD PTR [rdi+8]
- punpcklbw xmm5, xmm4
-
- psubw xmm1, xmm3
- psubw xmm2, xmm5
- paddw xmm6, xmm1
- paddw xmm6, xmm2
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm7, xmm1
- paddd xmm7, xmm2
-
- movdqa xmm1, xmm0
- lea rsi, [rsi + rax] ;ref_pixels_per_line
-
-%if ABI_IS_32BIT
- add rdi, dword ptr arg(3) ;src_pixels_per_line
-%else
- lea rdi, [rdi + r9]
-%endif
-
- sub rcx, 1
- jnz .filter_block2d_bil_sp_only_loop
-
- jmp .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_full_pixel:
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
- movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
- pxor xmm0, xmm0
-
-.filter_block2d_bil_full_pixel_loop:
- movq xmm1, QWORD PTR [rsi]
- punpcklbw xmm1, xmm0
- movq xmm2, QWORD PTR [rsi+8]
- punpcklbw xmm2, xmm0
-
- movq xmm3, QWORD PTR [rdi]
- punpcklbw xmm3, xmm0
- movq xmm4, QWORD PTR [rdi+8]
- punpcklbw xmm4, xmm0
-
- psubw xmm1, xmm3
- psubw xmm2, xmm4
- paddw xmm6, xmm1
- paddw xmm6, xmm2
- pmaddwd xmm1, xmm1
- pmaddwd xmm2, xmm2
- paddd xmm7, xmm1
- paddd xmm7, xmm2
-
- lea rsi, [rsi + rax] ;ref_pixels_per_line
- lea rdi, [rdi + rdx] ;src_pixels_per_line
- sub rcx, 1
- jnz .filter_block2d_bil_full_pixel_loop
-
- jmp .filter_block2d_bil_variance
-
-.filter_block2d_bil_var_ssse3_fp_only:
- mov rsi, arg(0) ;ref_ptr
- mov rdi, arg(2) ;src_ptr
- movsxd rcx, dword ptr arg(4) ;Height
- movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
-
- pxor xmm0, xmm0
-
-%if ABI_IS_32BIT=0
- movsxd r9, dword ptr arg(3) ;src_pixels_per_line
-%endif
-
-.filter_block2d_bil_fp_only_loop:
- movdqu xmm1, XMMWORD PTR [rsi]
- movdqu xmm2, XMMWORD PTR [rsi+1]
- movdqa xmm3, xmm1
-
- punpcklbw xmm1, xmm2
- punpckhbw xmm3, xmm2
- pmaddubsw xmm1, [rax]
- pmaddubsw xmm3, [rax]
-
- paddw xmm1, [GLOBAL(xmm_bi_rd)]
- paddw xmm3, [GLOBAL(xmm_bi_rd)]
- psraw xmm1, xmm_filter_shift
- psraw xmm3, xmm_filter_shift
-
- movq xmm2, XMMWORD PTR [rdi]
- pxor xmm4, xmm4
- punpcklbw xmm2, xmm4
- movq xmm5, QWORD PTR [rdi+8]
- punpcklbw xmm5, xmm4
-
- psubw xmm1, xmm2
- psubw xmm3, xmm5
- paddw xmm6, xmm1
- paddw xmm6, xmm3
- pmaddwd xmm1, xmm1
- pmaddwd xmm3, xmm3
- paddd xmm7, xmm1
- paddd xmm7, xmm3
-
- lea rsi, [rsi + rdx]
-%if ABI_IS_32BIT
- add rdi, dword ptr arg(3) ;src_pixels_per_line
-%else
- lea rdi, [rdi + r9]
-%endif
-
- sub rcx, 1
- jnz .filter_block2d_bil_fp_only_loop
-
- jmp .filter_block2d_bil_variance
-
-.filter_block2d_bil_variance:
- pxor xmm0, xmm0
- pxor xmm1, xmm1
- pxor xmm5, xmm5
-
- punpcklwd xmm0, xmm6
- punpckhwd xmm1, xmm6
- psrad xmm0, 16
- psrad xmm1, 16
- paddd xmm0, xmm1
- movdqa xmm1, xmm0
-
- movdqa xmm6, xmm7
- punpckldq xmm6, xmm5
- punpckhdq xmm7, xmm5
- paddd xmm6, xmm7
-
- punpckldq xmm0, xmm5
- punpckhdq xmm1, xmm5
- paddd xmm0, xmm1
-
- movdqa xmm7, xmm6
- movdqa xmm1, xmm0
-
- psrldq xmm7, 8
- psrldq xmm1, 8
-
- paddd xmm6, xmm7
- paddd xmm0, xmm1
-
- mov rsi, arg(7) ;[Sum]
- mov rdi, arg(8) ;[SSE]
-
- movd [rsi], xmm0
- movd [rdi], xmm6
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-SECTION_RODATA
-align 16
-xmm_bi_rd:
- times 8 dw 64
-align 16
-bilinear_filters_ssse3:
- times 8 db 128, 0
- times 8 db 120, 8
- times 8 db 112, 16
- times 8 db 104, 24
- times 8 db 96, 32
- times 8 db 88, 40
- times 8 db 80, 48
- times 8 db 72, 56
- times 8 db 64, 64
- times 8 db 56, 72
- times 8 db 48, 80
- times 8 db 40, 88
- times 8 db 32, 96
- times 8 db 24, 104
- times 8 db 16, 112
- times 8 db 8, 120
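
The SSSE3 table stored the same 7-bit taps as interleaved signed bytes so a
single pmaddubsw could apply both taps to an interleaved (pixel, pixel+1)
pair; offset 0 is the exception called out in the note above, since its tap
of 128 does not fit in a signed byte. A minimal intrinsics sketch of one
horizontal pass, with illustrative names:

#include <tmmintrin.h> /* SSSE3 */

/* taps: one "times 8 db f0, f1" row from the table above;
 * round64: eight words of 64 (xmm_bi_rd). */
static __m128i bilin_h_ssse3(__m128i px, __m128i px1,
                             __m128i taps, __m128i round64) {
  __m128i pairs = _mm_unpacklo_epi8(px, px1);   /* a0,b0,a1,b1,... */
  __m128i acc = _mm_maddubs_epi16(pairs, taps); /* a*f0 + b*f1 per word */
  return _mm_srai_epi16(_mm_add_epi16(acc, round64), 7);
}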
diff --git a/vp9/encoder/x86/vp9_variance_mmx.c b/vp9/encoder/x86/vp9_variance_mmx.c
index bad1cfa74..d1415606e 100644
--- a/vp9/encoder/x86/vp9_variance_mmx.c
+++ b/vp9/encoder/x86/vp9_variance_mmx.c
@@ -13,27 +13,6 @@
#include "vp9/common/vp9_pragmas.h"
#include "vpx_ports/mem.h"
-extern void filter_block1d_h6_mmx
-(
- const unsigned char *src_ptr,
- unsigned short *output_ptr,
- unsigned int src_pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- short *vp7_filter
-);
-extern void filter_block1d_v6_mmx
-(
- const short *src_ptr,
- unsigned char *output_ptr,
- unsigned int pixels_per_line,
- unsigned int pixel_step,
- unsigned int output_height,
- unsigned int output_width,
- short *vp7_filter
-);
-
extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr);
extern unsigned int vp9_get8x8var_mmx
(
@@ -53,30 +32,6 @@ extern unsigned int vp9_get4x4var_mmx
unsigned int *SSE,
int *Sum
);
-extern void vp9_filter_block2d_bil4x4_var_mmx
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- const short *HFilter,
- const short *VFilter,
- int *sum,
- unsigned int *sumsquared
-);
-extern void vp9_filter_block2d_bil_var_mmx
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- const short *HFilter,
- const short *VFilter,
- int *sum,
- unsigned int *sumsquared
-);
-
unsigned int vp9_variance4x4_mmx(
const unsigned char *src_ptr,
@@ -190,193 +145,3 @@ unsigned int vp9_variance8x16_mmx(
return (var - (((unsigned int)avg * avg) >> 7));
}
-
-DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]);
-
-unsigned int vp9_sub_pixel_variance4x4_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse)
-
-{
- int xsum;
- unsigned int xxsum;
- vp9_filter_block2d_bil4x4_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum, &xxsum
- );
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 4));
-}
-
-
-unsigned int vp9_sub_pixel_variance8x8_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
-
- int xsum;
- unsigned int xxsum;
- vp9_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum, &xxsum
- );
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 6));
-}
-
-unsigned int vp9_sub_pixel_variance16x16_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
-
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
- vp9_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum0, &xxsum0
- );
-
- vp9_filter_block2d_bil_var_mmx(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum1, &xxsum1
- );
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-
-
-}
-
-unsigned int vp9_sub_pixel_mse16x16_mmx(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- vp9_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse);
- return *sse;
-}
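
Note: the MSE wrapper above runs the full sub-pel variance computation and returns the SSE side output unchanged; for an N-pixel block, N*MSE is exactly the SSE, so no mean correction applies. A trivial sketch of that relationship:

    /* N*MSE = SSE, while N*variance = SSE - sum^2/N; the mse
     * wrapper above therefore only needs the *sse output. */
    static unsigned int n_times_mse(unsigned int sse) {
      return sse;  /* no mean correction */
    }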
-
-unsigned int vp9_sub_pixel_variance16x8_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
-
- vp9_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum0, &xxsum0
- );
-
-
- vp9_filter_block2d_bil_var_mmx(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 8,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum1, &xxsum1
- );
-
- xsum0 += xsum1;
- xxsum0 += xxsum1;
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
-
-unsigned int vp9_sub_pixel_variance8x16_mmx
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- int xsum;
- unsigned int xxsum;
- vp9_filter_block2d_bil_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum, &xxsum
- );
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 7));
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_h_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 0,
- ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_v_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 8,
- ref_ptr, recon_stride, sse);
-}
-
-
-unsigned int vp9_variance_halfpixvar16x16_hv_mmx(
- const unsigned char *src_ptr,
- int source_stride,
- const unsigned char *ref_ptr,
- int recon_stride,
- unsigned int *sse) {
- return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 8,
- ref_ptr, recon_stride, sse);
-}
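
Note on the three halfpixvar wrappers above: they pin (xoffset, yoffset) to the half-pel index 8, where the bilinear taps are (64, 64), so the interpolation degenerates to a rounded two-pixel average. A one-line sketch of that reduction, for illustration:

    /* At the half-pel offset (table index 8) the taps are (64, 64):
     * (64*p0 + 64*p1 + 64) >> 7 == (p0 + p1 + 1) >> 1. */
    static unsigned char half_pel_sample(unsigned char p0, unsigned char p1) {
      return (unsigned char)((p0 + p1 + 1) >> 1);
    }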
diff --git a/vp9/encoder/x86/vp9_variance_sse2.c b/vp9/encoder/x86/vp9_variance_sse2.c
index 67ca9257c..b4ff8509c 100644
--- a/vp9/encoder/x86/vp9_variance_sse2.c
+++ b/vp9/encoder/x86/vp9_variance_sse2.c
@@ -9,29 +9,11 @@
*/
#include "vpx_config.h"
+
#include "vp9/encoder/vp9_variance.h"
#include "vp9/common/vp9_pragmas.h"
#include "vpx_ports/mem.h"
-#define HALFNDX 8
-
-extern void filter_block1d_h6_mmx(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d_v6_mmx(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_h6_sse2(const unsigned char *src_ptr, unsigned short *output_ptr, unsigned int src_pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-extern void filter_block1d8_v6_sse2(const short *src_ptr, unsigned char *output_ptr, unsigned int pixels_per_line, unsigned int pixel_step, unsigned int output_height, unsigned int output_width, short *vp7_filter);
-
-extern void vp9_filter_block2d_bil4x4_var_mmx
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- const short *HFilter,
- const short *VFilter,
- int *sum,
- unsigned int *sumsquared
-);
-
extern unsigned int vp9_get4x4var_mmx
(
const unsigned char *src_ptr,
@@ -64,18 +46,6 @@ unsigned int vp9_get8x8var_sse2
unsigned int *SSE,
int *Sum
);
-void vp9_filter_block2d_bil_var_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int xoffset,
- int yoffset,
- int *sum,
- unsigned int *sumsquared
-);
void vp9_half_horiz_vert_variance8x_h_sse2
(
const unsigned char *ref_ptr,
@@ -137,8 +107,6 @@ void vp9_half_vert_variance16x_h_sse2
unsigned int *sumsquared
);
-DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]);
-
typedef unsigned int (*get_var_sse2) (
const unsigned char *src_ptr,
int source_stride,
@@ -375,347 +343,162 @@ unsigned int vp9_variance32x64_sse2(const uint8_t *src_ptr,
return (var - (((int64_t)avg * avg) >> 11));
}
-unsigned int vp9_sub_pixel_variance4x4_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- int xsum;
- unsigned int xxsum;
- vp9_filter_block2d_bil4x4_var_mmx(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line,
- vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset],
- &xsum, &xxsum
- );
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 4));
+#define DECL(w, opt) \
+int vp9_sub_pixel_variance##w##xh_##opt(const uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int x_offset, int y_offset, \
+ const uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ int height, unsigned int *sse)
+#define DECLS(opt1, opt2) \
+DECL(4, opt2); \
+DECL(8, opt1); \
+DECL(16, opt1)
+
+DECLS(sse2, sse);
+DECLS(ssse3, ssse3);
+#undef DECLS
+#undef DECL
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+unsigned int vp9_sub_pixel_variance##w##x##h##_##opt(const uint8_t *src, \
+ int src_stride, \
+ int x_offset, \
+ int y_offset, \
+ const uint8_t *dst, \
+ int dst_stride, \
+ unsigned int *sse_ptr) { \
+ unsigned int sse; \
+ int se = vp9_sub_pixel_variance##wf##xh_##opt(src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, \
+ h, &sse); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 16, src_stride, \
+ x_offset, y_offset, \
+ dst + 16, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vp9_sub_pixel_variance##wf##xh_##opt(src + 48, src_stride, \
+ x_offset, y_offset, \
+ dst + 48, dst_stride, \
+ h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sse_ptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
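
Note: the FN macro above builds every w x h variance from a wf-wide column kernel of arbitrary height, re-running it at +16, +32 and +48 bytes for wider blocks and summing the partial (se, sse) pairs; the cast parameter widens se*se to 64 bits where it could overflow. A hand expansion of FN(32, 32, 16, 5, 5, sse2, (int64_t)), renamed here to make clear it is an illustration rather than the generated symbol:

    #include <stddef.h>
    #include <stdint.h>

    /* Column kernel from vp9_subpel_variance.asm (declared by the
     * DECLS block above). */
    int vp9_sub_pixel_variance16xh_sse2(const uint8_t *src,
                                        ptrdiff_t src_stride,
                                        int x_offset, int y_offset,
                                        const uint8_t *dst,
                                        ptrdiff_t dst_stride,
                                        int height, unsigned int *sse);

    /* Two 16-wide column passes stitched into one 32x32 variance. */
    unsigned int sub_pixel_variance32x32_sse2_expanded(
        const uint8_t *src, int src_stride, int x_offset, int y_offset,
        const uint8_t *dst, int dst_stride, unsigned int *sse_ptr) {
      unsigned int sse, sse2;
      int se = vp9_sub_pixel_variance16xh_sse2(src, src_stride, x_offset,
                                               y_offset, dst, dst_stride,
                                               32, &sse);
      int se2 = vp9_sub_pixel_variance16xh_sse2(src + 16, src_stride,
                                                x_offset, y_offset,
                                                dst + 16, dst_stride,
                                                32, &sse2);
      se += se2;
      sse += sse2;
      *sse_ptr = sse;
      /* shift = wlog2 + hlog2 = 5 + 5; 64-bit product avoids overflow */
      return sse - (unsigned int)(((int64_t)se * se) >> 10);
    }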
-
-unsigned int vp9_sub_pixel_variance8x8_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- int xsum;
- unsigned int xxsum;
-
- if (xoffset == HALFNDX && yoffset == 0) {
- vp9_half_horiz_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum, &xxsum);
- } else if (xoffset == 0 && yoffset == HALFNDX) {
- vp9_half_vert_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum, &xxsum);
- } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
- vp9_half_horiz_vert_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum, &xxsum);
- } else {
- vp9_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- xoffset, yoffset,
- &xsum, &xxsum);
- }
-
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 6));
-}
-
-static void sub_pixel_variance16x16_sse2(const uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const uint8_t *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse, int *avg) {
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
- // note we could avoid these if statements if the calling function
- // just called the appropriate functions inside.
- if (xoffset == HALFNDX && yoffset == 0) {
- vp9_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- } else if (xoffset == 0 && yoffset == HALFNDX) {
- vp9_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
- vp9_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- } else {
- vp9_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum0, &xxsum0
- );
-
- vp9_filter_block2d_bil_var_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum1, &xxsum1
- );
- xsum0 += xsum1;
- xxsum0 += xxsum1;
- }
-
- *sse = xxsum0;
- *avg = xsum0;
-}
-
-unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const uint8_t *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse_ptr) {
- int avg;
- unsigned int sse;
-
- sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
- yoffset, dst_ptr, dst_pixels_per_line,
- &sse, &avg);
- *sse_ptr = sse;
-
- return (sse - (((unsigned int) avg * avg) >> 8));
-}
-
-unsigned int vp9_sub_pixel_variance32x32_sse2(const uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const uint8_t *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse_ptr) {
- int avg0, avg1, avg2, avg3;
- unsigned int sse0, sse1, sse2, sse3;
-
- sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
- yoffset, dst_ptr, dst_pixels_per_line,
- &sse0, &avg0);
- sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 16, dst_pixels_per_line,
- &sse1, &avg1);
- src_ptr += 16 * src_pixels_per_line;
- dst_ptr += 16 * dst_pixels_per_line;
- sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
- yoffset, dst_ptr, dst_pixels_per_line,
- &sse2, &avg2);
- sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 16, dst_pixels_per_line,
- &sse3, &avg3);
- sse0 += sse1 + sse2 + sse3;
- avg0 += avg1 + avg2 + avg3;
- *sse_ptr = sse0;
-
- return (sse0 - (((unsigned int) avg0 * avg0) >> 10));
-}
-
-unsigned int vp9_sub_pixel_variance64x64_sse2(const uint8_t *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const uint8_t *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse_ptr) {
- int avg0, avg1, avg2, avg3, avg4;
- unsigned int sse0, sse1, sse2, sse3, sse4;
-
- sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
- yoffset, dst_ptr, dst_pixels_per_line,
- &sse0, &avg0);
- sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 16, dst_pixels_per_line,
- &sse1, &avg1);
- sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 32, dst_pixels_per_line,
- &sse2, &avg2);
- sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 48, dst_pixels_per_line,
- &sse3, &avg3);
- src_ptr += 16 * src_pixels_per_line;
- dst_ptr += 16 * dst_pixels_per_line;
- avg0 += avg1 + avg2 + avg3;
- sse0 += sse1 + sse2 + sse3;
- sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
- yoffset, dst_ptr, dst_pixels_per_line,
- &sse1, &avg1);
- sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 16, dst_pixels_per_line,
- &sse2, &avg2);
- sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 32, dst_pixels_per_line,
- &sse3, &avg3);
- sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 48, dst_pixels_per_line,
- &sse4, &avg4);
- src_ptr += 16 * src_pixels_per_line;
- dst_ptr += 16 * dst_pixels_per_line;
- avg0 += avg1 + avg2 + avg3 + avg4;
- sse0 += sse1 + sse2 + sse3 + sse4;
- sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
- yoffset, dst_ptr, dst_pixels_per_line,
- &sse1, &avg1);
- sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 16, dst_pixels_per_line,
- &sse2, &avg2);
- sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 32, dst_pixels_per_line,
- &sse3, &avg3);
- sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 48, dst_pixels_per_line,
- &sse4, &avg4);
- src_ptr += 16 * src_pixels_per_line;
- dst_ptr += 16 * dst_pixels_per_line;
- avg0 += avg1 + avg2 + avg3 + avg4;
- sse0 += sse1 + sse2 + sse3 + sse4;
- sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
- yoffset, dst_ptr, dst_pixels_per_line,
- &sse1, &avg1);
- sub_pixel_variance16x16_sse2(src_ptr + 16, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 16, dst_pixels_per_line,
- &sse2, &avg2);
- sub_pixel_variance16x16_sse2(src_ptr + 32, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 32, dst_pixels_per_line,
- &sse3, &avg3);
- sub_pixel_variance16x16_sse2(src_ptr + 48, src_pixels_per_line, xoffset,
- yoffset, dst_ptr + 48, dst_pixels_per_line,
- &sse4, &avg4);
- avg0 += avg1 + avg2 + avg3 + avg4;
- sse0 += sse1 + sse2 + sse3 + sse4;
- *sse_ptr = sse0;
-
- return (sse0 - (((unsigned int) avg0 * avg0) >> 12));
-}
-
-unsigned int vp9_sub_pixel_mse16x16_sse2(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- vp9_sub_pixel_variance16x16_sse2(src_ptr, src_pixels_per_line, xoffset,
- yoffset, dst_ptr, dst_pixels_per_line, sse);
- return *sse;
-}
-
-unsigned int vp9_sub_pixel_variance16x8_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-
-) {
- int xsum0, xsum1;
- unsigned int xxsum0, xxsum1;
-
- if (xoffset == HALFNDX && yoffset == 0) {
- vp9_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- } else if (xoffset == 0 && yoffset == HALFNDX) {
- vp9_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
- vp9_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- } else {
- vp9_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- xoffset, yoffset,
- &xsum0, &xxsum0);
-
- vp9_filter_block2d_bil_var_sse2(
- src_ptr + 8, src_pixels_per_line,
- dst_ptr + 8, dst_pixels_per_line, 8,
- xoffset, yoffset,
- &xsum1, &xxsum1);
- xsum0 += xsum1;
- xxsum0 += xxsum1;
- }
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
-
-unsigned int vp9_sub_pixel_variance8x16_wmt
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- int xsum;
- unsigned int xxsum;
-
- if (xoffset == HALFNDX && yoffset == 0) {
- vp9_half_horiz_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum, &xxsum);
- } else if (xoffset == 0 && yoffset == HALFNDX) {
- vp9_half_vert_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum, &xxsum);
- } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
- vp9_half_horiz_vert_variance8x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum, &xxsum);
- } else {
- vp9_filter_block2d_bil_var_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum, &xxsum);
- }
-
- *sse = xxsum;
- return (xxsum - (((unsigned int)xsum * xsum) >> 7));
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
+FN(16, 8, 16, 4, 3, opt1,); \
+FN(8, 16, 8, 3, 4, opt1,); \
+FN(8, 8, 8, 3, 3, opt1,); \
+FN(8, 4, 8, 3, 2, opt1,); \
+FN(4, 8, 4, 2, 3, opt2,); \
+FN(4, 4, 4, 2, 2, opt2,)
+
+FNS(sse2, sse);
+FNS(ssse3, ssse3);
+
+#undef FNS
+#undef FN
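
Note: DECLS/FNS are instantiated once per ISA level, and the second argument exists only for the 4-wide kernel, which gets its own variant (plain sse in the sse2 build) while the 8- and 16-wide kernels share opt1. A sketch of the call shape of one generated function; real callers reach it through the vp9_rtcd dispatch table, and the offsets are indices into the 16-entry bilinear table, not pixel counts:

    #include <stdint.h>

    unsigned int vp9_sub_pixel_variance16x16_sse2(const uint8_t *src,
                                                  int src_stride,
                                                  int x_offset, int y_offset,
                                                  const uint8_t *dst,
                                                  int dst_stride,
                                                  unsigned int *sse_ptr);

    /* Illustration only: direct call with sub-pel offsets (2, 5). */
    static unsigned int example_call(const uint8_t *src, int src_stride,
                                     const uint8_t *dst, int dst_stride) {
      unsigned int sse;
      return vp9_sub_pixel_variance16x16_sse2(src, src_stride, 2, 5,
                                              dst, dst_stride, &sse);
    }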
+
+#define DECL(w, opt) \
+int vp9_sub_pixel_avg_variance##w##xh_##opt(const uint8_t *src, \
+ ptrdiff_t src_stride, \
+ int x_offset, int y_offset, \
+ const uint8_t *dst, \
+ ptrdiff_t dst_stride, \
+ const uint8_t *sec, \
+ ptrdiff_t sec_stride, \
+ int height, unsigned int *sse)
+#define DECLS(opt1, opt2) \
+DECL(4, opt2); \
+DECL(8, opt1); \
+DECL(16, opt1)
+
+DECLS(sse2, sse);
+DECLS(ssse3, ssse3);
+#undef DECL
+#undef DECLS
+
+#define FN(w, h, wf, wlog2, hlog2, opt, cast) \
+unsigned int vp9_sub_pixel_avg_variance##w##x##h##_##opt(const uint8_t *src, \
+ int src_stride, \
+ int x_offset, \
+ int y_offset, \
+ const uint8_t *dst, \
+ int dst_stride, \
+ unsigned int *sseptr, \
+ const uint8_t *sec) { \
+ unsigned int sse; \
+ int se = vp9_sub_pixel_avg_variance##wf##xh_##opt(src, src_stride, x_offset, \
+ y_offset, dst, dst_stride, \
+ sec, w, h, &sse); \
+ if (w > wf) { \
+ unsigned int sse2; \
+ int se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 16, src_stride, \
+ x_offset, y_offset, \
+ dst + 16, dst_stride, \
+ sec + 16, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ if (w > wf * 2) { \
+ se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 32, src_stride, \
+ x_offset, y_offset, \
+ dst + 32, dst_stride, \
+ sec + 32, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ se2 = vp9_sub_pixel_avg_variance##wf##xh_##opt(src + 48, src_stride, \
+ x_offset, y_offset, \
+ dst + 48, dst_stride, \
+ sec + 48, w, h, &sse2); \
+ se += se2; \
+ sse += sse2; \
+ } \
+ } \
+ *sseptr = sse; \
+ return sse - ((cast se * se) >> (wlog2 + hlog2)); \
}
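
Note: the avg variant above mirrors FN but threads a second predictor sec through to the kernel, with its stride fixed at w (the second prediction is stored contiguously). The kernel averages the filtered prediction with sec before differencing, as compound prediction requires; the rounded average below is an assumption about the asm kernel, stated here rather than taken from the patch:

    /* Assumed per-pixel step inside the ...avg_variance kernels:
     * comp-avg the filtered prediction with the second predictor,
     * then difference against the source. */
    static int avg_diff(unsigned char filtered, unsigned char sec,
                        unsigned char src) {
      unsigned char pred = (unsigned char)((filtered + sec + 1) >> 1);
      return (int)src - (int)pred;
    }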
+#define FNS(opt1, opt2) \
+FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
+FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
+FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
+FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
+FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
+FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
+FN(16, 16, 16, 4, 4, opt1, (unsigned int)); \
+FN(16, 8, 16, 4, 3, opt1,); \
+FN(8, 16, 8, 3, 4, opt1,); \
+FN(8, 8, 8, 3, 3, opt1,); \
+FN(8, 4, 8, 3, 2, opt1,); \
+FN(4, 8, 4, 2, 3, opt2,); \
+FN(4, 4, 4, 2, 2, opt2,)
+
+FNS(sse2, sse);
+FNS(ssse3, ssse3);
+
+#undef FNS
+#undef FN
unsigned int vp9_variance_halfpixvar16x16_h_wmt(
const unsigned char *src_ptr,
diff --git a/vp9/encoder/x86/vp9_variance_ssse3.c b/vp9/encoder/x86/vp9_variance_ssse3.c
deleted file mode 100644
index 882acad78..000000000
--- a/vp9/encoder/x86/vp9_variance_ssse3.c
+++ /dev/null
@@ -1,142 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "vpx_config.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vp9/common/vp9_pragmas.h"
-#include "vpx_ports/mem.h"
-
-#define HALFNDX 8
-
-extern void vp9_half_horiz_vert_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-extern void vp9_half_horiz_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-extern void vp9_half_vert_variance16x_h_sse2
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int *sum,
- unsigned int *sumsquared
-);
-extern void vp9_filter_block2d_bil_var_ssse3
-(
- const unsigned char *ref_ptr,
- int ref_pixels_per_line,
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- unsigned int Height,
- int xoffset,
- int yoffset,
- int *sum,
- unsigned int *sumsquared
-);
-
-unsigned int vp9_sub_pixel_variance16x16_ssse3
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-) {
- int xsum0;
- unsigned int xxsum0;
-
- // note we could avoid these if statements if the calling function
- // just called the appropriate functions inside.
- if (xoffset == HALFNDX && yoffset == 0) {
- vp9_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- } else if (xoffset == 0 && yoffset == HALFNDX) {
- vp9_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
- vp9_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- &xsum0, &xxsum0);
- } else {
- vp9_filter_block2d_bil_var_ssse3(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 16,
- xoffset, yoffset,
- &xsum0, &xxsum0);
- }
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8));
-}
-
-unsigned int vp9_sub_pixel_variance16x8_ssse3
-(
- const unsigned char *src_ptr,
- int src_pixels_per_line,
- int xoffset,
- int yoffset,
- const unsigned char *dst_ptr,
- int dst_pixels_per_line,
- unsigned int *sse
-
-) {
- int xsum0;
- unsigned int xxsum0;
-
- if (xoffset == HALFNDX && yoffset == 0) {
- vp9_half_horiz_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- } else if (xoffset == 0 && yoffset == HALFNDX) {
- vp9_half_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- } else if (xoffset == HALFNDX && yoffset == HALFNDX) {
- vp9_half_horiz_vert_variance16x_h_sse2(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- &xsum0, &xxsum0);
- } else {
- vp9_filter_block2d_bil_var_ssse3(
- src_ptr, src_pixels_per_line,
- dst_ptr, dst_pixels_per_line, 8,
- xoffset, yoffset,
- &xsum0, &xxsum0);
- }
-
- *sse = xxsum0;
- return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7));
-}
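
Note: the deleted file above existed only to special-case the three half-pel offset combinations before falling back to the generic bilinear kernel; the new subpel_variance.asm kernels take every (xoffset, yoffset) pair through one code path, so the whole dispatch layer can go. Its branching pattern, reduced to a predicate for illustration:

    #define HALFNDX 8
    /* The removed dispatchers branched to dedicated half-pel kernels
     * whenever this held, and to the generic kernel otherwise. */
    static int is_half_pel_special_case(int xoffset, int yoffset) {
      return (xoffset == HALFNDX && yoffset == 0) ||
             (xoffset == 0 && yoffset == HALFNDX) ||
             (xoffset == HALFNDX && yoffset == HALFNDX);
    }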
diff --git a/vp9/encoder/x86/vp9_x86_csystemdependent.c b/vp9/encoder/x86/vp9_x86_csystemdependent.c
deleted file mode 100644
index 6016e14eb..000000000
--- a/vp9/encoder/x86/vp9_x86_csystemdependent.c
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "./vpx_config.h"
-#include "vpx_ports/x86.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/x86/vp9_dct_mmx.h"
-
-// TODO(jimbankoski) Consider rewriting the c to take the same values rather
-// than going through these pointer conversions
-#if 0 && HAVE_MMX
-void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) {
- vp9_short_fdct4x4_mmx(input, output, pitch);
- vp9_short_fdct4x4_mmx(input + 4, output + 16, pitch);
-}
-
-void vp9_subtract_b_mmx_impl(unsigned char *z, int src_stride,
- short *diff, unsigned char *predictor,
- int pitch);
-void vp9_subtract_b_mmx(BLOCK *be, BLOCKD *bd, int pitch) {
- unsigned char *z = *(be->base_src) + be->src;
- unsigned int src_stride = be->src_stride;
- short *diff = &be->src_diff[0];
- unsigned char *predictor = *(bd->base_dst) + bd->dst;
- // TODO(jingning): The prototype function in c has been changed. Need to
- // modify the mmx and sse versions.
- vp9_subtract_b_mmx_impl(z, src_stride, diff, predictor, pitch);
-}
-
-#endif
-
-#if 0 && HAVE_SSE2
-void vp9_subtract_b_sse2_impl(unsigned char *z, int src_stride,
- short *diff, unsigned char *predictor,
- int pitch);
-void vp9_subtract_b_sse2(BLOCK *be, BLOCKD *bd, int pitch) {
- unsigned char *z = *(be->base_src) + be->src;
- unsigned int src_stride = be->src_stride;
- short *diff = &be->src_diff[0];
- unsigned char *predictor = *(bd->base_dst) + bd->dst;
- // TODO(jingning): The prototype function in c has been changed. Need to
- // modify the mmx and sse versions.
- vp9_subtract_b_sse2_impl(z, src_stride, diff, predictor, pitch);
-}
-
-#endif
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 4bed6c0d7..a1e93753d 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -73,27 +73,24 @@ VP9_CX_SRCS-yes += encoder/vp9_mbgraph.h
VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_mcomp_x86.h
-VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_x86_csystemdependent.c
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_mmx.c
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_variance_impl_mmx.asm
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_sad_mmx.asm
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.asm
VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_dct_mmx.h
-VP9_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp9_subtract_mmx.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_sse2.c
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_sad4d_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_fwalsh_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subtract_sse2.asm
+VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_error_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_subpel_variance_impl_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_temporal_filter_apply_sse2.asm
VP9_CX_SRCS-$(HAVE_SSE3) += encoder/x86/vp9_sad_sse3.asm
VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_sad_ssse3.asm
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_ssse3.c
-VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_variance_impl_ssse3.asm
VP9_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/vp9_sad_sse4.asm
-VP9_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/vp9_encodeopt.asm
VP9_CX_SRCS-$(ARCH_X86_64) += encoder/x86/vp9_ssim_opt.asm
VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_sse2.c