-rwxr-xr-x  build/make/configure.sh        41
-rw-r--r--  libs.mk                         4
-rw-r--r--  vp8/common/asm_com_offsets.c   16
-rw-r--r--  vp8/common/mfqe.c             271
-rw-r--r--  vp8/common/postproc.c         209
-rw-r--r--  vp8/common/postproc.h           4
-rw-r--r--  vp8/common/rtcd_defs.sh         9
-rw-r--r--  vp8/common/x86/mfqe_sse2.asm  281
-rw-r--r--  vp8/encoder/onyx_if.c          12
-rw-r--r--  vp8/vp8_common.mk               2
10 files changed, 613 insertions, 236 deletions
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 571fd84e9..51878864a 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -635,42 +635,45 @@ process_common_toolchain() {
# Handle darwin variants. Newer SDKs allow targeting older
# platforms, so find the newest SDK available.
- if [ -d "/Developer/SDKs/MacOSX10.4u.sdk" ]; then
- osx_sdk_dir="/Developer/SDKs/MacOSX10.4u.sdk"
- fi
- if [ -d "/Developer/SDKs/MacOSX10.5.sdk" ]; then
- osx_sdk_dir="/Developer/SDKs/MacOSX10.5.sdk"
- fi
- if [ -d "/Developer/SDKs/MacOSX10.6.sdk" ]; then
- osx_sdk_dir="/Developer/SDKs/MacOSX10.6.sdk"
- fi
- if [ -d "/Developer/SDKs/MacOSX10.7.sdk" ]; then
- osx_sdk_dir="/Developer/SDKs/MacOSX10.7.sdk"
+ case ${toolchain} in
+ *-darwin*)
+ if [ -z "${DEVELOPER_DIR}" ]; then
+ DEVELOPER_DIR=`xcode-select -print-path 2> /dev/null`
+ [ $? -ne 0 ] && OSX_SKIP_DIR_CHECK=1
+ fi
+ if [ -z "${OSX_SKIP_DIR_CHECK}" ]; then
+ OSX_SDK_ROOTS="${DEVELOPER_DIR}/SDKs"
+ OSX_SDK_VERSIONS="MacOSX10.4u.sdk MacOSX10.5.sdk MacOSX10.6.sdk"
+ OSX_SDK_VERSIONS="${OSX_SDK_VERSIONS} MacOSX10.7.sdk"
+ for v in ${OSX_SDK_VERSIONS}; do
+ if [ -d "${OSX_SDK_ROOTS}/${v}" ]; then
+ osx_sdk_dir="${OSX_SDK_ROOTS}/${v}"
+ fi
+ done
+ fi
+ ;;
+ esac
+
+ if [ -d "${osx_sdk_dir}" ]; then
+ add_cflags "-isysroot ${osx_sdk_dir}"
+ add_ldflags "-isysroot ${osx_sdk_dir}"
fi
case ${toolchain} in
*-darwin8-*)
- add_cflags "-isysroot ${osx_sdk_dir}"
add_cflags "-mmacosx-version-min=10.4"
- add_ldflags "-isysroot ${osx_sdk_dir}"
add_ldflags "-mmacosx-version-min=10.4"
;;
*-darwin9-*)
- add_cflags "-isysroot ${osx_sdk_dir}"
add_cflags "-mmacosx-version-min=10.5"
- add_ldflags "-isysroot ${osx_sdk_dir}"
add_ldflags "-mmacosx-version-min=10.5"
;;
*-darwin10-*)
- add_cflags "-isysroot ${osx_sdk_dir}"
add_cflags "-mmacosx-version-min=10.6"
- add_ldflags "-isysroot ${osx_sdk_dir}"
add_ldflags "-mmacosx-version-min=10.6"
;;
*-darwin11-*)
- add_cflags "-isysroot ${osx_sdk_dir}"
add_cflags "-mmacosx-version-min=10.7"
- add_ldflags "-isysroot ${osx_sdk_dir}"
add_ldflags "-mmacosx-version-min=10.7"
;;
esac
diff --git a/libs.mk b/libs.mk
index b77276e1b..5889a88b7 100644
--- a/libs.mk
+++ b/libs.mk
@@ -329,8 +329,8 @@ CLEAN-OBJS += $(BUILD_PFX)vpx_version.h
#
# Rule to generate runtime cpu detection files
#
-$(OBJS-yes:.o=.d): vpx_rtcd.h
-vpx_rtcd.h: $(SRC_PATH_BARE)/$(sort $(filter %rtcd_defs.sh,$(CODEC_SRCS)))
+$(OBJS-yes:.o=.d): $(BUILD_PFX)vpx_rtcd.h
+$(BUILD_PFX)vpx_rtcd.h: $(SRC_PATH_BARE)/$(sort $(filter %rtcd_defs.sh,$(CODEC_SRCS)))
@echo " [CREATE] $@"
$(qexec)$(SRC_PATH_BARE)/build/make/rtcd.sh --arch=$(TGT_ISA) \
--sym=vpx_rtcd \
diff --git a/vp8/common/asm_com_offsets.c b/vp8/common/asm_com_offsets.c
index 5cf151980..ae22b5f6b 100644
--- a/vp8/common/asm_com_offsets.c
+++ b/vp8/common/asm_com_offsets.c
@@ -15,6 +15,10 @@
#include "vpx_scale/yv12config.h"
#include "vp8/common/blockd.h"
+#if CONFIG_POSTPROC
+#include "postproc.h"
+#endif /* CONFIG_POSTPROC */
+
BEGIN
/* vpx_scale */
@@ -30,6 +34,11 @@ DEFINE(yv12_buffer_config_v_buffer, offsetof(YV12_BUFFER_CONFIG, v_b
DEFINE(yv12_buffer_config_border, offsetof(YV12_BUFFER_CONFIG, border));
DEFINE(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS);
+#if CONFIG_POSTPROC
+/* mfqe.c / filter_by_weight */
+DEFINE(MFQE_PRECISION_VAL, MFQE_PRECISION);
+#endif /* CONFIG_POSTPROC */
+
END
/* add asserts for any offset that is not supported by assembly code */
@@ -53,3 +62,10 @@ ct_assert(B_HU_PRED, B_HU_PRED == 9);
/* vp8_yv12_extend_frame_borders_neon makes several assumptions based on this */
ct_assert(VP8BORDERINPIXELS_VAL, VP8BORDERINPIXELS == 32)
#endif
+
+#if HAVE_SSE2
+#if CONFIG_POSTPROC
+/* vp8_filter_by_weight16x16 and 8x8 */
+ct_assert(MFQE_PRECISION_VAL, MFQE_PRECISION == 4)
+#endif /* CONFIG_POSTPROC */
+#endif /* HAVE_SSE2 */
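The ct_assert entries tie constants used by the assembly to their C definitions
at compile time, so a change to MFQE_PRECISION cannot silently desynchronize
the two. As a generic sketch of the idiom (not libvpx's actual ct_assert
macro, which is defined elsewhere in the tree), a conditionally negative array
size makes compilation fail whenever the condition is false:

/* Generic compile-time assertion sketch; CT_ASSERT is a hypothetical name,
 * not the project's real macro. */
#define CT_ASSERT(name, cond) \
    typedef char name##_ct_assert[(cond) ? 1 : -1];

CT_ASSERT(mfqe_precision_is_4, (1 << 4) == 16) /* compiles only if true */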
diff --git a/vp8/common/mfqe.c b/vp8/common/mfqe.c
new file mode 100644
index 000000000..05d95395b
--- /dev/null
+++ b/vp8/common/mfqe.c
@@ -0,0 +1,271 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+/* MFQE: Multiframe Quality Enhancement
+ * In rate-limited situations keyframes may cause significant visual
+ * artifacts, commonly referred to as "popping." This file implements a
+ * postprocessing algorithm which blends data from the preceding frame
+ * when there is little motion and the previous frame's q is lower,
+ * indicating that it was encoded at higher quality.
+ */
+
+#include "postproc.h"
+#include "variance.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_rtcd.h"
+#include "vpx_scale/yv12config.h"
+
+#include <limits.h>
+#include <stdlib.h>
+
+
+static inline void filter_by_weight(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride,
+ int block_size, int src_weight)
+{
+ int dst_weight = (1 << MFQE_PRECISION) - src_weight;
+ int rounding_bit = 1 << (MFQE_PRECISION - 1);
+ int r, c;
+
+ for (r = 0; r < block_size; r++)
+ {
+ for (c = 0; c < block_size; c++)
+ {
+ dst[c] = (src[c] * src_weight +
+ dst[c] * dst_weight +
+ rounding_bit) >> MFQE_PRECISION;
+ }
+ src += src_stride;
+ dst += dst_stride;
+ }
+}
+
+void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride,
+ int src_weight)
+{
+ filter_by_weight(src, src_stride, dst, dst_stride, 16, src_weight);
+}
+
+void vp8_filter_by_weight8x8_c(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride,
+ int src_weight)
+{
+ filter_by_weight(src, src_stride, dst, dst_stride, 8, src_weight);
+}
+
+void vp8_filter_by_weight4x4_c(unsigned char *src, int src_stride,
+ unsigned char *dst, int dst_stride,
+ int src_weight)
+{
+ filter_by_weight(src, src_stride, dst, dst_stride, 4, src_weight);
+}
+
+static inline void apply_ifactor(unsigned char *y_src,
+ int y_src_stride,
+ unsigned char *y_dst,
+ int y_dst_stride,
+ unsigned char *u_src,
+ unsigned char *v_src,
+ int uv_src_stride,
+ unsigned char *u_dst,
+ unsigned char *v_dst,
+ int uv_dst_stride,
+ int block_size,
+ int src_weight)
+{
+ if (block_size == 16)
+ {
+ vp8_filter_by_weight16x16(y_src, y_src_stride, y_dst, y_dst_stride, src_weight);
+ vp8_filter_by_weight8x8(u_src, uv_src_stride, u_dst, uv_dst_stride, src_weight);
+ vp8_filter_by_weight8x8(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight);
+ }
+ else /* if (block_size == 8) */
+ {
+ vp8_filter_by_weight8x8(y_src, y_src_stride, y_dst, y_dst_stride, src_weight);
+ vp8_filter_by_weight4x4(u_src, uv_src_stride, u_dst, uv_dst_stride, src_weight);
+ vp8_filter_by_weight4x4(v_src, uv_src_stride, v_dst, uv_dst_stride, src_weight);
+ }
+}
+
+static void multiframe_quality_enhance_block
+(
+    int blksize, /* Currently the only supported values are 16 and 8 */
+ int qcurr,
+ int qprev,
+ unsigned char *y,
+ unsigned char *u,
+ unsigned char *v,
+ int y_stride,
+ int uv_stride,
+ unsigned char *yd,
+ unsigned char *ud,
+ unsigned char *vd,
+ int yd_stride,
+ int uvd_stride
+)
+{
+ static const unsigned char VP8_ZEROS[16]=
+ {
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
+ };
+
+ int uvblksize = blksize >> 1;
+ int qdiff = qcurr - qprev;
+
+ int i;
+ unsigned char *up;
+ unsigned char *udp;
+ unsigned char *vp;
+ unsigned char *vdp;
+
+ unsigned int act, sad, thr, sse;
+
+ if (blksize == 16)
+ {
+ act = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
+ sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, INT_MAX)+128)>>8;
+ }
+ else /* if (blksize == 8) */
+ {
+ act = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
+ sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, INT_MAX)+32)>>6;
+ }
+
+ /* thr = qdiff/8 + log2(act) + log4(qprev) */
+ thr = (qdiff>>3);
+ while (act>>=1) thr++;
+ while (qprev>>=2) thr++;
+
+ if (sad < thr)
+ {
+ int ifactor = (sad << MFQE_PRECISION) / thr;
+ ifactor >>= (qdiff >> 5);
+
+ if (ifactor)
+ {
+ apply_ifactor(y, y_stride, yd, yd_stride,
+ u, v, uv_stride,
+ ud, vd, uvd_stride,
+ blksize, ifactor);
+ }
+ /* else implicitly copy from previous frame */
+ }
+ else
+ {
+ if (blksize == 16)
+ {
+ vp8_copy_mem16x16(y, y_stride, yd, yd_stride);
+ vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride);
+ vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride);
+ }
+ else /* if (blksize == 8) */
+ {
+ vp8_copy_mem8x8(y, y_stride, yd, yd_stride);
+ for (up = u, udp = ud, i = 0; i < uvblksize; ++i, up += uv_stride, udp += uvd_stride)
+ vpx_memcpy(udp, up, uvblksize);
+ for (vp = v, vdp = vd, i = 0; i < uvblksize; ++i, vp += uv_stride, vdp += uvd_stride)
+ vpx_memcpy(vdp, vp, uvblksize);
+ }
+ }
+}
+
+void vp8_multiframe_quality_enhance
+(
+ VP8_COMMON *cm
+)
+{
+ YV12_BUFFER_CONFIG *show = cm->frame_to_show;
+ YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+
+ FRAME_TYPE frame_type = cm->frame_type;
+    /* Point at base of MB MODE_INFO list; it has motion vectors etc. */
+ const MODE_INFO *mode_info_context = cm->mi;
+ int mb_row;
+ int mb_col;
+ int qcurr = cm->base_qindex;
+ int qprev = cm->postproc_state.last_base_qindex;
+
+ unsigned char *y_ptr, *u_ptr, *v_ptr;
+ unsigned char *yd_ptr, *ud_ptr, *vd_ptr;
+
+ /* Set up the buffer pointers */
+ y_ptr = show->y_buffer;
+ u_ptr = show->u_buffer;
+ v_ptr = show->v_buffer;
+ yd_ptr = dest->y_buffer;
+ ud_ptr = dest->u_buffer;
+ vd_ptr = dest->v_buffer;
+
+ /* postprocess each macro block */
+ for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
+ {
+ for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
+ {
+ /* if motion is high there will likely be no benefit */
+ if (((frame_type == INTER_FRAME &&
+ abs(mode_info_context->mbmi.mv.as_mv.row) <= 10 &&
+ abs(mode_info_context->mbmi.mv.as_mv.col) <= 10) ||
+ (frame_type == KEY_FRAME)))
+ {
+ if (mode_info_context->mbmi.mode == B_PRED || mode_info_context->mbmi.mode == SPLITMV)
+ {
+ int i, j;
+ for (i=0; i<2; ++i)
+ for (j=0; j<2; ++j)
+ multiframe_quality_enhance_block(8, qcurr, qprev,
+ y_ptr + 8*(i*show->y_stride+j),
+ u_ptr + 4*(i*show->uv_stride+j),
+ v_ptr + 4*(i*show->uv_stride+j),
+ show->y_stride,
+ show->uv_stride,
+ yd_ptr + 8*(i*dest->y_stride+j),
+ ud_ptr + 4*(i*dest->uv_stride+j),
+ vd_ptr + 4*(i*dest->uv_stride+j),
+ dest->y_stride,
+ dest->uv_stride);
+ }
+ else
+ {
+ multiframe_quality_enhance_block(16, qcurr, qprev, y_ptr,
+ u_ptr, v_ptr,
+ show->y_stride,
+ show->uv_stride,
+ yd_ptr, ud_ptr, vd_ptr,
+ dest->y_stride,
+ dest->uv_stride);
+ }
+ }
+ else
+ {
+ vp8_copy_mem16x16(y_ptr, show->y_stride, yd_ptr, dest->y_stride);
+ vp8_copy_mem8x8(u_ptr, show->uv_stride, ud_ptr, dest->uv_stride);
+ vp8_copy_mem8x8(v_ptr, show->uv_stride, vd_ptr, dest->uv_stride);
+ }
+ y_ptr += 16;
+ u_ptr += 8;
+ v_ptr += 8;
+ yd_ptr += 16;
+ ud_ptr += 8;
+ vd_ptr += 8;
+ mode_info_context++; /* step to next MB */
+ }
+
+ y_ptr += show->y_stride * 16 - 16 * cm->mb_cols;
+ u_ptr += show->uv_stride * 8 - 8 * cm->mb_cols;
+ v_ptr += show->uv_stride * 8 - 8 * cm->mb_cols;
+ yd_ptr += dest->y_stride * 16 - 16 * cm->mb_cols;
+ ud_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols;
+ vd_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols;
+
+ mode_info_context++; /* Skip border mb */
+ }
+}
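To make the block-selection arithmetic above concrete, here is a small worked
sketch in plain C. The qcurr, qprev, act and sad values are illustrative
stand-ins, not numbers taken from the codec:

#include <stdio.h>

#define MFQE_PRECISION 4

int main(void)
{
    int qcurr = 60, qprev = 20;     /* current frame coded coarser than previous */
    unsigned int act = 8, sad = 3;  /* normalized activity and SAD (assumed) */
    int qdiff = qcurr - qprev;      /* 40 */
    unsigned int thr;

    /* thr = qdiff/8 + log2(act) + log4(qprev), as in the code above */
    thr = qdiff >> 3;               /* 40 / 8 = 5 */
    while (act >>= 1) thr++;        /* + floor(log2(8))  = 3 -> 8  */
    while (qprev >>= 2) thr++;      /* + floor(log4(20)) = 2 -> 10 */

    if (sad < thr)
    {
        /* blend weight for the current frame, out of 1 << MFQE_PRECISION */
        int ifactor = (sad << MFQE_PRECISION) / thr;  /* 48 / 10 = 4 */
        ifactor >>= (qdiff >> 5);                     /* 4 >> 1  = 2 */
        printf("blend %d/16 current + %d/16 previous\n",
               ifactor, (1 << MFQE_PRECISION) - ifactor);
    }
    return 0;
}

With these inputs the block is blended 2/16 from the current frame and 14/16
from the higher-quality previous frame.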
diff --git a/vp8/common/postproc.c b/vp8/common/postproc.c
index d41445d03..cc0d9f80b 100644
--- a/vp8/common/postproc.c
+++ b/vp8/common/postproc.c
@@ -17,7 +17,6 @@
#include "vpx_scale/yv12extend.h"
#include "vpx_scale/vpxscale.h"
#include "systemdependent.h"
-#include "variance.h"
#include <limits.h>
#include <math.h>
@@ -30,7 +29,6 @@
( (0.439*(float)(t>>16)) - (0.368*(float)(t>>8&0xff)) - (0.071*(float)(t&0xff)) + 128)
/* global constants */
-#define MFQE_PRECISION 4
#if CONFIG_POSTPROC_VISUALIZER
static const unsigned char MB_PREDICTION_MODE_colors[MB_MODE_COUNT][3] =
{
@@ -696,213 +694,6 @@ static void constrain_line (int x0, int *x1, int y0, int *y1, int width, int hei
}
-static void multiframe_quality_enhance_block
-(
- int blksize, /* Currently only values supported are 16, 8, 4 */
- int qcurr,
- int qprev,
- unsigned char *y,
- unsigned char *u,
- unsigned char *v,
- int y_stride,
- int uv_stride,
- unsigned char *yd,
- unsigned char *ud,
- unsigned char *vd,
- int yd_stride,
- int uvd_stride
-)
-{
- static const unsigned char VP8_ZEROS[16]=
- {
- 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
- };
- int blksizeby2 = blksize >> 1;
- int qdiff = qcurr - qprev;
-
- int i, j;
- unsigned char *yp;
- unsigned char *ydp;
- unsigned char *up;
- unsigned char *udp;
- unsigned char *vp;
- unsigned char *vdp;
-
- unsigned int act, sse, sad, thr;
- if (blksize == 16)
- {
- act = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
- sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, INT_MAX)+128)>>8;
- }
- else if (blksize == 8)
- {
- act = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
- sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, INT_MAX)+32)>>6;
- }
- else
- {
- act = (vp8_variance4x4(yd, yd_stride, VP8_ZEROS, 0, &sse)+8)>>4;
- sad = (vp8_sad4x4(y, y_stride, yd, yd_stride, INT_MAX)+8)>>4;
- }
- /* thr = qdiff/8 + log2(act) + log4(qprev) */
- thr = (qdiff>>3);
- while (act>>=1) thr++;
- while (qprev>>=2) thr++;
- if (sad < thr)
- {
- static const int roundoff = (1 << (MFQE_PRECISION - 1));
- int ifactor = (sad << MFQE_PRECISION) / thr;
- ifactor >>= (qdiff >> 5);
- // TODO: SIMD optimize this section
- if (ifactor)
- {
- int icfactor = (1 << MFQE_PRECISION) - ifactor;
- for (yp = y, ydp = yd, i = 0; i < blksize; ++i, yp += y_stride, ydp += yd_stride)
- {
- for (j = 0; j < blksize; ++j)
- ydp[j] = (int)((yp[j] * ifactor + ydp[j] * icfactor + roundoff) >> MFQE_PRECISION);
- }
- for (up = u, udp = ud, i = 0; i < blksizeby2; ++i, up += uv_stride, udp += uvd_stride)
- {
- for (j = 0; j < blksizeby2; ++j)
- udp[j] = (int)((up[j] * ifactor + udp[j] * icfactor + roundoff) >> MFQE_PRECISION);
- }
- for (vp = v, vdp = vd, i = 0; i < blksizeby2; ++i, vp += uv_stride, vdp += uvd_stride)
- {
- for (j = 0; j < blksizeby2; ++j)
- vdp[j] = (int)((vp[j] * ifactor + vdp[j] * icfactor + roundoff) >> MFQE_PRECISION);
- }
- }
- }
- else
- {
- if (blksize == 16)
- {
- vp8_copy_mem16x16(y, y_stride, yd, yd_stride);
- vp8_copy_mem8x8(u, uv_stride, ud, uvd_stride);
- vp8_copy_mem8x8(v, uv_stride, vd, uvd_stride);
- }
- else if (blksize == 8)
- {
- vp8_copy_mem8x8(y, y_stride, yd, yd_stride);
- for (up = u, udp = ud, i = 0; i < blksizeby2; ++i, up += uv_stride, udp += uvd_stride)
- vpx_memcpy(udp, up, blksizeby2);
- for (vp = v, vdp = vd, i = 0; i < blksizeby2; ++i, vp += uv_stride, vdp += uvd_stride)
- vpx_memcpy(vdp, vp, blksizeby2);
- }
- else
- {
- for (yp = y, ydp = yd, i = 0; i < blksize; ++i, yp += y_stride, ydp += yd_stride)
- vpx_memcpy(ydp, yp, blksize);
- for (up = u, udp = ud, i = 0; i < blksizeby2; ++i, up += uv_stride, udp += uvd_stride)
- vpx_memcpy(udp, up, blksizeby2);
- for (vp = v, vdp = vd, i = 0; i < blksizeby2; ++i, vp += uv_stride, vdp += uvd_stride)
- vpx_memcpy(vdp, vp, blksizeby2);
- }
- }
-}
-
-void vp8_multiframe_quality_enhance
-(
- VP8_COMMON *cm
-)
-{
- YV12_BUFFER_CONFIG *show = cm->frame_to_show;
- YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
-
- FRAME_TYPE frame_type = cm->frame_type;
- /* Point at base of Mb MODE_INFO list has motion vectors etc */
- const MODE_INFO *mode_info_context = cm->mi;
- int mb_row;
- int mb_col;
- int qcurr = cm->base_qindex;
- int qprev = cm->postproc_state.last_base_qindex;
-
- unsigned char *y_ptr, *u_ptr, *v_ptr;
- unsigned char *yd_ptr, *ud_ptr, *vd_ptr;
-
- /* Set up the buffer pointers */
- y_ptr = show->y_buffer;
- u_ptr = show->u_buffer;
- v_ptr = show->v_buffer;
- yd_ptr = dest->y_buffer;
- ud_ptr = dest->u_buffer;
- vd_ptr = dest->v_buffer;
-
- /* postprocess each macro block */
- for (mb_row = 0; mb_row < cm->mb_rows; mb_row++)
- {
- for (mb_col = 0; mb_col < cm->mb_cols; mb_col++)
- {
- /* if motion is high there will likely be no benefit */
- if (((frame_type == INTER_FRAME &&
- abs(mode_info_context->mbmi.mv.as_mv.row) <= 10 &&
- abs(mode_info_context->mbmi.mv.as_mv.col) <= 10) ||
- (frame_type == KEY_FRAME)))
- {
- if (mode_info_context->mbmi.mode == B_PRED || mode_info_context->mbmi.mode == SPLITMV)
- {
- int i, j;
- for (i=0; i<2; ++i)
- for (j=0; j<2; ++j)
- multiframe_quality_enhance_block(8,
- qcurr,
- qprev,
- y_ptr + 8*(i*show->y_stride+j),
- u_ptr + 4*(i*show->uv_stride+j),
- v_ptr + 4*(i*show->uv_stride+j),
- show->y_stride,
- show->uv_stride,
- yd_ptr + 8*(i*dest->y_stride+j),
- ud_ptr + 4*(i*dest->uv_stride+j),
- vd_ptr + 4*(i*dest->uv_stride+j),
- dest->y_stride,
- dest->uv_stride);
- }
- else
- {
- multiframe_quality_enhance_block(16,
- qcurr,
- qprev,
- y_ptr,
- u_ptr,
- v_ptr,
- show->y_stride,
- show->uv_stride,
- yd_ptr,
- ud_ptr,
- vd_ptr,
- dest->y_stride,
- dest->uv_stride);
-
- }
- }
- else
- {
- vp8_copy_mem16x16(y_ptr, show->y_stride, yd_ptr, dest->y_stride);
- vp8_copy_mem8x8(u_ptr, show->uv_stride, ud_ptr, dest->uv_stride);
- vp8_copy_mem8x8(v_ptr, show->uv_stride, vd_ptr, dest->uv_stride);
- }
- y_ptr += 16;
- u_ptr += 8;
- v_ptr += 8;
- yd_ptr += 16;
- ud_ptr += 8;
- vd_ptr += 8;
- mode_info_context++; /* step to next MB */
- }
-
- y_ptr += show->y_stride * 16 - 16 * cm->mb_cols;
- u_ptr += show->uv_stride * 8 - 8 * cm->mb_cols;
- v_ptr += show->uv_stride * 8 - 8 * cm->mb_cols;
- yd_ptr += dest->y_stride * 16 - 16 * cm->mb_cols;
- ud_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols;
- vd_ptr += dest->uv_stride * 8 - 8 * cm->mb_cols;
-
- mode_info_context++; /* Skip border mb */
- }
-}
-
int vp8_post_proc_frame(VP8_COMMON *oci, YV12_BUFFER_CONFIG *dest, vp8_ppflags_t *ppflags)
{
int q = oci->filter_level * 10 / 6;
diff --git a/vp8/common/postproc.h b/vp8/common/postproc.h
index 1db74379f..4a792dcbe 100644
--- a/vp8/common/postproc.h
+++ b/vp8/common/postproc.h
@@ -40,4 +40,8 @@ void vp8_deblock(YV12_BUFFER_CONFIG *source,
int q,
int low_var_thresh,
int flag);
+
+#define MFQE_PRECISION 4
+
+void vp8_multiframe_quality_enhance(struct VP8Common *cm);
#endif
diff --git a/vp8/common/rtcd_defs.sh b/vp8/common/rtcd_defs.sh
index ff8e30c3f..0fdb4fa00 100644
--- a/vp8/common/rtcd_defs.sh
+++ b/vp8/common/rtcd_defs.sh
@@ -166,6 +166,15 @@ if [ "$CONFIG_POSTPROC" = "yes" ]; then
prototype void vp8_blend_b "unsigned char *y, unsigned char *u, unsigned char *v, int y1, int u1, int v1, int alpha, int stride"
# no asm yet
+
+ prototype void vp8_filter_by_weight16x16 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
+ specialize vp8_filter_by_weight16x16 sse2
+
+ prototype void vp8_filter_by_weight8x8 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
+ specialize vp8_filter_by_weight8x8 sse2
+
+ prototype void vp8_filter_by_weight4x4 "unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight"
+ # no asm yet
fi
#
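Each prototype/specialize pair above is consumed by build/make/rtcd.sh when it
generates vpx_rtcd.h. The resulting dispatch looks roughly like the following;
this is a sketch of the usual shape only, since the real header is a build
product and differs in detail:

/* Sketch of rtcd-style dispatch for one specialized function; the exact
 * generated code is produced at build time. */
void vp8_filter_by_weight16x16_c(unsigned char *src, int src_stride,
                                 unsigned char *dst, int dst_stride,
                                 int src_weight);
void vp8_filter_by_weight16x16_sse2(unsigned char *src, int src_stride,
                                    unsigned char *dst, int dst_stride,
                                    int src_weight);

/* With runtime CPU detection, calls go through a pointer that vpx_rtcd()
 * aims at the best available version; static builds #define the bare name
 * to the chosen implementation instead. */
extern void (*vp8_filter_by_weight16x16)(unsigned char *src, int src_stride,
                                         unsigned char *dst, int dst_stride,
                                         int src_weight);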
diff --git a/vp8/common/x86/mfqe_sse2.asm b/vp8/common/x86/mfqe_sse2.asm
new file mode 100644
index 000000000..10d21f320
--- /dev/null
+++ b/vp8/common/x86/mfqe_sse2.asm
@@ -0,0 +1,281 @@
+;
+; Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+
+;void vp8_filter_by_weight16x16_sse2
+;(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride,
+; int src_weight
+;)
+global sym(vp8_filter_by_weight16x16_sse2)
+sym(vp8_filter_by_weight16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ SAVE_XMM 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movd xmm0, arg(4) ; src_weight
+ pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
+ punpcklqdq xmm0, xmm0 ; replicate to all hi words
+
+ movdqa xmm1, [GLOBAL(tMFQE)]
+ psubw xmm1, xmm0 ; dst_weight
+
+ mov rax, arg(0) ; src
+ mov rsi, arg(1) ; src_stride
+ mov rdx, arg(2) ; dst
+ mov rdi, arg(3) ; dst_stride
+
+ mov rcx, 16 ; loop count
+ pxor xmm6, xmm6
+
+.combine:
+ movdqa xmm2, [rax]
+ movdqa xmm4, [rdx]
+ add rax, rsi
+
+ ; src * src_weight
+ movdqa xmm3, xmm2
+ punpcklbw xmm2, xmm6
+ punpckhbw xmm3, xmm6
+ pmullw xmm2, xmm0
+ pmullw xmm3, xmm0
+
+ ; dst * dst_weight
+ movdqa xmm5, xmm4
+ punpcklbw xmm4, xmm6
+ punpckhbw xmm5, xmm6
+ pmullw xmm4, xmm1
+ pmullw xmm5, xmm1
+
+ ; sum, round and shift
+ paddw xmm2, xmm4
+ paddw xmm3, xmm5
+ paddw xmm2, [GLOBAL(tMFQE_round)]
+ paddw xmm3, [GLOBAL(tMFQE_round)]
+ psrlw xmm2, 4
+ psrlw xmm3, 4
+
+ packuswb xmm2, xmm3
+ movdqa [rdx], xmm2
+ add rdx, rdi
+
+ dec rcx
+ jnz .combine
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ RESTORE_XMM
+ UNSHADOW_ARGS
+ pop rbp
+
+ ret
+
+;void vp8_filter_by_weight8x8_sse2
+;(
+; unsigned char *src,
+; int src_stride,
+; unsigned char *dst,
+; int dst_stride,
+; int src_weight
+;)
+global sym(vp8_filter_by_weight8x8_sse2)
+sym(vp8_filter_by_weight8x8_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 5
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ movd xmm0, arg(4) ; src_weight
+ pshuflw xmm0, xmm0, 0x0 ; replicate to all low words
+ punpcklqdq xmm0, xmm0 ; replicate to all hi words
+
+ movdqa xmm1, [GLOBAL(tMFQE)]
+ psubw xmm1, xmm0 ; dst_weight
+
+ mov rax, arg(0) ; src
+ mov rsi, arg(1) ; src_stride
+ mov rdx, arg(2) ; dst
+ mov rdi, arg(3) ; dst_stride
+
+ mov rcx, 8 ; loop count
+ pxor xmm4, xmm4
+
+.combine:
+ movq xmm2, [rax]
+ movq xmm3, [rdx]
+ add rax, rsi
+
+ ; src * src_weight
+ punpcklbw xmm2, xmm4
+ pmullw xmm2, xmm0
+
+ ; dst * dst_weight
+ punpcklbw xmm3, xmm4
+ pmullw xmm3, xmm1
+
+ ; sum, round and shift
+ paddw xmm2, xmm3
+ paddw xmm2, [GLOBAL(tMFQE_round)]
+ psrlw xmm2, 4
+
+ packuswb xmm2, xmm4
+ movq [rdx], xmm2
+ add rdx, rdi
+
+ dec rcx
+ jnz .combine
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+
+ ret
+
+;void vp8_variance_and_sad_16x16_sse2 | arg
+;(
+; unsigned char *src1, 0
+; int stride1, 1
+; unsigned char *src2, 2
+; int stride2, 3
+; unsigned int *variance, 4
+; unsigned int *sad, 5
+;)
+global sym(vp8_variance_and_sad_16x16_sse2)
+sym(vp8_variance_and_sad_16x16_sse2):
+ push rbp
+ mov rbp, rsp
+ SHADOW_ARGS_TO_STACK 6
+ GET_GOT rbx
+ push rsi
+ push rdi
+ ; end prolog
+
+ mov rax, arg(0) ; src1
+ mov rcx, arg(1) ; stride1
+ mov rdx, arg(2) ; src2
+ mov rdi, arg(3) ; stride2
+
+ mov rsi, 16 ; block height
+
+ ; Prep accumulator registers
+ pxor xmm3, xmm3 ; SAD
+ pxor xmm4, xmm4 ; sum of src2
+ pxor xmm5, xmm5 ; sum of src2^2
+
+ ; Because we're working with the actual output frames
+ ; we can't depend on any kind of data alignment.
+.accumulate:
+ movdqa xmm0, [rax] ; src1
+ movdqa xmm1, [rdx] ; src2
+ add rax, rcx ; src1 + stride1
+ add rdx, rdi ; src2 + stride2
+
+ ; SAD(src1, src2)
+ psadbw xmm0, xmm1
+ paddusw xmm3, xmm0
+
+ ; SUM(src2)
+ pxor xmm2, xmm2
+ psadbw xmm2, xmm1 ; sum src2 by misusing SAD against 0
+ paddusw xmm4, xmm2
+
+    ; pmaddubsw would be ideal if it took two unsigned values. Instead,
+    ; it expects one signed and one unsigned value, so we zero-extend
+    ; and operate on words.
+ pxor xmm2, xmm2
+ movdqa xmm0, xmm1
+ punpcklbw xmm0, xmm2
+ punpckhbw xmm1, xmm2
+ pmaddwd xmm0, xmm0
+ pmaddwd xmm1, xmm1
+ paddd xmm5, xmm0
+ paddd xmm5, xmm1
+
+ sub rsi, 1
+ jnz .accumulate
+
+ ; phaddd only operates on adjacent double words.
+ ; Finalize SAD and store
+ movdqa xmm0, xmm3
+ psrldq xmm0, 8
+ paddusw xmm0, xmm3
+ paddd xmm0, [GLOBAL(t128)]
+ psrld xmm0, 8
+
+ mov rax, arg(5)
+ movd [rax], xmm0
+
+ ; Accumulate sum of src2
+ movdqa xmm0, xmm4
+ psrldq xmm0, 8
+ paddusw xmm0, xmm4
+ ; Square src2. Ignore high value
+ pmuludq xmm0, xmm0
+ psrld xmm0, 8
+
+    ; phaddw could be used to sum adjacent values but we want
+    ; all of the values summed. Promote to doublewords, accumulate,
+    ; shift and sum.
+ pxor xmm2, xmm2
+ movdqa xmm1, xmm5
+ punpckldq xmm1, xmm2
+ punpckhdq xmm5, xmm2
+ paddd xmm1, xmm5
+ movdqa xmm2, xmm1
+ psrldq xmm1, 8
+ paddd xmm1, xmm2
+
+ psubd xmm1, xmm0
+
+ ; (variance + 128) >> 8
+ paddd xmm1, [GLOBAL(t128)]
+ psrld xmm1, 8
+ mov rax, arg(4)
+
+ movd [rax], xmm1
+
+
+ ; begin epilog
+ pop rdi
+ pop rsi
+ RESTORE_GOT
+ UNSHADOW_ARGS
+ pop rbp
+ ret
+
+SECTION_RODATA
+align 16
+t128:
+ ddq 128
+align 16
+tMFQE: ; 1 << MFQE_PRECISION
+ times 8 dw 0x10
+align 16
+tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
+ times 8 dw 0x08
+
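For readers more at home with intrinsics, one 16-pixel row of the weighted
blend in vp8_filter_by_weight16x16_sse2 maps to roughly the following C. This
is a sketch of the same arithmetic, not part of the commit; it uses unaligned
loads where the assembly relies on movdqa:

#include <emmintrin.h>

/* Blend one 16-byte row: dst = (src*src_weight + dst*dst_weight + 8) >> 4 */
static void blend_row16(const unsigned char *src, unsigned char *dst,
                        int src_weight)
{
    const __m128i zero  = _mm_setzero_si128();
    const __m128i sw    = _mm_set1_epi16((short)src_weight);
    const __m128i dw    = _mm_set1_epi16((short)(16 - src_weight));
    const __m128i round = _mm_set1_epi16(8);  /* 1 << (MFQE_PRECISION - 1) */

    __m128i s = _mm_loadu_si128((const __m128i *)src);
    __m128i d = _mm_loadu_si128((const __m128i *)dst);

    /* widen bytes to words, apply the weights */
    __m128i slo = _mm_mullo_epi16(_mm_unpacklo_epi8(s, zero), sw);
    __m128i shi = _mm_mullo_epi16(_mm_unpackhi_epi8(s, zero), sw);
    __m128i dlo = _mm_mullo_epi16(_mm_unpacklo_epi8(d, zero), dw);
    __m128i dhi = _mm_mullo_epi16(_mm_unpackhi_epi8(d, zero), dw);

    /* sum, round, shift back down and repack to bytes */
    __m128i lo = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(slo, dlo), round), 4);
    __m128i hi = _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(shi, dhi), round), 4);

    _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(lo, hi));
}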
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 607b56a2a..1de7514d0 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -144,7 +144,7 @@ extern void vp8cx_init_quantizer(VP8_COMP *cpi);
extern const int vp8cx_base_skip_false_prob[128];
// Tables relating active max Q to active min Q
-static const int kf_low_motion_minq[QINDEX_RANGE] =
+static const unsigned char kf_low_motion_minq[QINDEX_RANGE] =
{
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@@ -155,7 +155,7 @@ static const int kf_low_motion_minq[QINDEX_RANGE] =
11,11,12,12,13,13,13,13,14,14,15,15,15,15,16,16,
16,16,17,17,18,18,18,18,19,20,20,21,21,22,23,23
};
-static const int kf_high_motion_minq[QINDEX_RANGE] =
+static const unsigned char kf_high_motion_minq[QINDEX_RANGE] =
{
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
@@ -166,7 +166,7 @@ static const int kf_high_motion_minq[QINDEX_RANGE] =
16,16,17,17,18,18,18,18,19,19,20,20,20,20,21,21,
21,21,22,22,23,23,24,25,25,26,26,27,28,28,29,30
};
-static const int gf_low_motion_minq[QINDEX_RANGE] =
+static const unsigned char gf_low_motion_minq[QINDEX_RANGE] =
{
0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2,
3,3,3,3,4,4,4,4,5,5,5,5,6,6,6,6,
@@ -177,7 +177,7 @@ static const int gf_low_motion_minq[QINDEX_RANGE] =
35,35,36,36,37,37,38,38,39,39,40,40,41,41,42,42,
43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58
};
-static const int gf_mid_motion_minq[QINDEX_RANGE] =
+static const unsigned char gf_mid_motion_minq[QINDEX_RANGE] =
{
0,0,0,0,1,1,1,1,1,1,2,2,3,3,3,4,
4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,
@@ -188,7 +188,7 @@ static const int gf_mid_motion_minq[QINDEX_RANGE] =
38,39,39,40,40,41,41,42,42,43,43,44,45,46,47,48,
49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64
};
-static const int gf_high_motion_minq[QINDEX_RANGE] =
+static const unsigned char gf_high_motion_minq[QINDEX_RANGE] =
{
0,0,0,0,1,1,1,1,1,2,2,2,3,3,3,4,
4,4,5,5,5,6,6,6,7,7,7,8,8,8,9,9,
@@ -199,7 +199,7 @@ static const int gf_high_motion_minq[QINDEX_RANGE] =
41,41,42,42,43,44,45,46,47,48,49,50,51,52,53,54,
55,56,57,58,59,60,62,64,66,68,70,72,74,76,78,80
};
-static const int inter_minq[QINDEX_RANGE] =
+static const unsigned char inter_minq[QINDEX_RANGE] =
{
0,0,1,1,2,3,3,4,4,5,6,6,7,8,8,9,
9,10,11,11,12,13,13,14,15,15,16,17,17,18,19,20,
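Every entry in these minq tables is below 256, so narrowing them from int to
unsigned char is lossless. With QINDEX_RANGE at 128, the constant-data saving
works out as below (a back-of-the-envelope sketch assuming a 4-byte int):

#include <stdio.h>

#define QINDEX_RANGE 128  /* as in vp8 */

int main(void)
{
    /* six minq tables narrowed from int to unsigned char */
    unsigned saved = 6 * QINDEX_RANGE *
                     (unsigned)(sizeof(int) - sizeof(unsigned char));
    printf("%u bytes of const data saved\n", saved);  /* 2304 */
    return 0;
}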
diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk
index f68d007c1..3403557e9 100644
--- a/vp8/vp8_common.mk
+++ b/vp8/vp8_common.mk
@@ -79,6 +79,7 @@ VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.c
VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/filter_x86.h
VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/vp8_asm_stubs.c
VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/loopfilter_x86.c
+VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/mfqe.c
VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.h
VP8_COMMON_SRCS-$(CONFIG_POSTPROC) += common/postproc.c
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/dequantize_mmx.asm
@@ -112,6 +113,7 @@ VP8_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/sad_sse4.asm
ifeq ($(CONFIG_POSTPROC),yes)
VP8_COMMON_SRCS-$(ARCH_X86)$(ARCH_X86_64) += common/x86/postproc_x86.c
VP8_COMMON_SRCS-$(HAVE_MMX) += common/x86/postproc_mmx.asm
+VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/mfqe_sse2.asm
VP8_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
endif