-rwxr-xr-x | build/make/iosbuild.sh            |  91
-rw-r--r-- | vp8/common/rtcd_defs.pl           |   4
-rw-r--r-- | vp8/encoder/x86/quantize_sse4.asm | 256
-rw-r--r-- | vp8/encoder/x86/quantize_sse4.c   | 130
-rw-r--r-- | vp8/vp8cx.mk                      |   2
-rw-r--r-- | vp9/encoder/vp9_denoiser.c        | 142
-rw-r--r-- | vp9/encoder/vp9_denoiser.h        |   8
-rw-r--r-- | vp9/encoder/vp9_encoder.c         |   7
-rw-r--r-- | vp9/encoder/vp9_pickmode.c        |  18
-rw-r--r-- | vp9/encoder/vp9_speed_features.c  |   4
10 files changed, 381 insertions, 281 deletions
diff --git a/build/make/iosbuild.sh b/build/make/iosbuild.sh
index 230c0ce8c..9d9c3749b 100755
--- a/build/make/iosbuild.sh
+++ b/build/make/iosbuild.sh
@@ -50,9 +50,79 @@ build_target() {
   vlog "***Done building target: ${target}***"
 }
 
+# Returns the preprocessor symbol for the target specified by $1.
+target_to_preproc_symbol() {
+  target="$1"
+  case "${target}" in
+    armv6-*)
+      echo "__ARM_ARCH_6__"
+      ;;
+    armv7-*)
+      echo "__ARM_ARCH_7__"
+      ;;
+    armv7s-*)
+      echo "__ARM_ARCH_7S__"
+      ;;
+    x86-*)
+      echo "__i386__"
+      ;;
+    x86_64-*)
+      echo "__x86_64__"
+      ;;
+    *)
+      echo "#error ${target} unknown/unsupported"
+      return 1
+      ;;
+  esac
+}
+
+# Create a vpx_config.h shim that, based on preprocessor settings for the
+# current target CPU, includes the real vpx_config.h for the current target.
+# $1 is the list of targets.
+create_vpx_framework_config_shim() {
+  local targets="$1"
+  local config_file="${HEADER_DIR}/vpx_config.h"
+  local preproc_symbol=""
+  local target=""
+  local include_guard="VPX_FRAMEWORK_HEADERS_VPX_VPX_CONFIG_H_"
+
+  local file_header="/*
+ * Copyright (c) $(date +%Y) The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* GENERATED FILE: DO NOT EDIT! */
+
+#ifndef ${include_guard}
+#define ${include_guard}
+
+#if defined"
+
+  printf "%s" "${file_header}" > "${config_file}"
+  for target in ${targets}; do
+    preproc_symbol=$(target_to_preproc_symbol "${target}")
+    printf " ${preproc_symbol}\n" >> "${config_file}"
+    printf "#include \"VPX/vpx/${target}/vpx_config.h\"\n" >> "${config_file}"
+    printf "#elif defined" >> "${config_file}"
+    mkdir "${HEADER_DIR}/${target}"
+    cp -p "${BUILD_ROOT}/${target}/vpx_config.h" "${HEADER_DIR}/${target}"
+  done
+
+  # Consume the last line of output from the loop: We don't want it.
+  sed -i '' -e '$d' "${config_file}"
+
+  printf "#endif\n\n" >> "${config_file}"
+  printf "#endif // ${include_guard}" >> "${config_file}"
+}
+
 # Configures and builds each target specified by $1, and then builds
 # VPX.framework.
-build_targets() {
+build_framework() {
   local lib_list=""
   local targets="$1"
   local target=""
@@ -75,15 +145,20 @@ build_targets() {
 
   cd "${ORIG_PWD}"
 
-  # Includes are identical for all platforms, and according to dist target
-  # behavior vpx_config.h and vpx_version.h aren't actually necessary for user
-  # apps built with libvpx. So, just copy the includes from the last target
-  # built.
-  # TODO(tomfinegan): The above is a lame excuse. Build common config/version
-  # includes that use the preprocessor to include the correct file.
+  # The basic libvpx API includes are all the same; just grab the most recent
+  # set.
   cp -p "${target_dist_dir}"/include/vpx/* "${HEADER_DIR}"
+
+  # Build the fat library.
   ${LIPO} -create ${lib_list} -output ${FRAMEWORK_DIR}/VPX
 
+  # Create the vpx_config.h shim that allows usage of vpx_config.h from
+  # within VPX.framework.
+  create_vpx_framework_config_shim "${targets}"
+
+  # Copy in vpx_version.h.
+  cp -p "${BUILD_ROOT}/${target}/vpx_version.h" "${HEADER_DIR}"
+
   vlog "Created fat library ${FRAMEWORK_DIR}/VPX containing:"
   for lib in ${lib_list}; do
     vlog "  $(echo ${lib} | awk -F / '{print $2, $NF}')"
   done
@@ -166,4 +241,4 @@ cat << EOF
 EOF
 fi
 
-build_targets "${TARGETS}"
+build_framework "${TARGETS}"
diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl
index cbfd76a8d..3e4077460 100644
--- a/vp8/common/rtcd_defs.pl
+++ b/vp8/common/rtcd_defs.pl
@@ -463,9 +463,7 @@ $vp8_short_walsh4x4_neon_asm=vp8_short_walsh4x4_neon;
 # Quantizer
 #
 add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_regular_quantize_b sse2/;
-# TODO(johann) Update sse4 implementation and re-enable
-#$vp8_regular_quantize_b_sse4_1=vp8_regular_quantize_b_sse4;
+specialize qw/vp8_regular_quantize_b sse2 sse4_1/;
 
 add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
 specialize qw/vp8_fast_quantize_b sse2 ssse3 media neon_asm/;
diff --git a/vp8/encoder/x86/quantize_sse4.asm b/vp8/encoder/x86/quantize_sse4.asm
deleted file mode 100644
index dbd171bfc..000000000
--- a/vp8/encoder/x86/quantize_sse4.asm
+++ /dev/null
@@ -1,256 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license and patent
-; grant that can be found in the LICENSE file in the root of the source
-; tree. All contributing project authors may be found in the AUTHORS
-; file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-%include "vp8_asm_enc_offsets.asm"
-
-
-; void vp8_regular_quantize_b_sse4 | arg
-;  (BLOCK  *b,                     |  0
-;   BLOCKD *d)                     |  1
-
-global sym(vp8_regular_quantize_b_sse4) PRIVATE
-sym(vp8_regular_quantize_b_sse4):
-
-%if ABI_IS_32BIT
-    push        rbp
-    mov         rbp, rsp
-    GET_GOT     rbx
-    push        rdi
-    push        rsi
-
-    ALIGN_STACK 16, rax
-    %define qcoeff      0 ; 32
-    %define stack_size 32
-    sub         rsp, stack_size
-%else
-  %if LIBVPX_YASM_WIN64
-    SAVE_XMM 8, u
-    push        rdi
-    push        rsi
-  %endif
-%endif
-    ; end prolog
-
-%if ABI_IS_32BIT
-    mov         rdi, arg(0)                 ; BLOCK *b
-    mov         rsi, arg(1)                 ; BLOCKD *d
-%else
-  %if LIBVPX_YASM_WIN64
-    mov         rdi, rcx                    ; BLOCK *b
-    mov         rsi, rdx                    ; BLOCKD *d
-  %else
-    ;mov        rdi, rdi                    ; BLOCK *b
-    ;mov        rsi, rsi                    ; BLOCKD *d
-  %endif
-%endif
-
-    mov         rax, [rdi + vp8_block_coeff]
-    mov         rcx, [rdi + vp8_block_zbin]
-    mov         rdx, [rdi + vp8_block_round]
-    movd        xmm7, [rdi + vp8_block_zbin_extra]
-
-    ; z
-    movdqa      xmm0, [rax]
-    movdqa      xmm1, [rax + 16]
-
-    ; duplicate zbin_oq_value
-    pshuflw     xmm7, xmm7, 0
-    punpcklwd   xmm7, xmm7
-
-    movdqa      xmm2, xmm0
-    movdqa      xmm3, xmm1
-
-    ; sz
-    psraw       xmm0, 15
-    psraw       xmm1, 15
-
-    ; (z ^ sz)
-    pxor        xmm2, xmm0
-    pxor        xmm3, xmm1
-
-    ; x = abs(z)
-    psubw       xmm2, xmm0
-    psubw       xmm3, xmm1
-
-    ; zbin
-    movdqa      xmm4, [rcx]
-    movdqa      xmm5, [rcx + 16]
-
-    ; *zbin_ptr + zbin_oq_value
-    paddw       xmm4, xmm7
-    paddw       xmm5, xmm7
-
-    movdqa      xmm6, xmm2
-    movdqa      xmm7, xmm3
-
-    ; x - (*zbin_ptr + zbin_oq_value)
-    psubw       xmm6, xmm4
-    psubw       xmm7, xmm5
-
-    ; round
-    movdqa      xmm4, [rdx]
-    movdqa      xmm5, [rdx + 16]
-
-    mov         rax, [rdi + vp8_block_quant_shift]
-    mov         rcx, [rdi + vp8_block_quant]
-    mov         rdx, [rdi + vp8_block_zrun_zbin_boost]
-
-    ; x + round
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-
-    ; quant
-    movdqa      xmm4, [rcx]
-    movdqa      xmm5, [rcx + 16]
-
-    ; y = x * quant_ptr >> 16
-    pmulhw      xmm4, xmm2
-    pmulhw      xmm5, xmm3
-
-    ; y += x
-    paddw       xmm2, xmm4
-    paddw       xmm3, xmm5
-
-    pxor        xmm4, xmm4
-%if ABI_IS_32BIT
-    movdqa      [rsp + qcoeff], xmm4
-    movdqa      [rsp + qcoeff + 16], xmm4
-%else
-    pxor        xmm8, xmm8
-%endif
-
-    ; quant_shift
-    movdqa      xmm5, [rax]
-
-    ; zrun_zbin_boost
-    mov         rax, rdx
-
-%macro ZIGZAG_LOOP 5
-    ; x
-    pextrw      ecx, %4, %2
-
-    ; if (x >= zbin)
-    sub         cx, WORD PTR[rdx]           ; x - zbin
-    lea         rdx, [rdx + 2]              ; zbin_boost_ptr++
-    jl          .rq_zigzag_loop_%1          ; x < zbin
-
-    pextrw      edi, %3, %2                 ; y
-
-    ; downshift by quant_shift[rc]
-    pextrb      ecx, xmm5, %1               ; quant_shift[rc]
-    sar         edi, cl                     ; also sets Z bit
-    je          .rq_zigzag_loop_%1          ; !y
-%if ABI_IS_32BIT
-    mov         WORD PTR[rsp + qcoeff + %1 *2], di
-%else
-    pinsrw      %5, edi, %2                 ; qcoeff[rc]
-%endif
-    mov         rdx, rax                    ; reset to b->zrun_zbin_boost
-.rq_zigzag_loop_%1:
-%endmacro
-; in vp8_default_zig_zag1d order: see vp8/common/entropy.c
-ZIGZAG_LOOP  0, 0, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  1, 1, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  4, 4, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  8, 0, xmm3, xmm7, xmm8
-ZIGZAG_LOOP  5, 5, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  2, 2, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  3, 3, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  6, 6, xmm2, xmm6, xmm4
-ZIGZAG_LOOP  9, 1, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 12, 4, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 13, 5, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 10, 2, xmm3, xmm7, xmm8
-ZIGZAG_LOOP  7, 7, xmm2, xmm6, xmm4
-ZIGZAG_LOOP 11, 3, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 14, 6, xmm3, xmm7, xmm8
-ZIGZAG_LOOP 15, 7, xmm3, xmm7, xmm8
-
-    mov         rcx, [rsi + vp8_blockd_dequant]
-    mov         rdi, [rsi + vp8_blockd_dqcoeff]
-
-%if ABI_IS_32BIT
-    movdqa      xmm4, [rsp + qcoeff]
-    movdqa      xmm5, [rsp + qcoeff + 16]
-%else
-    %define     xmm5 xmm8
-%endif
-
-    ; y ^ sz
-    pxor        xmm4, xmm0
-    pxor        xmm5, xmm1
-    ; x = (y ^ sz) - sz
-    psubw       xmm4, xmm0
-    psubw       xmm5, xmm1
-
-    ; dequant
-    movdqa      xmm0, [rcx]
-    movdqa      xmm1, [rcx + 16]
-
-    mov         rcx, [rsi + vp8_blockd_qcoeff]
-
-    pmullw      xmm0, xmm4
-    pmullw      xmm1, xmm5
-
-    ; store qcoeff
-    movdqa      [rcx], xmm4
-    movdqa      [rcx + 16], xmm5
-
-    ; store dqcoeff
-    movdqa      [rdi], xmm0
-    movdqa      [rdi + 16], xmm1
-
-    mov         rcx, [rsi + vp8_blockd_eob]
-
-    ; select the last value (in zig_zag order) for EOB
-    pxor        xmm6, xmm6
-    pcmpeqw     xmm4, xmm6
-    pcmpeqw     xmm5, xmm6
-
-    packsswb    xmm4, xmm5
-    pshufb      xmm4, [GLOBAL(zig_zag1d)]
-    pmovmskb    edx, xmm4
-    xor         rdi, rdi
-    mov         eax, -1
-    xor         dx, ax
-    bsr         eax, edx
-    sub         edi, edx
-    sar         edi, 31
-    add         eax, 1
-    and         eax, edi
-
-    mov         BYTE PTR [rcx], al          ; store eob
-
-    ; begin epilog
-%if ABI_IS_32BIT
-    add         rsp, stack_size
-    pop         rsp
-
-    pop         rsi
-    pop         rdi
-    RESTORE_GOT
-    pop         rbp
-%else
-  %undef xmm5
-  %if LIBVPX_YASM_WIN64
-    pop         rsi
-    pop         rdi
-    RESTORE_XMM
-  %endif
-%endif
-
-    ret
-
-SECTION_RODATA
-align 16
-; vp8/common/entropy.c: vp8_default_zig_zag1d
-zig_zag1d:
-    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
diff --git a/vp8/encoder/x86/quantize_sse4.c b/vp8/encoder/x86/quantize_sse4.c
new file mode 100644
index 000000000..b2fecfd2f
--- /dev/null
+++ b/vp8/encoder/x86/quantize_sse4.c
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./vp8_rtcd.h"
+#include "vp8/encoder/block.h"
+#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
+
+#define SELECT_EOB(i, z, x, y, q) \
+  do { \
+    __label__ select_eob_end; \
+    short boost = *zbin_boost_ptr; \
+    short x_z = _mm_extract_epi16(x, z); \
+    short y_z = _mm_extract_epi16(y, z); \
+    int cmp = (x_z < boost) | (y_z == 0); \
+    zbin_boost_ptr++; \
+    if (cmp) \
+      goto select_eob_end; \
+    q = _mm_insert_epi16(q, y_z, z); \
+    eob = i; \
+    zbin_boost_ptr = b->zrun_zbin_boost; \
+    select_eob_end:; \
+  } while (0)
+
+void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
+  char eob = 0;
+  short *zbin_boost_ptr = b->zrun_zbin_boost;
+
+  __m128i sz0, x0, sz1, x1, y0, y1, x_minus_zbin0, x_minus_zbin1,
+      dqcoeff0, dqcoeff1;
+  __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
+  __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
+  __m128i z0 = _mm_load_si128((__m128i *)(b->coeff));
+  __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8));
+  __m128i zbin_extra = _mm_cvtsi32_si128(b->zbin_extra);
+  __m128i zbin0 = _mm_load_si128((__m128i *)(b->zbin));
+  __m128i zbin1 = _mm_load_si128((__m128i *)(b->zbin + 8));
+  __m128i round0 = _mm_load_si128((__m128i *)(b->round));
+  __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8));
+  __m128i quant0 = _mm_load_si128((__m128i *)(b->quant));
+  __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
+  __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
+  __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
+  __m128i qcoeff0 = _mm_setzero_si128();
+  __m128i qcoeff1 = _mm_setzero_si128();
+
+  /* Duplicate to all lanes. */
+  zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
+  zbin_extra = _mm_unpacklo_epi16(zbin_extra, zbin_extra);
+
+  /* Sign of z: z >> 15 */
+  sz0 = _mm_srai_epi16(z0, 15);
+  sz1 = _mm_srai_epi16(z1, 15);
+
+  /* x = abs(z): (z ^ sz) - sz */
+  x0 = _mm_xor_si128(z0, sz0);
+  x1 = _mm_xor_si128(z1, sz1);
+  x0 = _mm_sub_epi16(x0, sz0);
+  x1 = _mm_sub_epi16(x1, sz1);
+
+  /* zbin[] + zbin_extra */
+  zbin0 = _mm_add_epi16(zbin0, zbin_extra);
+  zbin1 = _mm_add_epi16(zbin1, zbin_extra);
+
+  /* In C x is compared to zbin where zbin = zbin[] + boost + extra. Rebalance
+   * the equation because boost is the only value which can change:
+   * x - (zbin[] + extra) >= boost */
+  x_minus_zbin0 = _mm_sub_epi16(x0, zbin0);
+  x_minus_zbin1 = _mm_sub_epi16(x1, zbin1);
+
+  /* All the remaining calculations are valid whether they are done now with
+   * SIMD or later inside the loop one at a time. */
+  x0 = _mm_add_epi16(x0, round0);
+  x1 = _mm_add_epi16(x1, round1);
+
+  y0 = _mm_mulhi_epi16(x0, quant0);
+  y1 = _mm_mulhi_epi16(x1, quant1);
+
+  y0 = _mm_add_epi16(y0, x0);
+  y1 = _mm_add_epi16(y1, x1);
+
+  /* Instead of shifting each value independently we convert the scaling
+   * factor with 1 << (16 - shift) so we can use multiply/return high half. */
+  y0 = _mm_mulhi_epi16(y0, quant_shift0);
+  y1 = _mm_mulhi_epi16(y1, quant_shift1);
+
+  /* Restore the sign: (y ^ sz) - sz */
+  y0 = _mm_xor_si128(y0, sz0);
+  y1 = _mm_xor_si128(y1, sz1);
+  y0 = _mm_sub_epi16(y0, sz0);
+  y1 = _mm_sub_epi16(y1, sz1);
+
+  /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup.
+   */
+  SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0);
+  SELECT_EOB(2, 1, x_minus_zbin0, y0, qcoeff0);
+  SELECT_EOB(3, 4, x_minus_zbin0, y0, qcoeff0);
+  SELECT_EOB(4, 0, x_minus_zbin1, y1, qcoeff1);
+  SELECT_EOB(5, 5, x_minus_zbin0, y0, qcoeff0);
+  SELECT_EOB(6, 2, x_minus_zbin0, y0, qcoeff0);
+  SELECT_EOB(7, 3, x_minus_zbin0, y0, qcoeff0);
+  SELECT_EOB(8, 6, x_minus_zbin0, y0, qcoeff0);
+  SELECT_EOB(9, 1, x_minus_zbin1, y1, qcoeff1);
+  SELECT_EOB(10, 4, x_minus_zbin1, y1, qcoeff1);
+  SELECT_EOB(11, 5, x_minus_zbin1, y1, qcoeff1);
+  SELECT_EOB(12, 2, x_minus_zbin1, y1, qcoeff1);
+  SELECT_EOB(13, 7, x_minus_zbin0, y0, qcoeff0);
+  SELECT_EOB(14, 3, x_minus_zbin1, y1, qcoeff1);
+  SELECT_EOB(15, 6, x_minus_zbin1, y1, qcoeff1);
+  SELECT_EOB(16, 7, x_minus_zbin1, y1, qcoeff1);
+
+  _mm_store_si128((__m128i *)(d->qcoeff), qcoeff0);
+  _mm_store_si128((__m128i *)(d->qcoeff + 8), qcoeff1);
+
+  dqcoeff0 = _mm_mullo_epi16(qcoeff0, dequant0);
+  dqcoeff1 = _mm_mullo_epi16(qcoeff1, dequant1);
+
+  _mm_store_si128((__m128i *)(d->dqcoeff), dqcoeff0);
+  _mm_store_si128((__m128i *)(d->dqcoeff + 8), dqcoeff1);
+
+  *d->eob = eob;
+}
diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk
index b7b948add..a0dbdcfa9 100644
--- a/vp8/vp8cx.mk
+++ b/vp8/vp8cx.mk
@@ -89,6 +89,7 @@ VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c
 VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c
+VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.c
 
 ifeq ($(CONFIG_TEMPORAL_DENOISING),yes)
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c
@@ -97,7 +98,6 @@ endif
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm
 VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c
-VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm
 VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm
 VP8_CX_SRCS-$(ARCH_X86_64) += encoder/x86/ssim_opt_x86_64.asm
diff --git a/vp9/encoder/vp9_denoiser.c b/vp9/encoder/vp9_denoiser.c
index 687b4c2b2..ae78b69b4 100644
--- a/vp9/encoder/vp9_denoiser.c
+++ b/vp9/encoder/vp9_denoiser.c
@@ -20,17 +20,129 @@ int vp9_denoiser_filter() {
   return 0;
 }
 
+int update_running_avg(uint8_t *mc_avg, int mc_avg_stride, uint8_t *avg,
+                       int avg_stride, uint8_t *sig, int sig_stride,
+                       int increase_denoising, BLOCK_SIZE bs) {
+  int r, c;
+  int diff, adj, absdiff;
+  int shift_inc1 = 0, shift_inc2 = 1;
+  int adj_val[] = {3, 4, 6};
+  int total_adj = 0;
+
+  if (increase_denoising) {
+    shift_inc1 = 1;
+    shift_inc2 = 2;
+  }
+
+  for (r = 0; r < heights[bs]; ++r) {
+    for (c = 0; c < widths[bs]; ++c) {
+      diff = mc_avg[c] - sig[c];
+      absdiff = abs(diff);
+
+      if (absdiff <= 3 + shift_inc1) {
+        avg[c] = mc_avg[c];
+        total_adj += diff;
+      } else {
+        switch (absdiff) {
+          case 4: case 5: case 6: case 7:
+            adj = adj_val[0];
+            break;
+          case 8: case 9: case 10: case 11:
+          case 12: case 13: case 14: case 15:
+            adj = adj_val[1];
+            break;
+          default:
+            adj = adj_val[2];
+        }
+        if (diff > 0) {
+          avg[c] = MIN(UINT8_MAX, sig[c] + adj);
+          total_adj += adj;
+        } else {
+          avg[c] = MAX(0, sig[c] - adj);
+          total_adj -= adj;
+        }
+      }
+    }
+    sig += sig_stride;
+    avg += avg_stride;
+    mc_avg += mc_avg_stride;
+  }
+  return total_adj;
+}
+
+uint8_t *block_start(uint8_t *framebuf, int stride, int mi_row, int mi_col) {
+  return framebuf + (stride * mi_row) + mi_col;
+}
+
+void copy_block(uint8_t *dest, int dest_stride,
+                uint8_t *src, int src_stride, BLOCK_SIZE bs) {
+  int r, c;
+  for (r = 0; r < heights[bs]; ++r) {
+    for (c = 0; c < widths[bs]; ++c) {
+      dest[c] = src[c];
+    }
+    dest += dest_stride;
+    src += src_stride;
+  }
+  return;
+}
+
 void vp9_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
                           MODE_INFO **grid, int mi_row, int mi_col,
                           BLOCK_SIZE bs) {
+  int decision = COPY_BLOCK;
+
+  YV12_BUFFER_CONFIG avg = denoiser->running_avg_y[INTRA_FRAME];
+  struct buf_2d src = mb->plane[0].src;
+
+  update_running_avg(denoiser->mc_running_avg_y.y_buffer,
+                     denoiser->mc_running_avg_y.y_stride,
+                     denoiser->running_avg_y[INTRA_FRAME].y_buffer,
+                     denoiser->running_avg_y[INTRA_FRAME].y_stride,
+                     mb->plane[0].src.buf, mb->plane[0].src.stride, 0, bs);
+
+  if (decision == FILTER_BLOCK) {
+  }
+  if (decision == COPY_BLOCK) {
+    copy_block(block_start(avg.y_buffer, avg.y_stride, mi_row, mi_col),
+               avg.y_stride,
+               block_start(src.buf, src.stride, mi_row, mi_col),
+               src.stride,
+               bs);
+  }
   return;
 }
 
+void copy_frame(YV12_BUFFER_CONFIG dest, YV12_BUFFER_CONFIG src) {
+  memcpy(dest.buffer_alloc, src.buffer_alloc, src.buffer_alloc_sz);
+}
+
 void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
+                                    YV12_BUFFER_CONFIG src,
                                     FRAME_TYPE frame_type,
                                     int refresh_alt_ref_frame,
                                     int refresh_golden_frame,
                                     int refresh_last_frame) {
+  int i;
+  if (frame_type == KEY_FRAME) {
+    copy_frame(denoiser->running_avg_y[LAST_FRAME], src);
+    for (i = 2; i < MAX_REF_FRAMES - 1; i++) {
+      copy_frame(denoiser->running_avg_y[i],
+                 denoiser->running_avg_y[LAST_FRAME]);
+    }
+  } else {  /* For non-key frames */
+    if (refresh_alt_ref_frame) {
+      copy_frame(denoiser->running_avg_y[ALTREF_FRAME],
+                 denoiser->running_avg_y[INTRA_FRAME]);
+    }
+    if (refresh_golden_frame) {
+      copy_frame(denoiser->running_avg_y[GOLDEN_FRAME],
+                 denoiser->running_avg_y[INTRA_FRAME]);
+    }
+    if (refresh_last_frame) {
+      copy_frame(denoiser->running_avg_y[LAST_FRAME],
+                 denoiser->running_avg_y[INTRA_FRAME]);
+    }
+  }
   return;
 }
 
@@ -39,11 +151,39 @@ void vp9_denoiser_update_frame_stats() {
 }
 
 int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
-                       int border) {
+                       int ssx, int ssy, int border) {
+  int i, fail;
+  assert(denoiser);
+
+  for (i = 0; i < MAX_REF_FRAMES; ++i) {
+    fail = vp9_alloc_frame_buffer(&denoiser->running_avg_y[i], width, height,
+                                  ssx, ssy, border);
+    if (fail) {
+      vp9_denoiser_free(denoiser);
+      return 1;
+    }
+  }
+
+  fail = vp9_alloc_frame_buffer(&denoiser->mc_running_avg_y, width, height,
+                                ssx, ssy, border);
+  if (fail) {
+    vp9_denoiser_free(denoiser);
+    return 1;
+  }
+
   return 0;
 }
 
 void vp9_denoiser_free(VP9_DENOISER *denoiser) {
+  int i;
+  /* The buffers are members of the denoiser struct, so their addresses are
+   * never NULL; free each one unconditionally. */
+  for (i = 0; i < MAX_REF_FRAMES; ++i) {
+    vp9_free_frame_buffer(&denoiser->running_avg_y[i]);
+  }
+  vp9_free_frame_buffer(&denoiser->mc_running_avg_y);
   return;
 }
diff --git a/vp9/encoder/vp9_denoiser.h b/vp9/encoder/vp9_denoiser.h
index a7a8d9329..81fc7ebe5 100644
--- a/vp9/encoder/vp9_denoiser.h
+++ b/vp9/encoder/vp9_denoiser.h
@@ -12,6 +12,7 @@
 #define VP9_ENCODER_DENOISER_H_
 
 #include "vp9/encoder/vp9_block.h"
+#include "vpx_scale/yv12config.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -23,11 +24,12 @@ enum vp9_denoiser_decision {
 };
 
 typedef struct vp9_denoiser {
-  struct buf_2d running_avg_y;
-  struct buf_2d mc_running_avg_y;
+  YV12_BUFFER_CONFIG running_avg_y[MAX_REF_FRAMES];
+  YV12_BUFFER_CONFIG mc_running_avg_y;
 } VP9_DENOISER;
 
 void vp9_denoiser_update_frame_info(VP9_DENOISER *denoiser,
+                                    YV12_BUFFER_CONFIG src,
                                     FRAME_TYPE frame_type,
                                     int refresh_alt_ref_frame,
                                     int refresh_golden_frame,
@@ -40,7 +42,7 @@ void vp9_denoiser_denoise(VP9_DENOISER *denoiser,
 void vp9_denoiser_update_frame_stats();
 
 int vp9_denoiser_alloc(VP9_DENOISER *denoiser, int width, int height,
-                       int border);
+                       int ssx, int ssy, int border);
 
 void vp9_denoiser_free(VP9_DENOISER *denoiser);
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 525eccd56..b464361fd 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -683,6 +683,12 @@ void vp9_change_config(struct VP9_COMP *cpi, const VP9EncoderConfig *oxcf) {
 
 #if CONFIG_DENOISING
   vp9_denoiser_alloc(&(cpi->denoiser), cm->width, cm->height,
+                     // TODO(tkopp) An unrelated bug causes
+                     // cm->subsampling_{x,y} to be uninitialized at this point
+                     // in execution. For now we assume YUV-420, which is x/y
+                     // subsampling of 1.
+                     1, 1,
+                     // cm->subsampling_x, cm->subsampling_y,
                      VP9_ENC_BORDER_IN_PIXELS);
 #endif
 }
@@ -1558,6 +1564,7 @@ void vp9_update_reference_frames(VP9_COMP *cpi) {
 }
 #if CONFIG_DENOISING
   vp9_denoiser_update_frame_info(&cpi->denoiser,
+                                 *cpi->Source,
                                  cpi->common.frame_type,
                                  cpi->refresh_alt_ref_frame,
                                  cpi->refresh_golden_frame,
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index 3f4fcd1e1..b7972b77a 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -172,15 +172,15 @@ static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
   else
     x->skip_txfm = 0;
 
-  // TODO(jingning) This is a temporary solution to account for frames with
-  // light changes. Need to customize the rate-distortion modeling for non-RD
-  // mode decision.
-  if ((sse >> 3) > var)
-    sse = var;
-  vp9_model_rd_from_var_lapndz(var + sse, 1 << num_pels_log2_lookup[bsize],
-                               ac_quant >> 3, &rate, &dist);
-  *out_rate_sum = rate;
+  vp9_model_rd_from_var_lapndz(sse - var, 1 << num_pels_log2_lookup[bsize],
+                               dc_quant >> 3, &rate, &dist);
+  *out_rate_sum = rate >> 1;
   *out_dist_sum = dist << 3;
+
+  vp9_model_rd_from_var_lapndz(var, 1 << num_pels_log2_lookup[bsize],
+                               ac_quant >> 3, &rate, &dist);
+  *out_rate_sum += rate;
+  *out_dist_sum += dist << 4;
 }
 
 // TODO(jingning) placeholder for inter-frame non-RD mode decision.
@@ -469,7 +469,7 @@ int64_t vp9_pick_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
   // Perform intra prediction search, if the best SAD is above a certain
   // threshold.
   if (!x->skip && best_rd > inter_mode_thresh &&
-      bsize < cpi->sf.max_intra_bsize) {
+      bsize <= cpi->sf.max_intra_bsize) {
     for (this_mode = DC_PRED; this_mode <= DC_PRED; ++this_mode) {
       vp9_predict_intra_block(xd, 0, b_width_log2(bsize),
                               mbmi->tx_size, this_mode,
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 6beb87234..83d900d42 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -270,6 +270,10 @@ static void set_rt_speed_feature(VP9_COMP *cpi, SPEED_FEATURES *sf,
     sf->source_var_thresh = 360;
     sf->tx_size_search_method = USE_TX_8X8;
+    // TODO(yunqingwang): max_intra_bsize is used to decide if DC_PRED mode
+    // is checked for a partition block. Later, we can try to allow large
+    // partitions to do intra mode checking.
+    sf->max_intra_bsize = BLOCK_8X8;
   }
 
   if (speed >= 7) {
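
Note on the vpx_config.h shim added in iosbuild.sh: for a concrete picture of what create_vpx_framework_config_shim() emits, here is a sketch of the generated VPX.framework/Headers/vpx_config.h for a hypothetical two-target build. The target names armv7-darwin-gcc and x86_64-iphonesimulator-gcc are illustrative only, and the license header written from ${file_header} is omitted; the real file depends on the TARGETS list and the build year.

/* GENERATED FILE: DO NOT EDIT! */

#ifndef VPX_FRAMEWORK_HEADERS_VPX_VPX_CONFIG_H_
#define VPX_FRAMEWORK_HEADERS_VPX_VPX_CONFIG_H_

#if defined __ARM_ARCH_7__
#include "VPX/vpx/armv7-darwin-gcc/vpx_config.h"
#elif defined __x86_64__
#include "VPX/vpx/x86_64-iphonesimulator-gcc/vpx_config.h"
#endif

#endif // VPX_FRAMEWORK_HEADERS_VPX_VPX_CONFIG_H_

Because the loop also copies each per-target vpx_config.h into its own subdirectory under the framework's header directory, this preprocessor dispatch selects the configuration matching whichever architecture slice of the fat library the compiler is currently building against (the trailing "#elif defined" emitted by the last loop iteration is what the sed '$d' command deletes).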