diff options
-rwxr-xr-x | configure | 1 | ||||
-rw-r--r-- | test/android/Android.mk | 8 | ||||
-rw-r--r-- | test/borders_test.cc | 1 | ||||
-rw-r--r-- | test/cpu_speed_test.cc | 1 | ||||
-rw-r--r-- | third_party/libwebm/Android.mk | 11 | ||||
-rw-r--r-- | vp8/common/arm/neon/iwalsh_neon.asm | 87 | ||||
-rw-r--r-- | vp8/common/arm/neon/iwalsh_neon.c | 102 | ||||
-rw-r--r-- | vp8/common/arm/neon/loopfilter_neon.asm | 409 | ||||
-rw-r--r-- | vp8/common/arm/neon/loopfilter_neon.c | 538 | ||||
-rw-r--r-- | vp8/encoder/arm/neon/denoising_neon.c | 59 | ||||
-rw-r--r-- | vp8/encoder/x86/quantize_ssse3.asm | 138 | ||||
-rw-r--r-- | vp8/encoder/x86/quantize_ssse3.c | 110 | ||||
-rw-r--r-- | vp8/vp8_common.mk | 4 | ||||
-rw-r--r-- | vp8/vp8cx.mk | 2 | ||||
-rw-r--r-- | vp9/encoder/vp9_encoder.h | 1 | ||||
-rw-r--r-- | vp9/encoder/vp9_rdopt.c | 3 | ||||
-rw-r--r-- | vp9/encoder/vp9_temporal_filter.c | 126 | ||||
-rw-r--r-- | vp9/vp9_cx_iface.c | 6 | ||||
-rw-r--r-- | vpx/vp8cx.h | 6 | ||||
-rw-r--r-- | vpxenc.c | 4 |
20 files changed, 911 insertions, 706 deletions
@@ -709,6 +709,7 @@ process_toolchain() { soft_enable webm_io ;; *-android-*) + soft_enable webm_io # GTestLog must be modified to use Android logging utilities. ;; *-darwin-*) diff --git a/test/android/Android.mk b/test/android/Android.mk index 13af601e2..4e750b24b 100644 --- a/test/android/Android.mk +++ b/test/android/Android.mk @@ -14,8 +14,14 @@ CUR_WD := $(call my-dir) BINDINGS_DIR := $(CUR_WD)/../../.. LOCAL_PATH := $(CUR_WD)/../../.. +#libwebm +include $(CLEAR_VARS) +include $(BINDINGS_DIR)/libvpx/third_party/libwebm/Android.mk +LOCAL_PATH := $(CUR_WD)/../../.. + #libvpx include $(CLEAR_VARS) +LOCAL_STATIC_LIBRARIES := libwebm include $(BINDINGS_DIR)/libvpx/build/make/Android.mk LOCAL_PATH := $(CUR_WD)/../.. @@ -33,7 +39,7 @@ include $(BUILD_STATIC_LIBRARY) include $(CLEAR_VARS) LOCAL_ARM_MODE := arm LOCAL_MODULE := libvpx_test -LOCAL_STATIC_LIBRARIES := gtest +LOCAL_STATIC_LIBRARIES := gtest libwebm LOCAL_SHARED_LIBRARIES := vpx include $(LOCAL_PATH)/test/test.mk LOCAL_C_INCLUDES := $(BINDINGS_DIR) diff --git a/test/borders_test.cc b/test/borders_test.cc index a2f5a1bdd..b30be4580 100644 --- a/test/borders_test.cc +++ b/test/borders_test.cc @@ -35,6 +35,7 @@ class BordersTest : public ::libvpx_test::EncoderTest, encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); + encoder->Control(VP8E_SET_ARNR_TYPE, 3); } } diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc index ca201bbc7..be651b4fe 100644 --- a/test/cpu_speed_test.cc +++ b/test/cpu_speed_test.cc @@ -37,6 +37,7 @@ class CpuSpeedTest : public ::libvpx_test::EncoderTest, encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1); encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7); encoder->Control(VP8E_SET_ARNR_STRENGTH, 5); + encoder->Control(VP8E_SET_ARNR_TYPE, 3); } } diff --git a/third_party/libwebm/Android.mk b/third_party/libwebm/Android.mk new file mode 100644 index 000000000..13868b660 --- /dev/null +++ b/third_party/libwebm/Android.mk @@ -0,0 +1,11 @@ +LOCAL_PATH := $(call my-dir) +include $(CLEAR_VARS) + +LOCAL_CPP_EXTENSION := .cpp +LOCAL_SRC_FILES := mkvmuxer.cpp \ + mkvmuxerutil.cpp \ + mkvparser.cpp \ + mkvreader.cpp \ + mkvwriter.cpp +LOCAL_MODULE := libwebm +include $(BUILD_STATIC_LIBRARY) diff --git a/vp8/common/arm/neon/iwalsh_neon.asm b/vp8/common/arm/neon/iwalsh_neon.asm deleted file mode 100644 index e8ea2a619..000000000 --- a/vp8/common/arm/neon/iwalsh_neon.asm +++ /dev/null @@ -1,87 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - EXPORT |vp8_short_inv_walsh4x4_neon| - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY ; name this block of code - -;short vp8_short_inv_walsh4x4_neon(short *input, short *mb_dqcoeff) -|vp8_short_inv_walsh4x4_neon| PROC - - ; read in all four lines of values: d0->d3 - vld1.i16 {q0-q1}, [r0@128] - - ; first for loop - vadd.s16 d4, d0, d3 ;a = [0] + [12] - vadd.s16 d6, d1, d2 ;b = [4] + [8] - vsub.s16 d5, d0, d3 ;d = [0] - [12] - vsub.s16 d7, d1, d2 ;c = [4] - [8] - - vadd.s16 q0, q2, q3 ; a+b d+c - vsub.s16 q1, q2, q3 ; a-b d-c - - vtrn.32 d0, d2 ;d0: 0 1 8 9 - ;d2: 2 3 10 11 - vtrn.32 d1, d3 ;d1: 4 5 12 13 - ;d3: 6 7 14 15 - - vtrn.16 d0, d1 ;d0: 0 4 8 12 - ;d1: 1 5 9 13 - vtrn.16 d2, d3 ;d2: 2 6 10 14 - ;d3: 3 7 11 15 - - ; second for loop - - vadd.s16 d4, d0, d3 ;a = [0] + [3] - vadd.s16 d6, d1, d2 ;b = [1] + [2] - vsub.s16 d5, d0, d3 ;d = [0] - [3] - vsub.s16 d7, d1, d2 ;c = [1] - [2] - - vmov.i16 q8, #3 - - vadd.s16 q0, q2, q3 ; a+b d+c - vsub.s16 q1, q2, q3 ; a-b d-c - - vadd.i16 q0, q0, q8 ;e/f += 3 - vadd.i16 q1, q1, q8 ;g/h += 3 - - vshr.s16 q0, q0, #3 ;e/f >> 3 - vshr.s16 q1, q1, #3 ;g/h >> 3 - - mov r2, #64 - add r3, r1, #32 - - vst1.i16 d0[0], [r1],r2 - vst1.i16 d1[0], [r3],r2 - vst1.i16 d2[0], [r1],r2 - vst1.i16 d3[0], [r3],r2 - - vst1.i16 d0[1], [r1],r2 - vst1.i16 d1[1], [r3],r2 - vst1.i16 d2[1], [r1],r2 - vst1.i16 d3[1], [r3],r2 - - vst1.i16 d0[2], [r1],r2 - vst1.i16 d1[2], [r3],r2 - vst1.i16 d2[2], [r1],r2 - vst1.i16 d3[2], [r3],r2 - - vst1.i16 d0[3], [r1],r2 - vst1.i16 d1[3], [r3],r2 - vst1.i16 d2[3], [r1] - vst1.i16 d3[3], [r3] - - bx lr - ENDP ; |vp8_short_inv_walsh4x4_neon| - - END diff --git a/vp8/common/arm/neon/iwalsh_neon.c b/vp8/common/arm/neon/iwalsh_neon.c new file mode 100644 index 000000000..6ea9dd712 --- /dev/null +++ b/vp8/common/arm/neon/iwalsh_neon.c @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +void vp8_short_inv_walsh4x4_neon( + int16_t *input, + int16_t *mb_dqcoeff) { + int16x8_t q0s16, q1s16, q2s16, q3s16; + int16x4_t d4s16, d5s16, d6s16, d7s16; + int16x4x2_t v2tmp0, v2tmp1; + int32x2x2_t v2tmp2, v2tmp3; + int16x8_t qAdd3; + + q0s16 = vld1q_s16(input); + q1s16 = vld1q_s16(input + 8); + + // 1st for loop + d4s16 = vadd_s16(vget_low_s16(q0s16), vget_high_s16(q1s16)); + d6s16 = vadd_s16(vget_high_s16(q0s16), vget_low_s16(q1s16)); + d5s16 = vsub_s16(vget_low_s16(q0s16), vget_high_s16(q1s16)); + d7s16 = vsub_s16(vget_high_s16(q0s16), vget_low_s16(q1s16)); + + q2s16 = vcombine_s16(d4s16, d5s16); + q3s16 = vcombine_s16(d6s16, d7s16); + + q0s16 = vaddq_s16(q2s16, q3s16); + q1s16 = vsubq_s16(q2s16, q3s16); + + v2tmp2 = vtrn_s32(vreinterpret_s32_s16(vget_low_s16(q0s16)), + vreinterpret_s32_s16(vget_low_s16(q1s16))); + v2tmp3 = vtrn_s32(vreinterpret_s32_s16(vget_high_s16(q0s16)), + vreinterpret_s32_s16(vget_high_s16(q1s16))); + v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]), + vreinterpret_s16_s32(v2tmp3.val[0])); + v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]), + vreinterpret_s16_s32(v2tmp3.val[1])); + + // 2nd for loop + d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]); + d6s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]); + d5s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]); + d7s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]); + q2s16 = vcombine_s16(d4s16, d5s16); + q3s16 = vcombine_s16(d6s16, d7s16); + + qAdd3 = vdupq_n_s16(3); + + q0s16 = vaddq_s16(q2s16, q3s16); + q1s16 = vsubq_s16(q2s16, q3s16); + + q0s16 = vaddq_s16(q0s16, qAdd3); + q1s16 = vaddq_s16(q1s16, qAdd3); + + q0s16 = vshrq_n_s16(q0s16, 3); + q1s16 = vshrq_n_s16(q1s16, 3); + + // store + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 0); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 0); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 0); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 0); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 1); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 1); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 2); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 2); + mb_dqcoeff += 16; + + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q0s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q0s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_low_s16(q1s16), 3); + mb_dqcoeff += 16; + vst1_lane_s16(mb_dqcoeff, vget_high_s16(q1s16), 3); + mb_dqcoeff += 16; + return; +} diff --git a/vp8/common/arm/neon/loopfilter_neon.asm b/vp8/common/arm/neon/loopfilter_neon.asm deleted file mode 100644 index c4f09c775..000000000 --- a/vp8/common/arm/neon/loopfilter_neon.asm +++ /dev/null @@ -1,409 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_loop_filter_horizontal_edge_y_neon| - EXPORT |vp8_loop_filter_horizontal_edge_uv_neon| - EXPORT |vp8_loop_filter_vertical_edge_y_neon| - EXPORT |vp8_loop_filter_vertical_edge_uv_neon| - ARM - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src -; r1 int pitch -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -|vp8_loop_filter_horizontal_edge_y_neon| PROC - push {lr} - vpush {d8-d15} - - vdup.u8 q0, r2 ; duplicate blimit - vdup.u8 q1, r3 ; duplicate limit - sub r2, r0, r1, lsl #2 ; move src pointer down by 4 lines - ldr r3, [sp, #68] ; load thresh - add r12, r2, r1 - add r1, r1, r1 - - vdup.u8 q2, r3 ; duplicate thresh - - vld1.u8 {q3}, [r2@128], r1 ; p3 - vld1.u8 {q4}, [r12@128], r1 ; p2 - vld1.u8 {q5}, [r2@128], r1 ; p1 - vld1.u8 {q6}, [r12@128], r1 ; p0 - vld1.u8 {q7}, [r2@128], r1 ; q0 - vld1.u8 {q8}, [r12@128], r1 ; q1 - vld1.u8 {q9}, [r2@128] ; q2 - vld1.u8 {q10}, [r12@128] ; q3 - - sub r2, r2, r1, lsl #1 - sub r12, r12, r1, lsl #1 - - bl vp8_loop_filter_neon - - vst1.u8 {q5}, [r2@128], r1 ; store op1 - vst1.u8 {q6}, [r12@128], r1 ; store op0 - vst1.u8 {q7}, [r2@128], r1 ; store oq0 - vst1.u8 {q8}, [r12@128], r1 ; store oq1 - - vpop {d8-d15} - pop {pc} - ENDP ; |vp8_loop_filter_horizontal_edge_y_neon| - - -; r0 unsigned char *u, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -; sp+4 unsigned char *v -|vp8_loop_filter_horizontal_edge_uv_neon| PROC - push {lr} - vpush {d8-d15} - - vdup.u8 q0, r2 ; duplicate blimit - vdup.u8 q1, r3 ; duplicate limit - ldr r12, [sp, #68] ; load thresh - ldr r2, [sp, #72] ; load v ptr - vdup.u8 q2, r12 ; duplicate thresh - - sub r3, r0, r1, lsl #2 ; move u pointer down by 4 lines - sub r12, r2, r1, lsl #2 ; move v pointer down by 4 lines - - vld1.u8 {d6}, [r3@64], r1 ; p3 - vld1.u8 {d7}, [r12@64], r1 ; p3 - vld1.u8 {d8}, [r3@64], r1 ; p2 - vld1.u8 {d9}, [r12@64], r1 ; p2 - vld1.u8 {d10}, [r3@64], r1 ; p1 - vld1.u8 {d11}, [r12@64], r1 ; p1 - vld1.u8 {d12}, [r3@64], r1 ; p0 - vld1.u8 {d13}, [r12@64], r1 ; p0 - vld1.u8 {d14}, [r3@64], r1 ; q0 - vld1.u8 {d15}, [r12@64], r1 ; q0 - vld1.u8 {d16}, [r3@64], r1 ; q1 - vld1.u8 {d17}, [r12@64], r1 ; q1 - vld1.u8 {d18}, [r3@64], r1 ; q2 - vld1.u8 {d19}, [r12@64], r1 ; q2 - vld1.u8 {d20}, [r3@64] ; q3 - vld1.u8 {d21}, [r12@64] ; q3 - - bl vp8_loop_filter_neon - - sub r0, r0, r1, lsl #1 - sub r2, r2, r1, lsl #1 - - vst1.u8 {d10}, [r0@64], r1 ; store u op1 - vst1.u8 {d11}, [r2@64], r1 ; store v op1 - vst1.u8 {d12}, [r0@64], r1 ; store u op0 - vst1.u8 {d13}, [r2@64], r1 ; store v op0 - vst1.u8 {d14}, [r0@64], r1 ; store u oq0 - vst1.u8 {d15}, [r2@64], r1 ; store v oq0 - vst1.u8 {d16}, [r0@64] ; store u oq1 - vst1.u8 {d17}, [r2@64] ; store v oq1 - - vpop {d8-d15} - pop {pc} - ENDP ; |vp8_loop_filter_horizontal_edge_uv_neon| - -; void vp8_loop_filter_vertical_edge_y_neon(unsigned char *src, int pitch, -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; int count) -; r0 unsigned char *src -; r1 int pitch -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, - -|vp8_loop_filter_vertical_edge_y_neon| PROC - push {lr} - vpush {d8-d15} - - vdup.u8 q0, r2 ; duplicate blimit - vdup.u8 q1, r3 ; duplicate limit - sub r2, r0, #4 ; src ptr down by 4 columns - add r1, r1, r1 - ldr r3, [sp, #68] ; load thresh - add r12, r2, r1, asr #1 - - vld1.u8 {d6}, [r2], r1 - vld1.u8 {d8}, [r12], r1 - vld1.u8 {d10}, [r2], r1 - vld1.u8 {d12}, [r12], r1 - vld1.u8 {d14}, [r2], r1 - vld1.u8 {d16}, [r12], r1 - vld1.u8 {d18}, [r2], r1 - vld1.u8 {d20}, [r12], r1 - - vld1.u8 {d7}, [r2], r1 ; load second 8-line src data - vld1.u8 {d9}, [r12], r1 - vld1.u8 {d11}, [r2], r1 - vld1.u8 {d13}, [r12], r1 - vld1.u8 {d15}, [r2], r1 - vld1.u8 {d17}, [r12], r1 - vld1.u8 {d19}, [r2] - vld1.u8 {d21}, [r12] - - ;transpose to 8x16 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vdup.u8 q2, r3 ; duplicate thresh - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - bl vp8_loop_filter_neon - - vswp d12, d11 - vswp d16, d13 - - sub r0, r0, #2 ; dst ptr - - vswp d14, d12 - vswp d16, d15 - - add r12, r0, r1, asr #1 - - ;store op1, op0, oq0, oq1 - vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 - vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r12], r1 - vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 - vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r12], r1 - vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 - vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r12], r1 - vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 - vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r12], r1 - - vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r0], r1 - vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r12], r1 - vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r0], r1 - vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r12], r1 - vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r0], r1 - vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r12], r1 - vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r0] - vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r12] - - vpop {d8-d15} - pop {pc} - ENDP ; |vp8_loop_filter_vertical_edge_y_neon| - -; void vp8_loop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch -; const signed char *flimit, -; const signed char *limit, -; const signed char *thresh, -; unsigned char *v) -; r0 unsigned char *u, -; r1 int pitch, -; r2 unsigned char blimit -; r3 unsigned char limit -; sp unsigned char thresh, -; sp+4 unsigned char *v -|vp8_loop_filter_vertical_edge_uv_neon| PROC - push {lr} - vpush {d8-d15} - - vdup.u8 q0, r2 ; duplicate blimit - sub r12, r0, #4 ; move u pointer down by 4 columns - ldr r2, [sp, #72] ; load v ptr - vdup.u8 q1, r3 ; duplicate limit - sub r3, r2, #4 ; move v pointer down by 4 columns - - vld1.u8 {d6}, [r12], r1 ;load u data - vld1.u8 {d7}, [r3], r1 ;load v data - vld1.u8 {d8}, [r12], r1 - vld1.u8 {d9}, [r3], r1 - vld1.u8 {d10}, [r12], r1 - vld1.u8 {d11}, [r3], r1 - vld1.u8 {d12}, [r12], r1 - vld1.u8 {d13}, [r3], r1 - vld1.u8 {d14}, [r12], r1 - vld1.u8 {d15}, [r3], r1 - vld1.u8 {d16}, [r12], r1 - vld1.u8 {d17}, [r3], r1 - vld1.u8 {d18}, [r12], r1 - vld1.u8 {d19}, [r3], r1 - vld1.u8 {d20}, [r12] - vld1.u8 {d21}, [r3] - - ldr r12, [sp, #68] ; load thresh - - ;transpose to 8x16 matrix - vtrn.32 q3, q7 - vtrn.32 q4, q8 - vtrn.32 q5, q9 - vtrn.32 q6, q10 - - vdup.u8 q2, r12 ; duplicate thresh - - vtrn.16 q3, q5 - vtrn.16 q4, q6 - vtrn.16 q7, q9 - vtrn.16 q8, q10 - - vtrn.8 q3, q4 - vtrn.8 q5, q6 - vtrn.8 q7, q8 - vtrn.8 q9, q10 - - bl vp8_loop_filter_neon - - vswp d12, d11 - vswp d16, d13 - vswp d14, d12 - vswp d16, d15 - - sub r0, r0, #2 - sub r2, r2, #2 - - ;store op1, op0, oq0, oq1 - vst4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r1 - vst4.8 {d14[0], d15[0], d16[0], d17[0]}, [r2], r1 - vst4.8 {d10[1], d11[1], d12[1], d13[1]}, [r0], r1 - vst4.8 {d14[1], d15[1], d16[1], d17[1]}, [r2], r1 - vst4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r1 - vst4.8 {d14[2], d15[2], d16[2], d17[2]}, [r2], r1 - vst4.8 {d10[3], d11[3], d12[3], d13[3]}, [r0], r1 - vst4.8 {d14[3], d15[3], d16[3], d17[3]}, [r2], r1 - vst4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r1 - vst4.8 {d14[4], d15[4], d16[4], d17[4]}, [r2], r1 - vst4.8 {d10[5], d11[5], d12[5], d13[5]}, [r0], r1 - vst4.8 {d14[5], d15[5], d16[5], d17[5]}, [r2], r1 - vst4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r1 - vst4.8 {d14[6], d15[6], d16[6], d17[6]}, [r2], r1 - vst4.8 {d10[7], d11[7], d12[7], d13[7]}, [r0] - vst4.8 {d14[7], d15[7], d16[7], d17[7]}, [r2] - - vpop {d8-d15} - pop {pc} - ENDP ; |vp8_loop_filter_vertical_edge_uv_neon| - -; void vp8_loop_filter_neon(); -; This is a helper function for the loopfilters. The invidual functions do the -; necessary load, transpose (if necessary) and store. - -; r0-r3 PRESERVE -; q0 flimit -; q1 limit -; q2 thresh -; q3 p3 -; q4 p2 -; q5 p1 -; q6 p0 -; q7 q0 -; q8 q1 -; q9 q2 -; q10 q3 -|vp8_loop_filter_neon| PROC - - ; vp8_filter_mask - vabd.u8 q11, q3, q4 ; abs(p3 - p2) - vabd.u8 q12, q4, q5 ; abs(p2 - p1) - vabd.u8 q13, q5, q6 ; abs(p1 - p0) - vabd.u8 q14, q8, q7 ; abs(q1 - q0) - vabd.u8 q3, q9, q8 ; abs(q2 - q1) - vabd.u8 q4, q10, q9 ; abs(q3 - q2) - - vmax.u8 q11, q11, q12 - vmax.u8 q12, q13, q14 - vmax.u8 q3, q3, q4 - vmax.u8 q15, q11, q12 - - vabd.u8 q9, q6, q7 ; abs(p0 - q0) - - ; vp8_hevmask - vcgt.u8 q13, q13, q2 ; (abs(p1 - p0) > thresh)*-1 - vcgt.u8 q14, q14, q2 ; (abs(q1 - q0) > thresh)*-1 - vmax.u8 q15, q15, q3 - - vmov.u8 q10, #0x80 ; 0x80 - - vabd.u8 q2, q5, q8 ; a = abs(p1 - q1) - vqadd.u8 q9, q9, q9 ; b = abs(p0 - q0) * 2 - - vcge.u8 q15, q1, q15 - - ; vp8_filter() function - ; convert to signed - veor q7, q7, q10 ; qs0 - vshr.u8 q2, q2, #1 ; a = a / 2 - veor q6, q6, q10 ; ps0 - - veor q5, q5, q10 ; ps1 - vqadd.u8 q9, q9, q2 ; a = b + a - - veor q8, q8, q10 ; qs1 - - vmov.u8 q10, #3 ; #3 - - vsubl.s8 q2, d14, d12 ; ( qs0 - ps0) - vsubl.s8 q11, d15, d13 - - vcge.u8 q9, q0, q9 ; (a > flimit * 2 + limit) * -1 - - vmovl.u8 q4, d20 - - vqsub.s8 q1, q5, q8 ; vp8_filter = clamp(ps1-qs1) - vorr q14, q13, q14 ; vp8_hevmask - - vmul.i16 q2, q2, q4 ; 3 * ( qs0 - ps0) - vmul.i16 q11, q11, q4 - - vand q1, q1, q14 ; vp8_filter &= hev - vand q15, q15, q9 ; vp8_filter_mask - - vaddw.s8 q2, q2, d2 - vaddw.s8 q11, q11, d3 - - vmov.u8 q9, #4 ; #4 - - ; vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) - vqmovn.s16 d2, q2 - vqmovn.s16 d3, q11 - vand q1, q1, q15 ; vp8_filter &= mask - - vqadd.s8 q2, q1, q10 ; Filter2 = clamp(vp8_filter+3) - vqadd.s8 q1, q1, q9 ; Filter1 = clamp(vp8_filter+4) - vshr.s8 q2, q2, #3 ; Filter2 >>= 3 - vshr.s8 q1, q1, #3 ; Filter1 >>= 3 - - - vqadd.s8 q11, q6, q2 ; u = clamp(ps0 + Filter2) - vqsub.s8 q10, q7, q1 ; u = clamp(qs0 - Filter1) - - ; outer tap adjustments: ++vp8_filter >> 1 - vrshr.s8 q1, q1, #1 - vbic q1, q1, q14 ; vp8_filter &= ~hev - vmov.u8 q0, #0x80 ; 0x80 - vqadd.s8 q13, q5, q1 ; u = clamp(ps1 + vp8_filter) - vqsub.s8 q12, q8, q1 ; u = clamp(qs1 - vp8_filter) - - veor q6, q11, q0 ; *op0 = u^0x80 - veor q7, q10, q0 ; *oq0 = u^0x80 - veor q5, q13, q0 ; *op1 = u^0x80 - veor q8, q12, q0 ; *oq1 = u^0x80 - - bx lr - ENDP ; |vp8_loop_filter_horizontal_edge_y_neon| - -;----------------- - - END diff --git a/vp8/common/arm/neon/loopfilter_neon.c b/vp8/common/arm/neon/loopfilter_neon.c new file mode 100644 index 000000000..47f522bc6 --- /dev/null +++ b/vp8/common/arm/neon/loopfilter_neon.c @@ -0,0 +1,538 @@ +/* + * Copyright (c) 2014 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <arm_neon.h> + +static inline void vp8_loop_filter_neon( + uint8x16_t qblimit, // flimit + uint8x16_t qlimit, // limit + uint8x16_t qthresh, // thresh + uint8x16_t q3, // p3 + uint8x16_t q4, // p2 + uint8x16_t q5, // p1 + uint8x16_t q6, // p0 + uint8x16_t q7, // q0 + uint8x16_t q8, // q1 + uint8x16_t q9, // q2 + uint8x16_t q10, // q3 + uint8x16_t *q5r, // p1 + uint8x16_t *q6r, // p0 + uint8x16_t *q7r, // q0 + uint8x16_t *q8r) { // q1 + uint8x16_t q0u8, q1u8, q2u8, q11u8, q12u8, q13u8, q14u8, q15u8; + int16x8_t q2s16, q11s16; + uint16x8_t q4u16; + int8x16_t q1s8, q2s8, q10s8, q11s8, q12s8, q13s8; + int8x8_t d2s8, d3s8; + + q11u8 = vabdq_u8(q3, q4); + q12u8 = vabdq_u8(q4, q5); + q13u8 = vabdq_u8(q5, q6); + q14u8 = vabdq_u8(q8, q7); + q3 = vabdq_u8(q9, q8); + q4 = vabdq_u8(q10, q9); + + q11u8 = vmaxq_u8(q11u8, q12u8); + q12u8 = vmaxq_u8(q13u8, q14u8); + q3 = vmaxq_u8(q3, q4); + q15u8 = vmaxq_u8(q11u8, q12u8); + + q9 = vabdq_u8(q6, q7); + + // vp8_hevmask + q13u8 = vcgtq_u8(q13u8, qthresh); + q14u8 = vcgtq_u8(q14u8, qthresh); + q15u8 = vmaxq_u8(q15u8, q3); + + q2u8 = vabdq_u8(q5, q8); + q9 = vqaddq_u8(q9, q9); + + q15u8 = vcgeq_u8(qlimit, q15u8); + + // vp8_filter() function + // convert to signed + q10 = vdupq_n_u8(0x80); + q8 = veorq_u8(q8, q10); + q7 = veorq_u8(q7, q10); + q6 = veorq_u8(q6, q10); + q5 = veorq_u8(q5, q10); + + q2u8 = vshrq_n_u8(q2u8, 1); + q9 = vqaddq_u8(q9, q2u8); + + q10 = vdupq_n_u8(3); + + q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7)), + vget_low_s8(vreinterpretq_s8_u8(q6))); + q11s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7)), + vget_high_s8(vreinterpretq_s8_u8(q6))); + + q9 = vcgeq_u8(qblimit, q9); + + q1s8 = vqsubq_s8(vreinterpretq_s8_u8(q5), + vreinterpretq_s8_u8(q8)); + + q14u8 = vorrq_u8(q13u8, q14u8); + + q4u16 = vmovl_u8(vget_low_u8(q10)); + q2s16 = vmulq_s16(q2s16, vreinterpretq_s16_u16(q4u16)); + q11s16 = vmulq_s16(q11s16, vreinterpretq_s16_u16(q4u16)); + + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q14u8); + q15u8 = vandq_u8(q15u8, q9); + + q1s8 = vreinterpretq_s8_u8(q1u8); + q2s16 = vaddw_s8(q2s16, vget_low_s8(q1s8)); + q11s16 = vaddw_s8(q11s16, vget_high_s8(q1s8)); + + q9 = vdupq_n_u8(4); + // vp8_filter = clamp(vp8_filter + 3 * ( qs0 - ps0)) + d2s8 = vqmovn_s16(q2s16); + d3s8 = vqmovn_s16(q11s16); + q1s8 = vcombine_s8(d2s8, d3s8); + q1u8 = vandq_u8(vreinterpretq_u8_s8(q1s8), q15u8); + q1s8 = vreinterpretq_s8_u8(q1u8); + + q2s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q10)); + q1s8 = vqaddq_s8(q1s8, vreinterpretq_s8_u8(q9)); + q2s8 = vshrq_n_s8(q2s8, 3); + q1s8 = vshrq_n_s8(q1s8, 3); + + q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6), q2s8); + q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7), q1s8); + + q1s8 = vrshrq_n_s8(q1s8, 1); + q1s8 = vbicq_s8(q1s8, vreinterpretq_s8_u8(q14u8)); + + q13s8 = vqaddq_s8(vreinterpretq_s8_u8(q5), q1s8); + q12s8 = vqsubq_s8(vreinterpretq_s8_u8(q8), q1s8); + + q0u8 = vdupq_n_u8(0x80); + *q8r = veorq_u8(vreinterpretq_u8_s8(q12s8), q0u8); + *q7r = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); + *q6r = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); + *q5r = veorq_u8(vreinterpretq_u8_s8(q13s8), q0u8); + return; +} + +void vp8_loop_filter_horizontal_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + src -= (pitch << 2); + + q3 = vld1q_u8(src); + src += pitch; + q4 = vld1q_u8(src); + src += pitch; + q5 = vld1q_u8(src); + src += pitch; + q6 = vld1q_u8(src); + src += pitch; + q7 = vld1q_u8(src); + src += pitch; + q8 = vld1q_u8(src); + src += pitch; + q9 = vld1q_u8(src); + src += pitch; + q10 = vld1q_u8(src); + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + src -= (pitch * 5); + vst1q_u8(src, q5); + src += pitch; + vst1q_u8(src, q6); + src += pitch; + vst1q_u8(src, q7); + src += pitch; + vst1q_u8(src, q8); + return; +} + +void vp8_loop_filter_horizontal_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + u -= (pitch << 2); + v -= (pitch << 2); + + d6 = vld1_u8(u); + u += pitch; + d7 = vld1_u8(v); + v += pitch; + d8 = vld1_u8(u); + u += pitch; + d9 = vld1_u8(v); + v += pitch; + d10 = vld1_u8(u); + u += pitch; + d11 = vld1_u8(v); + v += pitch; + d12 = vld1_u8(u); + u += pitch; + d13 = vld1_u8(v); + v += pitch; + d14 = vld1_u8(u); + u += pitch; + d15 = vld1_u8(v); + v += pitch; + d16 = vld1_u8(u); + u += pitch; + d17 = vld1_u8(v); + v += pitch; + d18 = vld1_u8(u); + u += pitch; + d19 = vld1_u8(v); + v += pitch; + d20 = vld1_u8(u); + d21 = vld1_u8(v); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + u -= (pitch * 5); + vst1_u8(u, vget_low_u8(q5)); + u += pitch; + vst1_u8(u, vget_low_u8(q6)); + u += pitch; + vst1_u8(u, vget_low_u8(q7)); + u += pitch; + vst1_u8(u, vget_low_u8(q8)); + + v -= (pitch * 5); + vst1_u8(v, vget_high_u8(q5)); + v += pitch; + vst1_u8(v, vget_high_u8(q6)); + v += pitch; + vst1_u8(v, vget_high_u8(q7)); + v += pitch; + vst1_u8(v, vget_high_u8(q8)); + return; +} + +void vp8_loop_filter_vertical_edge_y_neon( + unsigned char *src, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh) { + unsigned char *s, *d; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + uint8x8x4_t q4ResultH, q4ResultL; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + s = src - 4; + d6 = vld1_u8(s); + s += pitch; + d8 = vld1_u8(s); + s += pitch; + d10 = vld1_u8(s); + s += pitch; + d12 = vld1_u8(s); + s += pitch; + d14 = vld1_u8(s); + s += pitch; + d16 = vld1_u8(s); + s += pitch; + d18 = vld1_u8(s); + s += pitch; + d20 = vld1_u8(s); + s += pitch; + d7 = vld1_u8(s); + s += pitch; + d9 = vld1_u8(s); + s += pitch; + d11 = vld1_u8(s); + s += pitch; + d13 = vld1_u8(s); + s += pitch; + d15 = vld1_u8(s); + s += pitch; + d17 = vld1_u8(s); + s += pitch; + d19 = vld1_u8(s); + s += pitch; + d21 = vld1_u8(s); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + q4ResultL.val[0] = vget_low_u8(q5); // d10 + q4ResultL.val[1] = vget_low_u8(q6); // d12 + q4ResultL.val[2] = vget_low_u8(q7); // d14 + q4ResultL.val[3] = vget_low_u8(q8); // d16 + q4ResultH.val[0] = vget_high_u8(q5); // d11 + q4ResultH.val[1] = vget_high_u8(q6); // d13 + q4ResultH.val[2] = vget_high_u8(q7); // d15 + q4ResultH.val[3] = vget_high_u8(q8); // d17 + + d = src - 2; + vst4_lane_u8(d, q4ResultL, 0); + d += pitch; + vst4_lane_u8(d, q4ResultL, 1); + d += pitch; + vst4_lane_u8(d, q4ResultL, 2); + d += pitch; + vst4_lane_u8(d, q4ResultL, 3); + d += pitch; + vst4_lane_u8(d, q4ResultL, 4); + d += pitch; + vst4_lane_u8(d, q4ResultL, 5); + d += pitch; + vst4_lane_u8(d, q4ResultL, 6); + d += pitch; + vst4_lane_u8(d, q4ResultL, 7); + d += pitch; + vst4_lane_u8(d, q4ResultH, 0); + d += pitch; + vst4_lane_u8(d, q4ResultH, 1); + d += pitch; + vst4_lane_u8(d, q4ResultH, 2); + d += pitch; + vst4_lane_u8(d, q4ResultH, 3); + d += pitch; + vst4_lane_u8(d, q4ResultH, 4); + d += pitch; + vst4_lane_u8(d, q4ResultH, 5); + d += pitch; + vst4_lane_u8(d, q4ResultH, 6); + d += pitch; + vst4_lane_u8(d, q4ResultH, 7); + return; +} + +void vp8_loop_filter_vertical_edge_uv_neon( + unsigned char *u, + int pitch, + unsigned char blimit, + unsigned char limit, + unsigned char thresh, + unsigned char *v) { + unsigned char *us, *ud; + unsigned char *vs, *vd; + uint8x16_t qblimit, qlimit, qthresh, q3, q4; + uint8x16_t q5, q6, q7, q8, q9, q10; + uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14; + uint8x8_t d15, d16, d17, d18, d19, d20, d21; + uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3; + uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7; + uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11; + uint8x8x4_t q4ResultH, q4ResultL; + + qblimit = vdupq_n_u8(blimit); + qlimit = vdupq_n_u8(limit); + qthresh = vdupq_n_u8(thresh); + + us = u - 4; + d6 = vld1_u8(us); + us += pitch; + d8 = vld1_u8(us); + us += pitch; + d10 = vld1_u8(us); + us += pitch; + d12 = vld1_u8(us); + us += pitch; + d14 = vld1_u8(us); + us += pitch; + d16 = vld1_u8(us); + us += pitch; + d18 = vld1_u8(us); + us += pitch; + d20 = vld1_u8(us); + + vs = v - 4; + d7 = vld1_u8(vs); + vs += pitch; + d9 = vld1_u8(vs); + vs += pitch; + d11 = vld1_u8(vs); + vs += pitch; + d13 = vld1_u8(vs); + vs += pitch; + d15 = vld1_u8(vs); + vs += pitch; + d17 = vld1_u8(vs); + vs += pitch; + d19 = vld1_u8(vs); + vs += pitch; + d21 = vld1_u8(vs); + + q3 = vcombine_u8(d6, d7); + q4 = vcombine_u8(d8, d9); + q5 = vcombine_u8(d10, d11); + q6 = vcombine_u8(d12, d13); + q7 = vcombine_u8(d14, d15); + q8 = vcombine_u8(d16, d17); + q9 = vcombine_u8(d18, d19); + q10 = vcombine_u8(d20, d21); + + q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7)); + q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8)); + q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9)); + q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10)); + + q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]), + vreinterpretq_u16_u32(q2tmp2.val[0])); + q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]), + vreinterpretq_u16_u32(q2tmp3.val[0])); + q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]), + vreinterpretq_u16_u32(q2tmp2.val[1])); + q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]), + vreinterpretq_u16_u32(q2tmp3.val[1])); + + q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]), + vreinterpretq_u8_u16(q2tmp5.val[0])); + q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]), + vreinterpretq_u8_u16(q2tmp5.val[1])); + q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]), + vreinterpretq_u8_u16(q2tmp7.val[0])); + q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]), + vreinterpretq_u8_u16(q2tmp7.val[1])); + + q3 = q2tmp8.val[0]; + q4 = q2tmp8.val[1]; + q5 = q2tmp9.val[0]; + q6 = q2tmp9.val[1]; + q7 = q2tmp10.val[0]; + q8 = q2tmp10.val[1]; + q9 = q2tmp11.val[0]; + q10 = q2tmp11.val[1]; + + vp8_loop_filter_neon(qblimit, qlimit, qthresh, q3, q4, + q5, q6, q7, q8, q9, q10, + &q5, &q6, &q7, &q8); + + q4ResultL.val[0] = vget_low_u8(q5); // d10 + q4ResultL.val[1] = vget_low_u8(q6); // d12 + q4ResultL.val[2] = vget_low_u8(q7); // d14 + q4ResultL.val[3] = vget_low_u8(q8); // d16 + ud = u - 2; + vst4_lane_u8(ud, q4ResultL, 0); + ud += pitch; + vst4_lane_u8(ud, q4ResultL, 1); + ud += pitch; + vst4_lane_u8(ud, q4ResultL, 2); + ud += pitch; + vst4_lane_u8(ud, q4ResultL, 3); + ud += pitch; + vst4_lane_u8(ud, q4ResultL, 4); + ud += pitch; + vst4_lane_u8(ud, q4ResultL, 5); + ud += pitch; + vst4_lane_u8(ud, q4ResultL, 6); + ud += pitch; + vst4_lane_u8(ud, q4ResultL, 7); + + q4ResultH.val[0] = vget_high_u8(q5); // d11 + q4ResultH.val[1] = vget_high_u8(q6); // d13 + q4ResultH.val[2] = vget_high_u8(q7); // d15 + q4ResultH.val[3] = vget_high_u8(q8); // d17 + vd = v - 2; + vst4_lane_u8(vd, q4ResultH, 0); + vd += pitch; + vst4_lane_u8(vd, q4ResultH, 1); + vd += pitch; + vst4_lane_u8(vd, q4ResultH, 2); + vd += pitch; + vst4_lane_u8(vd, q4ResultH, 3); + vd += pitch; + vst4_lane_u8(vd, q4ResultH, 4); + vd += pitch; + vst4_lane_u8(vd, q4ResultH, 5); + vd += pitch; + vst4_lane_u8(vd, q4ResultH, 6); + vd += pitch; + vst4_lane_u8(vd, q4ResultH, 7); + return; +} diff --git a/vp8/encoder/arm/neon/denoising_neon.c b/vp8/encoder/arm/neon/denoising_neon.c index 3f8539759..23dc0a967 100644 --- a/vp8/encoder/arm/neon/denoising_neon.c +++ b/vp8/encoder/arm/neon/denoising_neon.c @@ -68,14 +68,11 @@ int vp8_denoiser_filter_neon(YV12_BUFFER_CONFIG *mc_running_avg, int mc_running_avg_y_stride = mc_running_avg->y_stride; unsigned char *running_avg_y = running_avg->y_buffer + y_offset; int running_avg_y_stride = running_avg->y_stride; + int64x2_t v_sum_diff_total = vdupq_n_s64(0); /* Go over lines. */ int i; - int sum_diff = 0; for (i = 0; i < 16; ++i) { - int8x16_t v_sum_diff = vdupq_n_s8(0); - uint8x16_t v_running_avg_y; - /* Load inputs. */ const uint8x16_t v_sig = vld1q_u8(sig); const uint8x16_t v_mc_running_avg_y = vld1q_u8(mc_running_avg_y); @@ -117,12 +114,9 @@ int vp8_denoiser_filter_neon(YV12_BUFFER_CONFIG *mc_running_avg, v_abs_adjustment); const uint8x16_t v_neg_adjustment = vandq_u8(v_diff_neg_mask, v_abs_adjustment); - v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment); + + uint8x16_t v_running_avg_y = vqaddq_u8(v_sig, v_pos_adjustment); v_running_avg_y = vqsubq_u8(v_running_avg_y, v_neg_adjustment); - v_sum_diff = vqaddq_s8(v_sum_diff, - vreinterpretq_s8_u8(v_pos_adjustment)); - v_sum_diff = vqsubq_s8(v_sum_diff, - vreinterpretq_s8_u8(v_neg_adjustment)); /* Store results. */ vst1q_u8(running_avg_y, v_running_avg_y); @@ -131,23 +125,19 @@ int vp8_denoiser_filter_neon(YV12_BUFFER_CONFIG *mc_running_avg, * for this macroblock. */ { - int s0 = vgetq_lane_s8(v_sum_diff, 0) + - vgetq_lane_s8(v_sum_diff, 1) + - vgetq_lane_s8(v_sum_diff, 2) + - vgetq_lane_s8(v_sum_diff, 3); - int s1 = vgetq_lane_s8(v_sum_diff, 4) + - vgetq_lane_s8(v_sum_diff, 5) + - vgetq_lane_s8(v_sum_diff, 6) + - vgetq_lane_s8(v_sum_diff, 7); - int s2 = vgetq_lane_s8(v_sum_diff, 8) + - vgetq_lane_s8(v_sum_diff, 9) + - vgetq_lane_s8(v_sum_diff, 10) + - vgetq_lane_s8(v_sum_diff, 11); - int s3 = vgetq_lane_s8(v_sum_diff, 12) + - vgetq_lane_s8(v_sum_diff, 13) + - vgetq_lane_s8(v_sum_diff, 14) + - vgetq_lane_s8(v_sum_diff, 15); - sum_diff += s0 + s1+ s2 + s3; + const int8x16_t v_sum_diff = + vqsubq_s8(vreinterpretq_s8_u8(v_pos_adjustment), + vreinterpretq_s8_u8(v_neg_adjustment)); + + const int16x8_t fe_dc_ba_98_76_54_32_10 = vpaddlq_s8(v_sum_diff); + + const int32x4_t fedc_ba98_7654_3210 = + vpaddlq_s16(fe_dc_ba_98_76_54_32_10); + + const int64x2_t fedcba98_76543210 = + vpaddlq_s32(fedc_ba98_7654_3210); + + v_sum_diff_total = vqaddq_s64(v_sum_diff_total, fedcba98_76543210); } /* Update pointers for next iteration. */ @@ -157,11 +147,20 @@ int vp8_denoiser_filter_neon(YV12_BUFFER_CONFIG *mc_running_avg, } /* Too much adjustments => copy block. */ - if (abs(sum_diff) > SUM_DIFF_THRESHOLD) - return COPY_BLOCK; + { + const int64x1_t x = vqadd_s64(vget_high_s64(v_sum_diff_total), + vget_low_s64(v_sum_diff_total)); + const int s0 = vget_lane_s32(vabs_s32(vreinterpret_s32_s64(x)), 0); + + if (s0 > SUM_DIFF_THRESHOLD) + return COPY_BLOCK; + } /* Tell above level that block was filtered. */ - vp8_copy_mem16x16(running_avg->y_buffer + y_offset, running_avg_y_stride, - signal->thismb, sig_stride); + running_avg_y -= running_avg_y_stride * 16; + sig -= sig_stride * 16; + + vp8_copy_mem16x16(running_avg_y, running_avg_y_stride, sig, sig_stride); + return FILTER_BLOCK; } diff --git a/vp8/encoder/x86/quantize_ssse3.asm b/vp8/encoder/x86/quantize_ssse3.asm deleted file mode 100644 index 7b1dc119f..000000000 --- a/vp8/encoder/x86/quantize_ssse3.asm +++ /dev/null @@ -1,138 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - -%include "vpx_ports/x86_abi_support.asm" -%include "vp8_asm_enc_offsets.asm" - - -; void vp8_fast_quantize_b_ssse3 | arg -; (BLOCK *b, | 0 -; BLOCKD *d) | 1 -; - -global sym(vp8_fast_quantize_b_ssse3) PRIVATE -sym(vp8_fast_quantize_b_ssse3): - push rbp - mov rbp, rsp - GET_GOT rbx - -%if ABI_IS_32BIT - push rdi - push rsi -%else - %if LIBVPX_YASM_WIN64 - push rdi - push rsi - %endif -%endif - ; end prolog - -%if ABI_IS_32BIT - mov rdi, arg(0) ; BLOCK *b - mov rsi, arg(1) ; BLOCKD *d -%else - %if LIBVPX_YASM_WIN64 - mov rdi, rcx ; BLOCK *b - mov rsi, rdx ; BLOCKD *d - %else - ;mov rdi, rdi ; BLOCK *b - ;mov rsi, rsi ; BLOCKD *d - %endif -%endif - - mov rax, [rdi + vp8_block_coeff] - mov rcx, [rdi + vp8_block_round] - mov rdx, [rdi + vp8_block_quant_fast] - - ; coeff - movdqa xmm0, [rax] - movdqa xmm4, [rax + 16] - - ; round - movdqa xmm2, [rcx] - movdqa xmm3, [rcx + 16] - - movdqa xmm1, xmm0 - movdqa xmm5, xmm4 - - ; sz = z >> 15 - psraw xmm0, 15 - psraw xmm4, 15 - - pabsw xmm1, xmm1 - pabsw xmm5, xmm5 - - paddw xmm1, xmm2 - paddw xmm5, xmm3 - - ; quant_fast - pmulhw xmm1, [rdx] - pmulhw xmm5, [rdx + 16] - - mov rax, [rsi + vp8_blockd_qcoeff] - mov rdi, [rsi + vp8_blockd_dequant] - mov rcx, [rsi + vp8_blockd_dqcoeff] - - movdqa xmm2, xmm1 ;store y for getting eob - movdqa xmm3, xmm5 - - pxor xmm1, xmm0 - pxor xmm5, xmm4 - psubw xmm1, xmm0 - psubw xmm5, xmm4 - - movdqa [rax], xmm1 - movdqa [rax + 16], xmm5 - - movdqa xmm0, [rdi] - movdqa xmm4, [rdi + 16] - - pmullw xmm0, xmm1 - pmullw xmm4, xmm5 - pxor xmm1, xmm1 - - pcmpgtw xmm2, xmm1 ;calculate eob - pcmpgtw xmm3, xmm1 - packsswb xmm2, xmm3 - pshufb xmm2, [GLOBAL(zz_shuf)] - - pmovmskb edx, xmm2 - - movdqa [rcx], xmm0 ;store dqcoeff - movdqa [rcx + 16], xmm4 ;store dqcoeff - mov rcx, [rsi + vp8_blockd_eob] - - bsr eax, edx ;count 0 - add eax, 1 - - cmp edx, 0 ;if all 0, eob=0 - cmove eax, edx - - mov BYTE PTR [rcx], al ;store eob - - ; begin epilog -%if ABI_IS_32BIT - pop rsi - pop rdi -%else - %if LIBVPX_YASM_WIN64 - pop rsi - pop rdi - %endif -%endif - - RESTORE_GOT - pop rbp - ret - -SECTION_RODATA -align 16 -zz_shuf: - db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 diff --git a/vp8/encoder/x86/quantize_ssse3.c b/vp8/encoder/x86/quantize_ssse3.c new file mode 100644 index 000000000..9b4471d4f --- /dev/null +++ b/vp8/encoder/x86/quantize_ssse3.c @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2012 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <tmmintrin.h> /* SSSE3 */ + +#include "vp8/encoder/block.h" + +/* bitscan reverse (bsr) */ +#if defined(_MSC_VER) +#include <intrin.h> +#pragma intrinsic(_BitScanReverse) +static int bsr(int mask) { + int eob; + _BitScanReverse(&eob, mask); + eob++; + if (mask == 0) + eob = 0; + return eob; +} +#else +static int bsr(int mask) { + int eob; + asm volatile("bsr %1, %0" : "=r" (eob) : "r" (mask) : "flags"); + eob++; + if (mask == 0) + eob = 0; + return eob; +} +#endif + +void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) { + int eob, mask; + + __m128i z0 = _mm_load_si128((__m128i *)(b->coeff)); + __m128i z1 = _mm_load_si128((__m128i *)(b->coeff + 8)); + __m128i round0 = _mm_load_si128((__m128i *)(b->round)); + __m128i round1 = _mm_load_si128((__m128i *)(b->round + 8)); + __m128i quant_fast0 = _mm_load_si128((__m128i *)(b->quant_fast)); + __m128i quant_fast1 = _mm_load_si128((__m128i *)(b->quant_fast + 8)); + __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant)); + __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8)); + + __m128i sz0, sz1, x, x0, x1, y0, y1, zeros, abs0, abs1; + + DECLARE_ALIGNED(16, const uint8_t, pshufb_zig_zag_mask[16]) = + { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 }; + __m128i zig_zag = _mm_load_si128((const __m128i *)pshufb_zig_zag_mask); + + /* sign of z: z >> 15 */ + sz0 = _mm_srai_epi16(z0, 15); + sz1 = _mm_srai_epi16(z1, 15); + + /* x = abs(z) */ + x0 = _mm_abs_epi16(z0); + x1 = _mm_abs_epi16(z1); + + /* x += round */ + x0 = _mm_add_epi16(x0, round0); + x1 = _mm_add_epi16(x1, round1); + + /* y = (x * quant) >> 16 */ + y0 = _mm_mulhi_epi16(x0, quant_fast0); + y1 = _mm_mulhi_epi16(x1, quant_fast1); + + /* ASM saves Y for EOB */ + /* I think we can ignore that because adding the sign doesn't change anything + * and multiplying 0 by dequant is OK as well */ + abs0 = y0; + abs1 = y1; + + /* Restore the sign bit. */ + y0 = _mm_xor_si128(y0, sz0); + y1 = _mm_xor_si128(y1, sz1); + x0 = _mm_sub_epi16(y0, sz0); + x1 = _mm_sub_epi16(y1, sz1); + + /* qcoeff = x */ + _mm_store_si128((__m128i *)(d->qcoeff), x0); + _mm_store_si128((__m128i *)(d->qcoeff + 8), x1); + + /* x * dequant */ + x0 = _mm_mullo_epi16(x0, dequant0); + x1 = _mm_mullo_epi16(x1, dequant1); + + /* dqcoeff = x * dequant */ + _mm_store_si128((__m128i *)(d->dqcoeff), x0); + _mm_store_si128((__m128i *)(d->dqcoeff + 8), x1); + + zeros = _mm_setzero_si128(); + + x0 = _mm_cmpgt_epi16(abs0, zeros); + x1 = _mm_cmpgt_epi16(abs1, zeros); + + x = _mm_packs_epi16(x0, x1); + + x = _mm_shuffle_epi8(x, zig_zag); + + mask = _mm_movemask_epi8(x); + + eob = bsr(mask); + + *d->eob = 0xFF & eob; +} diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index cdb271664..45063bd69 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -159,8 +159,6 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_hv_armv6$(ASM) # common (neon) -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon$(ASM) -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfilter_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon$(ASM) @@ -186,6 +184,8 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequant_idct_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_full_2x_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon.c +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfilter_neon.c $(eval $(call rtcd_h_template,vp8_rtcd,vp8/common/rtcd_defs.pl)) diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index d7c6dd1e1..607382b4c 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -88,6 +88,7 @@ VP8_CX_SRCS-$(HAVE_MMX) += encoder/x86/vp8_enc_stubs_mmx.c VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/fwalsh_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/quantize_sse2.c +VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.c ifeq ($(CONFIG_TEMPORAL_DENOISING),yes) VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoising_sse2.c @@ -96,7 +97,6 @@ endif VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/subtract_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm VP8_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp8_enc_stubs_sse2.c -VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/quantize_sse4.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h index 6b9737076..364ea3a9e 100644 --- a/vp9/encoder/vp9_encoder.h +++ b/vp9/encoder/vp9_encoder.h @@ -276,6 +276,7 @@ typedef struct VP9EncoderConfig { int arnr_max_frames; int arnr_strength; + int arnr_type; int tile_columns; int tile_rows; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index a2fc1bb77..f854356b9 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -3250,9 +3250,8 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // an unfiltered alternative. We allow near/nearest as well // because they may result in zero-zero MVs but be cheaper. if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) { - const int altref_zero_mask = + mode_skip_mask = ~((1 << THR_NEARESTA) | (1 << THR_NEARA) | (1 << THR_ZEROA)); - mode_skip_mask |= altref_zero_mask; if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0) mode_skip_mask |= (1 << THR_NEARA); if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0) diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c index a176bbf91..a5234cd9e 100644 --- a/vp9/encoder/vp9_temporal_filter.c +++ b/vp9/encoder/vp9_temporal_filter.c @@ -355,33 +355,74 @@ static void temporal_filter_iterate_c(VP9_COMP *cpi, void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) { VP9_COMMON *const cm = &cpi->common; int frame = 0; + int frames_to_blur_backward = 0; + int frames_to_blur_forward = 0; int frames_to_blur = 0; int start_frame = 0; int strength = cpi->active_arnr_strength; + int blur_type = cpi->oxcf.arnr_type; int max_frames = cpi->active_arnr_frames; - int frames_to_blur_backward = distance; - int frames_to_blur_forward = vp9_lookahead_depth(cpi->lookahead) - - (distance + 1); + const int num_frames_backward = distance; + const int num_frames_forward = vp9_lookahead_depth(cpi->lookahead) + - (num_frames_backward + 1); struct scale_factors sf; - // Determine which input frames to filter. - if (frames_to_blur_forward > frames_to_blur_backward) - frames_to_blur_forward = frames_to_blur_backward; + switch (blur_type) { + case 1: + // Backward Blur + frames_to_blur_backward = num_frames_backward; - if (frames_to_blur_backward > frames_to_blur_forward) - frames_to_blur_backward = frames_to_blur_forward; + if (frames_to_blur_backward >= max_frames) + frames_to_blur_backward = max_frames - 1; - // When max_frames is even we have 1 more frame backward than forward - if (frames_to_blur_forward > (max_frames - 1) / 2) - frames_to_blur_forward = (max_frames - 1) / 2; + frames_to_blur = frames_to_blur_backward + 1; + break; - if (frames_to_blur_backward > (max_frames / 2)) - frames_to_blur_backward = max_frames / 2; + case 2: + // Forward Blur + frames_to_blur_forward = num_frames_forward; - frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1; + if (frames_to_blur_forward >= max_frames) + frames_to_blur_forward = max_frames - 1; + + frames_to_blur = frames_to_blur_forward + 1; + break; + + case 3: + default: + // Center Blur + frames_to_blur_forward = num_frames_forward; + frames_to_blur_backward = num_frames_backward; + + if (frames_to_blur_forward > frames_to_blur_backward) + frames_to_blur_forward = frames_to_blur_backward; + + if (frames_to_blur_backward > frames_to_blur_forward) + frames_to_blur_backward = frames_to_blur_forward; + + // When max_frames is even we have 1 more frame backward than forward + if (frames_to_blur_forward > (max_frames - 1) / 2) + frames_to_blur_forward = ((max_frames - 1) / 2); + + if (frames_to_blur_backward > (max_frames / 2)) + frames_to_blur_backward = (max_frames / 2); + + frames_to_blur = frames_to_blur_backward + frames_to_blur_forward + 1; + break; + } start_frame = distance + frames_to_blur_forward; +#ifdef DEBUGFWG + // DEBUG FWG + printf( + "max:%d FBCK:%d FFWD:%d ftb:%d ftbbck:%d ftbfwd:%d sei:%d lasei:%d " + "start:%d", + max_frames, num_frames_backward, num_frames_forward, frames_to_blur, + frames_to_blur_backward, frames_to_blur_forward, cpi->source_encode_index, + cpi->last_alt_ref_sei, start_frame); +#endif + // Setup scaling factors. Scaling on each of the arnr frames is not supported vp9_setup_scale_factors_for_frame(&sf, get_frame_new_buffer(cm)->y_crop_width, @@ -390,7 +431,7 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) { // Setup frame pointers, NULL indicates frame not included in filter vp9_zero(cpi->frames); - for (frame = 0; frame < frames_to_blur; ++frame) { + for (frame = 0; frame < frames_to_blur; frame++) { int which_buffer = start_frame - frame; struct lookahead_entry *buf = vp9_lookahead_peek(cpi->lookahead, which_buffer); @@ -404,11 +445,11 @@ void vp9_temporal_filter_prepare(VP9_COMP *cpi, int distance) { void vp9_configure_arnr_filter(VP9_COMP *cpi, const unsigned int frames_to_arnr, const int group_boost) { - int q; int half_gf_int; int frames_after_arf; - int frames_bwd; - int frames_fwd = (cpi->oxcf.arnr_max_frames - 1) >> 1; + int frames_bwd = cpi->oxcf.arnr_max_frames - 1; + int frames_fwd = cpi->oxcf.arnr_max_frames - 1; + int q; // Define the arnr filter width for this group of frames. We only // filter frames that lie within a distance of half the GF interval @@ -420,26 +461,47 @@ void vp9_configure_arnr_filter(VP9_COMP *cpi, frames_after_arf = vp9_lookahead_depth(cpi->lookahead) - frames_to_arnr - 1; - if (frames_fwd > frames_after_arf) - frames_fwd = frames_after_arf; - if (frames_fwd > half_gf_int) - frames_fwd = half_gf_int; - - frames_bwd = frames_fwd; - - // For even length filter there is one more frame backward - // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff. - if (frames_bwd < half_gf_int) - frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1; + switch (cpi->oxcf.arnr_type) { + case 1: // Backward filter + frames_fwd = 0; + if (frames_bwd > half_gf_int) + frames_bwd = half_gf_int; + break; + + case 2: // Forward filter + if (frames_fwd > half_gf_int) + frames_fwd = half_gf_int; + if (frames_fwd > frames_after_arf) + frames_fwd = frames_after_arf; + frames_bwd = 0; + break; + + case 3: // Centered filter + default: + frames_fwd >>= 1; + if (frames_fwd > frames_after_arf) + frames_fwd = frames_after_arf; + if (frames_fwd > half_gf_int) + frames_fwd = half_gf_int; + + frames_bwd = frames_fwd; + + // For even length filter there is one more frame backward + // than forward: e.g. len=6 ==> bbbAff, len=7 ==> bbbAfff. + if (frames_bwd < half_gf_int) + frames_bwd += (cpi->oxcf.arnr_max_frames + 1) & 0x1; + break; + } cpi->active_arnr_frames = frames_bwd + 1 + frames_fwd; // Adjust the strength based on active max q if (cpi->common.current_video_frame > 1) - q = ((int)vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[INTER_FRAME])); + q = ((int)vp9_convert_qindex_to_q( + cpi->rc.avg_frame_qindex[INTER_FRAME])); else - q = ((int)vp9_convert_qindex_to_q(cpi->rc.avg_frame_qindex[KEY_FRAME])); - + q = ((int)vp9_convert_qindex_to_q( + cpi->rc.avg_frame_qindex[KEY_FRAME])); if (q > 16) { cpi->active_arnr_strength = cpi->oxcf.arnr_strength; } else { diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c index 1ca9fb93a..449e7d897 100644 --- a/vp9/vp9_cx_iface.c +++ b/vp9/vp9_cx_iface.c @@ -30,6 +30,7 @@ struct vp9_extracfg { unsigned int tile_rows; unsigned int arnr_max_frames; unsigned int arnr_strength; + unsigned int arnr_type; vp8e_tuning tuning; unsigned int cq_level; // constrained quality level unsigned int rc_max_intra_bitrate_pct; @@ -59,6 +60,7 @@ static const struct extraconfig_map extracfg_map[] = { 0, // tile_rows 7, // arnr_max_frames 5, // arnr_strength + 3, // arnr_type VP8_TUNE_PSNR, // tuning 10, // cq_level 0, // rc_max_intra_bitrate_pct @@ -201,6 +203,7 @@ static vpx_codec_err_t validate_config(vpx_codec_alg_priv_t *ctx, RANGE_CHECK_HI(extra_cfg, sharpness, 7); RANGE_CHECK(extra_cfg, arnr_max_frames, 0, 15); RANGE_CHECK_HI(extra_cfg, arnr_strength, 6); + RANGE_CHECK(extra_cfg, arnr_type, 1, 3); RANGE_CHECK(extra_cfg, cq_level, 0, 63); // TODO(yaowu): remove this when ssim tuning is implemented for vp9 @@ -364,6 +367,7 @@ static vpx_codec_err_t set_encoder_config( oxcf->arnr_max_frames = extra_cfg->arnr_max_frames; oxcf->arnr_strength = extra_cfg->arnr_strength; + oxcf->arnr_type = extra_cfg->arnr_type; oxcf->tuning = extra_cfg->tuning; @@ -494,6 +498,7 @@ static vpx_codec_err_t ctrl_set_param(vpx_codec_alg_priv_t *ctx, int ctrl_id, MAP(VP9E_SET_TILE_ROWS, extra_cfg.tile_rows); MAP(VP8E_SET_ARNR_MAXFRAMES, extra_cfg.arnr_max_frames); MAP(VP8E_SET_ARNR_STRENGTH, extra_cfg.arnr_strength); + MAP(VP8E_SET_ARNR_TYPE, extra_cfg.arnr_type); MAP(VP8E_SET_TUNING, extra_cfg.tuning); MAP(VP8E_SET_CQ_LEVEL, extra_cfg.cq_level); MAP(VP8E_SET_MAX_INTRA_BITRATE_PCT, extra_cfg.rc_max_intra_bitrate_pct); @@ -1107,6 +1112,7 @@ static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = { {VP9E_SET_TILE_ROWS, ctrl_set_param}, {VP8E_SET_ARNR_MAXFRAMES, ctrl_set_param}, {VP8E_SET_ARNR_STRENGTH, ctrl_set_param}, + {VP8E_SET_ARNR_TYPE, ctrl_set_param}, {VP8E_SET_TUNING, ctrl_set_param}, {VP8E_SET_CQ_LEVEL, ctrl_set_param}, {VP8E_SET_MAX_INTRA_BITRATE_PCT, ctrl_set_param}, diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index 8944a2664..bc8df807e 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -160,8 +160,10 @@ enum vp8e_enc_control_id { scale as used by the rc_*_quantizer config parameters */ VP8E_SET_ARNR_MAXFRAMES, /**< control function to set the max number of frames blurred creating arf*/ - VP8E_SET_ARNR_STRENGTH, /**< control function to set the filter strength for the arf */ - VP8E_SET_ARNR_TYPE, /**< control function to set the type of filter to use for the arf*/ + VP8E_SET_ARNR_STRENGTH, //!< control function to set the filter + //!< strength for the arf + VP8E_SET_ARNR_TYPE, //!< control function to set the type of + //!< filter to use for the arf VP8E_SET_TUNING, /**< control function to set visual tuning */ /*!\brief control function to set constrained quality level * @@ -363,7 +363,7 @@ static const arg_def_t frame_periodic_boost = ARG_DEF( static const arg_def_t *vp9_args[] = { &cpu_used, &auto_altref, &noise_sens, &sharpness, &static_thresh, - &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, + &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type, &tune_ssim, &cq_level, &max_intra_rate_pct, &lossless, &frame_parallel_decoding, &aq_mode, &frame_periodic_boost, NULL @@ -372,7 +372,7 @@ static const int vp9_arg_ctrl_map[] = { VP8E_SET_CPUUSED, VP8E_SET_ENABLEAUTOALTREF, VP8E_SET_NOISE_SENSITIVITY, VP8E_SET_SHARPNESS, VP8E_SET_STATIC_THRESHOLD, VP9E_SET_TILE_COLUMNS, VP9E_SET_TILE_ROWS, - VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, + VP8E_SET_ARNR_MAXFRAMES, VP8E_SET_ARNR_STRENGTH, VP8E_SET_ARNR_TYPE, VP8E_SET_TUNING, VP8E_SET_CQ_LEVEL, VP8E_SET_MAX_INTRA_BITRATE_PCT, VP9E_SET_LOSSLESS, VP9E_SET_FRAME_PARALLEL_DECODING, VP9E_SET_AQ_MODE, VP9E_SET_FRAME_PERIODIC_BOOST, |