diff options
-rw-r--r-- | build/make/Android.mk | 47 | ||||
-rwxr-xr-x | build/make/configure.sh | 7 | ||||
-rwxr-xr-x | build/make/gen_msvs_vcxproj.sh | 40 | ||||
-rw-r--r-- | test/android/README | 2 | ||||
-rw-r--r-- | vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm | 156 | ||||
-rw-r--r-- | vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c | 184 | ||||
-rw-r--r-- | vp8/common/arm/neon/sixtappredict_neon.c | 4 | ||||
-rw-r--r-- | vp8/common/arm/neon/variance_neon.c | 12 | ||||
-rw-r--r-- | vp8/vp8_common.mk | 2 | ||||
-rw-r--r-- | vp9/encoder/vp9_pickmode.c | 36 | ||||
-rw-r--r-- | vp9/encoder/vp9_rdopt.c | 94 | ||||
-rw-r--r-- | vp9/encoder/vp9_rdopt.h | 43 | ||||
-rw-r--r-- | vpx/vp8cx.h | 8 | ||||
-rw-r--r-- | vpx_ports/arm_cpudetect.c | 7 |
14 files changed, 305 insertions, 337 deletions
diff --git a/build/make/Android.mk b/build/make/Android.mk index 48a0dd79c..369c2a568 100644 --- a/build/make/Android.mk +++ b/build/make/Android.mk @@ -53,12 +53,20 @@ LIBVPX_PATH := $(LOCAL_PATH)/libvpx ASM_CNV_PATH_LOCAL := $(TARGET_ARCH_ABI)/ads2gas ASM_CNV_PATH := $(LOCAL_PATH)/$(ASM_CNV_PATH_LOCAL) -# Makefiles created by the libvpx configure process -# This will need to be fixed to handle x86. +# Use the makefiles generated by upstream configure to determine which files to +# build. Also set any architecture-specific flags. ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) include $(CONFIG_DIR)libs-armv7-android-gcc.mk -else + LOCAL_ARM_MODE := arm +else ifeq ($(TARGET_ARCH_ABI),armeabi) include $(CONFIG_DIR)libs-armv5te-android-gcc.mk + LOCAL_ARM_MODE := arm +else ifeq ($(TARGET_ARCH_ABI),x86) + include $(CONFIG_DIR)libs-x86-android-gcc.mk +else ifeq ($(TARGET_ARCH_ABI),mips) + include $(CONFIG_DIR)libs-mips-android-gcc.mk +else + $(error Not a supported TARGET_ARCH_ABI: $(TARGET_ARCH_ABI)) endif # Rule that is normally in Makefile created by libvpx @@ -72,10 +80,13 @@ SRC_PATH_BARE := $(LIBVPX_PATH) # Include the list of files to be built include $(LIBVPX_PATH)/libs.mk -# Want arm, not thumb, optimized -LOCAL_ARM_MODE := arm +# Optimise the code. May want to revisit this setting in the future. LOCAL_CFLAGS := -O3 +# For x86, include the source code in the search path so it will find files +# like x86inc.asm and x86_abi_support.asm +LOCAL_ASMFLAGS := -I$(LIBVPX_PATH) + # ----------------------------------------------------------------------------- # Template : asm_offsets_template # Arguments : 1: assembly offsets file to be created @@ -109,7 +120,7 @@ $(1) : $$(_OBJ) $(2) @grep $(OFFSET_PATTERN) $$< | tr -d '\#' | $(CONFIG_DIR)$(ASM_CONVERSION) > $$@ endef -# Use ads2gas script to convert from RVCT format to GAS format. This passes +# Use ads2gas script to convert from RVCT format to GAS format. This # puts the processed file under $(ASM_CNV_PATH). Local clean rule # to handle removing these ifeq ($(CONFIG_VP8_ENCODER), yes) @@ -146,18 +157,26 @@ LOCAL_SRC_FILES += $(foreach file, $(LOCAL_NEON_SRCS_C), libvpx/$(file).neon) # Pull out assembly files, splitting NEON from the rest. This is # done to specify that the NEON assembly files use NEON assembler flags. -CODEC_SRCS_ASM_ALL = $(filter %.asm.s, $(CODEC_SRCS_UNIQUE)) -CODEC_SRCS_ASM = $(foreach v, \ - $(CODEC_SRCS_ASM_ALL), \ - $(if $(findstring neon,$(v)),,$(v))) +# x86 assembly matches %.asm, arm matches %.asm.s + +# x86: + +CODEC_SRCS_ASM_X86 = $(filter %.asm, $(CODEC_SRCS_UNIQUE)) +LOCAL_SRC_FILES += $(foreach file, $(CODEC_SRCS_ASM_X86), libvpx/$(file)) + +# arm: +CODEC_SRCS_ASM_ARM_ALL = $(filter %.asm.s, $(CODEC_SRCS_UNIQUE)) +CODEC_SRCS_ASM_ARM = $(foreach v, \ + $(CODEC_SRCS_ASM_ARM_ALL), \ + $(if $(findstring neon,$(v)),,$(v))) CODEC_SRCS_ASM_ADS2GAS = $(patsubst %.s, \ $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \ - $(CODEC_SRCS_ASM)) + $(CODEC_SRCS_ASM_ARM)) LOCAL_SRC_FILES += $(CODEC_SRCS_ASM_ADS2GAS) ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) CODEC_SRCS_ASM_NEON = $(foreach v, \ - $(CODEC_SRCS_ASM_ALL),\ + $(CODEC_SRCS_ASM_ARM_ALL),\ $(if $(findstring neon,$(v)),$(v),)) CODEC_SRCS_ASM_NEON_ADS2GAS = $(patsubst %.s, \ $(ASM_CNV_PATH_LOCAL)/libvpx/%.s, \ @@ -189,6 +208,10 @@ $(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vp9_rtcd.h endif $(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_scale_rtcd.h +ifeq ($(TARGET_ARCH_ABI),x86) +$(foreach file, $(LOCAL_SRC_FILES), $(LOCAL_PATH)/$(file)): vpx_config.asm +endif + .PHONY: clean clean: @echo "Clean: ads2gas files [$(TARGET_ARCH_ABI)]" diff --git a/build/make/configure.sh b/build/make/configure.sh index a65d395de..4c3b05f66 100755 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -859,6 +859,13 @@ EOF msvs_arch_dir=arm-msvs disable_feature multithread disable_feature unit_tests + vs_version=${tgt_cc##vs} + if [ $vs_version -ge 12 ]; then + # MSVC 2013 doesn't allow doing plain .exe projects for ARM, + # only "AppContainerApplication" which requires an AppxManifest. + # Therefore disable the examples, just build the library. + disable_feature examples + fi ;; rvct) CC=armcc diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh index 9dc790629..8529eed24 100755 --- a/build/make/gen_msvs_vcxproj.sh +++ b/build/make/gen_msvs_vcxproj.sh @@ -292,6 +292,18 @@ generate_vcxproj() { tag_content ProjectGuid "{${guid}}" tag_content RootNamespace ${name} tag_content Keyword ManagedCProj + if [ $vs_ver -ge 12 ] && [ "${platforms[0]}" = "ARM" ]; then + tag_content AppContainerApplication true + # The application type can be one of "Windows Store", + # "Windows Phone" or "Windows Phone Silverlight". The + # actual value doesn't matter from the libvpx point of view, + # since a static library built for one works on the others. + # The PlatformToolset field needs to be set in sync with this; + # for Windows Store and Windows Phone Silverlight it should be + # v120 while it should be v120_wp81 if the type is Windows Phone. + tag_content ApplicationType "Windows Store" + tag_content ApplicationTypeRevision 8.1 + fi close_tag PropertyGroup tag Import \ @@ -324,18 +336,10 @@ generate_vcxproj() { fi fi if [ "$vs_ver" = "12" ]; then - if [ "$plat" = "ARM" ]; then - # Setting the wp80 toolchain automatically sets the - # WINAPI_FAMILY define, which is required for building - # code for arm with the windows headers. Alternatively, - # one could add AppContainerApplication=true in the Globals - # section and add PrecompiledHeader=NotUsing and - # CompileAsWinRT=false in ClCompile and SubSystem=Console - # in Link. - tag_content PlatformToolset v120_wp80 - else - tag_content PlatformToolset v120 - fi + # Setting a PlatformToolset indicating windows phone isn't + # enough to build code for arm with MSVC 2013, one strictly + # has to enable AppContainerApplication as well. + tag_content PlatformToolset v120 fi tag_content CharacterSet Unicode if [ "$config" = "Release" ]; then @@ -427,15 +431,25 @@ generate_vcxproj() { if ${werror:-false}; then tag_content TreatWarningAsError true fi + if [ $vs_ver -ge 11 ]; then + # We need to override the defaults for these settings + # if AppContainerApplication is set. + tag_content CompileAsWinRT false + tag_content PrecompiledHeader NotUsing + tag_content SDLCheck false + fi close_tag ClCompile case "$proj_kind" in exe) open_tag Link if [ "$name" != "obj_int_extract" ]; then - tag_content AdditionalDependencies "$curlibs" + tag_content AdditionalDependencies "$curlibs;%(AdditionalDependencies)" tag_content AdditionalLibraryDirectories "$libdirs;%(AdditionalLibraryDirectories)" fi tag_content GenerateDebugInformation true + # Console is the default normally, but if + # AppContainerApplication is set, we need to override it. + tag_content SubSystem Console close_tag Link ;; dll) diff --git a/test/android/README b/test/android/README index 6840d911c..4a1adcf7f 100644 --- a/test/android/README +++ b/test/android/README @@ -3,7 +3,7 @@ Android.mk will build vpx unittests on android. ./libvpx/configure --target=armv7-android-gcc --enable-external-build \ --enable-postproc --disable-install-srcs --enable-multi-res-encoding \ --enable-temporal-denoising --disable-unit-tests --disable-install-docs \ - --disable-examples --disable-runtime-cpu-detect --sdk=$NDK + --disable-examples --disable-runtime-cpu-detect --sdk-path=$NDK 2) From the parent directory, invoke ndk-build: NDK_PROJECT_PATH=. ndk-build APP_BUILD_SCRIPT=./libvpx/test/android/Android.mk \ diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm new file mode 100644 index 000000000..78d13c895 --- /dev/null +++ b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.asm @@ -0,0 +1,156 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + + EXPORT |vp8_loop_filter_bvs_neon| + EXPORT |vp8_loop_filter_mbvs_neon| + ARM + PRESERVE8 + + AREA ||.text||, CODE, READONLY, ALIGN=2 + +; r0 unsigned char *s, PRESERVE +; r1 int p, PRESERVE +; q1 limit, PRESERVE + +|vp8_loop_filter_simple_vertical_edge_neon| PROC + vpush {d8-d15} + + sub r0, r0, #2 ; move src pointer down by 2 columns + add r12, r1, r1 + add r3, r0, r1 + + vld4.8 {d6[0], d7[0], d8[0], d9[0]}, [r0], r12 + vld4.8 {d6[1], d7[1], d8[1], d9[1]}, [r3], r12 + vld4.8 {d6[2], d7[2], d8[2], d9[2]}, [r0], r12 + vld4.8 {d6[3], d7[3], d8[3], d9[3]}, [r3], r12 + vld4.8 {d6[4], d7[4], d8[4], d9[4]}, [r0], r12 + vld4.8 {d6[5], d7[5], d8[5], d9[5]}, [r3], r12 + vld4.8 {d6[6], d7[6], d8[6], d9[6]}, [r0], r12 + vld4.8 {d6[7], d7[7], d8[7], d9[7]}, [r3], r12 + + vld4.8 {d10[0], d11[0], d12[0], d13[0]}, [r0], r12 + vld4.8 {d10[1], d11[1], d12[1], d13[1]}, [r3], r12 + vld4.8 {d10[2], d11[2], d12[2], d13[2]}, [r0], r12 + vld4.8 {d10[3], d11[3], d12[3], d13[3]}, [r3], r12 + vld4.8 {d10[4], d11[4], d12[4], d13[4]}, [r0], r12 + vld4.8 {d10[5], d11[5], d12[5], d13[5]}, [r3], r12 + vld4.8 {d10[6], d11[6], d12[6], d13[6]}, [r0], r12 + vld4.8 {d10[7], d11[7], d12[7], d13[7]}, [r3] + + vswp d7, d10 + vswp d12, d9 + + ;vp8_filter_mask() function + ;vp8_hevmask() function + sub r0, r0, r1, lsl #4 + vabd.u8 q15, q5, q4 ; abs(p0 - q0) + vabd.u8 q14, q3, q6 ; abs(p1 - q1) + + vqadd.u8 q15, q15, q15 ; abs(p0 - q0) * 2 + vshr.u8 q14, q14, #1 ; abs(p1 - q1) / 2 + vmov.u8 q0, #0x80 ; 0x80 + vmov.s16 q11, #3 + vqadd.u8 q15, q15, q14 ; abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + + veor q4, q4, q0 ; qs0: q0 offset to convert to a signed value + veor q5, q5, q0 ; ps0: p0 offset to convert to a signed value + veor q3, q3, q0 ; ps1: p1 offset to convert to a signed value + veor q6, q6, q0 ; qs1: q1 offset to convert to a signed value + + vcge.u8 q15, q1, q15 ; abs(p0 - q0)*2 + abs(p1-q1)/2 > flimit*2 + limit)*-1 + + vsubl.s8 q2, d8, d10 ; ( qs0 - ps0) + vsubl.s8 q13, d9, d11 + + vqsub.s8 q14, q3, q6 ; vp8_filter = vp8_signed_char_clamp(ps1-qs1) + + vmul.s16 q2, q2, q11 ; 3 * ( qs0 - ps0) + vmul.s16 q13, q13, q11 + + vmov.u8 q11, #0x03 ; 0x03 + vmov.u8 q12, #0x04 ; 0x04 + + vaddw.s8 q2, q2, d28 ; vp8_filter + 3 * ( qs0 - ps0) + vaddw.s8 q13, q13, d29 + + vqmovn.s16 d28, q2 ; vp8_filter = vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) + vqmovn.s16 d29, q13 + + add r0, r0, #1 + add r3, r0, r1 + + vand q14, q14, q15 ; vp8_filter &= mask + + vqadd.s8 q2, q14, q11 ; Filter2 = vp8_signed_char_clamp(vp8_filter+3) + vqadd.s8 q3, q14, q12 ; Filter1 = vp8_signed_char_clamp(vp8_filter+4) + vshr.s8 q2, q2, #3 ; Filter2 >>= 3 + vshr.s8 q14, q3, #3 ; Filter1 >>= 3 + + ;calculate output + vqadd.s8 q11, q5, q2 ; u = vp8_signed_char_clamp(ps0 + Filter2) + vqsub.s8 q10, q4, q14 ; u = vp8_signed_char_clamp(qs0 - Filter1) + + veor q6, q11, q0 ; *op0 = u^0x80 + veor q7, q10, q0 ; *oq0 = u^0x80 + add r12, r1, r1 + vswp d13, d14 + + ;store op1, op0, oq0, oq1 + vst2.8 {d12[0], d13[0]}, [r0], r12 + vst2.8 {d12[1], d13[1]}, [r3], r12 + vst2.8 {d12[2], d13[2]}, [r0], r12 + vst2.8 {d12[3], d13[3]}, [r3], r12 + vst2.8 {d12[4], d13[4]}, [r0], r12 + vst2.8 {d12[5], d13[5]}, [r3], r12 + vst2.8 {d12[6], d13[6]}, [r0], r12 + vst2.8 {d12[7], d13[7]}, [r3], r12 + vst2.8 {d14[0], d15[0]}, [r0], r12 + vst2.8 {d14[1], d15[1]}, [r3], r12 + vst2.8 {d14[2], d15[2]}, [r0], r12 + vst2.8 {d14[3], d15[3]}, [r3], r12 + vst2.8 {d14[4], d15[4]}, [r0], r12 + vst2.8 {d14[5], d15[5]}, [r3], r12 + vst2.8 {d14[6], d15[6]}, [r0], r12 + vst2.8 {d14[7], d15[7]}, [r3] + + vpop {d8-d15} + bx lr + ENDP ; |vp8_loop_filter_simple_vertical_edge_neon| + +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp8_loop_filter_bvs_neon| PROC + push {r4, lr} + ldrb r3, [r2] ; load blim from mem + mov r4, r0 + add r0, r0, #4 + vdup.s8 q1, r3 ; duplicate blim + bl vp8_loop_filter_simple_vertical_edge_neon + ; vp8_loop_filter_simple_vertical_edge_neon preserves r1 and q1 + add r0, r4, #8 + bl vp8_loop_filter_simple_vertical_edge_neon + add r0, r4, #12 + pop {r4, lr} + b vp8_loop_filter_simple_vertical_edge_neon + ENDP ;|vp8_loop_filter_bvs_neon| + +; r0 unsigned char *y +; r1 int ystride +; r2 const unsigned char *blimit + +|vp8_loop_filter_mbvs_neon| PROC + ldrb r3, [r2] ; load mblim from mem + vdup.s8 q1, r3 ; duplicate mblim + b vp8_loop_filter_simple_vertical_edge_neon + ENDP ;|vp8_loop_filter_bvs_neon| + END diff --git a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c b/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c deleted file mode 100644 index b0952b582..000000000 --- a/vp8/common/arm/neon/loopfiltersimpleverticaledge_neon.c +++ /dev/null @@ -1,184 +0,0 @@ -/* - * Copyright (c) 2014 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include <arm_neon.h> -#include "./vpx_config.h" - -static INLINE void vp8_loop_filter_simple_vertical_edge_neon( - unsigned char *s, - int p, - const unsigned char *blimit) { - int pitch; - unsigned char *src1, *src2; - uint8x16_t qblimit, q0u8; - uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8; - int16x8_t q2s16, q13s16, q11s16; - int8x8_t d28s8, d29s8; - int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8; - uint8x8x4_t d0u8x4; // d6, d7, d8, d9 - uint8x8x4_t d1u8x4; // d10, d11, d12, d13 - uint8x8x2_t d2u8x2; // d12, d13 - uint8x8x2_t d3u8x2; // d14, d15 - - pitch = p << 1; - qblimit = vdupq_n_u8(*blimit); - - src1 = s - 2; - d0u8x4 = vld4_lane_u8(src1, d0u8x4, 0); - src1 += pitch; - d0u8x4 = vld4_lane_u8(src1, d0u8x4, 2); - src1 += pitch; - d0u8x4 = vld4_lane_u8(src1, d0u8x4, 4); - src1 += pitch; - d0u8x4 = vld4_lane_u8(src1, d0u8x4, 6); - src1 += pitch; - d1u8x4 = vld4_lane_u8(src1, d1u8x4, 0); - src1 += pitch; - d1u8x4 = vld4_lane_u8(src1, d1u8x4, 2); - src1 += pitch; - d1u8x4 = vld4_lane_u8(src1, d1u8x4, 4); - src1 += pitch; - d1u8x4 = vld4_lane_u8(src1, d1u8x4, 6); - - src2 = s - 2 + p; - d0u8x4 = vld4_lane_u8(src2, d0u8x4, 1); - src2 += pitch; - d0u8x4 = vld4_lane_u8(src2, d0u8x4, 3); - src2 += pitch; - d0u8x4 = vld4_lane_u8(src2, d0u8x4, 5); - src2 += pitch; - d0u8x4 = vld4_lane_u8(src2, d0u8x4, 7); - src2 += pitch; - d1u8x4 = vld4_lane_u8(src2, d1u8x4, 1); - src2 += pitch; - d1u8x4 = vld4_lane_u8(src2, d1u8x4, 3); - src2 += pitch; - d1u8x4 = vld4_lane_u8(src2, d1u8x4, 5); - src2 += pitch; - d1u8x4 = vld4_lane_u8(src2, d1u8x4, 7); - - // vswp d7, d10 - // vswp d12, d9 - q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]); // d6 d10 - q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]); // d8 d12 - q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]); // d7 d11 - q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]); // d9 d13 - - q15u8 = vabdq_u8(q5u8, q4u8); - q14u8 = vabdq_u8(q3u8, q6u8); - - q15u8 = vqaddq_u8(q15u8, q15u8); - q14u8 = vshrq_n_u8(q14u8, 1); - q0u8 = vdupq_n_u8(0x80); - q11s16 = vdupq_n_s16(3); - q15u8 = vqaddq_u8(q15u8, q14u8); - - q3u8 = veorq_u8(q3u8, q0u8); - q4u8 = veorq_u8(q4u8, q0u8); - q5u8 = veorq_u8(q5u8, q0u8); - q6u8 = veorq_u8(q6u8, q0u8); - - q15u8 = vcgeq_u8(qblimit, q15u8); - - q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)), - vget_low_s8(vreinterpretq_s8_u8(q5u8))); - q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)), - vget_high_s8(vreinterpretq_s8_u8(q5u8))); - - q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8), - vreinterpretq_s8_u8(q6u8)); - - q2s16 = vmulq_s16(q2s16, q11s16); - q13s16 = vmulq_s16(q13s16, q11s16); - - q11u8 = vdupq_n_u8(3); - q12u8 = vdupq_n_u8(4); - - q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8)); - q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8)); - - d28s8 = vqmovn_s16(q2s16); - d29s8 = vqmovn_s16(q13s16); - q14s8 = vcombine_s8(d28s8, d29s8); - - q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8)); - - q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8)); - q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8)); - q2s8 = vshrq_n_s8(q2s8, 3); - q14s8 = vshrq_n_s8(q3s8, 3); - - q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8); - q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8); - - q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); - q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); - - d2u8x2.val[0] = vget_low_u8(q6u8); // d12 - d2u8x2.val[1] = vget_low_u8(q7u8); // d14 - d3u8x2.val[0] = vget_high_u8(q6u8); // d13 - d3u8x2.val[1] = vget_high_u8(q7u8); // d15 - - src1 = s - 1; - vst2_lane_u8(src1, d2u8x2, 0); - src1 += pitch; - vst2_lane_u8(src1, d2u8x2, 2); - src1 += pitch; - vst2_lane_u8(src1, d2u8x2, 4); - src1 += pitch; - vst2_lane_u8(src1, d2u8x2, 6); - src1 += pitch; - vst2_lane_u8(src1, d3u8x2, 0); - src1 += pitch; - vst2_lane_u8(src1, d3u8x2, 2); - src1 += pitch; - vst2_lane_u8(src1, d3u8x2, 4); - src1 += pitch; - vst2_lane_u8(src1, d3u8x2, 6); - - src2 = s - 1 + p; - vst2_lane_u8(src2, d2u8x2, 1); - src2 += pitch; - vst2_lane_u8(src2, d2u8x2, 3); - src2 += pitch; - vst2_lane_u8(src2, d2u8x2, 5); - src2 += pitch; - vst2_lane_u8(src2, d2u8x2, 7); - src2 += pitch; - vst2_lane_u8(src2, d3u8x2, 1); - src2 += pitch; - vst2_lane_u8(src2, d3u8x2, 3); - src2 += pitch; - vst2_lane_u8(src2, d3u8x2, 5); - src2 += pitch; - vst2_lane_u8(src2, d3u8x2, 7); - return; -} - -void vp8_loop_filter_bvs_neon( - unsigned char *y_ptr, - int y_stride, - const unsigned char *blimit) { - y_ptr += 4; - vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); - y_ptr += 4; - vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); - y_ptr += 4; - vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); - return; -} - -void vp8_loop_filter_mbvs_neon( - unsigned char *y_ptr, - int y_stride, - const unsigned char *blimit) { - vp8_loop_filter_simple_vertical_edge_neon(y_ptr, y_stride, blimit); - return; -} diff --git a/vp8/common/arm/neon/sixtappredict_neon.c b/vp8/common/arm/neon/sixtappredict_neon.c index 22932e94a..7a4d9e051 100644 --- a/vp8/common/arm/neon/sixtappredict_neon.c +++ b/vp8/common/arm/neon/sixtappredict_neon.c @@ -10,6 +10,10 @@ #include <arm_neon.h> +#ifdef _MSC_VER +#define __builtin_prefetch(x) +#endif + static const int8_t vp8_sub_pel_filters[8][8] = { {0, 0, 128, 0, 0, 0, 0, 0}, /* note that 1/8 pel positionyys are */ {0, -6, 123, 12, -1, 0, 0, 0}, /* just as per alpha -0.5 bicubic */ diff --git a/vp8/common/arm/neon/variance_neon.c b/vp8/common/arm/neon/variance_neon.c index 3e25f7ddf..afd2dc3d1 100644 --- a/vp8/common/arm/neon/variance_neon.c +++ b/vp8/common/arm/neon/variance_neon.c @@ -10,6 +10,10 @@ #include <arm_neon.h> +#ifdef _MSC_VER +#define __builtin_prefetch(x) +#endif + unsigned int vp8_variance16x16_neon( const unsigned char *src_ptr, int source_stride, @@ -19,7 +23,7 @@ unsigned int vp8_variance16x16_neon( int i; int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; uint32x2_t d0u32, d10u32; - int64_t d0s64, d1s64; + int64x1_t d0s64, d1s64; uint8x16_t q0u8, q1u8, q2u8, q3u8; uint16x8_t q11u16, q12u16, q13u16, q14u16; int32x4_t q8s32, q9s32, q10s32; @@ -98,7 +102,7 @@ unsigned int vp8_variance16x8_neon( int i; int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; uint32x2_t d0u32, d10u32; - int64_t d0s64, d1s64; + int64x1_t d0s64, d1s64; uint8x16_t q0u8, q1u8, q2u8, q3u8; uint16x8_t q11u16, q12u16, q13u16, q14u16; int32x4_t q8s32, q9s32, q10s32; @@ -178,7 +182,7 @@ unsigned int vp8_variance8x16_neon( uint8x8_t d0u8, d2u8, d4u8, d6u8; int16x4_t d22s16, d23s16, d24s16, d25s16; uint32x2_t d0u32, d10u32; - int64_t d0s64, d1s64; + int64x1_t d0s64, d1s64; uint16x8_t q11u16, q12u16; int32x4_t q8s32, q9s32, q10s32; int64x2_t q0s64, q1s64, q5s64; @@ -243,7 +247,7 @@ unsigned int vp8_variance8x8_neon( uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8; int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16; uint32x2_t d0u32, d10u32; - int64_t d0s64, d1s64; + int64x1_t d0s64, d1s64; uint16x8_t q11u16, q12u16, q13u16, q14u16; int32x4_t q8s32, q9s32, q10s32; int64x2_t q0s64, q1s64, q5s64; diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 177e67eb3..2812111e9 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -160,6 +160,7 @@ VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/vp8_variance_halfpixvar16x16_ # common (neon) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfilter_neon$(ASM) +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimpleverticaledge_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/buildintrapredictorsmby_neon$(ASM) VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_blk_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp8_subpixelvariance8x8_neon$(ASM) @@ -175,7 +176,6 @@ VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dequantizeb_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/idct_dequant_full_2x_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iwalsh_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimplehorizontaledge_neon.c -VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/loopfiltersimpleverticaledge_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/mbloopfilter_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/sad_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/shortidct4x4llm_neon.c diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c index df520f20d..e750a53b6 100644 --- a/vp9/encoder/vp9_pickmode.c +++ b/vp9/encoder/vp9_pickmode.c @@ -85,39 +85,9 @@ static void full_pixel_motion_search(VP9_COMP *cpi, MACROBLOCK *x, mvp_full.col >>= 3; mvp_full.row >>= 3; - if (cpi->sf.search_method == FAST_DIAMOND) { - // NOTE: this returns SAD - vp9_fast_dia_search(x, &mvp_full, step_param, sadpb, 0, - &cpi->fn_ptr[bsize], 1, - &ref_mv, &tmp_mv->as_mv); - } else if (cpi->sf.search_method == FAST_HEX) { - // NOTE: this returns SAD - vp9_fast_hex_search(x, &mvp_full, step_param, sadpb, 0, - &cpi->fn_ptr[bsize], 1, - &ref_mv, &tmp_mv->as_mv); - } else if (cpi->sf.search_method == HEX) { - // NOTE: this returns SAD - vp9_hex_search(x, &mvp_full, step_param, sadpb, 1, - &cpi->fn_ptr[bsize], 1, - &ref_mv, &tmp_mv->as_mv); - } else if (cpi->sf.search_method == SQUARE) { - // NOTE: this returns SAD - vp9_square_search(x, &mvp_full, step_param, sadpb, 1, - &cpi->fn_ptr[bsize], 1, - &ref_mv, &tmp_mv->as_mv); - } else if (cpi->sf.search_method == BIGDIA) { - // NOTE: this returns SAD - vp9_bigdia_search(x, &mvp_full, step_param, sadpb, 1, - &cpi->fn_ptr[bsize], 1, - &ref_mv, &tmp_mv->as_mv); - } else { - int further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; - // NOTE: this returns variance - vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, - sadpb, further_steps, 1, - &cpi->fn_ptr[bsize], - &ref_mv, &tmp_mv->as_mv); - } + full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, &ref_mv, + &tmp_mv->as_mv, INT_MAX, 0); + x->mv_col_min = tmp_col_min; x->mv_col_max = tmp_col_max; x->mv_row_min = tmp_row_min; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 857015e51..66464d07b 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1735,7 +1735,6 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x, const BLOCK_SIZE bsize = mbmi->sb_type; const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize]; const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize]; - vp9_variance_fn_ptr_t *v_fn_ptr = &cpi->fn_ptr[bsize]; ENTROPY_CONTEXT t_above[2], t_left[2]; int subpelmv = 1, have_ref = 0; const int has_second_rf = has_second_ref(mbmi); @@ -1807,7 +1806,6 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x, seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) { MV *const new_mv = &mode_mv[NEWMV][0].as_mv; int step_param = 0; - int further_steps; int thissme, bestsme = INT_MAX; int sadpb = x->sadperbit4; MV mvp_full; @@ -1850,48 +1848,14 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x, step_param = MAX(step_param, 8); } - further_steps = (MAX_MVSEARCH_STEPS - 1) - step_param; // adjust src pointer for this block mi_buf_shift(x, i); vp9_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv); - if (cpi->sf.search_method == HEX) { - bestsme = vp9_hex_search(x, &mvp_full, - step_param, - sadpb, 1, v_fn_ptr, 1, - &bsi->ref_mv[0]->as_mv, - new_mv); - if (bestsme < INT_MAX) - bestsme = vp9_get_mvpred_var(x, new_mv, - &bsi->ref_mv[0]->as_mv, - v_fn_ptr, 1); - } else if (cpi->sf.search_method == SQUARE) { - bestsme = vp9_square_search(x, &mvp_full, - step_param, - sadpb, 1, v_fn_ptr, 1, - &bsi->ref_mv[0]->as_mv, - new_mv); - if (bestsme < INT_MAX) - bestsme = vp9_get_mvpred_var(x, new_mv, - &bsi->ref_mv[0]->as_mv, - v_fn_ptr, 1); - } else if (cpi->sf.search_method == BIGDIA) { - bestsme = vp9_bigdia_search(x, &mvp_full, - step_param, - sadpb, 1, v_fn_ptr, 1, - &bsi->ref_mv[0]->as_mv, - new_mv); - if (bestsme < INT_MAX) - bestsme = vp9_get_mvpred_var(x, new_mv, - &bsi->ref_mv[0]->as_mv, - v_fn_ptr, 1); - } else { - bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, - sadpb, further_steps, 0, v_fn_ptr, - &bsi->ref_mv[0]->as_mv, - new_mv); - } + bestsme = full_pixel_search(cpi, x, bsize, &mvp_full, step_param, + sadpb, &bsi->ref_mv[0]->as_mv, new_mv, + INT_MAX, 1); // Should we do a full search (best quality only) if (is_best_mode(cpi->oxcf.mode)) { @@ -1900,7 +1864,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x, clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max); thissme = cpi->full_search_sad(x, &mvp_full, - sadpb, 16, v_fn_ptr, + sadpb, 16, &cpi->fn_ptr[bsize], &bsi->ref_mv[0]->as_mv, &best_mv->as_mv); if (thissme < bestsme) { @@ -1919,7 +1883,7 @@ static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x, new_mv, &bsi->ref_mv[0]->as_mv, cm->allow_high_precision_mv, - x->errorperbit, v_fn_ptr, + x->errorperbit, &cpi->fn_ptr[bsize], cpi->sf.subpel_force_stop, cpi->sf.subpel_iters_per_step, x->nmvjointcost, x->mvcost, @@ -2350,7 +2314,7 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi; struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}}; int bestsme = INT_MAX; - int further_steps, step_param; + int step_param; int sadpb = x->sadperbit16; MV mvp_full; int ref = mbmi->ref_frame[0]; @@ -2430,50 +2394,8 @@ static void single_motion_search(VP9_COMP *cpi, MACROBLOCK *x, mvp_full.col >>= 3; mvp_full.row >>= 3; - // Further step/diamond searches as necessary - further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; - - if (cpi->sf.search_method == FAST_DIAMOND) { - bestsme = vp9_fast_dia_search(x, &mvp_full, step_param, sadpb, 0, - &cpi->fn_ptr[bsize], 1, - &ref_mv, &tmp_mv->as_mv); - if (bestsme < INT_MAX) - bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv, - &cpi->fn_ptr[bsize], 1); - } else if (cpi->sf.search_method == FAST_HEX) { - bestsme = vp9_fast_hex_search(x, &mvp_full, step_param, sadpb, 0, - &cpi->fn_ptr[bsize], 1, - &ref_mv, &tmp_mv->as_mv); - if (bestsme < INT_MAX) - bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv, - &cpi->fn_ptr[bsize], 1); - } else if (cpi->sf.search_method == HEX) { - bestsme = vp9_hex_search(x, &mvp_full, step_param, sadpb, 1, - &cpi->fn_ptr[bsize], 1, - &ref_mv, &tmp_mv->as_mv); - if (bestsme < INT_MAX) - bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv, - &cpi->fn_ptr[bsize], 1); - } else if (cpi->sf.search_method == SQUARE) { - bestsme = vp9_square_search(x, &mvp_full, step_param, sadpb, 1, - &cpi->fn_ptr[bsize], 1, - &ref_mv, &tmp_mv->as_mv); - if (bestsme < INT_MAX) - bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv, - &cpi->fn_ptr[bsize], 1); - } else if (cpi->sf.search_method == BIGDIA) { - bestsme = vp9_bigdia_search(x, &mvp_full, step_param, sadpb, 1, - &cpi->fn_ptr[bsize], 1, - &ref_mv, &tmp_mv->as_mv); - if (bestsme < INT_MAX) - bestsme = vp9_get_mvpred_var(x, &tmp_mv->as_mv, &ref_mv, - &cpi->fn_ptr[bsize], 1); - } else { - bestsme = vp9_full_pixel_diamond(cpi, x, &mvp_full, step_param, - sadpb, further_steps, 1, - &cpi->fn_ptr[bsize], - &ref_mv, &tmp_mv->as_mv); - } + bestsme = full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb, + &ref_mv, &tmp_mv->as_mv, INT_MAX, 1); x->mv_col_min = tmp_col_min; x->mv_col_max = tmp_col_max; diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h index d016ebde8..b6b51e553 100644 --- a/vp9/encoder/vp9_rdopt.h +++ b/vp9/encoder/vp9_rdopt.h @@ -87,6 +87,49 @@ void vp9_set_rd_speed_thresholds(VP9_COMP *cpi); void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi); +static INLINE int full_pixel_search(VP9_COMP *cpi, MACROBLOCK *x, + BLOCK_SIZE bsize, MV *mvp_full, + int step_param, int error_per_bit, + const MV *ref_mv, MV *tmp_mv, + int var_max, int rd) { + int var = 0; + + if (cpi->sf.search_method == FAST_DIAMOND) { + var = vp9_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0, + &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv); + if (rd && var < var_max) + var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1); + } else if (cpi->sf.search_method == FAST_HEX) { + var = vp9_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0, + &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv); + if (rd && var < var_max) + var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1); + } else if (cpi->sf.search_method == HEX) { + var = vp9_hex_search(x, mvp_full, step_param, error_per_bit, 1, + &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv); + if (rd && var < var_max) + var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1); + } else if (cpi->sf.search_method == SQUARE) { + var = vp9_square_search(x, mvp_full, step_param, error_per_bit, 1, + &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv); + if (rd && var < var_max) + var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1); + } else if (cpi->sf.search_method == BIGDIA) { + var = vp9_bigdia_search(x, mvp_full, step_param, error_per_bit, 1, + &cpi->fn_ptr[bsize], 1, ref_mv, tmp_mv); + if (rd && var < var_max) + var = vp9_get_mvpred_var(x, tmp_mv, ref_mv, &cpi->fn_ptr[bsize], 1); + } else { + int further_steps = (cpi->sf.max_step_search_steps - 1) - step_param; + + var = vp9_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit, + further_steps, 1, &cpi->fn_ptr[bsize], + ref_mv, tmp_mv); + } + + return var; +} + #ifdef __cplusplus } // extern "C" #endif diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h index bc8df807e..67cbdb1f5 100644 --- a/vpx/vp8cx.h +++ b/vpx/vp8cx.h @@ -162,8 +162,10 @@ enum vp8e_enc_control_id { VP8E_SET_ARNR_MAXFRAMES, /**< control function to set the max number of frames blurred creating arf*/ VP8E_SET_ARNR_STRENGTH, //!< control function to set the filter //!< strength for the arf - VP8E_SET_ARNR_TYPE, //!< control function to set the type of - //!< filter to use for the arf + + /*!\deprecated control function to set the filter type to use for the arf */ + VP8E_SET_ARNR_TYPE, + VP8E_SET_TUNING, /**< control function to set visual tuning */ /*!\brief control function to set constrained quality level * @@ -349,7 +351,7 @@ VPX_CTRL_USE_TYPE(VP8E_SET_TOKEN_PARTITIONS, int) /* vp8e_token_partitions */ VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_MAXFRAMES, unsigned int) VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_STRENGTH, unsigned int) -VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_TYPE, unsigned int) +VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_SET_ARNR_TYPE, unsigned int) VPX_CTRL_USE_TYPE(VP8E_SET_TUNING, int) /* vp8e_tuning */ VPX_CTRL_USE_TYPE(VP8E_SET_CQ_LEVEL, unsigned int) diff --git a/vpx_ports/arm_cpudetect.c b/vpx_ports/arm_cpudetect.c index 542ff6786..fa0e030b5 100644 --- a/vpx_ports/arm_cpudetect.c +++ b/vpx_ports/arm_cpudetect.c @@ -12,6 +12,13 @@ #include <string.h> #include "arm.h" +#ifdef WINAPI_FAMILY +#include <winapifamily.h> +#if !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#define getenv(x) NULL +#endif +#endif + static int arm_cpu_env_flags(int *flags) { char *env; env = getenv("VPX_SIMD_CAPS"); |