diff options
59 files changed, 343 insertions, 6833 deletions
@@ -47,10 +47,8 @@ COMPILING THE APPLICATIONS/LIBRARIES: --help output of the configure script. As of this writing, the list of available targets is: - armv6-linux-rvct - armv6-linux-gcc - armv6-none-rvct arm64-darwin-gcc + arm64-linux-gcc armv7-android-gcc armv7-darwin-gcc armv7-linux-rvct @@ -60,6 +58,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: armv7-win32-vs12 armv7-win32-vs14 armv7s-darwin-gcc + armv8-linux-gcc mips32-linux-gcc mips64-linux-gcc sparc-solaris-gcc @@ -73,6 +72,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86-darwin12-gcc x86-darwin13-gcc x86-darwin14-gcc + x86-darwin15-gcc x86-iphonesimulator-gcc x86-linux-gcc x86-linux-icc @@ -90,6 +90,7 @@ COMPILING THE APPLICATIONS/LIBRARIES: x86_64-darwin12-gcc x86_64-darwin13-gcc x86_64-darwin14-gcc + x86_64-darwin15-gcc x86_64-iphonesimulator-gcc x86_64-linux-gcc x86_64-linux-icc @@ -127,7 +128,22 @@ VP8/VP9 TEST VECTORS: $ ./configure --enable-unit-tests $ LIBVPX_TEST_DATA_PATH=../libvpx-test-data make testdata +CODE STYLE: + The coding style used by this project is enforced with clang-format using the + configuration contained in the .clang-format file in the root of the + repository. + + Before pushing changes for review you can format your code with: + # Apply clang-format to modified .c, .h and .cc files + $ clang-format -i --style=file \ + $(git diff --name-only --diff-filter=ACMR '*.[hc]' '*.cc') + + Check the .clang-format file for the version used to generate it if there is + any difference between your local formatting and the review system. + + See also: http://clang.llvm.org/docs/ClangFormat.html + SUPPORT This library is an open source project supported by its community. Please - please email webm-discuss@webmproject.org for help. + email webm-discuss@webmproject.org for help. diff --git a/build/make/Android.mk b/build/make/Android.mk index 9eb6dd280..9ccb5da26 100644 --- a/build/make/Android.mk +++ b/build/make/Android.mk @@ -29,11 +29,6 @@ # include $(CLEAR_VARS) # include jni/libvpx/build/make/Android.mk # -# There are currently two TARGET_ARCH_ABI targets for ARM. -# armeabi and armeabi-v7a. armeabi-v7a is selected by creating an -# Application.mk in the jni directory that contains: -# APP_ABI := armeabi-v7a -# # By default libvpx will detect at runtime the existance of NEON extension. # For this we import the 'cpufeatures' module from the NDK sources. # libvpx can also be configured without this runtime detection method. @@ -42,9 +37,6 @@ # --disable-neon-asm # will remove any NEON dependency. -# To change to building armeabi, run ./libvpx/configure again, but with -# --target=armv6-android-gcc and modify the Application.mk file to -# set APP_ABI := armeabi # # Running ndk-build will build libvpx and include it in your project. # @@ -59,9 +51,6 @@ ASM_CNV_PATH := $(LOCAL_PATH)/$(ASM_CNV_PATH_LOCAL) ifeq ($(TARGET_ARCH_ABI),armeabi-v7a) include $(CONFIG_DIR)libs-armv7-android-gcc.mk LOCAL_ARM_MODE := arm -else ifeq ($(TARGET_ARCH_ABI),armeabi) - include $(CONFIG_DIR)libs-armv6-android-gcc.mk - LOCAL_ARM_MODE := arm else ifeq ($(TARGET_ARCH_ABI),arm64-v8a) include $(CONFIG_DIR)libs-armv8-android-gcc.mk LOCAL_ARM_MODE := arm diff --git a/build/make/configure.sh b/build/make/configure.sh index 2e1597779..14fada09d 100644 --- a/build/make/configure.sh +++ b/build/make/configure.sh @@ -680,9 +680,6 @@ process_common_toolchain() { aarch64*) tgt_isa=arm64 ;; - armv6*) - tgt_isa=armv6 - ;; armv7*-hardfloat* | armv7*-gnueabihf | arm-*-gnueabihf) tgt_isa=armv7 float_abi=hard @@ -883,36 +880,6 @@ process_common_toolchain() { if disabled neon && enabled neon_asm; then die "Disabling neon while keeping neon-asm is not supported" fi - case ${toolchain} in - # Apple iOS SDKs no longer support armv6 as of the version 9 - # release (coincides with release of Xcode 7). Only enable media - # when using earlier SDK releases. - *-darwin*) - if [ "$(show_darwin_sdk_major_version iphoneos)" -lt 9 ]; then - soft_enable media - else - soft_disable media - RTCD_OPTIONS="${RTCD_OPTIONS}--disable-media " - fi - ;; - *) - soft_enable media - ;; - esac - ;; - armv6) - case ${toolchain} in - *-darwin*) - if [ "$(show_darwin_sdk_major_version iphoneos)" -lt 9 ]; then - soft_enable media - else - die "Your iOS SDK does not support armv6." - fi - ;; - *) - soft_enable media - ;; - esac ;; esac diff --git a/build/make/rtcd.pl b/build/make/rtcd.pl index 991b6abe7..9e746c46d 100755 --- a/build/make/rtcd.pl +++ b/build/make/rtcd.pl @@ -384,13 +384,8 @@ if ($opts{arch} eq 'x86') { } close CONFIG_FILE; mips; -} elsif ($opts{arch} eq 'armv6') { - @ALL_ARCHS = filter(qw/media/); - arm; } elsif ($opts{arch} =~ /armv7\w?/) { - @ALL_ARCHS = filter(qw/media neon_asm neon/); - @REQUIRES = filter(keys %required ? keys %required : qw/media/); - &require(@REQUIRES); + @ALL_ARCHS = filter(qw/neon_asm neon/); arm; } elsif ($opts{arch} eq 'armv8' || $opts{arch} eq 'arm64' ) { @ALL_ARCHS = filter(qw/neon/); @@ -99,9 +99,6 @@ EOF # alphabetically by architecture, generic-gnu last. all_platforms="${all_platforms} arm64-darwin-gcc" all_platforms="${all_platforms} arm64-linux-gcc" -all_platforms="${all_platforms} armv6-linux-rvct" -all_platforms="${all_platforms} armv6-linux-gcc" -all_platforms="${all_platforms} armv6-none-rvct" all_platforms="${all_platforms} armv7-android-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-darwin-gcc" #neon Cortex-A8 all_platforms="${all_platforms} armv7-linux-rvct" #neon Cortex-A8 @@ -236,8 +233,6 @@ ARCH_EXT_LIST_X86=" avx2 " ARCH_EXT_LIST=" - edsp - media neon neon_asm diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc index cd293016e..2990e5434 100644 --- a/test/dct16x16_test.cc +++ b/test/dct16x16_test.cc @@ -850,7 +850,7 @@ INSTANTIATE_TEST_CASE_P(C, PartialTrans16x16Test, VPX_BITS_8))); #endif // CONFIG_VP9_HIGHBITDEPTH -#if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE +#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE INSTANTIATE_TEST_CASE_P( NEON, Trans16x16DCT, ::testing::Values(make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_neon, diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc index 22952e4e6..2f843d799 100644 --- a/test/lpf_8_test.cc +++ b/test/lpf_8_test.cc @@ -95,7 +95,7 @@ TEST_P(Loop8Test6Param, OperationCheck) { ACMRandom rnd(ACMRandom::DeterministicSeed()); const int count_test_block = number_of_iterations; #if CONFIG_VP9_HIGHBITDEPTH - int32_t bd = bit_depth_; + const int32_t bd = bit_depth_; DECLARE_ALIGNED(16, uint16_t, s[kNumCoeffs]); DECLARE_ALIGNED(16, uint16_t, ref_s[kNumCoeffs]); #else @@ -119,7 +119,6 @@ TEST_P(Loop8Test6Param, OperationCheck) { thresh[16]) = { tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp }; int32_t p = kNumCoeffs / 32; - uint16_t tmp_s[kNumCoeffs]; int j = 0; while (j < kNumCoeffs) { @@ -127,20 +126,42 @@ TEST_P(Loop8Test6Param, OperationCheck) { if (val & 0x80) { // 50% chance to choose a new value. tmp_s[j] = rnd.Rand16(); j++; - } else { // 50% chance to repeat previous value in row X times + } else { // 50% chance to repeat previous value in row X times. + int k = 0; + while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) { + if (j < 1) { + tmp_s[j] = rnd.Rand16(); + } else if (val & 0x20) { // Increment by a value within the limit. + tmp_s[j] = tmp_s[j - 1] + (*limit - 1); + } else { // Decrement by a value within the limit. + tmp_s[j] = tmp_s[j - 1] - (*limit - 1); + } + j++; + } + } + } + + for (j = 0; j < kNumCoeffs;) { + const uint8_t val = rnd.Rand8(); + if (val & 0x80) { + j++; + } else { // 50% chance to repeat previous value in column X times. int k = 0; while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) { if (j < 1) { tmp_s[j] = rnd.Rand16(); - } else if (val & 0x20) { // Increment by an value within the limit - tmp_s[j] = (tmp_s[j - 1] + (*limit - 1)); - } else { // Decrement by an value within the limit - tmp_s[j] = (tmp_s[j - 1] - (*limit - 1)); + } else if (val & 0x20) { // Increment by a value within the limit. + tmp_s[(j % 32) * 32 + j / 32] = + tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] + (*limit - 1); + } else { // Decrement by a value within the limit. + tmp_s[(j % 32) * 32 + j / 32] = + tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] - (*limit - 1); } j++; } } } + for (j = 0; j < kNumCoeffs; j++) { if (i % 2) { s[j] = tmp_s[j] & mask_; @@ -227,6 +248,7 @@ TEST_P(Loop8Test6Param, ValueCheck) { ASM_REGISTER_STATE_CHECK( loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh)); #endif // CONFIG_VP9_HIGHBITDEPTH + for (int j = 0; j < kNumCoeffs; ++j) { err_count += ref_s[j] != s[j]; } @@ -295,14 +317,36 @@ TEST_P(Loop8Test9Param, OperationCheck) { if (j < 1) { tmp_s[j] = rnd.Rand16(); } else if (val & 0x20) { // Increment by a value within the limit. - tmp_s[j] = (tmp_s[j - 1] + (limit - 1)); - } else { // Decrement by an value within the limit. - tmp_s[j] = (tmp_s[j - 1] - (limit - 1)); + tmp_s[j] = tmp_s[j - 1] + (limit - 1); + } else { // Decrement by a value within the limit. + tmp_s[j] = tmp_s[j - 1] - (limit - 1); } j++; } } } + + for (j = 0; j < kNumCoeffs;) { + const uint8_t val = rnd.Rand8(); + if (val & 0x80) { + j++; + } else { // 50% chance to repeat previous value in column X times. + int k = 0; + while (k++ < ((val & 0x1f) + 1) && j < kNumCoeffs) { + if (j < 1) { + tmp_s[j] = rnd.Rand16(); + } else if (val & 0x20) { // Increment by a value within the limit. + tmp_s[(j % 32) * 32 + j / 32] = + tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] + (limit - 1); + } else { // Decrement by a value within the limit. + tmp_s[(j % 32) * 32 + j / 32] = + tmp_s[((j - 1) % 32) * 32 + (j - 1) / 32] - (limit - 1); + } + j++; + } + } + } + for (j = 0; j < kNumCoeffs; j++) { if (i % 2) { s[j] = tmp_s[j] & mask_; diff --git a/test/sad_test.cc b/test/sad_test.cc index 9dd91ade9..e1f164a4e 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -640,13 +640,6 @@ INSTANTIATE_TEST_CASE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests)); //------------------------------------------------------------------------------ // ARM functions -#if HAVE_MEDIA -const SadMxNParam media_tests[] = { - SadMxNParam(16, 16, &vpx_sad16x16_media), -}; -INSTANTIATE_TEST_CASE_P(MEDIA, SADTest, ::testing::ValuesIn(media_tests)); -#endif // HAVE_MEDIA - #if HAVE_NEON const SadMxNParam neon_tests[] = { SadMxNParam(64, 64, &vpx_sad64x64_neon), diff --git a/test/variance_test.cc b/test/variance_test.cc index c8e4114f2..6e31165fa 100644 --- a/test/variance_test.cc +++ b/test/variance_test.cc @@ -1205,22 +1205,6 @@ INSTANTIATE_TEST_CASE_P( make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_avx2, 0))); #endif // HAVE_AVX2 -#if HAVE_MEDIA -INSTANTIATE_TEST_CASE_P(MEDIA, VpxMseTest, - ::testing::Values(MseParams(4, 4, - &vpx_mse16x16_media))); - -INSTANTIATE_TEST_CASE_P( - MEDIA, VpxVarianceTest, - ::testing::Values(VarianceParams(4, 4, &vpx_variance16x16_media), - VarianceParams(3, 3, &vpx_variance8x8_media))); - -INSTANTIATE_TEST_CASE_P( - MEDIA, VpxSubpelVarianceTest, - ::testing::Values(make_tuple(4, 4, &vpx_sub_pixel_variance16x16_media, 0), - make_tuple(3, 3, &vpx_sub_pixel_variance8x8_media, 0))); -#endif // HAVE_MEDIA - #if HAVE_NEON INSTANTIATE_TEST_CASE_P(NEON, VpxSseTest, ::testing::Values(SseParams(2, 2, diff --git a/vp8/common/arm/armv6/bilinearfilter_v6.asm b/vp8/common/arm/armv6/bilinearfilter_v6.asm deleted file mode 100644 index 9704b4210..000000000 --- a/vp8/common/arm/armv6/bilinearfilter_v6.asm +++ /dev/null @@ -1,237 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_filter_block2d_bil_first_pass_armv6| - EXPORT |vp8_filter_block2d_bil_second_pass_armv6| - - AREA |.text|, CODE, READONLY ; name this block of code - -;------------------------------------- -; r0 unsigned char *src_ptr, -; r1 unsigned short *dst_ptr, -; r2 unsigned int src_pitch, -; r3 unsigned int height, -; stack unsigned int width, -; stack const short *vp8_filter -;------------------------------------- -; The output is transposed stroed in output array to make it easy for second pass filtering. -|vp8_filter_block2d_bil_first_pass_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; vp8_filter address - ldr r4, [sp, #36] ; width - - mov r12, r3 ; outer-loop counter - - add r7, r2, r4 ; preload next row - pld [r0, r7] - - sub r2, r2, r4 ; src increment for height loop - - ldr r5, [r11] ; load up filter coefficients - - mov r3, r3, lsl #1 ; height*2 - add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1) - - mov r11, r1 ; save dst_ptr for each row - - cmp r5, #128 ; if filter coef = 128, then skip the filter - beq bil_null_1st_filter - -|bil_height_loop_1st_v6| - ldrb r6, [r0] ; load source data - ldrb r7, [r0, #1] - ldrb r8, [r0, #2] - mov lr, r4, lsr #2 ; 4-in-parellel loop counter - -|bil_width_loop_1st_v6| - ldrb r9, [r0, #3] - ldrb r10, [r0, #4] - - pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0] - pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1] - - smuad r6, r6, r5 ; apply the filter - pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2] - smuad r7, r7, r5 - pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3] - - smuad r8, r8, r5 - smuad r9, r9, r5 - - add r0, r0, #4 - subs lr, lr, #1 - - add r6, r6, #0x40 ; round_shift_and_clamp - add r7, r7, #0x40 - usat r6, #16, r6, asr #7 - usat r7, #16, r7, asr #7 - - strh r6, [r1], r3 ; result is transposed and stored - - add r8, r8, #0x40 ; round_shift_and_clamp - strh r7, [r1], r3 - add r9, r9, #0x40 - usat r8, #16, r8, asr #7 - usat r9, #16, r9, asr #7 - - strh r8, [r1], r3 ; result is transposed and stored - - ldrneb r6, [r0] ; load source data - strh r9, [r1], r3 - - ldrneb r7, [r0, #1] - ldrneb r8, [r0, #2] - - bne bil_width_loop_1st_v6 - - add r0, r0, r2 ; move to next input row - subs r12, r12, #1 - - add r9, r2, r4, lsl #1 ; adding back block width - pld [r0, r9] ; preload next row - - add r11, r11, #2 ; move over to next column - mov r1, r11 - - bne bil_height_loop_1st_v6 - - ldmia sp!, {r4 - r11, pc} - -|bil_null_1st_filter| -|bil_height_loop_null_1st| - mov lr, r4, lsr #2 ; loop counter - -|bil_width_loop_null_1st| - ldrb r6, [r0] ; load data - ldrb r7, [r0, #1] - ldrb r8, [r0, #2] - ldrb r9, [r0, #3] - - strh r6, [r1], r3 ; store it to immediate buffer - add r0, r0, #4 - strh r7, [r1], r3 - subs lr, lr, #1 - strh r8, [r1], r3 - strh r9, [r1], r3 - - bne bil_width_loop_null_1st - - subs r12, r12, #1 - add r0, r0, r2 ; move to next input line - add r11, r11, #2 ; move over to next column - mov r1, r11 - - bne bil_height_loop_null_1st - - ldmia sp!, {r4 - r11, pc} - - ENDP ; |vp8_filter_block2d_bil_first_pass_armv6| - - -;--------------------------------- -; r0 unsigned short *src_ptr, -; r1 unsigned char *dst_ptr, -; r2 int dst_pitch, -; r3 unsigned int height, -; stack unsigned int width, -; stack const short *vp8_filter -;--------------------------------- -|vp8_filter_block2d_bil_second_pass_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; vp8_filter address - ldr r4, [sp, #36] ; width - - ldr r5, [r11] ; load up filter coefficients - mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix - mov r11, r1 - - cmp r5, #128 ; if filter coef = 128, then skip the filter - beq bil_null_2nd_filter - -|bil_height_loop_2nd| - ldr r6, [r0] ; load the data - ldr r8, [r0, #4] - ldrh r10, [r0, #8] - mov lr, r3, lsr #2 ; loop counter - -|bil_width_loop_2nd| - pkhtb r7, r6, r8 ; src[1] | src[2] - pkhtb r9, r8, r10 ; src[3] | src[4] - - smuad r6, r6, r5 ; apply filter - smuad r8, r8, r5 ; apply filter - - subs lr, lr, #1 - - smuadx r7, r7, r5 ; apply filter - smuadx r9, r9, r5 ; apply filter - - add r0, r0, #8 - - add r6, r6, #0x40 ; round_shift_and_clamp - add r7, r7, #0x40 - usat r6, #8, r6, asr #7 - usat r7, #8, r7, asr #7 - strb r6, [r1], r2 ; the result is transposed back and stored - - add r8, r8, #0x40 ; round_shift_and_clamp - strb r7, [r1], r2 - add r9, r9, #0x40 - usat r8, #8, r8, asr #7 - usat r9, #8, r9, asr #7 - strb r8, [r1], r2 ; the result is transposed back and stored - - ldrne r6, [r0] ; load data - strb r9, [r1], r2 - ldrne r8, [r0, #4] - ldrneh r10, [r0, #8] - - bne bil_width_loop_2nd - - subs r12, r12, #1 - add r0, r0, #4 ; update src for next row - add r11, r11, #1 - mov r1, r11 - - bne bil_height_loop_2nd - ldmia sp!, {r4 - r11, pc} - -|bil_null_2nd_filter| -|bil_height_loop_null_2nd| - mov lr, r3, lsr #2 - -|bil_width_loop_null_2nd| - ldr r6, [r0], #4 ; load data - subs lr, lr, #1 - ldr r8, [r0], #4 - - strb r6, [r1], r2 ; store data - mov r7, r6, lsr #16 - strb r7, [r1], r2 - mov r9, r8, lsr #16 - strb r8, [r1], r2 - strb r9, [r1], r2 - - bne bil_width_loop_null_2nd - - subs r12, r12, #1 - add r0, r0, #4 - add r11, r11, #1 - mov r1, r11 - - bne bil_height_loop_null_2nd - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_filter_block2d_second_pass_armv6| - - END diff --git a/vp8/common/arm/armv6/copymem16x16_v6.asm b/vp8/common/arm/armv6/copymem16x16_v6.asm deleted file mode 100644 index abf048c2f..000000000 --- a/vp8/common/arm/armv6/copymem16x16_v6.asm +++ /dev/null @@ -1,186 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_copy_mem16x16_v6| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem16x16_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp8_copy_mem16x16_v6| PROC - stmdb sp!, {r4 - r7} - ;push {r4-r7} - - ;preload - pld [r0, #31] ; preload for next 16x16 block - - ands r4, r0, #15 - beq copy_mem16x16_fast - - ands r4, r0, #7 - beq copy_mem16x16_8 - - ands r4, r0, #3 - beq copy_mem16x16_4 - - ;copy one byte each time - ldrb r4, [r0] - ldrb r5, [r0, #1] - ldrb r6, [r0, #2] - ldrb r7, [r0, #3] - - mov r12, #16 - -copy_mem16x16_1_loop - strb r4, [r2] - strb r5, [r2, #1] - strb r6, [r2, #2] - strb r7, [r2, #3] - - ldrb r4, [r0, #4] - ldrb r5, [r0, #5] - ldrb r6, [r0, #6] - ldrb r7, [r0, #7] - - subs r12, r12, #1 - - strb r4, [r2, #4] - strb r5, [r2, #5] - strb r6, [r2, #6] - strb r7, [r2, #7] - - ldrb r4, [r0, #8] - ldrb r5, [r0, #9] - ldrb r6, [r0, #10] - ldrb r7, [r0, #11] - - strb r4, [r2, #8] - strb r5, [r2, #9] - strb r6, [r2, #10] - strb r7, [r2, #11] - - ldrb r4, [r0, #12] - ldrb r5, [r0, #13] - ldrb r6, [r0, #14] - ldrb r7, [r0, #15] - - add r0, r0, r1 - - strb r4, [r2, #12] - strb r5, [r2, #13] - strb r6, [r2, #14] - strb r7, [r2, #15] - - add r2, r2, r3 - - ldrneb r4, [r0] - ldrneb r5, [r0, #1] - ldrneb r6, [r0, #2] - ldrneb r7, [r0, #3] - - pld [r0, #31] ; preload for next 16x16 block - - bne copy_mem16x16_1_loop - - ldmia sp!, {r4 - r7} - ;pop {r4-r7} - mov pc, lr - -;copy 4 bytes each time -copy_mem16x16_4 - ldr r4, [r0] - ldr r5, [r0, #4] - ldr r6, [r0, #8] - ldr r7, [r0, #12] - - mov r12, #16 - -copy_mem16x16_4_loop - subs r12, r12, #1 - add r0, r0, r1 - - str r4, [r2] - str r5, [r2, #4] - str r6, [r2, #8] - str r7, [r2, #12] - - add r2, r2, r3 - - ldrne r4, [r0] - ldrne r5, [r0, #4] - ldrne r6, [r0, #8] - ldrne r7, [r0, #12] - - pld [r0, #31] ; preload for next 16x16 block - - bne copy_mem16x16_4_loop - - ldmia sp!, {r4 - r7} - ;pop {r4-r7} - mov pc, lr - -;copy 8 bytes each time -copy_mem16x16_8 - sub r1, r1, #16 - sub r3, r3, #16 - - mov r12, #16 - -copy_mem16x16_8_loop - ldmia r0!, {r4-r5} - ;ldm r0, {r4-r5} - ldmia r0!, {r6-r7} - - add r0, r0, r1 - - stmia r2!, {r4-r5} - subs r12, r12, #1 - ;stm r2, {r4-r5} - stmia r2!, {r6-r7} - - add r2, r2, r3 - - pld [r0, #31] ; preload for next 16x16 block - bne copy_mem16x16_8_loop - - ldmia sp!, {r4 - r7} - ;pop {r4-r7} - mov pc, lr - -;copy 16 bytes each time -copy_mem16x16_fast - ;sub r1, r1, #16 - ;sub r3, r3, #16 - - mov r12, #16 - -copy_mem16x16_fast_loop - ldmia r0, {r4-r7} - ;ldm r0, {r4-r7} - add r0, r0, r1 - - subs r12, r12, #1 - stmia r2, {r4-r7} - ;stm r2, {r4-r7} - add r2, r2, r3 - - pld [r0, #31] ; preload for next 16x16 block - bne copy_mem16x16_fast_loop - - ldmia sp!, {r4 - r7} - ;pop {r4-r7} - mov pc, lr - - ENDP ; |vp8_copy_mem16x16_v6| - - END diff --git a/vp8/common/arm/armv6/copymem8x4_v6.asm b/vp8/common/arm/armv6/copymem8x4_v6.asm deleted file mode 100644 index d8362ef05..000000000 --- a/vp8/common/arm/armv6/copymem8x4_v6.asm +++ /dev/null @@ -1,128 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_copy_mem8x4_v6| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void vp8_copy_mem8x4_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp8_copy_mem8x4_v6| PROC - ;push {r4-r5} - stmdb sp!, {r4-r5} - - ;preload - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - ands r4, r0, #7 - beq copy_mem8x4_fast - - ands r4, r0, #3 - beq copy_mem8x4_4 - - ;copy 1 byte each time - ldrb r4, [r0] - ldrb r5, [r0, #1] - - mov r12, #4 - -copy_mem8x4_1_loop - strb r4, [r2] - strb r5, [r2, #1] - - ldrb r4, [r0, #2] - ldrb r5, [r0, #3] - - subs r12, r12, #1 - - strb r4, [r2, #2] - strb r5, [r2, #3] - - ldrb r4, [r0, #4] - ldrb r5, [r0, #5] - - strb r4, [r2, #4] - strb r5, [r2, #5] - - ldrb r4, [r0, #6] - ldrb r5, [r0, #7] - - add r0, r0, r1 - - strb r4, [r2, #6] - strb r5, [r2, #7] - - add r2, r2, r3 - - ldrneb r4, [r0] - ldrneb r5, [r0, #1] - - bne copy_mem8x4_1_loop - - ldmia sp!, {r4 - r5} - ;pop {r4-r5} - mov pc, lr - -;copy 4 bytes each time -copy_mem8x4_4 - ldr r4, [r0] - ldr r5, [r0, #4] - - mov r12, #4 - -copy_mem8x4_4_loop - subs r12, r12, #1 - add r0, r0, r1 - - str r4, [r2] - str r5, [r2, #4] - - add r2, r2, r3 - - ldrne r4, [r0] - ldrne r5, [r0, #4] - - bne copy_mem8x4_4_loop - - ldmia sp!, {r4-r5} - ;pop {r4-r5} - mov pc, lr - -;copy 8 bytes each time -copy_mem8x4_fast - ;sub r1, r1, #8 - ;sub r3, r3, #8 - - mov r12, #4 - -copy_mem8x4_fast_loop - ldmia r0, {r4-r5} - ;ldm r0, {r4-r5} - add r0, r0, r1 - - subs r12, r12, #1 - stmia r2, {r4-r5} - ;stm r2, {r4-r5} - add r2, r2, r3 - - bne copy_mem8x4_fast_loop - - ldmia sp!, {r4-r5} - ;pop {r4-r5} - mov pc, lr - - ENDP ; |vp8_copy_mem8x4_v6| - - END diff --git a/vp8/common/arm/armv6/copymem8x8_v6.asm b/vp8/common/arm/armv6/copymem8x8_v6.asm deleted file mode 100644 index c6a60c610..000000000 --- a/vp8/common/arm/armv6/copymem8x8_v6.asm +++ /dev/null @@ -1,128 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_copy_mem8x8_v6| - ; ARM - ; REQUIRE8 - ; PRESERVE8 - - AREA Block, CODE, READONLY ; name this block of code -;void copy_mem8x8_v6( unsigned char *src, int src_stride, unsigned char *dst, int dst_stride) -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= -|vp8_copy_mem8x8_v6| PROC - ;push {r4-r5} - stmdb sp!, {r4-r5} - - ;preload - pld [r0] - pld [r0, r1] - pld [r0, r1, lsl #1] - - ands r4, r0, #7 - beq copy_mem8x8_fast - - ands r4, r0, #3 - beq copy_mem8x8_4 - - ;copy 1 byte each time - ldrb r4, [r0] - ldrb r5, [r0, #1] - - mov r12, #8 - -copy_mem8x8_1_loop - strb r4, [r2] - strb r5, [r2, #1] - - ldrb r4, [r0, #2] - ldrb r5, [r0, #3] - - subs r12, r12, #1 - - strb r4, [r2, #2] - strb r5, [r2, #3] - - ldrb r4, [r0, #4] - ldrb r5, [r0, #5] - - strb r4, [r2, #4] - strb r5, [r2, #5] - - ldrb r4, [r0, #6] - ldrb r5, [r0, #7] - - add r0, r0, r1 - - strb r4, [r2, #6] - strb r5, [r2, #7] - - add r2, r2, r3 - - ldrneb r4, [r0] - ldrneb r5, [r0, #1] - - bne copy_mem8x8_1_loop - - ldmia sp!, {r4 - r5} - ;pop {r4-r5} - mov pc, lr - -;copy 4 bytes each time -copy_mem8x8_4 - ldr r4, [r0] - ldr r5, [r0, #4] - - mov r12, #8 - -copy_mem8x8_4_loop - subs r12, r12, #1 - add r0, r0, r1 - - str r4, [r2] - str r5, [r2, #4] - - add r2, r2, r3 - - ldrne r4, [r0] - ldrne r5, [r0, #4] - - bne copy_mem8x8_4_loop - - ldmia sp!, {r4 - r5} - ;pop {r4-r5} - mov pc, lr - -;copy 8 bytes each time -copy_mem8x8_fast - ;sub r1, r1, #8 - ;sub r3, r3, #8 - - mov r12, #8 - -copy_mem8x8_fast_loop - ldmia r0, {r4-r5} - ;ldm r0, {r4-r5} - add r0, r0, r1 - - subs r12, r12, #1 - stmia r2, {r4-r5} - ;stm r2, {r4-r5} - add r2, r2, r3 - - bne copy_mem8x8_fast_loop - - ldmia sp!, {r4-r5} - ;pop {r4-r5} - mov pc, lr - - ENDP ; |vp8_copy_mem8x8_v6| - - END diff --git a/vp8/common/arm/armv6/dc_only_idct_add_v6.asm b/vp8/common/arm/armv6/dc_only_idct_add_v6.asm deleted file mode 100644 index 9aa659fa7..000000000 --- a/vp8/common/arm/armv6/dc_only_idct_add_v6.asm +++ /dev/null @@ -1,70 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - EXPORT |vp8_dc_only_idct_add_v6| - - AREA |.text|, CODE, READONLY - -;void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, -; int pred_stride, unsigned char *dst_ptr, -; int dst_stride) -; r0 input_dc -; r1 pred_ptr -; r2 pred_stride -; r3 dst_ptr -; sp dst_stride - -|vp8_dc_only_idct_add_v6| PROC - stmdb sp!, {r4 - r7} - - add r0, r0, #4 ; input_dc += 4 - ldr r12, c0x0000FFFF - ldr r4, [r1], r2 - and r0, r12, r0, asr #3 ; input_dc >> 3 + mask - ldr r6, [r1], r2 - orr r0, r0, r0, lsl #16 ; a1 | a1 - - ldr r12, [sp, #16] ; dst stride - - uxtab16 r5, r0, r4 ; a1+2 | a1+0 - uxtab16 r4, r0, r4, ror #8 ; a1+3 | a1+1 - uxtab16 r7, r0, r6 - uxtab16 r6, r0, r6, ror #8 - usat16 r5, #8, r5 - usat16 r4, #8, r4 - usat16 r7, #8, r7 - usat16 r6, #8, r6 - orr r5, r5, r4, lsl #8 - orr r7, r7, r6, lsl #8 - ldr r4, [r1], r2 - str r5, [r3], r12 - ldr r6, [r1] - str r7, [r3], r12 - - uxtab16 r5, r0, r4 - uxtab16 r4, r0, r4, ror #8 - uxtab16 r7, r0, r6 - uxtab16 r6, r0, r6, ror #8 - usat16 r5, #8, r5 - usat16 r4, #8, r4 - usat16 r7, #8, r7 - usat16 r6, #8, r6 - orr r5, r5, r4, lsl #8 - orr r7, r7, r6, lsl #8 - str r5, [r3], r12 - str r7, [r3] - - ldmia sp!, {r4 - r7} - bx lr - - ENDP ; |vp8_dc_only_idct_add_v6| - -; Constant Pool -c0x0000FFFF DCD 0x0000FFFF - END diff --git a/vp8/common/arm/armv6/dequant_idct_v6.asm b/vp8/common/arm/armv6/dequant_idct_v6.asm deleted file mode 100644 index db48ded58..000000000 --- a/vp8/common/arm/armv6/dequant_idct_v6.asm +++ /dev/null @@ -1,190 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license and patent -; grant that can be found in the LICENSE file in the root of the source -; tree. All contributing project authors may be found in the AUTHORS -; file in the root of the source tree. -; - - EXPORT |vp8_dequant_idct_add_v6| - - AREA |.text|, CODE, READONLY -;void vp8_dequant_idct_v6(short *input, short *dq, -; unsigned char *dest, int stride) -; r0 = q -; r1 = dq -; r2 = dst -; r3 = stride - -|vp8_dequant_idct_add_v6| PROC - stmdb sp!, {r4-r11, lr} - - ldr r4, [r0] ;input - ldr r5, [r1], #4 ;dq - - sub sp, sp, #4 - str r3, [sp] - - mov r12, #4 - -vp8_dequant_add_loop - smulbb r6, r4, r5 - smultt r7, r4, r5 - - ldr r4, [r0, #4] ;input - ldr r5, [r1], #4 ;dq - - strh r6, [r0], #2 - strh r7, [r0], #2 - - smulbb r6, r4, r5 - smultt r7, r4, r5 - - subs r12, r12, #1 - - ldrne r4, [r0, #4] - ldrne r5, [r1], #4 - - strh r6, [r0], #2 - strh r7, [r0], #2 - - bne vp8_dequant_add_loop - - sub r0, r0, #32 - mov r1, r0 - -; short_idct4x4llm_v6_dual - ldr r3, cospi8sqrt2minus1 - ldr r4, sinpi8sqrt2 - ldr r6, [r0, #8] - mov r5, #2 -vp8_dequant_idct_loop1_v6 - ldr r12, [r0, #24] - ldr r14, [r0, #16] - smulwt r9, r3, r6 - smulwb r7, r3, r6 - smulwt r10, r4, r6 - smulwb r8, r4, r6 - pkhbt r7, r7, r9, lsl #16 - smulwt r11, r3, r12 - pkhbt r8, r8, r10, lsl #16 - uadd16 r6, r6, r7 - smulwt r7, r4, r12 - smulwb r9, r3, r12 - smulwb r10, r4, r12 - subs r5, r5, #1 - pkhbt r9, r9, r11, lsl #16 - ldr r11, [r0], #4 - pkhbt r10, r10, r7, lsl #16 - uadd16 r7, r12, r9 - usub16 r7, r8, r7 - uadd16 r6, r6, r10 - uadd16 r10, r11, r14 - usub16 r8, r11, r14 - uadd16 r9, r10, r6 - usub16 r10, r10, r6 - uadd16 r6, r8, r7 - usub16 r7, r8, r7 - str r6, [r1, #8] - ldrne r6, [r0, #8] - str r7, [r1, #16] - str r10, [r1, #24] - str r9, [r1], #4 - bne vp8_dequant_idct_loop1_v6 - - mov r5, #2 - sub r0, r1, #8 -vp8_dequant_idct_loop2_v6 - ldr r6, [r0], #4 - ldr r7, [r0], #4 - ldr r8, [r0], #4 - ldr r9, [r0], #4 - smulwt r1, r3, r6 - smulwt r12, r4, r6 - smulwt lr, r3, r8 - smulwt r10, r4, r8 - pkhbt r11, r8, r6, lsl #16 - pkhbt r1, lr, r1, lsl #16 - pkhbt r12, r10, r12, lsl #16 - pkhtb r6, r6, r8, asr #16 - uadd16 r6, r1, r6 - pkhbt lr, r9, r7, lsl #16 - uadd16 r10, r11, lr - usub16 lr, r11, lr - pkhtb r8, r7, r9, asr #16 - subs r5, r5, #1 - smulwt r1, r3, r8 - smulwb r7, r3, r8 - smulwt r11, r4, r8 - smulwb r9, r4, r8 - pkhbt r1, r7, r1, lsl #16 - uadd16 r8, r1, r8 - pkhbt r11, r9, r11, lsl #16 - usub16 r1, r12, r8 - uadd16 r8, r11, r6 - ldr r9, c0x00040004 - ldr r12, [sp] ; get stride from stack - uadd16 r6, r10, r8 - usub16 r7, r10, r8 - uadd16 r7, r7, r9 - uadd16 r6, r6, r9 - uadd16 r10, r14, r1 - usub16 r1, r14, r1 - uadd16 r10, r10, r9 - uadd16 r1, r1, r9 - ldr r11, [r2] ; load input from dst - mov r8, r7, asr #3 - pkhtb r9, r8, r10, asr #19 - mov r8, r1, asr #3 - pkhtb r8, r8, r6, asr #19 - uxtb16 lr, r11, ror #8 - qadd16 r9, r9, lr - uxtb16 lr, r11 - qadd16 r8, r8, lr - usat16 r9, #8, r9 - usat16 r8, #8, r8 - orr r9, r8, r9, lsl #8 - ldr r11, [r2, r12] ; load input from dst - mov r7, r7, lsl #16 - mov r1, r1, lsl #16 - mov r10, r10, lsl #16 - mov r6, r6, lsl #16 - mov r7, r7, asr #3 - pkhtb r7, r7, r10, asr #19 - mov r1, r1, asr #3 - pkhtb r1, r1, r6, asr #19 - uxtb16 r8, r11, ror #8 - qadd16 r7, r7, r8 - uxtb16 r8, r11 - qadd16 r1, r1, r8 - usat16 r7, #8, r7 - usat16 r1, #8, r1 - orr r1, r1, r7, lsl #8 - str r9, [r2], r12 ; store output to dst - str r1, [r2], r12 ; store output to dst - bne vp8_dequant_idct_loop2_v6 - -; memset - sub r0, r0, #32 - add sp, sp, #4 - - mov r12, #0 - str r12, [r0] - str r12, [r0, #4] - str r12, [r0, #8] - str r12, [r0, #12] - str r12, [r0, #16] - str r12, [r0, #20] - str r12, [r0, #24] - str r12, [r0, #28] - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_dequant_idct_add_v6| - -; Constant Pool -cospi8sqrt2minus1 DCD 0x00004E7B -sinpi8sqrt2 DCD 0x00008A8C -c0x00040004 DCD 0x00040004 - - END diff --git a/vp8/common/arm/armv6/dequantize_v6.asm b/vp8/common/arm/armv6/dequantize_v6.asm deleted file mode 100644 index 72f7e0ee5..000000000 --- a/vp8/common/arm/armv6/dequantize_v6.asm +++ /dev/null @@ -1,69 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_dequantize_b_loop_v6| - - AREA |.text|, CODE, READONLY ; name this block of code -;------------------------------- -;void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ); -; r0 short *Q, -; r1 short *DQC -; r2 short *DQ -|vp8_dequantize_b_loop_v6| PROC - stmdb sp!, {r4-r9, lr} - - ldr r3, [r0] ;load Q - ldr r4, [r1] ;load DQC - ldr r5, [r0, #4] - ldr r6, [r1, #4] - - mov r12, #2 ;loop counter - -dequant_loop - smulbb r7, r3, r4 ;multiply - smultt r8, r3, r4 - smulbb r9, r5, r6 - smultt lr, r5, r6 - - ldr r3, [r0, #8] - ldr r4, [r1, #8] - ldr r5, [r0, #12] - ldr r6, [r1, #12] - - strh r7, [r2], #2 ;store result - smulbb r7, r3, r4 ;multiply - strh r8, [r2], #2 - smultt r8, r3, r4 - strh r9, [r2], #2 - smulbb r9, r5, r6 - strh lr, [r2], #2 - smultt lr, r5, r6 - - subs r12, r12, #1 - - add r0, r0, #16 - add r1, r1, #16 - - ldrne r3, [r0] - strh r7, [r2], #2 ;store result - ldrne r4, [r1] - strh r8, [r2], #2 - ldrne r5, [r0, #4] - strh r9, [r2], #2 - ldrne r6, [r1, #4] - strh lr, [r2], #2 - - bne dequant_loop - - ldmia sp!, {r4-r9, pc} - ENDP ;|vp8_dequantize_b_loop_v6| - - END diff --git a/vp8/common/arm/armv6/filter_v6.asm b/vp8/common/arm/armv6/filter_v6.asm deleted file mode 100644 index eb4b75bd8..000000000 --- a/vp8/common/arm/armv6/filter_v6.asm +++ /dev/null @@ -1,624 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_filter_block2d_first_pass_armv6| - EXPORT |vp8_filter_block2d_first_pass_16x16_armv6| - EXPORT |vp8_filter_block2d_first_pass_8x8_armv6| - EXPORT |vp8_filter_block2d_second_pass_armv6| - EXPORT |vp8_filter4_block2d_second_pass_armv6| - EXPORT |vp8_filter_block2d_first_pass_only_armv6| - EXPORT |vp8_filter_block2d_second_pass_only_armv6| - - AREA |.text|, CODE, READONLY ; name this block of code -;------------------------------------- -; r0 unsigned char *src_ptr -; r1 short *output_ptr -; r2 unsigned int src_pixels_per_line -; r3 unsigned int output_width -; stack unsigned int output_height -; stack const short *vp8_filter -;------------------------------------- -; vp8_filter the input and put in the output array. Apply the 6 tap FIR filter with -; the output being a 2 byte value and the intput being a 1 byte value. -|vp8_filter_block2d_first_pass_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; vp8_filter address - ldr r7, [sp, #36] ; output height - - sub r2, r2, r3 ; inside loop increments input array, - ; so the height loop only needs to add - ; r2 - width to the input pointer - - mov r3, r3, lsl #1 ; multiply width by 2 because using shorts - add r12, r3, #16 ; square off the output - sub sp, sp, #4 - - ldr r4, [r11] ; load up packed filter coefficients - ldr r5, [r11, #4] - ldr r6, [r11, #8] - - str r1, [sp] ; push destination to stack - mov r7, r7, lsl #16 ; height is top part of counter - -; six tap filter -|height_loop_1st_6| - ldrb r8, [r0, #-2] ; load source data - ldrb r9, [r0, #-1] - ldrb r10, [r0], #2 - orr r7, r7, r3, lsr #2 ; construct loop counter - -|width_loop_1st_6| - ldrb r11, [r0, #-1] - - pkhbt lr, r8, r9, lsl #16 ; r9 | r8 - pkhbt r8, r9, r10, lsl #16 ; r10 | r9 - - ldrb r9, [r0] - - smuad lr, lr, r4 ; apply the filter - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - smuad r8, r8, r4 - pkhbt r11, r11, r9, lsl #16 ; r9 | r11 - - smlad lr, r10, r5, lr - ldrb r10, [r0, #1] - smlad r8, r11, r5, r8 - ldrb r11, [r0, #2] - - sub r7, r7, #1 - - pkhbt r9, r9, r10, lsl #16 ; r10 | r9 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - - smlad lr, r9, r6, lr - smlad r11, r10, r6, r8 - - ands r10, r7, #0xff ; test loop counter - - add lr, lr, #0x40 ; round_shift_and_clamp - ldrneb r8, [r0, #-2] ; load data for next loop - usat lr, #8, lr, asr #7 - add r11, r11, #0x40 - ldrneb r9, [r0, #-1] - usat r11, #8, r11, asr #7 - - strh lr, [r1], r12 ; result is transposed and stored, which - ; will make second pass filtering easier. - ldrneb r10, [r0], #2 - strh r11, [r1], r12 - - bne width_loop_1st_6 - - ldr r1, [sp] ; load and update dst address - subs r7, r7, #0x10000 - add r0, r0, r2 ; move to next input line - - add r1, r1, #2 ; move over to next column - str r1, [sp] - - bne height_loop_1st_6 - - add sp, sp, #4 - ldmia sp!, {r4 - r11, pc} - - ENDP - -; -------------------------- -; 16x16 version -; ----------------------------- -|vp8_filter_block2d_first_pass_16x16_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; vp8_filter address - ldr r7, [sp, #36] ; output height - - add r4, r2, #18 ; preload next low - pld [r0, r4] - - sub r2, r2, r3 ; inside loop increments input array, - ; so the height loop only needs to add - ; r2 - width to the input pointer - - mov r3, r3, lsl #1 ; multiply width by 2 because using shorts - add r12, r3, #16 ; square off the output - sub sp, sp, #4 - - ldr r4, [r11] ; load up packed filter coefficients - ldr r5, [r11, #4] - ldr r6, [r11, #8] - - str r1, [sp] ; push destination to stack - mov r7, r7, lsl #16 ; height is top part of counter - -; six tap filter -|height_loop_1st_16_6| - ldrb r8, [r0, #-2] ; load source data - ldrb r9, [r0, #-1] - ldrb r10, [r0], #2 - orr r7, r7, r3, lsr #2 ; construct loop counter - -|width_loop_1st_16_6| - ldrb r11, [r0, #-1] - - pkhbt lr, r8, r9, lsl #16 ; r9 | r8 - pkhbt r8, r9, r10, lsl #16 ; r10 | r9 - - ldrb r9, [r0] - - smuad lr, lr, r4 ; apply the filter - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - smuad r8, r8, r4 - pkhbt r11, r11, r9, lsl #16 ; r9 | r11 - - smlad lr, r10, r5, lr - ldrb r10, [r0, #1] - smlad r8, r11, r5, r8 - ldrb r11, [r0, #2] - - sub r7, r7, #1 - - pkhbt r9, r9, r10, lsl #16 ; r10 | r9 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - - smlad lr, r9, r6, lr - smlad r11, r10, r6, r8 - - ands r10, r7, #0xff ; test loop counter - - add lr, lr, #0x40 ; round_shift_and_clamp - ldrneb r8, [r0, #-2] ; load data for next loop - usat lr, #8, lr, asr #7 - add r11, r11, #0x40 - ldrneb r9, [r0, #-1] - usat r11, #8, r11, asr #7 - - strh lr, [r1], r12 ; result is transposed and stored, which - ; will make second pass filtering easier. - ldrneb r10, [r0], #2 - strh r11, [r1], r12 - - bne width_loop_1st_16_6 - - ldr r1, [sp] ; load and update dst address - subs r7, r7, #0x10000 - add r0, r0, r2 ; move to next input line - - add r11, r2, #34 ; adding back block width(=16) - pld [r0, r11] ; preload next low - - add r1, r1, #2 ; move over to next column - str r1, [sp] - - bne height_loop_1st_16_6 - - add sp, sp, #4 - ldmia sp!, {r4 - r11, pc} - - ENDP - -; -------------------------- -; 8x8 version -; ----------------------------- -|vp8_filter_block2d_first_pass_8x8_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; vp8_filter address - ldr r7, [sp, #36] ; output height - - add r4, r2, #10 ; preload next low - pld [r0, r4] - - sub r2, r2, r3 ; inside loop increments input array, - ; so the height loop only needs to add - ; r2 - width to the input pointer - - mov r3, r3, lsl #1 ; multiply width by 2 because using shorts - add r12, r3, #16 ; square off the output - sub sp, sp, #4 - - ldr r4, [r11] ; load up packed filter coefficients - ldr r5, [r11, #4] - ldr r6, [r11, #8] - - str r1, [sp] ; push destination to stack - mov r7, r7, lsl #16 ; height is top part of counter - -; six tap filter -|height_loop_1st_8_6| - ldrb r8, [r0, #-2] ; load source data - ldrb r9, [r0, #-1] - ldrb r10, [r0], #2 - orr r7, r7, r3, lsr #2 ; construct loop counter - -|width_loop_1st_8_6| - ldrb r11, [r0, #-1] - - pkhbt lr, r8, r9, lsl #16 ; r9 | r8 - pkhbt r8, r9, r10, lsl #16 ; r10 | r9 - - ldrb r9, [r0] - - smuad lr, lr, r4 ; apply the filter - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - smuad r8, r8, r4 - pkhbt r11, r11, r9, lsl #16 ; r9 | r11 - - smlad lr, r10, r5, lr - ldrb r10, [r0, #1] - smlad r8, r11, r5, r8 - ldrb r11, [r0, #2] - - sub r7, r7, #1 - - pkhbt r9, r9, r10, lsl #16 ; r10 | r9 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - - smlad lr, r9, r6, lr - smlad r11, r10, r6, r8 - - ands r10, r7, #0xff ; test loop counter - - add lr, lr, #0x40 ; round_shift_and_clamp - ldrneb r8, [r0, #-2] ; load data for next loop - usat lr, #8, lr, asr #7 - add r11, r11, #0x40 - ldrneb r9, [r0, #-1] - usat r11, #8, r11, asr #7 - - strh lr, [r1], r12 ; result is transposed and stored, which - ; will make second pass filtering easier. - ldrneb r10, [r0], #2 - strh r11, [r1], r12 - - bne width_loop_1st_8_6 - - ldr r1, [sp] ; load and update dst address - subs r7, r7, #0x10000 - add r0, r0, r2 ; move to next input line - - add r11, r2, #18 ; adding back block width(=8) - pld [r0, r11] ; preload next low - - add r1, r1, #2 ; move over to next column - str r1, [sp] - - bne height_loop_1st_8_6 - - add sp, sp, #4 - ldmia sp!, {r4 - r11, pc} - - ENDP - -;--------------------------------- -; r0 short *src_ptr, -; r1 unsigned char *output_ptr, -; r2 unsigned int output_pitch, -; r3 unsigned int cnt, -; stack const short *vp8_filter -;--------------------------------- -|vp8_filter_block2d_second_pass_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #36] ; vp8_filter address - sub sp, sp, #4 - mov r7, r3, lsl #16 ; height is top part of counter - str r1, [sp] ; push destination to stack - - ldr r4, [r11] ; load up packed filter coefficients - ldr r5, [r11, #4] - ldr r6, [r11, #8] - - pkhbt r12, r5, r4 ; pack the filter differently - pkhbt r11, r6, r5 - - sub r0, r0, #4 ; offset input buffer - -|height_loop_2nd| - ldr r8, [r0] ; load the data - ldr r9, [r0, #4] - orr r7, r7, r3, lsr #1 ; loop counter - -|width_loop_2nd| - smuad lr, r4, r8 ; apply filter - sub r7, r7, #1 - smulbt r8, r4, r8 - - ldr r10, [r0, #8] - - smlad lr, r5, r9, lr - smladx r8, r12, r9, r8 - - ldrh r9, [r0, #12] - - smlad lr, r6, r10, lr - smladx r8, r11, r10, r8 - - add r0, r0, #4 - smlatb r10, r6, r9, r8 - - add lr, lr, #0x40 ; round_shift_and_clamp - ands r8, r7, #0xff - usat lr, #8, lr, asr #7 - add r10, r10, #0x40 - strb lr, [r1], r2 ; the result is transposed back and stored - usat r10, #8, r10, asr #7 - - ldrne r8, [r0] ; load data for next loop - ldrne r9, [r0, #4] - strb r10, [r1], r2 - - bne width_loop_2nd - - ldr r1, [sp] ; update dst for next loop - subs r7, r7, #0x10000 - add r0, r0, #16 ; updata src for next loop - add r1, r1, #1 - str r1, [sp] - - bne height_loop_2nd - - add sp, sp, #4 - ldmia sp!, {r4 - r11, pc} - - ENDP - -;--------------------------------- -; r0 short *src_ptr, -; r1 unsigned char *output_ptr, -; r2 unsigned int output_pitch, -; r3 unsigned int cnt, -; stack const short *vp8_filter -;--------------------------------- -|vp8_filter4_block2d_second_pass_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #36] ; vp8_filter address - mov r7, r3, lsl #16 ; height is top part of counter - - ldr r4, [r11] ; load up packed filter coefficients - add lr, r1, r3 ; save final destination pointer - ldr r5, [r11, #4] - ldr r6, [r11, #8] - - pkhbt r12, r5, r4 ; pack the filter differently - pkhbt r11, r6, r5 - mov r4, #0x40 ; rounding factor (for smlad{x}) - -|height_loop_2nd_4| - ldrd r8, r9, [r0, #-4] ; load the data - orr r7, r7, r3, lsr #1 ; loop counter - -|width_loop_2nd_4| - ldr r10, [r0, #4]! - smladx r6, r9, r12, r4 ; apply filter - pkhbt r8, r9, r8 - smlad r5, r8, r12, r4 - pkhbt r8, r10, r9 - smladx r6, r10, r11, r6 - sub r7, r7, #1 - smlad r5, r8, r11, r5 - - mov r8, r9 ; shift the data for the next loop - mov r9, r10 - - usat r6, #8, r6, asr #7 ; shift and clamp - usat r5, #8, r5, asr #7 - - strb r5, [r1], r2 ; the result is transposed back and stored - tst r7, #0xff - strb r6, [r1], r2 - - bne width_loop_2nd_4 - - subs r7, r7, #0x10000 - add r0, r0, #16 ; update src for next loop - sub r1, lr, r7, lsr #16 ; update dst for next loop - - bne height_loop_2nd_4 - - ldmia sp!, {r4 - r11, pc} - - ENDP - -;------------------------------------ -; r0 unsigned char *src_ptr -; r1 unsigned char *output_ptr, -; r2 unsigned int src_pixels_per_line -; r3 unsigned int cnt, -; stack unsigned int output_pitch, -; stack const short *vp8_filter -;------------------------------------ -|vp8_filter_block2d_first_pass_only_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - add r7, r2, r3 ; preload next low - add r7, r7, #2 - pld [r0, r7] - - ldr r4, [sp, #36] ; output pitch - ldr r11, [sp, #40] ; HFilter address - sub sp, sp, #8 - - mov r7, r3 - sub r2, r2, r3 ; inside loop increments input array, - ; so the height loop only needs to add - ; r2 - width to the input pointer - - sub r4, r4, r3 - str r4, [sp] ; save modified output pitch - str r2, [sp, #4] - - mov r2, #0x40 - - ldr r4, [r11] ; load up packed filter coefficients - ldr r5, [r11, #4] - ldr r6, [r11, #8] - -; six tap filter -|height_loop_1st_only_6| - ldrb r8, [r0, #-2] ; load data - ldrb r9, [r0, #-1] - ldrb r10, [r0], #2 - - mov r12, r3, lsr #1 ; loop counter - -|width_loop_1st_only_6| - ldrb r11, [r0, #-1] - - pkhbt lr, r8, r9, lsl #16 ; r9 | r8 - pkhbt r8, r9, r10, lsl #16 ; r10 | r9 - - ldrb r9, [r0] - -;; smuad lr, lr, r4 - smlad lr, lr, r4, r2 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 -;; smuad r8, r8, r4 - smlad r8, r8, r4, r2 - pkhbt r11, r11, r9, lsl #16 ; r9 | r11 - - smlad lr, r10, r5, lr - ldrb r10, [r0, #1] - smlad r8, r11, r5, r8 - ldrb r11, [r0, #2] - - subs r12, r12, #1 - - pkhbt r9, r9, r10, lsl #16 ; r10 | r9 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - - smlad lr, r9, r6, lr - smlad r10, r10, r6, r8 - -;; add lr, lr, #0x40 ; round_shift_and_clamp - ldrneb r8, [r0, #-2] ; load data for next loop - usat lr, #8, lr, asr #7 -;; add r10, r10, #0x40 - strb lr, [r1], #1 ; store the result - usat r10, #8, r10, asr #7 - - ldrneb r9, [r0, #-1] - strb r10, [r1], #1 - ldrneb r10, [r0], #2 - - bne width_loop_1st_only_6 - - ldr lr, [sp] ; load back output pitch - ldr r12, [sp, #4] ; load back output pitch - subs r7, r7, #1 - add r0, r0, r12 ; updata src for next loop - - add r11, r12, r3 ; preload next low - add r11, r11, #2 - pld [r0, r11] - - add r1, r1, lr ; update dst for next loop - - bne height_loop_1st_only_6 - - add sp, sp, #8 - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_filter_block2d_first_pass_only_armv6| - - -;------------------------------------ -; r0 unsigned char *src_ptr, -; r1 unsigned char *output_ptr, -; r2 unsigned int src_pixels_per_line -; r3 unsigned int cnt, -; stack unsigned int output_pitch, -; stack const short *vp8_filter -;------------------------------------ -|vp8_filter_block2d_second_pass_only_armv6| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; VFilter address - ldr r12, [sp, #36] ; output pitch - - mov r7, r3, lsl #16 ; height is top part of counter - sub r0, r0, r2, lsl #1 ; need 6 elements for filtering, 2 before, 3 after - - sub sp, sp, #8 - - ldr r4, [r11] ; load up packed filter coefficients - ldr r5, [r11, #4] - ldr r6, [r11, #8] - - str r0, [sp] ; save r0 to stack - str r1, [sp, #4] ; save dst to stack - -; six tap filter -|width_loop_2nd_only_6| - ldrb r8, [r0], r2 ; load data - orr r7, r7, r3 ; loop counter - ldrb r9, [r0], r2 - ldrb r10, [r0], r2 - -|height_loop_2nd_only_6| - ; filter first column in this inner loop, than, move to next colum. - ldrb r11, [r0], r2 - - pkhbt lr, r8, r9, lsl #16 ; r9 | r8 - pkhbt r8, r9, r10, lsl #16 ; r10 | r9 - - ldrb r9, [r0], r2 - - smuad lr, lr, r4 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - smuad r8, r8, r4 - pkhbt r11, r11, r9, lsl #16 ; r9 | r11 - - smlad lr, r10, r5, lr - ldrb r10, [r0], r2 - smlad r8, r11, r5, r8 - ldrb r11, [r0] - - sub r7, r7, #2 - sub r0, r0, r2, lsl #2 - - pkhbt r9, r9, r10, lsl #16 ; r10 | r9 - pkhbt r10, r10, r11, lsl #16 ; r11 | r10 - - smlad lr, r9, r6, lr - smlad r10, r10, r6, r8 - - ands r9, r7, #0xff - - add lr, lr, #0x40 ; round_shift_and_clamp - ldrneb r8, [r0], r2 ; load data for next loop - usat lr, #8, lr, asr #7 - add r10, r10, #0x40 - strb lr, [r1], r12 ; store the result for the column - usat r10, #8, r10, asr #7 - - ldrneb r9, [r0], r2 - strb r10, [r1], r12 - ldrneb r10, [r0], r2 - - bne height_loop_2nd_only_6 - - ldr r0, [sp] - ldr r1, [sp, #4] - subs r7, r7, #0x10000 - add r0, r0, #1 ; move to filter next column - str r0, [sp] - add r1, r1, #1 - str r1, [sp, #4] - - bne width_loop_2nd_only_6 - - add sp, sp, #8 - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_filter_block2d_second_pass_only_armv6| - - END diff --git a/vp8/common/arm/armv6/idct_blk_v6.c b/vp8/common/arm/armv6/idct_blk_v6.c deleted file mode 100644 index 14a1273e2..000000000 --- a/vp8/common/arm/armv6/idct_blk_v6.c +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_config.h" -#include "vp8_rtcd.h" - -void vp8_dequant_idct_add_y_block_v6(short *q, short *dq, unsigned char *dst, - int stride, char *eobs) { - int i; - - for (i = 0; i < 4; ++i) { - if (eobs[0] > 1) - vp8_dequant_idct_add_v6(q, dq, dst, stride); - else if (eobs[0] == 1) { - vp8_dc_only_idct_add_v6(q[0] * dq[0], dst, stride, dst, stride); - ((int *)q)[0] = 0; - } - - if (eobs[1] > 1) - vp8_dequant_idct_add_v6(q + 16, dq, dst + 4, stride); - else if (eobs[1] == 1) { - vp8_dc_only_idct_add_v6(q[16] * dq[0], dst + 4, stride, dst + 4, stride); - ((int *)(q + 16))[0] = 0; - } - - if (eobs[2] > 1) - vp8_dequant_idct_add_v6(q + 32, dq, dst + 8, stride); - else if (eobs[2] == 1) { - vp8_dc_only_idct_add_v6(q[32] * dq[0], dst + 8, stride, dst + 8, stride); - ((int *)(q + 32))[0] = 0; - } - - if (eobs[3] > 1) - vp8_dequant_idct_add_v6(q + 48, dq, dst + 12, stride); - else if (eobs[3] == 1) { - vp8_dc_only_idct_add_v6(q[48] * dq[0], dst + 12, stride, dst + 12, - stride); - ((int *)(q + 48))[0] = 0; - } - - q += 64; - dst += 4 * stride; - eobs += 4; - } -} - -void vp8_dequant_idct_add_uv_block_v6(short *q, short *dq, unsigned char *dstu, - unsigned char *dstv, int stride, - char *eobs) { - int i; - - for (i = 0; i < 2; ++i) { - if (eobs[0] > 1) - vp8_dequant_idct_add_v6(q, dq, dstu, stride); - else if (eobs[0] == 1) { - vp8_dc_only_idct_add_v6(q[0] * dq[0], dstu, stride, dstu, stride); - ((int *)q)[0] = 0; - } - - if (eobs[1] > 1) - vp8_dequant_idct_add_v6(q + 16, dq, dstu + 4, stride); - else if (eobs[1] == 1) { - vp8_dc_only_idct_add_v6(q[16] * dq[0], dstu + 4, stride, dstu + 4, - stride); - ((int *)(q + 16))[0] = 0; - } - - q += 32; - dstu += 4 * stride; - eobs += 2; - } - - for (i = 0; i < 2; ++i) { - if (eobs[0] > 1) - vp8_dequant_idct_add_v6(q, dq, dstv, stride); - else if (eobs[0] == 1) { - vp8_dc_only_idct_add_v6(q[0] * dq[0], dstv, stride, dstv, stride); - ((int *)q)[0] = 0; - } - - if (eobs[1] > 1) - vp8_dequant_idct_add_v6(q + 16, dq, dstv + 4, stride); - else if (eobs[1] == 1) { - vp8_dc_only_idct_add_v6(q[16] * dq[0], dstv + 4, stride, dstv + 4, - stride); - ((int *)(q + 16))[0] = 0; - } - - q += 32; - dstv += 4 * stride; - eobs += 2; - } -} diff --git a/vp8/common/arm/armv6/idct_v6.asm b/vp8/common/arm/armv6/idct_v6.asm deleted file mode 100644 index b4d44cbeb..000000000 --- a/vp8/common/arm/armv6/idct_v6.asm +++ /dev/null @@ -1,202 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_short_idct4x4llm_v6_dual| - - AREA |.text|, CODE, READONLY - - -; void vp8_short_idct4x4llm_c(short *input, unsigned char *pred, int pitch, -; unsigned char *dst, int stride) -; r0 short* input -; r1 unsigned char* pred -; r2 int pitch -; r3 unsigned char* dst -; sp int stride - -|vp8_short_idct4x4llm_v6_dual| PROC - stmdb sp!, {r4-r11, lr} - - sub sp, sp, #4 - - mov r4, #0x00008A00 ; sin - orr r4, r4, #0x0000008C ; sinpi8sqrt2 - - mov r5, #0x00004E00 ; cos - orr r5, r5, #0x0000007B ; cospi8sqrt2minus1 - orr r5, r5, #1<<31 ; loop counter on top bit - -loop1_dual - ldr r6, [r0, #(4*2)] ; i5 | i4 - ldr r12, [r0, #(12*2)] ; i13|i12 - ldr r14, [r0, #(8*2)] ; i9 | i8 - - smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 - smulbb r7, r5, r6 ; (ip[4] * cospi8sqrt2minus1) >> 16 - smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 - smulwb r8, r4, r6 ; (ip[4] * sinpi8sqrt2) >> 16 - - smulbt r11, r5, r12 ; (ip[13] * cospi8sqrt2minus1) >> 16 - pkhtb r7, r9, r7, asr #16 ; 5c | 4c - pkhbt r8, r8, r10, lsl #16 ; 5s | 4s - uadd16 r6, r6, r7 ; 5c+5 | 4c+4 - - smulwt r7, r4, r12 ; (ip[13] * sinpi8sqrt2) >> 16 - smulbb r9, r5, r12 ; (ip[12] * cospi8sqrt2minus1) >> 16 - smulwb r10, r4, r12 ; (ip[12] * sinpi8sqrt2) >> 16 - - subs r5, r5, #1<<31 ; i-- - - pkhtb r9, r11, r9, asr #16 ; 13c | 12c - ldr r11, [r0] ; i1 | i0 - pkhbt r10, r10, r7, lsl #16 ; 13s | 12s - uadd16 r7, r12, r9 ; 13c+13 | 12c+12 - - usub16 r7, r8, r7 ; c - uadd16 r6, r6, r10 ; d - uadd16 r10, r11, r14 ; a - usub16 r8, r11, r14 ; b - - uadd16 r9, r10, r6 ; a+d - usub16 r10, r10, r6 ; a-d - uadd16 r6, r8, r7 ; b+c - usub16 r7, r8, r7 ; b-c - - ; use input buffer to store intermediate results - str r6, [r0, #(4*2)] ; o5 | o4 - str r7, [r0, #(8*2)] ; o9 | o8 - str r10,[r0, #(12*2)] ; o13|o12 - str r9, [r0], #4 ; o1 | o0 - - bcs loop1_dual - - sub r0, r0, #8 ; reset input/output - str r0, [sp] - -loop2_dual - - ldr r6, [r0, #(4*2)] ; i5 | i4 - ldr r12,[r0, #(2*2)] ; i3 | i2 - ldr r14,[r0, #(6*2)] ; i7 | i6 - ldr r0, [r0, #(0*2)] ; i1 | i0 - - smulbt r9, r5, r6 ; (ip[5] * cospi8sqrt2minus1) >> 16 - smulbt r7, r5, r0 ; (ip[1] * cospi8sqrt2minus1) >> 16 - smulwt r10, r4, r6 ; (ip[5] * sinpi8sqrt2) >> 16 - smulwt r8, r4, r0 ; (ip[1] * sinpi8sqrt2) >> 16 - - pkhbt r11, r6, r0, lsl #16 ; i0 | i4 - pkhtb r7, r7, r9, asr #16 ; 1c | 5c - pkhtb r0, r0, r6, asr #16 ; i1 | i5 - pkhbt r8, r10, r8, lsl #16 ; 1s | 5s = temp1 - - uadd16 r0, r7, r0 ; 1c+1 | 5c+5 = temp2 - pkhbt r9, r14, r12, lsl #16 ; i2 | i6 - uadd16 r10, r11, r9 ; a - usub16 r9, r11, r9 ; b - pkhtb r6, r12, r14, asr #16 ; i3 | i7 - - subs r5, r5, #1<<31 ; i-- - - smulbt r7, r5, r6 ; (ip[3] * cospi8sqrt2minus1) >> 16 - smulwt r11, r4, r6 ; (ip[3] * sinpi8sqrt2) >> 16 - smulbb r12, r5, r6 ; (ip[7] * cospi8sqrt2minus1) >> 16 - smulwb r14, r4, r6 ; (ip[7] * sinpi8sqrt2) >> 16 - - pkhtb r7, r7, r12, asr #16 ; 3c | 7c - pkhbt r11, r14, r11, lsl #16 ; 3s | 7s = temp1 - - uadd16 r6, r7, r6 ; 3c+3 | 7c+7 = temp2 - usub16 r12, r8, r6 ; c (o1 | o5) - uadd16 r6, r11, r0 ; d (o3 | o7) - uadd16 r7, r10, r6 ; a+d - - mov r8, #4 ; set up 4's - orr r8, r8, #0x40000 ; 4|4 - - usub16 r6, r10, r6 ; a-d - uadd16 r6, r6, r8 ; a-d+4, 3|7 - uadd16 r7, r7, r8 ; a+d+4, 0|4 - uadd16 r10, r9, r12 ; b+c - usub16 r0, r9, r12 ; b-c - uadd16 r10, r10, r8 ; b+c+4, 1|5 - uadd16 r8, r0, r8 ; b-c+4, 2|6 - - ldr lr, [sp, #40] ; dst stride - - ldrb r0, [r1] ; pred p0 - ldrb r11, [r1, #1] ; pred p1 - ldrb r12, [r1, #2] ; pred p2 - - add r0, r0, r7, asr #19 ; p0 + o0 - add r11, r11, r10, asr #19 ; p1 + o1 - add r12, r12, r8, asr #19 ; p2 + o2 - - usat r0, #8, r0 ; d0 = clip8(p0 + o0) - usat r11, #8, r11 ; d1 = clip8(p1 + o1) - usat r12, #8, r12 ; d2 = clip8(p2 + o2) - - add r0, r0, r11, lsl #8 ; |--|--|d1|d0| - - ldrb r11, [r1, #3] ; pred p3 - - add r0, r0, r12, lsl #16 ; |--|d2|d1|d0| - - add r11, r11, r6, asr #19 ; p3 + o3 - - sxth r7, r7 ; - sxth r10, r10 ; - - usat r11, #8, r11 ; d3 = clip8(p3 + o3) - - sxth r8, r8 ; - sxth r6, r6 ; - - add r0, r0, r11, lsl #24 ; |d3|d2|d1|d0| - - ldrb r12, [r1, r2]! ; pred p4 - str r0, [r3], lr - ldrb r11, [r1, #1] ; pred p5 - - add r12, r12, r7, asr #3 ; p4 + o4 - add r11, r11, r10, asr #3 ; p5 + o5 - - usat r12, #8, r12 ; d4 = clip8(p4 + o4) - usat r11, #8, r11 ; d5 = clip8(p5 + o5) - - ldrb r7, [r1, #2] ; pred p6 - ldrb r10, [r1, #3] ; pred p6 - - add r12, r12, r11, lsl #8 ; |--|--|d5|d4| - - add r7, r7, r8, asr #3 ; p6 + o6 - add r10, r10, r6, asr #3 ; p7 + o7 - - ldr r0, [sp] ; load input pointer - - usat r7, #8, r7 ; d6 = clip8(p6 + o6) - usat r10, #8, r10 ; d7 = clip8(p7 + o7) - - add r12, r12, r7, lsl #16 ; |--|d6|d5|d4| - add r12, r12, r10, lsl #24 ; |d7|d6|d5|d4| - - str r12, [r3], lr - add r0, r0, #16 - add r1, r1, r2 ; pred + pitch - - bcs loop2_dual - - add sp, sp, #4 ; idct_output buffer - ldmia sp!, {r4 - r11, pc} - - ENDP - - END diff --git a/vp8/common/arm/armv6/iwalsh_v6.asm b/vp8/common/arm/armv6/iwalsh_v6.asm deleted file mode 100644 index 31ef09cad..000000000 --- a/vp8/common/arm/armv6/iwalsh_v6.asm +++ /dev/null @@ -1,136 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vp8_short_inv_walsh4x4_v6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY ; name this block of code - -;short vp8_short_inv_walsh4x4_v6(short *input, short *mb_dqcoeff) -|vp8_short_inv_walsh4x4_v6| PROC - - stmdb sp!, {r4 - r12, lr} - - ldr r2, [r0, #0] ; [1 | 0] - ldr r3, [r0, #4] ; [3 | 2] - ldr r4, [r0, #8] ; [5 | 4] - ldr r5, [r0, #12] ; [7 | 6] - ldr r6, [r0, #16] ; [9 | 8] - ldr r7, [r0, #20] ; [11 | 10] - ldr r8, [r0, #24] ; [13 | 12] - ldr r9, [r0, #28] ; [15 | 14] - - qadd16 r10, r2, r8 ; a1 [1+13 | 0+12] - qadd16 r11, r4, r6 ; b1 [5+9 | 4+8] - qsub16 r12, r4, r6 ; c1 [5-9 | 4-8] - qsub16 lr, r2, r8 ; d1 [1-13 | 0-12] - - qadd16 r2, r10, r11 ; a1 + b1 [1 | 0] - qadd16 r4, r12, lr ; c1 + d1 [5 | 4] - qsub16 r6, r10, r11 ; a1 - b1 [9 | 8] - qsub16 r8, lr, r12 ; d1 - c1 [13 | 12] - - qadd16 r10, r3, r9 ; a1 [3+15 | 2+14] - qadd16 r11, r5, r7 ; b1 [7+11 | 6+10] - qsub16 r12, r5, r7 ; c1 [7-11 | 6-10] - qsub16 lr, r3, r9 ; d1 [3-15 | 2-14] - - qadd16 r3, r10, r11 ; a1 + b1 [3 | 2] - qadd16 r5, r12, lr ; c1 + d1 [7 | 6] - qsub16 r7, r10, r11 ; a1 - b1 [11 | 10] - qsub16 r9, lr, r12 ; d1 - c1 [15 | 14] - - ; first transform complete - - qsubaddx r10, r2, r3 ; [c1|a1] [1-2 | 0+3] - qaddsubx r11, r2, r3 ; [b1|d1] [1+2 | 0-3] - qsubaddx r12, r4, r5 ; [c1|a1] [5-6 | 4+7] - qaddsubx lr, r4, r5 ; [b1|d1] [5+6 | 4-7] - - qaddsubx r2, r10, r11 ; [b2|c2] [c1+d1 | a1-b1] - qaddsubx r3, r11, r10 ; [a2|d2] [b1+a1 | d1-c1] - ldr r10, c0x00030003 - qaddsubx r4, r12, lr ; [b2|c2] [c1+d1 | a1-b1] - qaddsubx r5, lr, r12 ; [a2|d2] [b1+a1 | d1-c1] - - qadd16 r2, r2, r10 ; [b2+3|c2+3] - qadd16 r3, r3, r10 ; [a2+3|d2+3] - qadd16 r4, r4, r10 ; [b2+3|c2+3] - qadd16 r5, r5, r10 ; [a2+3|d2+3] - - asr r12, r3, #19 ; [0] - strh r12, [r1], #32 - asr lr, r2, #19 ; [1] - strh lr, [r1], #32 - sxth r2, r2 - sxth r3, r3 - asr r2, r2, #3 ; [2] - strh r2, [r1], #32 - asr r3, r3, #3 ; [3] - strh r3, [r1], #32 - - asr r12, r5, #19 ; [4] - strh r12, [r1], #32 - asr lr, r4, #19 ; [5] - strh lr, [r1], #32 - sxth r4, r4 - sxth r5, r5 - asr r4, r4, #3 ; [6] - strh r4, [r1], #32 - asr r5, r5, #3 ; [7] - strh r5, [r1], #32 - - qsubaddx r2, r6, r7 ; [c1|a1] [9-10 | 8+11] - qaddsubx r3, r6, r7 ; [b1|d1] [9+10 | 8-11] - qsubaddx r4, r8, r9 ; [c1|a1] [13-14 | 12+15] - qaddsubx r5, r8, r9 ; [b1|d1] [13+14 | 12-15] - - qaddsubx r6, r2, r3 ; [b2|c2] [c1+d1 | a1-b1] - qaddsubx r7, r3, r2 ; [a2|d2] [b1+a1 | d1-c1] - qaddsubx r8, r4, r5 ; [b2|c2] [c1+d1 | a1-b1] - qaddsubx r9, r5, r4 ; [a2|d2] [b1+a1 | d1-c1] - - qadd16 r6, r6, r10 ; [b2+3|c2+3] - qadd16 r7, r7, r10 ; [a2+3|d2+3] - qadd16 r8, r8, r10 ; [b2+3|c2+3] - qadd16 r9, r9, r10 ; [a2+3|d2+3] - - asr r12, r7, #19 ; [8] - strh r12, [r1], #32 - asr lr, r6, #19 ; [9] - strh lr, [r1], #32 - sxth r6, r6 - sxth r7, r7 - asr r6, r6, #3 ; [10] - strh r6, [r1], #32 - asr r7, r7, #3 ; [11] - strh r7, [r1], #32 - - asr r12, r9, #19 ; [12] - strh r12, [r1], #32 - asr lr, r8, #19 ; [13] - strh lr, [r1], #32 - sxth r8, r8 - sxth r9, r9 - asr r8, r8, #3 ; [14] - strh r8, [r1], #32 - asr r9, r9, #3 ; [15] - strh r9, [r1], #32 - - ldmia sp!, {r4 - r12, pc} - ENDP ; |vp8_short_inv_walsh4x4_v6| - - -; Constant Pool -c0x00030003 DCD 0x00030003 - END diff --git a/vp8/common/arm/armv6/loopfilter_v6.asm b/vp8/common/arm/armv6/loopfilter_v6.asm deleted file mode 100644 index 1cbbbcdef..000000000 --- a/vp8/common/arm/armv6/loopfilter_v6.asm +++ /dev/null @@ -1,1282 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_loop_filter_horizontal_edge_armv6| - EXPORT |vp8_mbloop_filter_horizontal_edge_armv6| - EXPORT |vp8_loop_filter_vertical_edge_armv6| - EXPORT |vp8_mbloop_filter_vertical_edge_armv6| - - AREA |.text|, CODE, READONLY ; name this block of code - - MACRO - TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3 - ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3 - ; a0: 03 02 01 00 - ; a1: 13 12 11 10 - ; a2: 23 22 21 20 - ; a3: 33 32 31 30 - ; b3 b2 b1 b0 - - uxtb16 $b1, $a1 ; xx 12 xx 10 - uxtb16 $b0, $a0 ; xx 02 xx 00 - uxtb16 $b3, $a3 ; xx 32 xx 30 - uxtb16 $b2, $a2 ; xx 22 xx 20 - orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00 - orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20 - - uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11 - uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31 - uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01 - uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21 - orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01 - orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21 - - pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1 - pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3 - - pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0 - pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2 - MEND - - -src RN r0 -pstep RN r1 -count RN r5 - -;r0 unsigned char *src_ptr, -;r1 int src_pixel_step, -;r2 const char *blimit, -;r3 const char *limit, -;stack const char *thresh, -;stack int count - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp8_loop_filter_horizontal_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - ldr count, [sp, #40] ; count for 8-in-parallel - ldr r6, [sp, #36] ; load thresh address - sub sp, sp, #16 ; create temp buffer - - ldr r9, [src], pstep ; p3 - ldrb r4, [r2] ; blimit - ldr r10, [src], pstep ; p2 - ldrb r2, [r3] ; limit - ldr r11, [src], pstep ; p1 - orr r4, r4, r4, lsl #8 - ldrb r3, [r6] ; thresh - orr r2, r2, r2, lsl #8 - mov count, count, lsl #1 ; 4-in-parallel - orr r4, r4, r4, lsl #16 - orr r3, r3, r3, lsl #8 - orr r2, r2, r2, lsl #16 - orr r3, r3, r3, lsl #16 - -|Hnext8| - ; vp8_filter_mask() function - ; calculate breakout conditions - ldr r12, [src], pstep ; p0 - - uqsub8 r6, r9, r10 ; p3 - p2 - uqsub8 r7, r10, r9 ; p2 - p3 - uqsub8 r8, r10, r11 ; p2 - p1 - uqsub8 r10, r11, r10 ; p1 - p2 - - orr r6, r6, r7 ; abs (p3-p2) - orr r8, r8, r10 ; abs (p2-p1) - uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask - uqsub8 r8, r8, r2 ; compare to limit - uqsub8 r6, r11, r12 ; p1 - p0 - orr lr, lr, r8 - uqsub8 r7, r12, r11 ; p0 - p1 - ldr r9, [src], pstep ; q0 - ldr r10, [src], pstep ; q1 - orr r6, r6, r7 ; abs (p1-p0) - uqsub8 r7, r6, r2 ; compare to limit - uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later - orr lr, lr, r7 - - uqsub8 r6, r11, r10 ; p1 - q1 - uqsub8 r7, r10, r11 ; q1 - p1 - uqsub8 r11, r12, r9 ; p0 - q0 - uqsub8 r12, r9, r12 ; q0 - p0 - orr r6, r6, r7 ; abs (p1-q1) - ldr r7, c0x7F7F7F7F - orr r12, r11, r12 ; abs (p0-q0) - ldr r11, [src], pstep ; q2 - uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 - and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 - uqsub8 r7, r9, r10 ; q0 - q1 - uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 - uqsub8 r6, r10, r9 ; q1 - q0 - uqsub8 r12, r12, r4 ; compare to flimit - uqsub8 r9, r11, r10 ; q2 - q1 - - orr lr, lr, r12 - - ldr r12, [src], pstep ; q3 - uqsub8 r10, r10, r11 ; q1 - q2 - orr r6, r7, r6 ; abs (q1-q0) - orr r10, r9, r10 ; abs (q2-q1) - uqsub8 r7, r6, r2 ; compare to limit - uqsub8 r10, r10, r2 ; compare to limit - uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later - orr lr, lr, r7 - orr lr, lr, r10 - - uqsub8 r10, r12, r11 ; q3 - q2 - uqsub8 r9, r11, r12 ; q2 - q3 - - mvn r11, #0 ; r11 == -1 - - orr r10, r10, r9 ; abs (q3-q2) - uqsub8 r10, r10, r2 ; compare to limit - - mov r12, #0 - orr lr, lr, r10 - sub src, src, pstep, lsl #2 - - usub8 lr, r12, lr ; use usub8 instead of ssub8 - sel lr, r11, r12 ; filter mask: lr - - cmp lr, #0 - beq hskip_filter ; skip filtering - - sub src, src, pstep, lsl #1 ; move src pointer down by 6 lines - - ;vp8_hevmask() function - ;calculate high edge variance - orr r10, r6, r8 ; calculate vp8_hevmask - - ldr r7, [src], pstep ; p1 - - usub8 r10, r12, r10 ; use usub8 instead of ssub8 - sel r6, r12, r11 ; obtain vp8_hevmask: r6 - - ;vp8_filter() function - ldr r8, [src], pstep ; p0 - ldr r12, c0x80808080 - ldr r9, [src], pstep ; q0 - ldr r10, [src], pstep ; q1 - - eor r7, r7, r12 ; p1 offset to convert to a signed value - eor r8, r8, r12 ; p0 offset to convert to a signed value - eor r9, r9, r12 ; q0 offset to convert to a signed value - eor r10, r10, r12 ; q1 offset to convert to a signed value - - str r9, [sp] ; store qs0 temporarily - str r8, [sp, #4] ; store ps0 temporarily - str r10, [sp, #8] ; store qs1 temporarily - str r7, [sp, #12] ; store ps1 temporarily - - qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) - qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) - - and r7, r7, r6 ; vp8_filter (r7) &= hev - - qadd8 r7, r7, r8 - ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 - - qadd8 r7, r7, r8 - ldr r10, c0x04040404 - - qadd8 r7, r7, r8 - and r7, r7, lr ; vp8_filter &= mask; - - ;modify code for vp8 -- Filter1 = vp8_filter (r7) - qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3) - qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) - - mov r9, #0 - shadd8 r8 , r8 , r9 ; Filter2 >>= 3 - shadd8 r7 , r7 , r9 ; vp8_filter >>= 3 - shadd8 r8 , r8 , r9 - shadd8 r7 , r7 , r9 - shadd8 lr , r8 , r9 ; lr: Filter2 - shadd8 r7 , r7 , r9 ; r7: filter - - ;usub8 lr, r8, r10 ; s = (s==4)*-1 - ;sel lr, r11, r9 - ;usub8 r8, r10, r8 - ;sel r8, r11, r9 - ;and r8, r8, lr ; -1 for each element that equals 4 - - ;calculate output - ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter) - - ldr r8, [sp] ; load qs0 - ldr r9, [sp, #4] ; load ps0 - - ldr r10, c0x01010101 - - qsub8 r8 ,r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) - qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2) - - ;end of modification for vp8 - - mov lr, #0 - sadd8 r7, r7 , r10 ; vp8_filter += 1 - shadd8 r7, r7, lr ; vp8_filter >>= 1 - - ldr r11, [sp, #12] ; load ps1 - ldr r10, [sp, #8] ; load qs1 - - bic r7, r7, r6 ; vp8_filter &= ~hev - sub src, src, pstep, lsl #2 - - qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) - qsub8 r10, r10,r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) - - eor r11, r11, r12 ; *op1 = u^0x80 - str r11, [src], pstep ; store op1 - eor r9, r9, r12 ; *op0 = u^0x80 - str r9, [src], pstep ; store op0 result - eor r8, r8, r12 ; *oq0 = u^0x80 - str r8, [src], pstep ; store oq0 result - eor r10, r10, r12 ; *oq1 = u^0x80 - str r10, [src], pstep ; store oq1 - - sub src, src, pstep, lsl #1 - -|hskip_filter| - add src, src, #4 - sub src, src, pstep, lsl #2 - - subs count, count, #1 - - ldrne r9, [src], pstep ; p3 - ldrne r10, [src], pstep ; p2 - ldrne r11, [src], pstep ; p1 - - bne Hnext8 - - add sp, sp, #16 - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_loop_filter_horizontal_edge_armv6| - - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp8_mbloop_filter_horizontal_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - ldr count, [sp, #40] ; count for 8-in-parallel - ldr r6, [sp, #36] ; load thresh address - sub sp, sp, #16 ; create temp buffer - - ldr r9, [src], pstep ; p3 - ldrb r4, [r2] ; blimit - ldr r10, [src], pstep ; p2 - ldrb r2, [r3] ; limit - ldr r11, [src], pstep ; p1 - orr r4, r4, r4, lsl #8 - ldrb r3, [r6] ; thresh - orr r2, r2, r2, lsl #8 - mov count, count, lsl #1 ; 4-in-parallel - orr r4, r4, r4, lsl #16 - orr r3, r3, r3, lsl #8 - orr r2, r2, r2, lsl #16 - orr r3, r3, r3, lsl #16 - -|MBHnext8| - - ; vp8_filter_mask() function - ; calculate breakout conditions - ldr r12, [src], pstep ; p0 - - uqsub8 r6, r9, r10 ; p3 - p2 - uqsub8 r7, r10, r9 ; p2 - p3 - uqsub8 r8, r10, r11 ; p2 - p1 - uqsub8 r10, r11, r10 ; p1 - p2 - - orr r6, r6, r7 ; abs (p3-p2) - orr r8, r8, r10 ; abs (p2-p1) - uqsub8 lr, r6, r2 ; compare to limit. lr: vp8_filter_mask - uqsub8 r8, r8, r2 ; compare to limit - - uqsub8 r6, r11, r12 ; p1 - p0 - orr lr, lr, r8 - uqsub8 r7, r12, r11 ; p0 - p1 - ldr r9, [src], pstep ; q0 - ldr r10, [src], pstep ; q1 - orr r6, r6, r7 ; abs (p1-p0) - uqsub8 r7, r6, r2 ; compare to limit - uqsub8 r8, r6, r3 ; compare to thresh -- save r8 for later - orr lr, lr, r7 - - uqsub8 r6, r11, r10 ; p1 - q1 - uqsub8 r7, r10, r11 ; q1 - p1 - uqsub8 r11, r12, r9 ; p0 - q0 - uqsub8 r12, r9, r12 ; q0 - p0 - orr r6, r6, r7 ; abs (p1-q1) - ldr r7, c0x7F7F7F7F - orr r12, r11, r12 ; abs (p0-q0) - ldr r11, [src], pstep ; q2 - uqadd8 r12, r12, r12 ; abs (p0-q0) * 2 - and r6, r7, r6, lsr #1 ; abs (p1-q1) / 2 - uqsub8 r7, r9, r10 ; q0 - q1 - uqadd8 r12, r12, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 - uqsub8 r6, r10, r9 ; q1 - q0 - uqsub8 r12, r12, r4 ; compare to flimit - uqsub8 r9, r11, r10 ; q2 - q1 - - orr lr, lr, r12 - - ldr r12, [src], pstep ; q3 - - uqsub8 r10, r10, r11 ; q1 - q2 - orr r6, r7, r6 ; abs (q1-q0) - orr r10, r9, r10 ; abs (q2-q1) - uqsub8 r7, r6, r2 ; compare to limit - uqsub8 r10, r10, r2 ; compare to limit - uqsub8 r6, r6, r3 ; compare to thresh -- save r6 for later - orr lr, lr, r7 - orr lr, lr, r10 - - uqsub8 r10, r12, r11 ; q3 - q2 - uqsub8 r9, r11, r12 ; q2 - q3 - - mvn r11, #0 ; r11 == -1 - - orr r10, r10, r9 ; abs (q3-q2) - uqsub8 r10, r10, r2 ; compare to limit - - mov r12, #0 - - orr lr, lr, r10 - - usub8 lr, r12, lr ; use usub8 instead of ssub8 - sel lr, r11, r12 ; filter mask: lr - - cmp lr, #0 - beq mbhskip_filter ; skip filtering - - ;vp8_hevmask() function - ;calculate high edge variance - sub src, src, pstep, lsl #2 ; move src pointer down by 6 lines - sub src, src, pstep, lsl #1 - - orr r10, r6, r8 - ldr r7, [src], pstep ; p1 - - usub8 r10, r12, r10 - sel r6, r12, r11 ; hev mask: r6 - - ;vp8_mbfilter() function - ;p2, q2 are only needed at the end. Don't need to load them in now. - ldr r8, [src], pstep ; p0 - ldr r12, c0x80808080 - ldr r9, [src], pstep ; q0 - ldr r10, [src] ; q1 - - eor r7, r7, r12 ; ps1 - eor r8, r8, r12 ; ps0 - eor r9, r9, r12 ; qs0 - eor r10, r10, r12 ; qs1 - - qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) - str r7, [sp, #12] ; store ps1 temporarily - qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) - str r10, [sp, #8] ; store qs1 temporarily - qadd8 r7, r7, r12 - str r9, [sp] ; store qs0 temporarily - qadd8 r7, r7, r12 - str r8, [sp, #4] ; store ps0 temporarily - qadd8 r7, r7, r12 ; vp8_filter: r7 - - ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 - ldr r9, c0x04040404 - - and r7, r7, lr ; vp8_filter &= mask (lr is free) - - mov r12, r7 ; Filter2: r12 - and r12, r12, r6 ; Filter2 &= hev - - ;modify code for vp8 - ;save bottom 3 bits so that we round one side +4 and the other +3 - qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4) - qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3) - - mov r10, #0 - shadd8 r8 , r8 , r10 ; Filter1 >>= 3 - shadd8 r12 , r12 , r10 ; Filter2 >>= 3 - shadd8 r8 , r8 , r10 - shadd8 r12 , r12 , r10 - shadd8 r8 , r8 , r10 ; r8: Filter1 - shadd8 r12 , r12 , r10 ; r12: Filter2 - - ldr r9, [sp] ; load qs0 - ldr r11, [sp, #4] ; load ps0 - - qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) - qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) - - ;save bottom 3 bits so that we round one side +4 and the other +3 - ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) - ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4) - ;mov r10, #0 - ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 - ;usub8 lr, r8, r9 ; s = (s==4)*-1 - ;sel lr, r11, r10 - ;shadd8 r12 , r12 , r10 - ;usub8 r8, r9, r8 - ;sel r8, r11, r10 - ;ldr r9, [sp] ; load qs0 - ;ldr r11, [sp, #4] ; load ps0 - ;shadd8 r12 , r12 , r10 - ;and r8, r8, lr ; -1 for each element that equals 4 - ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2) - ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) - ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u) - - ;end of modification for vp8 - - bic r12, r7, r6 ; vp8_filter &= ~hev ( r6 is free) - ;mov r12, r7 - - ;roughly 3/7th difference across boundary - mov lr, #0x1b ; 27 - mov r7, #0x3f ; 63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r7, r10, lr, r7 - smultb r10, r10, lr - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - add r10, r10, #63 - ssat r7, #8, r7, asr #7 - ssat r10, #8, r10, asr #7 - - ldr lr, c0x80808080 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r7, r10, lsl #16 - uxtb16 r6, r6 - uxtb16 r10, r10 - - sub src, src, pstep - - orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) - - qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u) - qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u) - eor r8, r8, lr ; *oq0 = s^0x80 - str r8, [src] ; store *oq0 - sub src, src, pstep - eor r10, r10, lr ; *op0 = s^0x80 - str r10, [src] ; store *op0 - - ;roughly 2/7th difference across boundary - mov lr, #0x12 ; 18 - mov r7, #0x3f ; 63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r9, r10, lr, r7 - smlatb r10, r10, lr, r7 - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - ssat r9, #8, r9, asr #7 - ssat r10, #8, r10, asr #7 - - ldr lr, c0x80808080 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r9, r10, lsl #16 - - ldr r9, [sp, #8] ; load qs1 - ldr r11, [sp, #12] ; load ps1 - - uxtb16 r6, r6 - uxtb16 r10, r10 - - sub src, src, pstep - - orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) - - qadd8 r11, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u) - qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u) - eor r11, r11, lr ; *op1 = s^0x80 - str r11, [src], pstep ; store *op1 - eor r8, r8, lr ; *oq1 = s^0x80 - add src, src, pstep, lsl #1 - - mov r7, #0x3f ; 63 - - str r8, [src], pstep ; store *oq1 - - ;roughly 1/7th difference across boundary - mov lr, #0x9 ; 9 - ldr r9, [src] ; load q2 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r12, r10, lr, r7 - smlatb r10, r10, lr, r7 - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - ssat r12, #8, r12, asr #7 - ssat r10, #8, r10, asr #7 - - sub src, src, pstep, lsl #2 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r12, r10, lsl #16 - - sub src, src, pstep - ldr lr, c0x80808080 - - ldr r11, [src] ; load p2 - - uxtb16 r6, r6 - uxtb16 r10, r10 - - eor r9, r9, lr - eor r11, r11, lr - - orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) - - qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u) - qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u) - eor r8, r8, lr ; *op2 = s^0x80 - str r8, [src], pstep, lsl #2 ; store *op2 - add src, src, pstep - eor r10, r10, lr ; *oq2 = s^0x80 - str r10, [src], pstep, lsl #1 ; store *oq2 - -|mbhskip_filter| - add src, src, #4 - sub src, src, pstep, lsl #3 - subs count, count, #1 - - ldrne r9, [src], pstep ; p3 - ldrne r10, [src], pstep ; p2 - ldrne r11, [src], pstep ; p1 - - bne MBHnext8 - - add sp, sp, #16 - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_mbloop_filter_horizontal_edge_armv6| - - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp8_loop_filter_vertical_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - sub src, src, #4 ; move src pointer down by 4 - ldr count, [sp, #40] ; count for 8-in-parallel - ldr r12, [sp, #36] ; load thresh address - sub sp, sp, #16 ; create temp buffer - - ldr r6, [src], pstep ; load source data - ldrb r4, [r2] ; blimit - ldr r7, [src], pstep - ldrb r2, [r3] ; limit - ldr r8, [src], pstep - orr r4, r4, r4, lsl #8 - ldrb r3, [r12] ; thresh - orr r2, r2, r2, lsl #8 - ldr lr, [src], pstep - mov count, count, lsl #1 ; 4-in-parallel - orr r4, r4, r4, lsl #16 - orr r3, r3, r3, lsl #8 - orr r2, r2, r2, lsl #16 - orr r3, r3, r3, lsl #16 - -|Vnext8| - - ; vp8_filter_mask() function - ; calculate breakout conditions - ; transpose the source data for 4-in-parallel operation - TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 - - uqsub8 r7, r9, r10 ; p3 - p2 - uqsub8 r8, r10, r9 ; p2 - p3 - uqsub8 r9, r10, r11 ; p2 - p1 - uqsub8 r10, r11, r10 ; p1 - p2 - orr r7, r7, r8 ; abs (p3-p2) - orr r10, r9, r10 ; abs (p2-p1) - uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask - uqsub8 r10, r10, r2 ; compare to limit - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - orr lr, lr, r10 - - uqsub8 r6, r11, r12 ; p1 - p0 - uqsub8 r7, r12, r11 ; p0 - p1 - add src, src, #4 ; move src pointer up by 4 - orr r6, r6, r7 ; abs (p1-p0) - str r11, [sp, #12] ; save p1 - uqsub8 r10, r6, r2 ; compare to limit - uqsub8 r11, r6, r3 ; compare to thresh - orr lr, lr, r10 - - ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now - ; transpose the source data for 4-in-parallel operation - ldr r6, [src], pstep ; load source data - str r11, [sp] ; push r11 to stack - ldr r7, [src], pstep - str r12, [sp, #4] ; save current reg before load q0 - q3 data - ldr r8, [src], pstep - str lr, [sp, #8] - ldr lr, [src], pstep - - TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 - - ldr lr, [sp, #8] ; load back (f)limit accumulator - - uqsub8 r6, r12, r11 ; q3 - q2 - uqsub8 r7, r11, r12 ; q2 - q3 - uqsub8 r12, r11, r10 ; q2 - q1 - uqsub8 r11, r10, r11 ; q1 - q2 - orr r6, r6, r7 ; abs (q3-q2) - orr r7, r12, r11 ; abs (q2-q1) - uqsub8 r6, r6, r2 ; compare to limit - uqsub8 r7, r7, r2 ; compare to limit - ldr r11, [sp, #4] ; load back p0 - ldr r12, [sp, #12] ; load back p1 - orr lr, lr, r6 - orr lr, lr, r7 - - uqsub8 r6, r11, r9 ; p0 - q0 - uqsub8 r7, r9, r11 ; q0 - p0 - uqsub8 r8, r12, r10 ; p1 - q1 - uqsub8 r11, r10, r12 ; q1 - p1 - orr r6, r6, r7 ; abs (p0-q0) - ldr r7, c0x7F7F7F7F - orr r8, r8, r11 ; abs (p1-q1) - uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 - and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 - uqsub8 r11, r10, r9 ; q1 - q0 - uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 - uqsub8 r12, r9, r10 ; q0 - q1 - uqsub8 r6, r6, r4 ; compare to flimit - - orr r9, r11, r12 ; abs (q1-q0) - uqsub8 r8, r9, r2 ; compare to limit - uqsub8 r10, r9, r3 ; compare to thresh - orr lr, lr, r6 - orr lr, lr, r8 - - mvn r11, #0 ; r11 == -1 - mov r12, #0 - - usub8 lr, r12, lr - ldr r9, [sp] ; load the compared result - sel lr, r11, r12 ; filter mask: lr - - cmp lr, #0 - beq vskip_filter ; skip filtering - - ;vp8_hevmask() function - ;calculate high edge variance - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - orr r9, r9, r10 - - ldrh r7, [src, #-2] - ldrh r8, [src], pstep - - usub8 r9, r12, r9 - sel r6, r12, r11 ; hev mask: r6 - - ;vp8_filter() function - ; load soure data to r6, r11, r12, lr - ldrh r9, [src, #-2] - ldrh r10, [src], pstep - - pkhbt r12, r7, r8, lsl #16 - - ldrh r7, [src, #-2] - ldrh r8, [src], pstep - - pkhbt r11, r9, r10, lsl #16 - - ldrh r9, [src, #-2] - ldrh r10, [src], pstep - - ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first - str r6, [sp] - str lr, [sp, #4] - - pkhbt r6, r7, r8, lsl #16 - pkhbt lr, r9, r10, lsl #16 - - ;transpose r12, r11, r6, lr to r7, r8, r9, r10 - TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 - - ;load back hev_mask r6 and filter_mask lr - ldr r12, c0x80808080 - ldr r6, [sp] - ldr lr, [sp, #4] - - eor r7, r7, r12 ; p1 offset to convert to a signed value - eor r8, r8, r12 ; p0 offset to convert to a signed value - eor r9, r9, r12 ; q0 offset to convert to a signed value - eor r10, r10, r12 ; q1 offset to convert to a signed value - - str r9, [sp] ; store qs0 temporarily - str r8, [sp, #4] ; store ps0 temporarily - str r10, [sp, #8] ; store qs1 temporarily - str r7, [sp, #12] ; store ps1 temporarily - - qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) - qsub8 r8, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) - - and r7, r7, r6 ; vp8_filter (r7) &= hev (r7 : filter) - - qadd8 r7, r7, r8 - ldr r9, c0x03030303 ; r9 = 3 --modified for vp8 - - qadd8 r7, r7, r8 - ldr r10, c0x04040404 - - qadd8 r7, r7, r8 - ;mvn r11, #0 ; r11 == -1 - - and r7, r7, lr ; vp8_filter &= mask - - ;modify code for vp8 -- Filter1 = vp8_filter (r7) - qadd8 r8 , r7 , r9 ; Filter2 (r8) = vp8_signed_char_clamp(vp8_filter+3) - qadd8 r7 , r7 , r10 ; vp8_filter = vp8_signed_char_clamp(vp8_filter+4) - - mov r9, #0 - shadd8 r8 , r8 , r9 ; Filter2 >>= 3 - shadd8 r7 , r7 , r9 ; vp8_filter >>= 3 - shadd8 r8 , r8 , r9 - shadd8 r7 , r7 , r9 - shadd8 lr , r8 , r9 ; lr: filter2 - shadd8 r7 , r7 , r9 ; r7: filter - - ;usub8 lr, r8, r10 ; s = (s==4)*-1 - ;sel lr, r11, r9 - ;usub8 r8, r10, r8 - ;sel r8, r11, r9 - ;and r8, r8, lr ; -1 for each element that equals 4 -- r8: s - - ;calculate output - ;qadd8 lr, r8, r7 ; u = vp8_signed_char_clamp(s + vp8_filter) - - ldr r8, [sp] ; load qs0 - ldr r9, [sp, #4] ; load ps0 - - ldr r10, c0x01010101 - - qsub8 r8, r8, r7 ; u = vp8_signed_char_clamp(qs0 - vp8_filter) - qadd8 r9, r9, lr ; u = vp8_signed_char_clamp(ps0 + Filter2) - ;end of modification for vp8 - - eor r8, r8, r12 - eor r9, r9, r12 - - mov lr, #0 - - sadd8 r7, r7, r10 - shadd8 r7, r7, lr - - ldr r10, [sp, #8] ; load qs1 - ldr r11, [sp, #12] ; load ps1 - - bic r7, r7, r6 ; r7: vp8_filter - - qsub8 r10 , r10, r7 ; u = vp8_signed_char_clamp(qs1 - vp8_filter) - qadd8 r11, r11, r7 ; u = vp8_signed_char_clamp(ps1 + vp8_filter) - eor r10, r10, r12 - eor r11, r11, r12 - - sub src, src, pstep, lsl #2 - - ;we can use TRANSPOSE_MATRIX macro to transpose output - input: q1, q0, p0, p1 - ;output is b0, b1, b2, b3 - ;b0: 03 02 01 00 - ;b1: 13 12 11 10 - ;b2: 23 22 21 20 - ;b3: 33 32 31 30 - ; p1 p0 q0 q1 - ; (a3 a2 a1 a0) - TRANSPOSE_MATRIX r11, r9, r8, r10, r6, r7, r12, lr - - strh r6, [src, #-2] ; store the result - mov r6, r6, lsr #16 - strh r6, [src], pstep - - strh r7, [src, #-2] - mov r7, r7, lsr #16 - strh r7, [src], pstep - - strh r12, [src, #-2] - mov r12, r12, lsr #16 - strh r12, [src], pstep - - strh lr, [src, #-2] - mov lr, lr, lsr #16 - strh lr, [src], pstep - -|vskip_filter| - sub src, src, #4 - subs count, count, #1 - - ldrne r6, [src], pstep ; load source data - ldrne r7, [src], pstep - ldrne r8, [src], pstep - ldrne lr, [src], pstep - - bne Vnext8 - - add sp, sp, #16 - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_loop_filter_vertical_edge_armv6| - - - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp8_mbloop_filter_vertical_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - sub src, src, #4 ; move src pointer down by 4 - ldr count, [sp, #40] ; count for 8-in-parallel - ldr r12, [sp, #36] ; load thresh address - pld [src, #23] ; preload for next block - sub sp, sp, #16 ; create temp buffer - - ldr r6, [src], pstep ; load source data - ldrb r4, [r2] ; blimit - pld [src, #23] - ldr r7, [src], pstep - ldrb r2, [r3] ; limit - pld [src, #23] - ldr r8, [src], pstep - orr r4, r4, r4, lsl #8 - ldrb r3, [r12] ; thresh - orr r2, r2, r2, lsl #8 - pld [src, #23] - ldr lr, [src], pstep - mov count, count, lsl #1 ; 4-in-parallel - orr r4, r4, r4, lsl #16 - orr r3, r3, r3, lsl #8 - orr r2, r2, r2, lsl #16 - orr r3, r3, r3, lsl #16 - -|MBVnext8| - ; vp8_filter_mask() function - ; calculate breakout conditions - ; transpose the source data for 4-in-parallel operation - TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 - - uqsub8 r7, r9, r10 ; p3 - p2 - uqsub8 r8, r10, r9 ; p2 - p3 - uqsub8 r9, r10, r11 ; p2 - p1 - uqsub8 r10, r11, r10 ; p1 - p2 - orr r7, r7, r8 ; abs (p3-p2) - orr r10, r9, r10 ; abs (p2-p1) - uqsub8 lr, r7, r2 ; compare to limit. lr: vp8_filter_mask - uqsub8 r10, r10, r2 ; compare to limit - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - orr lr, lr, r10 - - uqsub8 r6, r11, r12 ; p1 - p0 - uqsub8 r7, r12, r11 ; p0 - p1 - add src, src, #4 ; move src pointer up by 4 - orr r6, r6, r7 ; abs (p1-p0) - str r11, [sp, #12] ; save p1 - uqsub8 r10, r6, r2 ; compare to limit - uqsub8 r11, r6, r3 ; compare to thresh - orr lr, lr, r10 - - ; transpose uses 8 regs(r6 - r12 and lr). Need to save reg value now - ; transpose the source data for 4-in-parallel operation - ldr r6, [src], pstep ; load source data - str r11, [sp] ; push r11 to stack - ldr r7, [src], pstep - str r12, [sp, #4] ; save current reg before load q0 - q3 data - ldr r8, [src], pstep - str lr, [sp, #8] - ldr lr, [src], pstep - - - TRANSPOSE_MATRIX r6, r7, r8, lr, r9, r10, r11, r12 - - ldr lr, [sp, #8] ; load back (f)limit accumulator - - uqsub8 r6, r12, r11 ; q3 - q2 - uqsub8 r7, r11, r12 ; q2 - q3 - uqsub8 r12, r11, r10 ; q2 - q1 - uqsub8 r11, r10, r11 ; q1 - q2 - orr r6, r6, r7 ; abs (q3-q2) - orr r7, r12, r11 ; abs (q2-q1) - uqsub8 r6, r6, r2 ; compare to limit - uqsub8 r7, r7, r2 ; compare to limit - ldr r11, [sp, #4] ; load back p0 - ldr r12, [sp, #12] ; load back p1 - orr lr, lr, r6 - orr lr, lr, r7 - - uqsub8 r6, r11, r9 ; p0 - q0 - uqsub8 r7, r9, r11 ; q0 - p0 - uqsub8 r8, r12, r10 ; p1 - q1 - uqsub8 r11, r10, r12 ; q1 - p1 - orr r6, r6, r7 ; abs (p0-q0) - ldr r7, c0x7F7F7F7F - orr r8, r8, r11 ; abs (p1-q1) - uqadd8 r6, r6, r6 ; abs (p0-q0) * 2 - and r8, r7, r8, lsr #1 ; abs (p1-q1) / 2 - uqsub8 r11, r10, r9 ; q1 - q0 - uqadd8 r6, r8, r6 ; abs (p0-q0)*2 + abs (p1-q1)/2 - uqsub8 r12, r9, r10 ; q0 - q1 - uqsub8 r6, r6, r4 ; compare to flimit - - orr r9, r11, r12 ; abs (q1-q0) - uqsub8 r8, r9, r2 ; compare to limit - uqsub8 r10, r9, r3 ; compare to thresh - orr lr, lr, r6 - orr lr, lr, r8 - - mvn r11, #0 ; r11 == -1 - mov r12, #0 - - usub8 lr, r12, lr - ldr r9, [sp] ; load the compared result - sel lr, r11, r12 ; filter mask: lr - - cmp lr, #0 - beq mbvskip_filter ; skip filtering - - - - ;vp8_hevmask() function - ;calculate high edge variance - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - orr r9, r9, r10 - - ldrh r7, [src, #-2] - ldrh r8, [src], pstep - - usub8 r9, r12, r9 - sel r6, r12, r11 ; hev mask: r6 - - - ; vp8_mbfilter() function - ; p2, q2 are only needed at the end. Don't need to load them in now. - ; Transpose needs 8 regs(r6 - r12, and lr). Save r6 and lr first - ; load soure data to r6, r11, r12, lr - ldrh r9, [src, #-2] - ldrh r10, [src], pstep - - pkhbt r12, r7, r8, lsl #16 - - ldrh r7, [src, #-2] - ldrh r8, [src], pstep - - pkhbt r11, r9, r10, lsl #16 - - ldrh r9, [src, #-2] - ldrh r10, [src], pstep - - str r6, [sp] ; save r6 - str lr, [sp, #4] ; save lr - - pkhbt r6, r7, r8, lsl #16 - pkhbt lr, r9, r10, lsl #16 - - ;transpose r12, r11, r6, lr to p1, p0, q0, q1 - TRANSPOSE_MATRIX r12, r11, r6, lr, r7, r8, r9, r10 - - ;load back hev_mask r6 and filter_mask lr - ldr r12, c0x80808080 - ldr r6, [sp] - ldr lr, [sp, #4] - - eor r7, r7, r12 ; ps1 - eor r8, r8, r12 ; ps0 - eor r9, r9, r12 ; qs0 - eor r10, r10, r12 ; qs1 - - qsub8 r12, r9, r8 ; vp8_signed_char_clamp(vp8_filter + 3 * ( qs0 - ps0)) - str r7, [sp, #12] ; store ps1 temporarily - qsub8 r7, r7, r10 ; vp8_signed_char_clamp(ps1-qs1) - str r10, [sp, #8] ; store qs1 temporarily - qadd8 r7, r7, r12 - str r9, [sp] ; store qs0 temporarily - qadd8 r7, r7, r12 - str r8, [sp, #4] ; store ps0 temporarily - qadd8 r7, r7, r12 ; vp8_filter: r7 - - ldr r10, c0x03030303 ; r10 = 3 --modified for vp8 - ldr r9, c0x04040404 - ;mvn r11, #0 ; r11 == -1 - - and r7, r7, lr ; vp8_filter &= mask (lr is free) - - mov r12, r7 ; Filter2: r12 - and r12, r12, r6 ; Filter2 &= hev - - ;modify code for vp8 - ;save bottom 3 bits so that we round one side +4 and the other +3 - qadd8 r8 , r12 , r9 ; Filter1 (r8) = vp8_signed_char_clamp(Filter2+4) - qadd8 r12 , r12 , r10 ; Filter2 (r12) = vp8_signed_char_clamp(Filter2+3) - - mov r10, #0 - shadd8 r8 , r8 , r10 ; Filter1 >>= 3 - shadd8 r12 , r12 , r10 ; Filter2 >>= 3 - shadd8 r8 , r8 , r10 - shadd8 r12 , r12 , r10 - shadd8 r8 , r8 , r10 ; r8: Filter1 - shadd8 r12 , r12 , r10 ; r12: Filter2 - - ldr r9, [sp] ; load qs0 - ldr r11, [sp, #4] ; load ps0 - - qsub8 r9 , r9, r8 ; qs0 = vp8_signed_char_clamp(qs0 - Filter1) - qadd8 r11, r11, r12 ; ps0 = vp8_signed_char_clamp(ps0 + Filter2) - - ;save bottom 3 bits so that we round one side +4 and the other +3 - ;and r8, r12, r10 ; s = Filter2 & 7 (s: r8) - ;qadd8 r12 , r12 , r9 ; Filter2 = vp8_signed_char_clamp(Filter2+4) - ;mov r10, #0 - ;shadd8 r12 , r12 , r10 ; Filter2 >>= 3 - ;usub8 lr, r8, r9 ; s = (s==4)*-1 - ;sel lr, r11, r10 - ;shadd8 r12 , r12 , r10 - ;usub8 r8, r9, r8 - ;sel r8, r11, r10 - ;ldr r9, [sp] ; load qs0 - ;ldr r11, [sp, #4] ; load ps0 - ;shadd8 r12 , r12 , r10 - ;and r8, r8, lr ; -1 for each element that equals 4 - ;qadd8 r10, r8, r12 ; u = vp8_signed_char_clamp(s + Filter2) - ;qsub8 r9 , r9, r12 ; qs0 = vp8_signed_char_clamp(qs0 - Filter2) - ;qadd8 r11, r11, r10 ; ps0 = vp8_signed_char_clamp(ps0 + u) - - ;end of modification for vp8 - - bic r12, r7, r6 ;vp8_filter &= ~hev ( r6 is free) - ;mov r12, r7 - - ;roughly 3/7th difference across boundary - mov lr, #0x1b ; 27 - mov r7, #0x3f ; 63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r7, r10, lr, r7 - smultb r10, r10, lr - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - add r10, r10, #63 - ssat r7, #8, r7, asr #7 - ssat r10, #8, r10, asr #7 - - ldr lr, c0x80808080 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r7, r10, lsl #16 - uxtb16 r6, r6 - uxtb16 r10, r10 - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 27)>>7) - - qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs0 - u) - qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps0 + u) - eor r8, r8, lr ; *oq0 = s^0x80 - eor r10, r10, lr ; *op0 = s^0x80 - - strb r10, [src, #-1] ; store op0 result - strb r8, [src], pstep ; store oq0 result - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - strb r10, [src, #-1] - strb r8, [src], pstep - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - strb r10, [src, #-1] - strb r8, [src], pstep - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - strb r10, [src, #-1] - strb r8, [src], pstep - - ;roughly 2/7th difference across boundary - mov lr, #0x12 ; 18 - mov r7, #0x3f ; 63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r9, r10, lr, r7 - - smlatb r10, r10, lr, r7 - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - ssat r9, #8, r9, asr #7 - ssat r10, #8, r10, asr #7 - - sub src, src, pstep, lsl #2 ; move src pointer down by 4 lines - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r9, r10, lsl #16 - - ldr r9, [sp, #8] ; load qs1 - ldr r11, [sp, #12] ; load ps1 - ldr lr, c0x80808080 - - uxtb16 r6, r6 - uxtb16 r10, r10 - - add src, src, #2 - - orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 18)>>7) - - qsub8 r8, r9, r10 ; s = vp8_signed_char_clamp(qs1 - u) - qadd8 r10, r11, r10 ; s = vp8_signed_char_clamp(ps1 + u) - eor r8, r8, lr ; *oq1 = s^0x80 - eor r10, r10, lr ; *op1 = s^0x80 - - ldrb r11, [src, #-5] ; load p2 for 1/7th difference across boundary - strb r10, [src, #-4] ; store op1 - strb r8, [src, #-1] ; store oq1 - ldrb r9, [src], pstep ; load q2 for 1/7th difference across boundary - - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - - ldrb r6, [src, #-5] - strb r10, [src, #-4] - strb r8, [src, #-1] - ldrb r7, [src], pstep - - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - orr r11, r11, r6, lsl #8 - orr r9, r9, r7, lsl #8 - - ldrb r6, [src, #-5] - strb r10, [src, #-4] - strb r8, [src, #-1] - ldrb r7, [src], pstep - - mov r10, r10, lsr #8 - mov r8, r8, lsr #8 - orr r11, r11, r6, lsl #16 - orr r9, r9, r7, lsl #16 - - ldrb r6, [src, #-5] - strb r10, [src, #-4] - strb r8, [src, #-1] - ldrb r7, [src], pstep - orr r11, r11, r6, lsl #24 - orr r9, r9, r7, lsl #24 - - ;roughly 1/7th difference across boundary - eor r9, r9, lr - eor r11, r11, lr - - mov lr, #0x9 ; 9 - mov r7, #0x3f ; 63 - - sxtb16 r6, r12 - sxtb16 r10, r12, ror #8 - smlabb r8, r6, lr, r7 - smlatb r6, r6, lr, r7 - smlabb r12, r10, lr, r7 - smlatb r10, r10, lr, r7 - ssat r8, #8, r8, asr #7 - ssat r6, #8, r6, asr #7 - ssat r12, #8, r12, asr #7 - ssat r10, #8, r10, asr #7 - - sub src, src, pstep, lsl #2 - - pkhbt r6, r8, r6, lsl #16 - pkhbt r10, r12, r10, lsl #16 - - uxtb16 r6, r6 - uxtb16 r10, r10 - - ldr lr, c0x80808080 - - orr r10, r6, r10, lsl #8 ; u = vp8_signed_char_clamp((63 + Filter2 * 9)>>7) - - qadd8 r8, r11, r10 ; s = vp8_signed_char_clamp(ps2 + u) - qsub8 r10, r9, r10 ; s = vp8_signed_char_clamp(qs2 - u) - eor r8, r8, lr ; *op2 = s^0x80 - eor r10, r10, lr ; *oq2 = s^0x80 - - strb r8, [src, #-5] ; store *op2 - strb r10, [src], pstep ; store *oq2 - mov r8, r8, lsr #8 - mov r10, r10, lsr #8 - strb r8, [src, #-5] - strb r10, [src], pstep - mov r8, r8, lsr #8 - mov r10, r10, lsr #8 - strb r8, [src, #-5] - strb r10, [src], pstep - mov r8, r8, lsr #8 - mov r10, r10, lsr #8 - strb r8, [src, #-5] - strb r10, [src], pstep - - ;adjust src pointer for next loop - sub src, src, #2 - -|mbvskip_filter| - sub src, src, #4 - subs count, count, #1 - - pld [src, #23] ; preload for next block - ldrne r6, [src], pstep ; load source data - pld [src, #23] - ldrne r7, [src], pstep - pld [src, #23] - ldrne r8, [src], pstep - pld [src, #23] - ldrne lr, [src], pstep - - bne MBVnext8 - - add sp, sp, #16 - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_mbloop_filter_vertical_edge_armv6| - -; Constant Pool -c0x80808080 DCD 0x80808080 -c0x03030303 DCD 0x03030303 -c0x04040404 DCD 0x04040404 -c0x01010101 DCD 0x01010101 -c0x7F7F7F7F DCD 0x7F7F7F7F - - END diff --git a/vp8/common/arm/armv6/simpleloopfilter_v6.asm b/vp8/common/arm/armv6/simpleloopfilter_v6.asm deleted file mode 100644 index 5e00cf01b..000000000 --- a/vp8/common/arm/armv6/simpleloopfilter_v6.asm +++ /dev/null @@ -1,286 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_loop_filter_simple_horizontal_edge_armv6| - EXPORT |vp8_loop_filter_simple_vertical_edge_armv6| - - AREA |.text|, CODE, READONLY ; name this block of code - - MACRO - TRANSPOSE_MATRIX $a0, $a1, $a2, $a3, $b0, $b1, $b2, $b3 - ; input: $a0, $a1, $a2, $a3; output: $b0, $b1, $b2, $b3 - ; a0: 03 02 01 00 - ; a1: 13 12 11 10 - ; a2: 23 22 21 20 - ; a3: 33 32 31 30 - ; b3 b2 b1 b0 - - uxtb16 $b1, $a1 ; xx 12 xx 10 - uxtb16 $b0, $a0 ; xx 02 xx 00 - uxtb16 $b3, $a3 ; xx 32 xx 30 - uxtb16 $b2, $a2 ; xx 22 xx 20 - orr $b1, $b0, $b1, lsl #8 ; 12 02 10 00 - orr $b3, $b2, $b3, lsl #8 ; 32 22 30 20 - - uxtb16 $a1, $a1, ror #8 ; xx 13 xx 11 - uxtb16 $a3, $a3, ror #8 ; xx 33 xx 31 - uxtb16 $a0, $a0, ror #8 ; xx 03 xx 01 - uxtb16 $a2, $a2, ror #8 ; xx 23 xx 21 - orr $a0, $a0, $a1, lsl #8 ; 13 03 11 01 - orr $a2, $a2, $a3, lsl #8 ; 33 23 31 21 - - pkhtb $b2, $b3, $b1, asr #16 ; 32 22 12 02 -- p1 - pkhbt $b0, $b1, $b3, lsl #16 ; 30 20 10 00 -- p3 - - pkhtb $b3, $a2, $a0, asr #16 ; 33 23 13 03 -- p0 - pkhbt $b1, $a0, $a2, lsl #16 ; 31 21 11 01 -- p2 - MEND - - - -src RN r0 -pstep RN r1 - -;r0 unsigned char *src_ptr, -;r1 int src_pixel_step, -;r2 const char *blimit - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp8_loop_filter_simple_horizontal_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - ldrb r12, [r2] ; blimit - ldr r3, [src, -pstep, lsl #1] ; p1 - ldr r4, [src, -pstep] ; p0 - ldr r5, [src] ; q0 - ldr r6, [src, pstep] ; q1 - orr r12, r12, r12, lsl #8 ; blimit - ldr r2, c0x80808080 - orr r12, r12, r12, lsl #16 ; blimit - mov r9, #4 ; double the count. we're doing 4 at a time - mov lr, #0 ; need 0 in a couple places - -|simple_hnext8| - ; vp8_simple_filter_mask() - - uqsub8 r7, r3, r6 ; p1 - q1 - uqsub8 r8, r6, r3 ; q1 - p1 - uqsub8 r10, r4, r5 ; p0 - q0 - uqsub8 r11, r5, r4 ; q0 - p0 - orr r8, r8, r7 ; abs(p1 - q1) - orr r10, r10, r11 ; abs(p0 - q0) - uqadd8 r10, r10, r10 ; abs(p0 - q0) * 2 - uhadd8 r8, r8, lr ; abs(p1 - q2) >> 1 - uqadd8 r10, r10, r8 ; abs(p0 - q0)*2 + abs(p1 - q1)/2 - mvn r8, #0 - usub8 r10, r12, r10 ; compare to flimit. usub8 sets GE flags - sel r10, r8, lr ; filter mask: F or 0 - cmp r10, #0 - beq simple_hskip_filter ; skip filtering if all masks are 0x00 - - ;vp8_simple_filter() - - eor r3, r3, r2 ; p1 offset to convert to a signed value - eor r6, r6, r2 ; q1 offset to convert to a signed value - eor r4, r4, r2 ; p0 offset to convert to a signed value - eor r5, r5, r2 ; q0 offset to convert to a signed value - - qsub8 r3, r3, r6 ; vp8_filter = p1 - q1 - qsub8 r6, r5, r4 ; q0 - p0 - qadd8 r3, r3, r6 ; += q0 - p0 - ldr r7, c0x04040404 - qadd8 r3, r3, r6 ; += q0 - p0 - ldr r8, c0x03030303 - qadd8 r3, r3, r6 ; vp8_filter = p1-q1 + 3*(q0-p0)) - ;STALL - and r3, r3, r10 ; vp8_filter &= mask - - qadd8 r7 , r3 , r7 ; Filter1 = vp8_filter + 4 - qadd8 r8 , r3 , r8 ; Filter2 = vp8_filter + 3 - - shadd8 r7 , r7 , lr - shadd8 r8 , r8 , lr - shadd8 r7 , r7 , lr - shadd8 r8 , r8 , lr - shadd8 r7 , r7 , lr ; Filter1 >>= 3 - shadd8 r8 , r8 , lr ; Filter2 >>= 3 - - qsub8 r5 ,r5, r7 ; u = q0 - Filter1 - qadd8 r4, r4, r8 ; u = p0 + Filter2 - eor r5, r5, r2 ; *oq0 = u^0x80 - str r5, [src] ; store oq0 result - eor r4, r4, r2 ; *op0 = u^0x80 - str r4, [src, -pstep] ; store op0 result - -|simple_hskip_filter| - subs r9, r9, #1 - addne src, src, #4 ; next row - - ldrne r3, [src, -pstep, lsl #1] ; p1 - ldrne r4, [src, -pstep] ; p0 - ldrne r5, [src] ; q0 - ldrne r6, [src, pstep] ; q1 - - bne simple_hnext8 - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_loop_filter_simple_horizontal_edge_armv6| - - -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- -|vp8_loop_filter_simple_vertical_edge_armv6| PROC -;-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- - stmdb sp!, {r4 - r11, lr} - - ldrb r12, [r2] ; r12: blimit - ldr r2, c0x80808080 - orr r12, r12, r12, lsl #8 - - ; load soure data to r7, r8, r9, r10 - ldrh r3, [src, #-2] - pld [src, #23] ; preload for next block - ldrh r4, [src], pstep - orr r12, r12, r12, lsl #16 - - ldrh r5, [src, #-2] - pld [src, #23] - ldrh r6, [src], pstep - - pkhbt r7, r3, r4, lsl #16 - - ldrh r3, [src, #-2] - pld [src, #23] - ldrh r4, [src], pstep - - pkhbt r8, r5, r6, lsl #16 - - ldrh r5, [src, #-2] - pld [src, #23] - ldrh r6, [src], pstep - mov r11, #4 ; double the count. we're doing 4 at a time - -|simple_vnext8| - ; vp8_simple_filter_mask() function - pkhbt r9, r3, r4, lsl #16 - pkhbt r10, r5, r6, lsl #16 - - ;transpose r7, r8, r9, r10 to r3, r4, r5, r6 - TRANSPOSE_MATRIX r7, r8, r9, r10, r3, r4, r5, r6 - - uqsub8 r7, r3, r6 ; p1 - q1 - uqsub8 r8, r6, r3 ; q1 - p1 - uqsub8 r9, r4, r5 ; p0 - q0 - uqsub8 r10, r5, r4 ; q0 - p0 - orr r7, r7, r8 ; abs(p1 - q1) - orr r9, r9, r10 ; abs(p0 - q0) - mov r8, #0 - uqadd8 r9, r9, r9 ; abs(p0 - q0) * 2 - uhadd8 r7, r7, r8 ; abs(p1 - q1) / 2 - uqadd8 r7, r7, r9 ; abs(p0 - q0)*2 + abs(p1 - q1)/2 - mvn r10, #0 ; r10 == -1 - - usub8 r7, r12, r7 ; compare to flimit - sel lr, r10, r8 ; filter mask - - cmp lr, #0 - beq simple_vskip_filter ; skip filtering - - ;vp8_simple_filter() function - eor r3, r3, r2 ; p1 offset to convert to a signed value - eor r6, r6, r2 ; q1 offset to convert to a signed value - eor r4, r4, r2 ; p0 offset to convert to a signed value - eor r5, r5, r2 ; q0 offset to convert to a signed value - - qsub8 r3, r3, r6 ; vp8_filter = p1 - q1 - qsub8 r6, r5, r4 ; q0 - p0 - - qadd8 r3, r3, r6 ; vp8_filter += q0 - p0 - ldr r9, c0x03030303 ; r9 = 3 - - qadd8 r3, r3, r6 ; vp8_filter += q0 - p0 - ldr r7, c0x04040404 - - qadd8 r3, r3, r6 ; vp8_filter = p1-q1 + 3*(q0-p0)) - ;STALL - and r3, r3, lr ; vp8_filter &= mask - - qadd8 r9 , r3 , r9 ; Filter2 = vp8_filter + 3 - qadd8 r3 , r3 , r7 ; Filter1 = vp8_filter + 4 - - shadd8 r9 , r9 , r8 - shadd8 r3 , r3 , r8 - shadd8 r9 , r9 , r8 - shadd8 r3 , r3 , r8 - shadd8 r9 , r9 , r8 ; Filter2 >>= 3 - shadd8 r3 , r3 , r8 ; Filter1 >>= 3 - - ;calculate output - sub src, src, pstep, lsl #2 - - qadd8 r4, r4, r9 ; u = p0 + Filter2 - qsub8 r5, r5, r3 ; u = q0 - Filter1 - eor r4, r4, r2 ; *op0 = u^0x80 - eor r5, r5, r2 ; *oq0 = u^0x80 - - strb r4, [src, #-1] ; store the result - mov r4, r4, lsr #8 - strb r5, [src], pstep - mov r5, r5, lsr #8 - - strb r4, [src, #-1] - mov r4, r4, lsr #8 - strb r5, [src], pstep - mov r5, r5, lsr #8 - - strb r4, [src, #-1] - mov r4, r4, lsr #8 - strb r5, [src], pstep - mov r5, r5, lsr #8 - - strb r4, [src, #-1] - strb r5, [src], pstep - -|simple_vskip_filter| - subs r11, r11, #1 - - ; load soure data to r7, r8, r9, r10 - ldrneh r3, [src, #-2] - pld [src, #23] ; preload for next block - ldrneh r4, [src], pstep - - ldrneh r5, [src, #-2] - pld [src, #23] - ldrneh r6, [src], pstep - - pkhbt r7, r3, r4, lsl #16 - - ldrneh r3, [src, #-2] - pld [src, #23] - ldrneh r4, [src], pstep - - pkhbt r8, r5, r6, lsl #16 - - ldrneh r5, [src, #-2] - pld [src, #23] - ldrneh r6, [src], pstep - - bne simple_vnext8 - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_loop_filter_simple_vertical_edge_armv6| - -; Constant Pool -c0x80808080 DCD 0x80808080 -c0x03030303 DCD 0x03030303 -c0x04040404 DCD 0x04040404 - - END diff --git a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm b/vp8/common/arm/armv6/sixtappredict8x4_v6.asm deleted file mode 100644 index e81aef53d..000000000 --- a/vp8/common/arm/armv6/sixtappredict8x4_v6.asm +++ /dev/null @@ -1,273 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vp8_sixtap_predict8x4_armv6| - - AREA |.text|, CODE, READONLY ; name this block of code -;------------------------------------- -; r0 unsigned char *src_ptr, -; r1 int src_pixels_per_line, -; r2 int xoffset, -; r3 int yoffset, -; stack unsigned char *dst_ptr, -; stack int dst_pitch -;------------------------------------- -;note: In first pass, store the result in transpose(8linesx9columns) on stack. Temporary stack size is 184. -;Line width is 20 that is 9 short data plus 2 to make it 4bytes aligned. In second pass, load data from stack, -;and the result is stored in transpose. -|vp8_sixtap_predict8x4_armv6| PROC - stmdb sp!, {r4 - r11, lr} - str r3, [sp, #-184]! ;reserve space on stack for temporary storage, store yoffset - - cmp r2, #0 ;skip first_pass filter if xoffset=0 - add lr, sp, #4 ;point to temporary buffer - beq skip_firstpass_filter - -;first-pass filter - adr r12, filter8_coeff - sub r0, r0, r1, lsl #1 - - add r3, r1, #10 ; preload next low - pld [r0, r3] - - add r2, r12, r2, lsl #4 ;calculate filter location - add r0, r0, #3 ;adjust src only for loading convinience - - ldr r3, [r2] ; load up packed filter coefficients - ldr r4, [r2, #4] - ldr r5, [r2, #8] - - mov r2, #0x90000 ; height=9 is top part of counter - - sub r1, r1, #8 - -|first_pass_hloop_v6| - ldrb r6, [r0, #-5] ; load source data - ldrb r7, [r0, #-4] - ldrb r8, [r0, #-3] - ldrb r9, [r0, #-2] - ldrb r10, [r0, #-1] - - orr r2, r2, #0x4 ; construct loop counter. width=8=4x2 - - pkhbt r6, r6, r7, lsl #16 ; r7 | r6 - pkhbt r7, r7, r8, lsl #16 ; r8 | r7 - - pkhbt r8, r8, r9, lsl #16 ; r9 | r8 - pkhbt r9, r9, r10, lsl #16 ; r10 | r9 - -|first_pass_wloop_v6| - smuad r11, r6, r3 ; vp8_filter[0], vp8_filter[1] - smuad r12, r7, r3 - - ldrb r6, [r0], #1 - - smlad r11, r8, r4, r11 ; vp8_filter[2], vp8_filter[3] - ldrb r7, [r0], #1 - smlad r12, r9, r4, r12 - - pkhbt r10, r10, r6, lsl #16 ; r10 | r9 - pkhbt r6, r6, r7, lsl #16 ; r11 | r10 - smlad r11, r10, r5, r11 ; vp8_filter[4], vp8_filter[5] - smlad r12, r6, r5, r12 - - sub r2, r2, #1 - - add r11, r11, #0x40 ; round_shift_and_clamp - tst r2, #0xff ; test loop counter - usat r11, #8, r11, asr #7 - add r12, r12, #0x40 - strh r11, [lr], #20 ; result is transposed and stored, which - usat r12, #8, r12, asr #7 - - strh r12, [lr], #20 - - movne r11, r6 - movne r12, r7 - - movne r6, r8 - movne r7, r9 - movne r8, r10 - movne r9, r11 - movne r10, r12 - - bne first_pass_wloop_v6 - - ;;add r9, ppl, #30 ; attempt to load 2 adjacent cache lines - ;;IF ARCHITECTURE=6 - ;pld [src, ppl] - ;;pld [src, r9] - ;;ENDIF - - subs r2, r2, #0x10000 - - sub lr, lr, #158 - - add r0, r0, r1 ; move to next input line - - add r11, r1, #18 ; preload next low. adding back block width(=8), which is subtracted earlier - pld [r0, r11] - - bne first_pass_hloop_v6 - -;second pass filter -secondpass_filter - ldr r3, [sp], #4 ; load back yoffset - ldr r0, [sp, #216] ; load dst address from stack 180+36 - ldr r1, [sp, #220] ; load dst stride from stack 180+40 - - cmp r3, #0 - beq skip_secondpass_filter - - adr r12, filter8_coeff - add lr, r12, r3, lsl #4 ;calculate filter location - - mov r2, #0x00080000 - - ldr r3, [lr] ; load up packed filter coefficients - ldr r4, [lr, #4] - ldr r5, [lr, #8] - - pkhbt r12, r4, r3 ; pack the filter differently - pkhbt r11, r5, r4 - -second_pass_hloop_v6 - ldr r6, [sp] ; load the data - ldr r7, [sp, #4] - - orr r2, r2, #2 ; loop counter - -second_pass_wloop_v6 - smuad lr, r3, r6 ; apply filter - smulbt r10, r3, r6 - - ldr r8, [sp, #8] - - smlad lr, r4, r7, lr - smladx r10, r12, r7, r10 - - ldrh r9, [sp, #12] - - smlad lr, r5, r8, lr - smladx r10, r11, r8, r10 - - add sp, sp, #4 - smlatb r10, r5, r9, r10 - - sub r2, r2, #1 - - add lr, lr, #0x40 ; round_shift_and_clamp - tst r2, #0xff - usat lr, #8, lr, asr #7 - add r10, r10, #0x40 - strb lr, [r0], r1 ; the result is transposed back and stored - usat r10, #8, r10, asr #7 - - strb r10, [r0],r1 - - movne r6, r7 - movne r7, r8 - - bne second_pass_wloop_v6 - - subs r2, r2, #0x10000 - add sp, sp, #12 ; updata src for next loop (20-8) - sub r0, r0, r1, lsl #2 - add r0, r0, #1 - - bne second_pass_hloop_v6 - - add sp, sp, #20 - ldmia sp!, {r4 - r11, pc} - -;-------------------- -skip_firstpass_filter - sub r0, r0, r1, lsl #1 - sub r1, r1, #8 - mov r2, #9 - -skip_firstpass_hloop - ldrb r4, [r0], #1 ; load data - subs r2, r2, #1 - ldrb r5, [r0], #1 - strh r4, [lr], #20 ; store it to immediate buffer - ldrb r6, [r0], #1 ; load data - strh r5, [lr], #20 - ldrb r7, [r0], #1 - strh r6, [lr], #20 - ldrb r8, [r0], #1 - strh r7, [lr], #20 - ldrb r9, [r0], #1 - strh r8, [lr], #20 - ldrb r10, [r0], #1 - strh r9, [lr], #20 - ldrb r11, [r0], #1 - strh r10, [lr], #20 - add r0, r0, r1 ; move to next input line - strh r11, [lr], #20 - - sub lr, lr, #158 ; move over to next column - bne skip_firstpass_hloop - - b secondpass_filter - -;-------------------- -skip_secondpass_filter - mov r2, #8 - add sp, sp, #4 ;start from src[0] instead of src[-2] - -skip_secondpass_hloop - ldr r6, [sp], #4 - subs r2, r2, #1 - ldr r8, [sp], #4 - - mov r7, r6, lsr #16 ; unpack - strb r6, [r0], r1 - mov r9, r8, lsr #16 - strb r7, [r0], r1 - add sp, sp, #12 ; 20-8 - strb r8, [r0], r1 - strb r9, [r0], r1 - - sub r0, r0, r1, lsl #2 - add r0, r0, #1 - - bne skip_secondpass_hloop - - add sp, sp, #16 ; 180 - (160 +4) - - ldmia sp!, {r4 - r11, pc} - - ENDP - -;----------------- -;One word each is reserved. Label filter_coeff can be used to access the data. -;Data address: filter_coeff, filter_coeff+4, filter_coeff+8 ... -filter8_coeff - DCD 0x00000000, 0x00000080, 0x00000000, 0x00000000 - DCD 0xfffa0000, 0x000c007b, 0x0000ffff, 0x00000000 - DCD 0xfff50002, 0x0024006c, 0x0001fff8, 0x00000000 - DCD 0xfff70000, 0x0032005d, 0x0000fffa, 0x00000000 - DCD 0xfff00003, 0x004d004d, 0x0003fff0, 0x00000000 - DCD 0xfffa0000, 0x005d0032, 0x0000fff7, 0x00000000 - DCD 0xfff80001, 0x006c0024, 0x0002fff5, 0x00000000 - DCD 0xffff0000, 0x007b000c, 0x0000fffa, 0x00000000 - - ;DCD 0, 0, 128, 0, 0, 0 - ;DCD 0, -6, 123, 12, -1, 0 - ;DCD 2, -11, 108, 36, -8, 1 - ;DCD 0, -9, 93, 50, -6, 0 - ;DCD 3, -16, 77, 77, -16, 3 - ;DCD 0, -6, 50, 93, -9, 0 - ;DCD 1, -8, 36, 108, -11, 2 - ;DCD 0, -1, 12, 123, -6, 0 - - END diff --git a/vp8/common/arm/bilinearfilter_arm.c b/vp8/common/arm/bilinearfilter_arm.c deleted file mode 100644 index d02a8749b..000000000 --- a/vp8/common/arm/bilinearfilter_arm.c +++ /dev/null @@ -1,87 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_config.h" -#include "vp8_rtcd.h" -#include <math.h> -#include "vp8/common/filter.h" -#include "bilinearfilter_arm.h" - -void vp8_filter_block2d_bil_armv6(unsigned char *src_ptr, - unsigned char *dst_ptr, - unsigned int src_pitch, - unsigned int dst_pitch, const short *HFilter, - const short *VFilter, int Width, int Height) { - unsigned short FData[36 * 16]; /* Temp data buffer used in filtering */ - - /* First filter 1-D horizontally... */ - vp8_filter_block2d_bil_first_pass_armv6(src_ptr, FData, src_pitch, Height + 1, - Width, HFilter); - - /* then 1-D vertically... */ - vp8_filter_block2d_bil_second_pass_armv6(FData, dst_ptr, dst_pitch, Height, - Width, VFilter); -} - -void vp8_bilinear_predict4x4_armv6(unsigned char *src_ptr, - int src_pixels_per_line, int xoffset, - int yoffset, unsigned char *dst_ptr, - int dst_pitch) { - const short *HFilter; - const short *VFilter; - - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, - HFilter, VFilter, 4, 4); -} - -void vp8_bilinear_predict8x8_armv6(unsigned char *src_ptr, - int src_pixels_per_line, int xoffset, - int yoffset, unsigned char *dst_ptr, - int dst_pitch) { - const short *HFilter; - const short *VFilter; - - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, - HFilter, VFilter, 8, 8); -} - -void vp8_bilinear_predict8x4_armv6(unsigned char *src_ptr, - int src_pixels_per_line, int xoffset, - int yoffset, unsigned char *dst_ptr, - int dst_pitch) { - const short *HFilter; - const short *VFilter; - - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, - HFilter, VFilter, 8, 4); -} - -void vp8_bilinear_predict16x16_armv6(unsigned char *src_ptr, - int src_pixels_per_line, int xoffset, - int yoffset, unsigned char *dst_ptr, - int dst_pitch) { - const short *HFilter; - const short *VFilter; - - HFilter = vp8_bilinear_filters[xoffset]; - VFilter = vp8_bilinear_filters[yoffset]; - - vp8_filter_block2d_bil_armv6(src_ptr, dst_ptr, src_pixels_per_line, dst_pitch, - HFilter, VFilter, 16, 16); -} diff --git a/vp8/common/arm/bilinearfilter_arm.h b/vp8/common/arm/bilinearfilter_arm.h deleted file mode 100644 index c1c70a362..000000000 --- a/vp8/common/arm/bilinearfilter_arm.h +++ /dev/null @@ -1,31 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#ifndef VP8_COMMON_ARM_BILINEARFILTER_ARM_H_ -#define VP8_COMMON_ARM_BILINEARFILTER_ARM_H_ - -#ifdef __cplusplus -extern "C" { -#endif - -extern void vp8_filter_block2d_bil_first_pass_armv6( - const unsigned char *src_ptr, unsigned short *dst_ptr, - unsigned int src_pitch, unsigned int height, unsigned int width, - const short *vp8_filter); - -extern void vp8_filter_block2d_bil_second_pass_armv6( - const unsigned short *src_ptr, unsigned char *dst_ptr, int dst_pitch, - unsigned int height, unsigned int width, const short *vp8_filter); - -#ifdef __cplusplus -} // extern "C" -#endif - -#endif // VP8_COMMON_ARM_BILINEARFILTER_ARM_H_ diff --git a/vp8/common/arm/dequantize_arm.c b/vp8/common/arm/dequantize_arm.c deleted file mode 100644 index 3b6b1820e..000000000 --- a/vp8/common/arm/dequantize_arm.c +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_config.h" -#include "vp8/common/blockd.h" - -#if HAVE_MEDIA -extern void vp8_dequantize_b_loop_v6(short *Q, short *DQC, short *DQ); - -void vp8_dequantize_b_v6(BLOCKD *d, short *DQC) { - short *DQ = d->dqcoeff; - short *Q = d->qcoeff; - - vp8_dequantize_b_loop_v6(Q, DQC, DQ); -} -#endif diff --git a/vp8/common/arm/filter_arm.c b/vp8/common/arm/filter_arm.c deleted file mode 100644 index 6d547d686..000000000 --- a/vp8/common/arm/filter_arm.c +++ /dev/null @@ -1,176 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_config.h" -#include "vp8_rtcd.h" -#include <math.h> -#include "vp8/common/filter.h" -#include "vpx_ports/mem.h" - -extern void vp8_filter_block2d_first_pass_armv6( - unsigned char *src_ptr, short *output_ptr, unsigned int src_pixels_per_line, - unsigned int output_width, unsigned int output_height, - const short *vp8_filter); - -// 8x8 -extern void vp8_filter_block2d_first_pass_8x8_armv6( - unsigned char *src_ptr, short *output_ptr, unsigned int src_pixels_per_line, - unsigned int output_width, unsigned int output_height, - const short *vp8_filter); - -// 16x16 -extern void vp8_filter_block2d_first_pass_16x16_armv6( - unsigned char *src_ptr, short *output_ptr, unsigned int src_pixels_per_line, - unsigned int output_width, unsigned int output_height, - const short *vp8_filter); - -extern void vp8_filter_block2d_second_pass_armv6(short *src_ptr, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int cnt, - const short *vp8_filter); - -extern void vp8_filter4_block2d_second_pass_armv6(short *src_ptr, - unsigned char *output_ptr, - unsigned int output_pitch, - unsigned int cnt, - const short *vp8_filter); - -extern void vp8_filter_block2d_first_pass_only_armv6( - unsigned char *src_ptr, unsigned char *output_ptr, - unsigned int src_pixels_per_line, unsigned int cnt, - unsigned int output_pitch, const short *vp8_filter); - -extern void vp8_filter_block2d_second_pass_only_armv6( - unsigned char *src_ptr, unsigned char *output_ptr, - unsigned int src_pixels_per_line, unsigned int cnt, - unsigned int output_pitch, const short *vp8_filter); - -#if HAVE_MEDIA -void vp8_sixtap_predict4x4_armv6(unsigned char *src_ptr, - int src_pixels_per_line, int xoffset, - int yoffset, unsigned char *dst_ptr, - int dst_pitch) { - const short *HFilter; - const short *VFilter; - DECLARE_ALIGNED(4, short, - FData[12 * 4]); /* Temp data buffer used in filtering */ - - HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ - VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ - - /* Vfilter is null. First pass only */ - if (xoffset && !yoffset) { - /*vp8_filter_block2d_first_pass_armv6 ( src_ptr, FData+2, - src_pixels_per_line, 4, 4, HFilter ); - vp8_filter_block2d_second_pass_armv6 ( FData+2, dst_ptr, dst_pitch, 4, - VFilter );*/ - - vp8_filter_block2d_first_pass_only_armv6( - src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, HFilter); - } - /* Hfilter is null. Second pass only */ - else if (!xoffset && yoffset) { - vp8_filter_block2d_second_pass_only_armv6( - src_ptr, dst_ptr, src_pixels_per_line, 4, dst_pitch, VFilter); - } else { - /* Vfilter is a 4 tap filter */ - if (yoffset & 0x1) { - vp8_filter_block2d_first_pass_armv6(src_ptr - src_pixels_per_line, - FData + 1, src_pixels_per_line, 4, 7, - HFilter); - vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, - VFilter); - } - /* Vfilter is 6 tap filter */ - else { - vp8_filter_block2d_first_pass_armv6(src_ptr - (2 * src_pixels_per_line), - FData, src_pixels_per_line, 4, 9, - HFilter); - vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 4, - VFilter); - } - } -} - -void vp8_sixtap_predict8x8_armv6(unsigned char *src_ptr, - int src_pixels_per_line, int xoffset, - int yoffset, unsigned char *dst_ptr, - int dst_pitch) { - const short *HFilter; - const short *VFilter; - DECLARE_ALIGNED(4, short, - FData[16 * 8]); /* Temp data buffer used in filtering */ - - HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ - VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ - - if (xoffset && !yoffset) { - vp8_filter_block2d_first_pass_only_armv6( - src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, HFilter); - } - /* Hfilter is null. Second pass only */ - else if (!xoffset && yoffset) { - vp8_filter_block2d_second_pass_only_armv6( - src_ptr, dst_ptr, src_pixels_per_line, 8, dst_pitch, VFilter); - } else { - if (yoffset & 0x1) { - vp8_filter_block2d_first_pass_8x8_armv6(src_ptr - src_pixels_per_line, - FData + 1, src_pixels_per_line, 8, - 11, HFilter); - vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, - VFilter); - } else { - vp8_filter_block2d_first_pass_8x8_armv6( - src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 8, - 13, HFilter); - vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 8, - VFilter); - } - } -} - -void vp8_sixtap_predict16x16_armv6(unsigned char *src_ptr, - int src_pixels_per_line, int xoffset, - int yoffset, unsigned char *dst_ptr, - int dst_pitch) { - const short *HFilter; - const short *VFilter; - DECLARE_ALIGNED(4, short, - FData[24 * 16]); /* Temp data buffer used in filtering */ - - HFilter = vp8_sub_pel_filters[xoffset]; /* 6 tap */ - VFilter = vp8_sub_pel_filters[yoffset]; /* 6 tap */ - - if (xoffset && !yoffset) { - vp8_filter_block2d_first_pass_only_armv6( - src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, HFilter); - } - /* Hfilter is null. Second pass only */ - else if (!xoffset && yoffset) { - vp8_filter_block2d_second_pass_only_armv6( - src_ptr, dst_ptr, src_pixels_per_line, 16, dst_pitch, VFilter); - } else { - if (yoffset & 0x1) { - vp8_filter_block2d_first_pass_16x16_armv6(src_ptr - src_pixels_per_line, - FData + 1, src_pixels_per_line, - 16, 19, HFilter); - vp8_filter4_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, - VFilter); - } else { - vp8_filter_block2d_first_pass_16x16_armv6( - src_ptr - (2 * src_pixels_per_line), FData, src_pixels_per_line, 16, - 21, HFilter); - vp8_filter_block2d_second_pass_armv6(FData + 2, dst_ptr, dst_pitch, 16, - VFilter); - } - } -} -#endif diff --git a/vp8/common/arm/loopfilter_arm.c b/vp8/common/arm/loopfilter_arm.c index 36fdc8a14..e12f65a04 100644 --- a/vp8/common/arm/loopfilter_arm.c +++ b/vp8/common/arm/loopfilter_arm.c @@ -13,18 +13,6 @@ #include "vp8/common/loopfilter.h" #include "vp8/common/onyxc_int.h" -#define prototype_loopfilter(sym) \ - void sym(unsigned char *src, int pitch, const unsigned char *blimit, \ - const unsigned char *limit, const unsigned char *thresh, int count) - -#if HAVE_MEDIA -extern prototype_loopfilter(vp8_loop_filter_horizontal_edge_armv6); -extern prototype_loopfilter(vp8_loop_filter_vertical_edge_armv6); -extern prototype_loopfilter(vp8_mbloop_filter_horizontal_edge_armv6); -extern prototype_loopfilter(vp8_mbloop_filter_vertical_edge_armv6); -#endif - -#if HAVE_NEON typedef void loopfilter_y_neon(unsigned char *src, int pitch, unsigned char blimit, unsigned char limit, unsigned char thresh); @@ -41,101 +29,7 @@ extern loopfilter_y_neon vp8_mbloop_filter_horizontal_edge_y_neon; extern loopfilter_y_neon vp8_mbloop_filter_vertical_edge_y_neon; extern loopfilter_uv_neon vp8_mbloop_filter_horizontal_edge_uv_neon; extern loopfilter_uv_neon vp8_mbloop_filter_vertical_edge_uv_neon; -#endif - -#if HAVE_MEDIA -/* ARMV6/MEDIA loopfilter functions*/ -/* Horizontal MB filtering */ -void vp8_loop_filter_mbh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, - int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_horizontal_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); - - if (u_ptr) - vp8_mbloop_filter_horizontal_edge_armv6(u_ptr, uv_stride, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp8_mbloop_filter_horizontal_edge_armv6(v_ptr, uv_stride, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); -} - -/* Vertical MB Filtering */ -void vp8_loop_filter_mbv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, - int uv_stride, loop_filter_info *lfi) { - vp8_mbloop_filter_vertical_edge_armv6(y_ptr, y_stride, lfi->mblim, lfi->lim, - lfi->hev_thr, 2); - - if (u_ptr) - vp8_mbloop_filter_vertical_edge_armv6(u_ptr, uv_stride, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp8_mbloop_filter_vertical_edge_armv6(v_ptr, uv_stride, lfi->mblim, - lfi->lim, lfi->hev_thr, 1); -} - -/* Horizontal B Filtering */ -void vp8_loop_filter_bh_armv6(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, int uv_stride, - loop_filter_info *lfi) { - vp8_loop_filter_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 2); - vp8_loop_filter_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 2); - - if (u_ptr) - vp8_loop_filter_horizontal_edge_armv6(u_ptr + 4 * uv_stride, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp8_loop_filter_horizontal_edge_armv6(v_ptr + 4 * uv_stride, uv_stride, - lfi->blim, lfi->lim, lfi->hev_thr, 1); -} - -void vp8_loop_filter_bhs_armv6(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 4 * y_stride, y_stride, - blimit); - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 8 * y_stride, y_stride, - blimit); - vp8_loop_filter_simple_horizontal_edge_armv6(y_ptr + 12 * y_stride, y_stride, - blimit); -} - -/* Vertical B Filtering */ -void vp8_loop_filter_bv_armv6(unsigned char *y_ptr, unsigned char *u_ptr, - unsigned char *v_ptr, int y_stride, int uv_stride, - loop_filter_info *lfi) { - vp8_loop_filter_vertical_edge_armv6(y_ptr + 4, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); - vp8_loop_filter_vertical_edge_armv6(y_ptr + 8, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); - vp8_loop_filter_vertical_edge_armv6(y_ptr + 12, y_stride, lfi->blim, lfi->lim, - lfi->hev_thr, 2); - - if (u_ptr) - vp8_loop_filter_vertical_edge_armv6(u_ptr + 4, uv_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 1); - - if (v_ptr) - vp8_loop_filter_vertical_edge_armv6(v_ptr + 4, uv_stride, lfi->blim, - lfi->lim, lfi->hev_thr, 1); -} - -void vp8_loop_filter_bvs_armv6(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 4, y_stride, blimit); - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 8, y_stride, blimit); - vp8_loop_filter_simple_vertical_edge_armv6(y_ptr + 12, y_stride, blimit); -} -#endif -#if HAVE_NEON /* NEON loopfilter functions */ /* Horizontal MB filtering */ void vp8_loop_filter_mbh_neon(unsigned char *y_ptr, unsigned char *u_ptr, @@ -205,4 +99,3 @@ void vp8_loop_filter_bv_neon(unsigned char *y_ptr, unsigned char *u_ptr, vp8_loop_filter_vertical_edge_uv_neon(u_ptr + 4, uv_stride, blim, lim, hev_thr, v_ptr + 4); } -#endif diff --git a/vp8/common/reconintra4x4.c b/vp8/common/reconintra4x4.c index ef1afa8cb..07c922333 100644 --- a/vp8/common/reconintra4x4.c +++ b/vp8/common/reconintra4x4.c @@ -15,6 +15,8 @@ #include "vp8_rtcd.h" #include "blockd.h" #include "reconintra4x4.h" +#include "vp8/common/common.h" +#include "vpx_ports/mem.h" typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left); @@ -38,8 +40,19 @@ void vp8_intra4x4_predict(unsigned char *above, unsigned char *yleft, int left_stride, B_PREDICTION_MODE b_mode, unsigned char *dst, int dst_stride, unsigned char top_left) { - unsigned char Left[4]; unsigned char Aboveb[12], *Above = Aboveb + 4; +#if HAVE_NEON + // Neon intrinsics are unable to load 32 bits, or 4 8 bit values. Instead, it + // over reads but does not use the extra 4 values. + unsigned char Left[8]; +#if VPX_WITH_ASAN + // Silence an 'uninitialized read' warning. Although uninitialized values are + // indeed read, they are not used. + vp8_zero_array(Left, 8); +#endif // VPX_WITH_ASAN +#else + unsigned char Left[4]; +#endif // HAVE_NEON Left[0] = yleft[0]; Left[1] = yleft[left_stride]; diff --git a/vp8/common/rtcd_defs.pl b/vp8/common/rtcd_defs.pl index a440352f4..b58f8e7af 100644 --- a/vp8/common/rtcd_defs.pl +++ b/vp8/common/rtcd_defs.pl @@ -29,81 +29,69 @@ $vp8_clear_system_state_mmx=vpx_reset_mmx_state; # Dequant # add_proto qw/void vp8_dequantize_b/, "struct blockd*, short *dqc"; -specialize qw/vp8_dequantize_b mmx media neon msa/; -$vp8_dequantize_b_media=vp8_dequantize_b_v6; +specialize qw/vp8_dequantize_b mmx neon msa/; add_proto qw/void vp8_dequant_idct_add/, "short *input, short *dq, unsigned char *output, int stride"; -specialize qw/vp8_dequant_idct_add mmx media neon dspr2 msa/; -$vp8_dequant_idct_add_media=vp8_dequant_idct_add_v6; +specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa/; $vp8_dequant_idct_add_dspr2=vp8_dequant_idct_add_dspr2; add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs"; -specialize qw/vp8_dequant_idct_add_y_block mmx sse2 media neon dspr2 msa/; -$vp8_dequant_idct_add_y_block_media=vp8_dequant_idct_add_y_block_v6; +specialize qw/vp8_dequant_idct_add_y_block mmx sse2 neon dspr2 msa/; $vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2; add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs"; -specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 media neon dspr2 msa/; -$vp8_dequant_idct_add_uv_block_media=vp8_dequant_idct_add_uv_block_v6; +specialize qw/vp8_dequant_idct_add_uv_block mmx sse2 neon dspr2 msa/; $vp8_dequant_idct_add_y_block_dspr2=vp8_dequant_idct_add_y_block_dspr2; # # Loopfilter # add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; -specialize qw/vp8_loop_filter_mbv mmx sse2 media neon dspr2 msa/; -$vp8_loop_filter_mbv_media=vp8_loop_filter_mbv_armv6; +specialize qw/vp8_loop_filter_mbv mmx sse2 neon dspr2 msa/; $vp8_loop_filter_mbv_dspr2=vp8_loop_filter_mbv_dspr2; add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; -specialize qw/vp8_loop_filter_bv mmx sse2 media neon dspr2 msa/; -$vp8_loop_filter_bv_media=vp8_loop_filter_bv_armv6; +specialize qw/vp8_loop_filter_bv mmx sse2 neon dspr2 msa/; $vp8_loop_filter_bv_dspr2=vp8_loop_filter_bv_dspr2; add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; -specialize qw/vp8_loop_filter_mbh mmx sse2 media neon dspr2 msa/; -$vp8_loop_filter_mbh_media=vp8_loop_filter_mbh_armv6; +specialize qw/vp8_loop_filter_mbh mmx sse2 neon dspr2 msa/; $vp8_loop_filter_mbh_dspr2=vp8_loop_filter_mbh_dspr2; add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y, unsigned char *u, unsigned char *v, int ystride, int uv_stride, struct loop_filter_info *lfi"; -specialize qw/vp8_loop_filter_bh mmx sse2 media neon dspr2 msa/; -$vp8_loop_filter_bh_media=vp8_loop_filter_bh_armv6; +specialize qw/vp8_loop_filter_bh mmx sse2 neon dspr2 msa/; $vp8_loop_filter_bh_dspr2=vp8_loop_filter_bh_dspr2; add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y, int ystride, const unsigned char *blimit"; -specialize qw/vp8_loop_filter_simple_mbv mmx sse2 media neon msa/; +specialize qw/vp8_loop_filter_simple_mbv mmx sse2 neon msa/; $vp8_loop_filter_simple_mbv_c=vp8_loop_filter_simple_vertical_edge_c; $vp8_loop_filter_simple_mbv_mmx=vp8_loop_filter_simple_vertical_edge_mmx; $vp8_loop_filter_simple_mbv_sse2=vp8_loop_filter_simple_vertical_edge_sse2; -$vp8_loop_filter_simple_mbv_media=vp8_loop_filter_simple_vertical_edge_armv6; $vp8_loop_filter_simple_mbv_neon=vp8_loop_filter_mbvs_neon; $vp8_loop_filter_simple_mbv_msa=vp8_loop_filter_simple_vertical_edge_msa; add_proto qw/void vp8_loop_filter_simple_mbh/, "unsigned char *y, int ystride, const unsigned char *blimit"; -specialize qw/vp8_loop_filter_simple_mbh mmx sse2 media neon msa/; +specialize qw/vp8_loop_filter_simple_mbh mmx sse2 neon msa/; $vp8_loop_filter_simple_mbh_c=vp8_loop_filter_simple_horizontal_edge_c; $vp8_loop_filter_simple_mbh_mmx=vp8_loop_filter_simple_horizontal_edge_mmx; $vp8_loop_filter_simple_mbh_sse2=vp8_loop_filter_simple_horizontal_edge_sse2; -$vp8_loop_filter_simple_mbh_media=vp8_loop_filter_simple_horizontal_edge_armv6; $vp8_loop_filter_simple_mbh_neon=vp8_loop_filter_mbhs_neon; $vp8_loop_filter_simple_mbh_msa=vp8_loop_filter_simple_horizontal_edge_msa; add_proto qw/void vp8_loop_filter_simple_bv/, "unsigned char *y, int ystride, const unsigned char *blimit"; -specialize qw/vp8_loop_filter_simple_bv mmx sse2 media neon msa/; +specialize qw/vp8_loop_filter_simple_bv mmx sse2 neon msa/; $vp8_loop_filter_simple_bv_c=vp8_loop_filter_bvs_c; $vp8_loop_filter_simple_bv_mmx=vp8_loop_filter_bvs_mmx; $vp8_loop_filter_simple_bv_sse2=vp8_loop_filter_bvs_sse2; -$vp8_loop_filter_simple_bv_media=vp8_loop_filter_bvs_armv6; $vp8_loop_filter_simple_bv_neon=vp8_loop_filter_bvs_neon; $vp8_loop_filter_simple_bv_msa=vp8_loop_filter_bvs_msa; add_proto qw/void vp8_loop_filter_simple_bh/, "unsigned char *y, int ystride, const unsigned char *blimit"; -specialize qw/vp8_loop_filter_simple_bh mmx sse2 media neon msa/; +specialize qw/vp8_loop_filter_simple_bh mmx sse2 neon msa/; $vp8_loop_filter_simple_bh_c=vp8_loop_filter_bhs_c; $vp8_loop_filter_simple_bh_mmx=vp8_loop_filter_bhs_mmx; $vp8_loop_filter_simple_bh_sse2=vp8_loop_filter_bhs_sse2; -$vp8_loop_filter_simple_bh_media=vp8_loop_filter_bhs_armv6; $vp8_loop_filter_simple_bh_neon=vp8_loop_filter_bhs_neon; $vp8_loop_filter_simple_bh_msa=vp8_loop_filter_bhs_msa; @@ -112,8 +100,7 @@ $vp8_loop_filter_simple_bh_msa=vp8_loop_filter_bhs_msa; # #idct16 add_proto qw/void vp8_short_idct4x4llm/, "short *input, unsigned char *pred, int pitch, unsigned char *dst, int dst_stride"; -specialize qw/vp8_short_idct4x4llm mmx media neon dspr2 msa/; -$vp8_short_idct4x4llm_media=vp8_short_idct4x4llm_v6_dual; +specialize qw/vp8_short_idct4x4llm mmx neon dspr2 msa/; $vp8_short_idct4x4llm_dspr2=vp8_short_idct4x4llm_dspr2; #iwalsh1 @@ -124,32 +111,27 @@ $vp8_short_inv_walsh4x4_1_dspr2=vp8_short_inv_walsh4x4_1_dspr2; #iwalsh16 add_proto qw/void vp8_short_inv_walsh4x4/, "short *input, short *output"; -specialize qw/vp8_short_inv_walsh4x4 mmx sse2 media neon dspr2 msa/; -$vp8_short_inv_walsh4x4_media=vp8_short_inv_walsh4x4_v6; +specialize qw/vp8_short_inv_walsh4x4 mmx sse2 neon dspr2 msa/; $vp8_short_inv_walsh4x4_dspr2=vp8_short_inv_walsh4x4_dspr2; #idct1_scalar_add add_proto qw/void vp8_dc_only_idct_add/, "short input, unsigned char *pred, int pred_stride, unsigned char *dst, int dst_stride"; -specialize qw/vp8_dc_only_idct_add mmx media neon dspr2 msa/; -$vp8_dc_only_idct_add_media=vp8_dc_only_idct_add_v6; +specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa/; $vp8_dc_only_idct_add_dspr2=vp8_dc_only_idct_add_dspr2; # # RECON # add_proto qw/void vp8_copy_mem16x16/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_copy_mem16x16 mmx sse2 media neon dspr2 msa/; -$vp8_copy_mem16x16_media=vp8_copy_mem16x16_v6; +specialize qw/vp8_copy_mem16x16 mmx sse2 neon dspr2 msa/; $vp8_copy_mem16x16_dspr2=vp8_copy_mem16x16_dspr2; add_proto qw/void vp8_copy_mem8x8/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_copy_mem8x8 mmx media neon dspr2 msa/; -$vp8_copy_mem8x8_media=vp8_copy_mem8x8_v6; +specialize qw/vp8_copy_mem8x8 mmx neon dspr2 msa/; $vp8_copy_mem8x8_dspr2=vp8_copy_mem8x8_dspr2; add_proto qw/void vp8_copy_mem8x4/, "unsigned char *src, int src_pitch, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_copy_mem8x4 mmx media neon dspr2 msa/; -$vp8_copy_mem8x4_media=vp8_copy_mem8x4_v6; +specialize qw/vp8_copy_mem8x4 mmx neon dspr2 msa/; $vp8_copy_mem8x4_dspr2=vp8_copy_mem8x4_dspr2; # @@ -180,40 +162,36 @@ if (vpx_config("CONFIG_POSTPROC") eq "yes") { # Subpixel # add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_sixtap_predict16x16 mmx sse2 ssse3 media neon dspr2 msa/; -$vp8_sixtap_predict16x16_media=vp8_sixtap_predict16x16_armv6; +specialize qw/vp8_sixtap_predict16x16 mmx sse2 ssse3 neon dspr2 msa/; $vp8_sixtap_predict16x16_dspr2=vp8_sixtap_predict16x16_dspr2; add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_sixtap_predict8x8 mmx sse2 ssse3 media neon dspr2 msa/; -$vp8_sixtap_predict8x8_media=vp8_sixtap_predict8x8_armv6; +specialize qw/vp8_sixtap_predict8x8 mmx sse2 ssse3 neon dspr2 msa/; $vp8_sixtap_predict8x8_dspr2=vp8_sixtap_predict8x8_dspr2; add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_sixtap_predict8x4 mmx sse2 ssse3 media neon dspr2 msa/; -$vp8_sixtap_predict8x4_media=vp8_sixtap_predict8x4_armv6; +specialize qw/vp8_sixtap_predict8x4 mmx sse2 ssse3 neon dspr2 msa/; $vp8_sixtap_predict8x4_dspr2=vp8_sixtap_predict8x4_dspr2; +# TODO(johannkoenig): Add neon implementation +# https://bugs.chromium.org/p/webm/issues/detail?id=1273 add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_sixtap_predict4x4 mmx ssse3 media dspr2 msa/; -$vp8_sixtap_predict4x4_media=vp8_sixtap_predict4x4_armv6; +specialize qw/vp8_sixtap_predict4x4 mmx ssse3 dspr2 msa/; $vp8_sixtap_predict4x4_dspr2=vp8_sixtap_predict4x4_dspr2; add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_bilinear_predict16x16 mmx sse2 ssse3 media neon msa/; -$vp8_bilinear_predict16x16_media=vp8_bilinear_predict16x16_armv6; +specialize qw/vp8_bilinear_predict16x16 mmx sse2 ssse3 neon msa/; add_proto qw/void vp8_bilinear_predict8x8/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_bilinear_predict8x8 mmx sse2 ssse3 media neon msa/; -$vp8_bilinear_predict8x8_media=vp8_bilinear_predict8x8_armv6; +specialize qw/vp8_bilinear_predict8x8 mmx sse2 ssse3 neon msa/; add_proto qw/void vp8_bilinear_predict8x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_bilinear_predict8x4 mmx media neon msa/; -$vp8_bilinear_predict8x4_media=vp8_bilinear_predict8x4_armv6; +specialize qw/vp8_bilinear_predict8x4 mmx neon msa/; +# TODO(johannkoenig): Add neon implementation +# https://bugs.chromium.org/p/webm/issues/detail?id=1273 add_proto qw/void vp8_bilinear_predict4x4/, "unsigned char *src, int src_pitch, int xofst, int yofst, unsigned char *dst, int dst_pitch"; -specialize qw/vp8_bilinear_predict4x4 mmx media msa/; -$vp8_bilinear_predict4x4_media=vp8_bilinear_predict4x4_armv6; +specialize qw/vp8_bilinear_predict4x4 mmx msa/; # # Encoder functions below this point. @@ -232,16 +210,13 @@ if ($opts{arch} =~ /x86/) { # Forward DCT # add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch"; -specialize qw/vp8_short_fdct4x4 mmx sse2 media neon msa/; -$vp8_short_fdct4x4_media=vp8_short_fdct4x4_armv6; +specialize qw/vp8_short_fdct4x4 mmx sse2 neon msa/; add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch"; -specialize qw/vp8_short_fdct8x4 mmx sse2 media neon msa/; -$vp8_short_fdct8x4_media=vp8_short_fdct8x4_armv6; +specialize qw/vp8_short_fdct8x4 mmx sse2 neon msa/; add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch"; -specialize qw/vp8_short_walsh4x4 sse2 media neon msa/; -$vp8_short_walsh4x4_media=vp8_short_walsh4x4_armv6; +specialize qw/vp8_short_walsh4x4 sse2 neon msa/; # # Quantizer diff --git a/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm b/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm deleted file mode 100644 index 8034c1db9..000000000 --- a/vp8/encoder/arm/armv6/vp8_short_fdct4x4_armv6.asm +++ /dev/null @@ -1,262 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vp8_short_fdct4x4_armv6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY -; void vp8_short_fdct4x4_c(short *input, short *output, int pitch) -|vp8_short_fdct4x4_armv6| PROC - - stmfd sp!, {r4 - r12, lr} - - ; PART 1 - - ; coeffs 0-3 - ldrd r4, r5, [r0] ; [i1 | i0] [i3 | i2] - - ldr r10, c7500 - ldr r11, c14500 - ldr r12, c0x22a453a0 ; [2217*4 | 5352*4] - ldr lr, c0x00080008 - ror r5, r5, #16 ; [i2 | i3] - - qadd16 r6, r4, r5 ; [i1+i2 | i0+i3] = [b1 | a1] without shift - qsub16 r7, r4, r5 ; [i1-i2 | i0-i3] = [c1 | d1] without shift - - add r0, r0, r2 ; update input pointer - - qadd16 r7, r7, r7 ; 2*[c1|d1] --> we can use smlad and smlsd - ; with 2217*4 and 5352*4 without losing the - ; sign bit (overflow) - - smuad r4, r6, lr ; o0 = (i1+i2)*8 + (i0+i3)*8 - smusd r5, r6, lr ; o2 = (i1+i2)*8 - (i0+i3)*8 - - smlad r6, r7, r12, r11 ; o1 = (c1 * 2217 + d1 * 5352 + 14500) - smlsdx r7, r7, r12, r10 ; o3 = (d1 * 2217 - c1 * 5352 + 7500) - - ldrd r8, r9, [r0] ; [i5 | i4] [i7 | i6] - - pkhbt r3, r4, r6, lsl #4 ; [o1 | o0], keep in register for PART 2 - pkhbt r6, r5, r7, lsl #4 ; [o3 | o2] - - str r6, [r1, #4] - - ; coeffs 4-7 - ror r9, r9, #16 ; [i6 | i7] - - qadd16 r6, r8, r9 ; [i5+i6 | i4+i7] = [b1 | a1] without shift - qsub16 r7, r8, r9 ; [i5-i6 | i4-i7] = [c1 | d1] without shift - - add r0, r0, r2 ; update input pointer - - qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd - ; with 2217*4 and 5352*4 without losing the - ; sign bit (overflow) - - smuad r9, r6, lr ; o4 = (i5+i6)*8 + (i4+i7)*8 - smusd r8, r6, lr ; o6 = (i5+i6)*8 - (i4+i7)*8 - - smlad r6, r7, r12, r11 ; o5 = (c1 * 2217 + d1 * 5352 + 14500) - smlsdx r7, r7, r12, r10 ; o7 = (d1 * 2217 - c1 * 5352 + 7500) - - ldrd r4, r5, [r0] ; [i9 | i8] [i11 | i10] - - pkhbt r9, r9, r6, lsl #4 ; [o5 | o4], keep in register for PART 2 - pkhbt r6, r8, r7, lsl #4 ; [o7 | o6] - - str r6, [r1, #12] - - ; coeffs 8-11 - ror r5, r5, #16 ; [i10 | i11] - - qadd16 r6, r4, r5 ; [i9+i10 | i8+i11]=[b1 | a1] without shift - qsub16 r7, r4, r5 ; [i9-i10 | i8-i11]=[c1 | d1] without shift - - add r0, r0, r2 ; update input pointer - - qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd - ; with 2217*4 and 5352*4 without losing the - ; sign bit (overflow) - - smuad r2, r6, lr ; o8 = (i9+i10)*8 + (i8+i11)*8 - smusd r8, r6, lr ; o10 = (i9+i10)*8 - (i8+i11)*8 - - smlad r6, r7, r12, r11 ; o9 = (c1 * 2217 + d1 * 5352 + 14500) - smlsdx r7, r7, r12, r10 ; o11 = (d1 * 2217 - c1 * 5352 + 7500) - - ldrd r4, r5, [r0] ; [i13 | i12] [i15 | i14] - - pkhbt r2, r2, r6, lsl #4 ; [o9 | o8], keep in register for PART 2 - pkhbt r6, r8, r7, lsl #4 ; [o11 | o10] - - str r6, [r1, #20] - - ; coeffs 12-15 - ror r5, r5, #16 ; [i14 | i15] - - qadd16 r6, r4, r5 ; [i13+i14 | i12+i15]=[b1|a1] without shift - qsub16 r7, r4, r5 ; [i13-i14 | i12-i15]=[c1|d1] without shift - - qadd16 r7, r7, r7 ; 2x[c1|d1] --> we can use smlad and smlsd - ; with 2217*4 and 5352*4 without losing the - ; sign bit (overflow) - - smuad r4, r6, lr ; o12 = (i13+i14)*8 + (i12+i15)*8 - smusd r5, r6, lr ; o14 = (i13+i14)*8 - (i12+i15)*8 - - smlad r6, r7, r12, r11 ; o13 = (c1 * 2217 + d1 * 5352 + 14500) - smlsdx r7, r7, r12, r10 ; o15 = (d1 * 2217 - c1 * 5352 + 7500) - - pkhbt r0, r4, r6, lsl #4 ; [o13 | o12], keep in register for PART 2 - pkhbt r6, r5, r7, lsl #4 ; [o15 | o14] - - str r6, [r1, #28] - - - ; PART 2 ------------------------------------------------- - ldr r11, c12000 - ldr r10, c51000 - ldr lr, c0x00070007 - - qadd16 r4, r3, r0 ; a1 = [i1+i13 | i0+i12] - qadd16 r5, r9, r2 ; b1 = [i5+i9 | i4+i8] - qsub16 r6, r9, r2 ; c1 = [i5-i9 | i4-i8] - qsub16 r7, r3, r0 ; d1 = [i1-i13 | i0-i12] - - qadd16 r4, r4, lr ; a1 + 7 - - add r0, r11, #0x10000 ; add (d!=0) - - qadd16 r2, r4, r5 ; a1 + b1 + 7 - qsub16 r3, r4, r5 ; a1 - b1 + 7 - - ldr r12, c0x08a914e8 ; [2217 | 5352] - - lsl r8, r2, #16 ; prepare bottom halfword for scaling - asr r2, r2, #4 ; scale top halfword - lsl r9, r3, #16 ; prepare bottom halfword for scaling - asr r3, r3, #4 ; scale top halfword - pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword - pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword - - smulbt r2, r6, r12 ; [ ------ | c1*2217] - str r4, [r1, #0] ; [ o1 | o0] - smultt r3, r6, r12 ; [c1*2217 | ------ ] - str r5, [r1, #16] ; [ o9 | o8] - - smlabb r8, r7, r12, r2 ; [ ------ | d1*5352] - smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ] - - smulbb r2, r6, r12 ; [ ------ | c1*5352] - smultb r3, r6, r12 ; [c1*5352 | ------ ] - - lsls r6, r7, #16 ; d1 != 0 ? - addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0) - addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0) - asrs r6, r7, #16 - addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0) - addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0) - - smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000 - smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000 - - pkhtb r9, r9, r8, asr #16 - - sub r4, r4, r2 - sub r5, r5, r3 - - ldr r3, [r1, #4] ; [i3 | i2] - - pkhtb r5, r5, r4, asr #16 ; [o13|o12] - - str r9, [r1, #8] ; [o5 | 04] - - ldr r9, [r1, #12] ; [i7 | i6] - ldr r8, [r1, #28] ; [i15|i14] - ldr r2, [r1, #20] ; [i11|i10] - str r5, [r1, #24] ; [o13|o12] - - qadd16 r4, r3, r8 ; a1 = [i3+i15 | i2+i14] - qadd16 r5, r9, r2 ; b1 = [i7+i11 | i6+i10] - - qadd16 r4, r4, lr ; a1 + 7 - - qsub16 r6, r9, r2 ; c1 = [i7-i11 | i6-i10] - qadd16 r2, r4, r5 ; a1 + b1 + 7 - qsub16 r7, r3, r8 ; d1 = [i3-i15 | i2-i14] - qsub16 r3, r4, r5 ; a1 - b1 + 7 - - lsl r8, r2, #16 ; prepare bottom halfword for scaling - asr r2, r2, #4 ; scale top halfword - lsl r9, r3, #16 ; prepare bottom halfword for scaling - asr r3, r3, #4 ; scale top halfword - pkhtb r4, r2, r8, asr #20 ; pack and scale bottom halfword - pkhtb r5, r3, r9, asr #20 ; pack and scale bottom halfword - - smulbt r2, r6, r12 ; [ ------ | c1*2217] - str r4, [r1, #4] ; [ o3 | o2] - smultt r3, r6, r12 ; [c1*2217 | ------ ] - str r5, [r1, #20] ; [ o11 | o10] - - smlabb r8, r7, r12, r2 ; [ ------ | d1*5352] - smlatb r9, r7, r12, r3 ; [d1*5352 | ------ ] - - smulbb r2, r6, r12 ; [ ------ | c1*5352] - smultb r3, r6, r12 ; [c1*5352 | ------ ] - - lsls r6, r7, #16 ; d1 != 0 ? - addeq r8, r8, r11 ; c1_b*2217+d1_b*5352+12000 + (d==0) - addne r8, r8, r0 ; c1_b*2217+d1_b*5352+12000 + (d!=0) - - asrs r6, r7, #16 - addeq r9, r9, r11 ; c1_t*2217+d1_t*5352+12000 + (d==0) - addne r9, r9, r0 ; c1_t*2217+d1_t*5352+12000 + (d!=0) - - smlabt r4, r7, r12, r10 ; [ ------ | d1*2217] + 51000 - smlatt r5, r7, r12, r10 ; [d1*2217 | ------ ] + 51000 - - pkhtb r9, r9, r8, asr #16 - - sub r4, r4, r2 - sub r5, r5, r3 - - str r9, [r1, #12] ; [o7 | o6] - pkhtb r5, r5, r4, asr #16 ; [o15|o14] - - str r5, [r1, #28] ; [o15|o14] - - ldmfd sp!, {r4 - r12, pc} - - ENDP - -; Used constants -c7500 - DCD 7500 -c14500 - DCD 14500 -c0x22a453a0 - DCD 0x22a453a0 -c0x00080008 - DCD 0x00080008 -c12000 - DCD 12000 -c51000 - DCD 51000 -c0x00070007 - DCD 0x00070007 -c0x08a914e8 - DCD 0x08a914e8 - - END diff --git a/vp8/encoder/arm/armv6/walsh_v6.asm b/vp8/encoder/arm/armv6/walsh_v6.asm deleted file mode 100644 index 5eaf3f25a..000000000 --- a/vp8/encoder/arm/armv6/walsh_v6.asm +++ /dev/null @@ -1,212 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - EXPORT |vp8_short_walsh4x4_armv6| - - ARM - REQUIRE8 - PRESERVE8 - - AREA |.text|, CODE, READONLY ; name this block of code - -;short vp8_short_walsh4x4_armv6(short *input, short *output, int pitch) -; r0 short *input, -; r1 short *output, -; r2 int pitch -|vp8_short_walsh4x4_armv6| PROC - - stmdb sp!, {r4 - r11, lr} - - ldrd r4, r5, [r0], r2 - ldr lr, c00040004 - ldrd r6, r7, [r0], r2 - - ; 0-3 - qadd16 r3, r4, r5 ; [d1|a1] [1+3 | 0+2] - qsub16 r4, r4, r5 ; [c1|b1] [1-3 | 0-2] - - ldrd r8, r9, [r0], r2 - ; 4-7 - qadd16 r5, r6, r7 ; [d1|a1] [5+7 | 4+6] - qsub16 r6, r6, r7 ; [c1|b1] [5-7 | 4-6] - - ldrd r10, r11, [r0] - ; 8-11 - qadd16 r7, r8, r9 ; [d1|a1] [9+11 | 8+10] - qsub16 r8, r8, r9 ; [c1|b1] [9-11 | 8-10] - - ; 12-15 - qadd16 r9, r10, r11 ; [d1|a1] [13+15 | 12+14] - qsub16 r10, r10, r11 ; [c1|b1] [13-15 | 12-14] - - - lsls r2, r3, #16 - smuad r11, r3, lr ; A0 = a1<<2 + d1<<2 - addne r11, r11, #1 ; A0 += (a1!=0) - - lsls r2, r7, #16 - smuad r12, r7, lr ; C0 = a1<<2 + d1<<2 - addne r12, r12, #1 ; C0 += (a1!=0) - - add r0, r11, r12 ; a1_0 = A0 + C0 - sub r11, r11, r12 ; b1_0 = A0 - C0 - - lsls r2, r5, #16 - smuad r12, r5, lr ; B0 = a1<<2 + d1<<2 - addne r12, r12, #1 ; B0 += (a1!=0) - - lsls r2, r9, #16 - smuad r2, r9, lr ; D0 = a1<<2 + d1<<2 - addne r2, r2, #1 ; D0 += (a1!=0) - - add lr, r12, r2 ; d1_0 = B0 + D0 - sub r12, r12, r2 ; c1_0 = B0 - D0 - - ; op[0,4,8,12] - adds r2, r0, lr ; a2 = a1_0 + d1_0 - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - subs r0, r0, lr ; d2 = a1_0 - d1_0 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1] ; op[0] - - addmi r0, r0, #1 ; += a2 < 0 - add r0, r0, #3 ; += 3 - ldr lr, c00040004 - mov r0, r0, asr #3 ; >> 3 - strh r0, [r1, #24] ; op[12] - - adds r2, r11, r12 ; b2 = b1_0 + c1_0 - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - subs r0, r11, r12 ; c2 = b1_0 - c1_0 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #8] ; op[4] - - addmi r0, r0, #1 ; += a2 < 0 - add r0, r0, #3 ; += 3 - smusd r3, r3, lr ; A3 = a1<<2 - d1<<2 - smusd r7, r7, lr ; C3 = a1<<2 - d1<<2 - mov r0, r0, asr #3 ; >> 3 - strh r0, [r1, #16] ; op[8] - - - ; op[3,7,11,15] - add r0, r3, r7 ; a1_3 = A3 + C3 - sub r3, r3, r7 ; b1_3 = A3 - C3 - - smusd r5, r5, lr ; B3 = a1<<2 - d1<<2 - smusd r9, r9, lr ; D3 = a1<<2 - d1<<2 - add r7, r5, r9 ; d1_3 = B3 + D3 - sub r5, r5, r9 ; c1_3 = B3 - D3 - - adds r2, r0, r7 ; a2 = a1_3 + d1_3 - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - adds r9, r3, r5 ; b2 = b1_3 + c1_3 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #6] ; op[3] - - addmi r9, r9, #1 ; += a2 < 0 - add r9, r9, #3 ; += 3 - subs r2, r3, r5 ; c2 = b1_3 - c1_3 - mov r9, r9, asr #3 ; >> 3 - strh r9, [r1, #14] ; op[7] - - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - subs r9, r0, r7 ; d2 = a1_3 - d1_3 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #22] ; op[11] - - addmi r9, r9, #1 ; += a2 < 0 - add r9, r9, #3 ; += 3 - smuad r3, r4, lr ; A1 = b1<<2 + c1<<2 - smuad r5, r8, lr ; C1 = b1<<2 + c1<<2 - mov r9, r9, asr #3 ; >> 3 - strh r9, [r1, #30] ; op[15] - - ; op[1,5,9,13] - add r0, r3, r5 ; a1_1 = A1 + C1 - sub r3, r3, r5 ; b1_1 = A1 - C1 - - smuad r7, r6, lr ; B1 = b1<<2 + c1<<2 - smuad r9, r10, lr ; D1 = b1<<2 + c1<<2 - add r5, r7, r9 ; d1_1 = B1 + D1 - sub r7, r7, r9 ; c1_1 = B1 - D1 - - adds r2, r0, r5 ; a2 = a1_1 + d1_1 - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - adds r9, r3, r7 ; b2 = b1_1 + c1_1 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #2] ; op[1] - - addmi r9, r9, #1 ; += a2 < 0 - add r9, r9, #3 ; += 3 - subs r2, r3, r7 ; c2 = b1_1 - c1_1 - mov r9, r9, asr #3 ; >> 3 - strh r9, [r1, #10] ; op[5] - - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - subs r9, r0, r5 ; d2 = a1_1 - d1_1 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #18] ; op[9] - - addmi r9, r9, #1 ; += a2 < 0 - add r9, r9, #3 ; += 3 - smusd r4, r4, lr ; A2 = b1<<2 - c1<<2 - smusd r8, r8, lr ; C2 = b1<<2 - c1<<2 - mov r9, r9, asr #3 ; >> 3 - strh r9, [r1, #26] ; op[13] - - - ; op[2,6,10,14] - add r11, r4, r8 ; a1_2 = A2 + C2 - sub r12, r4, r8 ; b1_2 = A2 - C2 - - smusd r6, r6, lr ; B2 = b1<<2 - c1<<2 - smusd r10, r10, lr ; D2 = b1<<2 - c1<<2 - add r4, r6, r10 ; d1_2 = B2 + D2 - sub r8, r6, r10 ; c1_2 = B2 - D2 - - adds r2, r11, r4 ; a2 = a1_2 + d1_2 - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - adds r9, r12, r8 ; b2 = b1_2 + c1_2 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #4] ; op[2] - - addmi r9, r9, #1 ; += a2 < 0 - add r9, r9, #3 ; += 3 - subs r2, r12, r8 ; c2 = b1_2 - c1_2 - mov r9, r9, asr #3 ; >> 3 - strh r9, [r1, #12] ; op[6] - - addmi r2, r2, #1 ; += a2 < 0 - add r2, r2, #3 ; += 3 - subs r9, r11, r4 ; d2 = a1_2 - d1_2 - mov r2, r2, asr #3 ; >> 3 - strh r2, [r1, #20] ; op[10] - - addmi r9, r9, #1 ; += a2 < 0 - add r9, r9, #3 ; += 3 - mov r9, r9, asr #3 ; >> 3 - strh r9, [r1, #28] ; op[14] - - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vp8_short_walsh4x4_armv6| - -c00040004 - DCD 0x00040004 - - END diff --git a/vp8/encoder/arm/dct_arm.c b/vp8/encoder/arm/dct_arm.c deleted file mode 100644 index 983dd217a..000000000 --- a/vp8/encoder/arm/dct_arm.c +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright (c) 2011 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "vpx_config.h" -#include "vp8_rtcd.h" - -#if HAVE_MEDIA - -void vp8_short_fdct8x4_armv6(short *input, short *output, int pitch) { - vp8_short_fdct4x4_armv6(input, output, pitch); - vp8_short_fdct4x4_armv6(input + 4, output + 16, pitch); -} - -#endif /* HAVE_MEDIA */ diff --git a/vp8/vp8_common.mk b/vp8/vp8_common.mk index 63a918838..d863a0a26 100644 --- a/vp8/vp8_common.mk +++ b/vp8/vp8_common.mk @@ -123,30 +123,8 @@ ifeq ($(CONFIG_POSTPROC),yes) VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c endif -# common (c) -VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/filter_arm.c -VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/loopfilter_arm.c -VP8_COMMON_SRCS-$(ARCH_ARM) += common/arm/dequantize_arm.c - -# common (media) -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/bilinearfilter_arm.c -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/bilinearfilter_arm.h -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/bilinearfilter_v6$(ASM) -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/copymem8x4_v6$(ASM) -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/copymem8x8_v6$(ASM) -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/copymem16x16_v6$(ASM) -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dc_only_idct_add_v6$(ASM) -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/iwalsh_v6$(ASM) -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/filter_v6$(ASM) -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_v6$(ASM) -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/loopfilter_v6$(ASM) -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/simpleloopfilter_v6$(ASM) -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/sixtappredict8x4_v6$(ASM) -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequant_idct_v6$(ASM) -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/dequantize_v6$(ASM) -VP8_COMMON_SRCS-$(HAVE_MEDIA) += common/arm/armv6/idct_blk_v6.c - # common (neon intrinsics) +VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/loopfilter_arm.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/bilinearpredict_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/copymem_neon.c VP8_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/dc_only_idct_add_neon.c diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index 857a631bf..4e3d31e3b 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -16,10 +16,6 @@ VP8_CX_SRCS-no += $(VP8_COMMON_SRCS-no) VP8_CX_SRCS_REMOVE-yes += $(VP8_COMMON_SRCS_REMOVE-yes) VP8_CX_SRCS_REMOVE-no += $(VP8_COMMON_SRCS_REMOVE-no) -ifeq ($(ARCH_ARM),yes) - include $(SRC_PATH_BARE)/$(VP8_PREFIX)vp8cx_arm.mk -endif - VP8_CX_SRCS-yes += vp8cx.mk VP8_CX_SRCS-yes += vp8_cx_iface.c @@ -101,6 +97,11 @@ ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm endif +VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c +VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c +VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c +VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c + VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/dct_msa.c VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/encodeopt_msa.c VP8_CX_SRCS-$(HAVE_MSA) += encoder/mips/msa/quantize_msa.c diff --git a/vp8/vp8cx_arm.mk b/vp8/vp8cx_arm.mk deleted file mode 100644 index 838b53d84..000000000 --- a/vp8/vp8cx_arm.mk +++ /dev/null @@ -1,28 +0,0 @@ -## -## Copyright (c) 2010 The WebM project authors. All Rights Reserved. -## -## Use of this source code is governed by a BSD-style license -## that can be found in the LICENSE file in the root of the source -## tree. An additional intellectual property rights grant can be found -## in the file PATENTS. All contributing project authors may -## be found in the AUTHORS file in the root of the source tree. -## - - -VP8_CX_SRCS-$(ARCH_ARM) += vp8cx_arm.mk - -#File list for arm -# encoder -VP8_CX_SRCS-$(ARCH_ARM) += encoder/arm/dct_arm.c - -#File list for media -# encoder -VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/vp8_short_fdct4x4_armv6$(ASM) -VP8_CX_SRCS-$(HAVE_MEDIA) += encoder/arm/armv6/walsh_v6$(ASM) - -#File list for neon -# encoder -VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/denoising_neon.c -VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/fastquantizeb_neon.c -VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/shortfdct_neon.c -VP8_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp8_shortwalsh4x4_neon.c diff --git a/vp9/common/arm/neon/vp9_iht8x8_add_neon.c b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c index 39bed46ee..0e3febf15 100644 --- a/vp9/common/arm/neon/vp9_iht8x8_add_neon.c +++ b/vp9/common/arm/neon/vp9_iht8x8_add_neon.c @@ -14,6 +14,7 @@ #include "./vp9_rtcd.h" #include "./vpx_config.h" #include "vp9/common/vp9_common.h" +#include "vpx_dsp/arm/transpose_neon.h" static int16_t cospi_2_64 = 16305; static int16_t cospi_4_64 = 16069; @@ -31,70 +32,6 @@ static int16_t cospi_26_64 = 4756; static int16_t cospi_28_64 = 3196; static int16_t cospi_30_64 = 1606; -static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - *q12s16 = vcombine_s16(d17s16, d25s16); - *q13s16 = vcombine_s16(d19s16, d27s16); - *q14s16 = vcombine_s16(d21s16, d29s16); - *q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16)); - q1x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16)); - q2x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16)); - q3x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - *q8s16 = q0x2s16.val[0]; - *q9s16 = q0x2s16.val[1]; - *q10s16 = q1x2s16.val[0]; - *q11s16 = q1x2s16.val[1]; - *q12s16 = q2x2s16.val[0]; - *q13s16 = q2x2s16.val[1]; - *q14s16 = q3x2s16.val[0]; - *q15s16 = q3x2s16.val[1]; - return; -} - static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, int16x8_t *q10s16, int16x8_t *q11s16, int16x8_t *q12s16, int16x8_t *q13s16, @@ -489,8 +426,8 @@ void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, q14s16 = vld1q_s16(input + 8 * 6); q15s16 = vld1q_s16(input + 8 * 7); - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); switch (tx_type) { case 0: // idct_idct is not supported. Fall back to C @@ -506,8 +443,8 @@ void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, &q15s16); // transpose the matrix - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, + &q14s16, &q15s16); // generate IADST constants // GENERATE_IADST_CONSTANTS @@ -525,8 +462,8 @@ void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, &q15s16); // transpose the matrix - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, + &q14s16, &q15s16); // generate IDCT constants // GENERATE_IDCT_CONSTANTS @@ -544,8 +481,8 @@ void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, &q15s16); // transpose the matrix - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, + &q14s16, &q15s16); // then transform columns IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, diff --git a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c index b12dfa70f..e68d01e9f 100644 --- a/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c +++ b/vp9/common/mips/dspr2/vp9_itrans16_dspr2.c @@ -68,7 +68,8 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, int pitch, for (j = 0; j < 16; ++j) temp_in[j * 16 + i] = out[i * 16 + j]; idct16_cols_add_blk_dspr2(temp_in, dest, pitch); - } break; + break; + } case ADST_ADST: // ADST in both directions { int16_t temp_in[16]; @@ -89,7 +90,8 @@ void vp9_iht16x16_256_add_dspr2(const int16_t *input, uint8_t *dest, int pitch, dest[j * pitch + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6) + dest[j * pitch + i]); } - } break; + break; + } default: printf("vp9_short_iht16x16_add_dspr2 : Invalid tx_type\n"); break; } } diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 3be7d67dc..6b352924a 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -130,7 +130,7 @@ unsigned int vp9_high_get_sby_perpixel_variance(VP9_COMP *cpi, CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8), 0, &sse); break; } - return ROUND_POWER_OF_TWO((int64_t)var, num_pels_log2_lookup[bs]); + return ROUND64_POWER_OF_TWO((int64_t)var, num_pels_log2_lookup[bs]); } #endif // CONFIG_VP9_HIGHBITDEPTH diff --git a/vp9/encoder/vp9_ratectrl.c b/vp9/encoder/vp9_ratectrl.c index c60f22c8e..5daad7458 100644 --- a/vp9/encoder/vp9_ratectrl.c +++ b/vp9/encoder/vp9_ratectrl.c @@ -1522,7 +1522,7 @@ void vp9_rc_get_one_pass_vbr_params(VP9_COMP *cpi) { VPXMIN(15, (3 * rc->baseline_gf_interval) >> 1); } else if (rc->avg_frame_low_motion < 20) { // Decrease gf interval for high motion case. - rc->baseline_gf_interval = VPXMAX(5, rc->baseline_gf_interval >> 1); + rc->baseline_gf_interval = VPXMAX(6, rc->baseline_gf_interval >> 1); } // Adjust boost and af_ratio based on avg_frame_low_motion, which varies // between 0 and 100 (stationary, 100% zero/small motion). @@ -2115,10 +2115,9 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { VPXMIN(15, (3 * rc->baseline_gf_interval) >> 1); } else if (high_content) { rc->gfu_boost = DEFAULT_GF_BOOST >> 1; - if (rate_err > 3.0) - rc->baseline_gf_interval = VPXMAX(10, rc->baseline_gf_interval >> 1); - else - rc->baseline_gf_interval = VPXMAX(5, rc->baseline_gf_interval >> 1); + rc->baseline_gf_interval = (rate_err > 3.0) + ? VPXMAX(10, rc->baseline_gf_interval >> 1) + : VPXMAX(6, rc->baseline_gf_interval >> 1); } // Check for constraining gf_interval for up-coming scene/content changes, // or for up-coming key frame, whichever is closer. @@ -2126,8 +2125,7 @@ void adjust_gf_boost_lag_one_pass_vbr(VP9_COMP *cpi, uint64_t avg_sad_current) { if (rc->high_source_sad_lagindex > 0 && frame_constraint > rc->high_source_sad_lagindex) frame_constraint = rc->high_source_sad_lagindex; - if (steady_sad_lagindex > 0 && steady_sad_lagindex > 2 && - frame_constraint > steady_sad_lagindex) + if (steady_sad_lagindex > 3 && frame_constraint > steady_sad_lagindex) frame_constraint = steady_sad_lagindex; adjust_gfint_frame_constraint(cpi, frame_constraint); rc->frames_till_gf_update_due = rc->baseline_gf_interval; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index e2c2e200b..a3ef5e5db 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -685,7 +685,7 @@ static void block_rd_txfm(int plane, int block, int blk_row, int blk_col, plane_bsize, tx_bsize); #if CONFIG_VP9_HIGHBITDEPTH if ((xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) && (xd->bd > 8)) - sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2); + sse = ROUND64_POWER_OF_TWO(sse, (xd->bd - 8) * 2); #endif // CONFIG_VP9_HIGHBITDEPTH sse = sse * 16; tmp = pixel_sse(args->cpi, xd, pd, src, src_stride, dst, dst_stride, diff --git a/vpx_dsp/arm/bilinear_filter_media.asm b/vpx_dsp/arm/bilinear_filter_media.asm deleted file mode 100644 index f3f9754c1..000000000 --- a/vpx_dsp/arm/bilinear_filter_media.asm +++ /dev/null @@ -1,237 +0,0 @@ -; -; Copyright (c) 2010 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vpx_filter_block2d_bil_first_pass_media| - EXPORT |vpx_filter_block2d_bil_second_pass_media| - - AREA |.text|, CODE, READONLY ; name this block of code - -;------------------------------------- -; r0 unsigned char *src_ptr, -; r1 unsigned short *dst_ptr, -; r2 unsigned int src_pitch, -; r3 unsigned int height, -; stack unsigned int width, -; stack const short *vpx_filter -;------------------------------------- -; The output is transposed stroed in output array to make it easy for second pass filtering. -|vpx_filter_block2d_bil_first_pass_media| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; vpx_filter address - ldr r4, [sp, #36] ; width - - mov r12, r3 ; outer-loop counter - - add r7, r2, r4 ; preload next row - pld [r0, r7] - - sub r2, r2, r4 ; src increment for height loop - - ldr r5, [r11] ; load up filter coefficients - - mov r3, r3, lsl #1 ; height*2 - add r3, r3, #2 ; plus 2 to make output buffer 4-bit aligned since height is actually (height+1) - - mov r11, r1 ; save dst_ptr for each row - - cmp r5, #128 ; if filter coef = 128, then skip the filter - beq bil_null_1st_filter - -|bil_height_loop_1st_v6| - ldrb r6, [r0] ; load source data - ldrb r7, [r0, #1] - ldrb r8, [r0, #2] - mov lr, r4, lsr #2 ; 4-in-parellel loop counter - -|bil_width_loop_1st_v6| - ldrb r9, [r0, #3] - ldrb r10, [r0, #4] - - pkhbt r6, r6, r7, lsl #16 ; src[1] | src[0] - pkhbt r7, r7, r8, lsl #16 ; src[2] | src[1] - - smuad r6, r6, r5 ; apply the filter - pkhbt r8, r8, r9, lsl #16 ; src[3] | src[2] - smuad r7, r7, r5 - pkhbt r9, r9, r10, lsl #16 ; src[4] | src[3] - - smuad r8, r8, r5 - smuad r9, r9, r5 - - add r0, r0, #4 - subs lr, lr, #1 - - add r6, r6, #0x40 ; round_shift_and_clamp - add r7, r7, #0x40 - usat r6, #16, r6, asr #7 - usat r7, #16, r7, asr #7 - - strh r6, [r1], r3 ; result is transposed and stored - - add r8, r8, #0x40 ; round_shift_and_clamp - strh r7, [r1], r3 - add r9, r9, #0x40 - usat r8, #16, r8, asr #7 - usat r9, #16, r9, asr #7 - - strh r8, [r1], r3 ; result is transposed and stored - - ldrneb r6, [r0] ; load source data - strh r9, [r1], r3 - - ldrneb r7, [r0, #1] - ldrneb r8, [r0, #2] - - bne bil_width_loop_1st_v6 - - add r0, r0, r2 ; move to next input row - subs r12, r12, #1 - - add r9, r2, r4, lsl #1 ; adding back block width - pld [r0, r9] ; preload next row - - add r11, r11, #2 ; move over to next column - mov r1, r11 - - bne bil_height_loop_1st_v6 - - ldmia sp!, {r4 - r11, pc} - -|bil_null_1st_filter| -|bil_height_loop_null_1st| - mov lr, r4, lsr #2 ; loop counter - -|bil_width_loop_null_1st| - ldrb r6, [r0] ; load data - ldrb r7, [r0, #1] - ldrb r8, [r0, #2] - ldrb r9, [r0, #3] - - strh r6, [r1], r3 ; store it to immediate buffer - add r0, r0, #4 - strh r7, [r1], r3 - subs lr, lr, #1 - strh r8, [r1], r3 - strh r9, [r1], r3 - - bne bil_width_loop_null_1st - - subs r12, r12, #1 - add r0, r0, r2 ; move to next input line - add r11, r11, #2 ; move over to next column - mov r1, r11 - - bne bil_height_loop_null_1st - - ldmia sp!, {r4 - r11, pc} - - ENDP ; |vpx_filter_block2d_bil_first_pass_media| - - -;--------------------------------- -; r0 unsigned short *src_ptr, -; r1 unsigned char *dst_ptr, -; r2 int dst_pitch, -; r3 unsigned int height, -; stack unsigned int width, -; stack const short *vpx_filter -;--------------------------------- -|vpx_filter_block2d_bil_second_pass_media| PROC - stmdb sp!, {r4 - r11, lr} - - ldr r11, [sp, #40] ; vpx_filter address - ldr r4, [sp, #36] ; width - - ldr r5, [r11] ; load up filter coefficients - mov r12, r4 ; outer-loop counter = width, since we work on transposed data matrix - mov r11, r1 - - cmp r5, #128 ; if filter coef = 128, then skip the filter - beq bil_null_2nd_filter - -|bil_height_loop_2nd| - ldr r6, [r0] ; load the data - ldr r8, [r0, #4] - ldrh r10, [r0, #8] - mov lr, r3, lsr #2 ; loop counter - -|bil_width_loop_2nd| - pkhtb r7, r6, r8 ; src[1] | src[2] - pkhtb r9, r8, r10 ; src[3] | src[4] - - smuad r6, r6, r5 ; apply filter - smuad r8, r8, r5 ; apply filter - - subs lr, lr, #1 - - smuadx r7, r7, r5 ; apply filter - smuadx r9, r9, r5 ; apply filter - - add r0, r0, #8 - - add r6, r6, #0x40 ; round_shift_and_clamp - add r7, r7, #0x40 - usat r6, #8, r6, asr #7 - usat r7, #8, r7, asr #7 - strb r6, [r1], r2 ; the result is transposed back and stored - - add r8, r8, #0x40 ; round_shift_and_clamp - strb r7, [r1], r2 - add r9, r9, #0x40 - usat r8, #8, r8, asr #7 - usat r9, #8, r9, asr #7 - strb r8, [r1], r2 ; the result is transposed back and stored - - ldrne r6, [r0] ; load data - strb r9, [r1], r2 - ldrne r8, [r0, #4] - ldrneh r10, [r0, #8] - - bne bil_width_loop_2nd - - subs r12, r12, #1 - add r0, r0, #4 ; update src for next row - add r11, r11, #1 - mov r1, r11 - - bne bil_height_loop_2nd - ldmia sp!, {r4 - r11, pc} - -|bil_null_2nd_filter| -|bil_height_loop_null_2nd| - mov lr, r3, lsr #2 - -|bil_width_loop_null_2nd| - ldr r6, [r0], #4 ; load data - subs lr, lr, #1 - ldr r8, [r0], #4 - - strb r6, [r1], r2 ; store data - mov r7, r6, lsr #16 - strb r7, [r1], r2 - mov r9, r8, lsr #16 - strb r8, [r1], r2 - strb r9, [r1], r2 - - bne bil_width_loop_null_2nd - - subs r12, r12, #1 - add r0, r0, #4 - add r11, r11, #1 - mov r1, r11 - - bne bil_height_loop_null_2nd - - ldmia sp!, {r4 - r11, pc} - ENDP ; |vpx_filter_block2d_second_pass_media| - - END diff --git a/vpx_dsp/arm/hadamard_neon.c b/vpx_dsp/arm/hadamard_neon.c index 46b2755ea..977323497 100644 --- a/vpx_dsp/arm/hadamard_neon.c +++ b/vpx_dsp/arm/hadamard_neon.c @@ -11,6 +11,7 @@ #include <arm_neon.h> #include "./vpx_dsp_rtcd.h" +#include "vpx_dsp/arm/transpose_neon.h" static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, @@ -43,93 +44,6 @@ static void hadamard8x8_one_pass(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, *a7 = vaddq_s16(c1, c5); } -// TODO(johannkoenig): Make a transpose library and dedup with idct. Consider -// reversing transpose order which may make it easier for the compiler to -// reconcile the vtrn.64 moves. -static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, - int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, - int16x8_t *a6, int16x8_t *a7) { - // Swap 64 bit elements. Goes from: - // a0: 00 01 02 03 04 05 06 07 - // a1: 08 09 10 11 12 13 14 15 - // a2: 16 17 18 19 20 21 22 23 - // a3: 24 25 26 27 28 29 30 31 - // a4: 32 33 34 35 36 37 38 39 - // a5: 40 41 42 43 44 45 46 47 - // a6: 48 49 50 51 52 53 54 55 - // a7: 56 57 58 59 60 61 62 63 - // to: - // a04_lo: 00 01 02 03 32 33 34 35 - // a15_lo: 08 09 10 11 40 41 42 43 - // a26_lo: 16 17 18 19 48 49 50 51 - // a37_lo: 24 25 26 27 56 57 58 59 - // a04_hi: 04 05 06 07 36 37 38 39 - // a15_hi: 12 13 14 15 44 45 46 47 - // a26_hi: 20 21 22 23 52 53 54 55 - // a37_hi: 28 29 30 31 60 61 62 63 - const int16x8_t a04_lo = vcombine_s16(vget_low_s16(*a0), vget_low_s16(*a4)); - const int16x8_t a15_lo = vcombine_s16(vget_low_s16(*a1), vget_low_s16(*a5)); - const int16x8_t a26_lo = vcombine_s16(vget_low_s16(*a2), vget_low_s16(*a6)); - const int16x8_t a37_lo = vcombine_s16(vget_low_s16(*a3), vget_low_s16(*a7)); - const int16x8_t a04_hi = vcombine_s16(vget_high_s16(*a0), vget_high_s16(*a4)); - const int16x8_t a15_hi = vcombine_s16(vget_high_s16(*a1), vget_high_s16(*a5)); - const int16x8_t a26_hi = vcombine_s16(vget_high_s16(*a2), vget_high_s16(*a6)); - const int16x8_t a37_hi = vcombine_s16(vget_high_s16(*a3), vget_high_s16(*a7)); - - // Swap 32 bit elements resulting in: - // a0246_lo: - // 00 01 16 17 32 33 48 49 - // 02 03 18 19 34 35 50 51 - // a1357_lo: - // 08 09 24 25 40 41 56 57 - // 10 11 26 27 42 43 58 59 - // a0246_hi: - // 04 05 20 21 36 37 52 53 - // 06 07 22 23 38 39 54 55 - // a1657_hi: - // 12 13 28 29 44 45 60 61 - // 14 15 30 31 46 47 62 63 - const int32x4x2_t a0246_lo = - vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo)); - const int32x4x2_t a1357_lo = - vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo)); - const int32x4x2_t a0246_hi = - vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi)); - const int32x4x2_t a1357_hi = - vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi)); - - // Swap 16 bit elements resulting in: - // b0: - // 00 08 16 24 32 40 48 56 - // 01 09 17 25 33 41 49 57 - // b1: - // 02 10 18 26 34 42 50 58 - // 03 11 19 27 35 43 51 59 - // b2: - // 04 12 20 28 36 44 52 60 - // 05 13 21 29 37 45 53 61 - // b3: - // 06 14 22 30 38 46 54 62 - // 07 15 23 31 39 47 55 63 - const int16x8x2_t b0 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[0]), - vreinterpretq_s16_s32(a1357_lo.val[0])); - const int16x8x2_t b1 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[1]), - vreinterpretq_s16_s32(a1357_lo.val[1])); - const int16x8x2_t b2 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[0]), - vreinterpretq_s16_s32(a1357_hi.val[0])); - const int16x8x2_t b3 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[1]), - vreinterpretq_s16_s32(a1357_hi.val[1])); - - *a0 = b0.val[0]; - *a1 = b0.val[1]; - *a2 = b1.val[0]; - *a3 = b1.val[1]; - *a4 = b2.val[0]; - *a5 = b2.val[1]; - *a6 = b3.val[0]; - *a7 = b3.val[1]; -} - void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, int16_t *coeff) { int16x8_t a0 = vld1q_s16(src_diff); @@ -143,7 +57,7 @@ void vpx_hadamard_8x8_neon(const int16_t *src_diff, int src_stride, hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); - transpose8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); + transpose_s16_8x8(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); hadamard8x8_one_pass(&a0, &a1, &a2, &a3, &a4, &a5, &a6, &a7); diff --git a/vpx_dsp/arm/idct16x16_add_neon.c b/vpx_dsp/arm/idct16x16_add_neon.c index 6c03aff60..5ca2afe48 100644 --- a/vpx_dsp/arm/idct16x16_add_neon.c +++ b/vpx_dsp/arm/idct16x16_add_neon.c @@ -11,72 +11,9 @@ #include <arm_neon.h> #include "./vpx_config.h" +#include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" -static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - *q12s16 = vcombine_s16(d17s16, d25s16); - *q13s16 = vcombine_s16(d19s16, d27s16); - *q14s16 = vcombine_s16(d21s16, d29s16); - *q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16)); - q1x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16)); - q2x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16)); - q3x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - *q8s16 = q0x2s16.val[0]; - *q9s16 = q0x2s16.val[1]; - *q10s16 = q1x2s16.val[0]; - *q11s16 = q1x2s16.val[1]; - *q12s16 = q2x2s16.val[0]; - *q13s16 = q2x2s16.val[1]; - *q14s16 = q3x2s16.val[0]; - *q15s16 = q3x2s16.val[1]; - return; -} - void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, int output_stride) { int16x4_t d0s16, d1s16, d2s16, d3s16; @@ -115,8 +52,8 @@ void vpx_idct16x16_256_add_neon_pass1(int16_t *in, int16_t *out, q0x2s16 = vld2q_s16(in); q15s16 = q0x2s16.val[0]; - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); d16s16 = vget_low_s16(q8s16); d17s16 = vget_high_s16(q8s16); @@ -356,8 +293,8 @@ void vpx_idct16x16_256_add_neon_pass2(int16_t *src, int16_t *out, q0x2s16 = vld2q_s16(src); q15s16 = q0x2s16.val[0]; - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); d16s16 = vget_low_s16(q8s16); d17s16 = vget_high_s16(q8s16); @@ -898,8 +835,8 @@ void vpx_idct16x16_10_add_neon_pass1(int16_t *in, int16_t *out, q0x2s16 = vld2q_s16(in); q15s16 = q0x2s16.val[0]; - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); // stage 3 q0s16 = vdupq_n_s16(cospi_28_64 * 2); @@ -1041,8 +978,8 @@ void vpx_idct16x16_10_add_neon_pass2(int16_t *src, int16_t *out, q0x2s16 = vld2q_s16(src); q15s16 = q0x2s16.val[0]; - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); // stage 3 q6s16 = vdupq_n_s16(cospi_30_64 * 2); diff --git a/vpx_dsp/arm/idct32x32_add_neon.c b/vpx_dsp/arm/idct32x32_add_neon.c index 04f51bfdd..c4d1e8473 100644 --- a/vpx_dsp/arm/idct32x32_add_neon.c +++ b/vpx_dsp/arm/idct32x32_add_neon.c @@ -11,6 +11,7 @@ #include <arm_neon.h> #include "./vpx_config.h" +#include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" #define LOAD_FROM_TRANSPOSED(prev, first, second) \ @@ -155,11 +156,7 @@ static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) { int16_t *in; int i; const int stride = 32; - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; for (i = 0; i < 4; i++, input += 8) { in = input; @@ -179,65 +176,24 @@ static INLINE void idct32_transpose_pair(int16_t *input, int16_t *t_buf) { in += stride; q15s16 = vld1q_s16(in); - d16s16 = vget_low_s16(q8s16); - d17s16 = vget_high_s16(q8s16); - d18s16 = vget_low_s16(q9s16); - d19s16 = vget_high_s16(q9s16); - d20s16 = vget_low_s16(q10s16); - d21s16 = vget_high_s16(q10s16); - d22s16 = vget_low_s16(q11s16); - d23s16 = vget_high_s16(q11s16); - d24s16 = vget_low_s16(q12s16); - d25s16 = vget_high_s16(q12s16); - d26s16 = vget_low_s16(q13s16); - d27s16 = vget_high_s16(q13s16); - d28s16 = vget_low_s16(q14s16); - d29s16 = vget_high_s16(q14s16); - d30s16 = vget_low_s16(q15s16); - d31s16 = vget_high_s16(q15s16); - - q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - q12s16 = vcombine_s16(d17s16, d25s16); - q13s16 = vcombine_s16(d19s16, d27s16); - q14s16 = vcombine_s16(d21s16, d29s16); - q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q10s16)); - q1x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q9s16), vreinterpretq_s32_s16(q11s16)); - q2x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q12s16), vreinterpretq_s32_s16(q14s16)); - q3x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(q13s16), vreinterpretq_s32_s16(q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - vst1q_s16(t_buf, q0x2s16.val[0]); + transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, + &q14s16, &q15s16); + + vst1q_s16(t_buf, q8s16); t_buf += 8; - vst1q_s16(t_buf, q0x2s16.val[1]); + vst1q_s16(t_buf, q9s16); t_buf += 8; - vst1q_s16(t_buf, q1x2s16.val[0]); + vst1q_s16(t_buf, q10s16); t_buf += 8; - vst1q_s16(t_buf, q1x2s16.val[1]); + vst1q_s16(t_buf, q11s16); t_buf += 8; - vst1q_s16(t_buf, q2x2s16.val[0]); + vst1q_s16(t_buf, q12s16); t_buf += 8; - vst1q_s16(t_buf, q2x2s16.val[1]); + vst1q_s16(t_buf, q13s16); t_buf += 8; - vst1q_s16(t_buf, q3x2s16.val[0]); + vst1q_s16(t_buf, q14s16); t_buf += 8; - vst1q_s16(t_buf, q3x2s16.val[1]); + vst1q_s16(t_buf, q15s16); t_buf += 8; } return; diff --git a/vpx_dsp/arm/idct8x8_add_neon.c b/vpx_dsp/arm/idct8x8_add_neon.c index f1c271110..82b318256 100644 --- a/vpx_dsp/arm/idct8x8_add_neon.c +++ b/vpx_dsp/arm/idct8x8_add_neon.c @@ -11,72 +11,9 @@ #include <arm_neon.h> #include "./vpx_config.h" +#include "vpx_dsp/arm/transpose_neon.h" #include "vpx_dsp/txfm_common.h" -static INLINE void TRANSPOSE8X8(int16x8_t *q8s16, int16x8_t *q9s16, - int16x8_t *q10s16, int16x8_t *q11s16, - int16x8_t *q12s16, int16x8_t *q13s16, - int16x8_t *q14s16, int16x8_t *q15s16) { - int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; - int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; - int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; - int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; - - d16s16 = vget_low_s16(*q8s16); - d17s16 = vget_high_s16(*q8s16); - d18s16 = vget_low_s16(*q9s16); - d19s16 = vget_high_s16(*q9s16); - d20s16 = vget_low_s16(*q10s16); - d21s16 = vget_high_s16(*q10s16); - d22s16 = vget_low_s16(*q11s16); - d23s16 = vget_high_s16(*q11s16); - d24s16 = vget_low_s16(*q12s16); - d25s16 = vget_high_s16(*q12s16); - d26s16 = vget_low_s16(*q13s16); - d27s16 = vget_high_s16(*q13s16); - d28s16 = vget_low_s16(*q14s16); - d29s16 = vget_high_s16(*q14s16); - d30s16 = vget_low_s16(*q15s16); - d31s16 = vget_high_s16(*q15s16); - - *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 - *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 - *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 - *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 - *q12s16 = vcombine_s16(d17s16, d25s16); - *q13s16 = vcombine_s16(d19s16, d27s16); - *q14s16 = vcombine_s16(d21s16, d29s16); - *q15s16 = vcombine_s16(d23s16, d31s16); - - q0x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16)); - q1x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16)); - q2x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16)); - q3x2s32 = - vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16)); - - q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 - vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 - q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 - vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 - q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 - vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 - q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 - vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 - - *q8s16 = q0x2s16.val[0]; - *q9s16 = q0x2s16.val[1]; - *q10s16 = q1x2s16.val[0]; - *q11s16 = q1x2s16.val[1]; - *q12s16 = q2x2s16.val[0]; - *q13s16 = q2x2s16.val[1]; - *q14s16 = q3x2s16.val[0]; - *q15s16 = q3x2s16.val[1]; - return; -} - static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16, int16x8_t *q10s16, int16x8_t *q11s16, int16x8_t *q12s16, int16x8_t *q13s16, @@ -244,14 +181,14 @@ void vpx_idct8x8_64_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { q14s16 = vld1q_s16(input + 48); q15s16 = vld1q_s16(input + 56); - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); @@ -350,8 +287,8 @@ void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { q14s16 = vld1q_s16(input + 48); q15s16 = vld1q_s16(input + 56); - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); // First transform rows // stage 1 @@ -427,8 +364,8 @@ void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { q14s16 = vsubq_s16(q1s16, q6s16); q15s16 = vsubq_s16(q0s16, q7s16); - TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, - &q15s16); + transpose_s16_8x8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, + &q15s16); IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); diff --git a/vpx_dsp/arm/intrapred_neon.c b/vpx_dsp/arm/intrapred_neon.c index 32dd1ba14..38e79ed69 100644 --- a/vpx_dsp/arm/intrapred_neon.c +++ b/vpx_dsp/arm/intrapred_neon.c @@ -20,37 +20,35 @@ // 'do_above' and 'do_left' facilitate branch removal when inlined. static INLINE void dc_4x4(uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left, int do_above, int do_left) { - uint16x8_t sum_top; - uint16x8_t sum_left; - uint8x8_t dc0; + uint16x4_t sum_top; + uint16x4_t sum_left; + uint16x4_t dc0; if (do_above) { const uint8x8_t A = vld1_u8(above); // top row const uint16x4_t p0 = vpaddl_u8(A); // cascading summation of the top - const uint16x4_t p1 = vpadd_u16(p0, p0); - sum_top = vcombine_u16(p1, p1); + sum_top = vpadd_u16(p0, p0); } if (do_left) { const uint8x8_t L = vld1_u8(left); // left border const uint16x4_t p0 = vpaddl_u8(L); // cascading summation of the left - const uint16x4_t p1 = vpadd_u16(p0, p0); - sum_left = vcombine_u16(p1, p1); + sum_left = vpadd_u16(p0, p0); } if (do_above && do_left) { - const uint16x8_t sum = vaddq_u16(sum_left, sum_top); - dc0 = vrshrn_n_u16(sum, 3); + const uint16x4_t sum = vadd_u16(sum_left, sum_top); + dc0 = vrshr_n_u16(sum, 3); } else if (do_above) { - dc0 = vrshrn_n_u16(sum_top, 2); + dc0 = vrshr_n_u16(sum_top, 2); } else if (do_left) { - dc0 = vrshrn_n_u16(sum_left, 2); + dc0 = vrshr_n_u16(sum_left, 2); } else { - dc0 = vdup_n_u8(0x80); + dc0 = vdup_n_u16(0x80); } { - const uint8x8_t dc = vdup_lane_u8(dc0, 0); + const uint8x8_t dc = vdup_lane_u8(vreinterpret_u8_u16(dc0), 0); int i; for (i = 0; i < 4; ++i) { vst1_lane_u32((uint32_t *)(dst + i * stride), vreinterpret_u32_u8(dc), 0); diff --git a/vpx_dsp/arm/sad_media.asm b/vpx_dsp/arm/sad_media.asm deleted file mode 100644 index aed1d3a22..000000000 --- a/vpx_dsp/arm/sad_media.asm +++ /dev/null @@ -1,95 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vpx_sad16x16_media| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 const unsigned char *src_ptr -; r1 int src_stride -; r2 const unsigned char *ref_ptr -; r3 int ref_stride -|vpx_sad16x16_media| PROC - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - pld [r0, r1, lsl #1] - pld [r2, r3, lsl #1] - - mov r4, #0 ; sad = 0; - mov r5, #8 ; loop count - -loop - ; 1st row - ldr r6, [r0, #0x0] ; load 4 src pixels (1A) - ldr r8, [r2, #0x0] ; load 4 ref pixels (1A) - ldr r7, [r0, #0x4] ; load 4 src pixels (1A) - ldr r9, [r2, #0x4] ; load 4 ref pixels (1A) - ldr r10, [r0, #0x8] ; load 4 src pixels (1B) - ldr r11, [r0, #0xC] ; load 4 src pixels (1B) - - usada8 r4, r8, r6, r4 ; calculate sad for 4 pixels - usad8 r8, r7, r9 ; calculate sad for 4 pixels - - ldr r12, [r2, #0x8] ; load 4 ref pixels (1B) - ldr lr, [r2, #0xC] ; load 4 ref pixels (1B) - - add r0, r0, r1 ; set src pointer to next row - add r2, r2, r3 ; set dst pointer to next row - - pld [r0, r1, lsl #1] - pld [r2, r3, lsl #1] - - usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels - usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels - - ldr r6, [r0, #0x0] ; load 4 src pixels (2A) - ldr r7, [r0, #0x4] ; load 4 src pixels (2A) - add r4, r4, r8 ; add partial sad values - - ; 2nd row - ldr r8, [r2, #0x0] ; load 4 ref pixels (2A) - ldr r9, [r2, #0x4] ; load 4 ref pixels (2A) - ldr r10, [r0, #0x8] ; load 4 src pixels (2B) - ldr r11, [r0, #0xC] ; load 4 src pixels (2B) - - usada8 r4, r6, r8, r4 ; calculate sad for 4 pixels - usad8 r8, r7, r9 ; calculate sad for 4 pixels - - ldr r12, [r2, #0x8] ; load 4 ref pixels (2B) - ldr lr, [r2, #0xC] ; load 4 ref pixels (2B) - - add r0, r0, r1 ; set src pointer to next row - add r2, r2, r3 ; set dst pointer to next row - - usada8 r4, r10, r12, r4 ; calculate sad for 4 pixels - usada8 r8, r11, lr, r8 ; calculate sad for 4 pixels - - pld [r0, r1, lsl #1] - pld [r2, r3, lsl #1] - - subs r5, r5, #1 ; decrement loop counter - add r4, r4, r8 ; add partial sad values - - bne loop - - mov r0, r4 ; return sad - ldmfd sp!, {r4-r12, pc} - - ENDP - - END - diff --git a/vpx_dsp/arm/subpel_variance_media.c b/vpx_dsp/arm/subpel_variance_media.c deleted file mode 100644 index ab5336157..000000000 --- a/vpx_dsp/arm/subpel_variance_media.c +++ /dev/null @@ -1,80 +0,0 @@ -/* - * Copyright (c) 2010 The WebM project authors. All Rights Reserved. - * - * Use of this source code is governed by a BSD-style license - * that can be found in the LICENSE file in the root of the source - * tree. An additional intellectual property rights grant can be found - * in the file PATENTS. All contributing project authors may - * be found in the AUTHORS file in the root of the source tree. - */ - -#include "./vpx_config.h" -#include "./vpx_dsp_rtcd.h" -#include "vpx/vpx_integer.h" -#include "vpx_ports/mem.h" - -#if HAVE_MEDIA -static const int16_t bilinear_filters_media[8][2] = { { 128, 0 }, { 112, 16 }, - { 96, 32 }, { 80, 48 }, - { 64, 64 }, { 48, 80 }, - { 32, 96 }, { 16, 112 } }; - -extern void vpx_filter_block2d_bil_first_pass_media( - const uint8_t *src_ptr, uint16_t *dst_ptr, uint32_t src_pitch, - uint32_t height, uint32_t width, const int16_t *filter); - -extern void vpx_filter_block2d_bil_second_pass_media( - const uint16_t *src_ptr, uint8_t *dst_ptr, int32_t src_pitch, - uint32_t height, uint32_t width, const int16_t *filter); - -unsigned int vpx_sub_pixel_variance8x8_media( - const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t first_pass[10 * 8]; - uint8_t second_pass[8 * 8]; - const int16_t *HFilter, *VFilter; - - HFilter = bilinear_filters_media[xoffset]; - VFilter = bilinear_filters_media[yoffset]; - - vpx_filter_block2d_bil_first_pass_media(src_ptr, first_pass, - src_pixels_per_line, 9, 8, HFilter); - vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, 8, 8, 8, - VFilter); - - return vpx_variance8x8_media(second_pass, 8, dst_ptr, dst_pixels_per_line, - sse); -} - -unsigned int vpx_sub_pixel_variance16x16_media( - const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, - const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse) { - uint16_t first_pass[36 * 16]; - uint8_t second_pass[20 * 16]; - const int16_t *HFilter, *VFilter; - unsigned int var; - - if (xoffset == 4 && yoffset == 0) { - var = vpx_variance_halfpixvar16x16_h_media( - src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - } else if (xoffset == 0 && yoffset == 4) { - var = vpx_variance_halfpixvar16x16_v_media( - src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - } else if (xoffset == 4 && yoffset == 4) { - var = vpx_variance_halfpixvar16x16_hv_media( - src_ptr, src_pixels_per_line, dst_ptr, dst_pixels_per_line, sse); - } else { - HFilter = bilinear_filters_media[xoffset]; - VFilter = bilinear_filters_media[yoffset]; - - vpx_filter_block2d_bil_first_pass_media( - src_ptr, first_pass, src_pixels_per_line, 17, 16, HFilter); - vpx_filter_block2d_bil_second_pass_media(first_pass, second_pass, 16, 16, - 16, VFilter); - - var = vpx_variance16x16_media(second_pass, 16, dst_ptr, dst_pixels_per_line, - sse); - } - return var; -} -#endif // HAVE_MEDIA diff --git a/vpx_dsp/arm/transpose_neon.h b/vpx_dsp/arm/transpose_neon.h new file mode 100644 index 000000000..426abe903 --- /dev/null +++ b/vpx_dsp/arm/transpose_neon.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2016 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#ifndef VPX_DSP_ARM_TRANSPOSE_NEON_H_ +#define VPX_DSP_ARM_TRANSPOSE_NEON_H_ + +#include <arm_neon.h> + +#include "./vpx_config.h" + +// Transpose 64 bit elements as follows: +// a0: 00 01 02 03 04 05 06 07 +// a1: 16 17 18 19 20 21 22 23 +// +// b0.val[0]: 00 01 02 03 16 17 18 19 +// b0.val[1]: 04 05 06 07 20 21 22 23 +static INLINE int16x8x2_t vpx_vtrnq_s64(int32x4_t a0, int32x4_t a1) { + int16x8x2_t b0; + b0.val[0] = vcombine_s16(vreinterpret_s16_s32(vget_low_s32(a0)), + vreinterpret_s16_s32(vget_low_s32(a1))); + b0.val[1] = vcombine_s16(vreinterpret_s16_s32(vget_high_s32(a0)), + vreinterpret_s16_s32(vget_high_s32(a1))); + return b0; +} + +static INLINE void transpose_s16_8x8(int16x8_t *a0, int16x8_t *a1, + int16x8_t *a2, int16x8_t *a3, + int16x8_t *a4, int16x8_t *a5, + int16x8_t *a6, int16x8_t *a7) { + // Swap 16 bit elements. Goes from: + // a0: 00 01 02 03 04 05 06 07 + // a1: 08 09 10 11 12 13 14 15 + // a2: 16 17 18 19 20 21 22 23 + // a3: 24 25 26 27 28 29 30 31 + // a4: 32 33 34 35 36 37 38 39 + // a5: 40 41 42 43 44 45 46 47 + // a6: 48 49 50 51 52 53 54 55 + // a7: 56 57 58 59 60 61 62 63 + // to: + // b0.val[0]: 00 08 02 10 04 12 06 14 + // b0.val[1]: 01 09 03 11 05 13 07 15 + // b1.val[0]: 16 24 18 26 20 28 22 30 + // b1.val[1]: 17 25 19 27 21 29 23 31 + // b2.val[0]: 32 40 34 42 36 44 38 46 + // b2.val[1]: 33 41 35 43 37 45 39 47 + // b3.val[0]: 48 56 50 58 52 60 54 62 + // b3.val[1]: 49 57 51 59 53 61 55 63 + + const int16x8x2_t b0 = vtrnq_s16(*a0, *a1); + const int16x8x2_t b1 = vtrnq_s16(*a2, *a3); + const int16x8x2_t b2 = vtrnq_s16(*a4, *a5); + const int16x8x2_t b3 = vtrnq_s16(*a6, *a7); + + // Swap 32 bit elements resulting in: + // c0.val[0]: 00 08 16 24 04 12 20 28 + // c0.val[1]: 02 10 18 26 06 14 22 30 + // c1.val[0]: 01 09 17 25 05 13 21 29 + // c1.val[1]: 03 11 19 27 07 15 23 31 + // c2.val[0]: 32 40 48 56 36 44 52 60 + // c2.val[1]: 34 42 50 58 38 46 54 62 + // c3.val[0]: 33 41 49 57 37 45 53 61 + // c3.val[1]: 35 43 51 59 39 47 55 63 + + const int32x4x2_t c0 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[0]), + vreinterpretq_s32_s16(b1.val[0])); + const int32x4x2_t c1 = vtrnq_s32(vreinterpretq_s32_s16(b0.val[1]), + vreinterpretq_s32_s16(b1.val[1])); + const int32x4x2_t c2 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[0]), + vreinterpretq_s32_s16(b3.val[0])); + const int32x4x2_t c3 = vtrnq_s32(vreinterpretq_s32_s16(b2.val[1]), + vreinterpretq_s32_s16(b3.val[1])); + + // Swap 64 bit elements resulting in: + // d0.val[0]: 00 08 16 24 32 40 48 56 + // d0.val[1]: 04 12 20 28 36 44 52 60 + // d1.val[0]: 01 09 17 25 33 41 49 57 + // d1.val[1]: 05 13 21 29 37 45 53 61 + // d2.val[0]: 02 10 18 26 34 42 50 58 + // d2.val[1]: 06 14 22 30 38 46 54 62 + // d3.val[0]: 03 11 19 27 35 43 51 59 + // d3.val[1]: 07 15 23 31 39 47 55 63 + const int16x8x2_t d0 = vpx_vtrnq_s64(c0.val[0], c2.val[0]); + const int16x8x2_t d1 = vpx_vtrnq_s64(c1.val[0], c3.val[0]); + const int16x8x2_t d2 = vpx_vtrnq_s64(c0.val[1], c2.val[1]); + const int16x8x2_t d3 = vpx_vtrnq_s64(c1.val[1], c3.val[1]); + + *a0 = d0.val[0]; + *a1 = d1.val[0]; + *a2 = d2.val[0]; + *a3 = d3.val[0]; + *a4 = d0.val[1]; + *a5 = d1.val[1]; + *a6 = d2.val[1]; + *a7 = d3.val[1]; +} + +#endif // VPX_DSP_ARM_TRANSPOSE_NEON_H_ diff --git a/vpx_dsp/arm/variance_halfpixvar16x16_h_media.asm b/vpx_dsp/arm/variance_halfpixvar16x16_h_media.asm deleted file mode 100644 index dab845a20..000000000 --- a/vpx_dsp/arm/variance_halfpixvar16x16_h_media.asm +++ /dev/null @@ -1,182 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vpx_variance_halfpixvar16x16_h_media| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vpx_variance_halfpixvar16x16_h_media| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - ldr r10, c80808080 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - mov lr, #0 ; constant zero -loop - ; 1st 4 pixels - ldr r4, [r0, #0] ; load 4 src pixels - ldr r6, [r0, #1] ; load 4 src pixels with 1 byte offset - ldr r5, [r2, #0] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load 4 src pixels - ldr r6, [r0, #5] ; load 4 src pixels with 1 byte offset - ldr r5, [r2, #4] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load 4 src pixels - ldr r6, [r0, #9] ; load 4 src pixels with 1 byte offset - ldr r5, [r2, #8] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load 4 src pixels - ldr r6, [r0, #13] ; load 4 src pixels with 1 byte offset - ldr r5, [r2, #12] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - subs r12, r12, #1 - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - -c80808080 - DCD 0x80808080 - - END - diff --git a/vpx_dsp/arm/variance_halfpixvar16x16_hv_media.asm b/vpx_dsp/arm/variance_halfpixvar16x16_hv_media.asm deleted file mode 100644 index 01953b709..000000000 --- a/vpx_dsp/arm/variance_halfpixvar16x16_hv_media.asm +++ /dev/null @@ -1,222 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vpx_variance_halfpixvar16x16_hv_media| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vpx_variance_halfpixvar16x16_hv_media| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - ldr r10, c80808080 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - mov lr, #0 ; constant zero -loop - add r9, r0, r1 ; pointer to pixels on the next row - ; 1st 4 pixels - ldr r4, [r0, #0] ; load source pixels a, row N - ldr r6, [r0, #1] ; load source pixels b, row N - ldr r5, [r9, #0] ; load source pixels c, row N+1 - ldr r7, [r9, #1] ; load source pixels d, row N+1 - - ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 - mvn r7, r7 - uhsub8 r5, r5, r7 - eor r5, r5, r10 - ; z = (x + y + 1) >> 1, interpolate half pixel values vertically - mvn r5, r5 - uhsub8 r4, r4, r5 - ldr r5, [r2, #0] ; load 4 ref pixels - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load source pixels a, row N - ldr r6, [r0, #5] ; load source pixels b, row N - ldr r5, [r9, #4] ; load source pixels c, row N+1 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - ldr r7, [r9, #5] ; load source pixels d, row N+1 - - ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 - mvn r7, r7 - uhsub8 r5, r5, r7 - eor r5, r5, r10 - ; z = (x + y + 1) >> 1, interpolate half pixel values vertically - mvn r5, r5 - uhsub8 r4, r4, r5 - ldr r5, [r2, #4] ; load 4 ref pixels - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load source pixels a, row N - ldr r6, [r0, #9] ; load source pixels b, row N - ldr r5, [r9, #8] ; load source pixels c, row N+1 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - ldr r7, [r9, #9] ; load source pixels d, row N+1 - - ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 - mvn r7, r7 - uhsub8 r5, r5, r7 - eor r5, r5, r10 - ; z = (x + y + 1) >> 1, interpolate half pixel values vertically - mvn r5, r5 - uhsub8 r4, r4, r5 - ldr r5, [r2, #8] ; load 4 ref pixels - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load source pixels a, row N - ldr r6, [r0, #13] ; load source pixels b, row N - ldr r5, [r9, #12] ; load source pixels c, row N+1 - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - ldr r7, [r9, #13] ; load source pixels d, row N+1 - - ; x = (a + b + 1) >> 1, interpolate pixels horizontally on row N - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - ; y = (c + d + 1) >> 1, interpolate pixels horizontally on row N+1 - mvn r7, r7 - uhsub8 r5, r5, r7 - eor r5, r5, r10 - ; z = (x + y + 1) >> 1, interpolate half pixel values vertically - mvn r5, r5 - uhsub8 r4, r4, r5 - ldr r5, [r2, #12] ; load 4 ref pixels - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - subs r12, r12, #1 - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - -c80808080 - DCD 0x80808080 - - END diff --git a/vpx_dsp/arm/variance_halfpixvar16x16_v_media.asm b/vpx_dsp/arm/variance_halfpixvar16x16_v_media.asm deleted file mode 100644 index 0d17acb38..000000000 --- a/vpx_dsp/arm/variance_halfpixvar16x16_v_media.asm +++ /dev/null @@ -1,184 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vpx_variance_halfpixvar16x16_v_media| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vpx_variance_halfpixvar16x16_v_media| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - ldr r10, c80808080 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - mov lr, #0 ; constant zero -loop - add r9, r0, r1 ; set src pointer to next row - ; 1st 4 pixels - ldr r4, [r0, #0] ; load 4 src pixels - ldr r6, [r9, #0] ; load 4 src pixels from next row - ldr r5, [r2, #0] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load 4 src pixels - ldr r6, [r9, #4] ; load 4 src pixels from next row - ldr r5, [r2, #4] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load 4 src pixels - ldr r6, [r9, #8] ; load 4 src pixels from next row - ldr r5, [r2, #8] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load 4 src pixels - ldr r6, [r9, #12] ; load 4 src pixels from next row - ldr r5, [r2, #12] ; load 4 ref pixels - - ; bilinear interpolation - mvn r6, r6 - uhsub8 r4, r4, r6 - eor r4, r4, r10 - - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r6, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r6, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r7, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - smlad r11, r7, r7, r11 ; dual signed multiply, add and accumulate (2) - - - subs r12, r12, #1 - - bne loop - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - -c80808080 - DCD 0x80808080 - - END - diff --git a/vpx_dsp/arm/variance_media.asm b/vpx_dsp/arm/variance_media.asm deleted file mode 100644 index f7f9e14b0..000000000 --- a/vpx_dsp/arm/variance_media.asm +++ /dev/null @@ -1,358 +0,0 @@ -; -; Copyright (c) 2011 The WebM project authors. All Rights Reserved. -; -; Use of this source code is governed by a BSD-style license -; that can be found in the LICENSE file in the root of the source -; tree. An additional intellectual property rights grant can be found -; in the file PATENTS. All contributing project authors may -; be found in the AUTHORS file in the root of the source tree. -; - - - EXPORT |vpx_variance16x16_media| - EXPORT |vpx_variance8x8_media| - EXPORT |vpx_mse16x16_media| - - ARM - REQUIRE8 - PRESERVE8 - - AREA ||.text||, CODE, READONLY, ALIGN=2 - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vpx_variance16x16_media| PROC - - stmfd sp!, {r4-r12, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r8, #0 ; initialize sum = 0 - mov r11, #0 ; initialize sse = 0 - mov r12, #16 ; set loop counter to 16 (=block height) - -loop16x16 - ; 1st 4 pixels - ldr r4, [r0, #0] ; load 4 src pixels - ldr r5, [r2, #0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r6, r4, r5 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - ; calculate total sum - adds r8, r8, r4 ; add positive differences to sum - subs r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r4, [r0, #4] ; load 4 src pixels - ldr r5, [r2, #4] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r4, [r0, #8] ; load 4 src pixels - ldr r5, [r2, #8] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r4, [r0, #12] ; load 4 src pixels - ldr r5, [r2, #12] ; load 4 ref pixels - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - usub8 r6, r4, r5 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r6, lr ; select bytes with positive difference - usub8 r9, r5, r4 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r6, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r4, r7, lr ; calculate sum of positive differences - usad8 r5, r6, lr ; calculate sum of negative differences - orr r6, r6, r7 ; differences of all 4 pixels - - ; calculate total sum - add r8, r8, r4 ; add positive differences to sum - sub r8, r8, r5 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r5, r6 ; byte (two pixels) to halfwords - uxtb16 r10, r6, ror #8 ; another two pixels to halfwords - smlad r11, r5, r5, r11 ; dual signed multiply, add and accumulate (1) - smlad r11, r10, r10, r11 ; dual signed multiply, add and accumulate (2) - - - subs r12, r12, #1 - - bne loop16x16 - - ; return stuff - ldr r6, [sp, #40] ; get address of sse - mul r0, r8, r8 ; sum * sum - str r11, [r6] ; store sse - sub r0, r11, r0, lsr #8 ; return (sse - ((sum * sum) >> 8)) - - ldmfd sp!, {r4-r12, pc} - - ENDP - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -|vpx_variance8x8_media| PROC - - push {r4-r10, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r12, #8 ; set loop counter to 8 (=block height) - mov r4, #0 ; initialize sum = 0 - mov r5, #0 ; initialize sse = 0 - -loop8x8 - ; 1st 4 pixels - ldr r6, [r0, #0x0] ; load 4 src pixels - ldr r7, [r2, #0x0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r8, r6, r7 ; calculate difference - pld [r0, r1, lsl #1] - sel r10, r8, lr ; select bytes with positive difference - usub8 r9, r7, r6 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r6, r10, lr ; calculate sum of positive differences - usad8 r7, r8, lr ; calculate sum of negative differences - orr r8, r8, r10 ; differences of all 4 pixels - ; calculate total sum - add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r7, r8 ; byte (two pixels) to halfwords - uxtb16 r10, r8, ror #8 ; another two pixels to halfwords - smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r6, [r0, #0x4] ; load 4 src pixels - ldr r7, [r2, #0x4] ; load 4 ref pixels - smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r6, r7 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r10, r8, lr ; select bytes with positive difference - usub8 r9, r7, r6 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r6, r10, lr ; calculate sum of positive differences - usad8 r7, r8, lr ; calculate sum of negative differences - orr r8, r8, r10 ; differences of all 4 pixels - - ; calculate total sum - add r4, r4, r6 ; add positive differences to sum - sub r4, r4, r7 ; subtract negative differences from sum - - ; calculate sse - uxtb16 r7, r8 ; byte (two pixels) to halfwords - uxtb16 r10, r8, ror #8 ; another two pixels to halfwords - smlad r5, r7, r7, r5 ; dual signed multiply, add and accumulate (1) - subs r12, r12, #1 ; next row - smlad r5, r10, r10, r5 ; dual signed multiply, add and accumulate (2) - - bne loop8x8 - - ; return stuff - ldr r8, [sp, #32] ; get address of sse - mul r1, r4, r4 ; sum * sum - str r5, [r8] ; store sse - sub r0, r5, r1, ASR #6 ; return (sse - ((sum * sum) >> 6)) - - pop {r4-r10, pc} - - ENDP - -; r0 unsigned char *src_ptr -; r1 int source_stride -; r2 unsigned char *ref_ptr -; r3 int recon_stride -; stack unsigned int *sse -; -;note: Based on vpx_variance16x16_media. In this function, sum is never used. -; So, we can remove this part of calculation. - -|vpx_mse16x16_media| PROC - - push {r4-r9, lr} - - pld [r0, r1, lsl #0] - pld [r2, r3, lsl #0] - - mov r12, #16 ; set loop counter to 16 (=block height) - mov r4, #0 ; initialize sse = 0 - -loopmse - ; 1st 4 pixels - ldr r5, [r0, #0x0] ; load 4 src pixels - ldr r6, [r2, #0x0] ; load 4 ref pixels - - mov lr, #0 ; constant zero - - usub8 r8, r5, r6 ; calculate difference - pld [r0, r1, lsl #1] - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - pld [r2, r3, lsl #1] - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - ldr r5, [r0, #0x4] ; load 4 src pixels - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 2nd 4 pixels - ldr r6, [r2, #0x4] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - ldr r5, [r0, #0x8] ; load 4 src pixels - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 3rd 4 pixels - ldr r6, [r2, #0x8] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - ldr r5, [r0, #0xc] ; load 4 src pixels - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - - ; 4th 4 pixels - ldr r6, [r2, #0xc] ; load 4 ref pixels - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - usub8 r8, r5, r6 ; calculate difference - add r0, r0, r1 ; set src_ptr to next row - sel r7, r8, lr ; select bytes with positive difference - usub8 r9, r6, r5 ; calculate difference with reversed operands - add r2, r2, r3 ; set dst_ptr to next row - sel r8, r9, lr ; select bytes with negative difference - - ; calculate partial sums - usad8 r5, r7, lr ; calculate sum of positive differences - usad8 r6, r8, lr ; calculate sum of negative differences - orr r8, r8, r7 ; differences of all 4 pixels - - subs r12, r12, #1 ; next row - - ; calculate sse - uxtb16 r6, r8 ; byte (two pixels) to halfwords - uxtb16 r7, r8, ror #8 ; another two pixels to halfwords - smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) - smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) - - bne loopmse - - ; return stuff - ldr r1, [sp, #28] ; get address of sse - mov r0, r4 ; return sse - str r4, [r1] ; store sse - - pop {r4-r9, pc} - - ENDP - - END diff --git a/vpx_dsp/mips/convolve8_dspr2.c b/vpx_dsp/mips/convolve8_dspr2.c index 789ec8d53..f6812c7d0 100644 --- a/vpx_dsp/mips/convolve8_dspr2.c +++ b/vpx_dsp/mips/convolve8_dspr2.c @@ -1424,7 +1424,8 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, src += src_stride; dst += dst_stride; } - } break; + break; + } case 8: { uint32_t tp1, tp2; @@ -1446,7 +1447,8 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, src += src_stride; dst += dst_stride; } - } break; + break; + } case 16: { uint32_t tp1, tp2, tp3, tp4; @@ -1474,7 +1476,8 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, src += src_stride; dst += dst_stride; } - } break; + break; + } case 32: { uint32_t tp1, tp2, tp3, tp4; uint32_t tp5, tp6, tp7, tp8; @@ -1512,7 +1515,8 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, src += src_stride; dst += dst_stride; } - } break; + break; + } case 64: { uint32_t tp1, tp2, tp3, tp4; uint32_t tp5, tp6, tp7, tp8; @@ -1573,7 +1577,8 @@ void vpx_convolve_copy_dspr2(const uint8_t *src, ptrdiff_t src_stride, src += src_stride; dst += dst_stride; } - } break; + break; + } default: for (y = h; y--;) { for (x = 0; x < w; ++x) { diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk index f56483e0e..21dc95a34 100644 --- a/vpx_dsp/vpx_dsp.mk +++ b/vpx_dsp/vpx_dsp.mk @@ -271,7 +271,6 @@ DSP_SRCS-yes += subtract.c DSP_SRCS-yes += sum_squares.c DSP_SRCS-$(HAVE_SSE2) += x86/sum_squares_sse2.c -DSP_SRCS-$(HAVE_MEDIA) += arm/sad_media$(ASM) DSP_SRCS-$(HAVE_NEON) += arm/sad4d_neon.c DSP_SRCS-$(HAVE_NEON) += arm/sad_neon.c DSP_SRCS-$(HAVE_NEON) += arm/subtract_neon.c @@ -302,12 +301,6 @@ ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC) DSP_SRCS-yes += variance.c DSP_SRCS-yes += variance.h -DSP_SRCS-$(HAVE_MEDIA) += arm/bilinear_filter_media$(ASM) -DSP_SRCS-$(HAVE_MEDIA) += arm/subpel_variance_media.c -DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_h_media$(ASM) -DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_hv_media$(ASM) -DSP_SRCS-$(HAVE_MEDIA) += arm/variance_halfpixvar16x16_v_media$(ASM) -DSP_SRCS-$(HAVE_MEDIA) += arm/variance_media$(ASM) DSP_SRCS-$(HAVE_NEON) += arm/subpel_variance_neon.c DSP_SRCS-$(HAVE_NEON) += arm/variance_neon.c @@ -335,6 +328,9 @@ DSP_SRCS-$(HAVE_SSE2) += x86/highbd_subpel_variance_impl_sse2.asm endif # CONFIG_VP9_HIGHBITDEPTH endif # CONFIG_ENCODERS || CONFIG_POSTPROC || CONFIG_VP9_POSTPROC +# Neon utilities +DSP_SRCS-$(HAVE_NEON) += arm/transpose_neon.h + DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes) DSP_SRCS-yes += vpx_dsp_rtcd.c diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 4d6e97582..5ad154f36 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -960,7 +960,7 @@ add_proto qw/unsigned int vpx_sad16x32/, "const uint8_t *src_ptr, int src_stride specialize qw/vpx_sad16x32 msa sse2/; add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad16x16 media neon msa sse2/; +specialize qw/vpx_sad16x16 neon msa sse2/; add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad16x8 neon msa sse2/; @@ -1387,7 +1387,7 @@ add_proto qw/unsigned int vpx_variance16x32/, "const uint8_t *src_ptr, int sourc specialize qw/vpx_variance16x32 sse2 msa/; add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance16x16 sse2 avx2 media neon msa/; + specialize qw/vpx_variance16x16 sse2 avx2 neon msa/; add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance16x8 sse2 neon msa/; @@ -1396,7 +1396,7 @@ add_proto qw/unsigned int vpx_variance8x16/, "const uint8_t *src_ptr, int source specialize qw/vpx_variance8x16 sse2 neon msa/; add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; - specialize qw/vpx_variance8x8 sse2 media neon msa/; + specialize qw/vpx_variance8x8 sse2 neon msa/; add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; specialize qw/vpx_variance8x4 sse2 msa/; @@ -1417,7 +1417,7 @@ add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, co specialize qw/vpx_get8x8var sse2 neon msa/; add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; - specialize qw/vpx_mse16x16 sse2 avx2 media neon msa/; + specialize qw/vpx_mse16x16 sse2 avx2 neon msa/; add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse"; specialize qw/vpx_mse16x8 sse2 msa/; @@ -1458,7 +1458,7 @@ add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int specialize qw/vpx_sub_pixel_variance16x32 msa sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance16x16 media neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance16x16 neon msa sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance16x8 msa sse2 ssse3/; @@ -1467,7 +1467,7 @@ add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int specialize qw/vpx_sub_pixel_variance8x16 msa sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_sub_pixel_variance8x8 media neon msa sse2 ssse3/; + specialize qw/vpx_sub_pixel_variance8x8 neon msa sse2 ssse3/; add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse"; specialize qw/vpx_sub_pixel_variance8x4 msa sse2 ssse3/; @@ -1520,14 +1520,19 @@ add_proto qw/uint32_t vpx_sub_pixel_avg_variance4x4/, "const uint8_t *src_ptr, i # # Specialty Subpixel # +# TODO(johannkoenig): Add neon implementations of +# vpx_variance_halfpixvar16x16_h +# vpx_variance_halfpixvar16x16_v +# vpx_variance_halfpixvar16x16_hv +# https://bugs.chromium.org/p/webm/issues/detail?id=1273 add_proto qw/uint32_t vpx_variance_halfpixvar16x16_h/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_variance_halfpixvar16x16_h sse2 media/; + specialize qw/vpx_variance_halfpixvar16x16_h sse2/; add_proto qw/uint32_t vpx_variance_halfpixvar16x16_v/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_variance_halfpixvar16x16_v sse2 media/; + specialize qw/vpx_variance_halfpixvar16x16_v sse2/; add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride, uint32_t *sse"; - specialize qw/vpx_variance_halfpixvar16x16_hv sse2 media/; + specialize qw/vpx_variance_halfpixvar16x16_hv sse2/; if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"; diff --git a/vpx_ports/arm_cpudetect.c b/vpx_ports/arm_cpudetect.c index d320ce8fa..79c60f7a1 100644 --- a/vpx_ports/arm_cpudetect.c +++ b/vpx_ports/arm_cpudetect.c @@ -10,8 +10,9 @@ #include <stdlib.h> #include <string.h> -#include "vpx_ports/arm.h" + #include "./vpx_config.h" +#include "vpx_ports/arm.h" #ifdef WINAPI_FAMILY #include <winapifamily.h> @@ -49,9 +50,6 @@ int arm_cpu_caps(void) { return flags; } mask = arm_cpu_env_mask(); -#if HAVE_MEDIA - flags |= HAS_MEDIA; -#endif /* HAVE_MEDIA */ #if HAVE_NEON || HAVE_NEON_ASM flags |= HAS_NEON; #endif /* HAVE_NEON || HAVE_NEON_ASM */ @@ -75,28 +73,18 @@ int arm_cpu_caps(void) { * instructions via their assembled hex code. * All of these instructions should be essentially nops. */ -#if HAVE_MEDIA - if (mask & HAS_MEDIA) __try { - /*SHADD8 r3,r3,r3*/ - __emit(0xE6333F93); - flags |= HAS_MEDIA; +#if HAVE_NEON || HAVE_NEON_ASM + if (mask & HAS_NEON) { + __try { + /*VORR q0,q0,q0*/ + __emit(0xF2200150); + flags |= HAS_NEON; } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { /*Ignore exception.*/ } -} -#endif /* HAVE_MEDIA */ -#if HAVE_NEON || HAVE_NEON_ASM -if (mask & HAS_NEON) { - __try { - /*VORR q0,q0,q0*/ - __emit(0xF2200150); - flags |= HAS_NEON; - } __except (GetExceptionCode() == EXCEPTION_ILLEGAL_INSTRUCTION) { - /*Ignore exception.*/ } -} #endif /* HAVE_NEON || HAVE_NEON_ASM */ -return flags & mask; + return flags & mask; } #elif defined(__ANDROID__) /* end _MSC_VER */ @@ -112,9 +100,6 @@ int arm_cpu_caps(void) { mask = arm_cpu_env_mask(); features = android_getCpuFeatures(); -#if HAVE_MEDIA - flags |= HAS_MEDIA; -#endif /* HAVE_MEDIA */ #if HAVE_NEON || HAVE_NEON_ASM if (features & ANDROID_CPU_ARM_FEATURE_NEON) flags |= HAS_NEON; #endif /* HAVE_NEON || HAVE_NEON_ASM */ @@ -153,15 +138,6 @@ int arm_cpu_caps(void) { } } #endif /* HAVE_NEON || HAVE_NEON_ASM */ -#if HAVE_MEDIA - if (memcmp(buf, "CPU architecture:", 17) == 0) { - int version; - version = atoi(buf + 17); - if (version >= 6) { - flags |= HAS_MEDIA; - } - } -#endif /* HAVE_MEDIA */ } fclose(fin); } diff --git a/vpx_ports/mem.h b/vpx_ports/mem.h index 7e77c6ca9..23e015191 100644 --- a/vpx_ports/mem.h +++ b/vpx_ports/mem.h @@ -39,6 +39,7 @@ /* Shift down with rounding */ #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n)) +#define ROUND64_POWER_OF_TWO(value, n) (((value) + (1ULL << ((n)-1))) >> (n)) #define ALIGN_POWER_OF_TWO(value, n) \ (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1)) @@ -48,4 +49,14 @@ #define CONVERT_TO_BYTEPTR(x) ((uint8_t *)(((uintptr_t)(x)) >> 1)) #endif // CONFIG_VP9_HIGHBITDEPTH +#if !defined(__has_feature) +#define __has_feature(x) 0 +#endif // !defined(__has_feature) + +#if __has_feature(address_sanitizer) || __SANITIZE_ADDRESS__ +#define VPX_WITH_ASAN 1 +#else +#define VPX_WITH_ASAN 0 +#endif // __has_feature(address_sanitizer) || __SANITIZE_ADDRESS + #endif // VPX_PORTS_MEM_H_ |