author    James Zern <jzern@google.com>  2015-07-24 16:54:51 -0700
committer James Zern <jzern@google.com>  2015-07-31 18:34:20 -0700
commit    fcb4253c9cdaece2dd3a2ab8c6842ed41e24f123 (patch)
tree      a61c7507bfd7c9f36cf2899409c3c883f4f5474b /third_party/libyuv/source/scale_neon64.cc
parent    f42012e526f218f4607e5d678c5bd44bb5978bff (diff)
update libyuv to r1456
picks up build warning fixes for visual studio 2015
Change-Id: Idea85fa70d1aeb2a46ea355b87fe41ec5b2b9520
Diffstat (limited to 'third_party/libyuv/source/scale_neon64.cc')
-rw-r--r--  third_party/libyuv/source/scale_neon64.cc  374
1 file changed, 325 insertions(+), 49 deletions(-)
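Most of the changes below swap bare inline-asm operand references such as %2 for the 32-bit form %w2. On AArch64, an int operand bound with "+r" is allocated a full 64-bit x register; referencing it without the w modifier makes instructions like subs operate on all 64 bits, which newer toolchains flag as an operand-size mismatch. A minimal sketch of the modifier; count_down() is an illustrative helper, not code from this patch:

    // AArch64-only sketch of the %w operand modifier.
    #include <stdio.h>

    static int count_down(int n) {
      asm volatile(
          "1:                \n"
          "subs %w0, %w0, #1 \n"  // %w0 = 32-bit view; plain %0 names the x register
          "b.gt 1b           \n"
          : "+r"(n)
          :
          : "cc");                // subs writes the flags
      return n;  // 0 for any n > 0
    }

    int main(void) {
      printf("%d\n", count_down(16));  // prints 0
      return 0;
    }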
diff --git a/third_party/libyuv/source/scale_neon64.cc b/third_party/libyuv/source/scale_neon64.cc
index fb68b67d2..1d5519357 100644
--- a/third_party/libyuv/source/scale_neon64.cc
+++ b/third_party/libyuv/source/scale_neon64.cc
@@ -27,8 +27,8 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
     "1: \n"
     // load even pixels into v0, odd into v1
     MEMACCESS(0)
-    "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
-    "subs %2, %2, #16 \n" // 16 processed per loop
+    "ld2 {v0.16b,v1.16b}, [%0], #32 \n"
+    "subs %w2, %w2, #16 \n" // 16 processed per loop
     MEMACCESS(1)
     "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
     "b.gt 1b \n"
@@ -40,6 +40,29 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
   );
 }
 
+// Read 32x1 average down and write 16x1.
+void ScaleRowDown2Linear_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                              uint8* dst, int dst_width) {
+  asm volatile (
+    "1: \n"
+    MEMACCESS(0)
+    "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load pixels and post inc
+    "subs %w2, %w2, #16 \n" // 16 processed per loop
+    "uaddlp v0.8h, v0.16b \n" // add adjacent
+    "uaddlp v1.8h, v1.16b \n"
+    "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
+    "rshrn2 v0.16b, v1.8h, #1 \n"
+    MEMACCESS(1)
+    "st1 {v0.16b}, [%1], #16 \n"
+    "b.gt 1b \n"
+  : "+r"(src_ptr), // %0
+    "+r"(dst), // %1
+    "+r"(dst_width) // %2
+  :
+  : "v0", "v1" // Clobber List
+  );
+}
+
 // Read 32x2 average down and write 16x1.
 void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                            uint8* dst, int dst_width) {
@@ -51,7 +74,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
     "ld1 {v0.16b,v1.16b}, [%0], #32 \n" // load row 1 and post inc
     MEMACCESS(1)
     "ld1 {v2.16b, v3.16b}, [%1], #32 \n" // load row 2 and post inc
-    "subs %3, %3, #16 \n" // 16 processed per loop
+    "subs %w3, %w3, #16 \n" // 16 processed per loop
     "uaddlp v0.8h, v0.16b \n" // row 1 add adjacent
     "uaddlp v1.8h, v1.16b \n"
     "uadalp v0.8h, v2.16b \n" // row 2 add adjacent + row1
@@ -76,7 +99,7 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
     "1: \n"
     MEMACCESS(0)
     "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
-    "subs %2, %2, #8 \n" // 8 processed per loop
+    "subs %w2, %w2, #8 \n" // 8 processed per loop
     MEMACCESS(1)
     "st1 {v2.8b}, [%1], #8 \n"
     "b.gt 1b \n"
@@ -103,7 +126,7 @@ asm volatile (
     "ld1 {v2.16b}, [%3], #16 \n"
     MEMACCESS(5)
     "ld1 {v3.16b}, [%4], #16 \n"
-    "subs %5, %5, #4 \n"
+    "subs %w5, %w5, #4 \n"
     "uaddlp v0.8h, v0.16b \n"
     "uadalp v0.8h, v1.16b \n"
     "uadalp v0.8h, v2.16b \n"
@@ -134,7 +157,7 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
     "1: \n"
     MEMACCESS(0)
     "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
-    "subs %2, %2, #24 \n"
+    "subs %w2, %w2, #24 \n"
     "orr v2.16b, v3.16b, v3.16b \n" // order v0, v1, v2
     MEMACCESS(1)
     "st3 {v0.8b,v1.8b,v2.8b}, [%1], #24 \n"
@@ -158,7 +181,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
     "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
     MEMACCESS(3)
     "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
-    "subs %2, %2, #24 \n"
+    "subs %w2, %w2, #24 \n"
 
     // filter src line 0 with src line 1
     // expand chars to shorts to allow for room
@@ -218,7 +241,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
     "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n" // src line 0
     MEMACCESS(3)
     "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%3], #32 \n" // src line 1
-    "subs %2, %2, #24 \n"
+    "subs %w2, %w2, #24 \n"
     // average src line 0 with src line 1
     "urhadd v0.8b, v0.8b, v4.8b \n"
     "urhadd v1.8b, v1.8b, v5.8b \n"
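The new ScaleRowDown2Linear_NEON in the hunk above pair-averages horizontally: uaddlp widens and sums adjacent bytes, and rshrn #1 halves with rounding. A scalar sketch of the same arithmetic; illustrative only, and not upstream's own C fallback:

    #include <stddef.h>
    #include <stdint.h>

    // Each output pixel is the rounded average of a horizontal pair.
    // src_stride is unused, matching the NEON kernel's signature.
    static void scale_row_down2_linear_sketch(const uint8_t* src_ptr,
                                              ptrdiff_t src_stride,
                                              uint8_t* dst, int dst_width) {
      (void)src_stride;
      for (int x = 0; x < dst_width; ++x) {
        // uaddlp widens+adds adjacent bytes; rshrn #1 is (sum + 1) >> 1.
        dst[x] = (uint8_t)((src_ptr[2 * x] + src_ptr[2 * x + 1] + 1) >> 1);
      }
    }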
@@ -271,7 +294,7 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
     "1: \n"
     MEMACCESS(0)
     "ld1 {v0.16b,v1.16b}, [%0], #32 \n"
-    "subs %2, %2, #12 \n"
+    "subs %w2, %w2, #12 \n"
     "tbl v2.16b, {v0.16b,v1.16b}, v3.16b \n"
     MEMACCESS(1)
     "st1 {v2.8b}, [%1], #8 \n"
@@ -313,7 +336,7 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
     "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
     MEMACCESS(4)
     "ld4 {v16.8b,v17.8b,v18.8b,v19.8b}, [%3], #32 \n"
-    "subs %4, %4, #12 \n"
+    "subs %w4, %w4, #12 \n"
 
     // Shuffle the input data around to get align the data
     //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
@@ -437,7 +460,7 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
     "ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%0], #32 \n"
     MEMACCESS(3)
     "ld4 {v4.8b,v5.8b,v6.8b,v7.8b}, [%2], #32 \n"
-    "subs %3, %3, #12 \n"
+    "subs %w3, %w3, #12 \n"
 
     // Shuffle the input data around to get align the data
     //  so adjacent data can be added. 0,1 - 2,3 - 4,5 - 6,7
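The ScaleRowDown38 family keeps 3 of every 8 input pixels, which is why each iteration loads 32 bytes and retires 12 outputs (subs %w2, %w2, #12). A plausible scalar sketch of the unfiltered 8-to-3 mapping the tbl shuffle implements; the 0/3/6 sample positions are an assumption here, since the index table itself is not shown in these hunks:

    #include <stddef.h>
    #include <stdint.h>

    static void scale_row_down38_sketch(const uint8_t* src_ptr,
                                        ptrdiff_t src_stride,
                                        uint8_t* dst, int dst_width) {
      (void)src_stride;
      for (int x = 0; x < dst_width; x += 3) {
        dst[0] = src_ptr[0];  // 3 of every 8 pixels survive
        dst[1] = src_ptr[3];
        dst[2] = src_ptr[6];
        dst += 3;
        src_ptr += 8;
      }
    }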
@@ -522,20 +545,127 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
   );
 }
 
+void ScaleAddRows_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
+                       uint16* dst_ptr, int src_width, int src_height) {
+  const uint8* src_tmp = NULL;
+  asm volatile (
+    "1: \n"
+    "mov %0, %1 \n"
+    "mov w12, %w5 \n"
+    "eor v2.16b, v2.16b, v2.16b \n"
+    "eor v3.16b, v3.16b, v3.16b \n"
+    "2: \n"
+    // load 16 pixels into q0
+    MEMACCESS(0)
+    "ld1 {v0.16b}, [%0], %3 \n"
+    "uaddw2 v3.8h, v3.8h, v0.16b \n"
+    "uaddw v2.8h, v2.8h, v0.8b \n"
+    "subs w12, w12, #1 \n"
+    "b.gt 2b \n"
+    MEMACCESS(2)
+    "st1 {v2.8h, v3.8h}, [%2], #32 \n" // store pixels
+    "add %1, %1, #16 \n"
+    "subs %w4, %w4, #16 \n" // 16 processed per loop
+    "b.gt 1b \n"
+  : "+r"(src_tmp), // %0
+    "+r"(src_ptr), // %1
+    "+r"(dst_ptr), // %2
+    "+r"(src_stride), // %3
+    "+r"(src_width), // %4
+    "+r"(src_height) // %5
+  :
+  : "memory", "cc", "w12", "v0", "v1", "v2", "v3" // Clobber List
+  );
+}
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA8_LANE(n) \
+    "lsr %5, %3, #16 \n" \
+    "add %6, %1, %5 \n" \
+    "add %3, %3, %4 \n" \
+    MEMACCESS(6) \
+    "ld2 {v4.b, v5.b}["#n"], [%6] \n"
+
+void ScaleFilterCols_NEON(uint8* dst_ptr, const uint8* src_ptr,
+                          int dst_width, int x, int dx) {
+  int dx_offset[4] = {0, 1, 2, 3};
+  int* tmp = dx_offset;
+  const uint8* src_tmp = src_ptr;
+  int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
+  int64 x64 = (int64) x;
+  int64 dx64 = (int64) dx;
+  asm volatile (
+    "dup v0.4s, %w3 \n" // x
+    "dup v1.4s, %w4 \n" // dx
+    "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+    "shl v3.4s, v1.4s, #2 \n" // 4 * dx
+    "mul v1.4s, v1.4s, v2.4s \n"
+    // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "add v1.4s, v1.4s, v0.4s \n"
+    // x + 4 * dx, x + 5 * dx, x + 6 * dx, x + 7 * dx
+    "add v2.4s, v1.4s, v3.4s \n"
+    "shl v0.4s, v3.4s, #1 \n" // 8 * dx
+    "1: \n"
+    LOAD2_DATA8_LANE(0)
+    LOAD2_DATA8_LANE(1)
+    LOAD2_DATA8_LANE(2)
+    LOAD2_DATA8_LANE(3)
+    LOAD2_DATA8_LANE(4)
+    LOAD2_DATA8_LANE(5)
+    LOAD2_DATA8_LANE(6)
+    LOAD2_DATA8_LANE(7)
+    "mov v6.16b, v1.16b \n"
+    "mov v7.16b, v2.16b \n"
+    "uzp1 v6.8h, v6.8h, v7.8h \n"
+    "ushll v4.8h, v4.8b, #0 \n"
+    "ushll v5.8h, v5.8b, #0 \n"
+    "ssubl v16.4s, v5.4h, v4.4h \n"
+    "ssubl2 v17.4s, v5.8h, v4.8h \n"
+    "ushll v7.4s, v6.4h, #0 \n"
+    "ushll2 v6.4s, v6.8h, #0 \n"
+    "mul v16.4s, v16.4s, v7.4s \n"
+    "mul v17.4s, v17.4s, v6.4s \n"
+    "shrn v6.4h, v16.4s, #16 \n"
+    "shrn2 v6.8h, v17.4s, #16 \n"
+    "add v4.8h, v4.8h, v6.8h \n"
+    "xtn v4.8b, v4.8h \n"
+
+    MEMACCESS(0)
+    "st1 {v4.8b}, [%0], #8 \n" // store pixels
+    "add v1.4s, v1.4s, v0.4s \n"
+    "add v2.4s, v2.4s, v0.4s \n"
+    "subs %w2, %w2, #8 \n" // 8 processed per loop
+    "b.gt 1b \n"
+  : "+r"(dst_ptr), // %0
+    "+r"(src_ptr), // %1
+    "+r"(dst_width64), // %2
+    "+r"(x64), // %3
+    "+r"(dx64), // %4
+    "+r"(tmp), // %5
+    "+r"(src_tmp) // %6
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3",
+    "v4", "v5", "v6", "v7", "v16", "v17"
+  );
+}
+
+#undef LOAD2_DATA8_LANE
+
 // 16x2 -> 16x1
 void ScaleFilterRows_NEON(uint8* dst_ptr,
                           const uint8* src_ptr, ptrdiff_t src_stride,
                           int dst_width, int source_y_fraction) {
   int y_fraction = 256 - source_y_fraction;
   asm volatile (
-    "cmp %4, #0 \n"
+    "cmp %w4, #0 \n"
     "b.eq 100f \n"
     "add %2, %2, %1 \n"
-    "cmp %4, #64 \n"
+    "cmp %w4, #64 \n"
     "b.eq 75f \n"
-    "cmp %4, #128 \n"
+    "cmp %w4, #128 \n"
     "b.eq 50f \n"
-    "cmp %4, #192 \n"
+    "cmp %w4, #192 \n"
     "b.eq 25f \n"
 
     "dup v5.8b, %w4 \n"
@@ -546,7 +676,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
     "ld1 {v0.16b}, [%1], #16 \n"
     MEMACCESS(2)
     "ld1 {v1.16b}, [%2], #16 \n"
-    "subs %3, %3, #16 \n"
+    "subs %w3, %w3, #16 \n"
     "umull v6.8h, v0.8b, v4.8b \n"
     "umull2 v7.8h, v0.16b, v4.16b \n"
     "umlal v6.8h, v1.8b, v5.8b \n"
@@ -564,7 +694,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
     "ld1 {v0.16b}, [%1], #16 \n"
     MEMACCESS(2)
     "ld1 {v1.16b}, [%2], #16 \n"
-    "subs %3, %3, #16 \n"
+    "subs %w3, %w3, #16 \n"
     "urhadd v0.16b, v0.16b, v1.16b \n"
    "urhadd v0.16b, v0.16b, v1.16b \n"
     MEMACCESS(0)
@@ -578,7 +708,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
     "ld1 {v0.16b}, [%1], #16 \n"
     MEMACCESS(2)
     "ld1 {v1.16b}, [%2], #16 \n"
-    "subs %3, %3, #16 \n"
+    "subs %w3, %w3, #16 \n"
     "urhadd v0.16b, v0.16b, v1.16b \n"
     MEMACCESS(0)
     "st1 {v0.16b}, [%0], #16 \n"
@@ -591,7 +721,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
     "ld1 {v1.16b}, [%1], #16 \n"
     MEMACCESS(2)
     "ld1 {v0.16b}, [%2], #16 \n"
-    "subs %3, %3, #16 \n"
+    "subs %w3, %w3, #16 \n"
     "urhadd v0.16b, v0.16b, v1.16b \n"
     "urhadd v0.16b, v0.16b, v1.16b \n"
     MEMACCESS(0)
@@ -603,7 +733,7 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
     "100: \n"
     MEMACCESS(1)
     "ld1 {v0.16b}, [%1], #16 \n"
-    "subs %3, %3, #16 \n"
+    "subs %w3, %w3, #16 \n"
     MEMACCESS(0)
     "st1 {v0.16b}, [%0], #16 \n"
     "b.gt 100b \n"
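The new ScaleFilterCols_NEON above steps a 16.16 fixed-point source position x by dx per output pixel: LOAD2_DATA8_LANE extracts the integer part with lsr #16 to fetch the two neighboring pixels, uzp1 keeps the 16-bit fractional parts, and mul/shrn #16/add perform the linear blend. A scalar sketch of that arithmetic; illustrative only, not upstream's own C fallback:

    #include <stdint.h>

    static void scale_filter_cols_sketch(uint8_t* dst_ptr,
                                         const uint8_t* src_ptr,
                                         int dst_width, int x, int dx) {
      for (int j = 0; j < dst_width; ++j) {
        int xi = x >> 16;      // integer index (LOAD2_DATA8_LANE's lsr #16)
        int a = src_ptr[xi];   // ld2 fetches a and b as adjacent lanes
        int b = src_ptr[xi + 1];
        int f = x & 0xffff;    // 16-bit fraction kept by the uzp1
        dst_ptr[j] = (uint8_t)(a + (((b - a) * f) >> 16));  // mul, shrn #16, add
        x += dx;
      }
    }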
     MEMACCESS (1)
     "st1 {v1.16b}, [%1], #16 \n" // store odd pixels
     MEMACCESS (1)
@@ -645,6 +775,33 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
   );
 }
 
+void ScaleARGBRowDown2Linear_NEON(const uint8* src_argb, ptrdiff_t src_stride,
+                                  uint8* dst_argb, int dst_width) {
+  asm volatile (
+    "1: \n"
+    MEMACCESS (0)
+    // load 8 ARGB pixels.
+    "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n"
+    "subs %w2, %w2, #8 \n" // 8 processed per loop.
+    "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+    "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+    "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+    "uaddlp v3.8h, v3.16b \n" // A 16 bytes -> 8 shorts.
+    "rshrn v0.8b, v0.8h, #1 \n" // downshift, round and pack
+    "rshrn v1.8b, v1.8h, #1 \n"
+    "rshrn v2.8b, v2.8h, #1 \n"
+    "rshrn v3.8b, v3.8h, #1 \n"
+    MEMACCESS (1)
+    "st4 {v0.8b,v1.8b,v2.8b,v3.8b}, [%1], #32 \n"
+    "b.gt 1b \n"
+  : "+r"(src_argb), // %0
+    "+r"(dst_argb), // %1
+    "+r"(dst_width) // %2
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3" // Clobber List
+  );
+}
+
 void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
                                uint8* dst, int dst_width) {
   asm volatile (
@@ -653,7 +810,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
     "1: \n"
     MEMACCESS (0)
     "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 8 ARGB pixels.
-    "subs %3, %3, #8 \n" // 8 processed per loop.
+    "subs %w3, %w3, #8 \n" // 8 processed per loop.
     "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
     "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
     "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
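ScaleARGBRowDown2Linear_NEON, added above, is the ARGB counterpart of the luma kernel: ld4 de-interleaves B, G, R and A into separate registers so each channel is pair-averaged independently. A scalar sketch; illustrative only:

    #include <stddef.h>
    #include <stdint.h>

    // Average each pair of adjacent ARGB pixels per channel with rounding
    // (uaddlp + rshrn #1). src_stride is unused, as in the NEON kernel.
    static void scale_argb_row_down2_linear_sketch(const uint8_t* src_argb,
                                                   ptrdiff_t src_stride,
                                                   uint8_t* dst_argb,
                                                   int dst_width) {
      (void)src_stride;
      for (int x = 0; x < dst_width; ++x) {
        for (int c = 0; c < 4; ++c) {  // B, G, R, A lanes from ld4
          dst_argb[4 * x + c] =
              (uint8_t)((src_argb[8 * x + c] + src_argb[8 * x + 4 + c] + 1) >> 1);
        }
      }
    }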
@@ -694,21 +851,21 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
     "ld1 {v0.s}[2], [%0], %3 \n"
     MEMACCESS(0)
     "ld1 {v0.s}[3], [%0], %3 \n"
-    "subs %2, %2, #4 \n" // 4 pixels per loop.
+    "subs %w2, %w2, #4 \n" // 4 pixels per loop.
     MEMACCESS(1)
     "st1 {v0.16b}, [%1], #16 \n"
     "b.gt 1b \n"
   : "+r"(src_argb), // %0
     "+r"(dst_argb), // %1
     "+r"(dst_width) // %2
-  : "r"(static_cast<ptrdiff_t>(src_stepx * 4)) // %3
+  : "r"((int64)(src_stepx * 4)) // %3
   : "memory", "cc", "v0"
   );
 }
 
 // Reads 4 pixels at a time.
 // Alignment requirement: src_argb 4 byte aligned.
-// TODO, might be worth another optimization pass in future.
+// TODO(Yang Zhang): Might be worth another optimization pass in future.
 // It could be upgraded to 8 pixels at a time to start with.
 void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
                                   int src_stepx,
@@ -717,36 +874,36 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
     "add %1, %1, %0 \n"
     "1: \n"
     MEMACCESS(0)
-    "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
+    "ld1 {v0.8b}, [%0], %4 \n" // Read 4 2x2 blocks -> 2x1
     MEMACCESS(1)
-    "ld1 {v1.8b}, [%1], %4 \n"
+    "ld1 {v1.8b}, [%1], %4 \n"
     MEMACCESS(0)
-    "ld1 {v2.8b}, [%0], %4 \n"
+    "ld1 {v2.8b}, [%0], %4 \n"
     MEMACCESS(1)
-    "ld1 {v3.8b}, [%1], %4 \n"
+    "ld1 {v3.8b}, [%1], %4 \n"
     MEMACCESS(0)
-    "ld1 {v4.8b}, [%0], %4 \n"
+    "ld1 {v4.8b}, [%0], %4 \n"
     MEMACCESS(1)
-    "ld1 {v5.8b}, [%1], %4 \n"
+    "ld1 {v5.8b}, [%1], %4 \n"
     MEMACCESS(0)
-    "ld1 {v6.8b}, [%0], %4 \n"
+    "ld1 {v6.8b}, [%0], %4 \n"
     MEMACCESS(1)
-    "ld1 {v7.8b}, [%1], %4 \n"
-    "uaddl v0.8h, v0.8b, v1.8b \n"
-    "uaddl v2.8h, v2.8b, v3.8b \n"
-    "uaddl v4.8h, v4.8b, v5.8b \n"
-    "uaddl v6.8h, v6.8b, v7.8b \n"
-    "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
-    "mov v0.d[1], v2.d[0] \n"
-    "mov v2.d[0], v16.d[1] \n"
-    "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
-    "mov v4.d[1], v6.d[0] \n"
-    "mov v6.d[0], v16.d[1] \n"
-    "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
-    "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
-    "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
-    "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
-    "subs %3, %3, #4 \n" // 4 pixels per loop.
+    "ld1 {v7.8b}, [%1], %4 \n"
+    "uaddl v0.8h, v0.8b, v1.8b \n"
+    "uaddl v2.8h, v2.8b, v3.8b \n"
+    "uaddl v4.8h, v4.8b, v5.8b \n"
+    "uaddl v6.8h, v6.8b, v7.8b \n"
+    "mov v16.d[1], v0.d[1] \n" // ab_cd -> ac_bd
+    "mov v0.d[1], v2.d[0] \n"
+    "mov v2.d[0], v16.d[1] \n"
+    "mov v16.d[1], v4.d[1] \n" // ef_gh -> eg_fh
+    "mov v4.d[1], v6.d[0] \n"
+    "mov v6.d[0], v16.d[1] \n"
+    "add v0.8h, v0.8h, v2.8h \n" // (a+b)_(c+d)
+    "add v4.8h, v4.8h, v6.8h \n" // (e+f)_(g+h)
+    "rshrn v0.8b, v0.8h, #2 \n" // first 2 pixels.
+    "rshrn2 v0.16b, v4.8h, #2 \n" // next 2 pixels.
+    "subs %w3, %w3, #4 \n" // 4 pixels per loop.
     MEMACCESS(2)
     "st1 {v0.16b}, [%2], #16 \n"
     "b.gt 1b \n"
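Both ScaleARGBRowDownEven kernels now pass src_stepx * 4 as int64 rather than via static_cast<ptrdiff_t>: the value is consumed as a post-index register in "ld1 ..., %4", which must bind to a full 64-bit x register. A scalar sketch of the box variant's 2x2 rounded average; illustrative only:

    #include <stddef.h>
    #include <stdint.h>

    // Each output averages a 2x2 block of ARGB pixels, stepping src_stepx
    // source pixels per output (rshrn #2 is a rounded divide by 4).
    static void scale_argb_even_box_sketch(const uint8_t* src_argb,
                                           ptrdiff_t src_stride, int src_stepx,
                                           uint8_t* dst_argb, int dst_width) {
      const uint8_t* row0 = src_argb;
      const uint8_t* row1 = src_argb + src_stride;
      for (int x = 0; x < dst_width; ++x) {
        for (int c = 0; c < 4; ++c) {
          dst_argb[4 * x + c] = (uint8_t)((row0[c] + row0[4 + c] +
                                           row1[c] + row1[4 + c] + 2) >> 2);
        }
        row0 += src_stepx * 4;
        row1 += src_stepx * 4;
      }
    }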
@@ -754,10 +911,129 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
     "+r"(src_stride), // %1
     "+r"(dst_argb), // %2
     "+r"(dst_width) // %3
-  : "r"(src_stepx * 4) // %4
+  : "r"((int64)(src_stepx * 4)) // %4
   : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
   );
 }
 
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD1_DATA32_LANE(vn, n) \
+    "lsr %5, %3, #16 \n" \
+    "add %6, %1, %5, lsl #2 \n" \
+    "add %3, %3, %4 \n" \
+    MEMACCESS(6) \
+    "ld1 {"#vn".s}["#n"], [%6] \n"
+
+void ScaleARGBCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                        int dst_width, int x, int dx) {
+  const uint8* src_tmp = src_argb;
+  int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
+  int64 x64 = (int64) x;
+  int64 dx64 = (int64) dx;
+  int64 tmp64 = 0;
+  asm volatile (
+    "1: \n"
+    LOAD1_DATA32_LANE(v0, 0)
+    LOAD1_DATA32_LANE(v0, 1)
+    LOAD1_DATA32_LANE(v0, 2)
+    LOAD1_DATA32_LANE(v0, 3)
+    LOAD1_DATA32_LANE(v1, 0)
+    LOAD1_DATA32_LANE(v1, 1)
+    LOAD1_DATA32_LANE(v1, 2)
+    LOAD1_DATA32_LANE(v1, 3)
+
+    MEMACCESS(0)
+    "st1 {v0.4s, v1.4s}, [%0], #32 \n" // store pixels
+    "subs %w2, %w2, #8 \n" // 8 processed per loop
+    "b.gt 1b \n"
+  : "+r"(dst_argb), // %0
+    "+r"(src_argb), // %1
+    "+r"(dst_width64), // %2
+    "+r"(x64), // %3
+    "+r"(dx64), // %4
+    "+r"(tmp64), // %5
+    "+r"(src_tmp) // %6
+  :
+  : "memory", "cc", "v0", "v1"
+  );
+}
+
+#undef LOAD1_DATA32_LANE
+
+// TODO(Yang Zhang): Investigate less load instructions for
+// the x/dx stepping
+#define LOAD2_DATA32_LANE(vn1, vn2, n) \
+    "lsr %5, %3, #16 \n" \
+    "add %6, %1, %5, lsl #2 \n" \
+    "add %3, %3, %4 \n" \
+    MEMACCESS(6) \
+    "ld2 {"#vn1".s, "#vn2".s}["#n"], [%6] \n"
+
+void ScaleARGBFilterCols_NEON(uint8* dst_argb, const uint8* src_argb,
+                              int dst_width, int x, int dx) {
+  int dx_offset[4] = {0, 1, 2, 3};
+  int* tmp = dx_offset;
+  const uint8* src_tmp = src_argb;
+  int64 dst_width64 = (int64) dst_width; // Work around ios 64 bit warning.
+  int64 x64 = (int64) x;
+  int64 dx64 = (int64) dx;
+  asm volatile (
+    "dup v0.4s, %w3 \n" // x
+    "dup v1.4s, %w4 \n" // dx
+    "ld1 {v2.4s}, [%5] \n" // 0 1 2 3
+    "shl v6.4s, v1.4s, #2 \n" // 4 * dx
+    "mul v1.4s, v1.4s, v2.4s \n"
+    "movi v3.16b, #0x7f \n" // 0x7F
+    "movi v4.8h, #0x7f \n" // 0x7F
+    // x , x + 1 * dx, x + 2 * dx, x + 3 * dx
+    "add v5.4s, v1.4s, v0.4s \n"
+    "1: \n"
+    // d0, d1: a
+    // d2, d3: b
+    LOAD2_DATA32_LANE(v0, v1, 0)
+    LOAD2_DATA32_LANE(v0, v1, 1)
+    LOAD2_DATA32_LANE(v0, v1, 2)
+    LOAD2_DATA32_LANE(v0, v1, 3)
+    "shrn v2.4h, v5.4s, #9 \n"
+    "and v2.8b, v2.8b, v4.8b \n"
+    "dup v16.8b, v2.b[0] \n"
+    "dup v17.8b, v2.b[2] \n"
+    "dup v18.8b, v2.b[4] \n"
+    "dup v19.8b, v2.b[6] \n"
+    "ext v2.8b, v16.8b, v17.8b, #4 \n"
+    "ext v17.8b, v18.8b, v19.8b, #4 \n"
+    "ins v2.d[1], v17.d[0] \n" // f
+    "eor v7.16b, v2.16b, v3.16b \n" // 0x7f ^ f
+    "umull v16.8h, v0.8b, v7.8b \n"
+    "umull2 v17.8h, v0.16b, v7.16b \n"
+    "umull v18.8h, v1.8b, v2.8b \n"
+    "umull2 v19.8h, v1.16b, v2.16b \n"
+    "add v16.8h, v16.8h, v18.8h \n"
+    "add v17.8h, v17.8h, v19.8h \n"
+    "shrn v0.8b, v16.8h, #7 \n"
+    "shrn2 v0.16b, v17.8h, #7 \n"
+
+    MEMACCESS(0)
+    "st1 {v0.4s}, [%0], #16 \n" // store pixels
+    "add v5.4s, v5.4s, v6.4s \n"
+    "subs %w2, %w2, #4 \n" // 4 processed per loop
+    "b.gt 1b \n"
+  : "+r"(dst_argb), // %0
+    "+r"(src_argb), // %1
+    "+r"(dst_width64), // %2
+    "+r"(x64), // %3
+    "+r"(dx64), // %4
+    "+r"(tmp), // %5
+    "+r"(src_tmp) // %6
+  :
+  : "memory", "cc", "v0", "v1", "v2", "v3", "v4", "v5",
+    "v6", "v7", "v16", "v17", "v18", "v19"
+  );
+}
+
+#undef LOAD2_DATA32_LANE
+
 #endif  // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 #ifdef __cplusplus
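ScaleARGBCols_NEON and ScaleARGBFilterCols_NEON close out the new kernels with the same 16.16 fixed-point stepping, gathered four lanes at a time via LOAD1_DATA32_LANE and LOAD2_DATA32_LANE; the int64 locals exist to silence an iOS 64-bit operand warning, per the in-source comments. A scalar sketch of the nearest-neighbor variant; illustrative only:

    #include <stdint.h>
    #include <string.h>

    // Nearest-neighbor ARGB column scaling. LOAD1_DATA32_LANE does the same
    // address math in asm: lsr #16 for the pixel index, lsl #2 for bytes.
    static void scale_argb_cols_sketch(uint8_t* dst_argb,
                                       const uint8_t* src_argb,
                                       int dst_width, int x, int dx) {
      for (int j = 0; j < dst_width; ++j) {
        memcpy(dst_argb + 4 * j, src_argb + 4 * (x >> 16), 4);  // one ARGB pixel
        x += dx;
      }
    }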