diff options
author | James Bankoski <jimbankoski@google.com> | 2016-07-01 19:08:04 +0000 |
---|---|---|
committer | James Bankoski <jimbankoski@google.com> | 2016-07-01 19:14:28 +0000 |
commit | c5372cf077394856eb1aa10e72bcc8e25bb9b3ec (patch) | |
tree | 7cfaf2e17b6c1374a3c3d10026dd74c1ed9992c0 /third_party/libyuv/source/row_neon64.cc | |
parent | aa81375d73ee33d382e7f717c519db6159e497ee (diff) | |
download | libvpx-c5372cf077394856eb1aa10e72bcc8e25bb9b3ec.tar libvpx-c5372cf077394856eb1aa10e72bcc8e25bb9b3ec.tar.gz libvpx-c5372cf077394856eb1aa10e72bcc8e25bb9b3ec.tar.bz2 libvpx-c5372cf077394856eb1aa10e72bcc8e25bb9b3ec.zip |
Revert "libyuv: update to 2f101fdb"
Compile failures on linux platform.
BUG=webm:1253
This reverts commit aa81375d73ee33d382e7f717c519db6159e497ee.
Change-Id: Ibab2c4827bc21518dc03c6e9716b5015cff56fc7
Diffstat (limited to 'third_party/libyuv/source/row_neon64.cc')
-rw-r--r-- | third_party/libyuv/source/row_neon64.cc | 844 |
1 files changed, 561 insertions, 283 deletions
diff --git a/third_party/libyuv/source/row_neon64.cc b/third_party/libyuv/source/row_neon64.cc index 6375d4f55..5d015454b 100644 --- a/third_party/libyuv/source/row_neon64.cc +++ b/third_party/libyuv/source/row_neon64.cc @@ -91,15 +91,17 @@ extern "C" { "uzp2 v3.8b, v2.8b, v2.8b \n" \ "ins v1.s[1], v3.s[0] \n" -#define YUVTORGB_SETUP \ +#define YUV422TORGB_SETUP_REG \ "ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \ "ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \ "ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \ "ld1r {v31.4s}, [%[kYToRgb]] \n" \ - "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \ - "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n" + "movi v27.8h, #128 \n" \ + "movi v28.8h, #102 \n" \ + "movi v29.8h, #25 \n" \ + "movi v30.8h, #52 \n" -#define YUVTORGB(vR, vG, vB) \ +#define YUV422TORGB(vR, vG, vB) \ "uxtl v0.8h, v0.8b \n" /* Extract Y */ \ "shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \ "ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \ @@ -127,19 +129,57 @@ extern "C" { "sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \ "sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \ +// YUV to RGB conversion constants. +// Y contribution to R,G,B. Scale and bias. +#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */ +#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */ + +// U and V contributions to R,G,B. +#define UB -128 /* -min(128, round(2.018 * 64)) */ +#define UG 25 /* -round(-0.391 * 64) */ +#define VG 52 /* -round(-0.813 * 64) */ +#define VR -102 /* -round(1.596 * 64) */ + +// Bias values to subtract 16 from Y and 128 from U and V. +#define BB (UB * 128 - YGB) +#define BG (UG * 128 + VG * 128 - YGB) +#define BR (VR * 128 - YGB) + +static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 }; +static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 }; + +#undef YG +#undef YGB +#undef UB +#undef UG +#undef VG +#undef VR +#undef BB +#undef BG +#undef BR + +#define RGBTOUV_SETUP_REG \ + "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ + "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ + "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ + "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ + "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ + "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ + + +#ifdef HAS_I444TOARGBROW_NEON void I444ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, - const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" /* A */ + YUV422TORGB_SETUP_REG "1: \n" READYUV444 - YUVTORGB(v22, v21, v20) - "subs %w4, %w4, #8 \n" + YUV422TORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" + "movi v23.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" @@ -148,28 +188,27 @@ void I444ToARGBRow_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_argb), // %3 "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } +#endif // HAS_I444TOARGBROW_NEON +#ifdef HAS_I422TOARGBROW_NEON void I422ToARGBRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb, - const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" /* A */ + YUV422TORGB_SETUP_REG "1: \n" READYUV422 - YUVTORGB(v22, v21, v20) + YUV422TORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" + "movi v23.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" @@ -178,91 +217,114 @@ void I422ToARGBRow_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_argb), // %3 "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } +#endif // HAS_I422TOARGBROW_NEON -void I422AlphaToARGBRow_NEON(const uint8* src_y, - const uint8* src_u, - const uint8* src_v, - const uint8* src_a, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, - int width) { +#ifdef HAS_I411TOARGBROW_NEON +void I411ToARGBRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_argb, + int width) { asm volatile ( - YUVTORGB_SETUP + YUV422TORGB_SETUP_REG + "1: \n" + READYUV411 + YUV422TORGB(v22, v21, v20) + "subs %w4, %w4, #8 \n" + "movi v23.8b, #255 \n" /* A */ + MEMACCESS(3) + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_argb), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I411TOARGBROW_NEON + +#ifdef HAS_I422TOBGRAROW_NEON +void I422ToBGRARow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_bgra, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG "1: \n" READYUV422 - YUVTORGB(v22, v21, v20) + YUV422TORGB(v21, v22, v23) + "subs %w4, %w4, #8 \n" + "movi v20.8b, #255 \n" /* A */ MEMACCESS(3) - "ld1 {v23.8b}, [%3], #8 \n" - "subs %w5, %w5, #8 \n" - MEMACCESS(4) - "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n" + "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 - "+r"(src_a), // %3 - "+r"(dst_argb), // %4 - "+r"(width) // %5 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) + "+r"(dst_bgra), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } +#endif // HAS_I422TOBGRAROW_NEON -void I411ToARGBRow_NEON(const uint8* src_y, +#ifdef HAS_I422TOABGRROW_NEON +void I422ToABGRRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, - uint8* dst_argb, - const struct YuvConstants* yuvconstants, + uint8* dst_abgr, int width) { asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" /* A */ + YUV422TORGB_SETUP_REG "1: \n" - READYUV411 - YUVTORGB(v22, v21, v20) + READYUV422 + YUV422TORGB(v20, v21, v22) "subs %w4, %w4, #8 \n" + "movi v23.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(src_u), // %1 "+r"(src_v), // %2 - "+r"(dst_argb), // %3 + "+r"(dst_abgr), // %3 "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } +#endif // HAS_I422TOABGRROW_NEON +#ifdef HAS_I422TORGBAROW_NEON void I422ToRGBARow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgba, - const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUVTORGB_SETUP - "movi v20.8b, #255 \n" /* A */ + YUV422TORGB_SETUP_REG "1: \n" READYUV422 - YUVTORGB(v23, v22, v21) + YUV422TORGB(v23, v22, v21) "subs %w4, %w4, #8 \n" + "movi v20.8b, #255 \n" /* A */ MEMACCESS(3) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n" "b.gt 1b \n" @@ -271,26 +333,25 @@ void I422ToRGBARow_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_rgba), // %3 "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } +#endif // HAS_I422TORGBAROW_NEON +#ifdef HAS_I422TORGB24ROW_NEON void I422ToRGB24Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgb24, - const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUVTORGB_SETUP + YUV422TORGB_SETUP_REG "1: \n" READYUV422 - YUVTORGB(v22, v21, v20) + YUV422TORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" MEMACCESS(3) "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" @@ -300,33 +361,60 @@ void I422ToRGB24Row_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_rgb24), // %3 "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } +#endif // HAS_I422TORGB24ROW_NEON + +#ifdef HAS_I422TORAWROW_NEON +void I422ToRAWRow_NEON(const uint8* src_y, + const uint8* src_u, + const uint8* src_v, + uint8* dst_raw, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READYUV422 + YUV422TORGB(v20, v21, v22) + "subs %w4, %w4, #8 \n" + MEMACCESS(3) + "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n" + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_u), // %1 + "+r"(src_v), // %2 + "+r"(dst_raw), // %3 + "+r"(width) // %4 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_I422TORAWROW_NEON #define ARGBTORGB565 \ "shll v0.8h, v22.8b, #8 \n" /* R */ \ - "shll v21.8h, v21.8b, #8 \n" /* G */ \ "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ "sri v0.8h, v21.8h, #5 \n" /* RG */ \ "sri v0.8h, v20.8h, #11 \n" /* RGB */ +#ifdef HAS_I422TORGB565ROW_NEON void I422ToRGB565Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_rgb565, - const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUVTORGB_SETUP + YUV422TORGB_SETUP_REG "1: \n" READYUV422 - YUVTORGB(v22, v21, v20) + YUV422TORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" ARGBTORGB565 MEMACCESS(3) @@ -337,37 +425,36 @@ void I422ToRGB565Row_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_rgb565), // %3 "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } +#endif // HAS_I422TORGB565ROW_NEON #define ARGBTOARGB1555 \ "shll v0.8h, v23.8b, #8 \n" /* A */ \ "shll v22.8h, v22.8b, #8 \n" /* R */ \ - "shll v21.8h, v21.8b, #8 \n" /* G */ \ "shll v20.8h, v20.8b, #8 \n" /* B */ \ + "shll v21.8h, v21.8b, #8 \n" /* G */ \ "sri v0.8h, v22.8h, #1 \n" /* AR */ \ "sri v0.8h, v21.8h, #6 \n" /* ARG */ \ "sri v0.8h, v20.8h, #11 \n" /* ARGB */ +#ifdef HAS_I422TOARGB1555ROW_NEON void I422ToARGB1555Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb1555, - const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" + YUV422TORGB_SETUP_REG "1: \n" READYUV422 - YUVTORGB(v22, v21, v20) + YUV422TORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" + "movi v23.8b, #255 \n" ARGBTOARGB1555 MEMACCESS(3) "st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565. @@ -377,14 +464,13 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_argb1555), // %3 "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } +#endif // HAS_I422TOARGB1555ROW_NEON #define ARGBTOARGB4444 \ /* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \ @@ -396,18 +482,18 @@ void I422ToARGB1555Row_NEON(const uint8* src_y, "orr v1.8b, v22.8b, v23.8b \n" /* RA */ \ "zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */ +#ifdef HAS_I422TOARGB4444ROW_NEON void I422ToARGB4444Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, uint8* dst_argb4444, - const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUVTORGB_SETUP + YUV422TORGB_SETUP_REG "movi v4.16b, #0x0f \n" // bits to clear with vbic. "1: \n" READYUV422 - YUVTORGB(v22, v21, v20) + YUV422TORGB(v22, v21, v20) "subs %w4, %w4, #8 \n" "movi v23.8b, #255 \n" ARGBTOARGB4444 @@ -419,40 +505,41 @@ void I422ToARGB4444Row_NEON(const uint8* src_y, "+r"(src_v), // %2 "+r"(dst_argb4444), // %3 "+r"(width) // %4 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } +#endif // HAS_I422TOARGB4444ROW_NEON +#ifdef HAS_I400TOARGBROW_NEON void I400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { + int64 width64 = (int64)(width); asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" + YUV422TORGB_SETUP_REG "1: \n" READYUV400 - YUVTORGB(v22, v21, v20) + YUV422TORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" + "movi v23.8b, #255 \n" MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB), - [kUVToG]"r"(&kYuvI601Constants.kUVToG), - [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR), - [kYToRgb]"r"(&kYuvI601Constants.kYToRgb) + "+r"(width64) // %2 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } +#endif // HAS_I400TOARGBROW_NEON +#ifdef HAS_J400TOARGBROW_NEON void J400ToARGBRow_NEON(const uint8* src_y, uint8* dst_argb, int width) { @@ -474,19 +561,20 @@ void J400ToARGBRow_NEON(const uint8* src_y, : "cc", "memory", "v20", "v21", "v22", "v23" ); } +#endif // HAS_J400TOARGBROW_NEON +#ifdef HAS_NV12TOARGBROW_NEON void NV12ToARGBRow_NEON(const uint8* src_y, const uint8* src_uv, uint8* dst_argb, - const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" + YUV422TORGB_SETUP_REG "1: \n" READNV12 - YUVTORGB(v22, v21, v20) + YUV422TORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" + "movi v23.8b, #255 \n" MEMACCESS(2) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "b.gt 1b \n" @@ -494,53 +582,78 @@ void NV12ToARGBRow_NEON(const uint8* src_y, "+r"(src_uv), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } +#endif // HAS_NV12TOARGBROW_NEON +#ifdef HAS_NV21TOARGBROW_NEON void NV21ToARGBRow_NEON(const uint8* src_y, - const uint8* src_vu, + const uint8* src_uv, uint8* dst_argb, - const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" + YUV422TORGB_SETUP_REG "1: \n" READNV21 - YUVTORGB(v22, v21, v20) + YUV422TORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" + "movi v23.8b, #255 \n" MEMACCESS(2) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n" "b.gt 1b \n" : "+r"(src_y), // %0 - "+r"(src_vu), // %1 + "+r"(src_uv), // %1 "+r"(dst_argb), // %2 "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } +#endif // HAS_NV21TOARGBROW_NEON +#ifdef HAS_NV12TORGB565ROW_NEON void NV12ToRGB565Row_NEON(const uint8* src_y, const uint8* src_uv, uint8* dst_rgb565, - const struct YuvConstants* yuvconstants, int width) { asm volatile ( - YUVTORGB_SETUP + YUV422TORGB_SETUP_REG "1: \n" READNV12 - YUVTORGB(v22, v21, v20) + YUV422TORGB(v22, v21, v20) + "subs %w3, %w3, #8 \n" + ARGBTORGB565 + MEMACCESS(2) + "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565. + "b.gt 1b \n" + : "+r"(src_y), // %0 + "+r"(src_uv), // %1 + "+r"(dst_rgb565), // %2 + "+r"(width) // %3 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", + "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" + ); +} +#endif // HAS_NV12TORGB565ROW_NEON + +#ifdef HAS_NV21TORGB565ROW_NEON +void NV21ToRGB565Row_NEON(const uint8* src_y, + const uint8* src_uv, + uint8* dst_rgb565, + int width) { + asm volatile ( + YUV422TORGB_SETUP_REG + "1: \n" + READNV21 + YUV422TORGB(v22, v21, v20) "subs %w3, %w3, #8 \n" ARGBTORGB565 MEMACCESS(2) @@ -550,68 +663,68 @@ void NV12ToRGB565Row_NEON(const uint8* src_y, "+r"(src_uv), // %1 "+r"(dst_rgb565), // %2 "+r"(width) // %3 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } +#endif // HAS_NV21TORGB565ROW_NEON +#ifdef HAS_YUY2TOARGBROW_NEON void YUY2ToARGBRow_NEON(const uint8* src_yuy2, uint8* dst_argb, - const struct YuvConstants* yuvconstants, int width) { + int64 width64 = (int64)(width); asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" + YUV422TORGB_SETUP_REG "1: \n" READYUY2 - YUVTORGB(v22, v21, v20) + YUV422TORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" + "movi v23.8b, #255 \n" MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n" "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) + "+r"(width64) // %2 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } +#endif // HAS_YUY2TOARGBROW_NEON +#ifdef HAS_UYVYTOARGBROW_NEON void UYVYToARGBRow_NEON(const uint8* src_uyvy, uint8* dst_argb, - const struct YuvConstants* yuvconstants, int width) { + int64 width64 = (int64)(width); asm volatile ( - YUVTORGB_SETUP - "movi v23.8b, #255 \n" + YUV422TORGB_SETUP_REG "1: \n" READUYVY - YUVTORGB(v22, v21, v20) + YUV422TORGB(v22, v21, v20) "subs %w2, %w2, #8 \n" + "movi v23.8b, #255 \n" MEMACCESS(1) "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n" "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_argb), // %1 - "+r"(width) // %2 - : [kUVToRB]"r"(&yuvconstants->kUVToRB), - [kUVToG]"r"(&yuvconstants->kUVToG), - [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR), - [kYToRgb]"r"(&yuvconstants->kYToRgb) + "+r"(width64) // %2 + : [kUVBiasBGR]"r"(&kUVBiasBGR), + [kYToRgb]"r"(&kYToRgb) : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30" ); } +#endif // HAS_UYVYTOARGBROW_NEON // Reads 16 pairs of UV and write even values to dst_u and odd to dst_v. +#ifdef HAS_SPLITUVROW_NEON void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { asm volatile ( @@ -632,8 +745,10 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, : "cc", "memory", "v0", "v1" // Clobber List ); } +#endif // HAS_SPLITUVROW_NEON // Reads 16 U's and V's and writes out 16 pairs of UV. +#ifdef HAS_MERGEUVROW_NEON void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, int width) { asm volatile ( @@ -655,8 +770,10 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv, : "cc", "memory", "v0", "v1" // Clobber List ); } +#endif // HAS_MERGEUVROW_NEON // Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15. +#ifdef HAS_COPYROW_NEON void CopyRow_NEON(const uint8* src, uint8* dst, int count) { asm volatile ( "1: \n" @@ -673,16 +790,17 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) { : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } +#endif // HAS_COPYROW_NEON // SetRow writes 'count' bytes using an 8 bit value repeated. void SetRow_NEON(uint8* dst, uint8 v8, int count) { asm volatile ( "dup v0.16b, %w2 \n" // duplicate 16 bytes "1: \n" - "subs %w1, %w1, #16 \n" // 16 bytes per loop + "subs %w1, %w1, #16 \n" // 16 bytes per loop MEMACCESS(0) "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" + "b.gt 1b \n" : "+r"(dst), // %0 "+r"(count) // %1 : "r"(v8) // %2 @@ -694,10 +812,10 @@ void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { asm volatile ( "dup v0.4s, %w2 \n" // duplicate 4 ints "1: \n" - "subs %w1, %w1, #4 \n" // 4 ints per loop + "subs %w1, %w1, #4 \n" // 4 ints per loop MEMACCESS(0) "st1 {v0.16b}, [%0], #16 \n" // store - "b.gt 1b \n" + "b.gt 1b \n" : "+r"(dst), // %0 "+r"(count) // %1 : "r"(v32) // %2 @@ -705,15 +823,18 @@ void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) { ); } +#ifdef HAS_MIRRORROW_NEON void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { + int64 width64 = (int64) width; asm volatile ( // Start at end of source row. - "add %0, %0, %w2, sxtw \n" + "add %0, %0, %2 \n" "sub %0, %0, #16 \n" + "1: \n" MEMACCESS(0) "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 - "subs %w2, %w2, #16 \n" // 16 pixels per loop. + "subs %2, %2, #16 \n" // 16 pixels per loop. "rev64 v0.16b, v0.16b \n" MEMACCESS(1) "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 @@ -722,22 +843,26 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) { "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 - "+r"(width) // %2 + "+r"(width64) // %2 : "r"((ptrdiff_t)-16) // %3 : "cc", "memory", "v0" ); } +#endif // HAS_MIRRORROW_NEON +#ifdef HAS_MIRRORUVROW_NEON void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int width) { + int64 width64 = (int64) width; asm volatile ( // Start at end of source row. - "add %0, %0, %w3, sxtw #1 \n" + "add %0, %0, %3, lsl #1 \n" "sub %0, %0, #16 \n" + "1: \n" MEMACCESS(0) "ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16 - "subs %w3, %w3, #8 \n" // 8 pixels per loop. + "subs %3, %3, #8 \n" // 8 pixels per loop. "rev64 v0.8b, v0.8b \n" "rev64 v1.8b, v1.8b \n" MEMACCESS(1) @@ -748,21 +873,25 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v, : "+r"(src_uv), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(width) // %3 + "+r"(width64) // %3 : "r"((ptrdiff_t)-16) // %4 : "cc", "memory", "v0", "v1" ); } +#endif // HAS_MIRRORUVROW_NEON +#ifdef HAS_ARGBMIRRORROW_NEON void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { + int64 width64 = (int64) width; asm volatile ( - // Start at end of source row. - "add %0, %0, %w2, sxtw #2 \n" + // Start at end of source row. + "add %0, %0, %2, lsl #2 \n" "sub %0, %0, #16 \n" + "1: \n" MEMACCESS(0) "ld1 {v0.16b}, [%0], %3 \n" // src -= 16 - "subs %w2, %w2, #4 \n" // 4 pixels per loop. + "subs %2, %2, #4 \n" // 4 pixels per loop. "rev64 v0.4s, v0.4s \n" MEMACCESS(1) "st1 {v0.D}[1], [%1], #8 \n" // dst += 16 @@ -771,13 +900,15 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) { "b.gt 1b \n" : "+r"(src), // %0 "+r"(dst), // %1 - "+r"(width) // %2 + "+r"(width64) // %2 : "r"((ptrdiff_t)-16) // %3 : "cc", "memory", "v0" ); } +#endif // HAS_ARGBMIRRORROW_NEON -void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { +#ifdef HAS_RGB24TOARGBROW_NEON +void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) { asm volatile ( "movi v4.8b, #255 \n" // Alpha "1: \n" @@ -789,13 +920,15 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) { "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_argb), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List ); } +#endif // HAS_RGB24TOARGBROW_NEON -void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { +#ifdef HAS_RAWTOARGBROW_NEON +void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) { asm volatile ( "movi v5.8b, #255 \n" // Alpha "1: \n" @@ -809,30 +942,12 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) { "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_argb), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List ); } - -void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b - "subs %w2, %w2, #8 \n" // 8 processed per loop. - "orr v3.8b, v1.8b, v1.8b \n" // move g - "orr v4.8b, v0.8b, v0.8b \n" // move r - MEMACCESS(1) - "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r - "b.gt 1b \n" - : "+r"(src_raw), // %0 - "+r"(dst_rgb24), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List - ); -} +#endif // HAS_RAWTOARGBROW_NEON #define RGB565TOARGB \ "shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \ @@ -847,7 +962,8 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) { "orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \ "dup v2.2D, v0.D[1] \n" /* R */ -void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { +#ifdef HAS_RGB565TOARGBROW_NEON +void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) { asm volatile ( "movi v3.8b, #255 \n" // Alpha "1: \n" @@ -860,11 +976,12 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_argb), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List ); } +#endif // HAS_RGB565TOARGBROW_NEON #define ARGB1555TOARGB \ "ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \ @@ -903,8 +1020,9 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) { "orr v2.16b, v1.16b, v3.16b \n" /* R */ \ "dup v1.2D, v0.D[1] \n" /* G */ \ +#ifdef HAS_ARGB1555TOARGBROW_NEON void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, - int width) { + int pix) { asm volatile ( "movi v3.8b, #255 \n" // Alpha "1: \n" @@ -917,11 +1035,12 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_argb), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } +#endif // HAS_ARGB1555TOARGBROW_NEON #define ARGB4444TOARGB \ "shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \ @@ -935,8 +1054,9 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb, "dup v0.2D, v2.D[1] \n" \ "dup v1.2D, v3.D[1] \n" +#ifdef HAS_ARGB4444TOARGBROW_NEON void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, - int width) { + int pix) { asm volatile ( "1: \n" MEMACCESS(0) @@ -948,13 +1068,15 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb, "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_argb), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List ); } +#endif // HAS_ARGB4444TOARGBROW_NEON -void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { +#ifdef HAS_ARGBTORGB24ROW_NEON +void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) { asm volatile ( "1: \n" MEMACCESS(0) @@ -965,13 +1087,15 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) { "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb24), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List ); } +#endif // HAS_ARGBTORGB24ROW_NEON -void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { +#ifdef HAS_ARGBTORAWROW_NEON +void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) { asm volatile ( "1: \n" MEMACCESS(0) @@ -984,13 +1108,15 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) { "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_raw), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List ); } +#endif // HAS_ARGBTORAWROW_NEON -void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { +#ifdef HAS_YUY2TOYROW_NEON +void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) { asm volatile ( "1: \n" MEMACCESS(0) @@ -1001,13 +1127,15 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) { "b.gt 1b \n" : "+r"(src_yuy2), // %0 "+r"(dst_y), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v0", "v1" // Clobber List ); } +#endif // HAS_YUY2TOYROW_NEON -void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { +#ifdef HAS_UYVYTOYROW_NEON +void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) { asm volatile ( "1: \n" MEMACCESS(0) @@ -1018,14 +1146,16 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) { "b.gt 1b \n" : "+r"(src_uyvy), // %0 "+r"(dst_y), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v0", "v1" // Clobber List ); } +#endif // HAS_UYVYTOYROW_NEON +#ifdef HAS_YUY2TOUV422ROW_NEON void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, - int width) { + int pix) { asm volatile ( "1: \n" MEMACCESS(0) @@ -1039,14 +1169,16 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v, : "+r"(src_yuy2), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(width) // %3 + "+r"(pix) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } +#endif // HAS_YUY2TOUV422ROW_NEON +#ifdef HAS_UYVYTOUV422ROW_NEON void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, - int width) { + int pix) { asm volatile ( "1: \n" MEMACCESS(0) @@ -1060,14 +1192,16 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v, : "+r"(src_uyvy), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(width) // %3 + "+r"(pix) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } +#endif // HAS_UYVYTOUV422ROW_NEON +#ifdef HAS_YUY2TOUVROW_NEON void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, - uint8* dst_u, uint8* dst_v, int width) { + uint8* dst_u, uint8* dst_v, int pix) { const uint8* src_yuy2b = src_yuy2 + stride_yuy2; asm volatile ( "1: \n" @@ -1087,15 +1221,17 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2, "+r"(src_yuy2b), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(width) // %4 + "+r"(pix) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List ); } +#endif // HAS_YUY2TOUVROW_NEON +#ifdef HAS_UYVYTOUVROW_NEON void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, - uint8* dst_u, uint8* dst_v, int width) { + uint8* dst_u, uint8* dst_v, int pix) { const uint8* src_uyvyb = src_uyvy + stride_uyvy; asm volatile ( "1: \n" @@ -1115,16 +1251,18 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy, "+r"(src_uyvyb), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(width) // %4 + "+r"(pix) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" // Clobber List ); } +#endif // HAS_UYVYTOUVROW_NEON // For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA. +#ifdef HAS_ARGBSHUFFLEROW_NEON void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, - const uint8* shuffler, int width) { + const uint8* shuffler, int pix) { asm volatile ( MEMACCESS(3) "ld1 {v2.16b}, [%3] \n" // shuffler @@ -1138,12 +1276,14 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb, "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : "r"(shuffler) // %3 : "cc", "memory", "v0", "v1", "v2" // Clobber List ); } +#endif // HAS_ARGBSHUFFLEROW_NEON +#ifdef HAS_I422TOYUY2ROW_NEON void I422ToYUY2Row_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1170,7 +1310,9 @@ void I422ToYUY2Row_NEON(const uint8* src_y, : "cc", "memory", "v0", "v1", "v2", "v3" ); } +#endif // HAS_I422TOYUY2ROW_NEON +#ifdef HAS_I422TOUYVYROW_NEON void I422ToUYVYRow_NEON(const uint8* src_y, const uint8* src_u, const uint8* src_v, @@ -1197,8 +1339,10 @@ void I422ToUYVYRow_NEON(const uint8* src_y, : "cc", "memory", "v0", "v1", "v2", "v3" ); } +#endif // HAS_I422TOUYVYROW_NEON -void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { +#ifdef HAS_ARGBTORGB565ROW_NEON +void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) { asm volatile ( "1: \n" MEMACCESS(0) @@ -1210,12 +1354,14 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) { "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_rgb565), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v0", "v20", "v21", "v22", "v23" ); } +#endif // HAS_ARGBTORGB565ROW_NEON +#ifdef HAS_ARGBTORGB565DITHERROW_NEON void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, const uint32 dither4, int width) { asm volatile ( @@ -1238,9 +1384,11 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb, : "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23" ); } +#endif // HAS_ARGBTORGB565ROW_NEON +#ifdef HAS_ARGBTOARGB1555ROW_NEON void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, - int width) { + int pix) { asm volatile ( "1: \n" MEMACCESS(0) @@ -1252,14 +1400,16 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555, "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb1555), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v0", "v20", "v21", "v22", "v23" ); } +#endif // HAS_ARGBTOARGB1555ROW_NEON +#ifdef HAS_ARGBTOARGB4444ROW_NEON void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, - int width) { + int pix) { asm volatile ( "movi v4.16b, #0x0f \n" // bits to clear with vbic. "1: \n" @@ -1272,13 +1422,15 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444, "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_argb4444), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23" ); } +#endif // HAS_ARGBTOARGB4444ROW_NEON -void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { +#ifdef HAS_ARGBTOYROW_NEON +void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( "movi v4.8b, #13 \n" // B * 0.1016 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient @@ -1298,30 +1450,15 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" ); } +#endif // HAS_ARGBTOYROW_NEON -void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) { - asm volatile ( - "1: \n" - MEMACCESS(0) - "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels - "subs %w2, %w2, #16 \n" // 16 processed per loop - MEMACCESS(1) - "st1 {v3.16b}, [%1], #16 \n" // store 16 A's. - "b.gt 1b \n" - : "+r"(src_argb), // %0 - "+r"(dst_a), // %1 - "+r"(width) // %2 - : - : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List - ); -} - -void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { +#ifdef HAS_ARGBTOYJROW_NEON +void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) { asm volatile ( "movi v4.8b, #15 \n" // B * 0.11400 coefficient "movi v5.8b, #75 \n" // G * 0.58700 coefficient @@ -1339,15 +1476,17 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) { "b.gt 1b \n" : "+r"(src_argb), // %0 "+r"(dst_y), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" ); } +#endif // HAS_ARGBTOYJROW_NEON // 8x1 pixels. +#ifdef HAS_ARGBTOUV444ROW_NEON void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width) { + int pix) { asm volatile ( "movi v24.8b, #112 \n" // UB / VR 0.875 coefficient "movi v25.8b, #74 \n" // UG -0.5781 coefficient @@ -1380,24 +1519,62 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(width) // %3 + "+r"(pix) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26", "v27", "v28", "v29" ); } +#endif // HAS_ARGBTOUV444ROW_NEON -#define RGBTOUV_SETUP_REG \ - "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \ - "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \ - "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \ - "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \ - "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \ - "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */ +// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_ARGBTOUV422ROW_NEON +void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, + int pix) { + asm volatile ( + RGBTOUV_SETUP_REG + "1: \n" + MEMACCESS(0) + "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels. -// 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32. + "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts. + "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts. + "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts. + + "subs %w3, %w3, #16 \n" // 16 processed per loop. + "mul v3.8h, v0.8h, v20.8h \n" // B + "mls v3.8h, v1.8h, v21.8h \n" // G + "mls v3.8h, v2.8h, v22.8h \n" // R + "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned + + "mul v4.8h, v2.8h, v20.8h \n" // R + "mls v4.8h, v1.8h, v24.8h \n" // G + "mls v4.8h, v0.8h, v23.8h \n" // B + "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned + + "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U + "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V + + MEMACCESS(1) + "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U. + MEMACCESS(2) + "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V. + "b.gt 1b \n" + : "+r"(src_argb), // %0 + "+r"(dst_u), // %1 + "+r"(dst_v), // %2 + "+r"(pix) // %3 + : + : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", + "v20", "v21", "v22", "v23", "v24", "v25" + ); +} +#endif // HAS_ARGBTOUV422ROW_NEON + +// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32. +#ifdef HAS_ARGBTOUV411ROW_NEON void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, - int width) { + int pix) { asm volatile ( RGBTOUV_SETUP_REG "1: \n" @@ -1439,14 +1616,15 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, : "+r"(src_argb), // %0 "+r"(dst_u), // %1 "+r"(dst_v), // %2 - "+r"(width) // %3 + "+r"(pix) // %3 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } +#endif // HAS_ARGBTOUV411ROW_NEON -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. #define RGBTOUV(QB, QG, QR) \ "mul v3.8h, " #QB ",v20.8h \n" /* B */ \ "mul v4.8h, " #QR ",v20.8h \n" /* R */ \ @@ -1462,8 +1640,9 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v, // TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr. // TODO(fbarchard): consider ptrdiff_t for all strides. +#ifdef HAS_ARGBTOUVROW_NEON void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { + uint8* dst_u, uint8* dst_v, int pix) { const uint8* src_argb_1 = src_argb + src_stride_argb; asm volatile ( RGBTOUV_SETUP_REG @@ -1495,16 +1674,18 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb, "+r"(src_argb_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(width) // %4 + "+r"(pix) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } +#endif // HAS_ARGBTOUVROW_NEON // TODO(fbarchard): Subsample match C code. +#ifdef HAS_ARGBTOUVJROW_NEON void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, - uint8* dst_u, uint8* dst_v, int width) { + uint8* dst_u, uint8* dst_v, int pix) { const uint8* src_argb_1 = src_argb + src_stride_argb; asm volatile ( "movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2 @@ -1540,15 +1721,17 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb, "+r"(src_argb_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(width) // %4 + "+r"(pix) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } +#endif // HAS_ARGBTOUVJROW_NEON +#ifdef HAS_BGRATOUVROW_NEON void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, - uint8* dst_u, uint8* dst_v, int width) { + uint8* dst_u, uint8* dst_v, int pix) { const uint8* src_bgra_1 = src_bgra + src_stride_bgra; asm volatile ( RGBTOUV_SETUP_REG @@ -1579,15 +1762,17 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra, "+r"(src_bgra_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(width) // %4 + "+r"(pix) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } +#endif // HAS_BGRATOUVROW_NEON +#ifdef HAS_ABGRTOUVROW_NEON void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, - uint8* dst_u, uint8* dst_v, int width) { + uint8* dst_u, uint8* dst_v, int pix) { const uint8* src_abgr_1 = src_abgr + src_stride_abgr; asm volatile ( RGBTOUV_SETUP_REG @@ -1618,15 +1803,17 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr, "+r"(src_abgr_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(width) // %4 + "+r"(pix) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } +#endif // HAS_ABGRTOUVROW_NEON +#ifdef HAS_RGBATOUVROW_NEON void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, - uint8* dst_u, uint8* dst_v, int width) { + uint8* dst_u, uint8* dst_v, int pix) { const uint8* src_rgba_1 = src_rgba + src_stride_rgba; asm volatile ( RGBTOUV_SETUP_REG @@ -1657,15 +1844,17 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba, "+r"(src_rgba_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(width) // %4 + "+r"(pix) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } +#endif // HAS_RGBATOUVROW_NEON +#ifdef HAS_RGB24TOUVROW_NEON void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, - uint8* dst_u, uint8* dst_v, int width) { + uint8* dst_u, uint8* dst_v, int pix) { const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24; asm volatile ( RGBTOUV_SETUP_REG @@ -1696,15 +1885,17 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24, "+r"(src_rgb24_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(width) // %4 + "+r"(pix) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } +#endif // HAS_RGB24TOUVROW_NEON +#ifdef HAS_RAWTOUVROW_NEON void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, - uint8* dst_u, uint8* dst_v, int width) { + uint8* dst_u, uint8* dst_v, int pix) { const uint8* src_raw_1 = src_raw + src_stride_raw; asm volatile ( RGBTOUV_SETUP_REG @@ -1735,16 +1926,18 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw, "+r"(src_raw_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(width) // %4 + "+r"(pix) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20", "v21", "v22", "v23", "v24", "v25" ); } +#endif // HAS_RAWTOUVROW_NEON -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_RGB565TOUVROW_NEON void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, - uint8* dst_u, uint8* dst_v, int width) { + uint8* dst_u, uint8* dst_v, int pix) { const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565; asm volatile ( "movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2 @@ -1808,17 +2001,19 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565, "+r"(src_rgb565_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(width) // %4 + "+r"(pix) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27" ); } +#endif // HAS_RGB565TOUVROW_NEON -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_ARGB1555TOUVROW_NEON void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, - uint8* dst_u, uint8* dst_v, int width) { + uint8* dst_u, uint8* dst_v, int pix) { const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555; asm volatile ( RGBTOUV_SETUP_REG @@ -1877,17 +2072,19 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555, "+r"(src_argb1555_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(width) // %4 + "+r"(pix) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28" ); } +#endif // HAS_ARGB1555TOUVROW_NEON -// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16. +// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16. +#ifdef HAS_ARGB4444TOUVROW_NEON void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, - uint8* dst_u, uint8* dst_v, int width) { + uint8* dst_u, uint8* dst_v, int pix) { const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444; asm volatile ( RGBTOUV_SETUP_REG @@ -1946,7 +2143,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, "+r"(src_argb4444_1), // %1 "+r"(dst_u), // %2 "+r"(dst_v), // %3 - "+r"(width) // %4 + "+r"(pix) // %4 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", @@ -1954,8 +2151,10 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444, ); } +#endif // HAS_ARGB4444TOUVROW_NEON -void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { +#ifdef HAS_RGB565TOYROW_NEON +void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) { asm volatile ( "movi v24.8b, #13 \n" // B * 0.1016 coefficient "movi v25.8b, #65 \n" // G * 0.5078 coefficient @@ -1976,14 +2175,16 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) { "b.gt 1b \n" : "+r"(src_rgb565), // %0 "+r"(dst_y), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6", "v24", "v25", "v26", "v27" ); } +#endif // HAS_RGB565TOYROW_NEON -void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { +#ifdef HAS_ARGB1555TOYROW_NEON +void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) { asm volatile ( "movi v4.8b, #13 \n" // B * 0.1016 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient @@ -2004,13 +2205,15 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) { "b.gt 1b \n" : "+r"(src_argb1555), // %0 "+r"(dst_y), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" ); } +#endif // HAS_ARGB1555TOYROW_NEON -void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { +#ifdef HAS_ARGB4444TOYROW_NEON +void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) { asm volatile ( "movi v24.8b, #13 \n" // B * 0.1016 coefficient "movi v25.8b, #65 \n" // G * 0.5078 coefficient @@ -2031,13 +2234,15 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) { "b.gt 1b \n" : "+r"(src_argb4444), // %0 "+r"(dst_y), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27" ); } +#endif // HAS_ARGB4444TOYROW_NEON -void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { +#ifdef HAS_BGRATOYROW_NEON +void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) { asm volatile ( "movi v4.8b, #33 \n" // R * 0.2578 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient @@ -2057,13 +2262,15 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) { "b.gt 1b \n" : "+r"(src_bgra), // %0 "+r"(dst_y), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } +#endif // HAS_BGRATOYROW_NEON -void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { +#ifdef HAS_ABGRTOYROW_NEON +void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) { asm volatile ( "movi v4.8b, #33 \n" // R * 0.2578 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient @@ -2083,13 +2290,15 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) { "b.gt 1b \n" : "+r"(src_abgr), // %0 "+r"(dst_y), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } +#endif // HAS_ABGRTOYROW_NEON -void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { +#ifdef HAS_RGBATOYROW_NEON +void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) { asm volatile ( "movi v4.8b, #13 \n" // B * 0.1016 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient @@ -2109,13 +2318,15 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) { "b.gt 1b \n" : "+r"(src_rgba), // %0 "+r"(dst_y), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } +#endif // HAS_RGBATOYROW_NEON -void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { +#ifdef HAS_RGB24TOYROW_NEON +void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) { asm volatile ( "movi v4.8b, #13 \n" // B * 0.1016 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient @@ -2135,13 +2346,15 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) { "b.gt 1b \n" : "+r"(src_rgb24), // %0 "+r"(dst_y), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } +#endif // HAS_RGB24TOYROW_NEON -void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { +#ifdef HAS_RAWTOYROW_NEON +void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) { asm volatile ( "movi v4.8b, #33 \n" // R * 0.2578 coefficient "movi v5.8b, #65 \n" // G * 0.5078 coefficient @@ -2161,13 +2374,15 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) { "b.gt 1b \n" : "+r"(src_raw), // %0 "+r"(dst_y), // %1 - "+r"(width) // %2 + "+r"(pix) // %2 : : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16" ); } +#endif // HAS_RAWTOYROW_NEON // Bilinear filter 16x2 -> 16x1 +#ifdef HAS_INTERPOLATEROW_NEON void InterpolateRow_NEON(uint8* dst_ptr, const uint8* src_ptr, ptrdiff_t src_stride, int dst_width, int source_y_fraction) { @@ -2177,8 +2392,12 @@ void InterpolateRow_NEON(uint8* dst_ptr, asm volatile ( "cmp %w4, #0 \n" "b.eq 100f \n" + "cmp %w4, #64 \n" + "b.eq 75f \n" "cmp %w4, #128 \n" "b.eq 50f \n" + "cmp %w4, #192 \n" + "b.eq 25f \n" "dup v5.16b, %w4 \n" "dup v4.16b, %w5 \n" @@ -2200,6 +2419,20 @@ void InterpolateRow_NEON(uint8* dst_ptr, "b.gt 1b \n" "b 99f \n" + // Blend 25 / 75. + "25: \n" + MEMACCESS(1) + "ld1 {v0.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v1.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 25b \n" + "b 99f \n" + // Blend 50 / 50. "50: \n" MEMACCESS(1) @@ -2213,6 +2446,20 @@ void InterpolateRow_NEON(uint8* dst_ptr, "b.gt 50b \n" "b 99f \n" + // Blend 75 / 25. + "75: \n" + MEMACCESS(1) + "ld1 {v1.16b}, [%1], #16 \n" + MEMACCESS(2) + "ld1 {v0.16b}, [%2], #16 \n" + "subs %w3, %w3, #16 \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + "urhadd v0.16b, v0.16b, v1.16b \n" + MEMACCESS(0) + "st1 {v0.16b}, [%0], #16 \n" + "b.gt 75b \n" + "b 99f \n" + // Blend 100 / 0 - Copy row unchanged. "100: \n" MEMACCESS(1) @@ -2233,8 +2480,10 @@ void InterpolateRow_NEON(uint8* dst_ptr, : "cc", "memory", "v0", "v1", "v3", "v4", "v5" ); } +#endif // HAS_INTERPOLATEROW_NEON // dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr +#ifdef HAS_ARGBBLENDROW_NEON void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( @@ -2303,8 +2552,10 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1, "v16", "v17", "v18" ); } +#endif // HAS_ARGBBLENDROW_NEON // Attenuate 8 pixels at a time. +#ifdef HAS_ARGBATTENUATEROW_NEON void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( // Attenuate 8 pixels. @@ -2328,9 +2579,11 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" ); } +#endif // HAS_ARGBATTENUATEROW_NEON // Quantize 8 ARGB pixels (32 bytes). // dst = (dst * scale >> 16) * interval_size + interval_offset; +#ifdef HAS_ARGBQUANTIZEROW_NEON void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, int interval_offset, int width) { asm volatile ( @@ -2370,10 +2623,12 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size, : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6" ); } +#endif // HAS_ARGBQUANTIZEROW_NEON // Shade 8 pixels at a time by specified value. // NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8. // Rounding in vqrdmulh does +1 to high if high bit of low s16 is set. +#ifdef HAS_ARGBSHADEROW_NEON void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, uint32 value) { asm volatile ( @@ -2408,10 +2663,12 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width, : "cc", "memory", "v0", "v4", "v5", "v6", "v7" ); } +#endif // HAS_ARGBSHADEROW_NEON // Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels // Similar to ARGBToYJ but stores ARGB. // C code is (15 * b + 75 * g + 38 * r + 64) >> 7; +#ifdef HAS_ARGBGRAYROW_NEON void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { asm volatile ( "movi v24.8b, #15 \n" // B * 0.11400 coefficient @@ -2437,12 +2694,14 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) { : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26" ); } +#endif // HAS_ARGBGRAYROW_NEON // Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels. // b = (r * 35 + g * 68 + b * 17) >> 7 // g = (r * 45 + g * 88 + b * 22) >> 7 // r = (r * 50 + g * 98 + b * 24) >> 7 +#ifdef HAS_ARGBSEPIAROW_NEON void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { asm volatile ( "movi v20.8b, #17 \n" // BB coefficient @@ -2480,10 +2739,12 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) { "v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30" ); } +#endif // HAS_ARGBSEPIAROW_NEON // Tranform 8 ARGB pixels (32 bytes) with color matrix. // TODO(fbarchard): Was same as Sepia except matrix is provided. This function // needs to saturate. Consider doing a non-saturating version. +#ifdef HAS_ARGBCOLORMATRIXROW_NEON void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, const int8* matrix_argb, int width) { asm volatile ( @@ -2543,9 +2804,11 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb, "v18", "v19", "v22", "v23", "v24", "v25" ); } +#endif // HAS_ARGBCOLORMATRIXROW_NEON // TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable. // Multiply 2 rows of ARGB pixels together, 8 pixels at a time. +#ifdef HAS_ARGBMULTIPLYROW_NEON void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( @@ -2576,8 +2839,10 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1, : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" ); } +#endif // HAS_ARGBMULTIPLYROW_NEON // Add 2 rows of ARGB pixels together, 8 pixels at a time. +#ifdef HAS_ARGBADDROW_NEON void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( @@ -2604,8 +2869,10 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1, : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" ); } +#endif // HAS_ARGBADDROW_NEON // Subtract 2 rows of ARGB pixels, 8 pixels at a time. +#ifdef HAS_ARGBSUBTRACTROW_NEON void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, uint8* dst_argb, int width) { asm volatile ( @@ -2632,12 +2899,14 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1, : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7" ); } +#endif // HAS_ARGBSUBTRACTROW_NEON // Adds Sobel X and Sobel Y and stores Sobel into ARGB. // A = 255 // R = Sobel // G = Sobel // B = Sobel +#ifdef HAS_SOBELROW_NEON void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) { asm volatile ( @@ -2663,8 +2932,10 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, : "cc", "memory", "v0", "v1", "v2", "v3" ); } +#endif // HAS_SOBELROW_NEON // Adds Sobel X and Sobel Y and stores Sobel into plane. +#ifdef HAS_SOBELTOPLANEROW_NEON void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_y, int width) { asm volatile ( @@ -2687,12 +2958,14 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, : "cc", "memory", "v0", "v1" ); } +#endif // HAS_SOBELTOPLANEROW_NEON // Mixes Sobel X, Sobel Y and Sobel into ARGB. // A = 255 // R = Sobel X // G = Sobel // B = Sobel Y +#ifdef HAS_SOBELXYROW_NEON void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, uint8* dst_argb, int width) { asm volatile ( @@ -2716,11 +2989,13 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely, : "cc", "memory", "v0", "v1", "v2", "v3" ); } +#endif // HAS_SOBELXYROW_NEON // SobelX as a matrix is // -1 0 1 // -2 0 2 // -1 0 1 +#ifdef HAS_SOBELXROW_NEON void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, const uint8* src_y2, uint8* dst_sobelx, int width) { asm volatile ( @@ -2759,11 +3034,13 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1, : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } +#endif // HAS_SOBELXROW_NEON // SobelY as a matrix is // -1 -2 -1 // 0 0 0 // 1 2 1 +#ifdef HAS_SOBELYROW_NEON void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, uint8* dst_sobely, int width) { asm volatile ( @@ -2801,6 +3078,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1, : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List ); } +#endif // HAS_SOBELYROW_NEON #endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__) #ifdef __cplusplus |