summaryrefslogtreecommitdiff
path: root/third_party/libyuv/source/row_neon64.cc
diff options
context:
space:
mode:
authorJames Bankoski <jimbankoski@google.com>2016-07-01 19:08:04 +0000
committerJames Bankoski <jimbankoski@google.com>2016-07-01 19:14:28 +0000
commitc5372cf077394856eb1aa10e72bcc8e25bb9b3ec (patch)
tree7cfaf2e17b6c1374a3c3d10026dd74c1ed9992c0 /third_party/libyuv/source/row_neon64.cc
parentaa81375d73ee33d382e7f717c519db6159e497ee (diff)
downloadlibvpx-c5372cf077394856eb1aa10e72bcc8e25bb9b3ec.tar
libvpx-c5372cf077394856eb1aa10e72bcc8e25bb9b3ec.tar.gz
libvpx-c5372cf077394856eb1aa10e72bcc8e25bb9b3ec.tar.bz2
libvpx-c5372cf077394856eb1aa10e72bcc8e25bb9b3ec.zip
Revert "libyuv: update to 2f101fdb"
Compile failures on linux platform. BUG=webm:1253 This reverts commit aa81375d73ee33d382e7f717c519db6159e497ee. Change-Id: Ibab2c4827bc21518dc03c6e9716b5015cff56fc7
Diffstat (limited to 'third_party/libyuv/source/row_neon64.cc')
-rw-r--r--third_party/libyuv/source/row_neon64.cc844
1 files changed, 561 insertions, 283 deletions
diff --git a/third_party/libyuv/source/row_neon64.cc b/third_party/libyuv/source/row_neon64.cc
index 6375d4f55..5d015454b 100644
--- a/third_party/libyuv/source/row_neon64.cc
+++ b/third_party/libyuv/source/row_neon64.cc
@@ -91,15 +91,17 @@ extern "C" {
"uzp2 v3.8b, v2.8b, v2.8b \n" \
"ins v1.s[1], v3.s[0] \n"
-#define YUVTORGB_SETUP \
+#define YUV422TORGB_SETUP_REG \
"ld1r {v24.8h}, [%[kUVBiasBGR]], #2 \n" \
"ld1r {v25.8h}, [%[kUVBiasBGR]], #2 \n" \
"ld1r {v26.8h}, [%[kUVBiasBGR]] \n" \
"ld1r {v31.4s}, [%[kYToRgb]] \n" \
- "ld2 {v27.8h, v28.8h}, [%[kUVToRB]] \n" \
- "ld2 {v29.8h, v30.8h}, [%[kUVToG]] \n"
+ "movi v27.8h, #128 \n" \
+ "movi v28.8h, #102 \n" \
+ "movi v29.8h, #25 \n" \
+ "movi v30.8h, #52 \n"
-#define YUVTORGB(vR, vG, vB) \
+#define YUV422TORGB(vR, vG, vB) \
"uxtl v0.8h, v0.8b \n" /* Extract Y */ \
"shll v2.8h, v1.8b, #8 \n" /* Replicate UV */ \
"ushll2 v3.4s, v0.8h, #0 \n" /* Y */ \
@@ -127,19 +129,57 @@ extern "C" {
"sqshrun " #vG ".8b, " #vG ".8h, #6 \n" /* G */ \
"sqshrun " #vR ".8b, " #vR ".8h, #6 \n" /* R */ \
+// YUV to RGB conversion constants.
+// Y contribution to R,G,B. Scale and bias.
+#define YG 18997 /* round(1.164 * 64 * 256 * 256 / 257) */
+#define YGB 1160 /* 1.164 * 64 * 16 - adjusted for even error distribution */
+
+// U and V contributions to R,G,B.
+#define UB -128 /* -min(128, round(2.018 * 64)) */
+#define UG 25 /* -round(-0.391 * 64) */
+#define VG 52 /* -round(-0.813 * 64) */
+#define VR -102 /* -round(1.596 * 64) */
+
+// Bias values to subtract 16 from Y and 128 from U and V.
+#define BB (UB * 128 - YGB)
+#define BG (UG * 128 + VG * 128 - YGB)
+#define BR (VR * 128 - YGB)
+
+static vec16 kUVBiasBGR = { BB, BG, BR, 0, 0, 0, 0, 0 };
+static vec32 kYToRgb = { 0x0101 * YG, 0, 0, 0 };
+
+#undef YG
+#undef YGB
+#undef UB
+#undef UG
+#undef VG
+#undef VR
+#undef BB
+#undef BG
+#undef BR
+
+#define RGBTOUV_SETUP_REG \
+ "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
+ "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
+ "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
+ "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
+ "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
+ "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
+
+
+#ifdef HAS_I444TOARGBROW_NEON
void I444ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV444
- YUVTORGB(v22, v21, v20)
- "subs %w4, %w4, #8 \n"
+ YUV422TORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
"b.gt 1b \n"
@@ -148,28 +188,27 @@ void I444ToARGBRow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_I444TOARGBROW_NEON
+#ifdef HAS_I422TOARGBROW_NEON
void I422ToARGBRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
"b.gt 1b \n"
@@ -178,91 +217,114 @@ void I422ToARGBRow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_argb), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_I422TOARGBROW_NEON
-void I422AlphaToARGBRow_NEON(const uint8* src_y,
- const uint8* src_u,
- const uint8* src_v,
- const uint8* src_a,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
- int width) {
+#ifdef HAS_I411TOARGBROW_NEON
+void I411ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width) {
asm volatile (
- YUVTORGB_SETUP
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READYUV411
+ YUV422TORGB(v22, v21, v20)
+ "subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n" /* A */
+ MEMACCESS(3)
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_I411TOARGBROW_NEON
+
+#ifdef HAS_I422TOBGRAROW_NEON
+void I422ToBGRARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v21, v22, v23)
+ "subs %w4, %w4, #8 \n"
+ "movi v20.8b, #255 \n" /* A */
MEMACCESS(3)
- "ld1 {v23.8b}, [%3], #8 \n"
- "subs %w5, %w5, #8 \n"
- MEMACCESS(4)
- "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%4], #32 \n"
+ "st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
- "+r"(src_a), // %3
- "+r"(dst_argb), // %4
- "+r"(width) // %5
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ "+r"(dst_bgra), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_I422TOBGRAROW_NEON
-void I411ToARGBRow_NEON(const uint8* src_y,
+#ifdef HAS_I422TOABGRROW_NEON
+void I422ToABGRRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
- uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
+ uint8* dst_abgr,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n" /* A */
+ YUV422TORGB_SETUP_REG
"1: \n"
- READYUV411
- YUVTORGB(v22, v21, v20)
+ READYUV422
+ YUV422TORGB(v20, v21, v22)
"subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n" /* A */
MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(src_u), // %1
"+r"(src_v), // %2
- "+r"(dst_argb), // %3
+ "+r"(dst_abgr), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_I422TOABGRROW_NEON
+#ifdef HAS_I422TORGBAROW_NEON
void I422ToRGBARow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgba,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "movi v20.8b, #255 \n" /* A */
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUVTORGB(v23, v22, v21)
+ YUV422TORGB(v23, v22, v21)
"subs %w4, %w4, #8 \n"
+ "movi v20.8b, #255 \n" /* A */
MEMACCESS(3)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%3], #32 \n"
"b.gt 1b \n"
@@ -271,26 +333,25 @@ void I422ToRGBARow_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_rgba), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_I422TORGBAROW_NEON
+#ifdef HAS_I422TORGB24ROW_NEON
void I422ToRGB24Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb24,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w4, %w4, #8 \n"
MEMACCESS(3)
"st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
@@ -300,33 +361,60 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_rgb24), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_I422TORGB24ROW_NEON
+
+#ifdef HAS_I422TORAWROW_NEON
+void I422ToRAWRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READYUV422
+ YUV422TORGB(v20, v21, v22)
+ "subs %w4, %w4, #8 \n"
+ MEMACCESS(3)
+ "st3 {v20.8b,v21.8b,v22.8b}, [%3], #24 \n"
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_raw), // %3
+ "+r"(width) // %4
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_I422TORAWROW_NEON
#define ARGBTORGB565 \
"shll v0.8h, v22.8b, #8 \n" /* R */ \
- "shll v21.8h, v21.8b, #8 \n" /* G */ \
"shll v20.8h, v20.8b, #8 \n" /* B */ \
+ "shll v21.8h, v21.8b, #8 \n" /* G */ \
"sri v0.8h, v21.8h, #5 \n" /* RG */ \
"sri v0.8h, v20.8h, #11 \n" /* RGB */
+#ifdef HAS_I422TORGB565ROW_NEON
void I422ToRGB565Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_rgb565,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w4, %w4, #8 \n"
ARGBTORGB565
MEMACCESS(3)
@@ -337,37 +425,36 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_rgb565), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_I422TORGB565ROW_NEON
#define ARGBTOARGB1555 \
"shll v0.8h, v23.8b, #8 \n" /* A */ \
"shll v22.8h, v22.8b, #8 \n" /* R */ \
- "shll v21.8h, v21.8b, #8 \n" /* G */ \
"shll v20.8h, v20.8b, #8 \n" /* B */ \
+ "shll v21.8h, v21.8b, #8 \n" /* G */ \
"sri v0.8h, v22.8h, #1 \n" /* AR */ \
"sri v0.8h, v21.8h, #6 \n" /* ARG */ \
"sri v0.8h, v20.8h, #11 \n" /* ARGB */
+#ifdef HAS_I422TOARGB1555ROW_NEON
void I422ToARGB1555Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb1555,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV422
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w4, %w4, #8 \n"
+ "movi v23.8b, #255 \n"
ARGBTOARGB1555
MEMACCESS(3)
"st1 {v0.8h}, [%3], #16 \n" // store 8 pixels RGB565.
@@ -377,14 +464,13 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_argb1555), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_I422TOARGB1555ROW_NEON
#define ARGBTOARGB4444 \
/* Input v20.8b<=B, v21.8b<=G, v22.8b<=R, v23.8b<=A, v4.8b<=0x0f */ \
@@ -396,18 +482,18 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
"orr v1.8b, v22.8b, v23.8b \n" /* RA */ \
"zip1 v0.16b, v0.16b, v1.16b \n" /* BGRA */
+#ifdef HAS_I422TOARGB4444ROW_NEON
void I422ToARGB4444Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
uint8* dst_argb4444,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
+ YUV422TORGB_SETUP_REG
"movi v4.16b, #0x0f \n" // bits to clear with vbic.
"1: \n"
READYUV422
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w4, %w4, #8 \n"
"movi v23.8b, #255 \n"
ARGBTOARGB4444
@@ -419,40 +505,41 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
"+r"(src_v), // %2
"+r"(dst_argb4444), // %3
"+r"(width) // %4
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_I422TOARGB4444ROW_NEON
+#ifdef HAS_I400TOARGBROW_NEON
void I400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
+ int64 width64 = (int64)(width);
asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUV400
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w2, %w2, #8 \n"
+ "movi v23.8b, #255 \n"
MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB]"r"(&kYuvI601Constants.kUVToRB),
- [kUVToG]"r"(&kYuvI601Constants.kUVToG),
- [kUVBiasBGR]"r"(&kYuvI601Constants.kUVBiasBGR),
- [kYToRgb]"r"(&kYuvI601Constants.kYToRgb)
+ "+r"(width64) // %2
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_I400TOARGBROW_NEON
+#ifdef HAS_J400TOARGBROW_NEON
void J400ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
@@ -474,19 +561,20 @@ void J400ToARGBRow_NEON(const uint8* src_y,
: "cc", "memory", "v20", "v21", "v22", "v23"
);
}
+#endif // HAS_J400TOARGBROW_NEON
+#ifdef HAS_NV12TOARGBROW_NEON
void NV12ToARGBRow_NEON(const uint8* src_y,
const uint8* src_uv,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READNV12
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w3, %w3, #8 \n"
+ "movi v23.8b, #255 \n"
MEMACCESS(2)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
"b.gt 1b \n"
@@ -494,53 +582,78 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
"+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_NV12TOARGBROW_NEON
+#ifdef HAS_NV21TOARGBROW_NEON
void NV21ToARGBRow_NEON(const uint8* src_y,
- const uint8* src_vu,
+ const uint8* src_uv,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READNV21
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w3, %w3, #8 \n"
+ "movi v23.8b, #255 \n"
MEMACCESS(2)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%2], #32 \n"
"b.gt 1b \n"
: "+r"(src_y), // %0
- "+r"(src_vu), // %1
+ "+r"(src_uv), // %1
"+r"(dst_argb), // %2
"+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_NV21TOARGBROW_NEON
+#ifdef HAS_NV12TORGB565ROW_NEON
void NV12ToRGB565Row_NEON(const uint8* src_y,
const uint8* src_uv,
uint8* dst_rgb565,
- const struct YuvConstants* yuvconstants,
int width) {
asm volatile (
- YUVTORGB_SETUP
+ YUV422TORGB_SETUP_REG
"1: \n"
READNV12
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
+ "subs %w3, %w3, #8 \n"
+ ARGBTORGB565
+ MEMACCESS(2)
+ "st1 {v0.8h}, [%2], 16 \n" // store 8 pixels RGB565.
+ "b.gt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
+ "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
+ );
+}
+#endif // HAS_NV12TORGB565ROW_NEON
+
+#ifdef HAS_NV21TORGB565ROW_NEON
+void NV21ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ int width) {
+ asm volatile (
+ YUV422TORGB_SETUP_REG
+ "1: \n"
+ READNV21
+ YUV422TORGB(v22, v21, v20)
"subs %w3, %w3, #8 \n"
ARGBTORGB565
MEMACCESS(2)
@@ -550,68 +663,68 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
"+r"(src_uv), // %1
"+r"(dst_rgb565), // %2
"+r"(width) // %3
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_NV21TORGB565ROW_NEON
+#ifdef HAS_YUY2TOARGBROW_NEON
void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
+ int64 width64 = (int64)(width);
asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READYUY2
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w2, %w2, #8 \n"
+ "movi v23.8b, #255 \n"
MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], #32 \n"
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ "+r"(width64) // %2
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_YUY2TOARGBROW_NEON
+#ifdef HAS_UYVYTOARGBROW_NEON
void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
- const struct YuvConstants* yuvconstants,
int width) {
+ int64 width64 = (int64)(width);
asm volatile (
- YUVTORGB_SETUP
- "movi v23.8b, #255 \n"
+ YUV422TORGB_SETUP_REG
"1: \n"
READUYVY
- YUVTORGB(v22, v21, v20)
+ YUV422TORGB(v22, v21, v20)
"subs %w2, %w2, #8 \n"
+ "movi v23.8b, #255 \n"
MEMACCESS(1)
"st4 {v20.8b,v21.8b,v22.8b,v23.8b}, [%1], 32 \n"
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
- : [kUVToRB]"r"(&yuvconstants->kUVToRB),
- [kUVToG]"r"(&yuvconstants->kUVToG),
- [kUVBiasBGR]"r"(&yuvconstants->kUVBiasBGR),
- [kYToRgb]"r"(&yuvconstants->kYToRgb)
+ "+r"(width64) // %2
+ : [kUVBiasBGR]"r"(&kUVBiasBGR),
+ [kYToRgb]"r"(&kYToRgb)
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v20",
"v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30"
);
}
+#endif // HAS_UYVYTOARGBROW_NEON
// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
+#ifdef HAS_SPLITUVROW_NEON
void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
asm volatile (
@@ -632,8 +745,10 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
: "cc", "memory", "v0", "v1" // Clobber List
);
}
+#endif // HAS_SPLITUVROW_NEON
// Reads 16 U's and V's and writes out 16 pairs of UV.
+#ifdef HAS_MERGEUVROW_NEON
void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
int width) {
asm volatile (
@@ -655,8 +770,10 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
: "cc", "memory", "v0", "v1" // Clobber List
);
}
+#endif // HAS_MERGEUVROW_NEON
// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
+#ifdef HAS_COPYROW_NEON
void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile (
"1: \n"
@@ -673,16 +790,17 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
+#endif // HAS_COPYROW_NEON
// SetRow writes 'count' bytes using an 8 bit value repeated.
void SetRow_NEON(uint8* dst, uint8 v8, int count) {
asm volatile (
"dup v0.16b, %w2 \n" // duplicate 16 bytes
"1: \n"
- "subs %w1, %w1, #16 \n" // 16 bytes per loop
+ "subs %w1, %w1, #16 \n" // 16 bytes per loop
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store
- "b.gt 1b \n"
+ "b.gt 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
: "r"(v8) // %2
@@ -694,10 +812,10 @@ void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
asm volatile (
"dup v0.4s, %w2 \n" // duplicate 4 ints
"1: \n"
- "subs %w1, %w1, #4 \n" // 4 ints per loop
+ "subs %w1, %w1, #4 \n" // 4 ints per loop
MEMACCESS(0)
"st1 {v0.16b}, [%0], #16 \n" // store
- "b.gt 1b \n"
+ "b.gt 1b \n"
: "+r"(dst), // %0
"+r"(count) // %1
: "r"(v32) // %2
@@ -705,15 +823,18 @@ void ARGBSetRow_NEON(uint8* dst, uint32 v32, int count) {
);
}
+#ifdef HAS_MIRRORROW_NEON
void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ int64 width64 = (int64) width;
asm volatile (
// Start at end of source row.
- "add %0, %0, %w2, sxtw \n"
+ "add %0, %0, %2 \n"
"sub %0, %0, #16 \n"
+
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16
- "subs %w2, %w2, #16 \n" // 16 pixels per loop.
+ "subs %2, %2, #16 \n" // 16 pixels per loop.
"rev64 v0.16b, v0.16b \n"
MEMACCESS(1)
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16
@@ -722,22 +843,26 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
- "+r"(width) // %2
+ "+r"(width64) // %2
: "r"((ptrdiff_t)-16) // %3
: "cc", "memory", "v0"
);
}
+#endif // HAS_MIRRORROW_NEON
+#ifdef HAS_MIRRORUVROW_NEON
void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
+ int64 width64 = (int64) width;
asm volatile (
// Start at end of source row.
- "add %0, %0, %w3, sxtw #1 \n"
+ "add %0, %0, %3, lsl #1 \n"
"sub %0, %0, #16 \n"
+
"1: \n"
MEMACCESS(0)
"ld2 {v0.8b, v1.8b}, [%0], %4 \n" // src -= 16
- "subs %w3, %w3, #8 \n" // 8 pixels per loop.
+ "subs %3, %3, #8 \n" // 8 pixels per loop.
"rev64 v0.8b, v0.8b \n"
"rev64 v1.8b, v1.8b \n"
MEMACCESS(1)
@@ -748,21 +873,25 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
: "+r"(src_uv), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(width64) // %3
: "r"((ptrdiff_t)-16) // %4
: "cc", "memory", "v0", "v1"
);
}
+#endif // HAS_MIRRORUVROW_NEON
+#ifdef HAS_ARGBMIRRORROW_NEON
void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ int64 width64 = (int64) width;
asm volatile (
- // Start at end of source row.
- "add %0, %0, %w2, sxtw #2 \n"
+ // Start at end of source row.
+ "add %0, %0, %2, lsl #2 \n"
"sub %0, %0, #16 \n"
+
"1: \n"
MEMACCESS(0)
"ld1 {v0.16b}, [%0], %3 \n" // src -= 16
- "subs %w2, %w2, #4 \n" // 4 pixels per loop.
+ "subs %2, %2, #4 \n" // 4 pixels per loop.
"rev64 v0.4s, v0.4s \n"
MEMACCESS(1)
"st1 {v0.D}[1], [%1], #8 \n" // dst += 16
@@ -771,13 +900,15 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
"b.gt 1b \n"
: "+r"(src), // %0
"+r"(dst), // %1
- "+r"(width) // %2
+ "+r"(width64) // %2
: "r"((ptrdiff_t)-16) // %3
: "cc", "memory", "v0"
);
}
+#endif // HAS_ARGBMIRRORROW_NEON
-void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
+#ifdef HAS_RGB24TOARGBROW_NEON
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
asm volatile (
"movi v4.8b, #255 \n" // Alpha
"1: \n"
@@ -789,13 +920,15 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int width) {
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
);
}
+#endif // HAS_RGB24TOARGBROW_NEON
-void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
+#ifdef HAS_RAWTOARGBROW_NEON
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
asm volatile (
"movi v5.8b, #255 \n" // Alpha
"1: \n"
@@ -809,30 +942,12 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int width) {
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5" // Clobber List
);
}
-
-void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld3 {v0.8b,v1.8b,v2.8b}, [%0], #24 \n" // read r g b
- "subs %w2, %w2, #8 \n" // 8 processed per loop.
- "orr v3.8b, v1.8b, v1.8b \n" // move g
- "orr v4.8b, v0.8b, v0.8b \n" // move r
- MEMACCESS(1)
- "st3 {v2.8b,v3.8b,v4.8b}, [%1], #24 \n" // store b g r
- "b.gt 1b \n"
- : "+r"(src_raw), // %0
- "+r"(dst_rgb24), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
- );
-}
+#endif // HAS_RAWTOARGBROW_NEON
#define RGB565TOARGB \
"shrn v6.8b, v0.8h, #5 \n" /* G xxGGGGGG */ \
@@ -847,7 +962,8 @@ void RAWToRGB24Row_NEON(const uint8* src_raw, uint8* dst_rgb24, int width) {
"orr v0.16b, v0.16b, v2.16b \n" /* R,B */ \
"dup v2.2D, v0.D[1] \n" /* R */
-void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
+#ifdef HAS_RGB565TOARGBROW_NEON
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
asm volatile (
"movi v3.8b, #255 \n" // Alpha
"1: \n"
@@ -860,11 +976,12 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6" // Clobber List
);
}
+#endif // HAS_RGB565TOARGBROW_NEON
#define ARGB1555TOARGB \
"ushr v2.8h, v0.8h, #10 \n" /* R xxxRRRRR */ \
@@ -903,8 +1020,9 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int width) {
"orr v2.16b, v1.16b, v3.16b \n" /* R */ \
"dup v1.2D, v0.D[1] \n" /* G */ \
+#ifdef HAS_ARGB1555TOARGBROW_NEON
void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
- int width) {
+ int pix) {
asm volatile (
"movi v3.8b, #255 \n" // Alpha
"1: \n"
@@ -917,11 +1035,12 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
+#endif // HAS_ARGB1555TOARGBROW_NEON
#define ARGB4444TOARGB \
"shrn v1.8b, v0.8h, #8 \n" /* v1(l) AR */ \
@@ -935,8 +1054,9 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
"dup v0.2D, v2.D[1] \n" \
"dup v1.2D, v3.D[1] \n"
+#ifdef HAS_ARGB4444TOARGBROW_NEON
void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
- int width) {
+ int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -948,13 +1068,15 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
"b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4" // Clobber List
);
}
+#endif // HAS_ARGB4444TOARGBROW_NEON
-void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
+#ifdef HAS_ARGBTORGB24ROW_NEON
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -965,13 +1087,15 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int width) {
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb24), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4" // Clobber List
);
}
+#endif // HAS_ARGBTORGB24ROW_NEON
-void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
+#ifdef HAS_ARGBTORAWROW_NEON
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -984,13 +1108,15 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int width) {
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_raw), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v1", "v2", "v3", "v4", "v5" // Clobber List
);
}
+#endif // HAS_ARGBTORAWROW_NEON
-void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
+#ifdef HAS_YUY2TOYROW_NEON
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -1001,13 +1127,15 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int width) {
"b.gt 1b \n"
: "+r"(src_yuy2), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1" // Clobber List
);
}
+#endif // HAS_YUY2TOYROW_NEON
-void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
+#ifdef HAS_UYVYTOYROW_NEON
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -1018,14 +1146,16 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int width) {
"b.gt 1b \n"
: "+r"(src_uyvy), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1" // Clobber List
);
}
+#endif // HAS_UYVYTOYROW_NEON
+#ifdef HAS_YUY2TOUV422ROW_NEON
void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
- int width) {
+ int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -1039,14 +1169,16 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
: "+r"(src_yuy2), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
+#endif // HAS_YUY2TOUV422ROW_NEON
+#ifdef HAS_UYVYTOUV422ROW_NEON
void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
- int width) {
+ int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -1060,14 +1192,16 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
: "+r"(src_uyvy), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
+#endif // HAS_UYVYTOUV422ROW_NEON
+#ifdef HAS_YUY2TOUVROW_NEON
void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_yuy2b = src_yuy2 + stride_yuy2;
asm volatile (
"1: \n"
@@ -1087,15 +1221,17 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
"+r"(src_yuy2b), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4",
"v5", "v6", "v7" // Clobber List
);
}
+#endif // HAS_YUY2TOUVROW_NEON
+#ifdef HAS_UYVYTOUVROW_NEON
void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_uyvyb = src_uyvy + stride_uyvy;
asm volatile (
"1: \n"
@@ -1115,16 +1251,18 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
"+r"(src_uyvyb), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4",
"v5", "v6", "v7" // Clobber List
);
}
+#endif // HAS_UYVYTOUVROW_NEON
// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+#ifdef HAS_ARGBSHUFFLEROW_NEON
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
- const uint8* shuffler, int width) {
+ const uint8* shuffler, int pix) {
asm volatile (
MEMACCESS(3)
"ld1 {v2.16b}, [%3] \n" // shuffler
@@ -1138,12 +1276,14 @@ void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
: "r"(shuffler) // %3
: "cc", "memory", "v0", "v1", "v2" // Clobber List
);
}
+#endif // HAS_ARGBSHUFFLEROW_NEON
+#ifdef HAS_I422TOYUY2ROW_NEON
void I422ToYUY2Row_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1170,7 +1310,9 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
: "cc", "memory", "v0", "v1", "v2", "v3"
);
}
+#endif // HAS_I422TOYUY2ROW_NEON
+#ifdef HAS_I422TOUYVYROW_NEON
void I422ToUYVYRow_NEON(const uint8* src_y,
const uint8* src_u,
const uint8* src_v,
@@ -1197,8 +1339,10 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
: "cc", "memory", "v0", "v1", "v2", "v3"
);
}
+#endif // HAS_I422TOUYVYROW_NEON
-void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
+#ifdef HAS_ARGBTORGB565ROW_NEON
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -1210,12 +1354,14 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int width) {
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_rgb565), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v20", "v21", "v22", "v23"
);
}
+#endif // HAS_ARGBTORGB565ROW_NEON
+#ifdef HAS_ARGBTORGB565DITHERROW_NEON
void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
const uint32 dither4, int width) {
asm volatile (
@@ -1238,9 +1384,11 @@ void ARGBToRGB565DitherRow_NEON(const uint8* src_argb, uint8* dst_rgb,
: "cc", "memory", "v0", "v1", "v20", "v21", "v22", "v23"
);
}
+#endif // HAS_ARGBTORGB565ROW_NEON
+#ifdef HAS_ARGBTOARGB1555ROW_NEON
void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
- int width) {
+ int pix) {
asm volatile (
"1: \n"
MEMACCESS(0)
@@ -1252,14 +1400,16 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb1555), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v20", "v21", "v22", "v23"
);
}
+#endif // HAS_ARGBTOARGB1555ROW_NEON
+#ifdef HAS_ARGBTOARGB4444ROW_NEON
void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
- int width) {
+ int pix) {
asm volatile (
"movi v4.16b, #0x0f \n" // bits to clear with vbic.
"1: \n"
@@ -1272,13 +1422,15 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_argb4444), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v4", "v20", "v21", "v22", "v23"
);
}
+#endif // HAS_ARGBTOARGB4444ROW_NEON
-void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
+#ifdef HAS_ARGBTOYROW_NEON
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movi v4.8b, #13 \n" // B * 0.1016 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
@@ -1298,30 +1450,15 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
+#endif // HAS_ARGBTOYROW_NEON
-void ARGBExtractAlphaRow_NEON(const uint8* src_argb, uint8* dst_a, int width) {
- asm volatile (
- "1: \n"
- MEMACCESS(0)
- "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load row 16 pixels
- "subs %w2, %w2, #16 \n" // 16 processed per loop
- MEMACCESS(1)
- "st1 {v3.16b}, [%1], #16 \n" // store 16 A's.
- "b.gt 1b \n"
- : "+r"(src_argb), // %0
- "+r"(dst_a), // %1
- "+r"(width) // %2
- :
- : "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
- );
-}
-
-void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
+#ifdef HAS_ARGBTOYJROW_NEON
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
asm volatile (
"movi v4.8b, #15 \n" // B * 0.11400 coefficient
"movi v5.8b, #75 \n" // G * 0.58700 coefficient
@@ -1339,15 +1476,17 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int width) {
"b.gt 1b \n"
: "+r"(src_argb), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
);
}
+#endif // HAS_ARGBTOYJROW_NEON
// 8x1 pixels.
+#ifdef HAS_ARGBTOUV444ROW_NEON
void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int width) {
+ int pix) {
asm volatile (
"movi v24.8b, #112 \n" // UB / VR 0.875 coefficient
"movi v25.8b, #74 \n" // UG -0.5781 coefficient
@@ -1380,24 +1519,62 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4",
"v24", "v25", "v26", "v27", "v28", "v29"
);
}
+#endif // HAS_ARGBTOUV444ROW_NEON
-#define RGBTOUV_SETUP_REG \
- "movi v20.8h, #56, lsl #0 \n" /* UB/VR coefficient (0.875) / 2 */ \
- "movi v21.8h, #37, lsl #0 \n" /* UG coefficient (-0.5781) / 2 */ \
- "movi v22.8h, #19, lsl #0 \n" /* UR coefficient (-0.2969) / 2 */ \
- "movi v23.8h, #9, lsl #0 \n" /* VB coefficient (-0.1406) / 2 */ \
- "movi v24.8h, #47, lsl #0 \n" /* VG coefficient (-0.7344) / 2 */ \
- "movi v25.16b, #0x80 \n" /* 128.5 (0x8080 in 16-bit) */
+// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+#ifdef HAS_ARGBTOUV422ROW_NEON
+void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ RGBTOUV_SETUP_REG
+ "1: \n"
+ MEMACCESS(0)
+ "ld4 {v0.16b,v1.16b,v2.16b,v3.16b}, [%0], #64 \n" // load 16 pixels.
-// 32x1 pixels -> 8x1. width is number of argb pixels. e.g. 32.
+ "uaddlp v0.8h, v0.16b \n" // B 16 bytes -> 8 shorts.
+ "uaddlp v1.8h, v1.16b \n" // G 16 bytes -> 8 shorts.
+ "uaddlp v2.8h, v2.16b \n" // R 16 bytes -> 8 shorts.
+
+ "subs %w3, %w3, #16 \n" // 16 processed per loop.
+ "mul v3.8h, v0.8h, v20.8h \n" // B
+ "mls v3.8h, v1.8h, v21.8h \n" // G
+ "mls v3.8h, v2.8h, v22.8h \n" // R
+ "add v3.8h, v3.8h, v25.8h \n" // +128 -> unsigned
+
+ "mul v4.8h, v2.8h, v20.8h \n" // R
+ "mls v4.8h, v1.8h, v24.8h \n" // G
+ "mls v4.8h, v0.8h, v23.8h \n" // B
+ "add v4.8h, v4.8h, v25.8h \n" // +128 -> unsigned
+
+ "uqshrn v0.8b, v3.8h, #8 \n" // 16 bit to 8 bit U
+ "uqshrn v1.8b, v4.8h, #8 \n" // 16 bit to 8 bit V
+
+ MEMACCESS(1)
+ "st1 {v0.8b}, [%1], #8 \n" // store 8 pixels U.
+ MEMACCESS(2)
+ "st1 {v1.8b}, [%2], #8 \n" // store 8 pixels V.
+ "b.gt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+ "v20", "v21", "v22", "v23", "v24", "v25"
+ );
+}
+#endif // HAS_ARGBTOUV422ROW_NEON
+
+// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32.
+#ifdef HAS_ARGBTOUV411ROW_NEON
void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
- int width) {
+ int pix) {
asm volatile (
RGBTOUV_SETUP_REG
"1: \n"
@@ -1439,14 +1616,15 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
: "+r"(src_argb), // %0
"+r"(dst_u), // %1
"+r"(dst_v), // %2
- "+r"(width) // %3
+ "+r"(pix) // %3
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
+#endif // HAS_ARGBTOUV411ROW_NEON
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
#define RGBTOUV(QB, QG, QR) \
"mul v3.8h, " #QB ",v20.8h \n" /* B */ \
"mul v4.8h, " #QR ",v20.8h \n" /* R */ \
@@ -1462,8 +1640,9 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
// TODO(fbarchard): consider ptrdiff_t for all strides.
+#ifdef HAS_ARGBTOUVROW_NEON
void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1495,16 +1674,18 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
"+r"(src_argb_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
+#endif // HAS_ARGBTOUVROW_NEON
// TODO(fbarchard): Subsample match C code.
+#ifdef HAS_ARGBTOUVJROW_NEON
void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_argb_1 = src_argb + src_stride_argb;
asm volatile (
"movi v20.8h, #63, lsl #0 \n" // UB/VR coeff (0.500) / 2
@@ -1540,15 +1721,17 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
"+r"(src_argb_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
+#endif // HAS_ARGBTOUVJROW_NEON
+#ifdef HAS_BGRATOUVROW_NEON
void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_bgra_1 = src_bgra + src_stride_bgra;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1579,15 +1762,17 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
"+r"(src_bgra_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
+#endif // HAS_BGRATOUVROW_NEON
+#ifdef HAS_ABGRTOUVROW_NEON
void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_abgr_1 = src_abgr + src_stride_abgr;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1618,15 +1803,17 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
"+r"(src_abgr_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
+#endif // HAS_ABGRTOUVROW_NEON
+#ifdef HAS_RGBATOUVROW_NEON
void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_rgba_1 = src_rgba + src_stride_rgba;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1657,15 +1844,17 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
"+r"(src_rgba_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
+#endif // HAS_RGBATOUVROW_NEON
+#ifdef HAS_RGB24TOUVROW_NEON
void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_rgb24_1 = src_rgb24 + src_stride_rgb24;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1696,15 +1885,17 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
"+r"(src_rgb24_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
+#endif // HAS_RGB24TOUVROW_NEON
+#ifdef HAS_RAWTOUVROW_NEON
void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_raw_1 = src_raw + src_stride_raw;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1735,16 +1926,18 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
"+r"(src_raw_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v20", "v21", "v22", "v23", "v24", "v25"
);
}
+#endif // HAS_RAWTOUVROW_NEON
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+#ifdef HAS_RGB565TOUVROW_NEON
void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_rgb565_1 = src_rgb565 + src_stride_rgb565;
asm volatile (
"movi v22.8h, #56, lsl #0 \n" // UB / VR coeff (0.875) / 2
@@ -1808,17 +2001,19 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
"+r"(src_rgb565_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24",
"v25", "v26", "v27"
);
}
+#endif // HAS_RGB565TOUVROW_NEON
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+#ifdef HAS_ARGB1555TOUVROW_NEON
void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_argb1555_1 = src_argb1555 + src_stride_argb1555;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1877,17 +2072,19 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
"+r"(src_argb1555_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
"v26", "v27", "v28"
);
}
+#endif // HAS_ARGB1555TOUVROW_NEON
-// 16x2 pixels -> 8x1. width is number of argb pixels. e.g. 16.
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+#ifdef HAS_ARGB4444TOUVROW_NEON
void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
- uint8* dst_u, uint8* dst_v, int width) {
+ uint8* dst_u, uint8* dst_v, int pix) {
const uint8* src_argb4444_1 = src_argb4444 + src_stride_argb4444;
asm volatile (
RGBTOUV_SETUP_REG
@@ -1946,7 +2143,7 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
"+r"(src_argb4444_1), // %1
"+r"(dst_u), // %2
"+r"(dst_v), // %3
- "+r"(width) // %4
+ "+r"(pix) // %4
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6",
"v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25",
@@ -1954,8 +2151,10 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
);
}
+#endif // HAS_ARGB4444TOUVROW_NEON
-void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
+#ifdef HAS_RGB565TOYROW_NEON
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
asm volatile (
"movi v24.8b, #13 \n" // B * 0.1016 coefficient
"movi v25.8b, #65 \n" // G * 0.5078 coefficient
@@ -1976,14 +2175,16 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int width) {
"b.gt 1b \n"
: "+r"(src_rgb565), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v6",
"v24", "v25", "v26", "v27"
);
}
+#endif // HAS_RGB565TOYROW_NEON
-void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
+#ifdef HAS_ARGB1555TOYROW_NEON
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
asm volatile (
"movi v4.8b, #13 \n" // B * 0.1016 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
@@ -2004,13 +2205,15 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int width) {
"b.gt 1b \n"
: "+r"(src_argb1555), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
+#endif // HAS_ARGB1555TOYROW_NEON
-void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
+#ifdef HAS_ARGB4444TOYROW_NEON
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
asm volatile (
"movi v24.8b, #13 \n" // B * 0.1016 coefficient
"movi v25.8b, #65 \n" // G * 0.5078 coefficient
@@ -2031,13 +2234,15 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int width) {
"b.gt 1b \n"
: "+r"(src_argb4444), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v24", "v25", "v26", "v27"
);
}
+#endif // HAS_ARGB4444TOYROW_NEON
-void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
+#ifdef HAS_BGRATOYROW_NEON
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
asm volatile (
"movi v4.8b, #33 \n" // R * 0.2578 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
@@ -2057,13 +2262,15 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int width) {
"b.gt 1b \n"
: "+r"(src_bgra), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
+#endif // HAS_BGRATOYROW_NEON
-void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
+#ifdef HAS_ABGRTOYROW_NEON
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
asm volatile (
"movi v4.8b, #33 \n" // R * 0.2578 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
@@ -2083,13 +2290,15 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int width) {
"b.gt 1b \n"
: "+r"(src_abgr), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
+#endif // HAS_ABGRTOYROW_NEON
-void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
+#ifdef HAS_RGBATOYROW_NEON
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
asm volatile (
"movi v4.8b, #13 \n" // B * 0.1016 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
@@ -2109,13 +2318,15 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int width) {
"b.gt 1b \n"
: "+r"(src_rgba), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
+#endif // HAS_RGBATOYROW_NEON
-void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
+#ifdef HAS_RGB24TOYROW_NEON
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
asm volatile (
"movi v4.8b, #13 \n" // B * 0.1016 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
@@ -2135,13 +2346,15 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int width) {
"b.gt 1b \n"
: "+r"(src_rgb24), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
+#endif // HAS_RGB24TOYROW_NEON
-void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
+#ifdef HAS_RAWTOYROW_NEON
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
asm volatile (
"movi v4.8b, #33 \n" // R * 0.2578 coefficient
"movi v5.8b, #65 \n" // G * 0.5078 coefficient
@@ -2161,13 +2374,15 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int width) {
"b.gt 1b \n"
: "+r"(src_raw), // %0
"+r"(dst_y), // %1
- "+r"(width) // %2
+ "+r"(pix) // %2
:
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16"
);
}
+#endif // HAS_RAWTOYROW_NEON
// Bilinear filter 16x2 -> 16x1
+#ifdef HAS_INTERPOLATEROW_NEON
void InterpolateRow_NEON(uint8* dst_ptr,
const uint8* src_ptr, ptrdiff_t src_stride,
int dst_width, int source_y_fraction) {
@@ -2177,8 +2392,12 @@ void InterpolateRow_NEON(uint8* dst_ptr,
asm volatile (
"cmp %w4, #0 \n"
"b.eq 100f \n"
+ "cmp %w4, #64 \n"
+ "b.eq 75f \n"
"cmp %w4, #128 \n"
"b.eq 50f \n"
+ "cmp %w4, #192 \n"
+ "b.eq 25f \n"
"dup v5.16b, %w4 \n"
"dup v4.16b, %w5 \n"
@@ -2200,6 +2419,20 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"b.gt 1b \n"
"b 99f \n"
+ // Blend 25 / 75.
+ "25: \n"
+ MEMACCESS(1)
+ "ld1 {v0.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v1.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 25b \n"
+ "b 99f \n"
+
// Blend 50 / 50.
"50: \n"
MEMACCESS(1)
@@ -2213,6 +2446,20 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"b.gt 50b \n"
"b 99f \n"
+ // Blend 75 / 25.
+ "75: \n"
+ MEMACCESS(1)
+ "ld1 {v1.16b}, [%1], #16 \n"
+ MEMACCESS(2)
+ "ld1 {v0.16b}, [%2], #16 \n"
+ "subs %w3, %w3, #16 \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ "urhadd v0.16b, v0.16b, v1.16b \n"
+ MEMACCESS(0)
+ "st1 {v0.16b}, [%0], #16 \n"
+ "b.gt 75b \n"
+ "b 99f \n"
+
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
MEMACCESS(1)
@@ -2233,8 +2480,10 @@ void InterpolateRow_NEON(uint8* dst_ptr,
: "cc", "memory", "v0", "v1", "v3", "v4", "v5"
);
}
+#endif // HAS_INTERPOLATEROW_NEON
// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+#ifdef HAS_ARGBBLENDROW_NEON
void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
@@ -2303,8 +2552,10 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"v16", "v17", "v18"
);
}
+#endif // HAS_ARGBBLENDROW_NEON
// Attenuate 8 pixels at a time.
+#ifdef HAS_ARGBATTENUATEROW_NEON
void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
// Attenuate 8 pixels.
@@ -2328,9 +2579,11 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
);
}
+#endif // HAS_ARGBATTENUATEROW_NEON
// Quantize 8 ARGB pixels (32 bytes).
// dst = (dst * scale >> 16) * interval_size + interval_offset;
+#ifdef HAS_ARGBQUANTIZEROW_NEON
void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
int interval_offset, int width) {
asm volatile (
@@ -2370,10 +2623,12 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6"
);
}
+#endif // HAS_ARGBQUANTIZEROW_NEON
// Shade 8 pixels at a time by specified value.
// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
+#ifdef HAS_ARGBSHADEROW_NEON
void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
uint32 value) {
asm volatile (
@@ -2408,10 +2663,12 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
: "cc", "memory", "v0", "v4", "v5", "v6", "v7"
);
}
+#endif // HAS_ARGBSHADEROW_NEON
// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
// Similar to ARGBToYJ but stores ARGB.
// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+#ifdef HAS_ARGBGRAYROW_NEON
void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
"movi v24.8b, #15 \n" // B * 0.11400 coefficient
@@ -2437,12 +2694,14 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v24", "v25", "v26"
);
}
+#endif // HAS_ARGBGRAYROW_NEON
// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
// b = (r * 35 + g * 68 + b * 17) >> 7
// g = (r * 45 + g * 88 + b * 22) >> 7
// r = (r * 50 + g * 98 + b * 24) >> 7
+#ifdef HAS_ARGBSEPIAROW_NEON
void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
asm volatile (
"movi v20.8b, #17 \n" // BB coefficient
@@ -2480,10 +2739,12 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"v20", "v21", "v22", "v24", "v25", "v26", "v28", "v29", "v30"
);
}
+#endif // HAS_ARGBSEPIAROW_NEON
// Tranform 8 ARGB pixels (32 bytes) with color matrix.
// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
// needs to saturate. Consider doing a non-saturating version.
+#ifdef HAS_ARGBCOLORMATRIXROW_NEON
void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
const int8* matrix_argb, int width) {
asm volatile (
@@ -2543,9 +2804,11 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"v18", "v19", "v22", "v23", "v24", "v25"
);
}
+#endif // HAS_ARGBCOLORMATRIXROW_NEON
// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+#ifdef HAS_ARGBMULTIPLYROW_NEON
void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
@@ -2576,8 +2839,10 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
+#endif // HAS_ARGBMULTIPLYROW_NEON
// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+#ifdef HAS_ARGBADDROW_NEON
void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
@@ -2604,8 +2869,10 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
+#endif // HAS_ARGBADDROW_NEON
// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+#ifdef HAS_ARGBSUBTRACTROW_NEON
void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
uint8* dst_argb, int width) {
asm volatile (
@@ -2632,12 +2899,14 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
: "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7"
);
}
+#endif // HAS_ARGBSUBTRACTROW_NEON
// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
// A = 255
// R = Sobel
// G = Sobel
// B = Sobel
+#ifdef HAS_SOBELROW_NEON
void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) {
asm volatile (
@@ -2663,8 +2932,10 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
: "cc", "memory", "v0", "v1", "v2", "v3"
);
}
+#endif // HAS_SOBELROW_NEON
// Adds Sobel X and Sobel Y and stores Sobel into plane.
+#ifdef HAS_SOBELTOPLANEROW_NEON
void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_y, int width) {
asm volatile (
@@ -2687,12 +2958,14 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
: "cc", "memory", "v0", "v1"
);
}
+#endif // HAS_SOBELTOPLANEROW_NEON
// Mixes Sobel X, Sobel Y and Sobel into ARGB.
// A = 255
// R = Sobel X
// G = Sobel
// B = Sobel Y
+#ifdef HAS_SOBELXYROW_NEON
void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
uint8* dst_argb, int width) {
asm volatile (
@@ -2716,11 +2989,13 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
: "cc", "memory", "v0", "v1", "v2", "v3"
);
}
+#endif // HAS_SOBELXYROW_NEON
// SobelX as a matrix is
// -1 0 1
// -2 0 2
// -1 0 1
+#ifdef HAS_SOBELXROW_NEON
void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
const uint8* src_y2, uint8* dst_sobelx, int width) {
asm volatile (
@@ -2759,11 +3034,13 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
+#endif // HAS_SOBELXROW_NEON
// SobelY as a matrix is
// -1 -2 -1
// 0 0 0
// 1 2 1
+#ifdef HAS_SOBELYROW_NEON
void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
uint8* dst_sobely, int width) {
asm volatile (
@@ -2801,6 +3078,7 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
: "cc", "memory", "v0", "v1", "v2", "v3" // Clobber List
);
}
+#endif // HAS_SOBELYROW_NEON
#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
#ifdef __cplusplus