summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJames Zern <jzern@google.com>2014-08-10 16:15:18 -0700
committerJames Zern <jzern@google.com>2014-08-12 22:54:05 -0700
commit3a7d467da9f432f010470b7ae39e5bd7c9247323 (patch)
tree587d4b69ab3b43123ccda2305da64d81ac06a92a
parentccddd5d0f9a69c7369f9a68c155577e4e0b3afd1 (diff)
downloadlibvpx-3a7d467da9f432f010470b7ae39e5bd7c9247323.tar
libvpx-3a7d467da9f432f010470b7ae39e5bd7c9247323.tar.gz
libvpx-3a7d467da9f432f010470b7ae39e5bd7c9247323.tar.bz2
libvpx-3a7d467da9f432f010470b7ae39e5bd7c9247323.zip
libyuv: update to r1041
Change-Id: I38dad398844ee424a7a92a745ab703645018d02b
-rw-r--r--examples.mk18
-rw-r--r--third_party/libyuv/README.libvpx5
-rw-r--r--third_party/libyuv/include/libyuv/compare.h73
-rw-r--r--third_party/libyuv/include/libyuv/convert.h254
-rw-r--r--third_party/libyuv/include/libyuv/convert_argb.h225
-rw-r--r--third_party/libyuv/include/libyuv/convert_from.h173
-rw-r--r--third_party/libyuv/include/libyuv/convert_from_argb.h166
-rw-r--r--third_party/libyuv/include/libyuv/cpu_id.h2
-rw-r--r--third_party/libyuv/include/libyuv/format_conversion.h168
-rw-r--r--third_party/libyuv/include/libyuv/mjpeg_decoder.h193
-rw-r--r--third_party/libyuv/include/libyuv/planar_functions.h6
-rw-r--r--third_party/libyuv/include/libyuv/rotate.h117
-rw-r--r--third_party/libyuv/include/libyuv/rotate_argb.h33
-rw-r--r--third_party/libyuv/include/libyuv/row.h39
-rw-r--r--third_party/libyuv/include/libyuv/scale.h2
-rw-r--r--third_party/libyuv/include/libyuv/scale_argb.h57
-rw-r--r--third_party/libyuv/include/libyuv/scale_row.h2
-rw-r--r--third_party/libyuv/include/libyuv/version.h16
-rw-r--r--third_party/libyuv/include/libyuv/video_common.h182
-rw-r--r--third_party/libyuv/source/compare.cc325
-rw-r--r--third_party/libyuv/source/compare_common.cc42
-rw-r--r--third_party/libyuv/source/compare_neon.cc64
-rw-r--r--third_party/libyuv/source/compare_posix.cc158
-rw-r--r--third_party/libyuv/source/compare_win.cc232
-rw-r--r--third_party/libyuv/source/convert.cc1513
-rw-r--r--third_party/libyuv/source/convert_argb.cc938
-rw-r--r--third_party/libyuv/source/convert_from.cc1210
-rw-r--r--third_party/libyuv/source/convert_from_argb.cc1113
-rw-r--r--third_party/libyuv/source/convert_jpeg.cc392
-rw-r--r--third_party/libyuv/source/convert_to_argb.cc327
-rw-r--r--third_party/libyuv/source/convert_to_i420.cc383
-rw-r--r--third_party/libyuv/source/cpu_id.cc15
-rw-r--r--third_party/libyuv/source/format_conversion.cc552
-rw-r--r--third_party/libyuv/source/mjpeg_decoder.cc566
-rw-r--r--third_party/libyuv/source/mjpeg_validate.cc47
-rw-r--r--third_party/libyuv/source/planar_functions.cc12
-rw-r--r--third_party/libyuv/source/rotate.cc1301
-rw-r--r--third_party/libyuv/source/rotate_argb.cc209
-rw-r--r--third_party/libyuv/source/rotate_mips.cc485
-rw-r--r--third_party/libyuv/source/rotate_neon.cc533
-rw-r--r--third_party/libyuv/source/row_any.cc12
-rw-r--r--third_party/libyuv/source/row_common.cc4
-rw-r--r--third_party/libyuv/source/row_mips.cc9
-rw-r--r--third_party/libyuv/source/row_neon.cc306
-rw-r--r--third_party/libyuv/source/row_neon64.cc3323
-rw-r--r--third_party/libyuv/source/row_posix.cc2
-rw-r--r--third_party/libyuv/source/row_win.cc226
-rw-r--r--third_party/libyuv/source/row_x86.asm146
-rw-r--r--third_party/libyuv/source/scale.cc10
-rw-r--r--third_party/libyuv/source/scale_argb.cc809
-rw-r--r--third_party/libyuv/source/scale_common.cc10
-rw-r--r--third_party/libyuv/source/scale_mips.cc7
-rw-r--r--third_party/libyuv/source/scale_neon.cc128
-rw-r--r--third_party/libyuv/source/scale_posix.cc2
-rw-r--r--third_party/libyuv/source/scale_win.cc2
-rw-r--r--third_party/libyuv/source/video_common.cc64
-rw-r--r--third_party/libyuv/source/x86inc.asm1136
57 files changed, 18211 insertions, 133 deletions
diff --git a/examples.mk b/examples.mk
index a47db04ae..91bd45aa4 100644
--- a/examples.mk
+++ b/examples.mk
@@ -9,8 +9,12 @@
##
LIBYUV_SRCS += third_party/libyuv/include/libyuv/basic_types.h \
+ third_party/libyuv/include/libyuv/convert.h \
+ third_party/libyuv/include/libyuv/convert_argb.h \
+ third_party/libyuv/include/libyuv/convert_from.h \
third_party/libyuv/include/libyuv/cpu_id.h \
third_party/libyuv/include/libyuv/planar_functions.h \
+ third_party/libyuv/include/libyuv/rotate.h \
third_party/libyuv/include/libyuv/row.h \
third_party/libyuv/include/libyuv/scale.h \
third_party/libyuv/include/libyuv/scale_row.h \
@@ -20,14 +24,15 @@ LIBYUV_SRCS += third_party/libyuv/include/libyuv/basic_types.h \
third_party/libyuv/source/row_common.cc \
third_party/libyuv/source/row_mips.cc \
third_party/libyuv/source/row_neon.cc \
+ third_party/libyuv/source/row_neon64.cc \
third_party/libyuv/source/row_posix.cc \
third_party/libyuv/source/row_win.cc \
- third_party/libyuv/source/scale.cc \
+ third_party/libyuv/source/scale.cc \
third_party/libyuv/source/scale_common.cc \
third_party/libyuv/source/scale_mips.cc \
third_party/libyuv/source/scale_neon.cc \
third_party/libyuv/source/scale_posix.cc \
- third_party/libyuv/source/scale_win.cc
+ third_party/libyuv/source/scale_win.cc \
LIBWEBM_MUXER_SRCS += third_party/libwebm/mkvmuxer.cpp \
third_party/libwebm/mkvmuxerutil.cpp \
@@ -210,17 +215,18 @@ endif
# from an installed tree or a version controlled tree. Determine
# the proper paths.
ifeq ($(HAVE_ALT_TREE_LAYOUT),yes)
- LIB_PATH := $(SRC_PATH_BARE)/../lib
- INC_PATH := $(SRC_PATH_BARE)/../include
+ LIB_PATH-yes := $(SRC_PATH_BARE)/../lib
+ INC_PATH-yes := $(SRC_PATH_BARE)/../include
else
LIB_PATH-yes += $(if $(BUILD_PFX),$(BUILD_PFX),.)
INC_PATH-$(CONFIG_VP8_DECODER) += $(SRC_PATH_BARE)/vp8
INC_PATH-$(CONFIG_VP8_ENCODER) += $(SRC_PATH_BARE)/vp8
INC_PATH-$(CONFIG_VP9_DECODER) += $(SRC_PATH_BARE)/vp9
INC_PATH-$(CONFIG_VP9_ENCODER) += $(SRC_PATH_BARE)/vp9
- LIB_PATH := $(call enabled,LIB_PATH)
- INC_PATH := $(call enabled,INC_PATH)
endif
+INC_PATH-$(CONFIG_LIBYUV) += $(SRC_PATH_BARE)/third_party/libyuv/include
+LIB_PATH := $(call enabled,LIB_PATH)
+INC_PATH := $(call enabled,INC_PATH)
INTERNAL_CFLAGS = $(addprefix -I,$(INC_PATH))
INTERNAL_LDFLAGS += $(addprefix -L,$(LIB_PATH))
diff --git a/third_party/libyuv/README.libvpx b/third_party/libyuv/README.libvpx
index 577b42d0a..fa5b498ca 100644
--- a/third_party/libyuv/README.libvpx
+++ b/third_party/libyuv/README.libvpx
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1005
+Version: 1041
License: BSD
License File: LICENSE
@@ -13,5 +13,4 @@ which down-samples the original input video (f.g. 1280x720) a number of times
in order to encode multiple resolution bit streams.
Local Modifications:
-Modified the original scaler code minimally with include file changes to fit
-in our current build system.
+None.
diff --git a/third_party/libyuv/include/libyuv/compare.h b/third_party/libyuv/include/libyuv/compare.h
new file mode 100644
index 000000000..5dfac7c86
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/compare.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_COMPARE_H_ // NOLINT
+#define INCLUDE_LIBYUV_COMPARE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Compute a hash for specified memory. Seed of 5381 recommended.
+LIBYUV_API
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed);
+
+// Sum Square Error - used to compute Mean Square Error or PSNR.
+LIBYUV_API
+uint64 ComputeSumSquareError(const uint8* src_a,
+ const uint8* src_b, int count);
+
+LIBYUV_API
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height);
+
+static const int kMaxPsnr = 128;
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64 sse, uint64 count);
+
+LIBYUV_API
+double CalcFramePsnr(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height);
+
+LIBYUV_API
+double I420Psnr(const uint8* src_y_a, int stride_y_a,
+ const uint8* src_u_a, int stride_u_a,
+ const uint8* src_v_a, int stride_v_a,
+ const uint8* src_y_b, int stride_y_b,
+ const uint8* src_u_b, int stride_u_b,
+ const uint8* src_v_b, int stride_v_b,
+ int width, int height);
+
+LIBYUV_API
+double CalcFrameSsim(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height);
+
+LIBYUV_API
+double I420Ssim(const uint8* src_y_a, int stride_y_a,
+ const uint8* src_u_a, int stride_u_a,
+ const uint8* src_v_a, int stride_v_a,
+ const uint8* src_y_b, int stride_y_b,
+ const uint8* src_u_b, int stride_u_b,
+ const uint8* src_v_b, int stride_v_b,
+ int width, int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_COMPARE_H_ NOLINT
diff --git a/third_party/libyuv/include/libyuv/convert.h b/third_party/libyuv/include/libyuv/convert.h
new file mode 100644
index 000000000..1bd45c837
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/convert.h
@@ -0,0 +1,254 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_H_ // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_H_
+
+#include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following headers includes.
+#include "libyuv/convert_from.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert I444 to I420.
+LIBYUV_API
+int I444ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert I422 to I420.
+LIBYUV_API
+int I422ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert I411 to I420.
+LIBYUV_API
+int I411ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Copy I420 to I420.
+#define I420ToI420 I420Copy
+LIBYUV_API
+int I420Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert I400 (grey) to I420.
+LIBYUV_API
+int I400ToI420(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert NV12 to I420.
+LIBYUV_API
+int NV12ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert NV21 to I420.
+LIBYUV_API
+int NV21ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_vu, int src_stride_vu,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert M420 to I420.
+LIBYUV_API
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert Q420 to I420.
+LIBYUV_API
+int Q420ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// ARGB little endian (bgra in memory) to I420.
+LIBYUV_API
+int ARGBToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// BGRA little endian (argb in memory) to I420.
+LIBYUV_API
+int BGRAToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// ABGR little endian (rgba in memory) to I420.
+LIBYUV_API
+int ABGRToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGBA little endian (abgr in memory) to I420.
+LIBYUV_API
+int RGBAToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB little endian (bgr in memory) to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB big endian (rgb in memory) to I420.
+LIBYUV_API
+int RAWToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB16 (RGBP fourcc) little endian to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB15 (RGBO fourcc) little endian to I420.
+LIBYUV_API
+int ARGB1555ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// RGB12 (R444 fourcc) little endian to I420.
+LIBYUV_API
+int ARGB4444ToI420(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+#ifdef HAVE_JPEG
+// src_width/height provided by capture.
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToI420(const uint8* sample, size_t sample_size,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int src_width, int src_height,
+ int dst_width, int dst_height);
+
+// Query size of MJPG in pixels.
+LIBYUV_API
+int MJPGSize(const uint8* sample, size_t sample_size,
+ int* width, int* height);
+#endif
+
+// Note Bayer formats (BGGR) To I420 are in format_conversion.h
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// "src_size" is needed to parse MJPG.
+// "dst_stride_y" number of bytes in a row of the dst_y plane.
+// Normally this would be the same as dst_width, with recommended alignment
+// to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected. The caller should
+// allocate the I420 buffer according to rotation.
+// "dst_stride_u" number of bytes in a row of the dst_u plane.
+// Normally this would be the same as (dst_width + 1) / 2, with
+// recommended alignment to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected.
+// "crop_x" and "crop_y" are starting position for cropping.
+// To center, crop_x = (src_width - dst_width) / 2
+// crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+// "src_height" can be negative indicating a vertically flipped image source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+// Must be less than or equal to src_width/src_height
+// Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "format" is a fourcc. ie 'I420', 'YUY2'
+// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
+LIBYUV_API
+int ConvertToI420(const uint8* src_frame, size_t src_size,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int crop_width, int crop_height,
+ enum RotationMode rotation,
+ uint32 format);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_H_ NOLINT
diff --git a/third_party/libyuv/include/libyuv/convert_argb.h b/third_party/libyuv/include/libyuv/convert_argb.h
new file mode 100644
index 000000000..a18014ca2
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/convert_argb.h
@@ -0,0 +1,225 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_ARGB_H_
+
+#include "libyuv/basic_types.h"
+// TODO(fbarchard): Remove the following headers includes
+#include "libyuv/convert_from.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+
+// TODO(fbarchard): This set of functions should exactly match convert.h
+// Add missing Q420.
+// TODO(fbarchard): Add tests. Create random content of right size and convert
+// with C vs Opt and or to I420 and compare.
+// TODO(fbarchard): Some of these functions lack parameter setting.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Alias.
+#define ARGBToARGB ARGBCopy
+
+// Copy ARGB to ARGB.
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I411 to ARGB.
+LIBYUV_API
+int I411ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert I400 (grey) to ARGB.
+LIBYUV_API
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Alias.
+#define YToARGB I400ToARGB_Reference
+
+// Convert I400 to ARGB. Reverse of ARGBToI400.
+LIBYUV_API
+int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_vu, int src_stride_vu,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert M420 to ARGB.
+LIBYUV_API
+int M420ToARGB(const uint8* src_m420, int src_stride_m420,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// TODO(fbarchard): Convert Q420 to ARGB.
+// LIBYUV_API
+// int Q420ToARGB(const uint8* src_y, int src_stride_y,
+// const uint8* src_yuy2, int src_stride_yuy2,
+// uint8* dst_argb, int dst_stride_argb,
+// int width, int height);
+
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// BGRA little endian (argb in memory) to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// ABGR little endian (rgba in memory) to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGBA little endian (abgr in memory) to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Deprecated function name.
+#define BG24ToARGB RGB24ToARGB
+
+// RGB little endian (bgr in memory) to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGB big endian (rgb in memory) to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGB16 (RGBP fourcc) little endian to ARGB.
+LIBYUV_API
+int RGB565ToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGB15 (RGBO fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// RGB12 (R444 fourcc) little endian to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8* src_frame, int src_stride_frame,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+#ifdef HAVE_JPEG
+// src_width/height provided by capture
+// dst_width/height for clipping determine final size.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample, size_t sample_size,
+ uint8* dst_argb, int dst_stride_argb,
+ int src_width, int src_height,
+ int dst_width, int dst_height);
+#endif
+
+// Note Bayer formats (BGGR) to ARGB are in format_conversion.h.
+
+// Convert camera sample to ARGB with cropping, rotation and vertical flip.
+// "src_size" is needed to parse MJPG.
+// "dst_stride_argb" number of bytes in a row of the dst_argb plane.
+// Normally this would be the same as dst_width, with recommended alignment
+// to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected. The caller should
+// allocate the I420 buffer according to rotation.
+// "dst_stride_u" number of bytes in a row of the dst_u plane.
+// Normally this would be the same as (dst_width + 1) / 2, with
+// recommended alignment to 16 bytes for better efficiency.
+// If rotation of 90 or 270 is used, stride is affected.
+// "crop_x" and "crop_y" are starting position for cropping.
+// To center, crop_x = (src_width - dst_width) / 2
+// crop_y = (src_height - dst_height) / 2
+// "src_width" / "src_height" is size of src_frame in pixels.
+// "src_height" can be negative indicating a vertically flipped image source.
+// "crop_width" / "crop_height" is the size to crop the src to.
+// Must be less than or equal to src_width/src_height
+// Cropping parameters are pre-rotation.
+// "rotation" can be 0, 90, 180 or 270.
+// "format" is a fourcc. ie 'I420', 'YUY2'
+// Returns 0 for successful; -1 for invalid parameter. Non-zero for failure.
+LIBYUV_API
+int ConvertToARGB(const uint8* src_frame, size_t src_size,
+ uint8* dst_argb, int dst_stride_argb,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int crop_width, int crop_height,
+ enum RotationMode rotation,
+ uint32 format);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_ARGB_H_ NOLINT
diff --git a/third_party/libyuv/include/libyuv/convert_from.h b/third_party/libyuv/include/libyuv/convert_from.h
new file mode 100644
index 000000000..b1cf57f7d
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/convert_from.h
@@ -0,0 +1,173 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_H_ // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_FROM_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// See Also convert.h for conversions from formats to I420.
+
+// I420Copy in convert to I420ToI420.
+
+LIBYUV_API
+int I420ToI422(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int I420ToI444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int I420ToI411(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Copy to I400. Source can be I420, I422, I444, I400, NV12 or NV21.
+LIBYUV_API
+int I400Copy(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+// TODO(fbarchard): I420ToM420
+// TODO(fbarchard): I420ToQ420
+
+LIBYUV_API
+int I420ToNV12(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height);
+
+LIBYUV_API
+int I420ToNV21(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_vu, int dst_stride_vu,
+ int width, int height);
+
+LIBYUV_API
+int I420ToYUY2(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToUYVY(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int I420ToRGBA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height);
+
+LIBYUV_API
+int I420ToRGB24(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToRAW(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+// Note Bayer formats (BGGR) To I420 are in format_conversion.h.
+
+// Convert I420 to specified format.
+// "dst_sample_stride" is bytes in a row for the destination. Pass 0 if the
+// buffer has contiguous rows. Can be negative. A multiple of 16 is optimal.
+LIBYUV_API
+int ConvertFromI420(const uint8* y, int y_stride,
+ const uint8* u, int u_stride,
+ const uint8* v, int v_stride,
+ uint8* dst_sample, int dst_sample_stride,
+ int width, int height,
+ uint32 format);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_FROM_H_ NOLINT
diff --git a/third_party/libyuv/include/libyuv/convert_from_argb.h b/third_party/libyuv/include/libyuv/convert_from_argb.h
new file mode 100644
index 000000000..90f43af04
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/convert_from_argb.h
@@ -0,0 +1,166 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB to ARGB.
+#define ARGBToARGB ARGBCopy
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Convert ARGB To BGRA.
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bgra, int dst_stride_bgra,
+ int width, int height);
+
+// Convert ARGB To ABGR.
+LIBYUV_API
+int ARGBToABGR(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_abgr, int dst_stride_abgr,
+ int width, int height);
+
+// Convert ARGB To RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height);
+
+// Convert ARGB To RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb24, int dst_stride_rgb24,
+ int width, int height);
+
+// Convert ARGB To RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb, int dst_stride_rgb,
+ int width, int height);
+
+// Convert ARGB To RGB565.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height);
+
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb1555, int dst_stride_argb1555,
+ int width, int height);
+
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb4444, int dst_stride_argb4444,
+ int width, int height);
+
+// Convert ARGB To I444.
+LIBYUV_API
+int ARGBToI444(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB To I422.
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB To I420. (also in convert.h)
+LIBYUV_API
+int ARGBToI420(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB to J420. (JPeg full range I420).
+LIBYUV_API
+int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yj, int dst_stride_yj,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB To I411.
+LIBYUV_API
+int ARGBToI411(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Convert ARGB to J400. (JPeg full range).
+LIBYUV_API
+int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yj, int dst_stride_yj,
+ int width, int height);
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height);
+
+// Convert ARGB To NV12.
+LIBYUV_API
+int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height);
+
+// Convert ARGB To NV21.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_vu, int dst_stride_vu,
+ int width, int height);
+
+// Convert ARGB To NV21.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_vu, int dst_stride_vu,
+ int width, int height);
+
+// Convert ARGB To YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yuy2, int dst_stride_yuy2,
+ int width, int height);
+
+// Convert ARGB To UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_uyvy, int dst_stride_uyvy,
+ int width, int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_CONVERT_FROM_ARGB_H_ NOLINT
diff --git a/third_party/libyuv/include/libyuv/cpu_id.h b/third_party/libyuv/include/libyuv/cpu_id.h
index fd6276b49..dc858a814 100644
--- a/third_party/libyuv/include/libyuv/cpu_id.h
+++ b/third_party/libyuv/include/libyuv/cpu_id.h
@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_CPU_ID_H_ // NOLINT
#define INCLUDE_LIBYUV_CPU_ID_H_
-#include "basic_types.h"
+#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
diff --git a/third_party/libyuv/include/libyuv/format_conversion.h b/third_party/libyuv/include/libyuv/format_conversion.h
new file mode 100644
index 000000000..b18bf0534
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/format_conversion.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_FORMATCONVERSION_H_ // NOLINT
+#define INCLUDE_LIBYUV_FORMATCONVERSION_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert Bayer RGB formats to I420.
+LIBYUV_API
+int BayerBGGRToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int BayerGBRGToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int BayerGRBGToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+LIBYUV_API
+int BayerRGGBToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height);
+
+// Temporary API mapper.
+#define BayerRGBToI420(b, bs, f, y, ys, u, us, v, vs, w, h) \
+ BayerToI420(b, bs, y, ys, u, us, v, vs, w, h, f)
+
+LIBYUV_API
+int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height,
+ uint32 src_fourcc_bayer);
+
+// Convert I420 to Bayer RGB formats.
+LIBYUV_API
+int I420ToBayerBGGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToBayerGBRG(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToBayerGRBG(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+LIBYUV_API
+int I420ToBayerRGGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height);
+
+// Temporary API mapper.
+#define I420ToBayerRGB(y, ys, u, us, v, vs, b, bs, f, w, h) \
+ I420ToBayer(y, ys, u, us, v, vs, b, bs, w, h, f)
+
+LIBYUV_API
+int I420ToBayer(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_frame, int dst_stride_frame,
+ int width, int height,
+ uint32 dst_fourcc_bayer);
+
+// Convert Bayer RGB formats to ARGB.
+LIBYUV_API
+int BayerBGGRToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int BayerGBRGToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int BayerGRBGToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+LIBYUV_API
+int BayerRGGBToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height);
+
+// Temporary API mapper.
+#define BayerRGBToARGB(b, bs, f, a, as, w, h) BayerToARGB(b, bs, a, as, w, h, f)
+
+LIBYUV_API
+int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height,
+ uint32 src_fourcc_bayer);
+
+// Converts ARGB to Bayer RGB formats.
+LIBYUV_API
+int ARGBToBayerBGGR(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height);
+
+LIBYUV_API
+int ARGBToBayerGBRG(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height);
+
+LIBYUV_API
+int ARGBToBayerGRBG(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height);
+
+LIBYUV_API
+int ARGBToBayerRGGB(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height);
+
+// Temporary API mapper.
+#define ARGBToBayerRGB(a, as, b, bs, f, w, h) ARGBToBayer(b, bs, a, as, w, h, f)
+
+LIBYUV_API
+int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height,
+ uint32 dst_fourcc_bayer);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_FORMATCONVERSION_H_ NOLINT
diff --git a/third_party/libyuv/include/libyuv/mjpeg_decoder.h b/third_party/libyuv/include/libyuv/mjpeg_decoder.h
new file mode 100644
index 000000000..82fd95df2
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/mjpeg_decoder.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_MJPEG_DECODER_H_ // NOLINT
+#define INCLUDE_LIBYUV_MJPEG_DECODER_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+// NOTE: For a simplified public API use convert.h MJPGToI420().
+
+struct jpeg_common_struct;
+struct jpeg_decompress_struct;
+struct jpeg_source_mgr;
+
+namespace libyuv {
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size);
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+static const uint32 kUnknownDataSize = 0xFFFFFFFF;
+
+enum JpegSubsamplingType {
+ kJpegYuv420,
+ kJpegYuv422,
+ kJpegYuv411,
+ kJpegYuv444,
+ kJpegYuv400,
+ kJpegUnknown
+};
+
+struct Buffer {
+ const uint8* data;
+ int len;
+};
+
+struct BufferVector {
+ Buffer* buffers;
+ int len;
+ int pos;
+};
+
+struct SetJmpErrorMgr;
+
+// MJPEG ("Motion JPEG") is a pseudo-standard video codec where the frames are
+// simply independent JPEG images with a fixed huffman table (which is omitted).
+// It is rarely used in video transmission, but is common as a camera capture
+// format, especially in Logitech devices. This class implements a decoder for
+// MJPEG frames.
+//
+// See http://tools.ietf.org/html/rfc2435
+class LIBYUV_API MJpegDecoder {
+ public:
+ typedef void (*CallbackFunction)(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows);
+
+ static const int kColorSpaceUnknown;
+ static const int kColorSpaceGrayscale;
+ static const int kColorSpaceRgb;
+ static const int kColorSpaceYCbCr;
+ static const int kColorSpaceCMYK;
+ static const int kColorSpaceYCCK;
+
+ MJpegDecoder();
+ ~MJpegDecoder();
+
+ // Loads a new frame, reads its headers, and determines the uncompressed
+ // image format.
+ // Returns LIBYUV_TRUE if image looks valid and format is supported.
+ // If return value is LIBYUV_TRUE, then the values for all the following
+ // getters are populated.
+ // src_len is the size of the compressed mjpeg frame in bytes.
+ LIBYUV_BOOL LoadFrame(const uint8* src, size_t src_len);
+
+ // Returns width of the last loaded frame in pixels.
+ int GetWidth();
+
+ // Returns height of the last loaded frame in pixels.
+ int GetHeight();
+
+ // Returns format of the last loaded frame. The return value is one of the
+ // kColorSpace* constants.
+ int GetColorSpace();
+
+ // Number of color components in the color space.
+ int GetNumComponents();
+
+ // Sample factors of the n-th component.
+ int GetHorizSampFactor(int component);
+
+ int GetVertSampFactor(int component);
+
+ int GetHorizSubSampFactor(int component);
+
+ int GetVertSubSampFactor(int component);
+
+ // Public for testability.
+ int GetImageScanlinesPerImcuRow();
+
+ // Public for testability.
+ int GetComponentScanlinesPerImcuRow(int component);
+
+ // Width of a component in bytes.
+ int GetComponentWidth(int component);
+
+ // Height of a component.
+ int GetComponentHeight(int component);
+
+ // Width of a component in bytes with padding for DCTSIZE. Public for testing.
+ int GetComponentStride(int component);
+
+ // Size of a component in bytes.
+ int GetComponentSize(int component);
+
+ // Call this after LoadFrame() if you decide you don't want to decode it
+ // after all.
+ LIBYUV_BOOL UnloadFrame();
+
+ // Decodes the entire image into a one-buffer-per-color-component format.
+ // dst_width must match exactly. dst_height must be <= to image height; if
+ // less, the image is cropped. "planes" must have size equal to at least
+ // GetNumComponents() and they must point to non-overlapping buffers of size
+ // at least GetComponentSize(i). The pointers in planes are incremented
+ // to point to after the end of the written data.
+ // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+ LIBYUV_BOOL DecodeToBuffers(uint8** planes, int dst_width, int dst_height);
+
+ // Decodes the entire image and passes the data via repeated calls to a
+ // callback function. Each call will get the data for a whole number of
+ // image scanlines.
+ // TODO(fbarchard): Add dst_x, dst_y to allow specific rect to be decoded.
+ LIBYUV_BOOL DecodeToCallback(CallbackFunction fn, void* opaque,
+ int dst_width, int dst_height);
+
+ // The helper function which recognizes the jpeg sub-sampling type.
+ static JpegSubsamplingType JpegSubsamplingTypeHelper(
+ int* subsample_x, int* subsample_y, int number_of_components);
+
+ private:
+
+ void AllocOutputBuffers(int num_outbufs);
+ void DestroyOutputBuffers();
+
+ LIBYUV_BOOL StartDecode();
+ LIBYUV_BOOL FinishDecode();
+
+ void SetScanlinePointers(uint8** data);
+ LIBYUV_BOOL DecodeImcuRow();
+
+ int GetComponentScanlinePadding(int component);
+
+ // A buffer holding the input data for a frame.
+ Buffer buf_;
+ BufferVector buf_vec_;
+
+ jpeg_decompress_struct* decompress_struct_;
+ jpeg_source_mgr* source_mgr_;
+ SetJmpErrorMgr* error_mgr_;
+
+ // LIBYUV_TRUE iff at least one component has scanline padding. (i.e.,
+ // GetComponentScanlinePadding() != 0.)
+ LIBYUV_BOOL has_scanline_padding_;
+
+ // Temporaries used to point to scanline outputs.
+ int num_outbufs_; // Outermost size of all arrays below.
+ uint8*** scanlines_;
+ int* scanlines_sizes_;
+ // Temporary buffer used for decoding when we can't decode directly to the
+ // output buffers. Large enough for just one iMCU row.
+ uint8** databuf_;
+ int* databuf_strides_;
+};
+
+} // namespace libyuv
+
+#endif // __cplusplus
+#endif // INCLUDE_LIBYUV_MJPEG_DECODER_H_ NOLINT
diff --git a/third_party/libyuv/include/libyuv/planar_functions.h b/third_party/libyuv/include/libyuv/planar_functions.h
index 43f8df36d..d10a16985 100644
--- a/third_party/libyuv/include/libyuv/planar_functions.h
+++ b/third_party/libyuv/include/libyuv/planar_functions.h
@@ -11,11 +11,11 @@
#ifndef INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_ // NOLINT
#define INCLUDE_LIBYUV_PLANAR_FUNCTIONS_H_
-#include "basic_types.h"
+#include "libyuv/basic_types.h"
// TODO(fbarchard): Remove the following headers includes.
-// #include "convert.h"
-// #include "convert_argb.h"
+#include "libyuv/convert.h"
+#include "libyuv/convert_argb.h"
#ifdef __cplusplus
namespace libyuv {
diff --git a/third_party/libyuv/include/libyuv/rotate.h b/third_party/libyuv/include/libyuv/rotate.h
new file mode 100644
index 000000000..8af60b895
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/rotate.h
@@ -0,0 +1,117 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_H_ // NOLINT
+#define INCLUDE_LIBYUV_ROTATE_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Supported rotation.
+typedef enum RotationMode {
+ kRotate0 = 0, // No rotation.
+ kRotate90 = 90, // Rotate 90 degrees clockwise.
+ kRotate180 = 180, // Rotate 180 degrees.
+ kRotate270 = 270, // Rotate 270 degrees clockwise.
+
+ // Deprecated.
+ kRotateNone = 0,
+ kRotateClockwise = 90,
+ kRotateCounterClockwise = 270,
+} RotationModeEnum;
+
+// Rotate I420 frame.
+LIBYUV_API
+int I420Rotate(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int src_width, int src_height, enum RotationMode mode);
+
+// Rotate NV12 input and store in I420.
+LIBYUV_API
+int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int src_width, int src_height, enum RotationMode mode);
+
+// Rotate a plane by 0, 90, 180, or 270.
+LIBYUV_API
+int RotatePlane(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int src_width, int src_height, enum RotationMode mode);
+
+// Rotate planes by 90, 180, 270. Deprecated.
+LIBYUV_API
+void RotatePlane90(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+LIBYUV_API
+void RotatePlane180(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+LIBYUV_API
+void RotatePlane270(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+LIBYUV_API
+void RotateUV90(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+// Rotations for when U and V are interleaved.
+// These functions take one input pointer and
+// split the data into two buffers while
+// rotating them. Deprecated.
+LIBYUV_API
+void RotateUV180(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+LIBYUV_API
+void RotateUV270(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+// The 90 and 270 functions are based on transposes.
+// Doing a transpose with reversing the read/write
+// order will result in a rotation by +- 90 degrees.
+// Deprecated.
+LIBYUV_API
+void TransposePlane(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height);
+
+LIBYUV_API
+void TransposeUV(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROTATE_H_ NOLINT
diff --git a/third_party/libyuv/include/libyuv/rotate_argb.h b/third_party/libyuv/include/libyuv/rotate_argb.h
new file mode 100644
index 000000000..660ff5573
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/rotate_argb.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_ROTATE_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_ROTATE_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/rotate.h" // For RotationMode.
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Rotate ARGB frame
+LIBYUV_API
+int ARGBRotate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int src_width, int src_height, enum RotationMode mode);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_ROTATE_ARGB_H_ NOLINT
diff --git a/third_party/libyuv/include/libyuv/row.h b/third_party/libyuv/include/libyuv/row.h
index daf5a45e1..fdfe1ae35 100644
--- a/third_party/libyuv/include/libyuv/row.h
+++ b/third_party/libyuv/include/libyuv/row.h
@@ -13,7 +13,11 @@
#include <stdlib.h> // For malloc.
-#include "basic_types.h"
+#include "libyuv/basic_types.h"
+
+#if defined(__native_client__)
+#include "ppapi/c/pp_macros.h" // For PPAPI_RELEASE
+#endif
#ifdef __cplusplus
namespace libyuv {
@@ -38,7 +42,8 @@ extern "C" {
var = 0
#if defined(__pnacl__) || defined(__CLR_VER) || defined(COVERAGE_ENABLED) || \
- defined(TARGET_IPHONE_SIMULATOR)
+ defined(TARGET_IPHONE_SIMULATOR) || \
+ (defined(_MSC_VER) && defined(__clang__))
#define LIBYUV_DISABLE_X86
#endif
// True if compiling for SSSE3 as a requirement.
@@ -47,7 +52,12 @@ extern "C" {
#endif
// Enable for NaCL pepper 33 for bundle and AVX2 support.
-// #define NEW_BINUTILS
+#if defined(__native_client__) && PPAPI_RELEASE >= 33
+#define NEW_BINUTILS
+#endif
+#if defined(__native_client__) && defined(__arm__) && PPAPI_RELEASE < 37
+#define LIBYUV_DISABLE_NEON
+#endif
// The following are available on all x86 platforms:
#if !defined(LIBYUV_DISABLE_X86) && \
@@ -152,6 +162,11 @@ extern "C" {
#define HAS_YUY2TOYROW_SSE2
#endif
+// The following are available on x64 Visual C:
+#if !defined(LIBYUV_DISABLE_X86) && defined (_M_X64)
+#define HAS_I422TOARGBROW_SSSE3
+#endif
+
// GCC >= 4.7.0 required for AVX2.
#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
#if (__GNUC__ > 4) || (__GNUC__ == 4 && (__GNUC_MINOR__ >= 7))
@@ -235,6 +250,10 @@ extern "C" {
#define HAS_MIRRORROW_SSE2
#endif
+// The following are available on arm64 platforms:
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+#endif
+
// The following are available on Neon platforms:
#if !defined(LIBYUV_DISABLE_NEON) && \
(defined(__ARM_NEON__) || defined(LIBYUV_NEON))
@@ -330,7 +349,8 @@ extern "C" {
#endif
// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
#define HAS_COPYROW_MIPS
#if defined(__mips_dsp) && (__mips_dsp_rev >= 2)
#define HAS_I422TOABGRROW_MIPS_DSPR2
@@ -426,7 +446,7 @@ typedef uint8 uvec8[16];
"lea " #offset "(%q" #base ",%q" #index "," #scale "),%%r14d\n" \
#opcode " (%%r15,%%r14),%" #arg "\n" \
BUNDLEUNLOCK
-#else
+#else // defined(__native_client__) && defined(__x86_64__)
#define BUNDLEALIGN "\n"
#define MEMACCESS(base) "(%" #base ")"
#define MEMACCESS2(offset, base) #offset "(%" #base ")"
@@ -443,6 +463,15 @@ typedef uint8 uvec8[16];
#opcode " %%" #reg ","#offset "(%" #base ",%" #index "," #scale ")\n"
#define MEMOPARG(opcode, offset, base, index, scale, arg) \
#opcode " " #offset "(%" #base ",%" #index "," #scale "),%" #arg "\n"
+#endif // defined(__native_client__) && defined(__x86_64__)
+
+#if defined(__arm__)
+#undef MEMACCESS
+#if defined(__native_client__)
+#define MEMACCESS(base) ".p2align 3\nbic %" #base ", #0xc0000000\n"
+#else
+#define MEMACCESS(base) "\n"
+#endif
#endif
void I444ToARGBRow_NEON(const uint8* src_y,
diff --git a/third_party/libyuv/include/libyuv/scale.h b/third_party/libyuv/include/libyuv/scale.h
index 973d46457..a3bc07e0f 100644
--- a/third_party/libyuv/include/libyuv/scale.h
+++ b/third_party/libyuv/include/libyuv/scale.h
@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_SCALE_H_ // NOLINT
#define INCLUDE_LIBYUV_SCALE_H_
-#include "basic_types.h"
+#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
diff --git a/third_party/libyuv/include/libyuv/scale_argb.h b/third_party/libyuv/include/libyuv/scale_argb.h
new file mode 100644
index 000000000..0c9b36257
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/scale_argb.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_SCALE_ARGB_H_ // NOLINT
+#define INCLUDE_LIBYUV_SCALE_ARGB_H_
+
+#include "libyuv/basic_types.h"
+#include "libyuv/scale.h" // For FilterMode
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ enum FilterMode filtering);
+
+// Clipped scale takes destination rectangle coordinates for clip values.
+LIBYUV_API
+int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ int clip_x, int clip_y, int clip_width, int clip_height,
+ enum FilterMode filtering);
+
+// TODO(fbarchard): Implement this.
+// Scale with YUV conversion to ARGB and clipping.
+LIBYUV_API
+int YUVToARGBScaleClip(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint32 src_fourcc,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ uint32 dst_fourcc,
+ int dst_width, int dst_height,
+ int clip_x, int clip_y, int clip_width, int clip_height,
+ enum FilterMode filtering);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_SCALE_ARGB_H_ NOLINT
diff --git a/third_party/libyuv/include/libyuv/scale_row.h b/third_party/libyuv/include/libyuv/scale_row.h
index 5d91f8f76..8dc0762f2 100644
--- a/third_party/libyuv/include/libyuv/scale_row.h
+++ b/third_party/libyuv/include/libyuv/scale_row.h
@@ -11,7 +11,7 @@
#ifndef INCLUDE_LIBYUV_SCALE_ROW_H_ // NOLINT
#define INCLUDE_LIBYUV_SCALE_ROW_H_
-#include "basic_types.h"
+#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
diff --git a/third_party/libyuv/include/libyuv/version.h b/third_party/libyuv/include/libyuv/version.h
new file mode 100644
index 000000000..912c4c9e0
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/version.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef INCLUDE_LIBYUV_VERSION_H_ // NOLINT
+#define INCLUDE_LIBYUV_VERSION_H_
+
+#define LIBYUV_VERSION 1041
+
+#endif // INCLUDE_LIBYUV_VERSION_H_ NOLINT
diff --git a/third_party/libyuv/include/libyuv/video_common.h b/third_party/libyuv/include/libyuv/video_common.h
new file mode 100644
index 000000000..91acc2ffc
--- /dev/null
+++ b/third_party/libyuv/include/libyuv/video_common.h
@@ -0,0 +1,182 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+// Common definitions for video, including fourcc and VideoFormat.
+
+#ifndef INCLUDE_LIBYUV_VIDEO_COMMON_H_ // NOLINT
+#define INCLUDE_LIBYUV_VIDEO_COMMON_H_
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+//////////////////////////////////////////////////////////////////////////////
+// Definition of FourCC codes
+//////////////////////////////////////////////////////////////////////////////
+
+// Convert four characters to a FourCC code.
+// Needs to be a macro otherwise the OS X compiler complains when the kFormat*
+// constants are used in a switch.
+#ifdef __cplusplus
+#define FOURCC(a, b, c, d) ( \
+ (static_cast<uint32>(a)) | (static_cast<uint32>(b) << 8) | \
+ (static_cast<uint32>(c) << 16) | (static_cast<uint32>(d) << 24))
+#else
+#define FOURCC(a, b, c, d) ( \
+ ((uint32)(a)) | ((uint32)(b) << 8) | /* NOLINT */ \
+ ((uint32)(c) << 16) | ((uint32)(d) << 24)) /* NOLINT */
+#endif
+
+// Some pages discussing FourCC codes:
+// http://www.fourcc.org/yuv.php
+// http://v4l2spec.bytesex.org/spec/book1.htm
+// http://developer.apple.com/quicktime/icefloe/dispatch020.html
+// http://msdn.microsoft.com/library/windows/desktop/dd206750.aspx#nv12
+// http://people.xiph.org/~xiphmont/containers/nut/nut4cc.txt
+
+// FourCC codes grouped according to implementation efficiency.
+// Primary formats should convert in 1 efficient step.
+// Secondary formats are converted in 2 steps.
+// Auxilliary formats call primary converters.
+enum FourCC {
+ // 9 Primary YUV formats: 5 planar, 2 biplanar, 2 packed.
+ FOURCC_I420 = FOURCC('I', '4', '2', '0'),
+ FOURCC_I422 = FOURCC('I', '4', '2', '2'),
+ FOURCC_I444 = FOURCC('I', '4', '4', '4'),
+ FOURCC_I411 = FOURCC('I', '4', '1', '1'),
+ FOURCC_I400 = FOURCC('I', '4', '0', '0'),
+ FOURCC_NV21 = FOURCC('N', 'V', '2', '1'),
+ FOURCC_NV12 = FOURCC('N', 'V', '1', '2'),
+ FOURCC_YUY2 = FOURCC('Y', 'U', 'Y', '2'),
+ FOURCC_UYVY = FOURCC('U', 'Y', 'V', 'Y'),
+
+ // 2 Secondary YUV formats: row biplanar.
+ FOURCC_M420 = FOURCC('M', '4', '2', '0'),
+ FOURCC_Q420 = FOURCC('Q', '4', '2', '0'),
+
+ // 9 Primary RGB formats: 4 32 bpp, 2 24 bpp, 3 16 bpp.
+ FOURCC_ARGB = FOURCC('A', 'R', 'G', 'B'),
+ FOURCC_BGRA = FOURCC('B', 'G', 'R', 'A'),
+ FOURCC_ABGR = FOURCC('A', 'B', 'G', 'R'),
+ FOURCC_24BG = FOURCC('2', '4', 'B', 'G'),
+ FOURCC_RAW = FOURCC('r', 'a', 'w', ' '),
+ FOURCC_RGBA = FOURCC('R', 'G', 'B', 'A'),
+ FOURCC_RGBP = FOURCC('R', 'G', 'B', 'P'), // rgb565 LE.
+ FOURCC_RGBO = FOURCC('R', 'G', 'B', 'O'), // argb1555 LE.
+ FOURCC_R444 = FOURCC('R', '4', '4', '4'), // argb4444 LE.
+
+ // 4 Secondary RGB formats: 4 Bayer Patterns.
+ FOURCC_RGGB = FOURCC('R', 'G', 'G', 'B'),
+ FOURCC_BGGR = FOURCC('B', 'G', 'G', 'R'),
+ FOURCC_GRBG = FOURCC('G', 'R', 'B', 'G'),
+ FOURCC_GBRG = FOURCC('G', 'B', 'R', 'G'),
+
+ // 1 Primary Compressed YUV format.
+ FOURCC_MJPG = FOURCC('M', 'J', 'P', 'G'),
+
+ // 5 Auxiliary YUV variations: 3 with U and V planes are swapped, 1 Alias.
+ FOURCC_YV12 = FOURCC('Y', 'V', '1', '2'),
+ FOURCC_YV16 = FOURCC('Y', 'V', '1', '6'),
+ FOURCC_YV24 = FOURCC('Y', 'V', '2', '4'),
+ FOURCC_YU12 = FOURCC('Y', 'U', '1', '2'), // Linux version of I420.
+ FOURCC_J420 = FOURCC('J', '4', '2', '0'),
+ FOURCC_J400 = FOURCC('J', '4', '0', '0'),
+
+ // 14 Auxiliary aliases. CanonicalFourCC() maps these to canonical fourcc.
+ FOURCC_IYUV = FOURCC('I', 'Y', 'U', 'V'), // Alias for I420.
+ FOURCC_YU16 = FOURCC('Y', 'U', '1', '6'), // Alias for I422.
+ FOURCC_YU24 = FOURCC('Y', 'U', '2', '4'), // Alias for I444.
+ FOURCC_YUYV = FOURCC('Y', 'U', 'Y', 'V'), // Alias for YUY2.
+ FOURCC_YUVS = FOURCC('y', 'u', 'v', 's'), // Alias for YUY2 on Mac.
+ FOURCC_HDYC = FOURCC('H', 'D', 'Y', 'C'), // Alias for UYVY.
+ FOURCC_2VUY = FOURCC('2', 'v', 'u', 'y'), // Alias for UYVY on Mac.
+ FOURCC_JPEG = FOURCC('J', 'P', 'E', 'G'), // Alias for MJPG.
+ FOURCC_DMB1 = FOURCC('d', 'm', 'b', '1'), // Alias for MJPG on Mac.
+ FOURCC_BA81 = FOURCC('B', 'A', '8', '1'), // Alias for BGGR.
+ FOURCC_RGB3 = FOURCC('R', 'G', 'B', '3'), // Alias for RAW.
+ FOURCC_BGR3 = FOURCC('B', 'G', 'R', '3'), // Alias for 24BG.
+ FOURCC_CM32 = FOURCC(0, 0, 0, 32), // Alias for BGRA kCMPixelFormat_32ARGB
+ FOURCC_CM24 = FOURCC(0, 0, 0, 24), // Alias for RAW kCMPixelFormat_24RGB
+ FOURCC_L555 = FOURCC('L', '5', '5', '5'), // Alias for RGBO.
+ FOURCC_L565 = FOURCC('L', '5', '6', '5'), // Alias for RGBP.
+ FOURCC_5551 = FOURCC('5', '5', '5', '1'), // Alias for RGBO.
+
+ // 1 Auxiliary compressed YUV format set aside for capturer.
+ FOURCC_H264 = FOURCC('H', '2', '6', '4'),
+
+ // Match any fourcc.
+ FOURCC_ANY = -1,
+};
+
+enum FourCCBpp {
+ // Canonical fourcc codes used in our code.
+ FOURCC_BPP_I420 = 12,
+ FOURCC_BPP_I422 = 16,
+ FOURCC_BPP_I444 = 24,
+ FOURCC_BPP_I411 = 12,
+ FOURCC_BPP_I400 = 8,
+ FOURCC_BPP_NV21 = 12,
+ FOURCC_BPP_NV12 = 12,
+ FOURCC_BPP_YUY2 = 16,
+ FOURCC_BPP_UYVY = 16,
+ FOURCC_BPP_M420 = 12,
+ FOURCC_BPP_Q420 = 12,
+ FOURCC_BPP_ARGB = 32,
+ FOURCC_BPP_BGRA = 32,
+ FOURCC_BPP_ABGR = 32,
+ FOURCC_BPP_RGBA = 32,
+ FOURCC_BPP_24BG = 24,
+ FOURCC_BPP_RAW = 24,
+ FOURCC_BPP_RGBP = 16,
+ FOURCC_BPP_RGBO = 16,
+ FOURCC_BPP_R444 = 16,
+ FOURCC_BPP_RGGB = 8,
+ FOURCC_BPP_BGGR = 8,
+ FOURCC_BPP_GRBG = 8,
+ FOURCC_BPP_GBRG = 8,
+ FOURCC_BPP_YV12 = 12,
+ FOURCC_BPP_YV16 = 16,
+ FOURCC_BPP_YV24 = 24,
+ FOURCC_BPP_YU12 = 12,
+ FOURCC_BPP_J420 = 12,
+ FOURCC_BPP_J400 = 8,
+ FOURCC_BPP_MJPG = 0, // 0 means unknown.
+ FOURCC_BPP_H264 = 0,
+ FOURCC_BPP_IYUV = 12,
+ FOURCC_BPP_YU16 = 16,
+ FOURCC_BPP_YU24 = 24,
+ FOURCC_BPP_YUYV = 16,
+ FOURCC_BPP_YUVS = 16,
+ FOURCC_BPP_HDYC = 16,
+ FOURCC_BPP_2VUY = 16,
+ FOURCC_BPP_JPEG = 1,
+ FOURCC_BPP_DMB1 = 1,
+ FOURCC_BPP_BA81 = 8,
+ FOURCC_BPP_RGB3 = 24,
+ FOURCC_BPP_BGR3 = 24,
+ FOURCC_BPP_CM32 = 32,
+ FOURCC_BPP_CM24 = 24,
+
+ // Match any fourcc.
+ FOURCC_BPP_ANY = 0, // 0 means unknown.
+};
+
+// Converts fourcc aliases into canonical ones.
+LIBYUV_API uint32 CanonicalFourCC(uint32 fourcc);
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
+#endif // INCLUDE_LIBYUV_VIDEO_COMMON_H_ NOLINT
diff --git a/third_party/libyuv/source/compare.cc b/third_party/libyuv/source/compare.cc
new file mode 100644
index 000000000..9ea81b4e2
--- /dev/null
+++ b/third_party/libyuv/source/compare.cc
@@ -0,0 +1,325 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/compare.h"
+
+#include <float.h>
+#include <math.h>
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// hash seed of 5381 recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed);
+
+// This module is for Visual C x86
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__))))
+#define HAS_HASHDJB2_SSE41
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed);
+
+#if _MSC_VER >= 1700
+#define HAS_HASHDJB2_AVX2
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed);
+#endif
+
+#endif // HAS_HASHDJB2_SSE41
+
+// hash seed of 5381 recommended.
+LIBYUV_API
+uint32 HashDjb2(const uint8* src, uint64 count, uint32 seed) {
+ const int kBlockSize = 1 << 15; // 32768;
+ int remainder;
+ uint32 (*HashDjb2_SSE)(const uint8* src, int count, uint32 seed) = HashDjb2_C;
+#if defined(HAS_HASHDJB2_SSE41)
+ if (TestCpuFlag(kCpuHasSSE41)) {
+ HashDjb2_SSE = HashDjb2_SSE41;
+ }
+#endif
+#if defined(HAS_HASHDJB2_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ HashDjb2_SSE = HashDjb2_AVX2;
+ }
+#endif
+
+ while (count >= (uint64)(kBlockSize)) {
+ seed = HashDjb2_SSE(src, kBlockSize, seed);
+ src += kBlockSize;
+ count -= kBlockSize;
+ }
+ remainder = (int)(count) & ~15;
+ if (remainder) {
+ seed = HashDjb2_SSE(src, remainder, seed);
+ src += remainder;
+ count -= remainder;
+ }
+ remainder = (int)(count) & 15;
+ if (remainder) {
+ seed = HashDjb2_C(src, remainder, seed);
+ }
+ return seed;
+}
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count);
+#if !defined(LIBYUV_DISABLE_NEON) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_SUMSQUAREERROR_NEON
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count);
+#endif
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#define HAS_SUMSQUAREERROR_SSE2
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count);
+#endif
+// Visual C 2012 required for AVX2.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && _MSC_VER >= 1700
+#define HAS_SUMSQUAREERROR_AVX2
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count);
+#endif
+
+// TODO(fbarchard): Refactor into row function.
+LIBYUV_API
+uint64 ComputeSumSquareError(const uint8* src_a, const uint8* src_b,
+ int count) {
+ // SumSquareError returns values 0 to 65535 for each squared difference.
+ // Up to 65536 of those can be summed and remain within a uint32.
+ // After each block of 65536 pixels, accumulate into a uint64.
+ const int kBlockSize = 65536;
+ int remainder = count & (kBlockSize - 1) & ~31;
+ uint64 sse = 0;
+ int i;
+ uint32 (*SumSquareError)(const uint8* src_a, const uint8* src_b, int count) =
+ SumSquareError_C;
+#if defined(HAS_SUMSQUAREERROR_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ SumSquareError = SumSquareError_NEON;
+ }
+#endif
+#if defined(HAS_SUMSQUAREERROR_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(src_a, 16) && IS_ALIGNED(src_b, 16)) {
+ // Note only used for multiples of 16 so count is not checked.
+ SumSquareError = SumSquareError_SSE2;
+ }
+#endif
+#if defined(HAS_SUMSQUAREERROR_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2)) {
+ // Note only used for multiples of 32 so count is not checked.
+ SumSquareError = SumSquareError_AVX2;
+ }
+#endif
+#ifdef _OPENMP
+#pragma omp parallel for reduction(+: sse)
+#endif
+ for (i = 0; i < (count - (kBlockSize - 1)); i += kBlockSize) {
+ sse += SumSquareError(src_a + i, src_b + i, kBlockSize);
+ }
+ src_a += count & ~(kBlockSize - 1);
+ src_b += count & ~(kBlockSize - 1);
+ if (remainder) {
+ sse += SumSquareError(src_a, src_b, remainder);
+ src_a += remainder;
+ src_b += remainder;
+ }
+ remainder = count & 31;
+ if (remainder) {
+ sse += SumSquareError_C(src_a, src_b, remainder);
+ }
+ return sse;
+}
+
+LIBYUV_API
+uint64 ComputeSumSquareErrorPlane(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height) {
+ uint64 sse = 0;
+ int h;
+ // Coalesce rows.
+ if (stride_a == width &&
+ stride_b == width) {
+ width *= height;
+ height = 1;
+ stride_a = stride_b = 0;
+ }
+ for (h = 0; h < height; ++h) {
+ sse += ComputeSumSquareError(src_a, src_b, width);
+ src_a += stride_a;
+ src_b += stride_b;
+ }
+ return sse;
+}
+
+LIBYUV_API
+double SumSquareErrorToPsnr(uint64 sse, uint64 count) {
+ double psnr;
+ if (sse > 0) {
+ double mse = (double)(count) / (double)(sse);
+ psnr = 10.0 * log10(255.0 * 255.0 * mse);
+ } else {
+ psnr = kMaxPsnr; // Limit to prevent divide by 0
+ }
+
+ if (psnr > kMaxPsnr)
+ psnr = kMaxPsnr;
+
+ return psnr;
+}
+
+LIBYUV_API
+double CalcFramePsnr(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height) {
+ const uint64 samples = width * height;
+ const uint64 sse = ComputeSumSquareErrorPlane(src_a, stride_a,
+ src_b, stride_b,
+ width, height);
+ return SumSquareErrorToPsnr(sse, samples);
+}
+
+LIBYUV_API
+double I420Psnr(const uint8* src_y_a, int stride_y_a,
+ const uint8* src_u_a, int stride_u_a,
+ const uint8* src_v_a, int stride_v_a,
+ const uint8* src_y_b, int stride_y_b,
+ const uint8* src_u_b, int stride_u_b,
+ const uint8* src_v_b, int stride_v_b,
+ int width, int height) {
+ const uint64 sse_y = ComputeSumSquareErrorPlane(src_y_a, stride_y_a,
+ src_y_b, stride_y_b,
+ width, height);
+ const int width_uv = (width + 1) >> 1;
+ const int height_uv = (height + 1) >> 1;
+ const uint64 sse_u = ComputeSumSquareErrorPlane(src_u_a, stride_u_a,
+ src_u_b, stride_u_b,
+ width_uv, height_uv);
+ const uint64 sse_v = ComputeSumSquareErrorPlane(src_v_a, stride_v_a,
+ src_v_b, stride_v_b,
+ width_uv, height_uv);
+ const uint64 samples = width * height + 2 * (width_uv * height_uv);
+ const uint64 sse = sse_y + sse_u + sse_v;
+ return SumSquareErrorToPsnr(sse, samples);
+}
+
+static const int64 cc1 = 26634; // (64^2*(.01*255)^2
+static const int64 cc2 = 239708; // (64^2*(.03*255)^2
+
+static double Ssim8x8_C(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b) {
+ int64 sum_a = 0;
+ int64 sum_b = 0;
+ int64 sum_sq_a = 0;
+ int64 sum_sq_b = 0;
+ int64 sum_axb = 0;
+
+ int i;
+ for (i = 0; i < 8; ++i) {
+ int j;
+ for (j = 0; j < 8; ++j) {
+ sum_a += src_a[j];
+ sum_b += src_b[j];
+ sum_sq_a += src_a[j] * src_a[j];
+ sum_sq_b += src_b[j] * src_b[j];
+ sum_axb += src_a[j] * src_b[j];
+ }
+
+ src_a += stride_a;
+ src_b += stride_b;
+ }
+
+ {
+ const int64 count = 64;
+ // scale the constants by number of pixels
+ const int64 c1 = (cc1 * count * count) >> 12;
+ const int64 c2 = (cc2 * count * count) >> 12;
+
+ const int64 sum_a_x_sum_b = sum_a * sum_b;
+
+ const int64 ssim_n = (2 * sum_a_x_sum_b + c1) *
+ (2 * count * sum_axb - 2 * sum_a_x_sum_b + c2);
+
+ const int64 sum_a_sq = sum_a*sum_a;
+ const int64 sum_b_sq = sum_b*sum_b;
+
+ const int64 ssim_d = (sum_a_sq + sum_b_sq + c1) *
+ (count * sum_sq_a - sum_a_sq +
+ count * sum_sq_b - sum_b_sq + c2);
+
+ if (ssim_d == 0.0) {
+ return DBL_MAX;
+ }
+ return ssim_n * 1.0 / ssim_d;
+ }
+}
+
+// We are using a 8x8 moving window with starting location of each 8x8 window
+// on the 4x4 pixel grid. Such arrangement allows the windows to overlap
+// block boundaries to penalize blocking artifacts.
+LIBYUV_API
+double CalcFrameSsim(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b,
+ int width, int height) {
+ int samples = 0;
+ double ssim_total = 0;
+ double (*Ssim8x8)(const uint8* src_a, int stride_a,
+ const uint8* src_b, int stride_b) = Ssim8x8_C;
+
+ // sample point start with each 4x4 location
+ int i;
+ for (i = 0; i < height - 8; i += 4) {
+ int j;
+ for (j = 0; j < width - 8; j += 4) {
+ ssim_total += Ssim8x8(src_a + j, stride_a, src_b + j, stride_b);
+ samples++;
+ }
+
+ src_a += stride_a * 4;
+ src_b += stride_b * 4;
+ }
+
+ ssim_total /= samples;
+ return ssim_total;
+}
+
+LIBYUV_API
+double I420Ssim(const uint8* src_y_a, int stride_y_a,
+ const uint8* src_u_a, int stride_u_a,
+ const uint8* src_v_a, int stride_v_a,
+ const uint8* src_y_b, int stride_y_b,
+ const uint8* src_u_b, int stride_u_b,
+ const uint8* src_v_b, int stride_v_b,
+ int width, int height) {
+ const double ssim_y = CalcFrameSsim(src_y_a, stride_y_a,
+ src_y_b, stride_y_b, width, height);
+ const int width_uv = (width + 1) >> 1;
+ const int height_uv = (height + 1) >> 1;
+ const double ssim_u = CalcFrameSsim(src_u_a, stride_u_a,
+ src_u_b, stride_u_b,
+ width_uv, height_uv);
+ const double ssim_v = CalcFrameSsim(src_v_a, stride_v_a,
+ src_v_b, stride_v_b,
+ width_uv, height_uv);
+ return ssim_y * 0.8 + 0.1 * (ssim_u + ssim_v);
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/compare_common.cc b/third_party/libyuv/source/compare_common.cc
new file mode 100644
index 000000000..c546b5182
--- /dev/null
+++ b/third_party/libyuv/source/compare_common.cc
@@ -0,0 +1,42 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+uint32 SumSquareError_C(const uint8* src_a, const uint8* src_b, int count) {
+ uint32 sse = 0u;
+ int i;
+ for (i = 0; i < count; ++i) {
+ int diff = src_a[i] - src_b[i];
+ sse += (uint32)(diff * diff);
+ }
+ return sse;
+}
+
+// hash seed of 5381 recommended.
+// Internal C version of HashDjb2 with int sized count for efficiency.
+uint32 HashDjb2_C(const uint8* src, int count, uint32 seed) {
+ uint32 hash = seed;
+ int i;
+ for (i = 0; i < count; ++i) {
+ hash += (hash << 5) + src[i];
+ }
+ return hash;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/compare_neon.cc b/third_party/libyuv/source/compare_neon.cc
new file mode 100644
index 000000000..5e7b8e443
--- /dev/null
+++ b/third_party/libyuv/source/compare_neon.cc
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+
+uint32 SumSquareError_NEON(const uint8* src_a, const uint8* src_b, int count) {
+ volatile uint32 sse;
+ asm volatile (
+ "vmov.u8 q8, #0 \n"
+ "vmov.u8 q10, #0 \n"
+ "vmov.u8 q9, #0 \n"
+ "vmov.u8 q11, #0 \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n"
+ MEMACCESS(1)
+ "vld1.8 {q1}, [%1]! \n"
+ "subs %2, %2, #16 \n"
+ "vsubl.u8 q2, d0, d2 \n"
+ "vsubl.u8 q3, d1, d3 \n"
+ "vmlal.s16 q8, d4, d4 \n"
+ "vmlal.s16 q9, d6, d6 \n"
+ "vmlal.s16 q10, d5, d5 \n"
+ "vmlal.s16 q11, d7, d7 \n"
+ "bgt 1b \n"
+
+ "vadd.u32 q8, q8, q9 \n"
+ "vadd.u32 q10, q10, q11 \n"
+ "vadd.u32 q11, q8, q10 \n"
+ "vpaddl.u32 q1, q11 \n"
+ "vadd.u64 d0, d2, d3 \n"
+ "vmov.32 %3, d0[0] \n"
+ : "+r"(src_a),
+ "+r"(src_b),
+ "+r"(count),
+ "=r"(sse)
+ :
+ : "memory", "cc", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
+ return sse;
+}
+
+#endif // __ARM_NEON__
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/compare_posix.cc b/third_party/libyuv/source/compare_posix.cc
new file mode 100644
index 000000000..ac361190e
--- /dev/null
+++ b/third_party/libyuv/source/compare_posix.cc
@@ -0,0 +1,158 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_X86) && (defined(__x86_64__) || defined(__i386__))
+
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+ uint32 sse;
+ asm volatile ( // NOLINT
+ "pxor %%xmm0,%%xmm0 \n"
+ "pxor %%xmm5,%%xmm5 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqa " MEMACCESS(0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10, 0) ",%0 \n"
+ "movdqa " MEMACCESS(1) ",%%xmm2 \n"
+ "lea " MEMLEA(0x10, 1) ",%1 \n"
+ "sub $0x10,%2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "psubusb %%xmm2,%%xmm1 \n"
+ "psubusb %%xmm3,%%xmm2 \n"
+ "por %%xmm2,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm5,%%xmm1 \n"
+ "punpckhbw %%xmm5,%%xmm2 \n"
+ "pmaddwd %%xmm1,%%xmm1 \n"
+ "pmaddwd %%xmm2,%%xmm2 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "paddd %%xmm2,%%xmm0 \n"
+ "jg 1b \n"
+
+ "pshufd $0xee,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "pshufd $0x1,%%xmm0,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "movd %%xmm0,%3 \n"
+
+ : "+r"(src_a), // %0
+ "+r"(src_b), // %1
+ "+r"(count), // %2
+ "=g"(sse) // %3
+ :
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm5"
+#endif
+ ); // NOLINT
+ return sse;
+}
+
+#endif // defined(__x86_64__) || defined(__i386__)
+
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
+#define HAS_HASHDJB2_SSE41
+static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
+static uvec32 kHashMul0 = {
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
+};
+static uvec32 kHashMul1 = {
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
+};
+static uvec32 kHashMul2 = {
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
+};
+static uvec32 kHashMul3 = {
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
+};
+
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+ uint32 hash;
+ asm volatile ( // NOLINT
+ "movd %2,%%xmm0 \n"
+ "pxor %%xmm7,%%xmm7 \n"
+ "movdqa %4,%%xmm6 \n"
+ LABELALIGN
+ "1: \n"
+ "movdqu " MEMACCESS(0) ",%%xmm1 \n"
+ "lea " MEMLEA(0x10, 0) ",%0 \n"
+ "pmulld %%xmm6,%%xmm0 \n"
+ "movdqa %5,%%xmm5 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklbw %%xmm7,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "punpcklwd %%xmm7,%%xmm3 \n"
+ "pmulld %%xmm5,%%xmm3 \n"
+ "movdqa %6,%%xmm5 \n"
+ "movdqa %%xmm2,%%xmm4 \n"
+ "punpckhwd %%xmm7,%%xmm4 \n"
+ "pmulld %%xmm5,%%xmm4 \n"
+ "movdqa %7,%%xmm5 \n"
+ "punpckhbw %%xmm7,%%xmm1 \n"
+ "movdqa %%xmm1,%%xmm2 \n"
+ "punpcklwd %%xmm7,%%xmm2 \n"
+ "pmulld %%xmm5,%%xmm2 \n"
+ "movdqa %8,%%xmm5 \n"
+ "punpckhwd %%xmm7,%%xmm1 \n"
+ "pmulld %%xmm5,%%xmm1 \n"
+ "paddd %%xmm4,%%xmm3 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "sub $0x10,%1 \n"
+ "paddd %%xmm3,%%xmm1 \n"
+ "pshufd $0xe,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "pshufd $0x1,%%xmm1,%%xmm2 \n"
+ "paddd %%xmm2,%%xmm1 \n"
+ "paddd %%xmm1,%%xmm0 \n"
+ "jg 1b \n"
+ "movd %%xmm0,%3 \n"
+ : "+r"(src), // %0
+ "+r"(count), // %1
+ "+rm"(seed), // %2
+ "=g"(hash) // %3
+ : "m"(kHash16x33), // %4
+ "m"(kHashMul0), // %5
+ "m"(kHashMul1), // %6
+ "m"(kHashMul2), // %7
+ "m"(kHashMul3) // %8
+ : "memory", "cc"
+#if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+#endif
+ ); // NOLINT
+ return hash;
+}
+#endif // defined(__x86_64__) || (defined(__i386__) && !defined(__pic__)))
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
diff --git a/third_party/libyuv/source/compare_win.cc b/third_party/libyuv/source/compare_win.cc
new file mode 100644
index 000000000..99831651f
--- /dev/null
+++ b/third_party/libyuv/source/compare_win.cc
@@ -0,0 +1,232 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+__declspec(naked) __declspec(align(16))
+uint32 SumSquareError_SSE2(const uint8* src_a, const uint8* src_b, int count) {
+ __asm {
+ mov eax, [esp + 4] // src_a
+ mov edx, [esp + 8] // src_b
+ mov ecx, [esp + 12] // count
+ pxor xmm0, xmm0
+ pxor xmm5, xmm5
+
+ align 4
+ wloop:
+ movdqa xmm1, [eax]
+ lea eax, [eax + 16]
+ movdqa xmm2, [edx]
+ lea edx, [edx + 16]
+ sub ecx, 16
+ movdqa xmm3, xmm1 // abs trick
+ psubusb xmm1, xmm2
+ psubusb xmm2, xmm3
+ por xmm1, xmm2
+ movdqa xmm2, xmm1
+ punpcklbw xmm1, xmm5
+ punpckhbw xmm2, xmm5
+ pmaddwd xmm1, xmm1
+ pmaddwd xmm2, xmm2
+ paddd xmm0, xmm1
+ paddd xmm0, xmm2
+ jg wloop
+
+ pshufd xmm1, xmm0, 0xee
+ paddd xmm0, xmm1
+ pshufd xmm1, xmm0, 0x01
+ paddd xmm0, xmm1
+ movd eax, xmm0
+ ret
+ }
+}
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+// C4752: found Intel(R) Advanced Vector Extensions; consider using /arch:AVX.
+#pragma warning(disable: 4752)
+__declspec(naked) __declspec(align(16))
+uint32 SumSquareError_AVX2(const uint8* src_a, const uint8* src_b, int count) {
+ __asm {
+ mov eax, [esp + 4] // src_a
+ mov edx, [esp + 8] // src_b
+ mov ecx, [esp + 12] // count
+ vpxor ymm0, ymm0, ymm0 // sum
+ vpxor ymm5, ymm5, ymm5 // constant 0 for unpck
+ sub edx, eax
+
+ align 4
+ wloop:
+ vmovdqu ymm1, [eax]
+ vmovdqu ymm2, [eax + edx]
+ lea eax, [eax + 32]
+ sub ecx, 32
+ vpsubusb ymm3, ymm1, ymm2 // abs difference trick
+ vpsubusb ymm2, ymm2, ymm1
+ vpor ymm1, ymm2, ymm3
+ vpunpcklbw ymm2, ymm1, ymm5 // u16. mutates order.
+ vpunpckhbw ymm1, ymm1, ymm5
+ vpmaddwd ymm2, ymm2, ymm2 // square + hadd to u32.
+ vpmaddwd ymm1, ymm1, ymm1
+ vpaddd ymm0, ymm0, ymm1
+ vpaddd ymm0, ymm0, ymm2
+ jg wloop
+
+ vpshufd ymm1, ymm0, 0xee // 3, 2 + 1, 0 both lanes.
+ vpaddd ymm0, ymm0, ymm1
+ vpshufd ymm1, ymm0, 0x01 // 1 + 0 both lanes.
+ vpaddd ymm0, ymm0, ymm1
+ vpermq ymm1, ymm0, 0x02 // high + low lane.
+ vpaddd ymm0, ymm0, ymm1
+ vmovd eax, xmm0
+ vzeroupper
+ ret
+ }
+}
+#endif // _MSC_VER >= 1700
+
+#define HAS_HASHDJB2_SSE41
+static uvec32 kHash16x33 = { 0x92d9e201, 0, 0, 0 }; // 33 ^ 16
+static uvec32 kHashMul0 = {
+ 0x0c3525e1, // 33 ^ 15
+ 0xa3476dc1, // 33 ^ 14
+ 0x3b4039a1, // 33 ^ 13
+ 0x4f5f0981, // 33 ^ 12
+};
+static uvec32 kHashMul1 = {
+ 0x30f35d61, // 33 ^ 11
+ 0x855cb541, // 33 ^ 10
+ 0x040a9121, // 33 ^ 9
+ 0x747c7101, // 33 ^ 8
+};
+static uvec32 kHashMul2 = {
+ 0xec41d4e1, // 33 ^ 7
+ 0x4cfa3cc1, // 33 ^ 6
+ 0x025528a1, // 33 ^ 5
+ 0x00121881, // 33 ^ 4
+};
+static uvec32 kHashMul3 = {
+ 0x00008c61, // 33 ^ 3
+ 0x00000441, // 33 ^ 2
+ 0x00000021, // 33 ^ 1
+ 0x00000001, // 33 ^ 0
+};
+
+// 27: 66 0F 38 40 C6 pmulld xmm0,xmm6
+// 44: 66 0F 38 40 DD pmulld xmm3,xmm5
+// 59: 66 0F 38 40 E5 pmulld xmm4,xmm5
+// 72: 66 0F 38 40 D5 pmulld xmm2,xmm5
+// 83: 66 0F 38 40 CD pmulld xmm1,xmm5
+#define pmulld(reg) _asm _emit 0x66 _asm _emit 0x0F _asm _emit 0x38 \
+ _asm _emit 0x40 _asm _emit reg
+
+__declspec(naked) __declspec(align(16))
+uint32 HashDjb2_SSE41(const uint8* src, int count, uint32 seed) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov ecx, [esp + 8] // count
+ movd xmm0, [esp + 12] // seed
+
+ pxor xmm7, xmm7 // constant 0 for unpck
+ movdqa xmm6, kHash16x33
+
+ align 4
+ wloop:
+ movdqu xmm1, [eax] // src[0-15]
+ lea eax, [eax + 16]
+ pmulld(0xc6) // pmulld xmm0,xmm6 hash *= 33 ^ 16
+ movdqa xmm5, kHashMul0
+ movdqa xmm2, xmm1
+ punpcklbw xmm2, xmm7 // src[0-7]
+ movdqa xmm3, xmm2
+ punpcklwd xmm3, xmm7 // src[0-3]
+ pmulld(0xdd) // pmulld xmm3, xmm5
+ movdqa xmm5, kHashMul1
+ movdqa xmm4, xmm2
+ punpckhwd xmm4, xmm7 // src[4-7]
+ pmulld(0xe5) // pmulld xmm4, xmm5
+ movdqa xmm5, kHashMul2
+ punpckhbw xmm1, xmm7 // src[8-15]
+ movdqa xmm2, xmm1
+ punpcklwd xmm2, xmm7 // src[8-11]
+ pmulld(0xd5) // pmulld xmm2, xmm5
+ movdqa xmm5, kHashMul3
+ punpckhwd xmm1, xmm7 // src[12-15]
+ pmulld(0xcd) // pmulld xmm1, xmm5
+ paddd xmm3, xmm4 // add 16 results
+ paddd xmm1, xmm2
+ sub ecx, 16
+ paddd xmm1, xmm3
+
+ pshufd xmm2, xmm1, 0x0e // upper 2 dwords
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0x01
+ paddd xmm1, xmm2
+ paddd xmm0, xmm1
+ jg wloop
+
+ movd eax, xmm0 // return hash
+ ret
+ }
+}
+
+// Visual C 2012 required for AVX2.
+#if _MSC_VER >= 1700
+__declspec(naked) __declspec(align(16))
+uint32 HashDjb2_AVX2(const uint8* src, int count, uint32 seed) {
+ __asm {
+ mov eax, [esp + 4] // src
+ mov ecx, [esp + 8] // count
+ movd xmm0, [esp + 12] // seed
+ movdqa xmm6, kHash16x33
+
+ align 4
+ wloop:
+ vpmovzxbd xmm3, dword ptr [eax] // src[0-3]
+ pmulld xmm0, xmm6 // hash *= 33 ^ 16
+ vpmovzxbd xmm4, dword ptr [eax + 4] // src[4-7]
+ pmulld xmm3, kHashMul0
+ vpmovzxbd xmm2, dword ptr [eax + 8] // src[8-11]
+ pmulld xmm4, kHashMul1
+ vpmovzxbd xmm1, dword ptr [eax + 12] // src[12-15]
+ pmulld xmm2, kHashMul2
+ lea eax, [eax + 16]
+ pmulld xmm1, kHashMul3
+ paddd xmm3, xmm4 // add 16 results
+ paddd xmm1, xmm2
+ sub ecx, 16
+ paddd xmm1, xmm3
+ pshufd xmm2, xmm1, 0x0e // upper 2 dwords
+ paddd xmm1, xmm2
+ pshufd xmm2, xmm1, 0x01
+ paddd xmm1, xmm2
+ paddd xmm0, xmm1
+ jg wloop
+
+ movd eax, xmm0 // return hash
+ ret
+ }
+}
+#endif // _MSC_VER >= 1700
+
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/convert.cc b/third_party/libyuv/source/convert.cc
new file mode 100644
index 000000000..874a6cb7c
--- /dev/null
+++ b/third_party/libyuv/source/convert.cc
@@ -0,0 +1,1513 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/scale.h" // For ScalePlane()
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// Any I4xx To I420 format with mirroring.
+static int I4xxToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int src_y_width, int src_y_height,
+ int src_uv_width, int src_uv_height) {
+ const int dst_y_width = Abs(src_y_width);
+ const int dst_y_height = Abs(src_y_height);
+ const int dst_uv_width = SUBSAMPLE(dst_y_width, 1, 1);
+ const int dst_uv_height = SUBSAMPLE(dst_y_height, 1, 1);
+ if (src_y_width == 0 || src_y_height == 0 ||
+ src_uv_width == 0 || src_uv_height == 0) {
+ return -1;
+ }
+ ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
+ dst_y, dst_stride_y, dst_y_width, dst_y_height,
+ kFilterBilinear);
+ ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
+ dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
+ kFilterBilinear);
+ ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
+ dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
+ kFilterBilinear);
+ return 0;
+}
+
+// Copy I420 with optional flipping
+// TODO(fbarchard): Use Scale plane which supports mirroring, but ensure
+// is does row coalescing.
+LIBYUV_API
+int I420Copy(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_u || !src_v ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ if (dst_y) {
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ }
+ // Copy UV planes.
+ CopyPlane(src_u, src_stride_u, dst_u, dst_stride_u, halfwidth, halfheight);
+ CopyPlane(src_v, src_stride_v, dst_v, dst_stride_v, halfwidth, halfheight);
+ return 0;
+}
+
+// 422 chroma is 1/2 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I422ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ const int src_uv_width = SUBSAMPLE(width, 1, 1);
+ return I4xxToI420(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ src_uv_width, height);
+}
+
+// 444 chroma is 1x width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I444ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ return I4xxToI420(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ width, height);
+}
+
+// 411 chroma is 1/4 width, 1x height
+// 420 chroma is 1/2 width, 1/2 height
+LIBYUV_API
+int I411ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ const int src_uv_width = SUBSAMPLE(width, 3, 2);
+ return I4xxToI420(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ src_uv_width, height);
+}
+
+// I400 is greyscale typically used in MJPG
+LIBYUV_API
+int I400ToI420(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ SetPlane(dst_u, dst_stride_u, halfwidth, halfheight, 128);
+ SetPlane(dst_v, dst_stride_v, halfwidth, halfheight, 128);
+ return 0;
+}
+
+static void CopyPlane2(const uint8* src, int src_stride_0, int src_stride_1,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ int y;
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_COPYROW_X86)
+ if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
+ CopyRow = CopyRow_X86;
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+ IS_ALIGNED(src, 16) &&
+ IS_ALIGNED(src_stride_0, 16) && IS_ALIGNED(src_stride_1, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ CopyRow = CopyRow_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+ CopyRow = CopyRow_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
+ }
+#endif
+
+ // Copy plane
+ for (y = 0; y < height - 1; y += 2) {
+ CopyRow(src, dst, width);
+ CopyRow(src + src_stride_0, dst + dst_stride, width);
+ src += src_stride_0 + src_stride_1;
+ dst += dst_stride * 2;
+ }
+ if (height & 1) {
+ CopyRow(src, dst, width);
+ }
+}
+
+// Support converting from FOURCC_M420
+// Useful for bandwidth constrained transports like USB 1.0 and 2.0 and for
+// easy conversion to I420.
+// M420 format description:
+// M420 is row biplanar 420: 2 rows of Y and 1 row of UV.
+// Chroma is half width / half height. (420)
+// src_stride_m420 is row planar. Normally this will be the width in pixels.
+// The UV plane is half width, but 2 values, so src_stride_m420 applies to
+// this as well as the two Y planes.
+static int X420ToI420(const uint8* src_y,
+ int src_stride_y0, int src_stride_y1,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ void (*SplitUVRow)(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix) =
+ SplitUVRow_C;
+ if (!src_y || !src_uv ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+ dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+ // Coalesce rows.
+ if (src_stride_y0 == width &&
+ src_stride_y1 == width &&
+ dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y0 = src_stride_y1 = dst_stride_y = 0;
+ }
+ // Coalesce rows.
+ if (src_stride_uv == halfwidth * 2 &&
+ dst_stride_u == halfwidth &&
+ dst_stride_v == halfwidth) {
+ halfwidth *= halfheight;
+ halfheight = 1;
+ src_stride_uv = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_SPLITUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+ SplitUVRow = SplitUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ SplitUVRow = SplitUVRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_uv, 16) && IS_ALIGNED(src_stride_uv, 16) &&
+ IS_ALIGNED(dst_u, 16) && IS_ALIGNED(dst_stride_u, 16) &&
+ IS_ALIGNED(dst_v, 16) && IS_ALIGNED(dst_stride_v, 16)) {
+ SplitUVRow = SplitUVRow_SSE2;
+ }
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+ SplitUVRow = SplitUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ SplitUVRow = SplitUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+ SplitUVRow = SplitUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ SplitUVRow = SplitUVRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_SPLITUVROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && halfwidth >= 16) {
+ SplitUVRow = SplitUVRow_Any_MIPS_DSPR2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ SplitUVRow = SplitUVRow_Unaligned_MIPS_DSPR2;
+ if (IS_ALIGNED(src_uv, 4) && IS_ALIGNED(src_stride_uv, 4) &&
+ IS_ALIGNED(dst_u, 4) && IS_ALIGNED(dst_stride_u, 4) &&
+ IS_ALIGNED(dst_v, 4) && IS_ALIGNED(dst_stride_v, 4)) {
+ SplitUVRow = SplitUVRow_MIPS_DSPR2;
+ }
+ }
+ }
+#endif
+
+ if (dst_y) {
+ if (src_stride_y0 == src_stride_y1) {
+ CopyPlane(src_y, src_stride_y0, dst_y, dst_stride_y, width, height);
+ } else {
+ CopyPlane2(src_y, src_stride_y0, src_stride_y1, dst_y, dst_stride_y,
+ width, height);
+ }
+ }
+
+ for (y = 0; y < halfheight; ++y) {
+ // Copy a row of UV.
+ SplitUVRow(src_uv, dst_u, dst_v, halfwidth);
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ src_uv += src_stride_uv;
+ }
+ return 0;
+}
+
+// Convert NV12 to I420.
+LIBYUV_API
+int NV12ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ return X420ToI420(src_y, src_stride_y, src_stride_y,
+ src_uv, src_stride_uv,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height);
+}
+
+// Convert NV21 to I420. Same as NV12 but u and v pointers swapped.
+LIBYUV_API
+int NV21ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_vu, int src_stride_vu,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ return X420ToI420(src_y, src_stride_y, src_stride_y,
+ src_vu, src_stride_vu,
+ dst_y, dst_stride_y,
+ dst_v, dst_stride_v,
+ dst_u, dst_stride_u,
+ width, height);
+}
+
+// Convert M420 to I420.
+LIBYUV_API
+int M420ToI420(const uint8* src_m420, int src_stride_m420,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ return X420ToI420(src_m420, src_stride_m420, src_stride_m420 * 2,
+ src_m420 + src_stride_m420 * 2, src_stride_m420 * 3,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height);
+}
+
+// Convert Q420 to I420.
+// Format is rows of YY/YUYV
+LIBYUV_API
+int Q420ToI420(const uint8* src_y, int src_stride_y,
+ const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ int halfheight = (height + 1) >> 1;
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+ void (*YUY2ToUV422Row)(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+ int pix) = YUY2ToUV422Row_C;
+ void (*YUY2ToYRow)(const uint8* src_yuy2, uint8* dst_y, int pix) =
+ YUY2ToYRow_C;
+ if (!src_y || !src_yuy2 ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+ dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+ // CopyRow for rows of just Y in Q420 copied to Y plane of I420.
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+ CopyRow = CopyRow_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_X86)
+ if (IS_ALIGNED(width, 4)) {
+ CopyRow = CopyRow_X86;
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+ IS_ALIGNED(src_y, 16) && IS_ALIGNED(src_stride_y, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ CopyRow = CopyRow_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
+ }
+#endif
+
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Unaligned_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_AVX2;
+ YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToUV422Row = YUY2ToUV422Row_AVX2;
+ YUY2ToYRow = YUY2ToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ if (width >= 16) {
+ YUY2ToUV422Row = YUY2ToUV422Row_Any_NEON;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ YUY2ToUV422Row = YUY2ToUV422Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ CopyRow(src_y, dst_y, width);
+ src_y += src_stride_y;
+ dst_y += dst_stride_y;
+
+ YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ CopyRow(src_y, dst_y, width);
+ YUY2ToUV422Row(src_yuy2, dst_u, dst_v, width);
+ }
+ return 0;
+}
+
+// Convert YUY2 to I420.
+LIBYUV_API
+int YUY2ToI420(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*YUY2ToUVRow)(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) = YUY2ToUVRow_C;
+ void (*YUY2ToYRow)(const uint8* src_yuy2,
+ uint8* dst_y, int pix) = YUY2ToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+#if defined(HAS_YUY2TOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ YUY2ToUVRow = YUY2ToUVRow_Any_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToUVRow = YUY2ToUVRow_Unaligned_SSE2;
+ YUY2ToYRow = YUY2ToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16)) {
+ YUY2ToUVRow = YUY2ToUVRow_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ YUY2ToYRow = YUY2ToYRow_SSE2;
+ }
+ }
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ YUY2ToUVRow = YUY2ToUVRow_Any_AVX2;
+ YUY2ToYRow = YUY2ToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ YUY2ToUVRow = YUY2ToUVRow_AVX2;
+ YUY2ToYRow = YUY2ToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_YUY2TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ YUY2ToYRow = YUY2ToYRow_Any_NEON;
+ if (width >= 16) {
+ YUY2ToUVRow = YUY2ToUVRow_Any_NEON;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToYRow = YUY2ToYRow_NEON;
+ YUY2ToUVRow = YUY2ToUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ YUY2ToUVRow(src_yuy2, src_stride_yuy2, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ YUY2ToYRow(src_yuy2 + src_stride_yuy2, dst_y + dst_stride_y, width);
+ src_yuy2 += src_stride_yuy2 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ YUY2ToUVRow(src_yuy2, 0, dst_u, dst_v, width);
+ YUY2ToYRow(src_yuy2, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert UYVY to I420.
+LIBYUV_API
+int UYVYToI420(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*UYVYToUVRow)(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) = UYVYToUVRow_C;
+ void (*UYVYToYRow)(const uint8* src_uyvy,
+ uint8* dst_y, int pix) = UYVYToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+#if defined(HAS_UYVYTOYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ UYVYToUVRow = UYVYToUVRow_Any_SSE2;
+ UYVYToYRow = UYVYToYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToUVRow = UYVYToUVRow_Unaligned_SSE2;
+ UYVYToYRow = UYVYToYRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16)) {
+ UYVYToUVRow = UYVYToUVRow_SSE2;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ UYVYToYRow = UYVYToYRow_SSE2;
+ }
+ }
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ UYVYToUVRow = UYVYToUVRow_Any_AVX2;
+ UYVYToYRow = UYVYToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ UYVYToUVRow = UYVYToUVRow_AVX2;
+ UYVYToYRow = UYVYToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_UYVYTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ UYVYToYRow = UYVYToYRow_Any_NEON;
+ if (width >= 16) {
+ UYVYToUVRow = UYVYToUVRow_Any_NEON;
+ }
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToYRow = UYVYToYRow_NEON;
+ UYVYToUVRow = UYVYToUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ UYVYToUVRow(src_uyvy, src_stride_uyvy, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ UYVYToYRow(src_uyvy + src_stride_uyvy, dst_y + dst_stride_y, width);
+ src_uyvy += src_stride_uyvy * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ UYVYToUVRow(src_uyvy, 0, dst_u, dst_v, width);
+ UYVYToYRow(src_uyvy, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert ARGB to I420.
+LIBYUV_API
+int ARGBToI420(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ if (!src_argb ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2) && defined(HAS_ARGBTOUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ ARGBToUVRow = ARGBToUVRow_Any_AVX2;
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUVRow = ARGBToUVRow_AVX2;
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ if (width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert BGRA to I420.
+LIBYUV_API
+int BGRAToI420(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*BGRAToUVRow)(const uint8* src_bgra0, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int width) = BGRAToUVRow_C;
+ void (*BGRAToYRow)(const uint8* src_bgra, uint8* dst_y, int pix) =
+ BGRAToYRow_C;
+ if (!src_bgra ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_bgra = src_bgra + (height - 1) * src_stride_bgra;
+ src_stride_bgra = -src_stride_bgra;
+ }
+#if defined(HAS_BGRATOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ BGRAToUVRow = BGRAToUVRow_Any_SSSE3;
+ BGRAToYRow = BGRAToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_Unaligned_SSSE3;
+ BGRAToYRow = BGRAToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_bgra, 16) && IS_ALIGNED(src_stride_bgra, 16)) {
+ BGRAToUVRow = BGRAToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ BGRAToYRow = BGRAToYRow_SSSE3;
+ }
+ }
+ }
+ }
+#elif defined(HAS_BGRATOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ BGRAToYRow = BGRAToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ BGRAToYRow = BGRAToYRow_NEON;
+ }
+ if (width >= 16) {
+ BGRAToUVRow = BGRAToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ BGRAToUVRow = BGRAToUVRow_NEON;
+ }
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ BGRAToUVRow(src_bgra, src_stride_bgra, dst_u, dst_v, width);
+ BGRAToYRow(src_bgra, dst_y, width);
+ BGRAToYRow(src_bgra + src_stride_bgra, dst_y + dst_stride_y, width);
+ src_bgra += src_stride_bgra * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ BGRAToUVRow(src_bgra, 0, dst_u, dst_v, width);
+ BGRAToYRow(src_bgra, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert ABGR to I420.
+LIBYUV_API
+int ABGRToI420(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*ABGRToUVRow)(const uint8* src_abgr0, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int width) = ABGRToUVRow_C;
+ void (*ABGRToYRow)(const uint8* src_abgr, uint8* dst_y, int pix) =
+ ABGRToYRow_C;
+ if (!src_abgr ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_abgr = src_abgr + (height - 1) * src_stride_abgr;
+ src_stride_abgr = -src_stride_abgr;
+ }
+#if defined(HAS_ABGRTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ABGRToUVRow = ABGRToUVRow_Any_SSSE3;
+ ABGRToYRow = ABGRToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_Unaligned_SSSE3;
+ ABGRToYRow = ABGRToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_abgr, 16) && IS_ALIGNED(src_stride_abgr, 16)) {
+ ABGRToUVRow = ABGRToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ABGRToYRow = ABGRToYRow_SSSE3;
+ }
+ }
+ }
+ }
+#elif defined(HAS_ABGRTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ABGRToYRow = ABGRToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ABGRToYRow = ABGRToYRow_NEON;
+ }
+ if (width >= 16) {
+ ABGRToUVRow = ABGRToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ABGRToUVRow = ABGRToUVRow_NEON;
+ }
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ ABGRToUVRow(src_abgr, src_stride_abgr, dst_u, dst_v, width);
+ ABGRToYRow(src_abgr, dst_y, width);
+ ABGRToYRow(src_abgr + src_stride_abgr, dst_y + dst_stride_y, width);
+ src_abgr += src_stride_abgr * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ABGRToUVRow(src_abgr, 0, dst_u, dst_v, width);
+ ABGRToYRow(src_abgr, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert RGBA to I420.
+LIBYUV_API
+int RGBAToI420(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*RGBAToUVRow)(const uint8* src_rgba0, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int width) = RGBAToUVRow_C;
+ void (*RGBAToYRow)(const uint8* src_rgba, uint8* dst_y, int pix) =
+ RGBAToYRow_C;
+ if (!src_rgba ||
+ !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgba = src_rgba + (height - 1) * src_stride_rgba;
+ src_stride_rgba = -src_stride_rgba;
+ }
+#if defined(HAS_RGBATOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ RGBAToUVRow = RGBAToUVRow_Any_SSSE3;
+ RGBAToYRow = RGBAToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_Unaligned_SSSE3;
+ RGBAToYRow = RGBAToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_rgba, 16) && IS_ALIGNED(src_stride_rgba, 16)) {
+ RGBAToUVRow = RGBAToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ RGBAToYRow = RGBAToYRow_SSSE3;
+ }
+ }
+ }
+ }
+#elif defined(HAS_RGBATOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ RGBAToYRow = RGBAToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGBAToYRow = RGBAToYRow_NEON;
+ }
+ if (width >= 16) {
+ RGBAToUVRow = RGBAToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGBAToUVRow = RGBAToUVRow_NEON;
+ }
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ RGBAToUVRow(src_rgba, src_stride_rgba, dst_u, dst_v, width);
+ RGBAToYRow(src_rgba, dst_y, width);
+ RGBAToYRow(src_rgba + src_stride_rgba, dst_y + dst_stride_y, width);
+ src_rgba += src_stride_rgba * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ RGBAToUVRow(src_rgba, 0, dst_u, dst_v, width);
+ RGBAToYRow(src_rgba, dst_y, width);
+ }
+ return 0;
+}
+
+// Convert RGB24 to I420.
+LIBYUV_API
+int RGB24ToI420(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+#if defined(HAS_RGB24TOYROW_NEON)
+ void (*RGB24ToUVRow)(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int width) = RGB24ToUVRow_C;
+ void (*RGB24ToYRow)(const uint8* src_rgb24, uint8* dst_y, int pix) =
+ RGB24ToYRow_C;
+#else
+ void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ RGB24ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+ if (!src_rgb24 || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+
+#if defined(HAS_RGB24TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ RGB24ToYRow = RGB24ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToYRow = RGB24ToYRow_NEON;
+ }
+ if (width >= 16) {
+ RGB24ToUVRow = RGB24ToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToUVRow = RGB24ToUVRow_NEON;
+ }
+ }
+ }
+#else // HAS_RGB24TOYROW_NEON
+
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif // HAS_ARGBTOUVROW_SSSE3
+#endif // HAS_RGB24TOYROW_NEON
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB24TOYROW_NEON)
+ RGB24ToUVRow(src_rgb24, src_stride_rgb24, dst_u, dst_v, width);
+ RGB24ToYRow(src_rgb24, dst_y, width);
+ RGB24ToYRow(src_rgb24 + src_stride_rgb24, dst_y + dst_stride_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ RGB24ToARGBRow(src_rgb24 + src_stride_rgb24, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_rgb24 += src_stride_rgb24 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RGB24TOYROW_NEON)
+ RGB24ToUVRow(src_rgb24, 0, dst_u, dst_v, width);
+ RGB24ToYRow(src_rgb24, dst_y, width);
+#else
+ RGB24ToARGBRow(src_rgb24, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RGB24TOYROW_NEON)
+ free_aligned_buffer_64(row);
+#endif
+ return 0;
+}
+
+// Convert RAW to I420.
+LIBYUV_API
+int RAWToI420(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+#if defined(HAS_RAWTOYROW_NEON)
+ void (*RAWToUVRow)(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_u, uint8* dst_v, int width) = RAWToUVRow_C;
+ void (*RAWToYRow)(const uint8* src_raw, uint8* dst_y, int pix) =
+ RAWToYRow_C;
+#else
+ void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ RAWToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+ if (!src_raw || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+
+#if defined(HAS_RAWTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ RAWToYRow = RAWToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToYRow = RAWToYRow_NEON;
+ }
+ if (width >= 16) {
+ RAWToUVRow = RAWToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToUVRow = RAWToUVRow_NEON;
+ }
+ }
+ }
+#else // HAS_RAWTOYROW_NEON
+
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif // HAS_ARGBTOUVROW_SSSE3
+#endif // HAS_RAWTOYROW_NEON
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RAWTOYROW_NEON)
+ RAWToUVRow(src_raw, src_stride_raw, dst_u, dst_v, width);
+ RAWToYRow(src_raw, dst_y, width);
+ RAWToYRow(src_raw + src_stride_raw, dst_y + dst_stride_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ RAWToARGBRow(src_raw + src_stride_raw, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_raw += src_stride_raw * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RAWTOYROW_NEON)
+ RAWToUVRow(src_raw, 0, dst_u, dst_v, width);
+ RAWToYRow(src_raw, dst_y, width);
+#else
+ RAWToARGBRow(src_raw, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RAWTOYROW_NEON)
+ free_aligned_buffer_64(row);
+#endif
+ return 0;
+}
+
+// Convert RGB565 to I420.
+LIBYUV_API
+int RGB565ToI420(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+#if defined(HAS_RGB565TOYROW_NEON)
+ void (*RGB565ToUVRow)(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int width) = RGB565ToUVRow_C;
+ void (*RGB565ToYRow)(const uint8* src_rgb565, uint8* dst_y, int pix) =
+ RGB565ToYRow_C;
+#else
+ void (*RGB565ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ RGB565ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+ if (!src_rgb565 || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+ src_stride_rgb565 = -src_stride_rgb565;
+ }
+
+#if defined(HAS_RGB565TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ RGB565ToYRow = RGB565ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToYRow = RGB565ToYRow_NEON;
+ }
+ if (width >= 16) {
+ RGB565ToUVRow = RGB565ToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ RGB565ToUVRow = RGB565ToUVRow_NEON;
+ }
+ }
+ }
+#else // HAS_RGB565TOYROW_NEON
+
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif // HAS_ARGBTOUVROW_SSSE3
+#endif // HAS_RGB565TOYROW_NEON
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_RGB565TOYROW_NEON)
+ RGB565ToUVRow(src_rgb565, src_stride_rgb565, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
+ RGB565ToYRow(src_rgb565 + src_stride_rgb565, dst_y + dst_stride_y, width);
+#else
+ RGB565ToARGBRow(src_rgb565, row, width);
+ RGB565ToARGBRow(src_rgb565 + src_stride_rgb565, row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_rgb565 += src_stride_rgb565 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_RGB565TOYROW_NEON)
+ RGB565ToUVRow(src_rgb565, 0, dst_u, dst_v, width);
+ RGB565ToYRow(src_rgb565, dst_y, width);
+#else
+ RGB565ToARGBRow(src_rgb565, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_RGB565TOYROW_NEON)
+ free_aligned_buffer_64(row);
+#endif
+ return 0;
+}
+
+// Convert ARGB1555 to I420.
+LIBYUV_API
+int ARGB1555ToI420(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+#if defined(HAS_ARGB1555TOYROW_NEON)
+ void (*ARGB1555ToUVRow)(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int width) = ARGB1555ToUVRow_C;
+ void (*ARGB1555ToYRow)(const uint8* src_argb1555, uint8* dst_y, int pix) =
+ ARGB1555ToYRow_C;
+#else
+ void (*ARGB1555ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ ARGB1555ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+ if (!src_argb1555 || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+ src_stride_argb1555 = -src_stride_argb1555;
+ }
+
+#if defined(HAS_ARGB1555TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGB1555ToYRow = ARGB1555ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToYRow = ARGB1555ToYRow_NEON;
+ }
+ if (width >= 16) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB1555ToUVRow = ARGB1555ToUVRow_NEON;
+ }
+ }
+ }
+#else // HAS_ARGB1555TOYROW_NEON
+
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif // HAS_ARGBTOUVROW_SSSE3
+#endif // HAS_ARGB1555TOYROW_NEON
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_ARGB1555TOYROW_NEON)
+ ARGB1555ToUVRow(src_argb1555, src_stride_argb1555, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
+ ARGB1555ToYRow(src_argb1555 + src_stride_argb1555, dst_y + dst_stride_y,
+ width);
+#else
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGB1555ToARGBRow(src_argb1555 + src_stride_argb1555, row + kRowSize,
+ width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_argb1555 += src_stride_argb1555 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_ARGB1555TOYROW_NEON)
+ ARGB1555ToUVRow(src_argb1555, 0, dst_u, dst_v, width);
+ ARGB1555ToYRow(src_argb1555, dst_y, width);
+#else
+ ARGB1555ToARGBRow(src_argb1555, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_ARGB1555TOYROW_NEON)
+ free_aligned_buffer_64(row);
+#endif
+ return 0;
+}
+
+// Convert ARGB4444 to I420.
+LIBYUV_API
+int ARGB4444ToI420(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ void (*ARGB4444ToUVRow)(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int width) = ARGB4444ToUVRow_C;
+ void (*ARGB4444ToYRow)(const uint8* src_argb4444, uint8* dst_y, int pix) =
+ ARGB4444ToYRow_C;
+#else
+ void (*ARGB4444ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ ARGB4444ToARGBRow_C;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+#endif
+ if (!src_argb4444 || !dst_y || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+ src_stride_argb4444 = -src_stride_argb4444;
+ }
+
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGB4444ToYRow = ARGB4444ToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToYRow = ARGB4444ToYRow_NEON;
+ }
+ if (width >= 16) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGB4444ToUVRow = ARGB4444ToUVRow_NEON;
+ }
+ }
+ }
+#else // HAS_ARGB4444TOYROW_NEON
+
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif // HAS_ARGBTOUVROW_SSSE3
+#endif // HAS_ARGB4444TOYROW_NEON
+
+ for (y = 0; y < height - 1; y += 2) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ ARGB4444ToUVRow(src_argb4444, src_stride_argb4444, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
+ ARGB4444ToYRow(src_argb4444 + src_stride_argb4444, dst_y + dst_stride_y,
+ width);
+#else
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGB4444ToARGBRow(src_argb4444 + src_stride_argb4444, row + kRowSize,
+ width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+#endif
+ src_argb4444 += src_stride_argb4444 * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+#if defined(HAS_ARGB4444TOYROW_NEON)
+ ARGB4444ToUVRow(src_argb4444, 0, dst_u, dst_v, width);
+ ARGB4444ToYRow(src_argb4444, dst_y, width);
+#else
+ ARGB4444ToARGBRow(src_argb4444, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+#endif
+ }
+#if !defined(HAS_ARGB4444TOYROW_NEON)
+ free_aligned_buffer_64(row);
+#endif
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/convert_argb.cc b/third_party/libyuv/source/convert_argb.cc
new file mode 100644
index 000000000..ac0bc3d15
--- /dev/null
+++ b/third_party/libyuv/source/convert_argb.cc
@@ -0,0 +1,938 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Copy ARGB with optional flipping
+LIBYUV_API
+int ARGBCopy(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ if (!src_argb || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+ CopyPlane(src_argb, src_stride_argb, dst_argb, dst_stride_argb,
+ width * 4, height);
+ return 0;
+}
+
+// Convert I444 to ARGB.
+LIBYUV_API
+int I444ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*I444ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I444ToARGBRow_C;
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u == width &&
+ src_stride_v == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_I444TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I444ToARGBRow = I444ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I444ToARGBRow = I444ToARGBRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_I444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I444ToARGBRow = I444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I444ToARGBRow = I444ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I444ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I422 to ARGB.
+LIBYUV_API
+int I422ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I411 to ARGB.
+LIBYUV_API
+int I411ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*I411ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I411ToARGBRow_C;
+ if (!src_y || !src_u || !src_v ||
+ !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 4 == width &&
+ src_stride_v * 4 == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_argb = 0;
+ }
+#if defined(HAS_I411TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I411ToARGBRow = I411ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I411ToARGBRow = I411ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I411ToARGBRow = I411ToARGBRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_I411TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I411ToARGBRow = I411ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I411ToARGBRow = I411ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I411ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ return 0;
+}
+
+// Convert I400 to ARGB.
+LIBYUV_API
+int I400ToARGB_Reference(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*YToARGBRow)(const uint8* y_buf,
+ uint8* rgb_buf,
+ int width) = YToARGBRow_C;
+ if (!src_y || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_argb = 0;
+ }
+#if defined(HAS_YTOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ YToARGBRow = YToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ YToARGBRow = YToARGBRow_SSE2;
+ }
+ }
+#elif defined(HAS_YTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ YToARGBRow = YToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ YToARGBRow = YToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ YToARGBRow(src_y, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ }
+ return 0;
+}
+
+// Convert I400 to ARGB.
+LIBYUV_API
+int I400ToARGB(const uint8* src_y, int src_stride_y,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*I400ToARGBRow)(const uint8* src_y, uint8* dst_argb, int pix) =
+ I400ToARGBRow_C;
+ if (!src_y || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_argb = 0;
+ }
+#if defined(HAS_I400TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8) {
+ I400ToARGBRow = I400ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ I400ToARGBRow = I400ToARGBRow_Unaligned_SSE2;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I400ToARGBRow = I400ToARGBRow_SSE2;
+ }
+ }
+ }
+#elif defined(HAS_I400TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I400ToARGBRow = I400ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I400ToARGBRow = I400ToARGBRow_NEON;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ I400ToARGBRow(src_y, dst_argb, width);
+ src_y += src_stride_y;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Shuffle table for converting BGRA to ARGB.
+static uvec8 kShuffleMaskBGRAToARGB = {
+ 3u, 2u, 1u, 0u, 7u, 6u, 5u, 4u, 11u, 10u, 9u, 8u, 15u, 14u, 13u, 12u
+};
+
+// Shuffle table for converting ABGR to ARGB.
+static uvec8 kShuffleMaskABGRToARGB = {
+ 2u, 1u, 0u, 3u, 6u, 5u, 4u, 7u, 10u, 9u, 8u, 11u, 14u, 13u, 12u, 15u
+};
+
+// Shuffle table for converting RGBA to ARGB.
+static uvec8 kShuffleMaskRGBAToARGB = {
+ 1u, 2u, 3u, 0u, 5u, 6u, 7u, 4u, 9u, 10u, 11u, 8u, 13u, 14u, 15u, 12u
+};
+
+// Convert BGRA to ARGB.
+LIBYUV_API
+int BGRAToARGB(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return ARGBShuffle(src_bgra, src_stride_bgra,
+ dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskBGRAToARGB),
+ width, height);
+}
+
+// Convert ARGB to BGRA (same as BGRAToARGB).
+LIBYUV_API
+int ARGBToBGRA(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return ARGBShuffle(src_bgra, src_stride_bgra,
+ dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskBGRAToARGB),
+ width, height);
+}
+
+// Convert ABGR to ARGB.
+LIBYUV_API
+int ABGRToARGB(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return ARGBShuffle(src_abgr, src_stride_abgr,
+ dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskABGRToARGB),
+ width, height);
+}
+
+// Convert ARGB to ABGR to (same as ABGRToARGB).
+LIBYUV_API
+int ARGBToABGR(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return ARGBShuffle(src_abgr, src_stride_abgr,
+ dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskABGRToARGB),
+ width, height);
+}
+
+// Convert RGBA to ARGB.
+LIBYUV_API
+int RGBAToARGB(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ return ARGBShuffle(src_rgba, src_stride_rgba,
+ dst_argb, dst_stride_argb,
+ (const uint8*)(&kShuffleMaskRGBAToARGB),
+ width, height);
+}
+
+// Convert RGB24 to ARGB.
+LIBYUV_API
+int RGB24ToARGB(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*RGB24ToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ RGB24ToARGBRow_C;
+ if (!src_rgb24 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb24 = src_rgb24 + (height - 1) * src_stride_rgb24;
+ src_stride_rgb24 = -src_stride_rgb24;
+ }
+ // Coalesce rows.
+ if (src_stride_rgb24 == width * 3 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_rgb24 = dst_stride_argb = 0;
+ }
+#if defined(HAS_RGB24TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_SSSE3;
+ }
+ }
+#elif defined(HAS_RGB24TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ RGB24ToARGBRow = RGB24ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB24ToARGBRow = RGB24ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGB24ToARGBRow(src_rgb24, dst_argb, width);
+ src_rgb24 += src_stride_rgb24;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RAW to ARGB.
+LIBYUV_API
+int RAWToARGB(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*RAWToARGBRow)(const uint8* src_rgb, uint8* dst_argb, int pix) =
+ RAWToARGBRow_C;
+ if (!src_raw || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_raw = src_raw + (height - 1) * src_stride_raw;
+ src_stride_raw = -src_stride_raw;
+ }
+ // Coalesce rows.
+ if (src_stride_raw == width * 3 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_raw = dst_stride_argb = 0;
+ }
+#if defined(HAS_RAWTOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16 &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ RAWToARGBRow = RAWToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ RAWToARGBRow = RAWToARGBRow_SSSE3;
+ }
+ }
+#elif defined(HAS_RAWTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ RAWToARGBRow = RAWToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RAWToARGBRow = RAWToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RAWToARGBRow(src_raw, dst_argb, width);
+ src_raw += src_stride_raw;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert RGB565 to ARGB.
+LIBYUV_API
+int RGB565ToARGB(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*RGB565ToARGBRow)(const uint8* src_rgb565, uint8* dst_argb, int pix) =
+ RGB565ToARGBRow_C;
+ if (!src_rgb565 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_rgb565 = src_rgb565 + (height - 1) * src_stride_rgb565;
+ src_stride_rgb565 = -src_stride_rgb565;
+ }
+ // Coalesce rows.
+ if (src_stride_rgb565 == width * 2 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_rgb565 = dst_stride_argb = 0;
+ }
+#if defined(HAS_RGB565TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_SSE2;
+ }
+ }
+#elif defined(HAS_RGB565TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ RGB565ToARGBRow = RGB565ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ RGB565ToARGBRow = RGB565ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ RGB565ToARGBRow(src_rgb565, dst_argb, width);
+ src_rgb565 += src_stride_rgb565;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ARGB1555 to ARGB.
+LIBYUV_API
+int ARGB1555ToARGB(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*ARGB1555ToARGBRow)(const uint8* src_argb1555, uint8* dst_argb,
+ int pix) = ARGB1555ToARGBRow_C;
+ if (!src_argb1555 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb1555 = src_argb1555 + (height - 1) * src_stride_argb1555;
+ src_stride_argb1555 = -src_stride_argb1555;
+ }
+ // Coalesce rows.
+ if (src_stride_argb1555 == width * 2 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb1555 = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGB1555TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_SSE2;
+ }
+ }
+#elif defined(HAS_ARGB1555TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB1555ToARGBRow = ARGB1555ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGB1555ToARGBRow(src_argb1555, dst_argb, width);
+ src_argb1555 += src_stride_argb1555;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert ARGB4444 to ARGB.
+LIBYUV_API
+int ARGB4444ToARGB(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*ARGB4444ToARGBRow)(const uint8* src_argb4444, uint8* dst_argb,
+ int pix) = ARGB4444ToARGBRow_C;
+ if (!src_argb4444 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb4444 = src_argb4444 + (height - 1) * src_stride_argb4444;
+ src_stride_argb4444 = -src_stride_argb4444;
+ }
+ // Coalesce rows.
+ if (src_stride_argb4444 == width * 2 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_argb4444 = dst_stride_argb = 0;
+ }
+#if defined(HAS_ARGB4444TOARGBROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 8 &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_SSE2;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_SSE2;
+ }
+ }
+#elif defined(HAS_ARGB4444TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGB4444ToARGBRow = ARGB4444ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGB4444ToARGBRow(src_argb4444, dst_argb, width);
+ src_argb4444 += src_stride_argb4444;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert NV12 to ARGB.
+LIBYUV_API
+int NV12ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*NV12ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV12ToARGBRow_C;
+ if (!src_y || !src_uv || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_NV12TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV12ToARGBRow(src_y, src_uv, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert NV21 to ARGB.
+LIBYUV_API
+int NV21ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*NV21ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV21ToARGBRow_C;
+ if (!src_y || !src_uv || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_NV21TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ NV21ToARGBRow = NV21ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_NV21TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ NV21ToARGBRow = NV21ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV21ToARGBRow = NV21ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ NV21ToARGBRow(src_y, src_uv, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_uv += src_stride_uv;
+ }
+ }
+ return 0;
+}
+
+// Convert M420 to ARGB.
+LIBYUV_API
+int M420ToARGB(const uint8* src_m420, int src_stride_m420,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*NV12ToARGBRow)(const uint8* y_buf,
+ const uint8* uv_buf,
+ uint8* rgb_buf,
+ int width) = NV12ToARGBRow_C;
+ if (!src_m420 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_NV12TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ NV12ToARGBRow = NV12ToARGBRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_NV12TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ NV12ToARGBRow = NV12ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ NV12ToARGBRow = NV12ToARGBRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
+ NV12ToARGBRow(src_m420 + src_stride_m420, src_m420 + src_stride_m420 * 2,
+ dst_argb + dst_stride_argb, width);
+ dst_argb += dst_stride_argb * 2;
+ src_m420 += src_stride_m420 * 3;
+ }
+ if (height & 1) {
+ NV12ToARGBRow(src_m420, src_m420 + src_stride_m420 * 2, dst_argb, width);
+ }
+ return 0;
+}
+
+// Convert YUY2 to ARGB.
+LIBYUV_API
+int YUY2ToARGB(const uint8* src_yuy2, int src_stride_yuy2,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*YUY2ToARGBRow)(const uint8* src_yuy2, uint8* dst_argb, int pix) =
+ YUY2ToARGBRow_C;
+ if (!src_yuy2 || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_yuy2 = src_yuy2 + (height - 1) * src_stride_yuy2;
+ src_stride_yuy2 = -src_stride_yuy2;
+ }
+ // Coalesce rows.
+ if (src_stride_yuy2 == width * 2 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_yuy2 = dst_stride_argb = 0;
+ }
+#if defined(HAS_YUY2TOARGBROW_SSSE3)
+ // Posix is 16, Windows is 8.
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_yuy2, 16) && IS_ALIGNED(src_stride_yuy2, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_YUY2TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ YUY2ToARGBRow = YUY2ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ YUY2ToARGBRow = YUY2ToARGBRow_NEON;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ YUY2ToARGBRow(src_yuy2, dst_argb, width);
+ src_yuy2 += src_stride_yuy2;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+// Convert UYVY to ARGB.
+LIBYUV_API
+int UYVYToARGB(const uint8* src_uyvy, int src_stride_uyvy,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*UYVYToARGBRow)(const uint8* src_uyvy, uint8* dst_argb, int pix) =
+ UYVYToARGBRow_C;
+ if (!src_uyvy || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_uyvy = src_uyvy + (height - 1) * src_stride_uyvy;
+ src_stride_uyvy = -src_stride_uyvy;
+ }
+ // Coalesce rows.
+ if (src_stride_uyvy == width * 2 &&
+ dst_stride_argb == width * 4) {
+ width *= height;
+ height = 1;
+ src_stride_uyvy = dst_stride_argb = 0;
+ }
+#if defined(HAS_UYVYTOARGBROW_SSSE3)
+ // Posix is 16, Windows is 8.
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ UYVYToARGBRow = UYVYToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_uyvy, 16) && IS_ALIGNED(src_stride_uyvy, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ UYVYToARGBRow = UYVYToARGBRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_UYVYTOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ UYVYToARGBRow = UYVYToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ UYVYToARGBRow = UYVYToARGBRow_NEON;
+ }
+ }
+#endif
+ for (y = 0; y < height; ++y) {
+ UYVYToARGBRow(src_uyvy, dst_argb, width);
+ src_uyvy += src_stride_uyvy;
+ dst_argb += dst_stride_argb;
+ }
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/convert_from.cc b/third_party/libyuv/source/convert_from.cc
new file mode 100644
index 000000000..c1a2f62f0
--- /dev/null
+++ b/third_party/libyuv/source/convert_from.cc
@@ -0,0 +1,1210 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/convert.h" // For I420Copy
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/rotate.h"
+#include "libyuv/scale.h" // For ScalePlane()
+#include "libyuv/video_common.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define SUBSAMPLE(v, a, s) (v < 0) ? (-((-v + a) >> s)) : ((v + a) >> s)
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// I420 To any I4xx YUV format with mirroring.
+static int I420ToI4xx(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int src_y_width, int src_y_height,
+ int dst_uv_width, int dst_uv_height) {
+ const int dst_y_width = Abs(src_y_width);
+ const int dst_y_height = Abs(src_y_height);
+ const int src_uv_width = SUBSAMPLE(src_y_width, 1, 1);
+ const int src_uv_height = SUBSAMPLE(src_y_height, 1, 1);
+ if (src_y_width == 0 || src_y_height == 0 ||
+ dst_uv_width <= 0 || dst_uv_height <= 0) {
+ return -1;
+ }
+ ScalePlane(src_y, src_stride_y, src_y_width, src_y_height,
+ dst_y, dst_stride_y, dst_y_width, dst_y_height,
+ kFilterBilinear);
+ ScalePlane(src_u, src_stride_u, src_uv_width, src_uv_height,
+ dst_u, dst_stride_u, dst_uv_width, dst_uv_height,
+ kFilterBilinear);
+ ScalePlane(src_v, src_stride_v, src_uv_width, src_uv_height,
+ dst_v, dst_stride_v, dst_uv_width, dst_uv_height,
+ kFilterBilinear);
+ return 0;
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 422 chroma is 1/2 width, 1x height
+LIBYUV_API
+int I420ToI422(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ const int dst_uv_width = (Abs(width) + 1) >> 1;
+ const int dst_uv_height = Abs(height);
+ return I420ToI4xx(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ dst_uv_width, dst_uv_height);
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 444 chroma is 1x width, 1x height
+LIBYUV_API
+int I420ToI444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ const int dst_uv_width = Abs(width);
+ const int dst_uv_height = Abs(height);
+ return I420ToI4xx(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ dst_uv_width, dst_uv_height);
+}
+
+// 420 chroma is 1/2 width, 1/2 height
+// 411 chroma is 1/4 width, 1x height
+LIBYUV_API
+int I420ToI411(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ const int dst_uv_width = (Abs(width) + 3) >> 2;
+ const int dst_uv_height = Abs(height);
+ return I420ToI4xx(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height,
+ dst_uv_width, dst_uv_height);
+}
+
+// Copy to I400. Source can be I420,422,444,400,NV12,NV21
+LIBYUV_API
+int I400Copy(const uint8* src_y, int src_stride_y,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ if (!src_y || !dst_y ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_stride_y = -src_stride_y;
+ }
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ return 0;
+}
+
+LIBYUV_API
+int I422ToYUY2(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_yuy2, int dst_stride_yuy2,
+ int width, int height) {
+ int y;
+ void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_yuy2, int width) =
+ I422ToYUY2Row_C;
+ if (!src_y || !src_u || !src_v || !dst_yuy2 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_yuy2 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_yuy2 = 0;
+ }
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#elif defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_yuy2 += dst_stride_yuy2;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToYUY2(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_yuy2, int dst_stride_yuy2,
+ int width, int height) {
+ int y;
+ void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_yuy2, int width) =
+ I422ToYUY2Row_C;
+ if (!src_y || !src_u || !src_v || !dst_yuy2 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
+ }
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#elif defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ I422ToYUY2Row(src_y + src_stride_y, src_u, src_v,
+ dst_yuy2 + dst_stride_yuy2, width);
+ src_y += src_stride_y * 2;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_yuy2 += dst_stride_yuy2 * 2;
+ }
+ if (height & 1) {
+ I422ToYUY2Row(src_y, src_u, src_v, dst_yuy2, width);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I422ToUYVY(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_uyvy, int dst_stride_uyvy,
+ int width, int height) {
+ int y;
+ void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_uyvy, int width) =
+ I422ToUYVYRow_C;
+ if (!src_y || !src_u || !src_v || !dst_uyvy ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+ dst_stride_uyvy = -dst_stride_uyvy;
+ }
+ // Coalesce rows.
+ if (src_stride_y == width &&
+ src_stride_u * 2 == width &&
+ src_stride_v * 2 == width &&
+ dst_stride_uyvy == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_y = src_stride_u = src_stride_v = dst_stride_uyvy = 0;
+ }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#elif defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ src_y += src_stride_y;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uyvy += dst_stride_uyvy;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToUYVY(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_uyvy, int dst_stride_uyvy,
+ int width, int height) {
+ int y;
+ void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_uyvy, int width) =
+ I422ToUYVYRow_C;
+ if (!src_y || !src_u || !src_v || !dst_uyvy ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+ dst_stride_uyvy = -dst_stride_uyvy;
+ }
+#if defined(HAS_I422TOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#elif defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ I422ToUYVYRow(src_y + src_stride_y, src_u, src_v,
+ dst_uyvy + dst_stride_uyvy, width);
+ src_y += src_stride_y * 2;
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uyvy += dst_stride_uyvy * 2;
+ }
+ if (height & 1) {
+ I422ToUYVYRow(src_y, src_u, src_v, dst_uyvy, width);
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToNV12(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height) {
+ int y;
+ void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) = MergeUVRow_C;
+ // Coalesce rows.
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_u || !src_v || !dst_y || !dst_uv ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_uv = dst_uv + (halfheight - 1) * dst_stride_uv;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_uv = -dst_stride_uv;
+ }
+ if (src_stride_y == width &&
+ dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_y = dst_stride_y = 0;
+ }
+ // Coalesce rows.
+ if (src_stride_u == halfwidth &&
+ src_stride_v == halfwidth &&
+ dst_stride_uv == halfwidth * 2) {
+ halfwidth *= halfheight;
+ halfheight = 1;
+ src_stride_u = src_stride_v = dst_stride_uv = 0;
+ }
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_u, 16) && IS_ALIGNED(src_stride_u, 16) &&
+ IS_ALIGNED(src_v, 16) && IS_ALIGNED(src_stride_v, 16) &&
+ IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+
+ CopyPlane(src_y, src_stride_y, dst_y, dst_stride_y, width, height);
+ for (y = 0; y < halfheight; ++y) {
+ // Merge a row of U and V into a row of UV.
+ MergeUVRow_(src_u, src_v, dst_uv, halfwidth);
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ dst_uv += dst_stride_uv;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int I420ToNV21(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_vu, int dst_stride_vu,
+ int width, int height) {
+ return I420ToNV12(src_y, src_stride_y,
+ src_v, src_stride_v,
+ src_u, src_stride_u,
+ dst_y, src_stride_y,
+ dst_vu, dst_stride_vu,
+ width, height);
+}
+
+// Convert I420 to ARGB.
+LIBYUV_API
+int I420ToARGB(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height) {
+ int y;
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+ if (!src_y || !src_u || !src_v || !dst_argb ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, dst_argb, width);
+ dst_argb += dst_stride_argb;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to BGRA.
+LIBYUV_API
+int I420ToBGRA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_bgra, int dst_stride_bgra,
+ int width, int height) {
+ int y;
+ void (*I422ToBGRARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToBGRARow_C;
+ if (!src_y || !src_u || !src_v || !dst_bgra ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_bgra = dst_bgra + (height - 1) * dst_stride_bgra;
+ dst_stride_bgra = -dst_stride_bgra;
+ }
+#if defined(HAS_I422TOBGRAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToBGRARow = I422ToBGRARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToBGRARow = I422ToBGRARow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_bgra, 16) && IS_ALIGNED(dst_stride_bgra, 16)) {
+ I422ToBGRARow = I422ToBGRARow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_I422TOBGRAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToBGRARow = I422ToBGRARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToBGRARow = I422ToBGRARow_NEON;
+ }
+ }
+#elif defined(HAS_I422TOBGRAROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_bgra, 4) && IS_ALIGNED(dst_stride_bgra, 4)) {
+ I422ToBGRARow = I422ToBGRARow_MIPS_DSPR2;
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToBGRARow(src_y, src_u, src_v, dst_bgra, width);
+ dst_bgra += dst_stride_bgra;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ABGR.
+LIBYUV_API
+int I420ToABGR(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_abgr, int dst_stride_abgr,
+ int width, int height) {
+ int y;
+ void (*I422ToABGRRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToABGRRow_C;
+ if (!src_y || !src_u || !src_v || !dst_abgr ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_abgr = dst_abgr + (height - 1) * dst_stride_abgr;
+ dst_stride_abgr = -dst_stride_abgr;
+ }
+#if defined(HAS_I422TOABGRROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToABGRRow = I422ToABGRRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToABGRRow = I422ToABGRRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_abgr, 16) && IS_ALIGNED(dst_stride_abgr, 16)) {
+ I422ToABGRRow = I422ToABGRRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_I422TOABGRROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToABGRRow = I422ToABGRRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToABGRRow = I422ToABGRRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToABGRRow(src_y, src_u, src_v, dst_abgr, width);
+ dst_abgr += dst_stride_abgr;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGBA.
+LIBYUV_API
+int I420ToRGBA(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height) {
+ int y;
+ void (*I422ToRGBARow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRGBARow_C;
+ if (!src_y || !src_u || !src_v || !dst_rgba ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgba = dst_rgba + (height - 1) * dst_stride_rgba;
+ dst_stride_rgba = -dst_stride_rgba;
+ }
+#if defined(HAS_I422TORGBAROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToRGBARow = I422ToRGBARow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_rgba, 16) && IS_ALIGNED(dst_stride_rgba, 16)) {
+ I422ToRGBARow = I422ToRGBARow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_I422TORGBAROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToRGBARow = I422ToRGBARow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGBARow = I422ToRGBARow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGBARow(src_y, src_u, src_v, dst_rgba, width);
+ dst_rgba += dst_stride_rgba;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB24.
+LIBYUV_API
+int I420ToRGB24(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgb24, int dst_stride_rgb24,
+ int width, int height) {
+ int y;
+ void (*I422ToRGB24Row)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRGB24Row_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb24 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb24 = dst_rgb24 + (height - 1) * dst_stride_rgb24;
+ dst_stride_rgb24 = -dst_stride_rgb24;
+ }
+#if defined(HAS_I422TORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB24Row = I422ToRGB24Row_SSSE3;
+ }
+ }
+#elif defined(HAS_I422TORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToRGB24Row = I422ToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB24Row = I422ToRGB24Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB24Row(src_y, src_u, src_v, dst_rgb24, width);
+ dst_rgb24 += dst_stride_rgb24;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RAW.
+LIBYUV_API
+int I420ToRAW(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_raw, int dst_stride_raw,
+ int width, int height) {
+ int y;
+ void (*I422ToRAWRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRAWRow_C;
+ if (!src_y || !src_u || !src_v || !dst_raw ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_raw = dst_raw + (height - 1) * dst_stride_raw;
+ dst_stride_raw = -dst_stride_raw;
+ }
+#if defined(HAS_I422TORAWROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToRAWRow = I422ToRAWRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRAWRow = I422ToRAWRow_SSSE3;
+ }
+ }
+#elif defined(HAS_I422TORAWROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToRAWRow = I422ToRAWRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRAWRow = I422ToRAWRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRAWRow(src_y, src_u, src_v, dst_raw, width);
+ dst_raw += dst_stride_raw;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to ARGB1555.
+LIBYUV_API
+int I420ToARGB1555(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb1555, int dst_stride_argb1555,
+ int width, int height) {
+ int y;
+ void (*I422ToARGB1555Row)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGB1555Row_C;
+ if (!src_y || !src_u || !src_v || !dst_argb1555 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb1555 = dst_argb1555 + (height - 1) * dst_stride_argb1555;
+ dst_stride_argb1555 = -dst_stride_argb1555;
+ }
+#if defined(HAS_I422TOARGB1555ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_SSSE3;
+ }
+ }
+#elif defined(HAS_I422TOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToARGB1555Row = I422ToARGB1555Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB1555Row = I422ToARGB1555Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGB1555Row(src_y, src_u, src_v, dst_argb1555, width);
+ dst_argb1555 += dst_stride_argb1555;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+
+// Convert I420 to ARGB4444.
+LIBYUV_API
+int I420ToARGB4444(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_argb4444, int dst_stride_argb4444,
+ int width, int height) {
+ int y;
+ void (*I422ToARGB4444Row)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGB4444Row_C;
+ if (!src_y || !src_u || !src_v || !dst_argb4444 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_argb4444 = dst_argb4444 + (height - 1) * dst_stride_argb4444;
+ dst_stride_argb4444 = -dst_stride_argb4444;
+ }
+#if defined(HAS_I422TOARGB4444ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_SSSE3;
+ }
+ }
+#elif defined(HAS_I422TOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToARGB4444Row = I422ToARGB4444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGB4444Row = I422ToARGB4444Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToARGB4444Row(src_y, src_u, src_v, dst_argb4444, width);
+ dst_argb4444 += dst_stride_argb4444;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to RGB565.
+LIBYUV_API
+int I420ToRGB565(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height) {
+ int y;
+ void (*I422ToRGB565Row)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToRGB565Row_C;
+ if (!src_y || !src_u || !src_v || !dst_rgb565 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_rgb565 = dst_rgb565 + (height - 1) * dst_stride_rgb565;
+ dst_stride_rgb565 = -dst_stride_rgb565;
+ }
+#if defined(HAS_I422TORGB565ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_SSSE3;
+ }
+ }
+#elif defined(HAS_I422TORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToRGB565Row = I422ToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToRGB565Row = I422ToRGB565Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ I422ToRGB565Row(src_y, src_u, src_v, dst_rgb565, width);
+ dst_rgb565 += dst_stride_rgb565;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ return 0;
+}
+
+// Convert I420 to specified format
+LIBYUV_API
+int ConvertFromI420(const uint8* y, int y_stride,
+ const uint8* u, int u_stride,
+ const uint8* v, int v_stride,
+ uint8* dst_sample, int dst_sample_stride,
+ int width, int height,
+ uint32 fourcc) {
+ uint32 format = CanonicalFourCC(fourcc);
+ int r = 0;
+ if (!y || !u|| !v || !dst_sample ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ r = I420ToYUY2(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_UYVY:
+ r = I420ToUYVY(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_RGBP:
+ r = I420ToRGB565(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_RGBO:
+ r = I420ToARGB1555(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_R444:
+ r = I420ToARGB4444(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 2,
+ width, height);
+ break;
+ case FOURCC_24BG:
+ r = I420ToRGB24(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 3,
+ width, height);
+ break;
+ case FOURCC_RAW:
+ r = I420ToRAW(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 3,
+ width, height);
+ break;
+ case FOURCC_ARGB:
+ r = I420ToARGB(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4,
+ width, height);
+ break;
+ case FOURCC_BGRA:
+ r = I420ToBGRA(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4,
+ width, height);
+ break;
+ case FOURCC_ABGR:
+ r = I420ToABGR(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4,
+ width, height);
+ break;
+ case FOURCC_RGBA:
+ r = I420ToRGBA(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width * 4,
+ width, height);
+ break;
+ case FOURCC_BGGR:
+ r = I420ToBayerBGGR(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ case FOURCC_GBRG:
+ r = I420ToBayerGBRG(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ case FOURCC_GRBG:
+ r = I420ToBayerGRBG(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ case FOURCC_RGGB:
+ r = I420ToBayerRGGB(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ case FOURCC_I400:
+ r = I400Copy(y, y_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ case FOURCC_NV12: {
+ uint8* dst_uv = dst_sample + width * height;
+ r = I420ToNV12(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ dst_uv,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ }
+ case FOURCC_NV21: {
+ uint8* dst_vu = dst_sample + width * height;
+ r = I420ToNV21(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample,
+ dst_sample_stride ? dst_sample_stride : width,
+ dst_vu,
+ dst_sample_stride ? dst_sample_stride : width,
+ width, height);
+ break;
+ }
+ // TODO(fbarchard): Add M420 and Q420.
+ // Triplanar formats
+ // TODO(fbarchard): halfstride instead of halfwidth
+ case FOURCC_I420:
+ case FOURCC_YU12:
+ case FOURCC_YV12: {
+ int halfwidth = (width + 1) / 2;
+ int halfheight = (height + 1) / 2;
+ uint8* dst_u;
+ uint8* dst_v;
+ if (format == FOURCC_YV12) {
+ dst_v = dst_sample + width * height;
+ dst_u = dst_v + halfwidth * halfheight;
+ } else {
+ dst_u = dst_sample + width * height;
+ dst_v = dst_u + halfwidth * halfheight;
+ }
+ r = I420Copy(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample, width,
+ dst_u, halfwidth,
+ dst_v, halfwidth,
+ width, height);
+ break;
+ }
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ int halfwidth = (width + 1) / 2;
+ uint8* dst_u;
+ uint8* dst_v;
+ if (format == FOURCC_YV16) {
+ dst_v = dst_sample + width * height;
+ dst_u = dst_v + halfwidth * height;
+ } else {
+ dst_u = dst_sample + width * height;
+ dst_v = dst_u + halfwidth * height;
+ }
+ r = I420ToI422(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample, width,
+ dst_u, halfwidth,
+ dst_v, halfwidth,
+ width, height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ uint8* dst_u;
+ uint8* dst_v;
+ if (format == FOURCC_YV24) {
+ dst_v = dst_sample + width * height;
+ dst_u = dst_v + width * height;
+ } else {
+ dst_u = dst_sample + width * height;
+ dst_v = dst_u + width * height;
+ }
+ r = I420ToI444(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample, width,
+ dst_u, width,
+ dst_v, width,
+ width, height);
+ break;
+ }
+ case FOURCC_I411: {
+ int quarterwidth = (width + 3) / 4;
+ uint8* dst_u = dst_sample + width * height;
+ uint8* dst_v = dst_u + quarterwidth * height;
+ r = I420ToI411(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ dst_sample, width,
+ dst_u, quarterwidth,
+ dst_v, quarterwidth,
+ width, height);
+ break;
+ }
+
+ // Formats not supported - MJPG, biplanar, some rgb formats.
+ default:
+ return -1; // unknown fourcc - return failure code.
+ }
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/convert_from_argb.cc b/third_party/libyuv/source/convert_from_argb.cc
new file mode 100644
index 000000000..121a41611
--- /dev/null
+++ b/third_party/libyuv/source/convert_from_argb.cc
@@ -0,0 +1,1113 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_from_argb.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGB little endian (bgra in memory) to I444
+LIBYUV_API
+int ARGBToI444(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ void (*ARGBToUV444Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUV444Row_C;
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_y == width &&
+ dst_stride_u == width &&
+ dst_stride_v == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_ARGBTOUV444ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUV444Row = ARGBToUV444Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV444Row = ARGBToUV444Row_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUV444Row = ARGBToUV444Row_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+
+#elif defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ ARGBToUV444Row = ARGBToUV444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ ARGBToUV444Row = ARGBToUV444Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUV444Row(src_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+// ARGB little endian (bgra in memory) to I422
+LIBYUV_API
+int ARGBToI422(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUV422Row_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_y == width &&
+ dst_stride_u * 2 == width &&
+ dst_stride_v * 2 == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+ }
+ }
+ }
+#endif
+
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ if (width >= 16) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_NEON;
+ }
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUV422Row(src_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+// ARGB little endian (bgra in memory) to I411
+LIBYUV_API
+int ARGBToI411(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*ARGBToUV411Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUV411Row_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ if (!src_argb || !dst_y || !dst_u || !dst_v || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_y == width &&
+ dst_stride_u * 4 == width &&
+ dst_stride_v * 4 == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = dst_stride_u = dst_stride_v = 0;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ if (width >= 32) {
+ ARGBToUV411Row = ARGBToUV411Row_Any_NEON;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToUV411Row = ARGBToUV411Row_NEON;
+ }
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUV411Row(src_argb, dst_u, dst_v, width);
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ return 0;
+}
+
+LIBYUV_API
+int ARGBToNV12(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) = MergeUVRow_C;
+ // Allocate a rows of uv.
+ align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
+ uint8* row_v = row_u + ((halfwidth + 15) & ~15);
+ if (!src_argb ||
+ !dst_y || !dst_uv ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+ }
+#elif defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ if (width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
+ if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ MergeUVRow_(row_u, row_v, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
+ return 0;
+}
+
+// Same as NV12 but U and V swapped.
+LIBYUV_API
+int ARGBToNV21(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_uv, int dst_stride_uv,
+ int width, int height) {
+ int y;
+ int halfwidth = (width + 1) >> 1;
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ void (*MergeUVRow_)(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) = MergeUVRow_C;
+ // Allocate a rows of uv.
+ align_buffer_64(row_u, ((halfwidth + 15) & ~15) * 2);
+ uint8* row_v = row_u + ((halfwidth + 15) & ~15);
+ if (!src_argb ||
+ !dst_y || !dst_uv ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_Unaligned_SSSE3;
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+ }
+#elif defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ if (width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && halfwidth >= 16) {
+ MergeUVRow_ = MergeUVRow_Any_SSE2;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_Unaligned_SSE2;
+ if (IS_ALIGNED(dst_uv, 16) && IS_ALIGNED(dst_stride_uv, 16)) {
+ MergeUVRow_ = MergeUVRow_SSE2;
+ }
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && halfwidth >= 32) {
+ MergeUVRow_ = MergeUVRow_Any_AVX2;
+ if (IS_ALIGNED(halfwidth, 32)) {
+ MergeUVRow_ = MergeUVRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_MERGEUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && halfwidth >= 16) {
+ MergeUVRow_ = MergeUVRow_Any_NEON;
+ if (IS_ALIGNED(halfwidth, 16)) {
+ MergeUVRow_ = MergeUVRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVRow(src_argb, src_stride_argb, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ ARGBToYRow(src_argb + src_stride_argb, dst_y + dst_stride_y, width);
+ src_argb += src_stride_argb * 2;
+ dst_y += dst_stride_y * 2;
+ dst_uv += dst_stride_uv;
+ }
+ if (height & 1) {
+ ARGBToUVRow(src_argb, 0, row_u, row_v, width);
+ MergeUVRow_(row_v, row_u, dst_uv, halfwidth);
+ ARGBToYRow(src_argb, dst_y, width);
+ }
+ free_aligned_buffer_64(row_u);
+ return 0;
+}
+
+// Convert ARGB to YUY2.
+LIBYUV_API
+int ARGBToYUY2(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yuy2, int dst_stride_yuy2,
+ int width, int height) {
+ int y;
+ void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUV422Row_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ void (*I422ToYUY2Row)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_yuy2, int width) = I422ToYUY2Row_C;
+
+ if (!src_argb || !dst_yuy2 ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_yuy2 = dst_yuy2 + (height - 1) * dst_stride_yuy2;
+ dst_stride_yuy2 = -dst_stride_yuy2;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_yuy2 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_yuy2 = 0;
+ }
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ if (width >= 16) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_NEON;
+ }
+ }
+ }
+#endif
+
+#if defined(HAS_I422TOYUY2ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_SSE2;
+ }
+ }
+#elif defined(HAS_I422TOYUY2ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ I422ToYUY2Row = I422ToYUY2Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToYUY2Row = I422ToYUY2Row_NEON;
+ }
+ }
+#endif
+
+ {
+ // Allocate a rows of yuv.
+ align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+ uint8* row_u = row_y + ((width + 63) & ~63);
+ uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUV422Row(src_argb, row_u, row_v, width);
+ ARGBToYRow(src_argb, row_y, width);
+ I422ToYUY2Row(row_y, row_u, row_v, dst_yuy2, width);
+ src_argb += src_stride_argb;
+ dst_yuy2 += dst_stride_yuy2;
+ }
+
+ free_aligned_buffer_64(row_y);
+ }
+ return 0;
+}
+
+// Convert ARGB to UYVY.
+LIBYUV_API
+int ARGBToUYVY(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_uyvy, int dst_stride_uyvy,
+ int width, int height) {
+ int y;
+ void (*ARGBToUV422Row)(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) = ARGBToUV422Row_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ void (*I422ToUYVYRow)(const uint8* src_y, const uint8* src_u,
+ const uint8* src_v, uint8* dst_uyvy, int width) = I422ToUYVYRow_C;
+
+ if (!src_argb || !dst_uyvy ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ dst_uyvy = dst_uyvy + (height - 1) * dst_stride_uyvy;
+ dst_stride_uyvy = -dst_stride_uyvy;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_uyvy == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_uyvy = 0;
+ }
+#if defined(HAS_ARGBTOUV422ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ if (width >= 16) {
+ ARGBToUV422Row = ARGBToUV422Row_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUV422Row = ARGBToUV422Row_NEON;
+ }
+ }
+ }
+#endif
+
+#if defined(HAS_I422TOUYVYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 16) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_SSE2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_SSE2;
+ }
+ }
+#elif defined(HAS_I422TOUYVYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 16) {
+ I422ToUYVYRow = I422ToUYVYRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToUYVYRow = I422ToUYVYRow_NEON;
+ }
+ }
+#endif
+
+ {
+ // Allocate a rows of yuv.
+ align_buffer_64(row_y, ((width + 63) & ~63) * 2);
+ uint8* row_u = row_y + ((width + 63) & ~63);
+ uint8* row_v = row_u + ((width + 63) & ~63) / 2;
+
+ for (y = 0; y < height; ++y) {
+ ARGBToUV422Row(src_argb, row_u, row_v, width);
+ ARGBToYRow(src_argb, row_y, width);
+ I422ToUYVYRow(row_y, row_u, row_v, dst_uyvy, width);
+ src_argb += src_stride_argb;
+ dst_uyvy += dst_stride_uyvy;
+ }
+
+ free_aligned_buffer_64(row_y);
+ }
+ return 0;
+}
+
+// Convert ARGB to I400.
+LIBYUV_API
+int ARGBToI400(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_y, int dst_stride_y,
+ int width, int height) {
+ int y;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ if (!src_argb || !dst_y || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_y == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_y = 0;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ ARGBToYRow = ARGBToYRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYRow = ARGBToYRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToYRow(src_argb, dst_y, width);
+ src_argb += src_stride_argb;
+ dst_y += dst_stride_y;
+ }
+ return 0;
+}
+
+// Shuffle table for converting ARGB to RGBA.
+static uvec8 kShuffleMaskARGBToRGBA = {
+ 3u, 0u, 1u, 2u, 7u, 4u, 5u, 6u, 11u, 8u, 9u, 10u, 15u, 12u, 13u, 14u
+};
+
+// Convert ARGB to RGBA.
+LIBYUV_API
+int ARGBToRGBA(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgba, int dst_stride_rgba,
+ int width, int height) {
+ return ARGBShuffle(src_argb, src_stride_argb,
+ dst_rgba, dst_stride_rgba,
+ (const uint8*)(&kShuffleMaskARGBToRGBA),
+ width, height);
+}
+
+// Convert ARGB To RGB24.
+LIBYUV_API
+int ARGBToRGB24(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb24, int dst_stride_rgb24,
+ int width, int height) {
+ int y;
+ void (*ARGBToRGB24Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToRGB24Row_C;
+ if (!src_argb || !dst_rgb24 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_rgb24 == width * 3) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_rgb24 = 0;
+ }
+#if defined(HAS_ARGBTORGB24ROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_SSSE3;
+ }
+ }
+#elif defined(HAS_ARGBTORGB24ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToRGB24Row = ARGBToRGB24Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB24Row = ARGBToRGB24Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToRGB24Row(src_argb, dst_rgb24, width);
+ src_argb += src_stride_argb;
+ dst_rgb24 += dst_stride_rgb24;
+ }
+ return 0;
+}
+
+// Convert ARGB To RAW.
+LIBYUV_API
+int ARGBToRAW(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_raw, int dst_stride_raw,
+ int width, int height) {
+ int y;
+ void (*ARGBToRAWRow)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToRAWRow_C;
+ if (!src_argb || !dst_raw || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_raw == width * 3) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_raw = 0;
+ }
+#if defined(HAS_ARGBTORAWROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToRAWRow = ARGBToRAWRow_SSSE3;
+ }
+ }
+#elif defined(HAS_ARGBTORAWROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToRAWRow = ARGBToRAWRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRAWRow = ARGBToRAWRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToRAWRow(src_argb, dst_raw, width);
+ src_argb += src_stride_argb;
+ dst_raw += dst_stride_raw;
+ }
+ return 0;
+}
+
+// Convert ARGB To RGB565.
+LIBYUV_API
+int ARGBToRGB565(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_rgb565, int dst_stride_rgb565,
+ int width, int height) {
+ int y;
+ void (*ARGBToRGB565Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToRGB565Row_C;
+ if (!src_argb || !dst_rgb565 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_rgb565 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_rgb565 = 0;
+ }
+#if defined(HAS_ARGBTORGB565ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_SSE2;
+ }
+ }
+#elif defined(HAS_ARGBTORGB565ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToRGB565Row = ARGBToRGB565Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToRGB565Row = ARGBToRGB565Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToRGB565Row(src_argb, dst_rgb565, width);
+ src_argb += src_stride_argb;
+ dst_rgb565 += dst_stride_rgb565;
+ }
+ return 0;
+}
+
+// Convert ARGB To ARGB1555.
+LIBYUV_API
+int ARGBToARGB1555(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb1555, int dst_stride_argb1555,
+ int width, int height) {
+ int y;
+ void (*ARGBToARGB1555Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToARGB1555Row_C;
+ if (!src_argb || !dst_argb1555 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb1555 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb1555 = 0;
+ }
+#if defined(HAS_ARGBTOARGB1555ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_SSE2;
+ }
+ }
+#elif defined(HAS_ARGBTOARGB1555ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB1555Row = ARGBToARGB1555Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToARGB1555Row(src_argb, dst_argb1555, width);
+ src_argb += src_stride_argb;
+ dst_argb1555 += dst_stride_argb1555;
+ }
+ return 0;
+}
+
+// Convert ARGB To ARGB4444.
+LIBYUV_API
+int ARGBToARGB4444(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb4444, int dst_stride_argb4444,
+ int width, int height) {
+ int y;
+ void (*ARGBToARGB4444Row)(const uint8* src_argb, uint8* dst_rgb, int pix) =
+ ARGBToARGB4444Row_C;
+ if (!src_argb || !dst_argb4444 || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_argb4444 == width * 2) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_argb4444 = 0;
+ }
+#if defined(HAS_ARGBTOARGB4444ROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && width >= 4 &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_SSE2;
+ if (IS_ALIGNED(width, 4)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_SSE2;
+ }
+ }
+#elif defined(HAS_ARGBTOARGB4444ROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToARGB4444Row = ARGBToARGB4444Row_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToARGB4444Row(src_argb, dst_argb4444, width);
+ src_argb += src_stride_argb;
+ dst_argb4444 += dst_stride_argb4444;
+ }
+ return 0;
+}
+
+// Convert ARGB to J420. (JPeg full range I420).
+LIBYUV_API
+int ARGBToJ420(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yj, int dst_stride_yj,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height) {
+ int y;
+ void (*ARGBToUVJRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVJRow_C;
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
+ ARGBToYJRow_C;
+ if (!src_argb ||
+ !dst_yj || !dst_u || !dst_v ||
+ width <= 0 || height == 0) {
+ return -1;
+ }
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOYJROW_SSSE3) && defined(HAS_ARGBTOUVJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_Unaligned_SSSE3;
+ ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_SSSE3;
+ if (IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2) && defined(HAS_ARGBTOUVJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ if (width >= 16) {
+ ARGBToUVJRow = ARGBToUVJRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVJRow = ARGBToUVJRow_NEON;
+ }
+ }
+ }
+#endif
+
+ for (y = 0; y < height - 1; y += 2) {
+ ARGBToUVJRow(src_argb, src_stride_argb, dst_u, dst_v, width);
+ ARGBToYJRow(src_argb, dst_yj, width);
+ ARGBToYJRow(src_argb + src_stride_argb, dst_yj + dst_stride_yj, width);
+ src_argb += src_stride_argb * 2;
+ dst_yj += dst_stride_yj * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ ARGBToUVJRow(src_argb, 0, dst_u, dst_v, width);
+ ARGBToYJRow(src_argb, dst_yj, width);
+ }
+ return 0;
+}
+
+// Convert ARGB to J400.
+LIBYUV_API
+int ARGBToJ400(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_yj, int dst_stride_yj,
+ int width, int height) {
+ int y;
+ void (*ARGBToYJRow)(const uint8* src_argb, uint8* dst_yj, int pix) =
+ ARGBToYJRow_C;
+ if (!src_argb || !dst_yj || width <= 0 || height == 0) {
+ return -1;
+ }
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+ // Coalesce rows.
+ if (src_stride_argb == width * 4 &&
+ dst_stride_yj == width) {
+ width *= height;
+ height = 1;
+ src_stride_argb = dst_stride_yj = 0;
+ }
+#if defined(HAS_ARGBTOYJROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToYJRow = ARGBToYJRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYJRow = ARGBToYJRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16) &&
+ IS_ALIGNED(dst_yj, 16) && IS_ALIGNED(dst_stride_yj, 16)) {
+ ARGBToYJRow = ARGBToYJRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 32) {
+ ARGBToYJRow = ARGBToYJRow_Any_AVX2;
+ if (IS_ALIGNED(width, 32)) {
+ ARGBToYJRow = ARGBToYJRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_ARGBTOYJROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYJRow = ARGBToYJRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYJRow = ARGBToYJRow_NEON;
+ }
+ }
+#endif
+
+ for (y = 0; y < height; ++y) {
+ ARGBToYJRow(src_argb, dst_yj, width);
+ src_argb += src_stride_argb;
+ dst_yj += dst_stride_yj;
+ }
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/convert_jpeg.cc b/third_party/libyuv/source/convert_jpeg.cc
new file mode 100644
index 000000000..bcb980f7f
--- /dev/null
+++ b/third_party/libyuv/source/convert_jpeg.cc
@@ -0,0 +1,392 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert.h"
+
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#ifdef HAVE_JPEG
+struct I420Buffers {
+ uint8* y;
+ int y_stride;
+ uint8* u;
+ int u_stride;
+ uint8* v;
+ int v_stride;
+ int w;
+ int h;
+};
+
+static void JpegCopyI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I420Copy(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I422ToI420(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I444ToI420(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI411ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I411ToI420(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToI420(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ I420Buffers* dest = (I420Buffers*)(opaque);
+ I400ToI420(data[0], strides[0],
+ dest->y, dest->y_stride,
+ dest->u, dest->u_stride,
+ dest->v, dest->v_stride,
+ dest->w, rows);
+ dest->y += rows * dest->y_stride;
+ dest->u += ((rows + 1) >> 1) * dest->u_stride;
+ dest->v += ((rows + 1) >> 1) * dest->v_stride;
+ dest->h -= rows;
+}
+
+// Query size of MJPG in pixels.
+LIBYUV_API
+int MJPGSize(const uint8* sample, size_t sample_size,
+ int* width, int* height) {
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret) {
+ *width = mjpeg_decoder.GetWidth();
+ *height = mjpeg_decoder.GetHeight();
+ }
+ mjpeg_decoder.UnloadFrame();
+ return ret ? 0 : -1; // -1 for runtime failure.
+}
+
+// MJPG (Motion JPeg) to I420
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToI420(const uint8* sample,
+ size_t sample_size,
+ uint8* y, int y_stride,
+ uint8* u, int u_stride,
+ uint8* v, int v_stride,
+ int w, int h,
+ int dw, int dh) {
+ if (sample_size == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port MJpeg to C.
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret && (mjpeg_decoder.GetWidth() != w ||
+ mjpeg_decoder.GetHeight() != h)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ I420Buffers bufs = { y, y_stride, u, u_stride, v, v_stride, dw, dh };
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegCopyI420, &bufs, dw, dh);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToI420, &bufs, dw, dh);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToI420, &bufs, dw, dh);
+ // YUV411
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToI420, &bufs, dw, dh);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToI420, &bufs, dw, dh);
+ } else {
+ // TODO(fbarchard): Implement conversion for any other colorspace/sample
+ // factors that occur in practice. 411 is supported by libjpeg
+ // ERROR: Unable to convert MJPEG frame because format is not supported
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return ret ? 0 : 1;
+}
+
+#ifdef HAVE_JPEG
+struct ARGBBuffers {
+ uint8* argb;
+ int argb_stride;
+ int w;
+ int h;
+};
+
+static void JpegI420ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+ I420ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI422ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+ I422ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI444ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+ I444ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI411ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+ I411ToARGB(data[0], strides[0],
+ data[1], strides[1],
+ data[2], strides[2],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+static void JpegI400ToARGB(void* opaque,
+ const uint8* const* data,
+ const int* strides,
+ int rows) {
+ ARGBBuffers* dest = (ARGBBuffers*)(opaque);
+ I400ToARGB(data[0], strides[0],
+ dest->argb, dest->argb_stride,
+ dest->w, rows);
+ dest->argb += rows * dest->argb_stride;
+ dest->h -= rows;
+}
+
+// MJPG (Motion JPeg) to ARGB
+// TODO(fbarchard): review w and h requirement. dw and dh may be enough.
+LIBYUV_API
+int MJPGToARGB(const uint8* sample,
+ size_t sample_size,
+ uint8* argb, int argb_stride,
+ int w, int h,
+ int dw, int dh) {
+ if (sample_size == kUnknownDataSize) {
+ // ERROR: MJPEG frame size unknown
+ return -1;
+ }
+
+ // TODO(fbarchard): Port MJpeg to C.
+ MJpegDecoder mjpeg_decoder;
+ LIBYUV_BOOL ret = mjpeg_decoder.LoadFrame(sample, sample_size);
+ if (ret && (mjpeg_decoder.GetWidth() != w ||
+ mjpeg_decoder.GetHeight() != h)) {
+ // ERROR: MJPEG frame has unexpected dimensions
+ mjpeg_decoder.UnloadFrame();
+ return 1; // runtime failure
+ }
+ if (ret) {
+ ARGBBuffers bufs = { argb, argb_stride, dw, dh };
+ // YUV420
+ if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 2 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI420ToARGB, &bufs, dw, dh);
+ // YUV422
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 2 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI422ToARGB, &bufs, dw, dh);
+ // YUV444
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI444ToARGB, &bufs, dw, dh);
+ // YUV411
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceYCbCr &&
+ mjpeg_decoder.GetNumComponents() == 3 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 4 &&
+ mjpeg_decoder.GetVertSampFactor(1) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(1) == 1 &&
+ mjpeg_decoder.GetVertSampFactor(2) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(2) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI411ToARGB, &bufs, dw, dh);
+ // YUV400
+ } else if (mjpeg_decoder.GetColorSpace() ==
+ MJpegDecoder::kColorSpaceGrayscale &&
+ mjpeg_decoder.GetNumComponents() == 1 &&
+ mjpeg_decoder.GetVertSampFactor(0) == 1 &&
+ mjpeg_decoder.GetHorizSampFactor(0) == 1) {
+ ret = mjpeg_decoder.DecodeToCallback(&JpegI400ToARGB, &bufs, dw, dh);
+ } else {
+ // TODO(fbarchard): Implement conversion for any other colorspace/sample
+ // factors that occur in practice. 411 is supported by libjpeg
+ // ERROR: Unable to convert MJPEG frame because format is not supported
+ mjpeg_decoder.UnloadFrame();
+ return 1;
+ }
+ }
+ return ret ? 0 : 1;
+}
+#endif
+
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/convert_to_argb.cc b/third_party/libyuv/source/convert_to_argb.cc
new file mode 100644
index 000000000..1b228a7b4
--- /dev/null
+++ b/third_party/libyuv/source/convert_to_argb.cc
@@ -0,0 +1,327 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/convert_argb.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/format_conversion.h"
+#ifdef HAVE_JPEG
+#include "libyuv/mjpeg_decoder.h"
+#endif
+#include "libyuv/rotate_argb.h"
+#include "libyuv/row.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// src_width is used for source stride computation
+// src_height is used to compute location of planes, and indicate inversion
+// sample_size is measured in bytes and is the size of the frame.
+// With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToARGB(const uint8* sample, size_t sample_size,
+ uint8* crop_argb, int argb_stride,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int crop_width, int crop_height,
+ enum RotationMode rotation,
+ uint32 fourcc) {
+ uint32 format = CanonicalFourCC(fourcc);
+ int aligned_src_width = (src_width + 1) & ~1;
+ const uint8* src;
+ const uint8* src_uv;
+ int abs_src_height = (src_height < 0) ? -src_height : src_height;
+ int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+ int r = 0;
+
+ // One pass rotation is available for some formats. For the rest, convert
+ // to I420 (with optional vertical flipping) into a temporary I420 buffer,
+ // and then rotate the I420 to the final destination buffer.
+ // For in-place conversion, if destination crop_argb is same as source sample,
+ // also enable temporary buffer.
+ LIBYUV_BOOL need_buf = (rotation && format != FOURCC_ARGB) ||
+ crop_argb == sample;
+ uint8* tmp_argb = crop_argb;
+ int tmp_argb_stride = argb_stride;
+ uint8* rotate_buffer = NULL;
+ int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+
+ if (crop_argb == NULL || sample == NULL ||
+ src_width <= 0 || crop_width <= 0 ||
+ src_height == 0 || crop_height == 0) {
+ return -1;
+ }
+ if (src_height < 0) {
+ inv_crop_height = -inv_crop_height;
+ }
+
+ if (need_buf) {
+ int argb_size = crop_width * abs_crop_height * 4;
+ rotate_buffer = (uint8*)malloc(argb_size);
+ if (!rotate_buffer) {
+ return 1; // Out of memory runtime error.
+ }
+ crop_argb = rotate_buffer;
+ argb_stride = crop_width;
+ }
+
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = YUY2ToARGB(src, aligned_src_width * 2,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_UYVY:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = UYVYToARGB(src, aligned_src_width * 2,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_24BG:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RGB24ToARGB(src, src_width * 3,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RAW:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RAWToARGB(src, src_width * 3,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_ARGB:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ARGBToARGB(src, src_width * 4,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_BGRA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = BGRAToARGB(src, src_width * 4,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_ABGR:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ABGRToARGB(src, src_width * 4,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = RGBAToARGB(src, src_width * 4,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBP:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = RGB565ToARGB(src, src_width * 2,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBO:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB1555ToARGB(src, src_width * 2,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_R444:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB4444ToARGB(src, src_width * 2,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ // TODO(fbarchard): Support cropping Bayer by odd numbers
+ // by adjusting fourcc.
+ case FOURCC_BGGR:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerBGGRToARGB(src, src_width,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+
+ case FOURCC_GBRG:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerGBRGToARGB(src, src_width,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+
+ case FOURCC_GRBG:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerGRBGToARGB(src, src_width,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+
+ case FOURCC_RGGB:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerRGGBToARGB(src, src_width,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+
+ case FOURCC_I400:
+ src = sample + src_width * crop_y + crop_x;
+ r = I400ToARGB(src, src_width,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+
+ // Biplanar formats
+ case FOURCC_NV12:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ r = NV12ToARGB(src, src_width,
+ src_uv, aligned_src_width,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_NV21:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ // Call NV12 but with u and v parameters swapped.
+ r = NV21ToARGB(src, src_width,
+ src_uv, aligned_src_width,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_M420:
+ src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+ r = M420ToARGB(src, src_width,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+// case FOURCC_Q420:
+// src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
+// src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
+// src_width + crop_x * 2;
+// r = Q420ToARGB(src, src_width * 3,
+// src_uv, src_width * 3,
+// crop_argb, argb_stride,
+// crop_width, inv_crop_height);
+// break;
+ // Triplanar formats
+ case FOURCC_I420:
+ case FOURCC_YU12:
+ case FOURCC_YV12: {
+ const uint8* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ if (format == FOURCC_YV12) {
+ src_v = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ }
+ r = I420ToARGB(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ if (format == FOURCC_YV16) {
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ }
+ r = I422ToARGB(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ if (format == FOURCC_YV24) {
+ src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ } else {
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ }
+ r = I444ToARGB(src_y, src_width,
+ src_u, src_width,
+ src_v, src_width,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+ case FOURCC_I411: {
+ int quarterwidth = (src_width + 3) / 4;
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u = sample + src_width * abs_src_height +
+ quarterwidth * crop_y + crop_x / 4;
+ const uint8* src_v = sample + src_width * abs_src_height +
+ quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+ r = I411ToARGB(src_y, src_width,
+ src_u, quarterwidth,
+ src_v, quarterwidth,
+ crop_argb, argb_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+#ifdef HAVE_JPEG
+ case FOURCC_MJPG:
+ r = MJPGToARGB(sample, sample_size,
+ crop_argb, argb_stride,
+ src_width, abs_src_height, crop_width, inv_crop_height);
+ break;
+#endif
+ default:
+ r = -1; // unknown fourcc - return failure code.
+ }
+
+ if (need_buf) {
+ if (!r) {
+ r = ARGBRotate(crop_argb, argb_stride,
+ tmp_argb, tmp_argb_stride,
+ crop_width, abs_crop_height, rotation);
+ }
+ free(rotate_buffer);
+ }
+
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/convert_to_i420.cc b/third_party/libyuv/source/convert_to_i420.cc
new file mode 100644
index 000000000..7b194fff7
--- /dev/null
+++ b/third_party/libyuv/source/convert_to_i420.cc
@@ -0,0 +1,383 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "libyuv/convert.h"
+
+#include "libyuv/format_conversion.h"
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Convert camera sample to I420 with cropping, rotation and vertical flip.
+// src_width is used for source stride computation
+// src_height is used to compute location of planes, and indicate inversion
+// sample_size is measured in bytes and is the size of the frame.
+// With MJPEG it is the compressed size of the frame.
+LIBYUV_API
+int ConvertToI420(const uint8* sample,
+ size_t sample_size,
+ uint8* y, int y_stride,
+ uint8* u, int u_stride,
+ uint8* v, int v_stride,
+ int crop_x, int crop_y,
+ int src_width, int src_height,
+ int crop_width, int crop_height,
+ enum RotationMode rotation,
+ uint32 fourcc) {
+ uint32 format = CanonicalFourCC(fourcc);
+ int aligned_src_width = (src_width + 1) & ~1;
+ const uint8* src;
+ const uint8* src_uv;
+ int abs_src_height = (src_height < 0) ? -src_height : src_height;
+ int inv_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+ int r = 0;
+ LIBYUV_BOOL need_buf = (rotation && format != FOURCC_I420 &&
+ format != FOURCC_NV12 && format != FOURCC_NV21 &&
+ format != FOURCC_YU12 && format != FOURCC_YV12) || y == sample;
+ uint8* tmp_y = y;
+ uint8* tmp_u = u;
+ uint8* tmp_v = v;
+ int tmp_y_stride = y_stride;
+ int tmp_u_stride = u_stride;
+ int tmp_v_stride = v_stride;
+ uint8* rotate_buffer = NULL;
+ int abs_crop_height = (crop_height < 0) ? -crop_height : crop_height;
+
+ if (!y || !u || !v || !sample ||
+ src_width <= 0 || crop_width <= 0 ||
+ src_height == 0 || crop_height == 0) {
+ return -1;
+ }
+ if (src_height < 0) {
+ inv_crop_height = -inv_crop_height;
+ }
+
+ // One pass rotation is available for some formats. For the rest, convert
+ // to I420 (with optional vertical flipping) into a temporary I420 buffer,
+ // and then rotate the I420 to the final destination buffer.
+ // For in-place conversion, if destination y is same as source sample,
+ // also enable temporary buffer.
+ if (need_buf) {
+ int y_size = crop_width * abs_crop_height;
+ int uv_size = ((crop_width + 1) / 2) * ((abs_crop_height + 1) / 2);
+ rotate_buffer = (uint8*)malloc(y_size + uv_size * 2);
+ if (!rotate_buffer) {
+ return 1; // Out of memory runtime error.
+ }
+ y = rotate_buffer;
+ u = y + y_size;
+ v = u + uv_size;
+ y_stride = crop_width;
+ u_stride = v_stride = ((crop_width + 1) / 2);
+ }
+
+ switch (format) {
+ // Single plane formats
+ case FOURCC_YUY2:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = YUY2ToI420(src, aligned_src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_UYVY:
+ src = sample + (aligned_src_width * crop_y + crop_x) * 2;
+ r = UYVYToI420(src, aligned_src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBP:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = RGB565ToI420(src, src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBO:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB1555ToI420(src, src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_R444:
+ src = sample + (src_width * crop_y + crop_x) * 2;
+ r = ARGB4444ToI420(src, src_width * 2,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_24BG:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RGB24ToI420(src, src_width * 3,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RAW:
+ src = sample + (src_width * crop_y + crop_x) * 3;
+ r = RAWToI420(src, src_width * 3,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_ARGB:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ARGBToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_BGRA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = BGRAToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_ABGR:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = ABGRToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGBA:
+ src = sample + (src_width * crop_y + crop_x) * 4;
+ r = RGBAToI420(src, src_width * 4,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ // TODO(fbarchard): Support cropping Bayer by odd numbers
+ // by adjusting fourcc.
+ case FOURCC_BGGR:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerBGGRToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_GBRG:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerGBRGToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_GRBG:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerGRBGToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_RGGB:
+ src = sample + (src_width * crop_y + crop_x);
+ r = BayerRGGBToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_I400:
+ src = sample + src_width * crop_y + crop_x;
+ r = I400ToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ // Biplanar formats
+ case FOURCC_NV12:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ r = NV12ToI420Rotate(src, src_width,
+ src_uv, aligned_src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height, rotation);
+ break;
+ case FOURCC_NV21:
+ src = sample + (src_width * crop_y + crop_x);
+ src_uv = sample + aligned_src_width * (src_height + crop_y / 2) + crop_x;
+ // Call NV12 but with u and v parameters swapped.
+ r = NV12ToI420Rotate(src, src_width,
+ src_uv, aligned_src_width,
+ y, y_stride,
+ v, v_stride,
+ u, u_stride,
+ crop_width, inv_crop_height, rotation);
+ break;
+ case FOURCC_M420:
+ src = sample + (src_width * crop_y) * 12 / 8 + crop_x;
+ r = M420ToI420(src, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ case FOURCC_Q420:
+ src = sample + (src_width + aligned_src_width * 2) * crop_y + crop_x;
+ src_uv = sample + (src_width + aligned_src_width * 2) * crop_y +
+ src_width + crop_x * 2;
+ r = Q420ToI420(src, src_width * 3,
+ src_uv, src_width * 3,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ // Triplanar formats
+ case FOURCC_I420:
+ case FOURCC_YU12:
+ case FOURCC_YV12: {
+ const uint8* src_y = sample + (src_width * crop_y + crop_x);
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ int halfheight = (abs_src_height + 1) / 2;
+ if (format == FOURCC_YV12) {
+ src_v = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ (halfwidth * crop_y + crop_x) / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (halfheight + crop_y / 2) + crop_x / 2;
+ }
+ r = I420Rotate(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height, rotation);
+ break;
+ }
+ case FOURCC_I422:
+ case FOURCC_YV16: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ int halfwidth = (src_width + 1) / 2;
+ if (format == FOURCC_YV16) {
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ } else {
+ src_u = sample + src_width * abs_src_height +
+ halfwidth * crop_y + crop_x / 2;
+ src_v = sample + src_width * abs_src_height +
+ halfwidth * (abs_src_height + crop_y) + crop_x / 2;
+ }
+ r = I422ToI420(src_y, src_width,
+ src_u, halfwidth,
+ src_v, halfwidth,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+ case FOURCC_I444:
+ case FOURCC_YV24: {
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u;
+ const uint8* src_v;
+ if (format == FOURCC_YV24) {
+ src_v = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_u = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ } else {
+ src_u = sample + src_width * (abs_src_height + crop_y) + crop_x;
+ src_v = sample + src_width * (abs_src_height * 2 + crop_y) + crop_x;
+ }
+ r = I444ToI420(src_y, src_width,
+ src_u, src_width,
+ src_v, src_width,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+ case FOURCC_I411: {
+ int quarterwidth = (src_width + 3) / 4;
+ const uint8* src_y = sample + src_width * crop_y + crop_x;
+ const uint8* src_u = sample + src_width * abs_src_height +
+ quarterwidth * crop_y + crop_x / 4;
+ const uint8* src_v = sample + src_width * abs_src_height +
+ quarterwidth * (abs_src_height + crop_y) + crop_x / 4;
+ r = I411ToI420(src_y, src_width,
+ src_u, quarterwidth,
+ src_v, quarterwidth,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ crop_width, inv_crop_height);
+ break;
+ }
+#ifdef HAVE_JPEG
+ case FOURCC_MJPG:
+ r = MJPGToI420(sample, sample_size,
+ y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ src_width, abs_src_height, crop_width, inv_crop_height);
+ break;
+#endif
+ default:
+ r = -1; // unknown fourcc - return failure code.
+ }
+
+ if (need_buf) {
+ if (!r) {
+ r = I420Rotate(y, y_stride,
+ u, u_stride,
+ v, v_stride,
+ tmp_y, tmp_y_stride,
+ tmp_u, tmp_u_stride,
+ tmp_v, tmp_v_stride,
+ crop_width, abs_crop_height, rotation);
+ }
+ free(rotate_buffer);
+ }
+
+ return r;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/cpu_id.cc b/third_party/libyuv/source/cpu_id.cc
index 520cfe510..2e0d61d20 100644
--- a/third_party/libyuv/source/cpu_id.cc
+++ b/third_party/libyuv/source/cpu_id.cc
@@ -8,9 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "third_party/libyuv/include/libyuv/cpu_id.h"
+#include "libyuv/cpu_id.h"
-#ifdef _MSC_VER
+#if defined(_MSC_VER) && !defined(__clang__)
#include <intrin.h> // For __cpuidex()
#endif
#if !defined(__pnacl__) && !defined(__CLR_VER) && \
@@ -27,7 +27,7 @@
#include <stdio.h>
#include <string.h>
-#include "third_party/libyuv/include/libyuv/basic_types.h" // For CPU_X86
+#include "libyuv/basic_types.h" // For CPU_X86
#ifdef __cplusplus
namespace libyuv {
@@ -48,7 +48,7 @@ extern "C" {
defined(__i386__) || defined(__x86_64__))
LIBYUV_API
void CpuId(uint32 info_eax, uint32 info_ecx, uint32* cpu_info) {
-#if defined(_MSC_VER)
+#if defined(_MSC_VER) && !defined(__clang__)
#if (_MSC_FULL_VER >= 160040219)
__cpuidex((int*)(cpu_info), info_eax, info_ecx);
#elif defined(_M_IX86)
@@ -188,10 +188,14 @@ LIBYUV_API SAFEBUFFERS
int InitCpuFlags(void) {
#if !defined(__pnacl__) && !defined(__CLR_VER) && defined(CPU_X86)
+ uint32 cpu_info0[4] = { 0, 0, 0, 0 };
uint32 cpu_info1[4] = { 0, 0, 0, 0 };
uint32 cpu_info7[4] = { 0, 0, 0, 0 };
+ CpuId(0, 0, cpu_info0);
CpuId(1, 0, cpu_info1);
- CpuId(7, 0, cpu_info7);
+ if (cpu_info0[0] >= 7) {
+ CpuId(7, 0, cpu_info7);
+ }
cpu_info_ = ((cpu_info1[3] & 0x04000000) ? kCpuHasSSE2 : 0) |
((cpu_info1[2] & 0x00000200) ? kCpuHasSSSE3 : 0) |
((cpu_info1[2] & 0x00080000) ? kCpuHasSSE41 : 0) |
@@ -199,6 +203,7 @@ int InitCpuFlags(void) {
((cpu_info7[1] & 0x00000200) ? kCpuHasERMS : 0) |
((cpu_info1[2] & 0x00001000) ? kCpuHasFMA3 : 0) |
kCpuHasX86;
+
#ifdef HAS_XGETBV
if ((cpu_info1[2] & 0x18000000) == 0x18000000 && // AVX and OSSave
TestOsSaveYmm()) { // Saves YMM.
diff --git a/third_party/libyuv/source/format_conversion.cc b/third_party/libyuv/source/format_conversion.cc
new file mode 100644
index 000000000..a3daf96a9
--- /dev/null
+++ b/third_party/libyuv/source/format_conversion.cc
@@ -0,0 +1,552 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/format_conversion.h"
+
+#include "libyuv/basic_types.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/video_common.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// generate a selector mask useful for pshufb
+static uint32 GenerateSelector(int select0, int select1) {
+ return (uint32)(select0) |
+ (uint32)((select1 + 4) << 8) |
+ (uint32)((select0 + 8) << 16) |
+ (uint32)((select1 + 12) << 24);
+}
+
+static int MakeSelectors(const int blue_index,
+ const int green_index,
+ const int red_index,
+ uint32 dst_fourcc_bayer,
+ uint32* index_map) {
+ // Now build a lookup table containing the indices for the four pixels in each
+ // 2x2 Bayer grid.
+ switch (dst_fourcc_bayer) {
+ case FOURCC_BGGR:
+ index_map[0] = GenerateSelector(blue_index, green_index);
+ index_map[1] = GenerateSelector(green_index, red_index);
+ break;
+ case FOURCC_GBRG:
+ index_map[0] = GenerateSelector(green_index, blue_index);
+ index_map[1] = GenerateSelector(red_index, green_index);
+ break;
+ case FOURCC_RGGB:
+ index_map[0] = GenerateSelector(red_index, green_index);
+ index_map[1] = GenerateSelector(green_index, blue_index);
+ break;
+ case FOURCC_GRBG:
+ index_map[0] = GenerateSelector(green_index, red_index);
+ index_map[1] = GenerateSelector(blue_index, green_index);
+ break;
+ default:
+ return -1; // Bad FourCC
+ }
+ return 0;
+}
+
+// Converts 32 bit ARGB to Bayer RGB formats.
+LIBYUV_API
+int ARGBToBayer(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height,
+ uint32 dst_fourcc_bayer) {
+ int y;
+ const int blue_index = 0; // Offsets for ARGB format
+ const int green_index = 1;
+ const int red_index = 2;
+ uint32 index_map[2];
+ void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) = ARGBToBayerRow_C;
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+#if defined(HAS_ARGBTOBAYERROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8 &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride_argb, 16)) {
+ ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToBayerRow = ARGBToBayerRow_SSSE3;
+ }
+ }
+#elif defined(HAS_ARGBTOBAYERROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToBayerRow = ARGBToBayerRow_NEON;
+ }
+ }
+#endif
+ if (MakeSelectors(blue_index, green_index, red_index,
+ dst_fourcc_bayer, index_map)) {
+ return -1; // Bad FourCC
+ }
+
+ for (y = 0; y < height; ++y) {
+ ARGBToBayerRow(src_argb, dst_bayer, index_map[y & 1], width);
+ src_argb += src_stride_argb;
+ dst_bayer += dst_stride_bayer;
+ }
+ return 0;
+}
+
+#define AVG(a, b) (((a) + (b)) >> 1)
+
+static void BayerRowBG(const uint8* src_bayer0, int src_stride_bayer,
+ uint8* dst_argb, int pix) {
+ const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
+ uint8 g = src_bayer0[1];
+ uint8 r = src_bayer1[1];
+ int x;
+ for (x = 0; x < pix - 2; x += 2) {
+ dst_argb[0] = src_bayer0[0];
+ dst_argb[1] = AVG(g, src_bayer0[1]);
+ dst_argb[2] = AVG(r, src_bayer1[1]);
+ dst_argb[3] = 255U;
+ dst_argb[4] = AVG(src_bayer0[0], src_bayer0[2]);
+ dst_argb[5] = src_bayer0[1];
+ dst_argb[6] = src_bayer1[1];
+ dst_argb[7] = 255U;
+ g = src_bayer0[1];
+ r = src_bayer1[1];
+ src_bayer0 += 2;
+ src_bayer1 += 2;
+ dst_argb += 8;
+ }
+ dst_argb[0] = src_bayer0[0];
+ dst_argb[1] = AVG(g, src_bayer0[1]);
+ dst_argb[2] = AVG(r, src_bayer1[1]);
+ dst_argb[3] = 255U;
+ if (!(pix & 1)) {
+ dst_argb[4] = src_bayer0[0];
+ dst_argb[5] = src_bayer0[1];
+ dst_argb[6] = src_bayer1[1];
+ dst_argb[7] = 255U;
+ }
+}
+
+static void BayerRowRG(const uint8* src_bayer0, int src_stride_bayer,
+ uint8* dst_argb, int pix) {
+ const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
+ uint8 g = src_bayer0[1];
+ uint8 b = src_bayer1[1];
+ int x;
+ for (x = 0; x < pix - 2; x += 2) {
+ dst_argb[0] = AVG(b, src_bayer1[1]);
+ dst_argb[1] = AVG(g, src_bayer0[1]);
+ dst_argb[2] = src_bayer0[0];
+ dst_argb[3] = 255U;
+ dst_argb[4] = src_bayer1[1];
+ dst_argb[5] = src_bayer0[1];
+ dst_argb[6] = AVG(src_bayer0[0], src_bayer0[2]);
+ dst_argb[7] = 255U;
+ g = src_bayer0[1];
+ b = src_bayer1[1];
+ src_bayer0 += 2;
+ src_bayer1 += 2;
+ dst_argb += 8;
+ }
+ dst_argb[0] = AVG(b, src_bayer1[1]);
+ dst_argb[1] = AVG(g, src_bayer0[1]);
+ dst_argb[2] = src_bayer0[0];
+ dst_argb[3] = 255U;
+ if (!(pix & 1)) {
+ dst_argb[4] = src_bayer1[1];
+ dst_argb[5] = src_bayer0[1];
+ dst_argb[6] = src_bayer0[0];
+ dst_argb[7] = 255U;
+ }
+}
+
+static void BayerRowGB(const uint8* src_bayer0, int src_stride_bayer,
+ uint8* dst_argb, int pix) {
+ const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
+ uint8 b = src_bayer0[1];
+ int x;
+ for (x = 0; x < pix - 2; x += 2) {
+ dst_argb[0] = AVG(b, src_bayer0[1]);
+ dst_argb[1] = src_bayer0[0];
+ dst_argb[2] = src_bayer1[0];
+ dst_argb[3] = 255U;
+ dst_argb[4] = src_bayer0[1];
+ dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
+ dst_argb[6] = AVG(src_bayer1[0], src_bayer1[2]);
+ dst_argb[7] = 255U;
+ b = src_bayer0[1];
+ src_bayer0 += 2;
+ src_bayer1 += 2;
+ dst_argb += 8;
+ }
+ dst_argb[0] = AVG(b, src_bayer0[1]);
+ dst_argb[1] = src_bayer0[0];
+ dst_argb[2] = src_bayer1[0];
+ dst_argb[3] = 255U;
+ if (!(pix & 1)) {
+ dst_argb[4] = src_bayer0[1];
+ dst_argb[5] = src_bayer0[0];
+ dst_argb[6] = src_bayer1[0];
+ dst_argb[7] = 255U;
+ }
+}
+
+static void BayerRowGR(const uint8* src_bayer0, int src_stride_bayer,
+ uint8* dst_argb, int pix) {
+ const uint8* src_bayer1 = src_bayer0 + src_stride_bayer;
+ uint8 r = src_bayer0[1];
+ int x;
+ for (x = 0; x < pix - 2; x += 2) {
+ dst_argb[0] = src_bayer1[0];
+ dst_argb[1] = src_bayer0[0];
+ dst_argb[2] = AVG(r, src_bayer0[1]);
+ dst_argb[3] = 255U;
+ dst_argb[4] = AVG(src_bayer1[0], src_bayer1[2]);
+ dst_argb[5] = AVG(src_bayer0[0], src_bayer0[2]);
+ dst_argb[6] = src_bayer0[1];
+ dst_argb[7] = 255U;
+ r = src_bayer0[1];
+ src_bayer0 += 2;
+ src_bayer1 += 2;
+ dst_argb += 8;
+ }
+ dst_argb[0] = src_bayer1[0];
+ dst_argb[1] = src_bayer0[0];
+ dst_argb[2] = AVG(r, src_bayer0[1]);
+ dst_argb[3] = 255U;
+ if (!(pix & 1)) {
+ dst_argb[4] = src_bayer1[0];
+ dst_argb[5] = src_bayer0[0];
+ dst_argb[6] = src_bayer0[1];
+ dst_argb[7] = 255U;
+ }
+}
+
+// Converts any Bayer RGB format to ARGB.
+LIBYUV_API
+int BayerToARGB(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height,
+ uint32 src_fourcc_bayer) {
+ int y;
+ void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int pix);
+ void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int pix);
+ if (height < 0) {
+ height = -height;
+ dst_argb = dst_argb + (height - 1) * dst_stride_argb;
+ dst_stride_argb = -dst_stride_argb;
+ }
+ switch (src_fourcc_bayer) {
+ case FOURCC_BGGR:
+ BayerRow0 = BayerRowBG;
+ BayerRow1 = BayerRowGR;
+ break;
+ case FOURCC_GBRG:
+ BayerRow0 = BayerRowGB;
+ BayerRow1 = BayerRowRG;
+ break;
+ case FOURCC_GRBG:
+ BayerRow0 = BayerRowGR;
+ BayerRow1 = BayerRowBG;
+ break;
+ case FOURCC_RGGB:
+ BayerRow0 = BayerRowRG;
+ BayerRow1 = BayerRowGB;
+ break;
+ default:
+ return -1; // Bad FourCC
+ }
+
+ for (y = 0; y < height - 1; y += 2) {
+ BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
+ BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
+ dst_argb + dst_stride_argb, width);
+ src_bayer += src_stride_bayer * 2;
+ dst_argb += dst_stride_argb * 2;
+ }
+ if (height & 1) {
+ BayerRow0(src_bayer, src_stride_bayer, dst_argb, width);
+ }
+ return 0;
+}
+
+// Converts any Bayer RGB format to ARGB.
+LIBYUV_API
+int BayerToI420(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height,
+ uint32 src_fourcc_bayer) {
+ void (*BayerRow0)(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int pix);
+ void (*BayerRow1)(const uint8* src_bayer, int src_stride_bayer,
+ uint8* dst_argb, int pix);
+
+ void (*ARGBToUVRow)(const uint8* src_argb0, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int width) = ARGBToUVRow_C;
+ void (*ARGBToYRow)(const uint8* src_argb, uint8* dst_y, int pix) =
+ ARGBToYRow_C;
+ // Negative height means invert the image.
+ if (height < 0) {
+ int halfheight;
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ dst_y = dst_y + (height - 1) * dst_stride_y;
+ dst_u = dst_u + (halfheight - 1) * dst_stride_u;
+ dst_v = dst_v + (halfheight - 1) * dst_stride_v;
+ dst_stride_y = -dst_stride_y;
+ dst_stride_u = -dst_stride_u;
+ dst_stride_v = -dst_stride_v;
+ }
+#if defined(HAS_ARGBTOYROW_SSSE3) && defined(HAS_ARGBTOUVROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_SSSE3;
+ ARGBToYRow = ARGBToYRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToYRow = ARGBToYRow_Unaligned_SSSE3;
+ ARGBToUVRow = ARGBToUVRow_SSSE3;
+ if (IS_ALIGNED(dst_y, 16) && IS_ALIGNED(dst_stride_y, 16)) {
+ ARGBToYRow = ARGBToYRow_SSSE3;
+ }
+ }
+ }
+#elif defined(HAS_ARGBTOYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToYRow = ARGBToYRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToYRow = ARGBToYRow_NEON;
+ }
+ if (width >= 16) {
+ ARGBToUVRow = ARGBToUVRow_Any_NEON;
+ if (IS_ALIGNED(width, 16)) {
+ ARGBToUVRow = ARGBToUVRow_NEON;
+ }
+ }
+ }
+#endif
+
+ switch (src_fourcc_bayer) {
+ case FOURCC_BGGR:
+ BayerRow0 = BayerRowBG;
+ BayerRow1 = BayerRowGR;
+ break;
+ case FOURCC_GBRG:
+ BayerRow0 = BayerRowGB;
+ BayerRow1 = BayerRowRG;
+ break;
+ case FOURCC_GRBG:
+ BayerRow0 = BayerRowGR;
+ BayerRow1 = BayerRowBG;
+ break;
+ case FOURCC_RGGB:
+ BayerRow0 = BayerRowRG;
+ BayerRow1 = BayerRowGB;
+ break;
+ default:
+ return -1; // Bad FourCC
+ }
+
+ {
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (width * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+ int y;
+ for (y = 0; y < height - 1; y += 2) {
+ BayerRow0(src_bayer, src_stride_bayer, row, width);
+ BayerRow1(src_bayer + src_stride_bayer, -src_stride_bayer,
+ row + kRowSize, width);
+ ARGBToUVRow(row, kRowSize, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ ARGBToYRow(row + kRowSize, dst_y + dst_stride_y, width);
+ src_bayer += src_stride_bayer * 2;
+ dst_y += dst_stride_y * 2;
+ dst_u += dst_stride_u;
+ dst_v += dst_stride_v;
+ }
+ if (height & 1) {
+ BayerRow0(src_bayer, src_stride_bayer, row, width);
+ ARGBToUVRow(row, 0, dst_u, dst_v, width);
+ ARGBToYRow(row, dst_y, width);
+ }
+ free_aligned_buffer_64(row);
+ }
+ return 0;
+}
+
+// Convert I420 to Bayer.
+LIBYUV_API
+int I420ToBayer(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_bayer, int dst_stride_bayer,
+ int width, int height,
+ uint32 dst_fourcc_bayer) {
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+ void (*ARGBToBayerRow)(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) = ARGBToBayerRow_C;
+ const int blue_index = 0; // Offsets for ARGB format
+ const int green_index = 1;
+ const int red_index = 2;
+ uint32 index_map[2];
+ // Negative height means invert the image.
+ if (height < 0) {
+ int halfheight;
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && width >= 16) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2)) {
+ I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+ }
+#endif
+
+#if defined(HAS_ARGBTOBAYERROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && width >= 8) {
+ ARGBToBayerRow = ARGBToBayerRow_Any_SSSE3;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToBayerRow = ARGBToBayerRow_SSSE3;
+ }
+ }
+#elif defined(HAS_ARGBTOBAYERROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && width >= 8) {
+ ARGBToBayerRow = ARGBToBayerRow_Any_NEON;
+ if (IS_ALIGNED(width, 8)) {
+ ARGBToBayerRow = ARGBToBayerRow_NEON;
+ }
+ }
+#endif
+
+ if (MakeSelectors(blue_index, green_index, red_index,
+ dst_fourcc_bayer, index_map)) {
+ return -1; // Bad FourCC
+ }
+ {
+ // Allocate a row of ARGB.
+ align_buffer_64(row, width * 4);
+ int y;
+ for (y = 0; y < height; ++y) {
+ I422ToARGBRow(src_y, src_u, src_v, row, width);
+ ARGBToBayerRow(row, dst_bayer, index_map[y & 1], width);
+ dst_bayer += dst_stride_bayer;
+ src_y += src_stride_y;
+ if (y & 1) {
+ src_u += src_stride_u;
+ src_v += src_stride_v;
+ }
+ }
+ free_aligned_buffer_64(row);
+ }
+ return 0;
+}
+
+#define MAKEBAYERFOURCC(BAYER) \
+LIBYUV_API \
+int Bayer##BAYER##ToI420(const uint8* src_bayer, int src_stride_bayer, \
+ uint8* dst_y, int dst_stride_y, \
+ uint8* dst_u, int dst_stride_u, \
+ uint8* dst_v, int dst_stride_v, \
+ int width, int height) { \
+ return BayerToI420(src_bayer, src_stride_bayer, \
+ dst_y, dst_stride_y, \
+ dst_u, dst_stride_u, \
+ dst_v, dst_stride_v, \
+ width, height, \
+ FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int I420ToBayer##BAYER(const uint8* src_y, int src_stride_y, \
+ const uint8* src_u, int src_stride_u, \
+ const uint8* src_v, int src_stride_v, \
+ uint8* dst_bayer, int dst_stride_bayer, \
+ int width, int height) { \
+ return I420ToBayer(src_y, src_stride_y, \
+ src_u, src_stride_u, \
+ src_v, src_stride_v, \
+ dst_bayer, dst_stride_bayer, \
+ width, height, \
+ FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int ARGBToBayer##BAYER(const uint8* src_argb, int src_stride_argb, \
+ uint8* dst_bayer, int dst_stride_bayer, \
+ int width, int height) { \
+ return ARGBToBayer(src_argb, src_stride_argb, \
+ dst_bayer, dst_stride_bayer, \
+ width, height, \
+ FOURCC_##BAYER); \
+} \
+ \
+LIBYUV_API \
+int Bayer##BAYER##ToARGB(const uint8* src_bayer, int src_stride_bayer, \
+ uint8* dst_argb, int dst_stride_argb, \
+ int width, int height) { \
+ return BayerToARGB(src_bayer, src_stride_bayer, \
+ dst_argb, dst_stride_argb, \
+ width, height, \
+ FOURCC_##BAYER); \
+}
+
+MAKEBAYERFOURCC(BGGR)
+MAKEBAYERFOURCC(GBRG)
+MAKEBAYERFOURCC(GRBG)
+MAKEBAYERFOURCC(RGGB)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/mjpeg_decoder.cc b/third_party/libyuv/source/mjpeg_decoder.cc
new file mode 100644
index 000000000..15b0ed88a
--- /dev/null
+++ b/third_party/libyuv/source/mjpeg_decoder.cc
@@ -0,0 +1,566 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef HAVE_JPEG
+#include <assert.h>
+
+#if !defined(__pnacl__) && !defined(__CLR_VER) && !defined(COVERAGE_ENABLED) &&\
+ !defined(TARGET_IPHONE_SIMULATOR)
+// Must be included before jpeglib.
+#include <setjmp.h>
+#define HAVE_SETJMP
+#endif
+struct FILE; // For jpeglib.h.
+
+// C++ build requires extern C for jpeg internals.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <jpeglib.h>
+
+#ifdef __cplusplus
+} // extern "C"
+#endif
+
+#include "libyuv/planar_functions.h" // For CopyPlane().
+
+namespace libyuv {
+
+#ifdef HAVE_SETJMP
+struct SetJmpErrorMgr {
+ jpeg_error_mgr base; // Must be at the top
+ jmp_buf setjmp_buffer;
+};
+#endif
+
+const int MJpegDecoder::kColorSpaceUnknown = JCS_UNKNOWN;
+const int MJpegDecoder::kColorSpaceGrayscale = JCS_GRAYSCALE;
+const int MJpegDecoder::kColorSpaceRgb = JCS_RGB;
+const int MJpegDecoder::kColorSpaceYCbCr = JCS_YCbCr;
+const int MJpegDecoder::kColorSpaceCMYK = JCS_CMYK;
+const int MJpegDecoder::kColorSpaceYCCK = JCS_YCCK;
+
+// Methods that are passed to jpeglib.
+boolean fill_input_buffer(jpeg_decompress_struct* cinfo);
+void init_source(jpeg_decompress_struct* cinfo);
+void skip_input_data(jpeg_decompress_struct* cinfo,
+ long num_bytes); // NOLINT
+void term_source(jpeg_decompress_struct* cinfo);
+void ErrorHandler(jpeg_common_struct* cinfo);
+
+MJpegDecoder::MJpegDecoder()
+ : has_scanline_padding_(LIBYUV_FALSE),
+ num_outbufs_(0),
+ scanlines_(NULL),
+ scanlines_sizes_(NULL),
+ databuf_(NULL),
+ databuf_strides_(NULL) {
+ decompress_struct_ = new jpeg_decompress_struct;
+ source_mgr_ = new jpeg_source_mgr;
+#ifdef HAVE_SETJMP
+ error_mgr_ = new SetJmpErrorMgr;
+ decompress_struct_->err = jpeg_std_error(&error_mgr_->base);
+ // Override standard exit()-based error handler.
+ error_mgr_->base.error_exit = &ErrorHandler;
+#endif
+ decompress_struct_->client_data = NULL;
+ source_mgr_->init_source = &init_source;
+ source_mgr_->fill_input_buffer = &fill_input_buffer;
+ source_mgr_->skip_input_data = &skip_input_data;
+ source_mgr_->resync_to_restart = &jpeg_resync_to_restart;
+ source_mgr_->term_source = &term_source;
+ jpeg_create_decompress(decompress_struct_);
+ decompress_struct_->src = source_mgr_;
+ buf_vec_.buffers = &buf_;
+ buf_vec_.len = 1;
+}
+
+MJpegDecoder::~MJpegDecoder() {
+ jpeg_destroy_decompress(decompress_struct_);
+ delete decompress_struct_;
+ delete source_mgr_;
+#ifdef HAVE_SETJMP
+ delete error_mgr_;
+#endif
+ DestroyOutputBuffers();
+}
+
+LIBYUV_BOOL MJpegDecoder::LoadFrame(const uint8* src, size_t src_len) {
+ if (!ValidateJpeg(src, src_len)) {
+ return LIBYUV_FALSE;
+ }
+
+ buf_.data = src;
+ buf_.len = (int)(src_len);
+ buf_vec_.pos = 0;
+ decompress_struct_->client_data = &buf_vec_;
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called jpeg_read_header, it experienced an error, and we called
+ // longjmp() and rewound the stack to here. Return error.
+ return LIBYUV_FALSE;
+ }
+#endif
+ if (jpeg_read_header(decompress_struct_, TRUE) != JPEG_HEADER_OK) {
+ // ERROR: Bad MJPEG header
+ return LIBYUV_FALSE;
+ }
+ AllocOutputBuffers(GetNumComponents());
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int scanlines_size = GetComponentScanlinesPerImcuRow(i);
+ if (scanlines_sizes_[i] != scanlines_size) {
+ if (scanlines_[i]) {
+ delete scanlines_[i];
+ }
+ scanlines_[i] = new uint8* [scanlines_size];
+ scanlines_sizes_[i] = scanlines_size;
+ }
+
+ // We allocate padding for the final scanline to pad it up to DCTSIZE bytes
+ // to avoid memory errors, since jpeglib only reads full MCUs blocks. For
+ // the preceding scanlines, the padding is not needed/wanted because the
+ // following addresses will already be valid (they are the initial bytes of
+ // the next scanline) and will be overwritten when jpeglib writes out that
+ // next scanline.
+ int databuf_stride = GetComponentStride(i);
+ int databuf_size = scanlines_size * databuf_stride;
+ if (databuf_strides_[i] != databuf_stride) {
+ if (databuf_[i]) {
+ delete databuf_[i];
+ }
+ databuf_[i] = new uint8[databuf_size];
+ databuf_strides_[i] = databuf_stride;
+ }
+
+ if (GetComponentStride(i) != GetComponentWidth(i)) {
+ has_scanline_padding_ = LIBYUV_TRUE;
+ }
+ }
+ return LIBYUV_TRUE;
+}
+
+static int DivideAndRoundUp(int numerator, int denominator) {
+ return (numerator + denominator - 1) / denominator;
+}
+
+static int DivideAndRoundDown(int numerator, int denominator) {
+ return numerator / denominator;
+}
+
+// Returns width of the last loaded frame.
+int MJpegDecoder::GetWidth() {
+ return decompress_struct_->image_width;
+}
+
+// Returns height of the last loaded frame.
+int MJpegDecoder::GetHeight() {
+ return decompress_struct_->image_height;
+}
+
+// Returns format of the last loaded frame. The return value is one of the
+// kColorSpace* constants.
+int MJpegDecoder::GetColorSpace() {
+ return decompress_struct_->jpeg_color_space;
+}
+
+// Number of color components in the color space.
+int MJpegDecoder::GetNumComponents() {
+ return decompress_struct_->num_components;
+}
+
+// Sample factors of the n-th component.
+int MJpegDecoder::GetHorizSampFactor(int component) {
+ return decompress_struct_->comp_info[component].h_samp_factor;
+}
+
+int MJpegDecoder::GetVertSampFactor(int component) {
+ return decompress_struct_->comp_info[component].v_samp_factor;
+}
+
+int MJpegDecoder::GetHorizSubSampFactor(int component) {
+ return decompress_struct_->max_h_samp_factor /
+ GetHorizSampFactor(component);
+}
+
+int MJpegDecoder::GetVertSubSampFactor(int component) {
+ return decompress_struct_->max_v_samp_factor /
+ GetVertSampFactor(component);
+}
+
+int MJpegDecoder::GetImageScanlinesPerImcuRow() {
+ return decompress_struct_->max_v_samp_factor * DCTSIZE;
+}
+
+int MJpegDecoder::GetComponentScanlinesPerImcuRow(int component) {
+ int vs = GetVertSubSampFactor(component);
+ return DivideAndRoundUp(GetImageScanlinesPerImcuRow(), vs);
+}
+
+int MJpegDecoder::GetComponentWidth(int component) {
+ int hs = GetHorizSubSampFactor(component);
+ return DivideAndRoundUp(GetWidth(), hs);
+}
+
+int MJpegDecoder::GetComponentHeight(int component) {
+ int vs = GetVertSubSampFactor(component);
+ return DivideAndRoundUp(GetHeight(), vs);
+}
+
+// Get width in bytes padded out to a multiple of DCTSIZE
+int MJpegDecoder::GetComponentStride(int component) {
+ return (GetComponentWidth(component) + DCTSIZE - 1) & ~(DCTSIZE - 1);
+}
+
+int MJpegDecoder::GetComponentSize(int component) {
+ return GetComponentWidth(component) * GetComponentHeight(component);
+}
+
+LIBYUV_BOOL MJpegDecoder::UnloadFrame() {
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called jpeg_abort_decompress, it experienced an error, and we called
+ // longjmp() and rewound the stack to here. Return error.
+ return LIBYUV_FALSE;
+ }
+#endif
+ jpeg_abort_decompress(decompress_struct_);
+ return LIBYUV_TRUE;
+}
+
+// TODO(fbarchard): Allow rectangle to be specified: x, y, width, height.
+LIBYUV_BOOL MJpegDecoder::DecodeToBuffers(
+ uint8** planes, int dst_width, int dst_height) {
+ if (dst_width != GetWidth() ||
+ dst_height > GetHeight()) {
+ // ERROR: Bad dimensions
+ return LIBYUV_FALSE;
+ }
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called into jpeglib, it experienced an error sometime during this
+ // function call, and we called longjmp() and rewound the stack to here.
+ // Return error.
+ return LIBYUV_FALSE;
+ }
+#endif
+ if (!StartDecode()) {
+ return LIBYUV_FALSE;
+ }
+ SetScanlinePointers(databuf_);
+ int lines_left = dst_height;
+ // Compute amount of lines to skip to implement vertical crop.
+ // TODO(fbarchard): Ensure skip is a multiple of maximum component
+ // subsample. ie 2
+ int skip = (GetHeight() - dst_height) / 2;
+ if (skip > 0) {
+ // There is no API to skip lines in the output data, so we read them
+ // into the temp buffer.
+ while (skip >= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ skip -= GetImageScanlinesPerImcuRow();
+ }
+ if (skip > 0) {
+ // Have a partial iMCU row left over to skip. Must read it and then
+ // copy the parts we want into the destination.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ // TODO(fbarchard): Compute skip to avoid this
+ assert(skip % GetVertSubSampFactor(i) == 0);
+ int rows_to_skip =
+ DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i) -
+ rows_to_skip;
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ CopyPlane(databuf_[i] + data_to_skip, GetComponentStride(i),
+ planes[i], GetComponentWidth(i),
+ GetComponentWidth(i), scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ lines_left -= (GetImageScanlinesPerImcuRow() - skip);
+ }
+ }
+
+ // Read full MCUs but cropped horizontally
+ for (; lines_left > GetImageScanlinesPerImcuRow();
+ lines_left -= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int scanlines_to_copy = GetComponentScanlinesPerImcuRow(i);
+ CopyPlane(databuf_[i], GetComponentStride(i),
+ planes[i], GetComponentWidth(i),
+ GetComponentWidth(i), scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ }
+
+ if (lines_left > 0) {
+ // Have a partial iMCU row left over to decode.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int scanlines_to_copy =
+ DivideAndRoundUp(lines_left, GetVertSubSampFactor(i));
+ CopyPlane(databuf_[i], GetComponentStride(i),
+ planes[i], GetComponentWidth(i),
+ GetComponentWidth(i), scanlines_to_copy);
+ planes[i] += scanlines_to_copy * GetComponentWidth(i);
+ }
+ }
+ return FinishDecode();
+}
+
+LIBYUV_BOOL MJpegDecoder::DecodeToCallback(CallbackFunction fn, void* opaque,
+ int dst_width, int dst_height) {
+ if (dst_width != GetWidth() ||
+ dst_height > GetHeight()) {
+ // ERROR: Bad dimensions
+ return LIBYUV_FALSE;
+ }
+#ifdef HAVE_SETJMP
+ if (setjmp(error_mgr_->setjmp_buffer)) {
+ // We called into jpeglib, it experienced an error sometime during this
+ // function call, and we called longjmp() and rewound the stack to here.
+ // Return error.
+ return LIBYUV_FALSE;
+ }
+#endif
+ if (!StartDecode()) {
+ return LIBYUV_FALSE;
+ }
+ SetScanlinePointers(databuf_);
+ int lines_left = dst_height;
+ // TODO(fbarchard): Compute amount of lines to skip to implement vertical crop
+ int skip = (GetHeight() - dst_height) / 2;
+ if (skip > 0) {
+ while (skip >= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ skip -= GetImageScanlinesPerImcuRow();
+ }
+ if (skip > 0) {
+ // Have a partial iMCU row left over to skip.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ for (int i = 0; i < num_outbufs_; ++i) {
+ // TODO(fbarchard): Compute skip to avoid this
+ assert(skip % GetVertSubSampFactor(i) == 0);
+ int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ // Change our own data buffer pointers so we can pass them to the
+ // callback.
+ databuf_[i] += data_to_skip;
+ }
+ int scanlines_to_copy = GetImageScanlinesPerImcuRow() - skip;
+ (*fn)(opaque, databuf_, databuf_strides_, scanlines_to_copy);
+ // Now change them back.
+ for (int i = 0; i < num_outbufs_; ++i) {
+ int rows_to_skip = DivideAndRoundDown(skip, GetVertSubSampFactor(i));
+ int data_to_skip = rows_to_skip * GetComponentStride(i);
+ databuf_[i] -= data_to_skip;
+ }
+ lines_left -= scanlines_to_copy;
+ }
+ }
+ // Read full MCUs until we get to the crop point.
+ for (; lines_left >= GetImageScanlinesPerImcuRow();
+ lines_left -= GetImageScanlinesPerImcuRow()) {
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ (*fn)(opaque, databuf_, databuf_strides_, GetImageScanlinesPerImcuRow());
+ }
+ if (lines_left > 0) {
+ // Have a partial iMCU row left over to decode.
+ if (!DecodeImcuRow()) {
+ FinishDecode();
+ return LIBYUV_FALSE;
+ }
+ (*fn)(opaque, databuf_, databuf_strides_, lines_left);
+ }
+ return FinishDecode();
+}
+
+void init_source(j_decompress_ptr cinfo) {
+ fill_input_buffer(cinfo);
+}
+
+boolean fill_input_buffer(j_decompress_ptr cinfo) {
+ BufferVector* buf_vec = (BufferVector*)(cinfo->client_data);
+ if (buf_vec->pos >= buf_vec->len) {
+ assert(0 && "No more data");
+ // ERROR: No more data
+ return FALSE;
+ }
+ cinfo->src->next_input_byte = buf_vec->buffers[buf_vec->pos].data;
+ cinfo->src->bytes_in_buffer = buf_vec->buffers[buf_vec->pos].len;
+ ++buf_vec->pos;
+ return TRUE;
+}
+
+void skip_input_data(j_decompress_ptr cinfo,
+ long num_bytes) { // NOLINT
+ cinfo->src->next_input_byte += num_bytes;
+}
+
+void term_source(j_decompress_ptr cinfo) {
+ // Nothing to do.
+}
+
+#ifdef HAVE_SETJMP
+void ErrorHandler(j_common_ptr cinfo) {
+ // This is called when a jpeglib command experiences an error. Unfortunately
+ // jpeglib's error handling model is not very flexible, because it expects the
+ // error handler to not return--i.e., it wants the program to terminate. To
+ // recover from errors we use setjmp() as shown in their example. setjmp() is
+ // C's implementation for the "call with current continuation" functionality
+ // seen in some functional programming languages.
+ // A formatted message can be output, but is unsafe for release.
+#ifdef DEBUG
+ char buf[JMSG_LENGTH_MAX];
+ (*cinfo->err->format_message)(cinfo, buf);
+ // ERROR: Error in jpeglib: buf
+#endif
+
+ SetJmpErrorMgr* mgr = (SetJmpErrorMgr*)(cinfo->err);
+ // This rewinds the call stack to the point of the corresponding setjmp()
+ // and causes it to return (for a second time) with value 1.
+ longjmp(mgr->setjmp_buffer, 1);
+}
+#endif
+
+void MJpegDecoder::AllocOutputBuffers(int num_outbufs) {
+ if (num_outbufs != num_outbufs_) {
+ // We could perhaps optimize this case to resize the output buffers without
+ // necessarily having to delete and recreate each one, but it's not worth
+ // it.
+ DestroyOutputBuffers();
+
+ scanlines_ = new uint8** [num_outbufs];
+ scanlines_sizes_ = new int[num_outbufs];
+ databuf_ = new uint8* [num_outbufs];
+ databuf_strides_ = new int[num_outbufs];
+
+ for (int i = 0; i < num_outbufs; ++i) {
+ scanlines_[i] = NULL;
+ scanlines_sizes_[i] = 0;
+ databuf_[i] = NULL;
+ databuf_strides_[i] = 0;
+ }
+
+ num_outbufs_ = num_outbufs;
+ }
+}
+
+void MJpegDecoder::DestroyOutputBuffers() {
+ for (int i = 0; i < num_outbufs_; ++i) {
+ delete [] scanlines_[i];
+ delete [] databuf_[i];
+ }
+ delete [] scanlines_;
+ delete [] databuf_;
+ delete [] scanlines_sizes_;
+ delete [] databuf_strides_;
+ scanlines_ = NULL;
+ databuf_ = NULL;
+ scanlines_sizes_ = NULL;
+ databuf_strides_ = NULL;
+ num_outbufs_ = 0;
+}
+
+// JDCT_IFAST and do_block_smoothing improve performance substantially.
+LIBYUV_BOOL MJpegDecoder::StartDecode() {
+ decompress_struct_->raw_data_out = TRUE;
+ decompress_struct_->dct_method = JDCT_IFAST; // JDCT_ISLOW is default
+ decompress_struct_->dither_mode = JDITHER_NONE;
+ // Not applicable to 'raw':
+ decompress_struct_->do_fancy_upsampling = (boolean)(LIBYUV_FALSE);
+ // Only for buffered mode:
+ decompress_struct_->enable_2pass_quant = (boolean)(LIBYUV_FALSE);
+ // Blocky but fast:
+ decompress_struct_->do_block_smoothing = (boolean)(LIBYUV_FALSE);
+
+ if (!jpeg_start_decompress(decompress_struct_)) {
+ // ERROR: Couldn't start JPEG decompressor";
+ return LIBYUV_FALSE;
+ }
+ return LIBYUV_TRUE;
+}
+
+LIBYUV_BOOL MJpegDecoder::FinishDecode() {
+ // jpeglib considers it an error if we finish without decoding the whole
+ // image, so we call "abort" rather than "finish".
+ jpeg_abort_decompress(decompress_struct_);
+ return LIBYUV_TRUE;
+}
+
+void MJpegDecoder::SetScanlinePointers(uint8** data) {
+ for (int i = 0; i < num_outbufs_; ++i) {
+ uint8* data_i = data[i];
+ for (int j = 0; j < scanlines_sizes_[i]; ++j) {
+ scanlines_[i][j] = data_i;
+ data_i += GetComponentStride(i);
+ }
+ }
+}
+
+inline LIBYUV_BOOL MJpegDecoder::DecodeImcuRow() {
+ return (unsigned int)(GetImageScanlinesPerImcuRow()) ==
+ jpeg_read_raw_data(decompress_struct_,
+ scanlines_,
+ GetImageScanlinesPerImcuRow());
+}
+
+// The helper function which recognizes the jpeg sub-sampling type.
+JpegSubsamplingType MJpegDecoder::JpegSubsamplingTypeHelper(
+ int* subsample_x, int* subsample_y, int number_of_components) {
+ if (number_of_components == 3) { // Color images.
+ if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+ subsample_x[1] == 2 && subsample_y[1] == 2 &&
+ subsample_x[2] == 2 && subsample_y[2] == 2) {
+ return kJpegYuv420;
+ } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+ subsample_x[1] == 2 && subsample_y[1] == 1 &&
+ subsample_x[2] == 2 && subsample_y[2] == 1) {
+ return kJpegYuv422;
+ } else if (subsample_x[0] == 1 && subsample_y[0] == 1 &&
+ subsample_x[1] == 1 && subsample_y[1] == 1 &&
+ subsample_x[2] == 1 && subsample_y[2] == 1) {
+ return kJpegYuv444;
+ }
+ } else if (number_of_components == 1) { // Grey-scale images.
+ if (subsample_x[0] == 1 && subsample_y[0] == 1) {
+ return kJpegYuv400;
+ }
+ }
+ return kJpegUnknown;
+}
+
+} // namespace libyuv
+#endif // HAVE_JPEG
+
diff --git a/third_party/libyuv/source/mjpeg_validate.cc b/third_party/libyuv/source/mjpeg_validate.cc
new file mode 100644
index 000000000..23d22d099
--- /dev/null
+++ b/third_party/libyuv/source/mjpeg_validate.cc
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/mjpeg_decoder.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// Helper function to validate the jpeg appears intact.
+// TODO(fbarchard): Optimize case where SOI is found but EOI is not.
+LIBYUV_BOOL ValidateJpeg(const uint8* sample, size_t sample_size) {
+ size_t i;
+ if (sample_size < 64) {
+ // ERROR: Invalid jpeg size: sample_size
+ return LIBYUV_FALSE;
+ }
+ if (sample[0] != 0xff || sample[1] != 0xd8) { // Start Of Image
+ // ERROR: Invalid jpeg initial start code
+ return LIBYUV_FALSE;
+ }
+ for (i = sample_size - 2; i > 1;) {
+ if (sample[i] != 0xd9) {
+ if (sample[i] == 0xff && sample[i + 1] == 0xd9) { // End Of Image
+ return LIBYUV_TRUE; // Success: Valid jpeg.
+ }
+ --i;
+ }
+ --i;
+ }
+ // ERROR: Invalid jpeg end code not found. Size sample_size
+ return LIBYUV_FALSE;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
diff --git a/third_party/libyuv/source/planar_functions.cc b/third_party/libyuv/source/planar_functions.cc
index 68b8f46e4..3857008ca 100644
--- a/third_party/libyuv/source/planar_functions.cc
+++ b/third_party/libyuv/source/planar_functions.cc
@@ -8,15 +8,15 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "third_party/libyuv/include/libyuv/planar_functions.h"
+#include "libyuv/planar_functions.h"
#include <string.h> // for memset()
-#include "third_party/libyuv/include/libyuv/cpu_id.h"
+#include "libyuv/cpu_id.h"
#ifdef HAVE_JPEG
-#include "third_party/libyuv/include/libyuv/mjpeg_decoder.h"
+#include "libyuv/mjpeg_decoder.h"
#endif
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
@@ -37,6 +37,10 @@ void CopyPlane(const uint8* src_y, int src_stride_y,
height = 1;
src_stride_y = dst_stride_y = 0;
}
+ // Nothing to do.
+ if (src_y == dst_y && src_stride_y == dst_stride_y) {
+ return;
+ }
#if defined(HAS_COPYROW_X86)
if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
CopyRow = CopyRow_X86;
diff --git a/third_party/libyuv/source/rotate.cc b/third_party/libyuv/source/rotate.cc
new file mode 100644
index 000000000..2ef3228cb
--- /dev/null
+++ b/third_party/libyuv/source/rotate.cc
@@ -0,0 +1,1301 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || defined(__x86_64__) || defined(__i386__))
+#if defined(__APPLE__) && defined(__i386__)
+#define DECLARE_FUNCTION(name) \
+ ".text \n" \
+ ".private_extern _" #name " \n" \
+ ".align 4,0x90 \n" \
+"_" #name ": \n"
+#elif defined(__MINGW32__) || defined(__CYGWIN__) && defined(__i386__)
+#define DECLARE_FUNCTION(name) \
+ ".text \n" \
+ ".align 4,0x90 \n" \
+"_" #name ": \n"
+#else
+#define DECLARE_FUNCTION(name) \
+ ".text \n" \
+ ".align 4,0x90 \n" \
+#name ": \n"
+#endif
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_MIRRORROW_NEON
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width);
+#define HAS_MIRRORROW_UV_NEON
+void MirrorUVRow_NEON(const uint8* src, uint8* dst_a, uint8* dst_b, int width);
+#define HAS_TRANSPOSE_WX8_NEON
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width);
+#define HAS_TRANSPOSE_UVWX8_NEON
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width);
+#endif // defined(__ARM_NEON__)
+
+#if !defined(LIBYUV_DISABLE_MIPS) && !defined(__native_client__) && \
+ defined(__mips__) && \
+ defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+#define HAS_TRANSPOSE_WX8_MIPS_DSPR2
+void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width);
+
+void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width);
+#define HAS_TRANSPOSE_UVWx8_MIPS_DSPR2
+void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width);
+#endif // defined(__mips__)
+
+#if !defined(LIBYUV_DISABLE_X86) && \
+ defined(_M_IX86) && defined(_MSC_VER)
+#define HAS_TRANSPOSE_WX8_SSSE3
+__declspec(naked) __declspec(align(16))
+static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ __asm {
+ push edi
+ push esi
+ push ebp
+ mov eax, [esp + 12 + 4] // src
+ mov edi, [esp + 12 + 8] // src_stride
+ mov edx, [esp + 12 + 12] // dst
+ mov esi, [esp + 12 + 16] // dst_stride
+ mov ecx, [esp + 12 + 20] // width
+
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ align 4
+ convertloop:
+ movq xmm0, qword ptr [eax]
+ lea ebp, [eax + 8]
+ movq xmm1, qword ptr [eax + edi]
+ lea eax, [eax + 2 * edi]
+ punpcklbw xmm0, xmm1
+ movq xmm2, qword ptr [eax]
+ movdqa xmm1, xmm0
+ palignr xmm1, xmm1, 8
+ movq xmm3, qword ptr [eax + edi]
+ lea eax, [eax + 2 * edi]
+ punpcklbw xmm2, xmm3
+ movdqa xmm3, xmm2
+ movq xmm4, qword ptr [eax]
+ palignr xmm3, xmm3, 8
+ movq xmm5, qword ptr [eax + edi]
+ punpcklbw xmm4, xmm5
+ lea eax, [eax + 2 * edi]
+ movdqa xmm5, xmm4
+ movq xmm6, qword ptr [eax]
+ palignr xmm5, xmm5, 8
+ movq xmm7, qword ptr [eax + edi]
+ punpcklbw xmm6, xmm7
+ mov eax, ebp
+ movdqa xmm7, xmm6
+ palignr xmm7, xmm7, 8
+ // Second round of bit swap.
+ punpcklwd xmm0, xmm2
+ punpcklwd xmm1, xmm3
+ movdqa xmm2, xmm0
+ movdqa xmm3, xmm1
+ palignr xmm2, xmm2, 8
+ palignr xmm3, xmm3, 8
+ punpcklwd xmm4, xmm6
+ punpcklwd xmm5, xmm7
+ movdqa xmm6, xmm4
+ movdqa xmm7, xmm5
+ palignr xmm6, xmm6, 8
+ palignr xmm7, xmm7, 8
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ punpckldq xmm0, xmm4
+ movq qword ptr [edx], xmm0
+ movdqa xmm4, xmm0
+ palignr xmm4, xmm4, 8
+ movq qword ptr [edx + esi], xmm4
+ lea edx, [edx + 2 * esi]
+ punpckldq xmm2, xmm6
+ movdqa xmm6, xmm2
+ palignr xmm6, xmm6, 8
+ movq qword ptr [edx], xmm2
+ punpckldq xmm1, xmm5
+ movq qword ptr [edx + esi], xmm6
+ lea edx, [edx + 2 * esi]
+ movdqa xmm5, xmm1
+ movq qword ptr [edx], xmm1
+ palignr xmm5, xmm5, 8
+ punpckldq xmm3, xmm7
+ movq qword ptr [edx + esi], xmm5
+ lea edx, [edx + 2 * esi]
+ movq qword ptr [edx], xmm3
+ movdqa xmm7, xmm3
+ palignr xmm7, xmm7, 8
+ sub ecx, 8
+ movq qword ptr [edx + esi], xmm7
+ lea edx, [edx + 2 * esi]
+ jg convertloop
+
+ pop ebp
+ pop esi
+ pop edi
+ ret
+ }
+}
+
+#define HAS_TRANSPOSE_UVWX8_SSE2
+__declspec(naked) __declspec(align(16))
+static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int w) {
+ __asm {
+ push ebx
+ push esi
+ push edi
+ push ebp
+ mov eax, [esp + 16 + 4] // src
+ mov edi, [esp + 16 + 8] // src_stride
+ mov edx, [esp + 16 + 12] // dst_a
+ mov esi, [esp + 16 + 16] // dst_stride_a
+ mov ebx, [esp + 16 + 20] // dst_b
+ mov ebp, [esp + 16 + 24] // dst_stride_b
+ mov ecx, esp
+ sub esp, 4 + 16
+ and esp, ~15
+ mov [esp + 16], ecx
+ mov ecx, [ecx + 16 + 28] // w
+
+ align 4
+ convertloop:
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ movdqa xmm0, [eax]
+ movdqa xmm1, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqa xmm7, xmm0 // use xmm7 as temp register.
+ punpcklbw xmm0, xmm1
+ punpckhbw xmm7, xmm1
+ movdqa xmm1, xmm7
+ movdqa xmm2, [eax]
+ movdqa xmm3, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqa xmm7, xmm2
+ punpcklbw xmm2, xmm3
+ punpckhbw xmm7, xmm3
+ movdqa xmm3, xmm7
+ movdqa xmm4, [eax]
+ movdqa xmm5, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqa xmm7, xmm4
+ punpcklbw xmm4, xmm5
+ punpckhbw xmm7, xmm5
+ movdqa xmm5, xmm7
+ movdqa xmm6, [eax]
+ movdqa xmm7, [eax + edi]
+ lea eax, [eax + 2 * edi]
+ movdqa [esp], xmm5 // backup xmm5
+ neg edi
+ movdqa xmm5, xmm6 // use xmm5 as temp register.
+ punpcklbw xmm6, xmm7
+ punpckhbw xmm5, xmm7
+ movdqa xmm7, xmm5
+ lea eax, [eax + 8 * edi + 16]
+ neg edi
+ // Second round of bit swap.
+ movdqa xmm5, xmm0
+ punpcklwd xmm0, xmm2
+ punpckhwd xmm5, xmm2
+ movdqa xmm2, xmm5
+ movdqa xmm5, xmm1
+ punpcklwd xmm1, xmm3
+ punpckhwd xmm5, xmm3
+ movdqa xmm3, xmm5
+ movdqa xmm5, xmm4
+ punpcklwd xmm4, xmm6
+ punpckhwd xmm5, xmm6
+ movdqa xmm6, xmm5
+ movdqa xmm5, [esp] // restore xmm5
+ movdqa [esp], xmm6 // backup xmm6
+ movdqa xmm6, xmm5 // use xmm6 as temp register.
+ punpcklwd xmm5, xmm7
+ punpckhwd xmm6, xmm7
+ movdqa xmm7, xmm6
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ movdqa xmm6, xmm0
+ punpckldq xmm0, xmm4
+ punpckhdq xmm6, xmm4
+ movdqa xmm4, xmm6
+ movdqa xmm6, [esp] // restore xmm6
+ movlpd qword ptr [edx], xmm0
+ movhpd qword ptr [ebx], xmm0
+ movlpd qword ptr [edx + esi], xmm4
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm4
+ lea ebx, [ebx + 2 * ebp]
+ movdqa xmm0, xmm2 // use xmm0 as the temp register.
+ punpckldq xmm2, xmm6
+ movlpd qword ptr [edx], xmm2
+ movhpd qword ptr [ebx], xmm2
+ punpckhdq xmm0, xmm6
+ movlpd qword ptr [edx + esi], xmm0
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm0
+ lea ebx, [ebx + 2 * ebp]
+ movdqa xmm0, xmm1 // use xmm0 as the temp register.
+ punpckldq xmm1, xmm5
+ movlpd qword ptr [edx], xmm1
+ movhpd qword ptr [ebx], xmm1
+ punpckhdq xmm0, xmm5
+ movlpd qword ptr [edx + esi], xmm0
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm0
+ lea ebx, [ebx + 2 * ebp]
+ movdqa xmm0, xmm3 // use xmm0 as the temp register.
+ punpckldq xmm3, xmm7
+ movlpd qword ptr [edx], xmm3
+ movhpd qword ptr [ebx], xmm3
+ punpckhdq xmm0, xmm7
+ sub ecx, 8
+ movlpd qword ptr [edx + esi], xmm0
+ lea edx, [edx + 2 * esi]
+ movhpd qword ptr [ebx + ebp], xmm0
+ lea ebx, [ebx + 2 * ebp]
+ jg convertloop
+
+ mov esp, [esp + 16]
+ pop ebp
+ pop edi
+ pop esi
+ pop ebx
+ ret
+ }
+}
+#elif !defined(LIBYUV_DISABLE_X86) && \
+ (defined(__i386__) || (defined(__x86_64__) && !defined(__native_client__)))
+#define HAS_TRANSPOSE_WX8_SSSE3
+static void TransposeWx8_SSSE3(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ asm volatile (
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ ".p2align 2 \n"
+ "1: \n"
+ "movq (%0),%%xmm0 \n"
+ "movq (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "movq (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "movq (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movq (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "movq (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movq (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "lea 0x8(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "sub $0x8,%2 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc"
+ #if defined(__SSE2__)
+ , "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7"
+ #endif
+ );
+}
+
+#if !defined(LIBYUV_DISABLE_X86) && defined(__i386__)
+#define HAS_TRANSPOSE_UVWX8_SSE2
+void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int w);
+ asm (
+ DECLARE_FUNCTION(TransposeUVWx8_SSE2)
+ "push %ebx \n"
+ "push %esi \n"
+ "push %edi \n"
+ "push %ebp \n"
+ "mov 0x14(%esp),%eax \n"
+ "mov 0x18(%esp),%edi \n"
+ "mov 0x1c(%esp),%edx \n"
+ "mov 0x20(%esp),%esi \n"
+ "mov 0x24(%esp),%ebx \n"
+ "mov 0x28(%esp),%ebp \n"
+ "mov %esp,%ecx \n"
+ "sub $0x14,%esp \n"
+ "and $0xfffffff0,%esp \n"
+ "mov %ecx,0x10(%esp) \n"
+ "mov 0x2c(%ecx),%ecx \n"
+
+"1: \n"
+ "movdqa (%eax),%xmm0 \n"
+ "movdqa (%eax,%edi,1),%xmm1 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm0,%xmm7 \n"
+ "punpcklbw %xmm1,%xmm0 \n"
+ "punpckhbw %xmm1,%xmm7 \n"
+ "movdqa %xmm7,%xmm1 \n"
+ "movdqa (%eax),%xmm2 \n"
+ "movdqa (%eax,%edi,1),%xmm3 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm2,%xmm7 \n"
+ "punpcklbw %xmm3,%xmm2 \n"
+ "punpckhbw %xmm3,%xmm7 \n"
+ "movdqa %xmm7,%xmm3 \n"
+ "movdqa (%eax),%xmm4 \n"
+ "movdqa (%eax,%edi,1),%xmm5 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm4,%xmm7 \n"
+ "punpcklbw %xmm5,%xmm4 \n"
+ "punpckhbw %xmm5,%xmm7 \n"
+ "movdqa %xmm7,%xmm5 \n"
+ "movdqa (%eax),%xmm6 \n"
+ "movdqa (%eax,%edi,1),%xmm7 \n"
+ "lea (%eax,%edi,2),%eax \n"
+ "movdqa %xmm5,(%esp) \n"
+ "neg %edi \n"
+ "movdqa %xmm6,%xmm5 \n"
+ "punpcklbw %xmm7,%xmm6 \n"
+ "punpckhbw %xmm7,%xmm5 \n"
+ "movdqa %xmm5,%xmm7 \n"
+ "lea 0x10(%eax,%edi,8),%eax \n"
+ "neg %edi \n"
+ "movdqa %xmm0,%xmm5 \n"
+ "punpcklwd %xmm2,%xmm0 \n"
+ "punpckhwd %xmm2,%xmm5 \n"
+ "movdqa %xmm5,%xmm2 \n"
+ "movdqa %xmm1,%xmm5 \n"
+ "punpcklwd %xmm3,%xmm1 \n"
+ "punpckhwd %xmm3,%xmm5 \n"
+ "movdqa %xmm5,%xmm3 \n"
+ "movdqa %xmm4,%xmm5 \n"
+ "punpcklwd %xmm6,%xmm4 \n"
+ "punpckhwd %xmm6,%xmm5 \n"
+ "movdqa %xmm5,%xmm6 \n"
+ "movdqa (%esp),%xmm5 \n"
+ "movdqa %xmm6,(%esp) \n"
+ "movdqa %xmm5,%xmm6 \n"
+ "punpcklwd %xmm7,%xmm5 \n"
+ "punpckhwd %xmm7,%xmm6 \n"
+ "movdqa %xmm6,%xmm7 \n"
+ "movdqa %xmm0,%xmm6 \n"
+ "punpckldq %xmm4,%xmm0 \n"
+ "punpckhdq %xmm4,%xmm6 \n"
+ "movdqa %xmm6,%xmm4 \n"
+ "movdqa (%esp),%xmm6 \n"
+ "movlpd %xmm0,(%edx) \n"
+ "movhpd %xmm0,(%ebx) \n"
+ "movlpd %xmm4,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm4,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "movdqa %xmm2,%xmm0 \n"
+ "punpckldq %xmm6,%xmm2 \n"
+ "movlpd %xmm2,(%edx) \n"
+ "movhpd %xmm2,(%ebx) \n"
+ "punpckhdq %xmm6,%xmm0 \n"
+ "movlpd %xmm0,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm0,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "movdqa %xmm1,%xmm0 \n"
+ "punpckldq %xmm5,%xmm1 \n"
+ "movlpd %xmm1,(%edx) \n"
+ "movhpd %xmm1,(%ebx) \n"
+ "punpckhdq %xmm5,%xmm0 \n"
+ "movlpd %xmm0,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm0,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "movdqa %xmm3,%xmm0 \n"
+ "punpckldq %xmm7,%xmm3 \n"
+ "movlpd %xmm3,(%edx) \n"
+ "movhpd %xmm3,(%ebx) \n"
+ "punpckhdq %xmm7,%xmm0 \n"
+ "sub $0x8,%ecx \n"
+ "movlpd %xmm0,(%edx,%esi,1) \n"
+ "lea (%edx,%esi,2),%edx \n"
+ "movhpd %xmm0,(%ebx,%ebp,1) \n"
+ "lea (%ebx,%ebp,2),%ebx \n"
+ "jg 1b \n"
+ "mov 0x10(%esp),%esp \n"
+ "pop %ebp \n"
+ "pop %edi \n"
+ "pop %esi \n"
+ "pop %ebx \n"
+#if defined(__native_client__)
+ "pop %ecx \n"
+ "and $0xffffffe0,%ecx \n"
+ "jmp *%ecx \n"
+#else
+ "ret \n"
+#endif
+);
+#elif !defined(LIBYUV_DISABLE_X86) && !defined(__native_client__) && \
+ defined(__x86_64__)
+// 64 bit version has enough registers to do 16x8 to 8x16 at a time.
+#define HAS_TRANSPOSE_WX8_FAST_SSSE3
+static void TransposeWx8_FAST_SSSE3(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride, int width) {
+ asm volatile (
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ ".p2align 2 \n"
+"1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa (%0,%3),%%xmm1 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "movdqa %%xmm0,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm9 \n"
+ "palignr $0x8,%%xmm1,%%xmm1 \n"
+ "palignr $0x8,%%xmm9,%%xmm9 \n"
+ "movdqa (%0,%3),%%xmm3 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm2,%%xmm10 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm10 \n"
+ "movdqa %%xmm2,%%xmm3 \n"
+ "movdqa %%xmm10,%%xmm11 \n"
+ "movdqa (%0),%%xmm4 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "movdqa (%0,%3),%%xmm5 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm4,%%xmm12 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm12 \n"
+ "movdqa %%xmm4,%%xmm5 \n"
+ "movdqa %%xmm12,%%xmm13 \n"
+ "movdqa (%0),%%xmm6 \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movdqa (%0,%3),%%xmm7 \n"
+ "lea (%0,%3,2),%0 \n"
+ "movdqa %%xmm6,%%xmm14 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "punpckhbw %%xmm7,%%xmm14 \n"
+ "neg %3 \n"
+ "movdqa %%xmm6,%%xmm7 \n"
+ "movdqa %%xmm14,%%xmm15 \n"
+ "lea 0x10(%0,%3,8),%0 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "neg %3 \n"
+ // Second round of bit swap.
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm0,%%xmm2 \n"
+ "movdqa %%xmm1,%%xmm3 \n"
+ "palignr $0x8,%%xmm2,%%xmm2 \n"
+ "palignr $0x8,%%xmm3,%%xmm3 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm4,%%xmm6 \n"
+ "movdqa %%xmm5,%%xmm7 \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "punpcklwd %%xmm10,%%xmm8 \n"
+ "punpcklwd %%xmm11,%%xmm9 \n"
+ "movdqa %%xmm8,%%xmm10 \n"
+ "movdqa %%xmm9,%%xmm11 \n"
+ "palignr $0x8,%%xmm10,%%xmm10 \n"
+ "palignr $0x8,%%xmm11,%%xmm11 \n"
+ "punpcklwd %%xmm14,%%xmm12 \n"
+ "punpcklwd %%xmm15,%%xmm13 \n"
+ "movdqa %%xmm12,%%xmm14 \n"
+ "movdqa %%xmm13,%%xmm15 \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movq %%xmm0,(%1) \n"
+ "movdqa %%xmm0,%%xmm4 \n"
+ "palignr $0x8,%%xmm4,%%xmm4 \n"
+ "movq %%xmm4,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movdqa %%xmm2,%%xmm6 \n"
+ "movq %%xmm2,(%1) \n"
+ "palignr $0x8,%%xmm6,%%xmm6 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movq %%xmm6,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm1,%%xmm5 \n"
+ "movq %%xmm1,(%1) \n"
+ "palignr $0x8,%%xmm5,%%xmm5 \n"
+ "movq %%xmm5,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movq %%xmm3,(%1) \n"
+ "movdqa %%xmm3,%%xmm7 \n"
+ "palignr $0x8,%%xmm7,%%xmm7 \n"
+ "movq %%xmm7,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm12,%%xmm8 \n"
+ "movq %%xmm8,(%1) \n"
+ "movdqa %%xmm8,%%xmm12 \n"
+ "palignr $0x8,%%xmm12,%%xmm12 \n"
+ "movq %%xmm12,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm14,%%xmm10 \n"
+ "movdqa %%xmm10,%%xmm14 \n"
+ "movq %%xmm10,(%1) \n"
+ "palignr $0x8,%%xmm14,%%xmm14 \n"
+ "punpckldq %%xmm13,%%xmm9 \n"
+ "movq %%xmm14,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "movdqa %%xmm9,%%xmm13 \n"
+ "movq %%xmm9,(%1) \n"
+ "palignr $0x8,%%xmm13,%%xmm13 \n"
+ "movq %%xmm13,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "punpckldq %%xmm15,%%xmm11 \n"
+ "movq %%xmm11,(%1) \n"
+ "movdqa %%xmm11,%%xmm15 \n"
+ "palignr $0x8,%%xmm15,%%xmm15 \n"
+ "sub $0x10,%2 \n"
+ "movq %%xmm15,(%1,%4) \n"
+ "lea (%1,%4,2),%1 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ : "r"((intptr_t)(src_stride)), // %3
+ "r"((intptr_t)(dst_stride)) // %4
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+ "xmm8", "xmm9", "xmm10", "xmm11", "xmm12", "xmm13", "xmm14", "xmm15"
+);
+}
+
+#define HAS_TRANSPOSE_UVWX8_SSE2
+static void TransposeUVWx8_SSE2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int w) {
+ asm volatile (
+ // Read in the data from the source pointer.
+ // First round of bit swap.
+ ".p2align 2 \n"
+"1: \n"
+ "movdqa (%0),%%xmm0 \n"
+ "movdqa (%0,%4),%%xmm1 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpcklbw %%xmm1,%%xmm0 \n"
+ "punpckhbw %%xmm1,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm1 \n"
+ "movdqa (%0),%%xmm2 \n"
+ "movdqa (%0,%4),%%xmm3 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpcklbw %%xmm3,%%xmm2 \n"
+ "punpckhbw %%xmm3,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm3 \n"
+ "movdqa (%0),%%xmm4 \n"
+ "movdqa (%0,%4),%%xmm5 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "punpcklbw %%xmm5,%%xmm4 \n"
+ "punpckhbw %%xmm5,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm5 \n"
+ "movdqa (%0),%%xmm6 \n"
+ "movdqa (%0,%4),%%xmm7 \n"
+ "lea (%0,%4,2),%0 \n"
+ "movdqa %%xmm6,%%xmm8 \n"
+ "punpcklbw %%xmm7,%%xmm6 \n"
+ "neg %4 \n"
+ "lea 0x10(%0,%4,8),%0 \n"
+ "punpckhbw %%xmm7,%%xmm8 \n"
+ "movdqa %%xmm8,%%xmm7 \n"
+ "neg %4 \n"
+ // Second round of bit swap.
+ "movdqa %%xmm0,%%xmm8 \n"
+ "movdqa %%xmm1,%%xmm9 \n"
+ "punpckhwd %%xmm2,%%xmm8 \n"
+ "punpckhwd %%xmm3,%%xmm9 \n"
+ "punpcklwd %%xmm2,%%xmm0 \n"
+ "punpcklwd %%xmm3,%%xmm1 \n"
+ "movdqa %%xmm8,%%xmm2 \n"
+ "movdqa %%xmm9,%%xmm3 \n"
+ "movdqa %%xmm4,%%xmm8 \n"
+ "movdqa %%xmm5,%%xmm9 \n"
+ "punpckhwd %%xmm6,%%xmm8 \n"
+ "punpckhwd %%xmm7,%%xmm9 \n"
+ "punpcklwd %%xmm6,%%xmm4 \n"
+ "punpcklwd %%xmm7,%%xmm5 \n"
+ "movdqa %%xmm8,%%xmm6 \n"
+ "movdqa %%xmm9,%%xmm7 \n"
+ // Third round of bit swap.
+ // Write to the destination pointer.
+ "movdqa %%xmm0,%%xmm8 \n"
+ "punpckldq %%xmm4,%%xmm0 \n"
+ "movlpd %%xmm0,(%1) \n" // Write back U channel
+ "movhpd %%xmm0,(%2) \n" // Write back V channel
+ "punpckhdq %%xmm4,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm2,%%xmm8 \n"
+ "punpckldq %%xmm6,%%xmm2 \n"
+ "movlpd %%xmm2,(%1) \n"
+ "movhpd %%xmm2,(%2) \n"
+ "punpckhdq %%xmm6,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm1,%%xmm8 \n"
+ "punpckldq %%xmm5,%%xmm1 \n"
+ "movlpd %%xmm1,(%1) \n"
+ "movhpd %%xmm1,(%2) \n"
+ "punpckhdq %%xmm5,%%xmm8 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "movdqa %%xmm3,%%xmm8 \n"
+ "punpckldq %%xmm7,%%xmm3 \n"
+ "movlpd %%xmm3,(%1) \n"
+ "movhpd %%xmm3,(%2) \n"
+ "punpckhdq %%xmm7,%%xmm8 \n"
+ "sub $0x8,%3 \n"
+ "movlpd %%xmm8,(%1,%5) \n"
+ "lea (%1,%5,2),%1 \n"
+ "movhpd %%xmm8,(%2,%6) \n"
+ "lea (%2,%6,2),%2 \n"
+ "jg 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst_a), // %1
+ "+r"(dst_b), // %2
+ "+r"(w) // %3
+ : "r"((intptr_t)(src_stride)), // %4
+ "r"((intptr_t)(dst_stride_a)), // %5
+ "r"((intptr_t)(dst_stride_b)) // %6
+ : "memory", "cc",
+ "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7",
+ "xmm8", "xmm9"
+);
+}
+#endif
+#endif
+
+static void TransposeWx8_C(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ dst[0] = src[0 * src_stride];
+ dst[1] = src[1 * src_stride];
+ dst[2] = src[2 * src_stride];
+ dst[3] = src[3 * src_stride];
+ dst[4] = src[4 * src_stride];
+ dst[5] = src[5 * src_stride];
+ dst[6] = src[6 * src_stride];
+ dst[7] = src[7 * src_stride];
+ ++src;
+ dst += dst_stride;
+ }
+}
+
+static void TransposeWxH_C(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ int j;
+ for (j = 0; j < height; ++j) {
+ dst[i * dst_stride + j] = src[j * src_stride + i];
+ }
+ }
+}
+
+LIBYUV_API
+void TransposePlane(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ int i = height;
+ void (*TransposeWx8)(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width) = TransposeWx8_C;
+#if defined(HAS_TRANSPOSE_WX8_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ TransposeWx8 = TransposeWx8_NEON;
+ }
+#endif
+#if defined(HAS_TRANSPOSE_WX8_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 8)) {
+ TransposeWx8 = TransposeWx8_SSSE3;
+ }
+#endif
+#if defined(HAS_TRANSPOSE_WX8_FAST_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) &&
+ IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+ TransposeWx8 = TransposeWx8_FAST_SSSE3;
+ }
+#endif
+#if defined(HAS_TRANSPOSE_WX8_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2)) {
+ if (IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+ TransposeWx8 = TransposeWx8_FAST_MIPS_DSPR2;
+ } else {
+ TransposeWx8 = TransposeWx8_MIPS_DSPR2;
+ }
+ }
+#endif
+
+ // Work across the source in 8x8 tiles
+ while (i >= 8) {
+ TransposeWx8(src, src_stride, dst, dst_stride, width);
+ src += 8 * src_stride; // Go down 8 rows.
+ dst += 8; // Move over 8 columns.
+ i -= 8;
+ }
+
+ TransposeWxH_C(src, src_stride, dst, dst_stride, width, i);
+}
+
+LIBYUV_API
+void RotatePlane90(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ // Rotate by 90 is a transpose with the source read
+ // from bottom to top. So set the source pointer to the end
+ // of the buffer and flip the sign of the source stride.
+ src += src_stride * (height - 1);
+ src_stride = -src_stride;
+ TransposePlane(src, src_stride, dst, dst_stride, width, height);
+}
+
+LIBYUV_API
+void RotatePlane270(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ // Rotate by 270 is a transpose with the destination written
+ // from bottom to top. So set the destination pointer to the end
+ // of the buffer and flip the sign of the destination stride.
+ dst += dst_stride * (width - 1);
+ dst_stride = -dst_stride;
+ TransposePlane(src, src_stride, dst, dst_stride, width, height);
+}
+
+LIBYUV_API
+void RotatePlane180(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ // Swap first and last row and mirror the content. Uses a temporary row.
+ align_buffer_64(row, width);
+ const uint8* src_bot = src + src_stride * (height - 1);
+ uint8* dst_bot = dst + dst_stride * (height - 1);
+ int half_height = (height + 1) >> 1;
+ int y;
+ void (*MirrorRow)(const uint8* src, uint8* dst, int width) = MirrorRow_C;
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_MIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 16)) {
+ MirrorRow = MirrorRow_NEON;
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ MirrorRow = MirrorRow_SSE2;
+ }
+#endif
+#if defined(HAS_MIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ MirrorRow = MirrorRow_SSSE3;
+ }
+#endif
+#if defined(HAS_MIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 32)) {
+ MirrorRow = MirrorRow_AVX2;
+ }
+#endif
+#if defined(HAS_MIRRORROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+ IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4) &&
+ IS_ALIGNED(dst, 4) && IS_ALIGNED(dst_stride, 4)) {
+ MirrorRow = MirrorRow_MIPS_DSPR2;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 32)) {
+ CopyRow = CopyRow_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_X86)
+ if (TestCpuFlag(kCpuHasX86) && IS_ALIGNED(width, 4)) {
+ CopyRow = CopyRow_X86;
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width, 32) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ CopyRow = CopyRow_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
+ }
+#endif
+
+ // Odd height will harmlessly mirror the middle row twice.
+ for (y = 0; y < half_height; ++y) {
+ MirrorRow(src, row, width); // Mirror first row into a buffer
+ src += src_stride;
+ MirrorRow(src_bot, dst, width); // Mirror last row into first row
+ dst += dst_stride;
+ CopyRow(row, dst_bot, width); // Copy first mirrored row into last
+ src_bot -= src_stride;
+ dst_bot -= dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+
+static void TransposeUVWx8_C(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) {
+ int i;
+ for (i = 0; i < width; ++i) {
+ dst_a[0] = src[0 * src_stride + 0];
+ dst_b[0] = src[0 * src_stride + 1];
+ dst_a[1] = src[1 * src_stride + 0];
+ dst_b[1] = src[1 * src_stride + 1];
+ dst_a[2] = src[2 * src_stride + 0];
+ dst_b[2] = src[2 * src_stride + 1];
+ dst_a[3] = src[3 * src_stride + 0];
+ dst_b[3] = src[3 * src_stride + 1];
+ dst_a[4] = src[4 * src_stride + 0];
+ dst_b[4] = src[4 * src_stride + 1];
+ dst_a[5] = src[5 * src_stride + 0];
+ dst_b[5] = src[5 * src_stride + 1];
+ dst_a[6] = src[6 * src_stride + 0];
+ dst_b[6] = src[6 * src_stride + 1];
+ dst_a[7] = src[7 * src_stride + 0];
+ dst_b[7] = src[7 * src_stride + 1];
+ src += 2;
+ dst_a += dst_stride_a;
+ dst_b += dst_stride_b;
+ }
+}
+
+static void TransposeUVWxH_C(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height) {
+ int i;
+ for (i = 0; i < width * 2; i += 2) {
+ int j;
+ for (j = 0; j < height; ++j) {
+ dst_a[j + ((i >> 1) * dst_stride_a)] = src[i + (j * src_stride)];
+ dst_b[j + ((i >> 1) * dst_stride_b)] = src[i + (j * src_stride) + 1];
+ }
+ }
+}
+
+LIBYUV_API
+void TransposeUV(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height) {
+ int i = height;
+ void (*TransposeUVWx8)(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) = TransposeUVWx8_C;
+#if defined(HAS_TRANSPOSE_UVWX8_NEON)
+ if (TestCpuFlag(kCpuHasNEON)) {
+ TransposeUVWx8 = TransposeUVWx8_NEON;
+ }
+#elif defined(HAS_TRANSPOSE_UVWX8_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) &&
+ IS_ALIGNED(width, 8) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+ TransposeUVWx8 = TransposeUVWx8_SSE2;
+ }
+#elif defined(HAS_TRANSPOSE_UVWx8_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(width, 2) &&
+ IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+ TransposeUVWx8 = TransposeUVWx8_MIPS_DSPR2;
+ }
+#endif
+
+ // Work through the source in 8x8 tiles.
+ while (i >= 8) {
+ TransposeUVWx8(src, src_stride,
+ dst_a, dst_stride_a,
+ dst_b, dst_stride_b,
+ width);
+ src += 8 * src_stride; // Go down 8 rows.
+ dst_a += 8; // Move over 8 columns.
+ dst_b += 8; // Move over 8 columns.
+ i -= 8;
+ }
+
+ TransposeUVWxH_C(src, src_stride,
+ dst_a, dst_stride_a,
+ dst_b, dst_stride_b,
+ width, i);
+}
+
+LIBYUV_API
+void RotateUV90(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height) {
+ src += src_stride * (height - 1);
+ src_stride = -src_stride;
+
+ TransposeUV(src, src_stride,
+ dst_a, dst_stride_a,
+ dst_b, dst_stride_b,
+ width, height);
+}
+
+LIBYUV_API
+void RotateUV270(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height) {
+ dst_a += dst_stride_a * (width - 1);
+ dst_b += dst_stride_b * (width - 1);
+ dst_stride_a = -dst_stride_a;
+ dst_stride_b = -dst_stride_b;
+
+ TransposeUV(src, src_stride,
+ dst_a, dst_stride_a,
+ dst_b, dst_stride_b,
+ width, height);
+}
+
+// Rotate 180 is a horizontal and vertical flip.
+LIBYUV_API
+void RotateUV180(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width, int height) {
+ int i;
+ void (*MirrorRowUV)(const uint8* src, uint8* dst_u, uint8* dst_v, int width) =
+ MirrorUVRow_C;
+#if defined(HAS_MIRRORUVROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 8)) {
+ MirrorRowUV = MirrorUVRow_NEON;
+ }
+#elif defined(HAS_MIRRORROW_UV_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 16) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16)) {
+ MirrorRowUV = MirrorUVRow_SSSE3;
+ }
+#elif defined(HAS_MIRRORUVROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) &&
+ IS_ALIGNED(src, 4) && IS_ALIGNED(src_stride, 4)) {
+ MirrorRowUV = MirrorUVRow_MIPS_DSPR2;
+ }
+#endif
+
+ dst_a += dst_stride_a * (height - 1);
+ dst_b += dst_stride_b * (height - 1);
+
+ for (i = 0; i < height; ++i) {
+ MirrorRowUV(src, dst_a, dst_b, width);
+ src += src_stride;
+ dst_a -= dst_stride_a;
+ dst_b -= dst_stride_b;
+ }
+}
+
+LIBYUV_API
+int RotatePlane(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height,
+ enum RotationMode mode) {
+ if (!src || width <= 0 || height == 0 || !dst) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src = src + (height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ CopyPlane(src, src_stride,
+ dst, dst_stride,
+ width, height);
+ return 0;
+ case kRotate90:
+ RotatePlane90(src, src_stride,
+ dst, dst_stride,
+ width, height);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src, src_stride,
+ dst, dst_stride,
+ width, height);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src, src_stride,
+ dst, dst_stride,
+ width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+LIBYUV_API
+int I420Rotate(const uint8* src_y, int src_stride_y,
+ const uint8* src_u, int src_stride_u,
+ const uint8* src_v, int src_stride_v,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height,
+ enum RotationMode mode) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_u || !src_v || width <= 0 || height == 0 ||
+ !dst_y || !dst_u || !dst_v) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_u = src_u + (halfheight - 1) * src_stride_u;
+ src_v = src_v + (halfheight - 1) * src_stride_v;
+ src_stride_y = -src_stride_y;
+ src_stride_u = -src_stride_u;
+ src_stride_v = -src_stride_v;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return I420Copy(src_y, src_stride_y,
+ src_u, src_stride_u,
+ src_v, src_stride_v,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height);
+ case kRotate90:
+ RotatePlane90(src_y, src_stride_y,
+ dst_y, dst_stride_y,
+ width, height);
+ RotatePlane90(src_u, src_stride_u,
+ dst_u, dst_stride_u,
+ halfwidth, halfheight);
+ RotatePlane90(src_v, src_stride_v,
+ dst_v, dst_stride_v,
+ halfwidth, halfheight);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src_y, src_stride_y,
+ dst_y, dst_stride_y,
+ width, height);
+ RotatePlane270(src_u, src_stride_u,
+ dst_u, dst_stride_u,
+ halfwidth, halfheight);
+ RotatePlane270(src_v, src_stride_v,
+ dst_v, dst_stride_v,
+ halfwidth, halfheight);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src_y, src_stride_y,
+ dst_y, dst_stride_y,
+ width, height);
+ RotatePlane180(src_u, src_stride_u,
+ dst_u, dst_stride_u,
+ halfwidth, halfheight);
+ RotatePlane180(src_v, src_stride_v,
+ dst_v, dst_stride_v,
+ halfwidth, halfheight);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+LIBYUV_API
+int NV12ToI420Rotate(const uint8* src_y, int src_stride_y,
+ const uint8* src_uv, int src_stride_uv,
+ uint8* dst_y, int dst_stride_y,
+ uint8* dst_u, int dst_stride_u,
+ uint8* dst_v, int dst_stride_v,
+ int width, int height,
+ enum RotationMode mode) {
+ int halfwidth = (width + 1) >> 1;
+ int halfheight = (height + 1) >> 1;
+ if (!src_y || !src_uv || width <= 0 || height == 0 ||
+ !dst_y || !dst_u || !dst_v) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ halfheight = (height + 1) >> 1;
+ src_y = src_y + (height - 1) * src_stride_y;
+ src_uv = src_uv + (halfheight - 1) * src_stride_uv;
+ src_stride_y = -src_stride_y;
+ src_stride_uv = -src_stride_uv;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return NV12ToI420(src_y, src_stride_y,
+ src_uv, src_stride_uv,
+ dst_y, dst_stride_y,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ width, height);
+ case kRotate90:
+ RotatePlane90(src_y, src_stride_y,
+ dst_y, dst_stride_y,
+ width, height);
+ RotateUV90(src_uv, src_stride_uv,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ halfwidth, halfheight);
+ return 0;
+ case kRotate270:
+ RotatePlane270(src_y, src_stride_y,
+ dst_y, dst_stride_y,
+ width, height);
+ RotateUV270(src_uv, src_stride_uv,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ halfwidth, halfheight);
+ return 0;
+ case kRotate180:
+ RotatePlane180(src_y, src_stride_y,
+ dst_y, dst_stride_y,
+ width, height);
+ RotateUV180(src_uv, src_stride_uv,
+ dst_u, dst_stride_u,
+ dst_v, dst_stride_v,
+ halfwidth, halfheight);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/rotate_argb.cc b/third_party/libyuv/source/rotate_argb.cc
new file mode 100644
index 000000000..ab0f9ce07
--- /dev/null
+++ b/third_party/libyuv/source/rotate_argb.cc
@@ -0,0 +1,209 @@
+/*
+ * Copyright 2012 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/rotate.h"
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/convert.h"
+#include "libyuv/planar_functions.h"
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// ARGBScale has a function to copy pixels to a row, striding each source
+// pixel by a constant.
+#if !defined(LIBYUV_DISABLE_X86) && \
+ (defined(_M_IX86) || \
+ (defined(__x86_64__) && !defined(__native_client__)) || defined(__i386__))
+#define HAS_SCALEARGBROWDOWNEVEN_SSE2
+void ScaleARGBRowDownEven_SSE2(const uint8* src_ptr, int src_stride,
+ int src_stepx,
+ uint8* dst_ptr, int dst_width);
+#endif
+#if !defined(LIBYUV_DISABLE_NEON) && !defined(__native_client__) && \
+ (defined(__ARM_NEON__) || defined(LIBYUV_NEON))
+#define HAS_SCALEARGBROWDOWNEVEN_NEON
+void ScaleARGBRowDownEven_NEON(const uint8* src_ptr, int src_stride,
+ int src_stepx,
+ uint8* dst_ptr, int dst_width);
+#endif
+
+void ScaleARGBRowDownEven_C(const uint8* src_ptr, int,
+ int src_stepx,
+ uint8* dst_ptr, int dst_width);
+
+static void ARGBTranspose(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ int i;
+ int src_pixel_step = src_stride >> 2;
+ void (*ScaleARGBRowDownEven)(const uint8* src_ptr, int src_stride,
+ int src_step, uint8* dst_ptr, int dst_width) = ScaleARGBRowDownEven_C;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(height, 4) && // Width of dest.
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_SSE2;
+ }
+#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(height, 4) && // Width of dest.
+ IS_ALIGNED(src, 4)) {
+ ScaleARGBRowDownEven = ScaleARGBRowDownEven_NEON;
+ }
+#endif
+
+ for (i = 0; i < width; ++i) { // column of source to row of dest.
+ ScaleARGBRowDownEven(src, 0, src_pixel_step, dst, height);
+ dst += dst_stride;
+ src += 4;
+ }
+}
+
+void ARGBRotate90(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ // Rotate by 90 is a ARGBTranspose with the source read
+ // from bottom to top. So set the source pointer to the end
+ // of the buffer and flip the sign of the source stride.
+ src += src_stride * (height - 1);
+ src_stride = -src_stride;
+ ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate270(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ // Rotate by 270 is a ARGBTranspose with the destination written
+ // from bottom to top. So set the destination pointer to the end
+ // of the buffer and flip the sign of the destination stride.
+ dst += dst_stride * (width - 1);
+ dst_stride = -dst_stride;
+ ARGBTranspose(src, src_stride, dst, dst_stride, width, height);
+}
+
+void ARGBRotate180(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width, int height) {
+ // Swap first and last row and mirror the content. Uses a temporary row.
+ align_buffer_64(row, width * 4);
+ const uint8* src_bot = src + src_stride * (height - 1);
+ uint8* dst_bot = dst + dst_stride * (height - 1);
+ int half_height = (height + 1) >> 1;
+ int y;
+ void (*ARGBMirrorRow)(const uint8* src, uint8* dst, int width) =
+ ARGBMirrorRow_C;
+ void (*CopyRow)(const uint8* src, uint8* dst, int width) = CopyRow_C;
+#if defined(HAS_ARGBMIRRORROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && IS_ALIGNED(width, 4) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ARGBMirrorRow = ARGBMirrorRow_SSSE3;
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && IS_ALIGNED(width, 8)) {
+ ARGBMirrorRow = ARGBMirrorRow_AVX2;
+ }
+#endif
+#if defined(HAS_ARGBMIRRORROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width, 4)) {
+ ARGBMirrorRow = ARGBMirrorRow_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(width * 4, 32)) {
+ CopyRow = CopyRow_NEON;
+ }
+#endif
+#if defined(HAS_COPYROW_X86)
+ if (TestCpuFlag(kCpuHasX86)) {
+ CopyRow = CopyRow_X86;
+ }
+#endif
+#if defined(HAS_COPYROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(width * 4, 32) &&
+ IS_ALIGNED(src, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst, 16) && IS_ALIGNED(dst_stride, 16)) {
+ CopyRow = CopyRow_SSE2;
+ }
+#endif
+#if defined(HAS_COPYROW_ERMS)
+ if (TestCpuFlag(kCpuHasERMS)) {
+ CopyRow = CopyRow_ERMS;
+ }
+#endif
+#if defined(HAS_COPYROW_MIPS)
+ if (TestCpuFlag(kCpuHasMIPS)) {
+ CopyRow = CopyRow_MIPS;
+ }
+#endif
+
+ // Odd height will harmlessly mirror the middle row twice.
+ for (y = 0; y < half_height; ++y) {
+ ARGBMirrorRow(src, row, width); // Mirror first row into a buffer
+ ARGBMirrorRow(src_bot, dst, width); // Mirror last row into first row
+ CopyRow(row, dst_bot, width * 4); // Copy first mirrored row into last
+ src += src_stride;
+ dst += dst_stride;
+ src_bot -= src_stride;
+ dst_bot -= dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+
+LIBYUV_API
+int ARGBRotate(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_argb, int dst_stride_argb,
+ int width, int height,
+ enum RotationMode mode) {
+ if (!src_argb || width <= 0 || height == 0 || !dst_argb) {
+ return -1;
+ }
+
+ // Negative height means invert the image.
+ if (height < 0) {
+ height = -height;
+ src_argb = src_argb + (height - 1) * src_stride_argb;
+ src_stride_argb = -src_stride_argb;
+ }
+
+ switch (mode) {
+ case kRotate0:
+ // copy frame
+ return ARGBCopy(src_argb, src_stride_argb,
+ dst_argb, dst_stride_argb,
+ width, height);
+ case kRotate90:
+ ARGBRotate90(src_argb, src_stride_argb,
+ dst_argb, dst_stride_argb,
+ width, height);
+ return 0;
+ case kRotate270:
+ ARGBRotate270(src_argb, src_stride_argb,
+ dst_argb, dst_stride_argb,
+ width, height);
+ return 0;
+ case kRotate180:
+ ARGBRotate180(src_argb, src_stride_argb,
+ dst_argb, dst_stride_argb,
+ width, height);
+ return 0;
+ default:
+ break;
+ }
+ return -1;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/rotate_mips.cc b/third_party/libyuv/source/rotate_mips.cc
new file mode 100644
index 000000000..70770fd06
--- /dev/null
+++ b/third_party/libyuv/source/rotate_mips.cc
@@ -0,0 +1,485 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_MIPS) && \
+ defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
+
+void TransposeWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
+ "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
+ "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
+ "addu $t3, $t2, %[src_stride] \n"
+ "addu $t5, $t4, %[src_stride] \n"
+ "addu $t6, $t2, $t4 \n"
+ "andi $t0, %[dst], 0x3 \n"
+ "andi $t1, %[dst_stride], 0x3 \n"
+ "or $t0, $t0, $t1 \n"
+ "bnez $t0, 11f \n"
+ " subu $t7, $t9, %[src_stride] \n"
+//dst + dst_stride word aligned
+ "1: \n"
+ "lbu $t0, 0(%[src]) \n"
+ "lbux $t1, %[src_stride](%[src]) \n"
+ "lbux $t8, $t2(%[src]) \n"
+ "lbux $t9, $t3(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s0, $t8, $t0 \n"
+ "lbux $t0, $t4(%[src]) \n"
+ "lbux $t1, $t5(%[src]) \n"
+ "lbux $t8, $t6(%[src]) \n"
+ "lbux $t9, $t7(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s1, $t8, $t0 \n"
+ "sw $s0, 0(%[dst]) \n"
+ "addiu %[width], -1 \n"
+ "addiu %[src], 1 \n"
+ "sw $s1, 4(%[dst]) \n"
+ "bnez %[width], 1b \n"
+ " addu %[dst], %[dst], %[dst_stride] \n"
+ "b 2f \n"
+//dst + dst_stride unaligned
+ "11: \n"
+ "lbu $t0, 0(%[src]) \n"
+ "lbux $t1, %[src_stride](%[src]) \n"
+ "lbux $t8, $t2(%[src]) \n"
+ "lbux $t9, $t3(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s0, $t8, $t0 \n"
+ "lbux $t0, $t4(%[src]) \n"
+ "lbux $t1, $t5(%[src]) \n"
+ "lbux $t8, $t6(%[src]) \n"
+ "lbux $t9, $t7(%[src]) \n"
+ "sll $t1, $t1, 16 \n"
+ "sll $t9, $t9, 16 \n"
+ "or $t0, $t0, $t1 \n"
+ "or $t8, $t8, $t9 \n"
+ "precr.qb.ph $s1, $t8, $t0 \n"
+ "swr $s0, 0(%[dst]) \n"
+ "swl $s0, 3(%[dst]) \n"
+ "addiu %[width], -1 \n"
+ "addiu %[src], 1 \n"
+ "swr $s1, 4(%[dst]) \n"
+ "swl $s1, 7(%[dst]) \n"
+ "bnez %[width], 11b \n"
+ "addu %[dst], %[dst], %[dst_stride] \n"
+ "2: \n"
+ ".set pop \n"
+ :[src] "+r" (src),
+ [dst] "+r" (dst),
+ [width] "+r" (width)
+ :[src_stride] "r" (src_stride),
+ [dst_stride] "r" (dst_stride)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1"
+ );
+}
+
+void TransposeWx8_FAST_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width) {
+ __asm__ __volatile__ (
+ ".set noat \n"
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
+ "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
+ "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
+ "addu $t3, $t2, %[src_stride] \n"
+ "addu $t5, $t4, %[src_stride] \n"
+ "addu $t6, $t2, $t4 \n"
+
+ "srl $AT, %[width], 0x2 \n"
+ "andi $t0, %[dst], 0x3 \n"
+ "andi $t1, %[dst_stride], 0x3 \n"
+ "or $t0, $t0, $t1 \n"
+ "bnez $t0, 11f \n"
+ " subu $t7, $t9, %[src_stride] \n"
+//dst + dst_stride word aligned
+ "1: \n"
+ "lw $t0, 0(%[src]) \n"
+ "lwx $t1, %[src_stride](%[src]) \n"
+ "lwx $t8, $t2(%[src]) \n"
+ "lwx $t9, $t3(%[src]) \n"
+
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 21 | 01 | 20 | 00 |
+ // s1 = | 23 | 03 | 22 | 02 |
+ // s2 = | 31 | 11 | 30 | 10 |
+ // s3 = | 33 | 13 | 32 | 12 |
+
+ "precr.qb.ph $s4, $s1, $s0 \n"
+ "precrq.qb.ph $s5, $s1, $s0 \n"
+ "precr.qb.ph $s6, $s3, $s2 \n"
+ "precrq.qb.ph $s7, $s3, $s2 \n"
+
+ // s4 = | 03 | 02 | 01 | 00 |
+ // s5 = | 23 | 22 | 21 | 20 |
+ // s6 = | 13 | 12 | 11 | 10 |
+ // s7 = | 33 | 32 | 31 | 30 |
+
+ "lwx $t0, $t4(%[src]) \n"
+ "lwx $t1, $t5(%[src]) \n"
+ "lwx $t8, $t6(%[src]) \n"
+ "lwx $t9, $t7(%[src]) \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 25 | 05 | 24 | 04 |
+ // s1 = | 27 | 07 | 26 | 06 |
+ // s2 = | 35 | 15 | 34 | 14 |
+ // s3 = | 37 | 17 | 36 | 16 |
+
+ "precr.qb.ph $t0, $s1, $s0 \n"
+ "precrq.qb.ph $t1, $s1, $s0 \n"
+ "precr.qb.ph $t8, $s3, $s2 \n"
+ "precrq.qb.ph $t9, $s3, $s2 \n"
+
+ // t0 = | 07 | 06 | 05 | 04 |
+ // t1 = | 27 | 26 | 25 | 24 |
+ // t8 = | 17 | 16 | 15 | 14 |
+ // t9 = | 37 | 36 | 35 | 34 |
+
+ "addu $s0, %[dst], %[dst_stride] \n"
+ "addu $s1, $s0, %[dst_stride] \n"
+ "addu $s2, $s1, %[dst_stride] \n"
+
+ "sw $s4, 0(%[dst]) \n"
+ "sw $t0, 4(%[dst]) \n"
+ "sw $s6, 0($s0) \n"
+ "sw $t8, 4($s0) \n"
+ "sw $s5, 0($s1) \n"
+ "sw $t1, 4($s1) \n"
+ "sw $s7, 0($s2) \n"
+ "sw $t9, 4($s2) \n"
+
+ "addiu $AT, -1 \n"
+ "addiu %[src], 4 \n"
+
+ "bnez $AT, 1b \n"
+ " addu %[dst], $s2, %[dst_stride] \n"
+ "b 2f \n"
+//dst + dst_stride unaligned
+ "11: \n"
+ "lw $t0, 0(%[src]) \n"
+ "lwx $t1, %[src_stride](%[src]) \n"
+ "lwx $t8, $t2(%[src]) \n"
+ "lwx $t9, $t3(%[src]) \n"
+
+// t0 = | 30 | 20 | 10 | 00 |
+// t1 = | 31 | 21 | 11 | 01 |
+// t8 = | 32 | 22 | 12 | 02 |
+// t9 = | 33 | 23 | 13 | 03 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 21 | 01 | 20 | 00 |
+ // s1 = | 23 | 03 | 22 | 02 |
+ // s2 = | 31 | 11 | 30 | 10 |
+ // s3 = | 33 | 13 | 32 | 12 |
+
+ "precr.qb.ph $s4, $s1, $s0 \n"
+ "precrq.qb.ph $s5, $s1, $s0 \n"
+ "precr.qb.ph $s6, $s3, $s2 \n"
+ "precrq.qb.ph $s7, $s3, $s2 \n"
+
+ // s4 = | 03 | 02 | 01 | 00 |
+ // s5 = | 23 | 22 | 21 | 20 |
+ // s6 = | 13 | 12 | 11 | 10 |
+ // s7 = | 33 | 32 | 31 | 30 |
+
+ "lwx $t0, $t4(%[src]) \n"
+ "lwx $t1, $t5(%[src]) \n"
+ "lwx $t8, $t6(%[src]) \n"
+ "lwx $t9, $t7(%[src]) \n"
+
+// t0 = | 34 | 24 | 14 | 04 |
+// t1 = | 35 | 25 | 15 | 05 |
+// t8 = | 36 | 26 | 16 | 06 |
+// t9 = | 37 | 27 | 17 | 07 |
+
+ "precr.qb.ph $s0, $t1, $t0 \n"
+ "precr.qb.ph $s1, $t9, $t8 \n"
+ "precrq.qb.ph $s2, $t1, $t0 \n"
+ "precrq.qb.ph $s3, $t9, $t8 \n"
+
+ // s0 = | 25 | 05 | 24 | 04 |
+ // s1 = | 27 | 07 | 26 | 06 |
+ // s2 = | 35 | 15 | 34 | 14 |
+ // s3 = | 37 | 17 | 36 | 16 |
+
+ "precr.qb.ph $t0, $s1, $s0 \n"
+ "precrq.qb.ph $t1, $s1, $s0 \n"
+ "precr.qb.ph $t8, $s3, $s2 \n"
+ "precrq.qb.ph $t9, $s3, $s2 \n"
+
+ // t0 = | 07 | 06 | 05 | 04 |
+ // t1 = | 27 | 26 | 25 | 24 |
+ // t8 = | 17 | 16 | 15 | 14 |
+ // t9 = | 37 | 36 | 35 | 34 |
+
+ "addu $s0, %[dst], %[dst_stride] \n"
+ "addu $s1, $s0, %[dst_stride] \n"
+ "addu $s2, $s1, %[dst_stride] \n"
+
+ "swr $s4, 0(%[dst]) \n"
+ "swl $s4, 3(%[dst]) \n"
+ "swr $t0, 4(%[dst]) \n"
+ "swl $t0, 7(%[dst]) \n"
+ "swr $s6, 0($s0) \n"
+ "swl $s6, 3($s0) \n"
+ "swr $t8, 4($s0) \n"
+ "swl $t8, 7($s0) \n"
+ "swr $s5, 0($s1) \n"
+ "swl $s5, 3($s1) \n"
+ "swr $t1, 4($s1) \n"
+ "swl $t1, 7($s1) \n"
+ "swr $s7, 0($s2) \n"
+ "swl $s7, 3($s2) \n"
+ "swr $t9, 4($s2) \n"
+ "swl $t9, 7($s2) \n"
+
+ "addiu $AT, -1 \n"
+ "addiu %[src], 4 \n"
+
+ "bnez $AT, 11b \n"
+ " addu %[dst], $s2, %[dst_stride] \n"
+ "2: \n"
+ ".set pop \n"
+ ".set at \n"
+ :[src] "+r" (src),
+ [dst] "+r" (dst),
+ [width] "+r" (width)
+ :[src_stride] "r" (src_stride),
+ [dst_stride] "r" (dst_stride)
+ : "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7"
+ );
+}
+
+void TransposeUVWx8_MIPS_DSPR2(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) {
+ __asm__ __volatile__ (
+ ".set push \n"
+ ".set noreorder \n"
+ "beqz %[width], 2f \n"
+ " sll $t2, %[src_stride], 0x1 \n" // src_stride x 2
+ "sll $t4, %[src_stride], 0x2 \n" // src_stride x 4
+ "sll $t9, %[src_stride], 0x3 \n" // src_stride x 8
+ "addu $t3, $t2, %[src_stride] \n"
+ "addu $t5, $t4, %[src_stride] \n"
+ "addu $t6, $t2, $t4 \n"
+ "subu $t7, $t9, %[src_stride] \n"
+ "srl $t1, %[width], 1 \n"
+
+// check word aligment for dst_a, dst_b, dst_stride_a and dst_stride_b
+ "andi $t0, %[dst_a], 0x3 \n"
+ "andi $t8, %[dst_b], 0x3 \n"
+ "or $t0, $t0, $t8 \n"
+ "andi $t8, %[dst_stride_a], 0x3 \n"
+ "andi $s5, %[dst_stride_b], 0x3 \n"
+ "or $t8, $t8, $s5 \n"
+ "or $t0, $t0, $t8 \n"
+ "bnez $t0, 11f \n"
+ " nop \n"
+// dst + dst_stride word aligned (both, a & b dst addresses)
+ "1: \n"
+ "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
+ "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
+ "addu $s5, %[dst_a], %[dst_stride_a] \n"
+ "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
+ "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
+ "addu $s6, %[dst_b], %[dst_stride_b] \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
+
+ "sw $s3, 0($s5) \n"
+ "sw $s4, 0($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
+
+ "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
+ "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
+ "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
+ "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
+ "sw $s3, 0(%[dst_a]) \n"
+ "sw $s4, 0(%[dst_b]) \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
+ "sw $s3, 4($s5) \n"
+ "sw $s4, 4($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
+
+ "addiu %[src], 4 \n"
+ "addiu $t1, -1 \n"
+ "sll $t0, %[dst_stride_a], 1 \n"
+ "sll $t8, %[dst_stride_b], 1 \n"
+ "sw $s3, 4(%[dst_a]) \n"
+ "sw $s4, 4(%[dst_b]) \n"
+ "addu %[dst_a], %[dst_a], $t0 \n"
+ "bnez $t1, 1b \n"
+ " addu %[dst_b], %[dst_b], $t8 \n"
+ "b 2f \n"
+ " nop \n"
+
+// dst_a or dst_b or dst_stride_a or dst_stride_b not word aligned
+ "11: \n"
+ "lw $t0, 0(%[src]) \n" // |B0|A0|b0|a0|
+ "lwx $t8, %[src_stride](%[src]) \n" // |B1|A1|b1|a1|
+ "addu $s5, %[dst_a], %[dst_stride_a] \n"
+ "lwx $t9, $t2(%[src]) \n" // |B2|A2|b2|a2|
+ "lwx $s0, $t3(%[src]) \n" // |B3|A3|b3|a3|
+ "addu $s6, %[dst_b], %[dst_stride_b] \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B1|A1|B0|A0|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B3|A3|B2|A2|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A3|A2|A1|A0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B3|B2|B1|B0|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b1|a1|b0|a0|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b3|a3|b2|a2|
+
+ "swr $s3, 0($s5) \n"
+ "swl $s3, 3($s5) \n"
+ "swr $s4, 0($s6) \n"
+ "swl $s4, 3($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a3|a2|a1|a0|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b3|b2|b1|b0|
+
+ "lwx $t0, $t4(%[src]) \n" // |B4|A4|b4|a4|
+ "lwx $t8, $t5(%[src]) \n" // |B5|A5|b5|a5|
+ "lwx $t9, $t6(%[src]) \n" // |B6|A6|b6|a6|
+ "lwx $s0, $t7(%[src]) \n" // |B7|A7|b7|a7|
+ "swr $s3, 0(%[dst_a]) \n"
+ "swl $s3, 3(%[dst_a]) \n"
+ "swr $s4, 0(%[dst_b]) \n"
+ "swl $s4, 3(%[dst_b]) \n"
+
+ "precrq.ph.w $s1, $t8, $t0 \n" // |B5|A5|B4|A4|
+ "precrq.ph.w $s2, $s0, $t9 \n" // |B6|A6|B7|A7|
+ "precr.qb.ph $s3, $s2, $s1 \n" // |A7|A6|A5|A4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |B7|B6|B5|B4|
+
+ "sll $t0, $t0, 16 \n"
+ "packrl.ph $s1, $t8, $t0 \n" // |b5|a5|b4|a4|
+ "sll $t9, $t9, 16 \n"
+ "packrl.ph $s2, $s0, $t9 \n" // |b7|a7|b6|a6|
+
+ "swr $s3, 4($s5) \n"
+ "swl $s3, 7($s5) \n"
+ "swr $s4, 4($s6) \n"
+ "swl $s4, 7($s6) \n"
+
+ "precr.qb.ph $s3, $s2, $s1 \n" // |a7|a6|a5|a4|
+ "precrq.qb.ph $s4, $s2, $s1 \n" // |b7|b6|b5|b4|
+
+ "addiu %[src], 4 \n"
+ "addiu $t1, -1 \n"
+ "sll $t0, %[dst_stride_a], 1 \n"
+ "sll $t8, %[dst_stride_b], 1 \n"
+ "swr $s3, 4(%[dst_a]) \n"
+ "swl $s3, 7(%[dst_a]) \n"
+ "swr $s4, 4(%[dst_b]) \n"
+ "swl $s4, 7(%[dst_b]) \n"
+ "addu %[dst_a], %[dst_a], $t0 \n"
+ "bnez $t1, 11b \n"
+ " addu %[dst_b], %[dst_b], $t8 \n"
+
+ "2: \n"
+ ".set pop \n"
+ : [src] "+r" (src),
+ [dst_a] "+r" (dst_a),
+ [dst_b] "+r" (dst_b),
+ [width] "+r" (width),
+ [src_stride] "+r" (src_stride)
+ : [dst_stride_a] "r" (dst_stride_a),
+ [dst_stride_b] "r" (dst_stride_b)
+ : "t0", "t1", "t2", "t3", "t4", "t5",
+ "t6", "t7", "t8", "t9",
+ "s0", "s1", "s2", "s3",
+ "s4", "s5", "s6"
+ );
+}
+
+#endif // defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/rotate_neon.cc b/third_party/libyuv/source/rotate_neon.cc
new file mode 100644
index 000000000..d354e11fa
--- /dev/null
+++ b/third_party/libyuv/source/rotate_neon.cc
@@ -0,0 +1,533 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#include "libyuv/basic_types.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__ARM_NEON__)
+
+static uvec8 kVTbl4x4Transpose =
+ { 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15 };
+
+void TransposeWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst, int dst_stride,
+ int width) {
+ const uint8* src_temp = NULL;
+ asm volatile (
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+ // at w-8 allow for this
+ "sub %5, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ ".p2align 2 \n"
+ "1: \n"
+ "mov %0, %1 \n"
+
+ MEMACCESS(0)
+ "vld1.8 {d0}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d1}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d2}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d3}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d4}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d5}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d6}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.8 {d7}, [%0] \n"
+
+ "vtrn.8 d1, d0 \n"
+ "vtrn.8 d3, d2 \n"
+ "vtrn.8 d5, d4 \n"
+ "vtrn.8 d7, d6 \n"
+
+ "vtrn.16 d1, d3 \n"
+ "vtrn.16 d0, d2 \n"
+ "vtrn.16 d5, d7 \n"
+ "vtrn.16 d4, d6 \n"
+
+ "vtrn.32 d1, d5 \n"
+ "vtrn.32 d0, d4 \n"
+ "vtrn.32 d3, d7 \n"
+ "vtrn.32 d2, d6 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "vst1.8 {d1}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d0}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d3}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d2}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d5}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d4}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d7}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d6}, [%0] \n"
+
+ "add %1, #8 \n" // src += 8
+ "add %3, %3, %4, lsl #3 \n" // dst += 8 * dst_stride
+ "subs %5, #8 \n" // w -= 8
+ "bge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %5, #8 \n"
+ "beq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %5, #2 \n"
+ "blt 3f \n"
+
+ "cmp %5, #4 \n"
+ "blt 2f \n"
+
+ // 4x8 block
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "vld1.32 {d0[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d0[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d1[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d1[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d2[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d2[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d3[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.32 {d3[1]}, [%0] \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(6)
+ "vld1.8 {q3}, [%6] \n"
+
+ "vtbl.8 d4, {d0, d1}, d6 \n"
+ "vtbl.8 d5, {d0, d1}, d7 \n"
+ "vtbl.8 d0, {d2, d3}, d6 \n"
+ "vtbl.8 d1, {d2, d3}, d7 \n"
+
+ // TODO(frkoenig): Rework shuffle above to
+ // write out with 4 instead of 8 writes.
+ MEMACCESS(0)
+ "vst1.32 {d4[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d4[1]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d5[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d5[1]}, [%0] \n"
+
+ "add %0, %3, #4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d0[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d0[1]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d1[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d1[1]}, [%0] \n"
+
+ "add %1, #4 \n" // src += 4
+ "add %3, %3, %4, lsl #2 \n" // dst += 4 * dst_stride
+ "subs %5, #4 \n" // w -= 4
+ "beq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %5, #2 \n"
+ "blt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "vld1.16 {d0[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d1[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d0[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d1[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d0[2]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d1[2]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d0[3]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.16 {d1[3]}, [%0] \n"
+
+ "vtrn.8 d0, d1 \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "vst1.64 {d0}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.64 {d1}, [%0] \n"
+
+ "add %1, #2 \n" // src += 2
+ "add %3, %3, %4, lsl #1 \n" // dst += 2 * dst_stride
+ "subs %5, #2 \n" // w -= 2
+ "beq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[0]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[1]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[2]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[3]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[4]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[5]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[6]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld1.8 {d0[7]}, [%1] \n"
+
+ MEMACCESS(3)
+ "vst1.64 {d0}, [%3] \n"
+
+ "4: \n"
+
+ : "+r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst), // %3
+ "+r"(dst_stride), // %4
+ "+r"(width) // %5
+ : "r"(&kVTbl4x4Transpose) // %6
+ : "memory", "cc", "q0", "q1", "q2", "q3"
+ );
+}
+
+static uvec8 kVTbl4x4TransposeDi =
+ { 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15 };
+
+void TransposeUVWx8_NEON(const uint8* src, int src_stride,
+ uint8* dst_a, int dst_stride_a,
+ uint8* dst_b, int dst_stride_b,
+ int width) {
+ const uint8* src_temp = NULL;
+ asm volatile (
+ // loops are on blocks of 8. loop will stop when
+ // counter gets to or below 0. starting the counter
+ // at w-8 allow for this
+ "sub %7, #8 \n"
+
+ // handle 8x8 blocks. this should be the majority of the plane
+ ".p2align 2 \n"
+ "1: \n"
+ "mov %0, %1 \n"
+
+ MEMACCESS(0)
+ "vld2.8 {d0, d1}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d2, d3}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d4, d5}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d6, d7}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d16, d17}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d18, d19}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d20, d21}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.8 {d22, d23}, [%0] \n"
+
+ "vtrn.8 q1, q0 \n"
+ "vtrn.8 q3, q2 \n"
+ "vtrn.8 q9, q8 \n"
+ "vtrn.8 q11, q10 \n"
+
+ "vtrn.16 q1, q3 \n"
+ "vtrn.16 q0, q2 \n"
+ "vtrn.16 q9, q11 \n"
+ "vtrn.16 q8, q10 \n"
+
+ "vtrn.32 q1, q9 \n"
+ "vtrn.32 q0, q8 \n"
+ "vtrn.32 q3, q11 \n"
+ "vtrn.32 q2, q10 \n"
+
+ "vrev16.8 q0, q0 \n"
+ "vrev16.8 q1, q1 \n"
+ "vrev16.8 q2, q2 \n"
+ "vrev16.8 q3, q3 \n"
+ "vrev16.8 q8, q8 \n"
+ "vrev16.8 q9, q9 \n"
+ "vrev16.8 q10, q10 \n"
+ "vrev16.8 q11, q11 \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "vst1.8 {d2}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d0}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d6}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d4}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d18}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d16}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d22}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.8 {d20}, [%0] \n"
+
+ "mov %0, %5 \n"
+
+ MEMACCESS(0)
+ "vst1.8 {d3}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d1}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d7}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d5}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d19}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d17}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d23}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.8 {d21}, [%0] \n"
+
+ "add %1, #8*2 \n" // src += 8*2
+ "add %3, %3, %4, lsl #3 \n" // dst_a += 8 * dst_stride_a
+ "add %5, %5, %6, lsl #3 \n" // dst_b += 8 * dst_stride_b
+ "subs %7, #8 \n" // w -= 8
+ "bge 1b \n"
+
+ // add 8 back to counter. if the result is 0 there are
+ // no residuals.
+ "adds %7, #8 \n"
+ "beq 4f \n"
+
+ // some residual, so between 1 and 7 lines left to transpose
+ "cmp %7, #2 \n"
+ "blt 3f \n"
+
+ "cmp %7, #4 \n"
+ "blt 2f \n"
+
+ // TODO(frkoenig): Clean this up
+ // 4x8 block
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "vld1.64 {d0}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d1}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d2}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d3}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d4}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d5}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d6}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld1.64 {d7}, [%0] \n"
+
+ MEMACCESS(8)
+ "vld1.8 {q15}, [%8] \n"
+
+ "vtrn.8 q0, q1 \n"
+ "vtrn.8 q2, q3 \n"
+
+ "vtbl.8 d16, {d0, d1}, d30 \n"
+ "vtbl.8 d17, {d0, d1}, d31 \n"
+ "vtbl.8 d18, {d2, d3}, d30 \n"
+ "vtbl.8 d19, {d2, d3}, d31 \n"
+ "vtbl.8 d20, {d4, d5}, d30 \n"
+ "vtbl.8 d21, {d4, d5}, d31 \n"
+ "vtbl.8 d22, {d6, d7}, d30 \n"
+ "vtbl.8 d23, {d6, d7}, d31 \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "vst1.32 {d16[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d16[1]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d17[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d17[1]}, [%0], %4 \n"
+
+ "add %0, %3, #4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d20[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d20[1]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d21[0]}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d21[1]}, [%0] \n"
+
+ "mov %0, %5 \n"
+
+ MEMACCESS(0)
+ "vst1.32 {d18[0]}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.32 {d18[1]}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.32 {d19[0]}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.32 {d19[1]}, [%0], %6 \n"
+
+ "add %0, %5, #4 \n"
+ MEMACCESS(0)
+ "vst1.32 {d22[0]}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.32 {d22[1]}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.32 {d23[0]}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.32 {d23[1]}, [%0] \n"
+
+ "add %1, #4*2 \n" // src += 4 * 2
+ "add %3, %3, %4, lsl #2 \n" // dst_a += 4 * dst_stride_a
+ "add %5, %5, %6, lsl #2 \n" // dst_b += 4 * dst_stride_b
+ "subs %7, #4 \n" // w -= 4
+ "beq 4f \n"
+
+ // some residual, check to see if it includes a 2x8 block,
+ // or less
+ "cmp %7, #2 \n"
+ "blt 3f \n"
+
+ // 2x8 block
+ "2: \n"
+ "mov %0, %1 \n"
+ MEMACCESS(0)
+ "vld2.16 {d0[0], d2[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d1[0], d3[0]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d0[1], d2[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d1[1], d3[1]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d0[2], d2[2]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d1[2], d3[2]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d0[3], d2[3]}, [%0], %2 \n"
+ MEMACCESS(0)
+ "vld2.16 {d1[3], d3[3]}, [%0] \n"
+
+ "vtrn.8 d0, d1 \n"
+ "vtrn.8 d2, d3 \n"
+
+ "mov %0, %3 \n"
+
+ MEMACCESS(0)
+ "vst1.64 {d0}, [%0], %4 \n"
+ MEMACCESS(0)
+ "vst1.64 {d2}, [%0] \n"
+
+ "mov %0, %5 \n"
+
+ MEMACCESS(0)
+ "vst1.64 {d1}, [%0], %6 \n"
+ MEMACCESS(0)
+ "vst1.64 {d3}, [%0] \n"
+
+ "add %1, #2*2 \n" // src += 2 * 2
+ "add %3, %3, %4, lsl #1 \n" // dst_a += 2 * dst_stride_a
+ "add %5, %5, %6, lsl #1 \n" // dst_b += 2 * dst_stride_b
+ "subs %7, #2 \n" // w -= 2
+ "beq 4f \n"
+
+ // 1x8 block
+ "3: \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[0], d1[0]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[1], d1[1]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[2], d1[2]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[3], d1[3]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[4], d1[4]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[5], d1[5]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[6], d1[6]}, [%1], %2 \n"
+ MEMACCESS(1)
+ "vld2.8 {d0[7], d1[7]}, [%1] \n"
+
+ MEMACCESS(3)
+ "vst1.64 {d0}, [%3] \n"
+ MEMACCESS(5)
+ "vst1.64 {d1}, [%5] \n"
+
+ "4: \n"
+
+ : "+r"(src_temp), // %0
+ "+r"(src), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_a), // %3
+ "+r"(dst_stride_a), // %4
+ "+r"(dst_b), // %5
+ "+r"(dst_stride_b), // %6
+ "+r"(width) // %7
+ : "r"(&kVTbl4x4TransposeDi) // %8
+ : "memory", "cc",
+ "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11"
+ );
+}
+#endif
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/row_any.cc b/third_party/libyuv/source/row_any.cc
index 27a0de119..97ef84417 100644
--- a/third_party/libyuv/source/row_any.cc
+++ b/third_party/libyuv/source/row_any.cc
@@ -8,9 +8,9 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
-#include "third_party/libyuv/include/libyuv/basic_types.h"
+#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
@@ -35,10 +35,12 @@ extern "C" {
}
#ifdef HAS_I422TOARGBROW_SSSE3
-YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
- 0, 4, 7)
YANY(I422ToARGBRow_Any_SSSE3, I422ToARGBRow_Unaligned_SSSE3, I422ToARGBRow_C,
1, 4, 7)
+#endif // HAS_I422TOARGBROW_SSSE3
+#ifdef HAS_I444TOARGBROW_SSSE3
+YANY(I444ToARGBRow_Any_SSSE3, I444ToARGBRow_Unaligned_SSSE3, I444ToARGBRow_C,
+ 0, 4, 7)
YANY(I411ToARGBRow_Any_SSSE3, I411ToARGBRow_Unaligned_SSSE3, I411ToARGBRow_C,
2, 4, 7)
YANY(I422ToBGRARow_Any_SSSE3, I422ToBGRARow_Unaligned_SSSE3, I422ToBGRARow_C,
@@ -59,7 +61,7 @@ YANY(I422ToRGB24Row_Any_SSSE3, I422ToRGB24Row_SSSE3, I422ToRGB24Row_C, 1, 3, 7)
YANY(I422ToRAWRow_Any_SSSE3, I422ToRAWRow_SSSE3, I422ToRAWRow_C, 1, 3, 7)
YANY(I422ToYUY2Row_Any_SSE2, I422ToYUY2Row_SSE2, I422ToYUY2Row_C, 1, 2, 15)
YANY(I422ToUYVYRow_Any_SSE2, I422ToUYVYRow_SSE2, I422ToUYVYRow_C, 1, 2, 15)
-#endif // HAS_I422TOARGBROW_SSSE3
+#endif // HAS_I444TOARGBROW_SSSE3
#ifdef HAS_I422TOARGBROW_AVX2
YANY(I422ToARGBRow_Any_AVX2, I422ToARGBRow_AVX2, I422ToARGBRow_C, 1, 4, 15)
#endif // HAS_I422TOARGBROW_AVX2
diff --git a/third_party/libyuv/source/row_common.cc b/third_party/libyuv/source/row_common.cc
index ceb3836cd..fa2b752a2 100644
--- a/third_party/libyuv/source/row_common.cc
+++ b/third_party/libyuv/source/row_common.cc
@@ -8,11 +8,11 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
#include <string.h> // For memcpy and memset.
-#include "third_party/libyuv/include/libyuv/basic_types.h"
+#include "libyuv/basic_types.h"
#ifdef __cplusplus
namespace libyuv {
diff --git a/third_party/libyuv/source/row_mips.cc b/third_party/libyuv/source/row_mips.cc
index a804670d3..ae9370c1b 100644
--- a/third_party/libyuv/source/row_mips.cc
+++ b/third_party/libyuv/source/row_mips.cc
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
@@ -16,7 +16,8 @@ extern "C" {
#endif
// The following are available on Mips platforms:
-#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__)
+#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips__) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
#ifdef HAS_COPYROW_MIPS
void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
@@ -376,7 +377,9 @@ void CopyRow_MIPS(const uint8* src, uint8* dst, int count) {
// MIPS DSPR2 functions
#if !defined(LIBYUV_DISABLE_MIPS) && defined(__mips_dsp) && \
- (__mips_dsp_rev >= 2)
+ (__mips_dsp_rev >= 2) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
+
void SplitUVRow_MIPS_DSPR2(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
int width) {
__asm__ __volatile__ (
diff --git a/third_party/libyuv/source/row_neon.cc b/third_party/libyuv/source/row_neon.cc
index c5ae2c583..1392cf5fc 100644
--- a/third_party/libyuv/source/row_neon.cc
+++ b/third_party/libyuv/source/row_neon.cc
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
@@ -20,34 +20,46 @@ extern "C" {
// Read 8 Y, 4 U and 4 V from 422
#define READYUV422 \
+ MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
"vld1.32 {d2[0]}, [%1]! \n" \
+ MEMACCESS(2) \
"vld1.32 {d2[1]}, [%2]! \n"
// Read 8 Y, 2 U and 2 V from 422
#define READYUV411 \
+ MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
"vld1.16 {d2[0]}, [%1]! \n" \
+ MEMACCESS(2) \
"vld1.16 {d2[1]}, [%2]! \n" \
"vmov.u8 d3, d2 \n" \
"vzip.u8 d2, d3 \n"
// Read 8 Y, 8 U and 8 V from 444
#define READYUV444 \
+ MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
"vld1.8 {d2}, [%1]! \n" \
+ MEMACCESS(2) \
"vld1.8 {d3}, [%2]! \n" \
"vpaddl.u8 q1, q1 \n" \
"vrshrn.u16 d2, q1, #1 \n"
// Read 8 Y, and set 4 U and 4 V to 128
#define READYUV400 \
+ MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
"vmov.u8 d2, #128 \n"
// Read 8 Y and 4 UV from NV12
#define READNV12 \
+ MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
"vld1.8 {d2}, [%1]! \n" \
"vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
"vuzp.u8 d2, d3 \n" \
@@ -55,7 +67,9 @@ extern "C" {
// Read 8 Y and 4 VU from NV21
#define READNV21 \
+ MEMACCESS(0) \
"vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
"vld1.8 {d2}, [%1]! \n" \
"vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
"vuzp.u8 d3, d2 \n" \
@@ -63,6 +77,7 @@ extern "C" {
// Read 8 YUY2
#define READYUY2 \
+ MEMACCESS(0) \
"vld2.8 {d0, d2}, [%0]! \n" \
"vmov.u8 d3, d2 \n" \
"vuzp.u8 d2, d3 \n" \
@@ -70,6 +85,7 @@ extern "C" {
// Read 8 UYVY
#define READUYVY \
+ MEMACCESS(0) \
"vld2.8 {d2, d3}, [%0]! \n" \
"vmov.u8 d0, d3 \n" \
"vmov.u8 d3, d2 \n" \
@@ -113,7 +129,9 @@ void I444ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -124,6 +142,7 @@ void I444ToARGBRow_NEON(const uint8* src_y,
YUV422TORGB
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -144,7 +163,9 @@ void I422ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -155,6 +176,7 @@ void I422ToARGBRow_NEON(const uint8* src_y,
YUV422TORGB
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -175,7 +197,9 @@ void I411ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -186,6 +210,7 @@ void I411ToARGBRow_NEON(const uint8* src_y,
YUV422TORGB
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -206,7 +231,9 @@ void I422ToBGRARow_NEON(const uint8* src_y,
uint8* dst_bgra,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -218,6 +245,7 @@ void I422ToBGRARow_NEON(const uint8* src_y,
"subs %4, %4, #8 \n"
"vswp.u8 d20, d22 \n"
"vmov.u8 d19, #255 \n"
+ MEMACCESS(3)
"vst4.8 {d19, d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -238,7 +266,9 @@ void I422ToABGRRow_NEON(const uint8* src_y,
uint8* dst_abgr,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -250,6 +280,7 @@ void I422ToABGRRow_NEON(const uint8* src_y,
"subs %4, %4, #8 \n"
"vswp.u8 d20, d22 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
"vst4.8 {d20, d21, d22, d23}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -270,7 +301,9 @@ void I422ToRGBARow_NEON(const uint8* src_y,
uint8* dst_rgba,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -281,6 +314,7 @@ void I422ToRGBARow_NEON(const uint8* src_y,
YUV422TORGB
"subs %4, %4, #8 \n"
"vmov.u8 d19, #255 \n"
+ MEMACCESS(3)
"vst4.8 {d19, d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -301,7 +335,9 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
uint8* dst_rgb24,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -311,6 +347,7 @@ void I422ToRGB24Row_NEON(const uint8* src_y,
READYUV422
YUV422TORGB
"subs %4, %4, #8 \n"
+ MEMACCESS(3)
"vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -331,7 +368,9 @@ void I422ToRAWRow_NEON(const uint8* src_y,
uint8* dst_raw,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -342,6 +381,7 @@ void I422ToRAWRow_NEON(const uint8* src_y,
YUV422TORGB
"subs %4, %4, #8 \n"
"vswp.u8 d20, d22 \n"
+ MEMACCESS(3)
"vst3.8 {d20, d21, d22}, [%3]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -374,7 +414,9 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -385,6 +427,7 @@ void I422ToRGB565Row_NEON(const uint8* src_y,
YUV422TORGB
"subs %4, %4, #8 \n"
ARGBTORGB565
+ MEMACCESS(3)
"vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -420,7 +463,9 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
uint8* dst_argb1555,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -432,6 +477,7 @@ void I422ToARGB1555Row_NEON(const uint8* src_y,
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n"
ARGBTOARGB1555
+ MEMACCESS(3)
"vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -461,7 +507,9 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
uint8* dst_argb4444,
int width) {
asm volatile (
+ MEMACCESS(5)
"vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
"vld1.8 {d25}, [%6] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -474,6 +522,7 @@ void I422ToARGB4444Row_NEON(const uint8* src_y,
"subs %4, %4, #8 \n"
"vmov.u8 d23, #255 \n"
ARGBTOARGB4444
+ MEMACCESS(3)
"vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -492,7 +541,9 @@ void YToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(3)
"vld1.8 {d24}, [%3] \n"
+ MEMACCESS(4)
"vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -503,6 +554,7 @@ void YToARGBRow_NEON(const uint8* src_y,
YUV422TORGB
"subs %2, %2, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -522,10 +574,12 @@ void I400ToARGBRow_NEON(const uint8* src_y,
".p2align 2 \n"
"vmov.u8 d23, #255 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d20}, [%0]! \n"
"vmov d21, d20 \n"
"vmov d22, d20 \n"
"subs %2, %2, #8 \n"
+ MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -541,7 +595,9 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(4)
"vld1.8 {d24}, [%4] \n"
+ MEMACCESS(5)
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -552,6 +608,7 @@ void NV12ToARGBRow_NEON(const uint8* src_y,
YUV422TORGB
"subs %3, %3, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(2)
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -570,7 +627,9 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(4)
"vld1.8 {d24}, [%4] \n"
+ MEMACCESS(5)
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -581,6 +640,7 @@ void NV21ToARGBRow_NEON(const uint8* src_y,
YUV422TORGB
"subs %3, %3, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(2)
"vst4.8 {d20, d21, d22, d23}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -599,7 +659,9 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
+ MEMACCESS(4)
"vld1.8 {d24}, [%4] \n"
+ MEMACCESS(5)
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -610,6 +672,7 @@ void NV12ToRGB565Row_NEON(const uint8* src_y,
YUV422TORGB
"subs %3, %3, #8 \n"
ARGBTORGB565
+ MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -628,7 +691,9 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
uint8* dst_rgb565,
int width) {
asm volatile (
+ MEMACCESS(4)
"vld1.8 {d24}, [%4] \n"
+ MEMACCESS(5)
"vld1.8 {d25}, [%5] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -639,6 +704,7 @@ void NV21ToRGB565Row_NEON(const uint8* src_y,
YUV422TORGB
"subs %3, %3, #8 \n"
ARGBTORGB565
+ MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -656,7 +722,9 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(3)
"vld1.8 {d24}, [%3] \n"
+ MEMACCESS(4)
"vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -667,6 +735,7 @@ void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
YUV422TORGB
"subs %2, %2, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_yuy2), // %0
@@ -683,7 +752,9 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
uint8* dst_argb,
int width) {
asm volatile (
+ MEMACCESS(3)
"vld1.8 {d24}, [%3] \n"
+ MEMACCESS(4)
"vld1.8 {d25}, [%4] \n"
"vmov.u8 d26, #128 \n"
"vmov.u16 q14, #74 \n"
@@ -694,6 +765,7 @@ void UYVYToARGBRow_NEON(const uint8* src_uyvy,
YUV422TORGB
"subs %2, %2, #8 \n"
"vmov.u8 d23, #255 \n"
+ MEMACCESS(1)
"vst4.8 {d20, d21, d22, d23}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_uyvy), // %0
@@ -712,9 +784,12 @@ void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
"subs %3, %3, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store U
+ MEMACCESS(2)
"vst1.8 {q1}, [%2]! \n" // store V
"bgt 1b \n"
: "+r"(src_uv), // %0
@@ -732,9 +807,12 @@ void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load U
+ MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" // load V
"subs %3, %3, #16 \n" // 16 processed per loop
+ MEMACCESS(2)
"vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
"bgt 1b \n"
:
@@ -752,8 +830,10 @@ void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
"subs %2, %2, #32 \n" // 32 processed per loop
+ MEMACCESS(1)
"vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
"bgt 1b \n"
: "+r"(src), // %0
@@ -770,6 +850,7 @@ void SetRow_NEON(uint8* dst, uint32 v32, int count) {
"vdup.u32 q0, %2 \n" // duplicate 4 ints
"1: \n"
"subs %1, %1, #16 \n" // 16 bytes per loop
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n" // store
"bgt 1b \n"
: "+r"(dst), // %0
@@ -798,10 +879,13 @@ void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0], r3 \n" // src -= 16
"subs %2, #16 \n" // 16 pixels per loop.
"vrev64.8 q0, q0 \n"
+ MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // dst += 16
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n"
"bgt 1b \n"
: "+r"(src), // %0
@@ -822,10 +906,13 @@ void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
"subs %3, #8 \n" // 8 pixels per loop.
"vrev64.8 q0, q0 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // dst += 8
+ MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_uv), // %0
@@ -846,10 +933,13 @@ void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0], r3 \n" // src -= 16
"subs %2, #4 \n" // 4 pixels per loop.
"vrev64.32 q0, q0 \n"
+ MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // dst += 16
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n"
"bgt 1b \n"
: "+r"(src), // %0
@@ -865,8 +955,10 @@ void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
"vmov.u8 d4, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
+ MEMACCESS(1)
"vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_rgb24), // %0
@@ -882,9 +974,11 @@ void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
"vmov.u8 d4, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B
+ MEMACCESS(1)
"vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_raw), // %0
@@ -912,9 +1006,11 @@ void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
"vmov.u8 d3, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
+ MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_rgb565), // %0
@@ -958,9 +1054,11 @@ void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
"vmov.u8 d3, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
+ MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_argb1555), // %0
@@ -987,9 +1085,11 @@ void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
"vmov.u8 d3, #255 \n" // Alpha
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
+ MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_argb4444), // %0
@@ -1004,8 +1104,10 @@ void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
+ MEMACCESS(1)
"vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1020,9 +1122,11 @@ void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vswp.u8 d1, d3 \n" // swap R, B
+ MEMACCESS(1)
"vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1037,8 +1141,10 @@ void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
"subs %2, %2, #16 \n" // 16 processed per loop.
+ MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
@@ -1053,8 +1159,10 @@ void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
"subs %2, %2, #16 \n" // 16 processed per loop.
+ MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
@@ -1070,9 +1178,12 @@ void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // store 8 U.
+ MEMACCESS(2)
"vst1.8 {d3}, [%2]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
@@ -1089,9 +1200,12 @@ void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 U.
+ MEMACCESS(2)
"vst1.8 {d2}, [%2]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
@@ -1109,12 +1223,16 @@ void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
"add %1, %0, %1 \n" // stride + src_yuy2
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
"vrhadd.u8 d1, d1, d5 \n" // average rows of U
"vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" // store 8 U.
+ MEMACCESS(3)
"vst1.8 {d3}, [%3]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_yuy2), // %0
@@ -1133,12 +1251,16 @@ void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
"add %1, %0, %1 \n" // stride + src_uyvy
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
"subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
"vrhadd.u8 d0, d0, d4 \n" // average rows of U
"vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 U.
+ MEMACCESS(3)
"vst1.8 {d2}, [%3]! \n" // store 8 V.
"bgt 1b \n"
: "+r"(src_uyvy), // %0
@@ -1157,10 +1279,13 @@ void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
// change the stride to row 2 pointer
"add %1, %0 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
"subs %3, %3, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
"vrhadd.u8 q0, q1 \n" // average row 1 and 2
+ MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_uv), // %0
@@ -1178,11 +1303,13 @@ void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
asm volatile (
"vmov.u32 d6[0], %3 \n" // selector
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
"vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
"vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
"vtrn.u32 d4, d5 \n" // combine 8 pixels
+ MEMACCESS(1)
"vst1.8 {d4}, [%1]! \n" // store 8.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1198,8 +1325,10 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
uint32 /*selector*/, int pix) {
asm volatile (
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop
+ MEMACCESS(1)
"vst1.8 {d1}, [%1]! \n" // store 8 G's.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1214,12 +1343,15 @@ void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
const uint8* shuffler, int pix) {
asm volatile (
+ MEMACCESS(3)
"vld1.8 {q2}, [%3] \n" // shuffler
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 4 pixels.
"subs %2, %2, #4 \n" // 4 processed per loop
"vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
"vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+ MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store 4.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1237,10 +1369,14 @@ void I422ToYUY2Row_NEON(const uint8* src_y,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
+ MEMACCESS(1)
"vld1.8 {d1}, [%1]! \n" // load 8 Us
+ MEMACCESS(2)
"vld1.8 {d3}, [%2]! \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
+ MEMACCESS(3)
"vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -1260,10 +1396,14 @@ void I422ToUYVYRow_NEON(const uint8* src_y,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
+ MEMACCESS(1)
"vld1.8 {d0}, [%1]! \n" // load 8 Us
+ MEMACCESS(2)
"vld1.8 {d2}, [%2]! \n" // load 8 Vs
"subs %4, %4, #16 \n" // 16 pixels
+ MEMACCESS(3)
"vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
"bgt 1b \n"
: "+r"(src_y), // %0
@@ -1280,9 +1420,11 @@ void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTORGB565
+ MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1298,9 +1440,11 @@ void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB1555
+ MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1317,9 +1461,11 @@ void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
"vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGBTOARGB4444
+ MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1338,6 +1484,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"vmov.u8 d27, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
@@ -1345,6 +1492,7 @@ void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1362,12 +1510,14 @@ void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
"vmlal.u8 q2, d1, d25 \n" // G
"vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1390,6 +1540,7 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
@@ -1405,7 +1556,9 @@ void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1429,7 +1582,9 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
@@ -1450,7 +1605,9 @@ void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1475,12 +1632,16 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(0)
"vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(0)
"vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels.
"vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1508,7 +1669,9 @@ void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
"vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ MEMACCESS(2)
"vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1547,12 +1710,16 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1564,7 +1731,9 @@ void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1591,12 +1760,16 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1608,7 +1781,9 @@ void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -1634,12 +1809,16 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
"vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
+ MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
"vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
@@ -1651,7 +1830,9 @@ void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q3, q2, q1)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_bgra), // %0
@@ -1677,12 +1858,16 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
+ MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
"vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1694,7 +1879,9 @@ void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q2, q1, q0)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_abgr), // %0
@@ -1720,12 +1907,16 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
"vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
+ MEMACCESS(1)
"vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
"vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
@@ -1737,7 +1928,9 @@ void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_rgba), // %0
@@ -1763,12 +1956,16 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
+ MEMACCESS(0)
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ MEMACCESS(1)
"vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
"vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1780,7 +1977,9 @@ void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_rgb24), // %0
@@ -1806,12 +2005,16 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
+ MEMACCESS(0)
"vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
"vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ MEMACCESS(1)
"vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
"vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
@@ -1823,7 +2026,9 @@ void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
"subs %4, %4, #16 \n" // 32 processed per loop.
RGBTOUV(q2, q1, q0)
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_raw), // %0
@@ -1850,22 +2055,26 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
RGB565TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
RGB565TOARGB
"vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
RGB565TOARGB
"vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
RGB565TOARGB
"vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
@@ -1887,7 +2096,9 @@ void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
"vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_rgb565), // %0
@@ -1914,22 +2125,26 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
"vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
RGB555TOARGB
"vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
RGB555TOARGB
"vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
@@ -1951,7 +2166,9 @@ void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
"vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb1555), // %0
@@ -1978,22 +2195,26 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
"vmov.u16 q15, #0x8080 \n" // 128.5
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
"vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
"vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
"vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
"vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
ARGB4444TOARGB
"vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
"vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
"vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
ARGB4444TOARGB
"vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
@@ -2015,7 +2236,9 @@ void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
"vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
"vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
"vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
"vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
"bgt 1b \n"
: "+r"(src_argb4444), // %0
@@ -2037,6 +2260,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
"vmov.u8 d27, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
RGB565TOARGB
@@ -2045,6 +2269,7 @@ void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
"vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_rgb565), // %0
@@ -2063,6 +2288,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
"vmov.u8 d27, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB1555TOARGB
@@ -2071,6 +2297,7 @@ void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
"vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_argb1555), // %0
@@ -2089,6 +2316,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
"vmov.u8 d27, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
ARGB4444TOARGB
@@ -2097,6 +2325,7 @@ void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
"vmlal.u8 q2, d2, d26 \n" // R
"vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_argb4444), // %0
@@ -2115,6 +2344,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
"vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d1, d4 \n" // R
@@ -2122,6 +2352,7 @@ void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
"vmlal.u8 q8, d3, d6 \n" // B
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_bgra), // %0
@@ -2140,6 +2371,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
"vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // R
@@ -2147,6 +2379,7 @@ void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
"vmlal.u8 q8, d2, d6 \n" // B
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_abgr), // %0
@@ -2165,6 +2398,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
"vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d1, d4 \n" // B
@@ -2172,6 +2406,7 @@ void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
"vmlal.u8 q8, d3, d6 \n" // R
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_rgba), // %0
@@ -2190,6 +2425,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
"vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // B
@@ -2197,6 +2433,7 @@ void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
"vmlal.u8 q8, d2, d6 \n" // R
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_rgb24), // %0
@@ -2215,6 +2452,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
"vmov.u8 d7, #16 \n" // Add 16 constant
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q8, d0, d4 \n" // B
@@ -2222,6 +2460,7 @@ void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
"vmlal.u8 q8, d2, d6 \n" // R
"vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
"vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
"vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
"bgt 1b \n"
: "+r"(src_raw), // %0
@@ -2252,7 +2491,9 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
@@ -2261,46 +2502,58 @@ void InterpolateRow_NEON(uint8* dst_ptr,
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
+ MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n"
+ MEMACCESS(2)
"vld1.8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
"subs %3, %3, #16 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 100b \n"
@@ -2323,7 +2576,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"blt 89f \n"
// Blend 8 pixels.
"8: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
+ MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q10, d4, d3 \n" // db * a
@@ -2337,6 +2592,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"vqadd.u8 q0, q0, q2 \n" // + sbg
"vqadd.u8 d2, d2, d6 \n" // + sr
"vmov.u8 d3, #255 \n" // a = 255
+ MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
"bge 8b \n"
@@ -2346,7 +2602,9 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// Blend 1 pixels.
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
+ MEMACCESS(1)
"vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
"subs %3, %3, #1 \n" // 1 processed per loop.
"vmull.u8 q10, d4, d3 \n" // db * a
@@ -2360,6 +2618,7 @@ void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"vqadd.u8 q0, q0, q2 \n" // + sbg
"vqadd.u8 d2, d2, d6 \n" // + sr
"vmov.u8 d3, #255 \n" // a = 255
+ MEMACCESS(2)
"vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
"bge 1b \n"
@@ -2379,6 +2638,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
asm volatile (
// Attenuate 8 pixels.
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q10, d0, d3 \n" // b * a
@@ -2387,6 +2647,7 @@ void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
"vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
"vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
+ MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -2410,6 +2671,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
"subs %1, %1, #8 \n" // 8 processed per loop.
"vmovl.u8 q0, d0 \n" // b (0 .. 255)
@@ -2427,6 +2689,7 @@ void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
"vqmovn.u16 d0, q0 \n"
"vqmovn.u16 d2, q1 \n"
"vqmovn.u16 d4, q2 \n"
+ MEMACCESS(0)
"vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(dst_argb), // %0
@@ -2451,6 +2714,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmovl.u8 q10, d20 \n" // b (0 .. 255)
@@ -2465,6 +2729,7 @@ void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
"vqmovn.u16 d22, q11 \n"
"vqmovn.u16 d24, q12 \n"
"vqmovn.u16 d26, q13 \n"
+ MEMACCESS(1)
"vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -2485,6 +2750,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d24 \n" // B
@@ -2493,6 +2759,7 @@ void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
"vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
"vmov d1, d0 \n" // G
"vmov d2, d0 \n" // R
+ MEMACCESS(1)
"vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -2520,6 +2787,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"vmov.u8 d30, #50 \n" // BR coefficient
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
"subs %1, %1, #8 \n" // 8 processed per loop.
"vmull.u8 q2, d0, d20 \n" // B to Sepia B
@@ -2534,6 +2802,7 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
"vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
"vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
"vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+ MEMACCESS(0)
"vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(dst_argb), // %0
@@ -2550,12 +2819,14 @@ void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
const int8* matrix_argb, int width) {
asm volatile (
+ MEMACCESS(3)
"vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
"vmovl.s8 q0, d4 \n" // B,G coefficients s16.
"vmovl.s8 q1, d5 \n" // R,A coefficients s16.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
"subs %2, %2, #8 \n" // 8 processed per loop.
"vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
@@ -2594,6 +2865,7 @@ void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
"vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
"vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
"vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
+ MEMACCESS(1)
"vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -2614,7 +2886,9 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
"vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vmull.u8 q0, d0, d1 \n" // multiply B
@@ -2625,6 +2899,7 @@ void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
"vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
"vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
"vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
@@ -2645,11 +2920,14 @@ void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 q0, q0, q2 \n" // add B, G
"vqadd.u8 q1, q1, q3 \n" // add R, A
+ MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
@@ -2669,11 +2947,14 @@ void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
"vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqsub.u8 q0, q0, q2 \n" // subtract B, G
"vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
@@ -2698,12 +2979,15 @@ void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ MEMACCESS(1)
"vld1.8 {d1}, [%1]! \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 d0, d0, d1 \n" // add
"vmov.u8 d1, d0 \n"
"vmov.u8 d2, d0 \n"
+ MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_sobelx), // %0
@@ -2722,10 +3006,13 @@ void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
// 16 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n" // load 16 sobely.
"subs %3, %3, #16 \n" // 16 processed per loop.
"vqadd.u8 q0, q0, q1 \n" // add
+ MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n" // store 16 pixels.
"bgt 1b \n"
: "+r"(src_sobelx), // %0
@@ -2749,10 +3036,13 @@ void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
// 8 pixel loop.
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ MEMACCESS(1)
"vld1.8 {d0}, [%1]! \n" // load 8 sobely.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vqadd.u8 d1, d0, d2 \n" // add
+ MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
"bgt 1b \n"
: "+r"(src_sobelx), // %0
@@ -2773,21 +3063,28 @@ void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d0}, [%0],%5 \n" // top
+ MEMACCESS(0)
"vld1.8 {d1}, [%0],%6 \n"
"vsubl.u8 q0, d0, d1 \n"
+ MEMACCESS(1)
"vld1.8 {d2}, [%1],%5 \n" // center * 2
+ MEMACCESS(1)
"vld1.8 {d3}, [%1],%6 \n"
"vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n"
"vadd.s16 q0, q0, q1 \n"
+ MEMACCESS(2)
"vld1.8 {d2}, [%2],%5 \n" // bottom
+ MEMACCESS(2)
"vld1.8 {d3}, [%2],%6 \n"
"subs %4, %4, #8 \n" // 8 pixels
"vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n"
"vabs.s16 q0, q0 \n"
"vqmovn.u16 d0, q0 \n"
+ MEMACCESS(3)
"vst1.8 {d0}, [%3]! \n" // store 8 sobelx
"bgt 1b \n"
: "+r"(src_y0), // %0
@@ -2810,21 +3107,28 @@ void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d0}, [%0],%4 \n" // left
+ MEMACCESS(1)
"vld1.8 {d1}, [%1],%4 \n"
"vsubl.u8 q0, d0, d1 \n"
+ MEMACCESS(0)
"vld1.8 {d2}, [%0],%4 \n" // center * 2
+ MEMACCESS(1)
"vld1.8 {d3}, [%1],%4 \n"
"vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n"
"vadd.s16 q0, q0, q1 \n"
+ MEMACCESS(0)
"vld1.8 {d2}, [%0],%5 \n" // right
+ MEMACCESS(1)
"vld1.8 {d3}, [%1],%5 \n"
"subs %3, %3, #8 \n" // 8 pixels
"vsubl.u8 q1, d2, d3 \n"
"vadd.s16 q0, q0, q1 \n"
"vabs.s16 q0, q0 \n"
"vqmovn.u16 d0, q0 \n"
+ MEMACCESS(2)
"vst1.8 {d0}, [%2]! \n" // store 8 sobely
"bgt 1b \n"
: "+r"(src_y0), // %0
diff --git a/third_party/libyuv/source/row_neon64.cc b/third_party/libyuv/source/row_neon64.cc
new file mode 100644
index 000000000..46e9ceb33
--- /dev/null
+++ b/third_party/libyuv/source/row_neon64.cc
@@ -0,0 +1,3323 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+// This module is for GCC Neon
+#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+// Read 8 Y, 4 U and 4 V from 422
+#define READYUV422 \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
+ "vld1.32 {d2[0]}, [%1]! \n" \
+ MEMACCESS(2) \
+ "vld1.32 {d2[1]}, [%2]! \n"
+
+// Read 8 Y, 2 U and 2 V from 422
+#define READYUV411 \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
+ "vld1.16 {d2[0]}, [%1]! \n" \
+ MEMACCESS(2) \
+ "vld1.16 {d2[1]}, [%2]! \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vzip.u8 d2, d3 \n"
+
+// Read 8 Y, 8 U and 8 V from 444
+#define READYUV444 \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
+ "vld1.8 {d2}, [%1]! \n" \
+ MEMACCESS(2) \
+ "vld1.8 {d3}, [%2]! \n" \
+ "vpaddl.u8 q1, q1 \n" \
+ "vrshrn.u16 d2, q1, #1 \n"
+
+// Read 8 Y, and set 4 U and 4 V to 128
+#define READYUV400 \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
+ "vmov.u8 d2, #128 \n"
+
+// Read 8 Y and 4 UV from NV12
+#define READNV12 \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
+ "vld1.8 {d2}, [%1]! \n" \
+ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+// Read 8 Y and 4 VU from NV21
+#define READNV21 \
+ MEMACCESS(0) \
+ "vld1.8 {d0}, [%0]! \n" \
+ MEMACCESS(1) \
+ "vld1.8 {d2}, [%1]! \n" \
+ "vmov.u8 d3, d2 \n"/* split odd/even uv apart */\
+ "vuzp.u8 d3, d2 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+// Read 8 YUY2
+#define READYUY2 \
+ MEMACCESS(0) \
+ "vld2.8 {d0, d2}, [%0]! \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+// Read 8 UYVY
+#define READUYVY \
+ MEMACCESS(0) \
+ "vld2.8 {d2, d3}, [%0]! \n" \
+ "vmov.u8 d0, d3 \n" \
+ "vmov.u8 d3, d2 \n" \
+ "vuzp.u8 d2, d3 \n" \
+ "vtrn.u32 d2, d3 \n"
+
+#define YUV422TORGB \
+ "veor.u8 d2, d26 \n"/*subtract 128 from u and v*/\
+ "vmull.s8 q8, d2, d24 \n"/* u/v B/R component */\
+ "vmull.s8 q9, d2, d25 \n"/* u/v G component */\
+ "vmov.u8 d1, #0 \n"/* split odd/even y apart */\
+ "vtrn.u8 d0, d1 \n" \
+ "vsub.s16 q0, q0, q15 \n"/* offset y */\
+ "vmul.s16 q0, q0, q14 \n" \
+ "vadd.s16 d18, d19 \n" \
+ "vqadd.s16 d20, d0, d16 \n" /* B */ \
+ "vqadd.s16 d21, d1, d16 \n" \
+ "vqadd.s16 d22, d0, d17 \n" /* R */ \
+ "vqadd.s16 d23, d1, d17 \n" \
+ "vqadd.s16 d16, d0, d18 \n" /* G */ \
+ "vqadd.s16 d17, d1, d18 \n" \
+ "vqshrun.s16 d0, q10, #6 \n" /* B */ \
+ "vqshrun.s16 d1, q11, #6 \n" /* G */ \
+ "vqshrun.s16 d2, q8, #6 \n" /* R */ \
+ "vmovl.u8 q10, d0 \n"/* set up for reinterleave*/\
+ "vmovl.u8 q11, d1 \n" \
+ "vmovl.u8 q8, d2 \n" \
+ "vtrn.u8 d20, d21 \n" \
+ "vtrn.u8 d22, d23 \n" \
+ "vtrn.u8 d16, d17 \n" \
+ "vmov.u8 d21, d16 \n"
+
+static vec8 kUVToRB = { 127, 127, 127, 127, 102, 102, 102, 102,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+static vec8 kUVToG = { -25, -25, -25, -25, -52, -52, -52, -52,
+ 0, 0, 0, 0, 0, 0, 0, 0 };
+
+#ifdef HAS_I444TOARGBROW_NEON
+void I444ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ MEMACCESS(5)
+ "vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV444
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I444TOARGBROW_NEON
+
+#ifdef HAS_I422TOARGBROW_NEON
+void I422ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ MEMACCESS(5)
+ "vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TOARGBROW_NEON
+
+#ifdef HAS_I411TOARGBROW_NEON
+void I411ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ MEMACCESS(5)
+ "vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV411
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I411TOARGBROW_NEON
+
+#ifdef HAS_I422TOBGRAROW_NEON
+void I422ToBGRARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_bgra,
+ int width) {
+ asm volatile (
+ MEMACCESS(5)
+ "vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ "vmov.u8 d19, #255 \n"
+ MEMACCESS(3)
+ "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_bgra), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TOBGRAROW_NEON
+
+#ifdef HAS_I422TOABGRROW_NEON
+void I422ToABGRRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_abgr,
+ int width) {
+ asm volatile (
+ MEMACCESS(5)
+ "vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ "vmov.u8 d23, #255 \n"
+ MEMACCESS(3)
+ "vst4.8 {d20, d21, d22, d23}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_abgr), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TOABGRROW_NEON
+
+#ifdef HAS_I422TORGBAROW_NEON
+void I422ToRGBARow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgba,
+ int width) {
+ asm volatile (
+ MEMACCESS(5)
+ "vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d19, #255 \n"
+ MEMACCESS(3)
+ "vst4.8 {d19, d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgba), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TORGBAROW_NEON
+
+#ifdef HAS_I422TORGB24ROW_NEON
+void I422ToRGB24Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb24,
+ int width) {
+ asm volatile (
+ MEMACCESS(5)
+ "vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ MEMACCESS(3)
+ "vst3.8 {d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb24), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TORGB24ROW_NEON
+
+#ifdef HAS_I422TORAWROW_NEON
+void I422ToRAWRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_raw,
+ int width) {
+ asm volatile (
+ MEMACCESS(5)
+ "vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vswp.u8 d20, d22 \n"
+ MEMACCESS(3)
+ "vst3.8 {d20, d21, d22}, [%3]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_raw), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TORAWROW_NEON
+
+#define ARGBTORGB565 \
+ "vshr.u8 d20, d20, #3 \n" /* B */ \
+ "vshr.u8 d21, d21, #2 \n" /* G */ \
+ "vshr.u8 d22, d22, #3 \n" /* R */ \
+ "vmovl.u8 q8, d20 \n" /* B */ \
+ "vmovl.u8 q9, d21 \n" /* G */ \
+ "vmovl.u8 q10, d22 \n" /* R */ \
+ "vshl.u16 q9, q9, #5 \n" /* G */ \
+ "vshl.u16 q10, q10, #11 \n" /* R */ \
+ "vorr q0, q8, q9 \n" /* BG */ \
+ "vorr q0, q0, q10 \n" /* BGR */
+
+#ifdef HAS_I422TORGB565ROW_NEON
+void I422ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_rgb565,
+ int width) {
+ asm volatile (
+ MEMACCESS(5)
+ "vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ ARGBTORGB565
+ MEMACCESS(3)
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_rgb565), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TORGB565ROW_NEON
+
+#define ARGBTOARGB1555 \
+ "vshr.u8 q10, q10, #3 \n" /* B */ \
+ "vshr.u8 d22, d22, #3 \n" /* R */ \
+ "vshr.u8 d23, d23, #7 \n" /* A */ \
+ "vmovl.u8 q8, d20 \n" /* B */ \
+ "vmovl.u8 q9, d21 \n" /* G */ \
+ "vmovl.u8 q10, d22 \n" /* R */ \
+ "vmovl.u8 q11, d23 \n" /* A */ \
+ "vshl.u16 q9, q9, #5 \n" /* G */ \
+ "vshl.u16 q10, q10, #10 \n" /* R */ \
+ "vshl.u16 q11, q11, #15 \n" /* A */ \
+ "vorr q0, q8, q9 \n" /* BG */ \
+ "vorr q1, q10, q11 \n" /* RA */ \
+ "vorr q0, q0, q1 \n" /* BGRA */
+
+#ifdef HAS_I422TOARGB1555ROW_NEON
+void I422ToARGB1555Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb1555,
+ int width) {
+ asm volatile (
+ MEMACCESS(5)
+ "vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ ARGBTOARGB1555
+ MEMACCESS(3)
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB1555.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb1555), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TOARGB1555ROW_NEON
+
+#define ARGBTOARGB4444 \
+ "vshr.u8 d20, d20, #4 \n" /* B */ \
+ "vbic.32 d21, d21, d4 \n" /* G */ \
+ "vshr.u8 d22, d22, #4 \n" /* R */ \
+ "vbic.32 d23, d23, d4 \n" /* A */ \
+ "vorr d0, d20, d21 \n" /* BG */ \
+ "vorr d1, d22, d23 \n" /* RA */ \
+ "vzip.u8 d0, d1 \n" /* BGRA */
+
+#ifdef HAS_I422TOARGB4444ROW_NEON
+void I422ToARGB4444Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb4444,
+ int width) {
+ asm volatile (
+ MEMACCESS(5)
+ "vld1.8 {d24}, [%5] \n"
+ MEMACCESS(6)
+ "vld1.8 {d25}, [%6] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV422
+ YUV422TORGB
+ "subs %4, %4, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ ARGBTOARGB4444
+ MEMACCESS(3)
+ "vst1.8 {q0}, [%3]! \n" // store 8 pixels ARGB4444.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_argb4444), // %3
+ "+r"(width) // %4
+ : "r"(&kUVToRB), // %5
+ "r"(&kUVToG) // %6
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_I422TOARGB4444ROW_NEON
+
+#ifdef HAS_YTOARGBROW_NEON
+void YToARGBRow_NEON(const uint8* src_y,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ MEMACCESS(3)
+ "vld1.8 {d24}, [%3] \n"
+ MEMACCESS(4)
+ "vld1.8 {d25}, [%4] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUV400
+ YUV422TORGB
+ "subs %2, %2, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ MEMACCESS(1)
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kUVToRB), // %3
+ "r"(&kUVToG) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_YTOARGBROW_NEON
+
+#ifdef HAS_I400TOARGBROW_NEON
+void I400ToARGBRow_NEON(const uint8* src_y,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "vmov.u8 d23, #255 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d20}, [%0]! \n"
+ "vmov d21, d20 \n"
+ "vmov d22, d20 \n"
+ "subs %2, %2, #8 \n"
+ MEMACCESS(1)
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "d20", "d21", "d22", "d23"
+ );
+}
+#endif // HAS_I400TOARGBROW_NEON
+
+#ifdef HAS_NV12TOARGBROW_NEON
+void NV12ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ MEMACCESS(4)
+ "vld1.8 {d24}, [%4] \n"
+ MEMACCESS(5)
+ "vld1.8 {d25}, [%5] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READNV12
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ MEMACCESS(2)
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : "r"(&kUVToRB), // %4
+ "r"(&kUVToG) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_NV12TOARGBROW_NEON
+
+#ifdef HAS_NV21TOARGBROW_NEON
+void NV21ToARGBRow_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ MEMACCESS(4)
+ "vld1.8 {d24}, [%4] \n"
+ MEMACCESS(5)
+ "vld1.8 {d25}, [%5] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READNV21
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ MEMACCESS(2)
+ "vst4.8 {d20, d21, d22, d23}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ : "r"(&kUVToRB), // %4
+ "r"(&kUVToG) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_NV21TOARGBROW_NEON
+
+#ifdef HAS_NV12TORGB565ROW_NEON
+void NV12ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ int width) {
+ asm volatile (
+ MEMACCESS(4)
+ "vld1.8 {d24}, [%4] \n"
+ MEMACCESS(5)
+ "vld1.8 {d25}, [%5] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READNV12
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ ARGBTORGB565
+ MEMACCESS(2)
+ "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : "r"(&kUVToRB), // %4
+ "r"(&kUVToG) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_NV12TORGB565ROW_NEON
+
+#ifdef HAS_NV21TORGB565ROW_NEON
+void NV21ToRGB565Row_NEON(const uint8* src_y,
+ const uint8* src_uv,
+ uint8* dst_rgb565,
+ int width) {
+ asm volatile (
+ MEMACCESS(4)
+ "vld1.8 {d24}, [%4] \n"
+ MEMACCESS(5)
+ "vld1.8 {d25}, [%5] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READNV21
+ YUV422TORGB
+ "subs %3, %3, #8 \n"
+ ARGBTORGB565
+ MEMACCESS(2)
+ "vst1.8 {q0}, [%2]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_uv), // %1
+ "+r"(dst_rgb565), // %2
+ "+r"(width) // %3
+ : "r"(&kUVToRB), // %4
+ "r"(&kUVToG) // %5
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_NV21TORGB565ROW_NEON
+
+#ifdef HAS_YUY2TOARGBROW_NEON
+void YUY2ToARGBRow_NEON(const uint8* src_yuy2,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ MEMACCESS(3)
+ "vld1.8 {d24}, [%3] \n"
+ MEMACCESS(4)
+ "vld1.8 {d25}, [%4] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READYUY2
+ YUV422TORGB
+ "subs %2, %2, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ MEMACCESS(1)
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kUVToRB), // %3
+ "r"(&kUVToG) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_YUY2TOARGBROW_NEON
+
+#ifdef HAS_UYVYTOARGBROW_NEON
+void UYVYToARGBRow_NEON(const uint8* src_uyvy,
+ uint8* dst_argb,
+ int width) {
+ asm volatile (
+ MEMACCESS(3)
+ "vld1.8 {d24}, [%3] \n"
+ MEMACCESS(4)
+ "vld1.8 {d25}, [%4] \n"
+ "vmov.u8 d26, #128 \n"
+ "vmov.u16 q14, #74 \n"
+ "vmov.u16 q15, #16 \n"
+ ".p2align 2 \n"
+ "1: \n"
+ READUYVY
+ YUV422TORGB
+ "subs %2, %2, #8 \n"
+ "vmov.u8 d23, #255 \n"
+ MEMACCESS(1)
+ "vst4.8 {d20, d21, d22, d23}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(&kUVToRB), // %3
+ "r"(&kUVToG) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_UYVYTOARGBROW_NEON
+
+// Reads 16 pairs of UV and write even values to dst_u and odd to dst_v.
+#ifdef HAS_SPLITUVROW_NEON
+void SplitUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pairs of UV
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n" // store U
+ MEMACCESS(2)
+ "vst1.8 {q1}, [%2]! \n" // store V
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+#endif // HAS_SPLITUVROW_NEON
+
+// Reads 16 U's and V's and writes out 16 pairs of UV.
+#ifdef HAS_MERGEUVROW_NEON
+void MergeUVRow_NEON(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+ int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load U
+ MEMACCESS(1)
+ "vld1.8 {q1}, [%1]! \n" // load V
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ MEMACCESS(2)
+ "vst2.u8 {q0, q1}, [%2]! \n" // store 16 pairs of UV
+ "bgt 1b \n"
+ :
+ "+r"(src_u), // %0
+ "+r"(src_v), // %1
+ "+r"(dst_uv), // %2
+ "+r"(width) // %3 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+#endif // HAS_MERGEUVROW_NEON
+
+// Copy multiple of 32. vld4.8 allow unaligned and is fastest on a15.
+#ifdef HAS_COPYROW_NEON
+void CopyRow_NEON(const uint8* src, uint8* dst, int count) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d0, d1, d2, d3}, [%0]! \n" // load 32
+ "subs %2, %2, #32 \n" // 32 processed per loop
+ MEMACCESS(1)
+ "vst1.8 {d0, d1, d2, d3}, [%1]! \n" // store 32
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(count) // %2 // Output registers
+ : // Input registers
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+#endif // HAS_COPYROW_NEON
+
+// SetRow8 writes 'count' bytes using a 32 bit value repeated.
+#ifdef HAS_SETROW_NEON
+void SetRow_NEON(uint8* dst, uint32 v32, int count) {
+ asm volatile (
+ "vdup.u32 q0, %2 \n" // duplicate 4 ints
+ "1: \n"
+ "subs %1, %1, #16 \n" // 16 bytes per loop
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n" // store
+ "bgt 1b \n"
+ : "+r"(dst), // %0
+ "+r"(count) // %1
+ : "r"(v32) // %2
+ : "cc", "memory", "q0"
+ );
+}
+#endif // HAS_SETROW_NEON
+
+// TODO(fbarchard): Make fully assembler
+// SetRow32 writes 'count' words using a 32 bit value repeated.
+#ifdef HAS_ARGBSETROWS_NEON
+void ARGBSetRows_NEON(uint8* dst, uint32 v32, int width,
+ int dst_stride, int height) {
+ for (int y = 0; y < height; ++y) {
+ SetRow_NEON(dst, v32, width << 2);
+ dst += dst_stride;
+ }
+}
+#endif // HAS_ARGBSETROWS_NEON
+
+#ifdef HAS_MIRRORROW_NEON
+void MirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ // Start at end of source row.
+ "mov r3, #-16 \n"
+ "add %0, %0, %2 \n"
+ "sub %0, #16 \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0], r3 \n" // src -= 16
+ "subs %2, #16 \n" // 16 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ MEMACCESS(1)
+ "vst1.8 {d1}, [%1]! \n" // dst += 16
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "r3", "q0"
+ );
+}
+#endif // HAS_MIRRORROW_NEON
+
+#ifdef HAS_MIRRORUVROW_NEON
+void MirrorUVRow_NEON(const uint8* src_uv, uint8* dst_u, uint8* dst_v,
+ int width) {
+ asm volatile (
+ // Start at end of source row.
+ "mov r12, #-16 \n"
+ "add %0, %0, %3, lsl #1 \n"
+ "sub %0, #16 \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld2.8 {d0, d1}, [%0], r12 \n" // src -= 16
+ "subs %3, #8 \n" // 8 pixels per loop.
+ "vrev64.8 q0, q0 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // dst += 8
+ MEMACCESS(2)
+ "vst1.8 {d1}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "r12", "q0"
+ );
+}
+#endif // HAS_MIRRORUVROW_NEON
+
+#ifdef HAS_ARGBMIRRORROW_NEON
+void ARGBMirrorRow_NEON(const uint8* src, uint8* dst, int width) {
+ asm volatile (
+ // Start at end of source row.
+ "mov r3, #-16 \n"
+ "add %0, %0, %2, lsl #2 \n"
+ "sub %0, #16 \n"
+
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0], r3 \n" // src -= 16
+ "subs %2, #4 \n" // 4 pixels per loop.
+ "vrev64.32 q0, q0 \n"
+ MEMACCESS(1)
+ "vst1.8 {d1}, [%1]! \n" // dst += 16
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n"
+ "bgt 1b \n"
+ : "+r"(src), // %0
+ "+r"(dst), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "r3", "q0"
+ );
+}
+#endif // HAS_ARGBMIRRORROW_NEON
+
+#ifdef HAS_RGB24TOARGBROW_NEON
+void RGB24ToARGBRow_NEON(const uint8* src_rgb24, uint8* dst_argb, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ MEMACCESS(1)
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+#endif // HAS_RGB24TOARGBROW_NEON
+
+#ifdef HAS_RAWTOARGBROW_NEON
+void RAWToARGBRow_NEON(const uint8* src_raw, uint8* dst_argb, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld3.8 {d1, d2, d3}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ MEMACCESS(1)
+ "vst4.8 {d1, d2, d3, d4}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+#endif // HAS_RAWTOARGBROW_NEON
+
+#define RGB565TOARGB \
+ "vshrn.u16 d6, q0, #5 \n" /* G xxGGGGGG */ \
+ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB RRRRRxxx */ \
+ "vshl.u8 d6, d6, #2 \n" /* G GGGGGG00 upper 6 */ \
+ "vshr.u8 d1, d1, #3 \n" /* R 000RRRRR lower 5 */ \
+ "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
+ "vorr.u8 d0, d0, d4 \n" /* B */ \
+ "vshr.u8 d4, d6, #6 \n" /* G 000000GG lower 2 */ \
+ "vorr.u8 d2, d1, d5 \n" /* R */ \
+ "vorr.u8 d1, d4, d6 \n" /* G */
+
+#ifdef HAS_RGB565TOARGBROW_NEON
+void RGB565ToARGBRow_NEON(const uint8* src_rgb565, uint8* dst_argb, int pix) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ MEMACCESS(1)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+#endif // HAS_RGB565TOARGBROW_NEON
+
+#define ARGB1555TOARGB \
+ "vshrn.u16 d7, q0, #8 \n" /* A Arrrrrxx */ \
+ "vshr.u8 d6, d7, #2 \n" /* R xxxRRRRR */ \
+ "vshrn.u16 d5, q0, #5 \n" /* G xxxGGGGG */ \
+ "vmovn.u16 d4, q0 \n" /* B xxxBBBBB */ \
+ "vshr.u8 d7, d7, #7 \n" /* A 0000000A */ \
+ "vneg.s8 d7, d7 \n" /* A AAAAAAAA upper 8 */ \
+ "vshl.u8 d6, d6, #3 \n" /* R RRRRR000 upper 5 */ \
+ "vshr.u8 q1, q3, #5 \n" /* R,A 00000RRR lower 3 */ \
+ "vshl.u8 q0, q2, #3 \n" /* B,G BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,G 00000BBB lower 3 */ \
+ "vorr.u8 q1, q1, q3 \n" /* R,A */ \
+ "vorr.u8 q0, q0, q2 \n" /* B,G */ \
+
+// RGB555TOARGB is same as ARGB1555TOARGB but ignores alpha.
+#define RGB555TOARGB \
+ "vshrn.u16 d6, q0, #5 \n" /* G xxxGGGGG */ \
+ "vuzp.u8 d0, d1 \n" /* d0 xxxBBBBB xRRRRRxx */ \
+ "vshl.u8 d6, d6, #3 \n" /* G GGGGG000 upper 5 */ \
+ "vshr.u8 d1, d1, #2 \n" /* R 00xRRRRR lower 5 */ \
+ "vshl.u8 q0, q0, #3 \n" /* B,R BBBBB000 upper 5 */ \
+ "vshr.u8 q2, q0, #5 \n" /* B,R 00000BBB lower 3 */ \
+ "vorr.u8 d0, d0, d4 \n" /* B */ \
+ "vshr.u8 d4, d6, #5 \n" /* G 00000GGG lower 3 */ \
+ "vorr.u8 d2, d1, d5 \n" /* R */ \
+ "vorr.u8 d1, d4, d6 \n" /* G */
+
+#ifdef HAS_ARGB1555TOARGBROW_NEON
+void ARGB1555ToARGBRow_NEON(const uint8* src_argb1555, uint8* dst_argb,
+ int pix) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ MEMACCESS(1)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+#endif // HAS_ARGB1555TOARGBROW_NEON
+
+#define ARGB4444TOARGB \
+ "vuzp.u8 d0, d1 \n" /* d0 BG, d1 RA */ \
+ "vshl.u8 q2, q0, #4 \n" /* B,R BBBB0000 */ \
+ "vshr.u8 q1, q0, #4 \n" /* G,A 0000GGGG */ \
+ "vshr.u8 q0, q2, #4 \n" /* B,R 0000BBBB */ \
+ "vorr.u8 q0, q0, q2 \n" /* B,R BBBBBBBB */ \
+ "vshl.u8 q2, q1, #4 \n" /* G,A GGGG0000 */ \
+ "vorr.u8 q1, q1, q2 \n" /* G,A GGGGGGGG */ \
+ "vswp.u8 d1, d2 \n" /* B,R,G,A -> B,G,R,A */
+
+#ifdef HAS_ARGB4444TOARGBROW_NEON
+void ARGB4444ToARGBRow_NEON(const uint8* src_argb4444, uint8* dst_argb,
+ int pix) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // Alpha
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ MEMACCESS(1)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+#endif // HAS_ARGB4444TOARGBROW_NEON
+
+#ifdef HAS_ARGBTORGB24ROW_NEON
+void ARGBToRGB24Row_NEON(const uint8* src_argb, uint8* dst_rgb24, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ MEMACCESS(1)
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RGB24.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb24), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+#endif // HAS_ARGBTORGB24ROW_NEON
+
+#ifdef HAS_ARGBTORAWROW_NEON
+void ARGBToRAWRow_NEON(const uint8* src_argb, uint8* dst_raw, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d1, d2, d3, d4}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vswp.u8 d1, d3 \n" // swap R, B
+ MEMACCESS(1)
+ "vst3.8 {d1, d2, d3}, [%1]! \n" // store 8 pixels of RAW.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_raw), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d1", "d2", "d3", "d4" // Clobber List
+ );
+}
+#endif // HAS_ARGBTORAWROW_NEON
+
+#ifdef HAS_YUY2TOYROW_NEON
+void YUY2ToYRow_NEON(const uint8* src_yuy2, uint8* dst_y, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+#endif // HAS_YUY2TOYROW_NEON
+
+#ifdef HAS_UYVYTOYROW_NEON
+void UYVYToYRow_NEON(const uint8* src_uyvy, uint8* dst_y, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld2.8 {q0, q1}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %2, %2, #16 \n" // 16 processed per loop.
+ MEMACCESS(1)
+ "vst1.8 {q1}, [%1]! \n" // store 16 pixels of Y.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+#endif // HAS_UYVYTOYROW_NEON
+
+#ifdef HAS_YUY2TOUV422ROW_NEON
+void YUY2ToUV422Row_NEON(const uint8* src_yuy2, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
+ "vst1.8 {d1}, [%1]! \n" // store 8 U.
+ MEMACCESS(2)
+ "vst1.8 {d3}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+#endif // HAS_YUY2TOUV422ROW_NEON
+
+#ifdef HAS_UYVYTOUV422ROW_NEON
+void UYVYToUV422Row_NEON(const uint8* src_uyvy, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %3, %3, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 U.
+ MEMACCESS(2)
+ "vst1.8 {d2}, [%2]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3" // Clobber List
+ );
+}
+#endif // HAS_UYVYTOUV422ROW_NEON
+
+#ifdef HAS_YUY2TOUVROW_NEON
+void YUY2ToUVRow_NEON(const uint8* src_yuy2, int stride_yuy2,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // stride + src_yuy2
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of YUY2.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row YUY2.
+ "vrhadd.u8 d1, d1, d5 \n" // average rows of U
+ "vrhadd.u8 d3, d3, d7 \n" // average rows of V
+ MEMACCESS(2)
+ "vst1.8 {d1}, [%2]! \n" // store 8 U.
+ MEMACCESS(3)
+ "vst1.8 {d3}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_yuy2), // %0
+ "+r"(stride_yuy2), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ );
+}
+#endif // HAS_YUY2TOUVROW_NEON
+
+#ifdef HAS_UYVYTOUVROW_NEON
+void UYVYToUVRow_NEON(const uint8* src_uyvy, int stride_uyvy,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // stride + src_uyvy
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 16 pixels of UYVY.
+ "subs %4, %4, #16 \n" // 16 pixels = 8 UVs.
+ MEMACCESS(1)
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load next row UYVY.
+ "vrhadd.u8 d0, d0, d4 \n" // average rows of U
+ "vrhadd.u8 d2, d2, d6 \n" // average rows of V
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 U.
+ MEMACCESS(3)
+ "vst1.8 {d2}, [%3]! \n" // store 8 V.
+ "bgt 1b \n"
+ : "+r"(src_uyvy), // %0
+ "+r"(stride_uyvy), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" // Clobber List
+ );
+}
+#endif // HAS_UYVYTOUVROW_NEON
+
+#ifdef HAS_HALFROW_NEON
+void HalfRow_NEON(const uint8* src_uv, int src_uv_stride,
+ uint8* dst_uv, int pix) {
+ asm volatile (
+ // change the stride to row 2 pointer
+ "add %1, %0 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load row 1 16 pixels.
+ "subs %3, %3, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
+ "vld1.8 {q1}, [%1]! \n" // load row 2 16 pixels.
+ "vrhadd.u8 q0, q1 \n" // average row 1 and 2
+ MEMACCESS(2)
+ "vst1.8 {q0}, [%2]! \n"
+ "bgt 1b \n"
+ : "+r"(src_uv), // %0
+ "+r"(src_uv_stride), // %1
+ "+r"(dst_uv), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+#endif // HAS_HALFROW_NEON
+
+// Select 2 channels from ARGB on alternating pixels. e.g. BGBGBGBG
+#ifdef HAS_ARGBTOBAYERROW_NEON
+void ARGBToBayerRow_NEON(const uint8* src_argb, uint8* dst_bayer,
+ uint32 selector, int pix) {
+ asm volatile (
+ "vmov.u32 d6[0], %3 \n" // selector
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0, q1}, [%0]! \n" // load row 8 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ "vtbl.8 d4, {d0, d1}, d6 \n" // look up 4 pixels
+ "vtbl.8 d5, {d2, d3}, d6 \n" // look up 4 pixels
+ "vtrn.u32 d4, d5 \n" // combine 8 pixels
+ MEMACCESS(1)
+ "vst1.8 {d4}, [%1]! \n" // store 8.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_bayer), // %1
+ "+r"(pix) // %2
+ : "r"(selector) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3" // Clobber List
+ );
+}
+#endif // HAS_ARGBTOBAYERROW_NEON
+
+// Select G channels from ARGB. e.g. GGGGGGGG
+#ifdef HAS_ARGBTOBAYERGGROW_NEON
+void ARGBToBayerGGRow_NEON(const uint8* src_argb, uint8* dst_bayer,
+ uint32 /*selector*/, int pix) {
+ asm volatile (
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load row 8 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop
+ MEMACCESS(1)
+ "vst1.8 {d1}, [%1]! \n" // store 8 G's.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_bayer), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+#endif // HAS_ARGBTOBAYERGGROW_NEON
+
+// For BGRAToARGB, ABGRToARGB, RGBAToARGB, and ARGBToRGBA.
+#ifdef HAS_ARGBSHUFFLEROW_NEON
+void ARGBShuffleRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const uint8* shuffler, int pix) {
+ asm volatile (
+ MEMACCESS(3)
+ "vld1.8 {q2}, [%3] \n" // shuffler
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 4 pixels.
+ "subs %2, %2, #4 \n" // 4 processed per loop
+ "vtbl.8 d2, {d0, d1}, d4 \n" // look up 2 first pixels
+ "vtbl.8 d3, {d0, d1}, d5 \n" // look up 2 next pixels
+ MEMACCESS(1)
+ "vst1.8 {q1}, [%1]! \n" // store 4.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(pix) // %2
+ : "r"(shuffler) // %3
+ : "cc", "memory", "q0", "q1", "q2" // Clobber List
+ );
+}
+#endif // HAS_ARGBSHUFFLEROW_NEON
+
+#ifdef HAS_I422TOYUY2ROW_NEON
+void I422ToYUY2Row_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_yuy2, int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld2.8 {d0, d2}, [%0]! \n" // load 16 Ys
+ MEMACCESS(1)
+ "vld1.8 {d1}, [%1]! \n" // load 8 Us
+ MEMACCESS(2)
+ "vld1.8 {d3}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ MEMACCESS(3)
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 YUY2/16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_yuy2), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3"
+ );
+}
+#endif // HAS_I422TOYUY2ROW_NEON
+
+#ifdef HAS_I422TOUYVYROW_NEON
+void I422ToUYVYRow_NEON(const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_uyvy, int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld2.8 {d1, d3}, [%0]! \n" // load 16 Ys
+ MEMACCESS(1)
+ "vld1.8 {d0}, [%1]! \n" // load 8 Us
+ MEMACCESS(2)
+ "vld1.8 {d2}, [%2]! \n" // load 8 Vs
+ "subs %4, %4, #16 \n" // 16 pixels
+ MEMACCESS(3)
+ "vst4.8 {d0, d1, d2, d3}, [%3]! \n" // Store 8 UYVY/16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_y), // %0
+ "+r"(src_u), // %1
+ "+r"(src_v), // %2
+ "+r"(dst_uyvy), // %3
+ "+r"(width) // %4
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3"
+ );
+}
+#endif // HAS_I422TOUYVYROW_NEON
+
+#ifdef HAS_ARGBTORGB565ROW_NEON
+void ARGBToRGB565Row_NEON(const uint8* src_argb, uint8* dst_rgb565, int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTORGB565
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels RGB565.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_rgb565), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ );
+}
+#endif // HAS_ARGBTORGB565ROW_NEON
+
+#ifdef HAS_ARGBTOARGB1555ROW_NEON
+void ARGBToARGB1555Row_NEON(const uint8* src_argb, uint8* dst_argb1555,
+ int pix) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB1555
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB1555.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb1555), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ );
+}
+#endif // HAS_ARGBTOARGB1555ROW_NEON
+
+#ifdef HAS_ARGBTOARGB4444ROW_NEON
+void ARGBToARGB4444Row_NEON(const uint8* src_argb, uint8* dst_argb4444,
+ int pix) {
+ asm volatile (
+ "vmov.u8 d4, #0x0f \n" // bits to clear with vbic.
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d20, d21, d22, d23}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGBTOARGB4444
+ MEMACCESS(1)
+ "vst1.8 {q0}, [%1]! \n" // store 8 pixels ARGB4444.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb4444), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q8", "q9", "q10", "q11"
+ );
+}
+#endif // HAS_ARGBTOARGB4444ROW_NEON
+
+#ifdef HAS_ARGBTOYROW_NEON
+void ARGBToYRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ );
+}
+#endif // HAS_ARGBTOYROW_NEON
+
+#ifdef HAS_ARGBTOYJROW_NEON
+void ARGBToYJRow_NEON(const uint8* src_argb, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
+ "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
+ "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit Y
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ );
+}
+#endif // HAS_ARGBTOYJROW_NEON
+
+// 8x1 pixels.
+#ifdef HAS_ARGBTOUV444ROW_NEON
+void ARGBToUV444Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ "vmov.u8 d24, #112 \n" // UB / VR 0.875 coefficient
+ "vmov.u8 d25, #74 \n" // UG -0.5781 coefficient
+ "vmov.u8 d26, #38 \n" // UR -0.2969 coefficient
+ "vmov.u8 d27, #18 \n" // VB -0.1406 coefficient
+ "vmov.u8 d28, #94 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlsl.u8 q2, d1, d25 \n" // G
+ "vmlsl.u8 q2, d2, d26 \n" // R
+ "vadd.u16 q2, q2, q15 \n" // +128 -> unsigned
+
+ "vmull.u8 q3, d2, d24 \n" // R
+ "vmlsl.u8 q3, d1, d28 \n" // G
+ "vmlsl.u8 q3, d0, d27 \n" // B
+ "vadd.u16 q3, q3, q15 \n" // +128 -> unsigned
+
+ "vqshrn.u16 d0, q2, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q3, #8 \n" // 16 bit to 8 bit V
+
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ MEMACCESS(2)
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_ARGBTOUV444ROW_NEON
+
+// 16x1 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+#ifdef HAS_ARGBTOUV422ROW_NEON
+void ARGBToUV422Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q0, q10 \n" // B
+ "vmls.s16 q8, q1, q11 \n" // G
+ "vmls.s16 q8, q2, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+
+ "vmul.s16 q9, q2, q10 \n" // R
+ "vmls.s16 q9, q1, q14 \n" // G
+ "vmls.s16 q9, q0, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ MEMACCESS(2)
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_ARGBTOUV422ROW_NEON
+
+// 32x1 pixels -> 8x1. pix is number of argb pixels. e.g. 32.
+#ifdef HAS_ARGBTOUV411ROW_NEON
+void ARGBToUV411Row_NEON(const uint8* src_argb, uint8* dst_u, uint8* dst_v,
+ int pix) {
+ asm volatile (
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(0)
+ "vld4.8 {d8, d10, d12, d14}, [%0]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d9, d11, d13, d15}, [%0]! \n" // load last 8 ARGB pixels.
+ "vpaddl.u8 q4, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q5, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q6, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vpadd.u16 d0, d0, d1 \n" // B 16 shorts -> 8 shorts.
+ "vpadd.u16 d1, d8, d9 \n" // B
+ "vpadd.u16 d2, d2, d3 \n" // G 16 shorts -> 8 shorts.
+ "vpadd.u16 d3, d10, d11 \n" // G
+ "vpadd.u16 d4, d4, d5 \n" // R 16 shorts -> 8 shorts.
+ "vpadd.u16 d5, d12, d13 \n" // R
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %3, %3, #32 \n" // 32 processed per loop.
+ "vmul.s16 q8, q0, q10 \n" // B
+ "vmls.s16 q8, q1, q11 \n" // G
+ "vmls.s16 q8, q2, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q2, q10 \n" // R
+ "vmls.s16 q9, q1, q14 \n" // G
+ "vmls.s16 q9, q0, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels U.
+ MEMACCESS(2)
+ "vst1.8 {d1}, [%2]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_u), // %1
+ "+r"(dst_v), // %2
+ "+r"(pix) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_ARGBTOUV411ROW_NEON
+
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+#define RGBTOUV(QB, QG, QR) \
+ "vmul.s16 q8, " #QB ", q10 \n" /* B */ \
+ "vmls.s16 q8, " #QG ", q11 \n" /* G */ \
+ "vmls.s16 q8, " #QR ", q12 \n" /* R */ \
+ "vadd.u16 q8, q8, q15 \n" /* +128 -> unsigned */ \
+ "vmul.s16 q9, " #QR ", q10 \n" /* R */ \
+ "vmls.s16 q9, " #QG ", q14 \n" /* G */ \
+ "vmls.s16 q9, " #QB ", q13 \n" /* B */ \
+ "vadd.u16 q9, q9, q15 \n" /* +128 -> unsigned */ \
+ "vqshrn.u16 d0, q8, #8 \n" /* 16 bit to 8 bit U */ \
+ "vqshrn.u16 d1, q9, #8 \n" /* 16 bit to 8 bit V */
+
+// TODO(fbarchard): Consider vhadd vertical, then vpaddl horizontal, avoid shr.
+#ifdef HAS_ARGBTOUVROW_NEON
+void ARGBToUVRow_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(1)
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride_argb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_ARGBTOUVROW_NEON
+
+// TODO(fbarchard): Subsample match C code.
+#ifdef HAS_ARGBTOUVJROW_NEON
+void ARGBToUVJRow_NEON(const uint8* src_argb, int src_stride_argb,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #127 / 2 \n" // UB / VR 0.500 coefficient
+ "vmov.s16 q11, #84 / 2 \n" // UG -0.33126 coefficient
+ "vmov.s16 q12, #43 / 2 \n" // UR -0.16874 coefficient
+ "vmov.s16 q13, #20 / 2 \n" // VB -0.08131 coefficient
+ "vmov.s16 q14, #107 / 2 \n" // VG -0.41869 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(1)
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ARGB pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(src_stride_argb), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_ARGBTOUVJROW_NEON
+
+#ifdef HAS_BGRATOUVROW_NEON
+void BGRAToUVRow_NEON(const uint8* src_bgra, int src_stride_bgra,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_bgra
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 BGRA pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 BGRA pixels.
+ "vpaddl.u8 q3, q3 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more BGRA pixels.
+ MEMACCESS(1)
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 BGRA pixels.
+ "vpadal.u8 q3, q7 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q1, q1, #1 \n" // 2x average
+ "vrshr.u16 q2, q2, #1 \n"
+ "vrshr.u16 q3, q3, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q3, q2, q1)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(src_stride_bgra), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_BGRATOUVROW_NEON
+
+#ifdef HAS_ABGRTOUVROW_NEON
+void ABGRToUVRow_NEON(const uint8* src_abgr, int src_stride_abgr,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_abgr
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ABGR pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ABGR pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more ABGR pixels.
+ MEMACCESS(1)
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 ABGR pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q2, q1, q0)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(src_stride_abgr), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_ABGRTOUVROW_NEON
+
+#ifdef HAS_RGBATOUVROW_NEON
+void RGBAToUVRow_NEON(const uint8* src_rgba, int src_stride_rgba,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgba
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 RGBA pixels.
+ MEMACCESS(0)
+ "vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 RGBA pixels.
+ "vpaddl.u8 q0, q1 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q2 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q3 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld4.8 {d8, d10, d12, d14}, [%1]! \n" // load 8 more RGBA pixels.
+ MEMACCESS(1)
+ "vld4.8 {d9, d11, d13, d15}, [%1]! \n" // load last 8 RGBA pixels.
+ "vpadal.u8 q0, q5 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q6 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q7 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(src_stride_rgba), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_RGBATOUVROW_NEON
+
+#ifdef HAS_RGB24TOUVROW_NEON
+void RGB24ToUVRow_NEON(const uint8* src_rgb24, int src_stride_rgb24,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_rgb24
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RGB24 pixels.
+ MEMACCESS(0)
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RGB24 pixels.
+ "vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RGB24 pixels.
+ MEMACCESS(1)
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RGB24 pixels.
+ "vpadal.u8 q0, q4 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q2, q6 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q0, q1, q2)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(src_stride_rgb24), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_RGB24TOUVROW_NEON
+
+#ifdef HAS_RAWTOUVROW_NEON
+void RAWToUVRow_NEON(const uint8* src_raw, int src_stride_raw,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_raw
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld3.8 {d0, d2, d4}, [%0]! \n" // load 8 RAW pixels.
+ MEMACCESS(0)
+ "vld3.8 {d1, d3, d5}, [%0]! \n" // load next 8 RAW pixels.
+ "vpaddl.u8 q2, q2 \n" // B 16 bytes -> 8 shorts.
+ "vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
+ "vpaddl.u8 q0, q0 \n" // R 16 bytes -> 8 shorts.
+ MEMACCESS(1)
+ "vld3.8 {d8, d10, d12}, [%1]! \n" // load 8 more RAW pixels.
+ MEMACCESS(1)
+ "vld3.8 {d9, d11, d13}, [%1]! \n" // load last 8 RAW pixels.
+ "vpadal.u8 q2, q6 \n" // B 16 bytes -> 8 shorts.
+ "vpadal.u8 q1, q5 \n" // G 16 bytes -> 8 shorts.
+ "vpadal.u8 q0, q4 \n" // R 16 bytes -> 8 shorts.
+
+ "vrshr.u16 q0, q0, #1 \n" // 2x average
+ "vrshr.u16 q1, q1, #1 \n"
+ "vrshr.u16 q2, q2, #1 \n"
+
+ "subs %4, %4, #16 \n" // 32 processed per loop.
+ RGBTOUV(q2, q1, q0)
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(src_stride_raw), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_RAWTOUVROW_NEON
+
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+#ifdef HAS_RGB565TOUVROW_NEON
+void RGB565ToUVRow_NEON(const uint8* src_rgb565, int src_stride_rgb565,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n" // load 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n" // next 8 RGB565 pixels.
+ RGB565TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(src_stride_rgb565), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_RGB565TOUVROW_NEON
+
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+#ifdef HAS_ARGB1555TOUVROW_NEON
+void ARGB1555ToUVRow_NEON(const uint8* src_argb1555, int src_stride_argb1555,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB1555 pixels.
+ RGB555TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(src_stride_argb1555), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_ARGB1555TOUVROW_NEON
+
+// 16x2 pixels -> 8x1. pix is number of argb pixels. e.g. 16.
+#ifdef HAS_ARGB4444TOUVROW_NEON
+void ARGB4444ToUVRow_NEON(const uint8* src_argb4444, int src_stride_argb4444,
+ uint8* dst_u, uint8* dst_v, int pix) {
+ asm volatile (
+ "add %1, %0, %1 \n" // src_stride + src_argb
+ "vmov.s16 q10, #112 / 2 \n" // UB / VR 0.875 coefficient
+ "vmov.s16 q11, #74 / 2 \n" // UG -0.5781 coefficient
+ "vmov.s16 q12, #38 / 2 \n" // UR -0.2969 coefficient
+ "vmov.s16 q13, #18 / 2 \n" // VB -0.1406 coefficient
+ "vmov.s16 q14, #94 / 2 \n" // VG -0.7344 coefficient
+ "vmov.u16 q15, #0x8080 \n" // 128.5
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpaddl.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpaddl.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpaddl.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpaddl.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n" // load 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpadal.u8 d8, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d10, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d12, d2 \n" // R 8 bytes -> 4 shorts.
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n" // next 8 ARGB4444 pixels.
+ ARGB4444TOARGB
+ "vpadal.u8 d9, d0 \n" // B 8 bytes -> 4 shorts.
+ "vpadal.u8 d11, d1 \n" // G 8 bytes -> 4 shorts.
+ "vpadal.u8 d13, d2 \n" // R 8 bytes -> 4 shorts.
+
+ "vrshr.u16 q4, q4, #1 \n" // 2x average
+ "vrshr.u16 q5, q5, #1 \n"
+ "vrshr.u16 q6, q6, #1 \n"
+
+ "subs %4, %4, #16 \n" // 16 processed per loop.
+ "vmul.s16 q8, q4, q10 \n" // B
+ "vmls.s16 q8, q5, q11 \n" // G
+ "vmls.s16 q8, q6, q12 \n" // R
+ "vadd.u16 q8, q8, q15 \n" // +128 -> unsigned
+ "vmul.s16 q9, q6, q10 \n" // R
+ "vmls.s16 q9, q5, q14 \n" // G
+ "vmls.s16 q9, q4, q13 \n" // B
+ "vadd.u16 q9, q9, q15 \n" // +128 -> unsigned
+ "vqshrn.u16 d0, q8, #8 \n" // 16 bit to 8 bit U
+ "vqshrn.u16 d1, q9, #8 \n" // 16 bit to 8 bit V
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 pixels U.
+ MEMACCESS(3)
+ "vst1.8 {d1}, [%3]! \n" // store 8 pixels V.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(src_stride_argb4444), // %1
+ "+r"(dst_u), // %2
+ "+r"(dst_v), // %3
+ "+r"(pix) // %4
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
+ "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_ARGB4444TOUVROW_NEON
+
+#ifdef HAS_RGB565TOYROW_NEON
+void RGB565ToYRow_NEON(const uint8* src_rgb565, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 RGB565 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ RGB565TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb565), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ );
+}
+#endif // HAS_RGB565TOYROW_NEON
+
+#ifdef HAS_ARGB1555TOYROW_NEON
+void ARGB1555ToYRow_NEON(const uint8* src_argb1555, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB1555 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB1555TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb1555), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ );
+}
+#endif // HAS_ARGB1555TOYROW_NEON
+
+#ifdef HAS_ARGB4444TOYROW_NEON
+void ARGB4444ToYRow_NEON(const uint8* src_argb4444, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d24, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d25, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d26, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d27, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 8 ARGB4444 pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ ARGB4444TOARGB
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d27 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_argb4444), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q12", "q13"
+ );
+}
+#endif // HAS_ARGB4444TOYROW_NEON
+
+#ifdef HAS_BGRATOYROW_NEON
+void BGRAToYRow_NEON(const uint8* src_bgra, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of BGRA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // R
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // B
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_bgra), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+#endif // HAS_BGRATOYROW_NEON
+
+#ifdef HAS_ABGRTOYROW_NEON
+void ABGRToYRow_NEON(const uint8* src_abgr, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ABGR.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // R
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // B
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_abgr), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+#endif // HAS_ABGRTOYROW_NEON
+
+#ifdef HAS_RGBATOYROW_NEON
+void RGBAToYRow_NEON(const uint8* src_rgba, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of RGBA.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d1, d4 \n" // B
+ "vmlal.u8 q8, d2, d5 \n" // G
+ "vmlal.u8 q8, d3, d6 \n" // R
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgba), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+#endif // HAS_RGBATOYROW_NEON
+
+#ifdef HAS_RGB24TOYROW_NEON
+void RGB24ToYRow_NEON(const uint8* src_rgb24, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RGB24.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_rgb24), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+#endif // HAS_RGB24TOYROW_NEON
+
+#ifdef HAS_RAWTOYROW_NEON
+void RAWToYRow_NEON(const uint8* src_raw, uint8* dst_y, int pix) {
+ asm volatile (
+ "vmov.u8 d4, #33 \n" // R * 0.2578 coefficient
+ "vmov.u8 d5, #65 \n" // G * 0.5078 coefficient
+ "vmov.u8 d6, #13 \n" // B * 0.1016 coefficient
+ "vmov.u8 d7, #16 \n" // Add 16 constant
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld3.8 {d0, d1, d2}, [%0]! \n" // load 8 pixels of RAW.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q8, d0, d4 \n" // B
+ "vmlal.u8 q8, d1, d5 \n" // G
+ "vmlal.u8 q8, d2, d6 \n" // R
+ "vqrshrun.s16 d0, q8, #7 \n" // 16 bit to 8 bit Y
+ "vqadd.u8 d0, d7 \n"
+ MEMACCESS(1)
+ "vst1.8 {d0}, [%1]! \n" // store 8 pixels Y.
+ "bgt 1b \n"
+ : "+r"(src_raw), // %0
+ "+r"(dst_y), // %1
+ "+r"(pix) // %2
+ :
+ : "cc", "memory", "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "q8"
+ );
+}
+#endif // HAS_RAWTOYROW_NEON
+
+// Bilinear filter 16x2 -> 16x1
+#ifdef HAS_INTERPOLATEROW_NEON
+void InterpolateRow_NEON(uint8* dst_ptr,
+ const uint8* src_ptr, ptrdiff_t src_stride,
+ int dst_width, int source_y_fraction) {
+ asm volatile (
+ "cmp %4, #0 \n"
+ "beq 100f \n"
+ "add %2, %1 \n"
+ "cmp %4, #64 \n"
+ "beq 75f \n"
+ "cmp %4, #128 \n"
+ "beq 50f \n"
+ "cmp %4, #192 \n"
+ "beq 25f \n"
+
+ "vdup.8 d5, %4 \n"
+ "rsb %4, #256 \n"
+ "vdup.8 d4, %4 \n"
+ // General purpose row blend.
+ "1: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vmull.u8 q13, d0, d4 \n"
+ "vmull.u8 q14, d1, d4 \n"
+ "vmlal.u8 q13, d2, d5 \n"
+ "vmlal.u8 q14, d3, d5 \n"
+ "vrshrn.u16 d0, q13, #8 \n"
+ "vrshrn.u16 d1, q14, #8 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 1b \n"
+ "b 99f \n"
+
+ // Blend 25 / 75.
+ "25: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 25b \n"
+ "b 99f \n"
+
+ // Blend 50 / 50.
+ "50: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q1}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 50b \n"
+ "b 99f \n"
+
+ // Blend 75 / 25.
+ "75: \n"
+ MEMACCESS(1)
+ "vld1.8 {q1}, [%1]! \n"
+ MEMACCESS(2)
+ "vld1.8 {q0}, [%2]! \n"
+ "subs %3, %3, #16 \n"
+ "vrhadd.u8 q0, q1 \n"
+ "vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 75b \n"
+ "b 99f \n"
+
+ // Blend 100 / 0 - Copy row unchanged.
+ "100: \n"
+ MEMACCESS(1)
+ "vld1.8 {q0}, [%1]! \n"
+ "subs %3, %3, #16 \n"
+ MEMACCESS(0)
+ "vst1.8 {q0}, [%0]! \n"
+ "bgt 100b \n"
+
+ "99: \n"
+ : "+r"(dst_ptr), // %0
+ "+r"(src_ptr), // %1
+ "+r"(src_stride), // %2
+ "+r"(dst_width), // %3
+ "+r"(source_y_fraction) // %4
+ :
+ : "cc", "memory", "q0", "q1", "d4", "d5", "q13", "q14"
+ );
+}
+#endif // HAS_INTERPOLATEROW_NEON
+
+// dr * (256 - sa) / 256 + sr = dr - dr * sa / 256 + sr
+#ifdef HAS_ARGBBLENDROW_NEON
+void ARGBBlendRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "subs %3, #8 \n"
+ "blt 89f \n"
+ // Blend 8 pixels.
+ "8: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB0.
+ MEMACCESS(1)
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 pixels of ARGB1.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 pixels of ARGB.
+ "bge 8b \n"
+
+ "89: \n"
+ "adds %3, #8-1 \n"
+ "blt 99f \n"
+
+ // Blend 1 pixels.
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0[0],d1[0],d2[0],d3[0]}, [%0]! \n" // load 1 pixel ARGB0.
+ MEMACCESS(1)
+ "vld4.8 {d4[0],d5[0],d6[0],d7[0]}, [%1]! \n" // load 1 pixel ARGB1.
+ "subs %3, %3, #1 \n" // 1 processed per loop.
+ "vmull.u8 q10, d4, d3 \n" // db * a
+ "vmull.u8 q11, d5, d3 \n" // dg * a
+ "vmull.u8 q12, d6, d3 \n" // dr * a
+ "vqrshrn.u16 d20, q10, #8 \n" // db >>= 8
+ "vqrshrn.u16 d21, q11, #8 \n" // dg >>= 8
+ "vqrshrn.u16 d22, q12, #8 \n" // dr >>= 8
+ "vqsub.u8 q2, q2, q10 \n" // dbg - dbg * a / 256
+ "vqsub.u8 d6, d6, d22 \n" // dr - dr * a / 256
+ "vqadd.u8 q0, q0, q2 \n" // + sbg
+ "vqadd.u8 d2, d2, d6 \n" // + sr
+ "vmov.u8 d3, #255 \n" // a = 255
+ MEMACCESS(2)
+ "vst4.8 {d0[0],d1[0],d2[0],d3[0]}, [%2]! \n" // store 1 pixel.
+ "bge 1b \n"
+
+ "99: \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q10", "q11", "q12"
+ );
+}
+#endif // HAS_ARGBBLENDROW_NEON
+
+// Attenuate 8 pixels at a time.
+#ifdef HAS_ARGBATTENUATEROW_NEON
+void ARGBAttenuateRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ // Attenuate 8 pixels.
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q10, d0, d3 \n" // b * a
+ "vmull.u8 q11, d1, d3 \n" // g * a
+ "vmull.u8 q12, d2, d3 \n" // r * a
+ "vqrshrn.u16 d0, q10, #8 \n" // b >>= 8
+ "vqrshrn.u16 d1, q11, #8 \n" // g >>= 8
+ "vqrshrn.u16 d2, q12, #8 \n" // r >>= 8
+ MEMACCESS(1)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q10", "q11", "q12"
+ );
+}
+#endif // HAS_ARGBATTENUATEROW_NEON
+
+// Quantize 8 ARGB pixels (32 bytes).
+// dst = (dst * scale >> 16) * interval_size + interval_offset;
+#ifdef HAS_ARGBQUANTIZEROW_NEON
+void ARGBQuantizeRow_NEON(uint8* dst_argb, int scale, int interval_size,
+ int interval_offset, int width) {
+ asm volatile (
+ "vdup.u16 q8, %2 \n"
+ "vshr.u16 q8, q8, #1 \n" // scale >>= 1
+ "vdup.u16 q9, %3 \n" // interval multiply.
+ "vdup.u16 q10, %4 \n" // interval add
+
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0] \n" // load 8 pixels of ARGB.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q0, d0 \n" // b (0 .. 255)
+ "vmovl.u8 q1, d2 \n"
+ "vmovl.u8 q2, d4 \n"
+ "vqdmulh.s16 q0, q0, q8 \n" // b * scale
+ "vqdmulh.s16 q1, q1, q8 \n" // g
+ "vqdmulh.s16 q2, q2, q8 \n" // r
+ "vmul.u16 q0, q0, q9 \n" // b * interval_size
+ "vmul.u16 q1, q1, q9 \n" // g
+ "vmul.u16 q2, q2, q9 \n" // r
+ "vadd.u16 q0, q0, q10 \n" // b + interval_offset
+ "vadd.u16 q1, q1, q10 \n" // g
+ "vadd.u16 q2, q2, q10 \n" // r
+ "vqmovn.u16 d0, q0 \n"
+ "vqmovn.u16 d2, q1 \n"
+ "vqmovn.u16 d4, q2 \n"
+ MEMACCESS(0)
+ "vst4.8 {d0, d2, d4, d6}, [%0]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ : "r"(scale), // %2
+ "r"(interval_size), // %3
+ "r"(interval_offset) // %4
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q8", "q9", "q10"
+ );
+}
+#endif // HAS_ARGBQUANTIZEROW_NEON
+
+// Shade 8 pixels at a time by specified value.
+// NOTE vqrdmulh.s16 q10, q10, d0[0] must use a scaler register from 0 to 8.
+// Rounding in vqrdmulh does +1 to high if high bit of low s16 is set.
+#ifdef HAS_ARGBSHADEROW_NEON
+void ARGBShadeRow_NEON(const uint8* src_argb, uint8* dst_argb, int width,
+ uint32 value) {
+ asm volatile (
+ "vdup.u32 q0, %3 \n" // duplicate scale value.
+ "vzip.u8 d0, d1 \n" // d0 aarrggbb.
+ "vshr.u16 q0, q0, #1 \n" // scale / 2.
+
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d20, d22, d24, d26}, [%0]! \n" // load 8 pixels of ARGB.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q10, d20 \n" // b (0 .. 255)
+ "vmovl.u8 q11, d22 \n"
+ "vmovl.u8 q12, d24 \n"
+ "vmovl.u8 q13, d26 \n"
+ "vqrdmulh.s16 q10, q10, d0[0] \n" // b * scale * 2
+ "vqrdmulh.s16 q11, q11, d0[1] \n" // g
+ "vqrdmulh.s16 q12, q12, d0[2] \n" // r
+ "vqrdmulh.s16 q13, q13, d0[3] \n" // a
+ "vqmovn.u16 d20, q10 \n"
+ "vqmovn.u16 d22, q11 \n"
+ "vqmovn.u16 d24, q12 \n"
+ "vqmovn.u16 d26, q13 \n"
+ MEMACCESS(1)
+ "vst4.8 {d20, d22, d24, d26}, [%1]! \n" // store 8 pixels of ARGB.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(value) // %3
+ : "cc", "memory", "q0", "q10", "q11", "q12", "q13"
+ );
+}
+#endif // HAS_ARGBSHADEROW_NEON
+
+// Convert 8 ARGB pixels (64 bytes) to 8 Gray ARGB pixels
+// Similar to ARGBToYJ but stores ARGB.
+// C code is (15 * b + 75 * g + 38 * r + 64) >> 7;
+#ifdef HAS_ARGBGRAYROW_NEON
+void ARGBGrayRow_NEON(const uint8* src_argb, uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d24, #15 \n" // B * 0.11400 coefficient
+ "vmov.u8 d25, #75 \n" // G * 0.58700 coefficient
+ "vmov.u8 d26, #38 \n" // R * 0.29900 coefficient
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d24 \n" // B
+ "vmlal.u8 q2, d1, d25 \n" // G
+ "vmlal.u8 q2, d2, d26 \n" // R
+ "vqrshrun.s16 d0, q2, #7 \n" // 15 bit to 8 bit B
+ "vmov d1, d0 \n" // G
+ "vmov d2, d0 \n" // R
+ MEMACCESS(1)
+ "vst4.8 {d0, d1, d2, d3}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q12", "q13"
+ );
+}
+#endif // HAS_ARGBGRAYROW_NEON
+
+// Convert 8 ARGB pixels (32 bytes) to 8 Sepia ARGB pixels.
+// b = (r * 35 + g * 68 + b * 17) >> 7
+// g = (r * 45 + g * 88 + b * 22) >> 7
+// r = (r * 50 + g * 98 + b * 24) >> 7
+
+#ifdef HAS_ARGBSEPIAROW_NEON
+void ARGBSepiaRow_NEON(uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d20, #17 \n" // BB coefficient
+ "vmov.u8 d21, #68 \n" // BG coefficient
+ "vmov.u8 d22, #35 \n" // BR coefficient
+ "vmov.u8 d24, #22 \n" // GB coefficient
+ "vmov.u8 d25, #88 \n" // GG coefficient
+ "vmov.u8 d26, #45 \n" // GR coefficient
+ "vmov.u8 d28, #24 \n" // BB coefficient
+ "vmov.u8 d29, #98 \n" // BG coefficient
+ "vmov.u8 d30, #50 \n" // BR coefficient
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0] \n" // load 8 ARGB pixels.
+ "subs %1, %1, #8 \n" // 8 processed per loop.
+ "vmull.u8 q2, d0, d20 \n" // B to Sepia B
+ "vmlal.u8 q2, d1, d21 \n" // G
+ "vmlal.u8 q2, d2, d22 \n" // R
+ "vmull.u8 q3, d0, d24 \n" // B to Sepia G
+ "vmlal.u8 q3, d1, d25 \n" // G
+ "vmlal.u8 q3, d2, d26 \n" // R
+ "vmull.u8 q8, d0, d28 \n" // B to Sepia R
+ "vmlal.u8 q8, d1, d29 \n" // G
+ "vmlal.u8 q8, d2, d30 \n" // R
+ "vqshrn.u16 d0, q2, #7 \n" // 16 bit to 8 bit B
+ "vqshrn.u16 d1, q3, #7 \n" // 16 bit to 8 bit G
+ "vqshrn.u16 d2, q8, #7 \n" // 16 bit to 8 bit R
+ MEMACCESS(0)
+ "vst4.8 {d0, d1, d2, d3}, [%0]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(dst_argb), // %0
+ "+r"(width) // %1
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3",
+ "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_ARGBSEPIAROW_NEON
+
+// Tranform 8 ARGB pixels (32 bytes) with color matrix.
+// TODO(fbarchard): Was same as Sepia except matrix is provided. This function
+// needs to saturate. Consider doing a non-saturating version.
+#ifdef HAS_ARGBCOLORMATRIXROW_NEON
+void ARGBColorMatrixRow_NEON(const uint8* src_argb, uint8* dst_argb,
+ const int8* matrix_argb, int width) {
+ asm volatile (
+ MEMACCESS(3)
+ "vld1.8 {q2}, [%3] \n" // load 3 ARGB vectors.
+ "vmovl.s8 q0, d4 \n" // B,G coefficients s16.
+ "vmovl.s8 q1, d5 \n" // R,A coefficients s16.
+
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d16, d18, d20, d22}, [%0]! \n" // load 8 ARGB pixels.
+ "subs %2, %2, #8 \n" // 8 processed per loop.
+ "vmovl.u8 q8, d16 \n" // b (0 .. 255) 16 bit
+ "vmovl.u8 q9, d18 \n" // g
+ "vmovl.u8 q10, d20 \n" // r
+ "vmovl.u8 q15, d22 \n" // a
+ "vmul.s16 q12, q8, d0[0] \n" // B = B * Matrix B
+ "vmul.s16 q13, q8, d1[0] \n" // G = B * Matrix G
+ "vmul.s16 q14, q8, d2[0] \n" // R = B * Matrix R
+ "vmul.s16 q15, q8, d3[0] \n" // A = B * Matrix A
+ "vmul.s16 q4, q9, d0[1] \n" // B += G * Matrix B
+ "vmul.s16 q5, q9, d1[1] \n" // G += G * Matrix G
+ "vmul.s16 q6, q9, d2[1] \n" // R += G * Matrix R
+ "vmul.s16 q7, q9, d3[1] \n" // A += G * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q10, d0[2] \n" // B += R * Matrix B
+ "vmul.s16 q5, q10, d1[2] \n" // G += R * Matrix G
+ "vmul.s16 q6, q10, d2[2] \n" // R += R * Matrix R
+ "vmul.s16 q7, q10, d3[2] \n" // A += R * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vmul.s16 q4, q15, d0[3] \n" // B += A * Matrix B
+ "vmul.s16 q5, q15, d1[3] \n" // G += A * Matrix G
+ "vmul.s16 q6, q15, d2[3] \n" // R += A * Matrix R
+ "vmul.s16 q7, q15, d3[3] \n" // A += A * Matrix A
+ "vqadd.s16 q12, q12, q4 \n" // Accumulate B
+ "vqadd.s16 q13, q13, q5 \n" // Accumulate G
+ "vqadd.s16 q14, q14, q6 \n" // Accumulate R
+ "vqadd.s16 q15, q15, q7 \n" // Accumulate A
+ "vqshrun.s16 d16, q12, #6 \n" // 16 bit to 8 bit B
+ "vqshrun.s16 d18, q13, #6 \n" // 16 bit to 8 bit G
+ "vqshrun.s16 d20, q14, #6 \n" // 16 bit to 8 bit R
+ "vqshrun.s16 d22, q15, #6 \n" // 16 bit to 8 bit A
+ MEMACCESS(1)
+ "vst4.8 {d16, d18, d20, d22}, [%1]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_argb), // %0
+ "+r"(dst_argb), // %1
+ "+r"(width) // %2
+ : "r"(matrix_argb) // %3
+ : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
+ "q10", "q11", "q12", "q13", "q14", "q15"
+ );
+}
+#endif // HAS_ARGBCOLORMATRIXROW_NEON
+
+// TODO(fbarchard): fix vqshrun in ARGBMultiplyRow_NEON and reenable.
+// Multiply 2 rows of ARGB pixels together, 8 pixels at a time.
+#ifdef HAS_ARGBMULTIPLYROW_NEON
+void ARGBMultiplyRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
+ "vld4.8 {d1, d3, d5, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vmull.u8 q0, d0, d1 \n" // multiply B
+ "vmull.u8 q1, d2, d3 \n" // multiply G
+ "vmull.u8 q2, d4, d5 \n" // multiply R
+ "vmull.u8 q3, d6, d7 \n" // multiply A
+ "vrshrn.u16 d0, q0, #8 \n" // 16 bit to 8 bit B
+ "vrshrn.u16 d1, q1, #8 \n" // 16 bit to 8 bit G
+ "vrshrn.u16 d2, q2, #8 \n" // 16 bit to 8 bit R
+ "vrshrn.u16 d3, q3, #8 \n" // 16 bit to 8 bit A
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+}
+#endif // HAS_ARGBMULTIPLYROW_NEON
+
+// Add 2 rows of ARGB pixels together, 8 pixels at a time.
+#ifdef HAS_ARGBADDROW_NEON
+void ARGBAddRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 q0, q0, q2 \n" // add B, G
+ "vqadd.u8 q1, q1, q3 \n" // add R, A
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+}
+#endif // HAS_ARGBADDROW_NEON
+
+// Subtract 2 rows of ARGB pixels, 8 pixels at a time.
+#ifdef HAS_ARGBSUBTRACTROW_NEON
+void ARGBSubtractRow_NEON(const uint8* src_argb0, const uint8* src_argb1,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld4.8 {d0, d1, d2, d3}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(1)
+ "vld4.8 {d4, d5, d6, d7}, [%1]! \n" // load 8 more ARGB pixels.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqsub.u8 q0, q0, q2 \n" // subtract B, G
+ "vqsub.u8 q1, q1, q3 \n" // subtract R, A
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+
+ : "+r"(src_argb0), // %0
+ "+r"(src_argb1), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1", "q2", "q3"
+ );
+}
+#endif // HAS_ARGBSUBTRACTROW_NEON
+
+// Adds Sobel X and Sobel Y and stores Sobel into ARGB.
+// A = 255
+// R = Sobel
+// G = Sobel
+// B = Sobel
+#ifdef HAS_SOBELROW_NEON
+void SobelRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // alpha
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d0}, [%0]! \n" // load 8 sobelx.
+ MEMACCESS(1)
+ "vld1.8 {d1}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d0, d0, d1 \n" // add
+ "vmov.u8 d1, d0 \n"
+ "vmov.u8 d2, d0 \n"
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1"
+ );
+}
+#endif // HAS_SOBELROW_NEON
+
+// Adds Sobel X and Sobel Y and stores Sobel into plane.
+#ifdef HAS_SOBELTOPLANEROW_NEON
+void SobelToPlaneRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_y, int width) {
+ asm volatile (
+ // 16 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {q0}, [%0]! \n" // load 16 sobelx.
+ MEMACCESS(1)
+ "vld1.8 {q1}, [%1]! \n" // load 16 sobely.
+ "subs %3, %3, #16 \n" // 16 processed per loop.
+ "vqadd.u8 q0, q0, q1 \n" // add
+ MEMACCESS(2)
+ "vst1.8 {q0}, [%2]! \n" // store 16 pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_y), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1"
+ );
+}
+#endif // HAS_SOBELTOPLANEROW_NEON
+
+// Mixes Sobel X, Sobel Y and Sobel into ARGB.
+// A = 255
+// R = Sobel X
+// G = Sobel
+// B = Sobel Y
+#ifdef HAS_SOBELXYROW_NEON
+void SobelXYRow_NEON(const uint8* src_sobelx, const uint8* src_sobely,
+ uint8* dst_argb, int width) {
+ asm volatile (
+ "vmov.u8 d3, #255 \n" // alpha
+ // 8 pixel loop.
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d2}, [%0]! \n" // load 8 sobelx.
+ MEMACCESS(1)
+ "vld1.8 {d0}, [%1]! \n" // load 8 sobely.
+ "subs %3, %3, #8 \n" // 8 processed per loop.
+ "vqadd.u8 d1, d0, d2 \n" // add
+ MEMACCESS(2)
+ "vst4.8 {d0, d1, d2, d3}, [%2]! \n" // store 8 ARGB pixels.
+ "bgt 1b \n"
+ : "+r"(src_sobelx), // %0
+ "+r"(src_sobely), // %1
+ "+r"(dst_argb), // %2
+ "+r"(width) // %3
+ :
+ : "cc", "memory", "q0", "q1"
+ );
+}
+#endif // HAS_SOBELXYROW_NEON
+
+// SobelX as a matrix is
+// -1 0 1
+// -2 0 2
+// -1 0 1
+#ifdef HAS_SOBELXROW_NEON
+void SobelXRow_NEON(const uint8* src_y0, const uint8* src_y1,
+ const uint8* src_y2, uint8* dst_sobelx, int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d0}, [%0],%5 \n" // top
+ MEMACCESS(0)
+ "vld1.8 {d1}, [%0],%6 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ MEMACCESS(1)
+ "vld1.8 {d2}, [%1],%5 \n" // center * 2
+ MEMACCESS(1)
+ "vld1.8 {d3}, [%1],%6 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ MEMACCESS(2)
+ "vld1.8 {d2}, [%2],%5 \n" // bottom
+ MEMACCESS(2)
+ "vld1.8 {d3}, [%2],%6 \n"
+ "subs %4, %4, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ MEMACCESS(3)
+ "vst1.8 {d0}, [%3]! \n" // store 8 sobelx
+ "bgt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(src_y2), // %2
+ "+r"(dst_sobelx), // %3
+ "+r"(width) // %4
+ : "r"(2), // %5
+ "r"(6) // %6
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+#endif // HAS_SOBELXROW_NEON
+
+// SobelY as a matrix is
+// -1 -2 -1
+// 0 0 0
+// 1 2 1
+#ifdef HAS_SOBELYROW_NEON
+void SobelYRow_NEON(const uint8* src_y0, const uint8* src_y1,
+ uint8* dst_sobely, int width) {
+ asm volatile (
+ ".p2align 2 \n"
+ "1: \n"
+ MEMACCESS(0)
+ "vld1.8 {d0}, [%0],%4 \n" // left
+ MEMACCESS(1)
+ "vld1.8 {d1}, [%1],%4 \n"
+ "vsubl.u8 q0, d0, d1 \n"
+ MEMACCESS(0)
+ "vld1.8 {d2}, [%0],%4 \n" // center * 2
+ MEMACCESS(1)
+ "vld1.8 {d3}, [%1],%4 \n"
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ MEMACCESS(0)
+ "vld1.8 {d2}, [%0],%5 \n" // right
+ MEMACCESS(1)
+ "vld1.8 {d3}, [%1],%5 \n"
+ "subs %3, %3, #8 \n" // 8 pixels
+ "vsubl.u8 q1, d2, d3 \n"
+ "vadd.s16 q0, q0, q1 \n"
+ "vabs.s16 q0, q0 \n"
+ "vqmovn.u16 d0, q0 \n"
+ MEMACCESS(2)
+ "vst1.8 {d0}, [%2]! \n" // store 8 sobely
+ "bgt 1b \n"
+ : "+r"(src_y0), // %0
+ "+r"(src_y1), // %1
+ "+r"(dst_sobely), // %2
+ "+r"(width) // %3
+ : "r"(1), // %4
+ "r"(6) // %5
+ : "cc", "memory", "q0", "q1" // Clobber List
+ );
+}
+#endif // HAS_SOBELYROW_NEON
+#endif // !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/row_posix.cc b/third_party/libyuv/source/row_posix.cc
index e47708802..106fda568 100644
--- a/third_party/libyuv/source/row_posix.cc
+++ b/third_party/libyuv/source/row_posix.cc
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
diff --git a/third_party/libyuv/source/row_win.cc b/third_party/libyuv/source/row_win.cc
index 2cfacad1b..8eb888926 100644
--- a/third_party/libyuv/source/row_win.cc
+++ b/third_party/libyuv/source/row_win.cc
@@ -8,15 +8,179 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
+
+#if defined (_M_X64)
+#include <emmintrin.h>
+#include <tmmintrin.h> // For _mm_maddubs_epi16
+#endif
#ifdef __cplusplus
namespace libyuv {
extern "C" {
#endif
-// This module is for Visual C x86.
-#if !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+// This module is for Visual C.
+#if !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
+
+#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
+
+#define UB 127 /* min(127,(int8)(2.018 * 64)) */
+#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
+#define UR 0
+
+#define VB 0
+#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
+#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
+
+// Bias
+#define BB UB * 128 + VB * 128
+#define BG UG * 128 + VG * 128
+#define BR UR * 128 + VR * 128
+
+static const vec8 kUVToB = {
+ UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
+};
+
+static const vec8 kUVToR = {
+ UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
+};
+
+static const vec8 kUVToG = {
+ UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
+};
+
+static const vec8 kVUToB = {
+ VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
+};
+
+static const vec8 kVUToR = {
+ VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
+};
+
+static const vec8 kVUToG = {
+ VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
+};
+
+static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
+static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
+static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
+static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
+static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
+
+// 64 bit
+#if defined(_M_X64)
+
+// Aligned destination version.
+__declspec(align(16))
+void I422ToARGBRow_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+
+ __m128i xmm0, xmm1, xmm2, xmm3;
+ const __m128i xmm5 = _mm_set1_epi8(-1);
+ const __m128i xmm4 = _mm_setzero_si128();
+ const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+
+ while (width > 0) {
+ xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
+ xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
+ xmm1 = _mm_load_si128(&xmm0);
+ xmm2 = _mm_load_si128(&xmm0);
+ xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
+ xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
+ xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
+ xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
+ xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
+ xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
+ xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
+ xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
+ xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
+ xmm0 = _mm_adds_epi16(xmm0, xmm3);
+ xmm1 = _mm_adds_epi16(xmm1, xmm3);
+ xmm2 = _mm_adds_epi16(xmm2, xmm3);
+ xmm0 = _mm_srai_epi16(xmm0, 6);
+ xmm1 = _mm_srai_epi16(xmm1, 6);
+ xmm2 = _mm_srai_epi16(xmm2, 6);
+ xmm0 = _mm_packus_epi16(xmm0, xmm0);
+ xmm1 = _mm_packus_epi16(xmm1, xmm1);
+ xmm2 = _mm_packus_epi16(xmm2, xmm2);
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
+ xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
+ xmm1 = _mm_load_si128(&xmm0);
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
+ xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
+
+ _mm_store_si128((__m128i *)dst_argb, xmm0);
+ _mm_store_si128((__m128i *)(dst_argb + 16), xmm1);
+
+ y_buf += 8;
+ u_buf += 4;
+ dst_argb += 32;
+ width -= 8;
+ }
+}
+
+// Unaligned destination version.
+void I422ToARGBRow_Unaligned_SSSE3(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
+
+ __m128i xmm0, xmm1, xmm2, xmm3;
+ const __m128i xmm5 = _mm_set1_epi8(-1);
+ const __m128i xmm4 = _mm_setzero_si128();
+ const ptrdiff_t offset = (uint8*)v_buf - (uint8*)u_buf;
+
+ while (width > 0) {
+ xmm0 = _mm_cvtsi32_si128(*(uint32*)u_buf);
+ xmm1 = _mm_cvtsi32_si128(*(uint32*)(u_buf + offset));
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm0);
+ xmm1 = _mm_load_si128(&xmm0);
+ xmm2 = _mm_load_si128(&xmm0);
+ xmm0 = _mm_maddubs_epi16(xmm0, *(__m128i*)kUVToB);
+ xmm1 = _mm_maddubs_epi16(xmm1, *(__m128i*)kUVToG);
+ xmm2 = _mm_maddubs_epi16(xmm2, *(__m128i*)kUVToR);
+ xmm0 = _mm_sub_epi16(xmm0, *(__m128i*)kUVBiasB);
+ xmm1 = _mm_sub_epi16(xmm1, *(__m128i*)kUVBiasG);
+ xmm2 = _mm_sub_epi16(xmm2, *(__m128i*)kUVBiasR);
+ xmm3 = _mm_loadl_epi64((__m128i*)y_buf);
+ xmm3 = _mm_unpacklo_epi8(xmm3, xmm4);
+ xmm3 = _mm_subs_epi16(xmm3, *(__m128i*)kYSub16);
+ xmm3 = _mm_mullo_epi16(xmm3, *(__m128i*)kYToRgb);
+ xmm0 = _mm_adds_epi16(xmm0, xmm3);
+ xmm1 = _mm_adds_epi16(xmm1, xmm3);
+ xmm2 = _mm_adds_epi16(xmm2, xmm3);
+ xmm0 = _mm_srai_epi16(xmm0, 6);
+ xmm1 = _mm_srai_epi16(xmm1, 6);
+ xmm2 = _mm_srai_epi16(xmm2, 6);
+ xmm0 = _mm_packus_epi16(xmm0, xmm0);
+ xmm1 = _mm_packus_epi16(xmm1, xmm1);
+ xmm2 = _mm_packus_epi16(xmm2, xmm2);
+ xmm0 = _mm_unpacklo_epi8(xmm0, xmm1);
+ xmm2 = _mm_unpacklo_epi8(xmm2, xmm5);
+ xmm1 = _mm_load_si128(&xmm0);
+ xmm0 = _mm_unpacklo_epi16(xmm0, xmm2);
+ xmm1 = _mm_unpackhi_epi16(xmm1, xmm2);
+
+ _mm_storeu_si128((__m128i *)dst_argb, xmm0);
+ _mm_storeu_si128((__m128i *)(dst_argb + 16), xmm1);
+
+ y_buf += 8;
+ u_buf += 4;
+ dst_argb += 32;
+ width -= 8;
+ }
+}
+// 32 bit
+#else // defined(_M_X64)
#ifdef HAS_ARGBTOYROW_SSSE3
@@ -2030,21 +2194,6 @@ void RGBAToUVRow_Unaligned_SSSE3(const uint8* src_argb0, int src_stride_argb,
}
#endif // HAS_ARGBTOYROW_SSSE3
-#define YG 74 /* (int8)(1.164 * 64 + 0.5) */
-
-#define UB 127 /* min(63,(int8)(2.018 * 64)) */
-#define UG -25 /* (int8)(-0.391 * 64 - 0.5) */
-#define UR 0
-
-#define VB 0
-#define VG -52 /* (int8)(-0.813 * 64 - 0.5) */
-#define VR 102 /* (int8)(1.596 * 64 + 0.5) */
-
-// Bias
-#define BB UB * 128 + VB * 128
-#define BG UG * 128 + VG * 128
-#define BR UR * 128 + VR * 128
-
#ifdef HAS_I422TOARGBROW_AVX2
static const lvec8 kUVToB_AVX = {
@@ -2079,10 +2228,10 @@ static const lvec16 kUVBiasR_AVX = {
// 8 UV values upsampled to 16 UV, mixed with 16 Y producing 16 ARGB (64 bytes).
__declspec(naked) __declspec(align(16))
void I422ToARGBRow_AVX2(const uint8* y_buf,
- const uint8* u_buf,
- const uint8* v_buf,
- uint8* dst_argb,
- int width) {
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* dst_argb,
+ int width) {
__asm {
push esi
push edi
@@ -2150,36 +2299,6 @@ void I422ToARGBRow_AVX2(const uint8* y_buf,
#ifdef HAS_I422TOARGBROW_SSSE3
-static const vec8 kUVToB = {
- UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB
-};
-
-static const vec8 kUVToR = {
- UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR
-};
-
-static const vec8 kUVToG = {
- UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG
-};
-
-static const vec8 kVUToB = {
- VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB, VB, UB,
-};
-
-static const vec8 kVUToR = {
- VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR, VR, UR,
-};
-
-static const vec8 kVUToG = {
- VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG, VG, UG,
-};
-
-static const vec16 kYToRgb = { YG, YG, YG, YG, YG, YG, YG, YG };
-static const vec16 kYSub16 = { 16, 16, 16, 16, 16, 16, 16, 16 };
-static const vec16 kUVBiasB = { BB, BB, BB, BB, BB, BB, BB, BB };
-static const vec16 kUVBiasG = { BG, BG, BG, BG, BG, BG, BG, BG };
-static const vec16 kUVBiasR = { BR, BR, BR, BR, BR, BR, BR, BR };
-
// TODO(fbarchard): Read that does half size on Y and treats 420 as 444.
// Read 8 UV from 444.
@@ -7276,7 +7395,8 @@ void ARGBLumaColorTableRow_SSSE3(const uint8* src_argb, uint8* dst_argb,
}
#endif // HAS_ARGBLUMACOLORTABLEROW_SSSE3
-#endif // !defined(LIBYUV_DISABLE_X86) && defined(_M_IX86) && defined(_MSC_VER)
+#endif // defined(_M_X64)
+#endif // !defined(LIBYUV_DISABLE_X86) && defined(_MSC_VER)
#ifdef __cplusplus
} // extern "C"
diff --git a/third_party/libyuv/source/row_x86.asm b/third_party/libyuv/source/row_x86.asm
new file mode 100644
index 000000000..0cb326f8e
--- /dev/null
+++ b/third_party/libyuv/source/row_x86.asm
@@ -0,0 +1,146 @@
+;
+; Copyright 2012 The LibYuv Project Authors. All rights reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+%ifdef __YASM_VERSION_ID__
+%if __YASM_VERSION_ID__ < 01020000h
+%error AVX2 is supported only by yasm 1.2.0 or later.
+%endif
+%endif
+%include "x86inc.asm"
+
+SECTION .text
+
+; cglobal numeric constants are parameters, gpr regs, mm regs
+
+; void YUY2ToYRow_SSE2(const uint8* src_yuy2, uint8* dst_y, int pix)
+
+%macro YUY2TOYROW 2-3
+cglobal %1ToYRow%3, 3, 3, 3, src_yuy2, dst_y, pix
+%ifidn %1,YUY2
+ pcmpeqb m2, m2, m2 ; generate mask 0x00ff00ff
+ psrlw m2, m2, 8
+%endif
+
+ ALIGN 4
+.convertloop:
+ mov%2 m0, [src_yuy2q]
+ mov%2 m1, [src_yuy2q + mmsize]
+ lea src_yuy2q, [src_yuy2q + mmsize * 2]
+%ifidn %1,YUY2
+ pand m0, m0, m2 ; YUY2 even bytes are Y
+ pand m1, m1, m2
+%else
+ psrlw m0, m0, 8 ; UYVY odd bytes are Y
+ psrlw m1, m1, 8
+%endif
+ packuswb m0, m0, m1
+%if cpuflag(AVX2)
+ vpermq m0, m0, 0xd8
+%endif
+ sub pixd, mmsize
+ mov%2 [dst_yq], m0
+ lea dst_yq, [dst_yq + mmsize]
+ jg .convertloop
+ REP_RET
+%endmacro
+
+; TODO(fbarchard): Remove MMX. Add SSSE3 pshufb version.
+INIT_MMX MMX
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_XMM SSE2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW YUY2,u,_Unaligned
+YUY2TOYROW UYVY,a,
+YUY2TOYROW UYVY,u,_Unaligned
+INIT_YMM AVX2
+YUY2TOYROW YUY2,a,
+YUY2TOYROW UYVY,a,
+
+; void SplitUVRow_SSE2(const uint8* src_uv, uint8* dst_u, uint8* dst_v, int pix)
+
+%macro SplitUVRow 1-2
+cglobal SplitUVRow%2, 4, 4, 5, src_uv, dst_u, dst_v, pix
+ pcmpeqb m4, m4, m4 ; generate mask 0x00ff00ff
+ psrlw m4, m4, 8
+ sub dst_vq, dst_uq
+
+ ALIGN 4
+.convertloop:
+ mov%1 m0, [src_uvq]
+ mov%1 m1, [src_uvq + mmsize]
+ lea src_uvq, [src_uvq + mmsize * 2]
+ psrlw m2, m0, 8 ; odd bytes
+ psrlw m3, m1, 8
+ pand m0, m0, m4 ; even bytes
+ pand m1, m1, m4
+ packuswb m0, m0, m1
+ packuswb m2, m2, m3
+%if cpuflag(AVX2)
+ vpermq m0, m0, 0xd8
+ vpermq m2, m2, 0xd8
+%endif
+ mov%1 [dst_uq], m0
+ mov%1 [dst_uq + dst_vq], m2
+ lea dst_uq, [dst_uq + mmsize]
+ sub pixd, mmsize
+ jg .convertloop
+ REP_RET
+%endmacro
+
+INIT_MMX MMX
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_XMM SSE2
+SplitUVRow a,
+SplitUVRow u,_Unaligned
+INIT_YMM AVX2
+SplitUVRow a,
+
+; void MergeUVRow_SSE2(const uint8* src_u, const uint8* src_v, uint8* dst_uv,
+; int width);
+
+%macro MergeUVRow_ 1-2
+cglobal MergeUVRow_%2, 4, 4, 3, src_u, src_v, dst_uv, pix
+ sub src_vq, src_uq
+
+ ALIGN 4
+.convertloop:
+ mov%1 m0, [src_uq]
+ mov%1 m1, [src_vq]
+ lea src_uq, [src_uq + mmsize]
+ punpcklbw m2, m0, m1 // first 8 UV pairs
+ punpckhbw m0, m0, m1 // next 8 UV pairs
+%if cpuflag(AVX2)
+ vperm2i128 m1, m2, m0, 0x20 // low 128 of ymm2 and low 128 of ymm0
+ vperm2i128 m2, m2, m0, 0x31 // high 128 of ymm2 and high 128 of ymm0
+ mov%1 [dst_uvq], m1
+ mov%1 [dst_uvq + mmsize], m2
+%else
+ mov%1 [dst_uvq], m2
+ mov%1 [dst_uvq + mmsize], m0
+%endif
+ lea dst_uvq, [dst_uvq + mmsize * 2]
+ sub pixd, mmsize
+ jg .convertloop
+ REP_RET
+%endmacro
+
+INIT_MMX MMX
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_XMM SSE2
+MergeUVRow_ a,
+MergeUVRow_ u,_Unaligned
+INIT_YMM AVX2
+MergeUVRow_ a,
+
diff --git a/third_party/libyuv/source/scale.cc b/third_party/libyuv/source/scale.cc
index 31cedf11d..5b33b5f04 100644
--- a/third_party/libyuv/source/scale.cc
+++ b/third_party/libyuv/source/scale.cc
@@ -8,15 +8,15 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "third_party/libyuv/include/libyuv/scale.h"
+#include "libyuv/scale.h"
#include <assert.h>
#include <string.h>
-#include "third_party/libyuv/include/libyuv/cpu_id.h"
-#include "third_party/libyuv/include/libyuv/planar_functions.h" // CopyPlane
-#include "third_party/libyuv/include/libyuv/row.h"
-#include "third_party/libyuv/include/libyuv/scale_row.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyPlane
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
#ifdef __cplusplus
namespace libyuv {
diff --git a/third_party/libyuv/source/scale_argb.cc b/third_party/libyuv/source/scale_argb.cc
new file mode 100644
index 000000000..e339cd7c7
--- /dev/null
+++ b/third_party/libyuv/source/scale_argb.cc
@@ -0,0 +1,809 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "libyuv/scale.h"
+
+#include <assert.h>
+#include <string.h>
+
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+static __inline int Abs(int v) {
+ return v >= 0 ? v : -v;
+}
+
+// ScaleARGB ARGB, 1/2
+// This is an optimized version for scaling down a ARGB to 1/2 of
+// its original size.
+static void ScaleARGBDown2(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering) {
+ int j;
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) =
+ filtering == kFilterNone ? ScaleARGBRowDown2_C :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_C :
+ ScaleARGBRowDown2Box_C);
+ assert(dx == 65536 * 2); // Test scale factor of 2.
+ assert((dy & 0x1ffff) == 0); // Test vertical scale is multiple of 2.
+ // Advance to odd row, even column.
+ if (filtering == kFilterBilinear) {
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ } else {
+ src_argb += (y >> 16) * src_stride + ((x >> 16) - 1) * 4;
+ }
+
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBRowDown2 = filtering == kFilterNone ? ScaleARGBRowDown2_SSE2 :
+ (filtering == kFilterLinear ? ScaleARGBRowDown2Linear_SSE2 :
+ ScaleARGBRowDown2Box_SSE2);
+ }
+#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
+ IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
+ ScaleARGBRowDown2 = filtering ? ScaleARGBRowDown2Box_NEON :
+ ScaleARGBRowDown2_NEON;
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleARGBRowDown2(src_argb, src_stride, dst_argb, dst_width);
+ src_argb += row_stride;
+ dst_argb += dst_stride;
+ }
+}
+
+// ScaleARGB ARGB, 1/4
+// This is an optimized version for scaling down a ARGB to 1/4 of
+// its original size.
+static void ScaleARGBDown4Box(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy) {
+ int j;
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (dst_width * 2 * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+ int row_stride = src_stride * (dy >> 16);
+ void (*ScaleARGBRowDown2)(const uint8* src_argb, ptrdiff_t src_stride,
+ uint8* dst_argb, int dst_width) = ScaleARGBRowDown2Box_C;
+ // Advance to odd row, even column.
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+ assert(dx == 65536 * 4); // Test scale factor of 4.
+ assert((dy & 0x3ffff) == 0); // Test vertical scale is multiple of 4.
+#if defined(HAS_SCALEARGBROWDOWN2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(row_stride, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_SSE2;
+ }
+#elif defined(HAS_SCALEARGBROWDOWN2_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 8) &&
+ IS_ALIGNED(src_argb, 4) && IS_ALIGNED(row_stride, 4)) {
+ ScaleARGBRowDown2 = ScaleARGBRowDown2Box_NEON;
+ }
+#endif
+ for (j = 0; j < dst_height; ++j) {
+ ScaleARGBRowDown2(src_argb, src_stride, row, dst_width * 2);
+ ScaleARGBRowDown2(src_argb + src_stride * 2, src_stride,
+ row + kRowSize, dst_width * 2);
+ ScaleARGBRowDown2(row, kRowSize, dst_argb, dst_width);
+ src_argb += row_stride;
+ dst_argb += dst_stride;
+ }
+ free_aligned_buffer_64(row);
+}
+
+// ScaleARGB ARGB Even
+// This is an optimized version for scaling down a ARGB to even
+// multiple of its original size.
+static void ScaleARGBDownEven(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering) {
+ int j;
+ int col_step = dx >> 16;
+ int row_stride = (dy >> 16) * src_stride;
+ void (*ScaleARGBRowDownEven)(const uint8* src_argb, ptrdiff_t src_stride,
+ int src_step, uint8* dst_argb, int dst_width) =
+ filtering ? ScaleARGBRowDownEvenBox_C : ScaleARGBRowDownEven_C;
+ assert(IS_ALIGNED(src_width, 2));
+ assert(IS_ALIGNED(src_height, 2));
+ src_argb += (y >> 16) * src_stride + (x >> 16) * 4;
+#if defined(HAS_SCALEARGBROWDOWNEVEN_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 4) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_SSE2 :
+ ScaleARGBRowDownEven_SSE2;
+ }
+#elif defined(HAS_SCALEARGBROWDOWNEVEN_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && IS_ALIGNED(dst_width, 4) &&
+ IS_ALIGNED(src_argb, 4)) {
+ ScaleARGBRowDownEven = filtering ? ScaleARGBRowDownEvenBox_NEON :
+ ScaleARGBRowDownEven_NEON;
+ }
+#endif
+
+ if (filtering == kFilterLinear) {
+ src_stride = 0;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ ScaleARGBRowDownEven(src_argb, src_stride, col_step, dst_argb, dst_width);
+ src_argb += row_stride;
+ dst_argb += dst_stride;
+ }
+}
+
+// Scale ARGB down with bilinear interpolation.
+static void ScaleARGBBilinearDown(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+ void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleARGBFilterCols64_C : ScaleARGBFilterCols_C;
+ int64 xlast = x + (int64)(dst_width - 1) * dx;
+ int64 xl = (dx >= 0) ? x : xlast;
+ int64 xr = (dx >= 0) ? xlast : x;
+ int clip_src_width;
+ xl = (xl >> 16) & ~3; // Left edge aligned.
+ xr = (xr >> 16) + 1; // Right most pixel used. Bilinear uses 2 pixels.
+ xr = (xr + 1 + 3) & ~3; // 1 beyond 4 pixel aligned right most pixel.
+ if (xr > src_width) {
+ xr = src_width;
+ }
+ clip_src_width = (int)(xr - xl) * 4; // Width aligned to 4.
+ src_argb += xl * 4;
+ x -= (int)(xl << 16);
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && clip_src_width >= 16) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSE2;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && clip_src_width >= 16) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && clip_src_width >= 32) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(clip_src_width, 32)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && clip_src_width >= 16) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(clip_src_width, 16)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && clip_src_width >= 4 &&
+ IS_ALIGNED(src_argb, 4) && IS_ALIGNED(src_stride, 4)) {
+ InterpolateRow = InterpolateRow_Any_MIPS_DSPR2;
+ if (IS_ALIGNED(clip_src_width, 4)) {
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
+ }
+ }
+#endif
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+ }
+#endif
+ // TODO(fbarchard): Consider not allocating row buffer for kFilterLinear.
+ // Allocate a row of ARGB.
+ {
+ align_buffer_64(row, clip_src_width * 4);
+
+ const int max_y = (src_height - 1) << 16;
+ if (y > max_y) {
+ y = max_y;
+ }
+ for (j = 0; j < dst_height; ++j) {
+ int yi = y >> 16;
+ const uint8* src = src_argb + yi * src_stride;
+ if (filtering == kFilterLinear) {
+ ScaleARGBFilterCols(dst_argb, src, dst_width, x, dx);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(row, src, src_stride, clip_src_width, yf);
+ ScaleARGBFilterCols(dst_argb, row, dst_width, x, dx);
+ }
+ dst_argb += dst_stride;
+ y += dy;
+ if (y > max_y) {
+ y = max_y;
+ }
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+
+// Scale ARGB up with bilinear interpolation.
+static void ScaleARGBBilinearUp(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+ void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+ const int max_y = (src_height - 1) << 16;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSE2;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride, 4)) {
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
+ }
+#endif
+ if (src_width >= 32768) {
+ ScaleARGBFilterCols = filtering ?
+ ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+ }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+ if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+ }
+#endif
+ }
+
+ if (y > max_y) {
+ y = max_y;
+ }
+
+ {
+ int yi = y >> 16;
+ const uint8* src = src_argb + yi * src_stride;
+
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (dst_width * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+
+ uint8* rowptr = row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
+ ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
+ if (src_height > 1) {
+ src += src_stride;
+ }
+ ScaleARGBFilterCols(rowptr + rowstride, src, dst_width, x, dx);
+ src += src_stride;
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ src = src_argb + yi * src_stride;
+ }
+ if (yi != lasty) {
+ ScaleARGBFilterCols(rowptr, src, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src += src_stride;
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+ }
+ dst_argb += dst_stride;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ }
+}
+
+#ifdef YUVSCALEUP
+// Scale YUV to ARGB up with bilinear interpolation.
+static void ScaleYUVToARGBBilinearUp(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride_y,
+ int src_stride_u,
+ int src_stride_v,
+ int dst_stride_argb,
+ const uint8* src_y,
+ const uint8* src_u,
+ const uint8* src_v,
+ uint8* dst_argb,
+ int x, int dx, int y, int dy,
+ enum FilterMode filtering) {
+ int j;
+ void (*I422ToARGBRow)(const uint8* y_buf,
+ const uint8* u_buf,
+ const uint8* v_buf,
+ uint8* rgb_buf,
+ int width) = I422ToARGBRow_C;
+#if defined(HAS_I422TOARGBROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && src_width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_SSSE3;
+ if (IS_ALIGNED(src_width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ I422ToARGBRow = I422ToARGBRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && src_width >= 16) {
+ I422ToARGBRow = I422ToARGBRow_Any_AVX2;
+ if (IS_ALIGNED(src_width, 16)) {
+ I422ToARGBRow = I422ToARGBRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && src_width >= 8) {
+ I422ToARGBRow = I422ToARGBRow_Any_NEON;
+ if (IS_ALIGNED(src_width, 8)) {
+ I422ToARGBRow = I422ToARGBRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_I422TOARGBROW_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && IS_ALIGNED(src_width, 4) &&
+ IS_ALIGNED(src_y, 4) && IS_ALIGNED(src_stride_y, 4) &&
+ IS_ALIGNED(src_u, 2) && IS_ALIGNED(src_stride_u, 2) &&
+ IS_ALIGNED(src_v, 2) && IS_ALIGNED(src_stride_v, 2) &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ I422ToARGBRow = I422ToARGBRow_MIPS_DSPR2;
+ }
+#endif
+
+ void (*InterpolateRow)(uint8* dst_argb, const uint8* src_argb,
+ ptrdiff_t src_stride, int dst_width, int source_y_fraction) =
+ InterpolateRow_C;
+#if defined(HAS_INTERPOLATEROW_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && dst_width >= 4) {
+ InterpolateRow = InterpolateRow_Any_SSE2;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSE2;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ InterpolateRow = InterpolateRow_SSE2;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_SSSE3)
+ if (TestCpuFlag(kCpuHasSSSE3) && dst_width >= 4) {
+ InterpolateRow = InterpolateRow_Any_SSSE3;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_Unaligned_SSSE3;
+ if (IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride_argb, 16)) {
+ InterpolateRow = InterpolateRow_SSSE3;
+ }
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_AVX2)
+ if (TestCpuFlag(kCpuHasAVX2) && dst_width >= 8) {
+ InterpolateRow = InterpolateRow_Any_AVX2;
+ if (IS_ALIGNED(dst_width, 8)) {
+ InterpolateRow = InterpolateRow_AVX2;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROW_NEON)
+ if (TestCpuFlag(kCpuHasNEON) && dst_width >= 4) {
+ InterpolateRow = InterpolateRow_Any_NEON;
+ if (IS_ALIGNED(dst_width, 4)) {
+ InterpolateRow = InterpolateRow_NEON;
+ }
+ }
+#endif
+#if defined(HAS_INTERPOLATEROWS_MIPS_DSPR2)
+ if (TestCpuFlag(kCpuHasMIPS_DSPR2) && dst_width >= 1 &&
+ IS_ALIGNED(dst_argb, 4) && IS_ALIGNED(dst_stride_argb, 4)) {
+ InterpolateRow = InterpolateRow_MIPS_DSPR2;
+ }
+#endif
+
+ void (*ScaleARGBFilterCols)(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
+ filtering ? ScaleARGBFilterCols_C : ScaleARGBCols_C;
+ if (src_width >= 32768) {
+ ScaleARGBFilterCols = filtering ?
+ ScaleARGBFilterCols64_C : ScaleARGBCols64_C;
+ }
+#if defined(HAS_SCALEARGBFILTERCOLS_SSSE3)
+ if (filtering && TestCpuFlag(kCpuHasSSSE3) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBFilterCols_SSSE3;
+ }
+#endif
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+ if (!filtering && TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+ ScaleARGBFilterCols = ScaleARGBCols_SSE2;
+ }
+#endif
+ if (!filtering && src_width * 2 == dst_width && x < 0x8000) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBFilterCols = ScaleARGBColsUp2_SSE2;
+ }
+#endif
+ }
+
+ const int max_y = (src_height - 1) << 16;
+ if (y > max_y) {
+ y = max_y;
+ }
+ const int kYShift = 1; // Shift Y by 1 to convert Y plane to UV coordinate.
+ int yi = y >> 16;
+ int uv_yi = yi >> kYShift;
+ const uint8* src_row_y = src_y + yi * src_stride_y;
+ const uint8* src_row_u = src_u + uv_yi * src_stride_u;
+ const uint8* src_row_v = src_v + uv_yi * src_stride_v;
+
+ // Allocate 2 rows of ARGB.
+ const int kRowSize = (dst_width * 4 + 15) & ~15;
+ align_buffer_64(row, kRowSize * 2);
+
+ // Allocate 1 row of ARGB for source conversion.
+ align_buffer_64(argb_row, src_width * 4);
+
+ uint8* rowptr = row;
+ int rowstride = kRowSize;
+ int lasty = yi;
+
+ // TODO(fbarchard): Convert first 2 rows of YUV to ARGB.
+ ScaleARGBFilterCols(rowptr, src_row_y, dst_width, x, dx);
+ if (src_height > 1) {
+ src_row_y += src_stride_y;
+ if (yi & 1) {
+ src_row_u += src_stride_u;
+ src_row_v += src_stride_v;
+ }
+ }
+ ScaleARGBFilterCols(rowptr + rowstride, src_row_y, dst_width, x, dx);
+ if (src_height > 2) {
+ src_row_y += src_stride_y;
+ if (!(yi & 1)) {
+ src_row_u += src_stride_u;
+ src_row_v += src_stride_v;
+ }
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ yi = y >> 16;
+ if (yi != lasty) {
+ if (y > max_y) {
+ y = max_y;
+ yi = y >> 16;
+ uv_yi = yi >> kYShift;
+ src_row_y = src_y + yi * src_stride_y;
+ src_row_u = src_u + uv_yi * src_stride_u;
+ src_row_v = src_v + uv_yi * src_stride_v;
+ }
+ if (yi != lasty) {
+ // TODO(fbarchard): Convert the clipped region of row.
+ I422ToARGBRow(src_row_y, src_row_u, src_row_v, argb_row, src_width);
+ ScaleARGBFilterCols(rowptr, argb_row, dst_width, x, dx);
+ rowptr += rowstride;
+ rowstride = -rowstride;
+ lasty = yi;
+ src_row_y += src_stride_y;
+ if (yi & 1) {
+ src_row_u += src_stride_u;
+ src_row_v += src_stride_v;
+ }
+ }
+ }
+ if (filtering == kFilterLinear) {
+ InterpolateRow(dst_argb, rowptr, 0, dst_width * 4, 0);
+ } else {
+ int yf = (y >> 8) & 255;
+ InterpolateRow(dst_argb, rowptr, rowstride, dst_width * 4, yf);
+ }
+ dst_argb += dst_stride_argb;
+ y += dy;
+ }
+ free_aligned_buffer_64(row);
+ free_aligned_buffer_64(row_argb);
+}
+#endif
+
+// Scale ARGB to/from any dimensions, without interpolation.
+// Fixed point math is used for performance: The upper 16 bits
+// of x and dx is the integer part of the source position and
+// the lower 16 bits are the fixed decimal part.
+
+static void ScaleARGBSimple(int src_width, int src_height,
+ int dst_width, int dst_height,
+ int src_stride, int dst_stride,
+ const uint8* src_argb, uint8* dst_argb,
+ int x, int dx, int y, int dy) {
+ int j;
+ void (*ScaleARGBCols)(uint8* dst_argb, const uint8* src_argb,
+ int dst_width, int x, int dx) =
+ (src_width >= 32768) ? ScaleARGBCols64_C : ScaleARGBCols_C;
+#if defined(HAS_SCALEARGBCOLS_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && src_width < 32768) {
+ ScaleARGBCols = ScaleARGBCols_SSE2;
+ }
+#endif
+ if (src_width * 2 == dst_width && x < 0x8000) {
+ ScaleARGBCols = ScaleARGBColsUp2_C;
+#if defined(HAS_SCALEARGBCOLSUP2_SSE2)
+ if (TestCpuFlag(kCpuHasSSE2) && IS_ALIGNED(dst_width, 8) &&
+ IS_ALIGNED(src_argb, 16) && IS_ALIGNED(src_stride, 16) &&
+ IS_ALIGNED(dst_argb, 16) && IS_ALIGNED(dst_stride, 16)) {
+ ScaleARGBCols = ScaleARGBColsUp2_SSE2;
+ }
+#endif
+ }
+
+ for (j = 0; j < dst_height; ++j) {
+ ScaleARGBCols(dst_argb, src_argb + (y >> 16) * src_stride,
+ dst_width, x, dx);
+ dst_argb += dst_stride;
+ y += dy;
+ }
+}
+
+// ScaleARGB a ARGB.
+// This function in turn calls a scaling function
+// suitable for handling the desired resolutions.
+static void ScaleARGB(const uint8* src, int src_stride,
+ int src_width, int src_height,
+ uint8* dst, int dst_stride,
+ int dst_width, int dst_height,
+ int clip_x, int clip_y, int clip_width, int clip_height,
+ enum FilterMode filtering) {
+ // Initial source x/y coordinate and step values as 16.16 fixed point.
+ int x = 0;
+ int y = 0;
+ int dx = 0;
+ int dy = 0;
+ // ARGB does not support box filter yet, but allow the user to pass it.
+ // Simplify filtering when possible.
+ filtering = ScaleFilterReduce(src_width, src_height,
+ dst_width, dst_height,
+ filtering);
+
+ // Negative src_height means invert the image.
+ if (src_height < 0) {
+ src_height = -src_height;
+ src = src + (src_height - 1) * src_stride;
+ src_stride = -src_stride;
+ }
+ ScaleSlope(src_width, src_height, dst_width, dst_height, filtering,
+ &x, &y, &dx, &dy);
+ src_width = Abs(src_width);
+ if (clip_x) {
+ int64 clipf = (int64)(clip_x) * dx;
+ x += (clipf & 0xffff);
+ src += (clipf >> 16) * 4;
+ dst += clip_x * 4;
+ }
+ if (clip_y) {
+ int64 clipf = (int64)(clip_y) * dy;
+ y += (clipf & 0xffff);
+ src += (clipf >> 16) * src_stride;
+ dst += clip_y * dst_stride;
+ }
+
+ // Special case for integer step values.
+ if (((dx | dy) & 0xffff) == 0) {
+ if (!dx || !dy) { // 1 pixel wide and/or tall.
+ filtering = kFilterNone;
+ } else {
+ // Optimized even scale down. ie 2, 4, 6, 8, 10x.
+ if (!(dx & 0x10000) && !(dy & 0x10000)) {
+ if (dx == 0x20000) {
+ // Optimized 1/2 downsample.
+ ScaleARGBDown2(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy, filtering);
+ return;
+ }
+ if (dx == 0x40000 && filtering == kFilterBox) {
+ // Optimized 1/4 box downsample.
+ ScaleARGBDown4Box(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy);
+ return;
+ }
+ ScaleARGBDownEven(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy, filtering);
+ return;
+ }
+ // Optimized odd scale down. ie 3, 5, 7, 9x.
+ if ((dx & 0x10000) && (dy & 0x10000)) {
+ filtering = kFilterNone;
+ if (dx == 0x10000 && dy == 0x10000) {
+ // Straight copy.
+ ARGBCopy(src + (y >> 16) * src_stride + (x >> 16) * 4, src_stride,
+ dst, dst_stride, clip_width, clip_height);
+ return;
+ }
+ }
+ }
+ }
+ if (dx == 0x10000 && (x & 0xffff) == 0) {
+ // Arbitrary scale vertically, but unscaled vertically.
+ ScalePlaneVertical(src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, y, dy, 4, filtering);
+ return;
+ }
+ if (filtering && dy < 65536) {
+ ScaleARGBBilinearUp(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy, filtering);
+ return;
+ }
+ if (filtering) {
+ ScaleARGBBilinearDown(src_width, src_height,
+ clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy, filtering);
+ return;
+ }
+ ScaleARGBSimple(src_width, src_height, clip_width, clip_height,
+ src_stride, dst_stride, src, dst,
+ x, dx, y, dy);
+}
+
+LIBYUV_API
+int ARGBScaleClip(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ int clip_x, int clip_y, int clip_width, int clip_height,
+ enum FilterMode filtering) {
+ if (!src_argb || src_width == 0 || src_height == 0 ||
+ !dst_argb || dst_width <= 0 || dst_height <= 0 ||
+ clip_x < 0 || clip_y < 0 ||
+ (clip_x + clip_width) > dst_width ||
+ (clip_y + clip_height) > dst_height) {
+ return -1;
+ }
+ ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+ dst_argb, dst_stride_argb, dst_width, dst_height,
+ clip_x, clip_y, clip_width, clip_height, filtering);
+ return 0;
+}
+
+// Scale an ARGB image.
+LIBYUV_API
+int ARGBScale(const uint8* src_argb, int src_stride_argb,
+ int src_width, int src_height,
+ uint8* dst_argb, int dst_stride_argb,
+ int dst_width, int dst_height,
+ enum FilterMode filtering) {
+ if (!src_argb || src_width == 0 || src_height == 0 ||
+ !dst_argb || dst_width <= 0 || dst_height <= 0) {
+ return -1;
+ }
+ ScaleARGB(src_argb, src_stride_argb, src_width, src_height,
+ dst_argb, dst_stride_argb, dst_width, dst_height,
+ 0, 0, dst_width, dst_height, filtering);
+ return 0;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
diff --git a/third_party/libyuv/source/scale_common.cc b/third_party/libyuv/source/scale_common.cc
index 595ad66ba..e4b2acc41 100644
--- a/third_party/libyuv/source/scale_common.cc
+++ b/third_party/libyuv/source/scale_common.cc
@@ -8,15 +8,15 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "third_party/libyuv/include/libyuv/scale.h"
+#include "libyuv/scale.h"
#include <assert.h>
#include <string.h>
-#include "third_party/libyuv/include/libyuv/cpu_id.h"
-#include "third_party/libyuv/include/libyuv/planar_functions.h" // CopyARGB
-#include "third_party/libyuv/include/libyuv/row.h"
-#include "third_party/libyuv/include/libyuv/scale_row.h"
+#include "libyuv/cpu_id.h"
+#include "libyuv/planar_functions.h" // For CopyARGB
+#include "libyuv/row.h"
+#include "libyuv/scale_row.h"
#ifdef __cplusplus
namespace libyuv {
diff --git a/third_party/libyuv/source/scale_mips.cc b/third_party/libyuv/source/scale_mips.cc
index 5722dea80..3eb4f27c4 100644
--- a/third_party/libyuv/source/scale_mips.cc
+++ b/third_party/libyuv/source/scale_mips.cc
@@ -8,8 +8,8 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "third_party/libyuv/include/libyuv/basic_types.h"
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/basic_types.h"
+#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
@@ -18,7 +18,8 @@ extern "C" {
// This module is for GCC MIPS DSPR2
#if !defined(LIBYUV_DISABLE_MIPS) && \
- defined(__mips_dsp) && (__mips_dsp_rev >= 2)
+ defined(__mips_dsp) && (__mips_dsp_rev >= 2) && \
+ (_MIPS_SIM == _MIPS_SIM_ABI32)
void ScaleRowDown2_MIPS_DSPR2(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst, int dst_width) {
diff --git a/third_party/libyuv/source/scale_neon.cc b/third_party/libyuv/source/scale_neon.cc
index 704cfd251..1b8a5ba58 100644
--- a/third_party/libyuv/source/scale_neon.cc
+++ b/third_party/libyuv/source/scale_neon.cc
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
@@ -28,8 +28,10 @@ void ScaleRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
".p2align 2 \n"
"1: \n"
// load even pixels into q0, odd into q1
+ MEMACCESS(0)
"vld2.8 {q0, q1}, [%0]! \n"
"subs %2, %2, #16 \n" // 16 processed per loop
+ MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store odd pixels
"bgt 1b \n"
: "+r"(src_ptr), // %0
@@ -48,7 +50,9 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"add %1, %0 \n"
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0, q1}, [%0]! \n" // load row 1 and post inc
+ MEMACCESS(1)
"vld1.8 {q2, q3}, [%1]! \n" // load row 2 and post inc
"subs %3, %3, #16 \n" // 16 processed per loop
"vpaddl.u8 q0, q0 \n" // row 1 add adjacent
@@ -57,6 +61,7 @@ void ScaleRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"vpadal.u8 q1, q3 \n"
"vrshrn.u16 d0, q0, #2 \n" // downshift, round and pack
"vrshrn.u16 d1, q1, #2 \n"
+ MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@@ -73,8 +78,10 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #8 \n" // 8 processed per loop
+ MEMACCESS(1)
"vst1.8 {d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@@ -87,16 +94,20 @@ void ScaleRowDown4_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
- asm volatile (
- "add r4, %0, %3 \n"
- "add r5, r4, %3 \n"
- "add %3, r5, %3 \n"
+ const uint8* src_ptr1 = src_ptr + src_stride;
+ const uint8* src_ptr2 = src_ptr + src_stride * 2;
+ const uint8* src_ptr3 = src_ptr + src_stride * 3;
+asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {q0}, [%0]! \n" // load up 16x4
- "vld1.8 {q1}, [r4]! \n"
- "vld1.8 {q2}, [r5]! \n"
- "vld1.8 {q3}, [%3]! \n"
+ MEMACCESS(3)
+ "vld1.8 {q1}, [%3]! \n"
+ MEMACCESS(4)
+ "vld1.8 {q2}, [%4]! \n"
+ MEMACCESS(5)
+ "vld1.8 {q3}, [%5]! \n"
"subs %2, %2, #4 \n"
"vpaddl.u8 q0, q0 \n"
"vpadal.u8 q0, q1 \n"
@@ -105,13 +116,17 @@ void ScaleRowDown4Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"vpaddl.u16 q0, q0 \n"
"vrshrn.u32 d0, q0, #4 \n" // divide by 16 w/rounding
"vmovn.u16 d0, q0 \n"
+ MEMACCESS(1)
"vst1.32 {d0[0]}, [%1]! \n"
"bgt 1b \n"
- : "+r"(src_ptr), // %0
- "+r"(dst_ptr), // %1
- "+r"(dst_width) // %2
- : "r"(src_stride) // %3
- : "r4", "r5", "q0", "q1", "q2", "q3", "memory", "cc"
+ : "+r"(src_ptr), // %0
+ "+r"(dst_ptr), // %1
+ "+r"(dst_width), // %2
+ "+r"(src_ptr1), // %3
+ "+r"(src_ptr2), // %4
+ "+r"(src_ptr3) // %5
+ :
+ : "q0", "q1", "q2", "q3", "memory", "cc"
);
}
@@ -124,9 +139,11 @@ void ScaleRowDown34_NEON(const uint8* src_ptr,
asm volatile (
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
"subs %2, %2, #24 \n"
"vmov d2, d3 \n" // order d0, d1, d2
+ MEMACCESS(1)
"vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@@ -145,7 +162,9 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ MEMACCESS(3)
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n"
@@ -182,6 +201,7 @@ void ScaleRowDown34_0_Box_NEON(const uint8* src_ptr,
"vmlal.u8 q8, d3, d24 \n"
"vqrshrn.u16 d2, q8, #2 \n"
+ MEMACCESS(1)
"vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
@@ -202,7 +222,9 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n" // src line 0
+ MEMACCESS(3)
"vld4.8 {d4, d5, d6, d7}, [%3]! \n" // src line 1
"subs %2, %2, #24 \n"
// average src line 0 with src line 1
@@ -222,6 +244,7 @@ void ScaleRowDown34_1_Box_NEON(const uint8* src_ptr,
"vmlal.u8 q3, d3, d24 \n"
"vqrshrn.u16 d2, q3, #2 \n"
+ MEMACCESS(1)
"vst3.8 {d0, d1, d2}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@@ -250,14 +273,18 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
+ MEMACCESS(3)
"vld1.8 {q3}, [%3] \n"
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d0, d1, d2, d3}, [%0]! \n"
"subs %2, %2, #12 \n"
"vtbl.u8 d4, {d0, d1, d2, d3}, d6 \n"
"vtbl.u8 d5, {d0, d1, d2, d3}, d7 \n"
+ MEMACCESS(1)
"vst1.8 {d4}, [%1]! \n"
+ MEMACCESS(1)
"vst1.32 {d5[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@@ -272,11 +299,15 @@ void ScaleRowDown38_NEON(const uint8* src_ptr,
void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
+ const uint8* src_ptr1 = src_ptr + src_stride * 2;
+
asm volatile (
- "vld1.16 {q13}, [%4] \n"
- "vld1.8 {q14}, [%5] \n"
- "vld1.8 {q15}, [%6] \n"
- "add r4, %0, %3, lsl #1 \n"
+ MEMACCESS(5)
+ "vld1.16 {q13}, [%5] \n"
+ MEMACCESS(6)
+ "vld1.8 {q14}, [%6] \n"
+ MEMACCESS(7)
+ "vld1.8 {q15}, [%7] \n"
"add %3, %0 \n"
".p2align 2 \n"
"1: \n"
@@ -285,9 +316,12 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ MEMACCESS(3)
"vld4.8 {d4, d5, d6, d7}, [%3]! \n"
- "vld4.8 {d16, d17, d18, d19}, [r4]! \n"
+ MEMACCESS(4)
+ "vld4.8 {d16, d17, d18, d19}, [%4]! \n"
"subs %2, %2, #12 \n"
// Shuffle the input data around to get align the data
@@ -364,18 +398,20 @@ void OMITFP ScaleRowDown38_3_Box_NEON(const uint8* src_ptr,
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+ MEMACCESS(1)
"vst1.8 {d3}, [%1]! \n"
+ MEMACCESS(1)
"vst1.32 {d4[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
"+r"(dst_ptr), // %1
"+r"(dst_width), // %2
- "+r"(src_stride) // %3
- : "r"(&kMult38_Div6), // %4
- "r"(&kShuf38_2), // %5
- "r"(&kMult38_Div9) // %6
- : "r4", "q0", "q1", "q2", "q3", "q8", "q9",
- "q13", "q14", "q15", "memory", "cc"
+ "+r"(src_stride), // %3
+ "+r"(src_ptr1) // %4
+ : "r"(&kMult38_Div6), // %5
+ "r"(&kShuf38_2), // %6
+ "r"(&kMult38_Div9) // %7
+ : "q0", "q1", "q2", "q3", "q8", "q9", "q13", "q14", "q15", "memory", "cc"
);
}
@@ -384,7 +420,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
ptrdiff_t src_stride,
uint8* dst_ptr, int dst_width) {
asm volatile (
+ MEMACCESS(4)
"vld1.16 {q13}, [%4] \n"
+ MEMACCESS(5)
"vld1.8 {q14}, [%5] \n"
"add %3, %0 \n"
".p2align 2 \n"
@@ -394,7 +432,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
// d1 = 10 50 11 51 12 52 13 53
// d2 = 20 60 21 61 22 62 23 63
// d3 = 30 70 31 71 32 72 33 73
+ MEMACCESS(0)
"vld4.8 {d0, d1, d2, d3}, [%0]! \n"
+ MEMACCESS(3)
"vld4.8 {d4, d5, d6, d7}, [%3]! \n"
"subs %2, %2, #12 \n"
@@ -461,7 +501,9 @@ void ScaleRowDown38_2_Box_NEON(const uint8* src_ptr,
"vtbl.u8 d3, {d0, d1, d2}, d28 \n"
"vtbl.u8 d4, {d0, d1, d2}, d29 \n"
+ MEMACCESS(1)
"vst1.8 {d3}, [%1]! \n"
+ MEMACCESS(1)
"vst1.32 {d4[0]}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@@ -494,7 +536,9 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"vdup.8 d4, %4 \n"
// General purpose row blend.
"1: \n"
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vmull.u8 q13, d0, d4 \n"
@@ -503,50 +547,63 @@ void ScaleFilterRows_NEON(uint8* dst_ptr,
"vmlal.u8 q14, d3, d5 \n"
"vrshrn.u16 d0, q13, #8 \n"
"vrshrn.u16 d1, q14, #8 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 1b \n"
"b 99f \n"
// Blend 25 / 75.
"25: \n"
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 25b \n"
"b 99f \n"
// Blend 50 / 50.
"50: \n"
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
+ MEMACCESS(2)
"vld1.8 {q1}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 50b \n"
"b 99f \n"
// Blend 75 / 25.
"75: \n"
+ MEMACCESS(1)
"vld1.8 {q1}, [%1]! \n"
+ MEMACCESS(2)
"vld1.8 {q0}, [%2]! \n"
"subs %3, %3, #16 \n"
"vrhadd.u8 q0, q1 \n"
"vrhadd.u8 q0, q1 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 75b \n"
"b 99f \n"
// Blend 100 / 0 - Copy row unchanged.
"100: \n"
+ MEMACCESS(1)
"vld1.8 {q0}, [%1]! \n"
"subs %3, %3, #16 \n"
+ MEMACCESS(0)
"vst1.8 {q0}, [%0]! \n"
"bgt 100b \n"
"99: \n"
+ MEMACCESS(0)
"vst1.8 {d1[7]}, [%0] \n"
: "+r"(dst_ptr), // %0
"+r"(src_ptr), // %1
@@ -564,10 +621,14 @@ void ScaleARGBRowDown2_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
".p2align 2 \n"
"1: \n"
// load even pixels into q0, odd into q1
+ MEMACCESS(0)
"vld2.32 {q0, q1}, [%0]! \n"
+ MEMACCESS(0)
"vld2.32 {q2, q3}, [%0]! \n"
"subs %2, %2, #8 \n" // 8 processed per loop
+ MEMACCESS(1)
"vst1.8 {q1}, [%1]! \n" // store odd pixels
+ MEMACCESS(1)
"vst1.8 {q3}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@@ -585,14 +646,18 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"add %1, %1, %0 \n"
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld4.8 {d0, d2, d4, d6}, [%0]! \n" // load 8 ARGB pixels.
+ MEMACCESS(0)
"vld4.8 {d1, d3, d5, d7}, [%0]! \n" // load next 8 ARGB pixels.
"subs %3, %3, #8 \n" // 8 processed per loop.
"vpaddl.u8 q0, q0 \n" // B 16 bytes -> 8 shorts.
"vpaddl.u8 q1, q1 \n" // G 16 bytes -> 8 shorts.
"vpaddl.u8 q2, q2 \n" // R 16 bytes -> 8 shorts.
"vpaddl.u8 q3, q3 \n" // A 16 bytes -> 8 shorts.
+ MEMACCESS(1)
"vld4.8 {d16, d18, d20, d22}, [%1]! \n" // load 8 more ARGB pixels.
+ MEMACCESS(1)
"vld4.8 {d17, d19, d21, d23}, [%1]! \n" // load last 8 ARGB pixels.
"vpadal.u8 q0, q8 \n" // B 16 bytes -> 8 shorts.
"vpadal.u8 q1, q9 \n" // G 16 bytes -> 8 shorts.
@@ -602,6 +667,7 @@ void ScaleARGBRowDown2Box_NEON(const uint8* src_ptr, ptrdiff_t src_stride,
"vrshrn.u16 d1, q1, #2 \n"
"vrshrn.u16 d2, q2, #2 \n"
"vrshrn.u16 d3, q3, #2 \n"
+ MEMACCESS(2)
"vst4.8 {d0, d1, d2, d3}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_ptr), // %0
@@ -621,11 +687,16 @@ void ScaleARGBRowDownEven_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"mov r12, %3, lsl #2 \n"
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.32 {d0[0]}, [%0], r12 \n"
+ MEMACCESS(0)
"vld1.32 {d0[1]}, [%0], r12 \n"
+ MEMACCESS(0)
"vld1.32 {d1[0]}, [%0], r12 \n"
+ MEMACCESS(0)
"vld1.32 {d1[1]}, [%0], r12 \n"
"subs %2, %2, #4 \n" // 4 pixels per loop.
+ MEMACCESS(1)
"vst1.8 {q0}, [%1]! \n"
"bgt 1b \n"
: "+r"(src_argb), // %0
@@ -646,13 +717,21 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"add %1, %1, %0 \n"
".p2align 2 \n"
"1: \n"
+ MEMACCESS(0)
"vld1.8 {d0}, [%0], r12 \n" // Read 4 2x2 blocks -> 2x1
+ MEMACCESS(1)
"vld1.8 {d1}, [%1], r12 \n"
+ MEMACCESS(0)
"vld1.8 {d2}, [%0], r12 \n"
+ MEMACCESS(1)
"vld1.8 {d3}, [%1], r12 \n"
+ MEMACCESS(0)
"vld1.8 {d4}, [%0], r12 \n"
+ MEMACCESS(1)
"vld1.8 {d5}, [%1], r12 \n"
+ MEMACCESS(0)
"vld1.8 {d6}, [%0], r12 \n"
+ MEMACCESS(1)
"vld1.8 {d7}, [%1], r12 \n"
"vaddl.u8 q0, d0, d1 \n"
"vaddl.u8 q1, d2, d3 \n"
@@ -665,6 +744,7 @@ void ScaleARGBRowDownEvenBox_NEON(const uint8* src_argb, ptrdiff_t src_stride,
"vrshrn.u16 d0, q0, #2 \n" // first 2 pixels.
"vrshrn.u16 d1, q2, #2 \n" // next 2 pixels.
"subs %3, %3, #4 \n" // 4 pixels per loop.
+ MEMACCESS(2)
"vst1.8 {q0}, [%2]! \n"
"bgt 1b \n"
: "+r"(src_argb), // %0
diff --git a/third_party/libyuv/source/scale_posix.cc b/third_party/libyuv/source/scale_posix.cc
index 18b081026..352e66782 100644
--- a/third_party/libyuv/source/scale_posix.cc
+++ b/third_party/libyuv/source/scale_posix.cc
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
diff --git a/third_party/libyuv/source/scale_win.cc b/third_party/libyuv/source/scale_win.cc
index bd5cca8af..840b9738d 100644
--- a/third_party/libyuv/source/scale_win.cc
+++ b/third_party/libyuv/source/scale_win.cc
@@ -8,7 +8,7 @@
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "third_party/libyuv/include/libyuv/row.h"
+#include "libyuv/row.h"
#ifdef __cplusplus
namespace libyuv {
diff --git a/third_party/libyuv/source/video_common.cc b/third_party/libyuv/source/video_common.cc
new file mode 100644
index 000000000..efbedf46e
--- /dev/null
+++ b/third_party/libyuv/source/video_common.cc
@@ -0,0 +1,64 @@
+/*
+ * Copyright 2011 The LibYuv Project Authors. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#include "libyuv/video_common.h"
+
+#ifdef __cplusplus
+namespace libyuv {
+extern "C" {
+#endif
+
+#define ARRAY_SIZE(x) (int)(sizeof(x) / sizeof(x[0]))
+
+struct FourCCAliasEntry {
+ uint32 alias;
+ uint32 canonical;
+};
+
+static const struct FourCCAliasEntry kFourCCAliases[] = {
+ {FOURCC_IYUV, FOURCC_I420},
+ {FOURCC_YU16, FOURCC_I422},
+ {FOURCC_YU24, FOURCC_I444},
+ {FOURCC_YUYV, FOURCC_YUY2},
+ {FOURCC_YUVS, FOURCC_YUY2}, // kCMPixelFormat_422YpCbCr8_yuvs
+ {FOURCC_HDYC, FOURCC_UYVY},
+ {FOURCC_2VUY, FOURCC_UYVY}, // kCMPixelFormat_422YpCbCr8
+ {FOURCC_JPEG, FOURCC_MJPG}, // Note: JPEG has DHT while MJPG does not.
+ {FOURCC_DMB1, FOURCC_MJPG},
+ {FOURCC_BA81, FOURCC_BGGR},
+ {FOURCC_RGB3, FOURCC_RAW },
+ {FOURCC_BGR3, FOURCC_24BG},
+ {FOURCC_CM32, FOURCC_BGRA}, // kCMPixelFormat_32ARGB
+ {FOURCC_CM24, FOURCC_RAW }, // kCMPixelFormat_24RGB
+ {FOURCC_L555, FOURCC_RGBO}, // kCMPixelFormat_16LE555
+ {FOURCC_L565, FOURCC_RGBP}, // kCMPixelFormat_16LE565
+ {FOURCC_5551, FOURCC_RGBO}, // kCMPixelFormat_16LE5551
+};
+// TODO(fbarchard): Consider mapping kCMPixelFormat_32BGRA to FOURCC_ARGB.
+// {FOURCC_BGRA, FOURCC_ARGB}, // kCMPixelFormat_32BGRA
+
+LIBYUV_API
+uint32 CanonicalFourCC(uint32 fourcc) {
+ int i;
+ for (i = 0; i < ARRAY_SIZE(kFourCCAliases); ++i) {
+ if (kFourCCAliases[i].alias == fourcc) {
+ return kFourCCAliases[i].canonical;
+ }
+ }
+ // Not an alias, so return it as-is.
+ return fourcc;
+}
+
+#ifdef __cplusplus
+} // extern "C"
+} // namespace libyuv
+#endif
+
diff --git a/third_party/libyuv/source/x86inc.asm b/third_party/libyuv/source/x86inc.asm
new file mode 100644
index 000000000..cb5c32df3
--- /dev/null
+++ b/third_party/libyuv/source/x86inc.asm
@@ -0,0 +1,1136 @@
+;*****************************************************************************
+;* x86inc.asm: x264asm abstraction layer
+;*****************************************************************************
+;* Copyright (C) 2005-2012 x264 project
+;*
+;* Authors: Loren Merritt <lorenm@u.washington.edu>
+;* Anton Mitrofanov <BugMaster@narod.ru>
+;* Jason Garrett-Glaser <darkshikari@gmail.com>
+;* Henrik Gramner <hengar-6@student.ltu.se>
+;*
+;* Permission to use, copy, modify, and/or distribute this software for any
+;* purpose with or without fee is hereby granted, provided that the above
+;* copyright notice and this permission notice appear in all copies.
+;*
+;* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+;* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+;* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+;* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+;* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+;* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+;* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+;*****************************************************************************
+
+; This is a header file for the x264ASM assembly language, which uses
+; NASM/YASM syntax combined with a large number of macros to provide easy
+; abstraction between different calling conventions (x86_32, win64, linux64).
+; It also has various other useful features to simplify writing the kind of
+; DSP functions that are most often used in x264.
+
+; Unlike the rest of x264, this file is available under an ISC license, as it
+; has significant usefulness outside of x264 and we want it to be available
+; to the largest audience possible. Of course, if you modify it for your own
+; purposes to add a new feature, we strongly encourage contributing a patch
+; as this feature might be useful for others as well. Send patches or ideas
+; to x264-devel@videolan.org .
+
+; Local changes for libyuv:
+; remove %define program_name and references in labels
+; rename cpus to uppercase
+
+%define WIN64 0
+%define UNIX64 0
+%if ARCH_X86_64
+ %ifidn __OUTPUT_FORMAT__,win32
+ %define WIN64 1
+ %elifidn __OUTPUT_FORMAT__,win64
+ %define WIN64 1
+ %else
+ %define UNIX64 1
+ %endif
+%endif
+
+%ifdef PREFIX
+ %define mangle(x) _ %+ x
+%else
+ %define mangle(x) x
+%endif
+
+; Name of the .rodata section.
+; Kludge: Something on OS X fails to align .rodata even given an align attribute,
+; so use a different read-only section.
+%macro SECTION_RODATA 0-1 16
+ %ifidn __OUTPUT_FORMAT__,macho64
+ SECTION .text align=%1
+ %elifidn __OUTPUT_FORMAT__,macho
+ SECTION .text align=%1
+ fakegot:
+ %elifidn __OUTPUT_FORMAT__,aout
+ section .text
+ %else
+ SECTION .rodata align=%1
+ %endif
+%endmacro
+
+; aout does not support align=
+%macro SECTION_TEXT 0-1 16
+ %ifidn __OUTPUT_FORMAT__,aout
+ SECTION .text
+ %else
+ SECTION .text align=%1
+ %endif
+%endmacro
+
+%if WIN64
+ %define PIC
+%elif ARCH_X86_64 == 0
+; x86_32 doesn't require PIC.
+; Some distros prefer shared objects to be PIC, but nothing breaks if
+; the code contains a few textrels, so we'll skip that complexity.
+ %undef PIC
+%endif
+%ifdef PIC
+ default rel
+%endif
+
+; Always use long nops (reduces 0x90 spam in disassembly on x86_32)
+CPU amdnop
+
+; Macros to eliminate most code duplication between x86_32 and x86_64:
+; Currently this works only for leaf functions which load all their arguments
+; into registers at the start, and make no other use of the stack. Luckily that
+; covers most of x264's asm.
+
+; PROLOGUE:
+; %1 = number of arguments. loads them from stack if needed.
+; %2 = number of registers used. pushes callee-saved regs if needed.
+; %3 = number of xmm registers used. pushes callee-saved xmm regs if needed.
+; %4 = list of names to define to registers
+; PROLOGUE can also be invoked by adding the same options to cglobal
+
+; e.g.
+; cglobal foo, 2,3,0, dst, src, tmp
+; declares a function (foo), taking two args (dst and src) and one local variable (tmp)
+
+; TODO Some functions can use some args directly from the stack. If they're the
+; last args then you can just not declare them, but if they're in the middle
+; we need more flexible macro.
+
+; RET:
+; Pops anything that was pushed by PROLOGUE, and returns.
+
+; REP_RET:
+; Same, but if it doesn't pop anything it becomes a 2-byte ret, for athlons
+; which are slow when a normal ret follows a branch.
+
+; registers:
+; rN and rNq are the native-size register holding function argument N
+; rNd, rNw, rNb are dword, word, and byte size
+; rNh is the high 8 bits of the word size
+; rNm is the original location of arg N (a register or on the stack), dword
+; rNmp is native size
+
+%macro DECLARE_REG 2-3
+ %define r%1q %2
+ %define r%1d %2d
+ %define r%1w %2w
+ %define r%1b %2b
+ %define r%1h %2h
+ %if %0 == 2
+ %define r%1m %2d
+ %define r%1mp %2
+ %elif ARCH_X86_64 ; memory
+ %define r%1m [rsp + stack_offset + %3]
+ %define r%1mp qword r %+ %1m
+ %else
+ %define r%1m [esp + stack_offset + %3]
+ %define r%1mp dword r %+ %1m
+ %endif
+ %define r%1 %2
+%endmacro
+
+%macro DECLARE_REG_SIZE 3
+ %define r%1q r%1
+ %define e%1q r%1
+ %define r%1d e%1
+ %define e%1d e%1
+ %define r%1w %1
+ %define e%1w %1
+ %define r%1h %3
+ %define e%1h %3
+ %define r%1b %2
+ %define e%1b %2
+%if ARCH_X86_64 == 0
+ %define r%1 e%1
+%endif
+%endmacro
+
+DECLARE_REG_SIZE ax, al, ah
+DECLARE_REG_SIZE bx, bl, bh
+DECLARE_REG_SIZE cx, cl, ch
+DECLARE_REG_SIZE dx, dl, dh
+DECLARE_REG_SIZE si, sil, null
+DECLARE_REG_SIZE di, dil, null
+DECLARE_REG_SIZE bp, bpl, null
+
+; t# defines for when per-arch register allocation is more complex than just function arguments
+
+%macro DECLARE_REG_TMP 1-*
+ %assign %%i 0
+ %rep %0
+ CAT_XDEFINE t, %%i, r%1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro DECLARE_REG_TMP_SIZE 0-*
+ %rep %0
+ %define t%1q t%1 %+ q
+ %define t%1d t%1 %+ d
+ %define t%1w t%1 %+ w
+ %define t%1h t%1 %+ h
+ %define t%1b t%1 %+ b
+ %rotate 1
+ %endrep
+%endmacro
+
+DECLARE_REG_TMP_SIZE 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
+
+%if ARCH_X86_64
+ %define gprsize 8
+%else
+ %define gprsize 4
+%endif
+
+%macro PUSH 1
+ push %1
+ %assign stack_offset stack_offset+gprsize
+%endmacro
+
+%macro POP 1
+ pop %1
+ %assign stack_offset stack_offset-gprsize
+%endmacro
+
+%macro PUSH_IF_USED 1-*
+ %rep %0
+ %if %1 < regs_used
+ PUSH r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro POP_IF_USED 1-*
+ %rep %0
+ %if %1 < regs_used
+ pop r%1
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro LOAD_IF_USED 1-*
+ %rep %0
+ %if %1 < num_args
+ mov r%1, r %+ %1 %+ mp
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+%macro SUB 2
+ sub %1, %2
+ %ifidn %1, rsp
+ %assign stack_offset stack_offset+(%2)
+ %endif
+%endmacro
+
+%macro ADD 2
+ add %1, %2
+ %ifidn %1, rsp
+ %assign stack_offset stack_offset-(%2)
+ %endif
+%endmacro
+
+%macro movifnidn 2
+ %ifnidn %1, %2
+ mov %1, %2
+ %endif
+%endmacro
+
+%macro movsxdifnidn 2
+ %ifnidn %1, %2
+ movsxd %1, %2
+ %endif
+%endmacro
+
+%macro ASSERT 1
+ %if (%1) == 0
+ %error assert failed
+ %endif
+%endmacro
+
+%macro DEFINE_ARGS 0-*
+ %ifdef n_arg_names
+ %assign %%i 0
+ %rep n_arg_names
+ CAT_UNDEF arg_name %+ %%i, q
+ CAT_UNDEF arg_name %+ %%i, d
+ CAT_UNDEF arg_name %+ %%i, w
+ CAT_UNDEF arg_name %+ %%i, h
+ CAT_UNDEF arg_name %+ %%i, b
+ CAT_UNDEF arg_name %+ %%i, m
+ CAT_UNDEF arg_name %+ %%i, mp
+ CAT_UNDEF arg_name, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+
+ %xdefine %%stack_offset stack_offset
+ %undef stack_offset ; so that the current value of stack_offset doesn't get baked in by xdefine
+ %assign %%i 0
+ %rep %0
+ %xdefine %1q r %+ %%i %+ q
+ %xdefine %1d r %+ %%i %+ d
+ %xdefine %1w r %+ %%i %+ w
+ %xdefine %1h r %+ %%i %+ h
+ %xdefine %1b r %+ %%i %+ b
+ %xdefine %1m r %+ %%i %+ m
+ %xdefine %1mp r %+ %%i %+ mp
+ CAT_XDEFINE arg_name, %%i, %1
+ %assign %%i %%i+1
+ %rotate 1
+ %endrep
+ %xdefine stack_offset %%stack_offset
+ %assign n_arg_names %0
+%endmacro
+
+%if WIN64 ; Windows x64 ;=================================================
+
+DECLARE_REG 0, rcx
+DECLARE_REG 1, rdx
+DECLARE_REG 2, R8
+DECLARE_REG 3, R9
+DECLARE_REG 4, R10, 40
+DECLARE_REG 5, R11, 48
+DECLARE_REG 6, rax, 56
+DECLARE_REG 7, rdi, 64
+DECLARE_REG 8, rsi, 72
+DECLARE_REG 9, rbx, 80
+DECLARE_REG 10, rbp, 88
+DECLARE_REG 11, R12, 96
+DECLARE_REG 12, R13, 104
+DECLARE_REG 13, R14, 112
+DECLARE_REG 14, R15, 120
+
+%macro PROLOGUE 2-4+ 0 ; #args, #regs, #xmm_regs, arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 7, 8, 9, 10, 11, 12, 13, 14
+ %if mmsize == 8
+ %assign xmm_regs_used 0
+ %else
+ WIN64_SPILL_XMM %3
+ %endif
+ LOAD_IF_USED 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14
+ DEFINE_ARGS %4
+%endmacro
+
+%macro WIN64_SPILL_XMM 1
+ %assign xmm_regs_used %1
+ ASSERT xmm_regs_used <= 16
+ %if xmm_regs_used > 6
+ SUB rsp, (xmm_regs_used-6)*16+16
+ %assign %%i xmm_regs_used
+ %rep (xmm_regs_used-6)
+ %assign %%i %%i-1
+ movdqa [rsp + (%%i-6)*16+(~stack_offset&8)], xmm %+ %%i
+ %endrep
+ %endif
+%endmacro
+
+%macro WIN64_RESTORE_XMM_INTERNAL 1
+ %if xmm_regs_used > 6
+ %assign %%i xmm_regs_used
+ %rep (xmm_regs_used-6)
+ %assign %%i %%i-1
+ movdqa xmm %+ %%i, [%1 + (%%i-6)*16+(~stack_offset&8)]
+ %endrep
+ add %1, (xmm_regs_used-6)*16+16
+ %endif
+%endmacro
+
+%macro WIN64_RESTORE_XMM 1
+ WIN64_RESTORE_XMM_INTERNAL %1
+ %assign stack_offset stack_offset-(xmm_regs_used-6)*16+16
+ %assign xmm_regs_used 0
+%endmacro
+
+%define has_epilogue regs_used > 7 || xmm_regs_used > 6 || mmsize == 32
+
+%macro RET 0
+ WIN64_RESTORE_XMM_INTERNAL rsp
+ POP_IF_USED 14, 13, 12, 11, 10, 9, 8, 7
+%if mmsize == 32
+ vzeroupper
+%endif
+ ret
+%endmacro
+
+%elif ARCH_X86_64 ; *nix x64 ;=============================================
+
+DECLARE_REG 0, rdi
+DECLARE_REG 1, rsi
+DECLARE_REG 2, rdx
+DECLARE_REG 3, rcx
+DECLARE_REG 4, R8
+DECLARE_REG 5, R9
+DECLARE_REG 6, rax, 8
+DECLARE_REG 7, R10, 16
+DECLARE_REG 8, R11, 24
+DECLARE_REG 9, rbx, 32
+DECLARE_REG 10, rbp, 40
+DECLARE_REG 11, R12, 48
+DECLARE_REG 12, R13, 56
+DECLARE_REG 13, R14, 64
+DECLARE_REG 14, R15, 72
+
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ ASSERT regs_used >= num_args
+ ASSERT regs_used <= 15
+ PUSH_IF_USED 9, 10, 11, 12, 13, 14
+ LOAD_IF_USED 6, 7, 8, 9, 10, 11, 12, 13, 14
+ DEFINE_ARGS %4
+%endmacro
+
+%define has_epilogue regs_used > 9 || mmsize == 32
+
+%macro RET 0
+ POP_IF_USED 14, 13, 12, 11, 10, 9
+%if mmsize == 32
+ vzeroupper
+%endif
+ ret
+%endmacro
+
+%else ; X86_32 ;==============================================================
+
+DECLARE_REG 0, eax, 4
+DECLARE_REG 1, ecx, 8
+DECLARE_REG 2, edx, 12
+DECLARE_REG 3, ebx, 16
+DECLARE_REG 4, esi, 20
+DECLARE_REG 5, edi, 24
+DECLARE_REG 6, ebp, 28
+%define rsp esp
+
+%macro DECLARE_ARG 1-*
+ %rep %0
+ %define r%1m [esp + stack_offset + 4*%1 + 4]
+ %define r%1mp dword r%1m
+ %rotate 1
+ %endrep
+%endmacro
+
+DECLARE_ARG 7, 8, 9, 10, 11, 12, 13, 14
+
+%macro PROLOGUE 2-4+ ; #args, #regs, #xmm_regs, arg_names...
+ %assign num_args %1
+ %assign regs_used %2
+ %if regs_used > 7
+ %assign regs_used 7
+ %endif
+ ASSERT regs_used >= num_args
+ PUSH_IF_USED 3, 4, 5, 6
+ LOAD_IF_USED 0, 1, 2, 3, 4, 5, 6
+ DEFINE_ARGS %4
+%endmacro
+
+%define has_epilogue regs_used > 3 || mmsize == 32
+
+%macro RET 0
+ POP_IF_USED 6, 5, 4, 3
+%if mmsize == 32
+ vzeroupper
+%endif
+ ret
+%endmacro
+
+%endif ;======================================================================
+
+%if WIN64 == 0
+%macro WIN64_SPILL_XMM 1
+%endmacro
+%macro WIN64_RESTORE_XMM 1
+%endmacro
+%endif
+
+%macro REP_RET 0
+ %if has_epilogue
+ RET
+ %else
+ rep ret
+ %endif
+%endmacro
+
+%macro TAIL_CALL 2 ; callee, is_nonadjacent
+ %if has_epilogue
+ call %1
+ RET
+ %elif %2
+ jmp %1
+ %endif
+%endmacro
+
+;=============================================================================
+; arch-independent part
+;=============================================================================
+
+%assign function_align 16
+
+; Begin a function.
+; Applies any symbol mangling needed for C linkage, and sets up a define such that
+; subsequent uses of the function name automatically refer to the mangled version.
+; Appends cpuflags to the function name if cpuflags has been specified.
+%macro cglobal 1-2+ ; name, [PROLOGUE args]
+%if %0 == 1
+ cglobal_internal %1 %+ SUFFIX
+%else
+ cglobal_internal %1 %+ SUFFIX, %2
+%endif
+%endmacro
+%macro cglobal_internal 1-2+
+ %ifndef cglobaled_%1
+ %xdefine %1 mangle(%1)
+ %xdefine %1.skip_prologue %1 %+ .skip_prologue
+ CAT_XDEFINE cglobaled_, %1, 1
+ %endif
+ %xdefine current_function %1
+ %ifidn __OUTPUT_FORMAT__,elf
+ global %1:function hidden
+ %else
+ global %1
+ %endif
+ align function_align
+ %1:
+ RESET_MM_PERMUTATION ; not really needed, but makes disassembly somewhat nicer
+ %assign stack_offset 0
+ %if %0 > 1
+ PROLOGUE %2
+ %endif
+%endmacro
+
+%macro cextern 1
+ %xdefine %1 mangle(%1)
+ CAT_XDEFINE cglobaled_, %1, 1
+ extern %1
+%endmacro
+
+; like cextern, but without the prefix
+%macro cextern_naked 1
+ %xdefine %1 mangle(%1)
+ CAT_XDEFINE cglobaled_, %1, 1
+ extern %1
+%endmacro
+
+%macro const 2+
+ %xdefine %1 mangle(%1)
+ global %1
+ %1: %2
+%endmacro
+
+; This is needed for ELF, otherwise the GNU linker assumes the stack is
+; executable by default.
+%ifidn __OUTPUT_FORMAT__,elf
+SECTION .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf32
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+%ifidn __OUTPUT_FORMAT__,elf64
+section .note.GNU-stack noalloc noexec nowrite progbits
+%endif
+
+; cpuflags
+
+%assign cpuflags_MMX (1<<0)
+%assign cpuflags_MMX2 (1<<1) | cpuflags_MMX
+%assign cpuflags_3dnow (1<<2) | cpuflags_MMX
+%assign cpuflags_3dnow2 (1<<3) | cpuflags_3dnow
+%assign cpuflags_SSE (1<<4) | cpuflags_MMX2
+%assign cpuflags_SSE2 (1<<5) | cpuflags_SSE
+%assign cpuflags_SSE2slow (1<<6) | cpuflags_SSE2
+%assign cpuflags_SSE3 (1<<7) | cpuflags_SSE2
+%assign cpuflags_SSSE3 (1<<8) | cpuflags_SSE3
+%assign cpuflags_SSE4 (1<<9) | cpuflags_SSSE3
+%assign cpuflags_SSE42 (1<<10)| cpuflags_SSE4
+%assign cpuflags_AVX (1<<11)| cpuflags_SSE42
+%assign cpuflags_xop (1<<12)| cpuflags_AVX
+%assign cpuflags_fma4 (1<<13)| cpuflags_AVX
+%assign cpuflags_AVX2 (1<<14)| cpuflags_AVX
+%assign cpuflags_fma3 (1<<15)| cpuflags_AVX
+
+%assign cpuflags_cache32 (1<<16)
+%assign cpuflags_cache64 (1<<17)
+%assign cpuflags_slowctz (1<<18)
+%assign cpuflags_lzcnt (1<<19)
+%assign cpuflags_misalign (1<<20)
+%assign cpuflags_aligned (1<<21) ; not a cpu feature, but a function variant
+%assign cpuflags_atom (1<<22)
+%assign cpuflags_bmi1 (1<<23)
+%assign cpuflags_bmi2 (1<<24)|cpuflags_bmi1
+%assign cpuflags_tbm (1<<25)|cpuflags_bmi1
+
+%define cpuflag(x) ((cpuflags & (cpuflags_ %+ x)) == (cpuflags_ %+ x))
+%define notcpuflag(x) ((cpuflags & (cpuflags_ %+ x)) != (cpuflags_ %+ x))
+
+; Takes up to 2 cpuflags from the above list.
+; All subsequent functions (up to the next INIT_CPUFLAGS) is built for the specified cpu.
+; You shouldn't need to invoke this macro directly, it's a subroutine for INIT_MMX &co.
+%macro INIT_CPUFLAGS 0-2
+ %if %0 >= 1
+ %xdefine cpuname %1
+ %assign cpuflags cpuflags_%1
+ %if %0 >= 2
+ %xdefine cpuname %1_%2
+ %assign cpuflags cpuflags | cpuflags_%2
+ %endif
+ %xdefine SUFFIX _ %+ cpuname
+ %if cpuflag(AVX)
+ %assign AVX_enabled 1
+ %endif
+ %if mmsize == 16 && notcpuflag(SSE2)
+ %define mova movaps
+ %define movu movups
+ %define movnta movntps
+ %endif
+ %if cpuflag(aligned)
+ %define movu mova
+ %elifidn %1, SSE3
+ %define movu lddqu
+ %endif
+ %else
+ %xdefine SUFFIX
+ %undef cpuname
+ %undef cpuflags
+ %endif
+%endmacro
+
+; merge MMX and SSE*
+
+%macro CAT_XDEFINE 3
+ %xdefine %1%2 %3
+%endmacro
+
+%macro CAT_UNDEF 2
+ %undef %1%2
+%endmacro
+
+%macro INIT_MMX 0-1+
+ %assign AVX_enabled 0
+ %define RESET_MM_PERMUTATION INIT_MMX %1
+ %define mmsize 8
+ %define num_mmregs 8
+ %define mova movq
+ %define movu movq
+ %define movh movd
+ %define movnta movntq
+ %assign %%i 0
+ %rep 8
+ CAT_XDEFINE m, %%i, mm %+ %%i
+ CAT_XDEFINE nmm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %rep 8
+ CAT_UNDEF m, %%i
+ CAT_UNDEF nmm, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_XMM 0-1+
+ %assign AVX_enabled 0
+ %define RESET_MM_PERMUTATION INIT_XMM %1
+ %define mmsize 16
+ %define num_mmregs 8
+ %if ARCH_X86_64
+ %define num_mmregs 16
+ %endif
+ %define mova movdqa
+ %define movu movdqu
+ %define movh movq
+ %define movnta movntdq
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, xmm %+ %%i
+ CAT_XDEFINE nxmm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+%endmacro
+
+%macro INIT_YMM 0-1+
+ %assign AVX_enabled 1
+ %define RESET_MM_PERMUTATION INIT_YMM %1
+ %define mmsize 32
+ %define num_mmregs 8
+ %if ARCH_X86_64
+ %define num_mmregs 16
+ %endif
+ %define mova vmovaps
+ %define movu vmovups
+ %undef movh
+ %define movnta vmovntps
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, ymm %+ %%i
+ CAT_XDEFINE nymm, %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ INIT_CPUFLAGS %1
+%endmacro
+
+INIT_XMM
+
+; I often want to use macros that permute their arguments. e.g. there's no
+; efficient way to implement butterfly or transpose or dct without swapping some
+; arguments.
+;
+; I would like to not have to manually keep track of the permutations:
+; If I insert a permutation in the middle of a function, it should automatically
+; change everything that follows. For more complex macros I may also have multiple
+; implementations, e.g. the SSE2 and SSSE3 versions may have different permutations.
+;
+; Hence these macros. Insert a PERMUTE or some SWAPs at the end of a macro that
+; permutes its arguments. It's equivalent to exchanging the contents of the
+; registers, except that this way you exchange the register names instead, so it
+; doesn't cost any cycles.
+
+%macro PERMUTE 2-* ; takes a list of pairs to swap
+%rep %0/2
+ %xdefine tmp%2 m%2
+ %xdefine ntmp%2 nm%2
+ %rotate 2
+%endrep
+%rep %0/2
+ %xdefine m%1 tmp%2
+ %xdefine nm%1 ntmp%2
+ %undef tmp%2
+ %undef ntmp%2
+ %rotate 2
+%endrep
+%endmacro
+
+%macro SWAP 2-* ; swaps a single chain (sometimes more concise than pairs)
+%rep %0-1
+%ifdef m%1
+ %xdefine tmp m%1
+ %xdefine m%1 m%2
+ %xdefine m%2 tmp
+ CAT_XDEFINE n, m%1, %1
+ CAT_XDEFINE n, m%2, %2
+%else
+ ; If we were called as "SWAP m0,m1" rather than "SWAP 0,1" infer the original numbers here.
+ ; Be careful using this mode in nested macros though, as in some cases there may be
+ ; other copies of m# that have already been dereferenced and don't get updated correctly.
+ %xdefine %%n1 n %+ %1
+ %xdefine %%n2 n %+ %2
+ %xdefine tmp m %+ %%n1
+ CAT_XDEFINE m, %%n1, m %+ %%n2
+ CAT_XDEFINE m, %%n2, tmp
+ CAT_XDEFINE n, m %+ %%n1, %%n1
+ CAT_XDEFINE n, m %+ %%n2, %%n2
+%endif
+ %undef tmp
+ %rotate 1
+%endrep
+%endmacro
+
+; If SAVE_MM_PERMUTATION is placed at the end of a function, then any later
+; calls to that function will automatically load the permutation, so values can
+; be returned in mmregs.
+%macro SAVE_MM_PERMUTATION 0-1
+ %if %0
+ %xdefine %%f %1_m
+ %else
+ %xdefine %%f current_function %+ _m
+ %endif
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE %%f, %%i, m %+ %%i
+ %assign %%i %%i+1
+ %endrep
+%endmacro
+
+%macro LOAD_MM_PERMUTATION 1 ; name to load from
+ %ifdef %1_m0
+ %assign %%i 0
+ %rep num_mmregs
+ CAT_XDEFINE m, %%i, %1_m %+ %%i
+ CAT_XDEFINE n, m %+ %%i, %%i
+ %assign %%i %%i+1
+ %endrep
+ %endif
+%endmacro
+
+; Append cpuflags to the callee's name iff the appended name is known and the plain name isn't
+%macro call 1
+ call_internal %1, %1 %+ SUFFIX
+%endmacro
+%macro call_internal 2
+ %xdefine %%i %1
+ %ifndef cglobaled_%1
+ %ifdef cglobaled_%2
+ %xdefine %%i %2
+ %endif
+ %endif
+ call %%i
+ LOAD_MM_PERMUTATION %%i
+%endmacro
+
+; Substitutions that reduce instruction size but are functionally equivalent
+%macro add 2
+ %ifnum %2
+ %if %2==128
+ sub %1, -128
+ %else
+ add %1, %2
+ %endif
+ %else
+ add %1, %2
+ %endif
+%endmacro
+
+%macro sub 2
+ %ifnum %2
+ %if %2==128
+ add %1, -128
+ %else
+ sub %1, %2
+ %endif
+ %else
+ sub %1, %2
+ %endif
+%endmacro
+
+;=============================================================================
+; AVX abstraction layer
+;=============================================================================
+
+%assign i 0
+%rep 16
+ %if i < 8
+ CAT_XDEFINE sizeofmm, i, 8
+ %endif
+ CAT_XDEFINE sizeofxmm, i, 16
+ CAT_XDEFINE sizeofymm, i, 32
+%assign i i+1
+%endrep
+%undef i
+
+%macro CHECK_AVX_INSTR_EMU 3-*
+ %xdefine %%opcode %1
+ %xdefine %%dst %2
+ %rep %0-2
+ %ifidn %%dst, %3
+ %error non-AVX emulation of ``%%opcode'' is not supported
+ %endif
+ %rotate 1
+ %endrep
+%endmacro
+
+;%1 == instruction
+;%2 == 1 if float, 0 if int
+;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
+;%4 == number of operands given
+;%5+: operands
+%macro RUN_AVX_INSTR 6-7+
+ %ifid %6
+ %define %%sizeofreg sizeof%6
+ %elifid %5
+ %define %%sizeofreg sizeof%5
+ %else
+ %define %%sizeofreg mmsize
+ %endif
+ %if %%sizeofreg==32
+ %if %4>=3
+ v%1 %5, %6, %7
+ %else
+ v%1 %5, %6
+ %endif
+ %else
+ %if %%sizeofreg==8
+ %define %%regmov movq
+ %elif %2
+ %define %%regmov movaps
+ %else
+ %define %%regmov movdqa
+ %endif
+
+ %if %4>=3+%3
+ %ifnidn %5, %6
+ %if AVX_enabled && %%sizeofreg==16
+ v%1 %5, %6, %7
+ %else
+ CHECK_AVX_INSTR_EMU {%1 %5, %6, %7}, %5, %7
+ %%regmov %5, %6
+ %1 %5, %7
+ %endif
+ %else
+ %1 %5, %7
+ %endif
+ %elif %4>=3
+ %1 %5, %6, %7
+ %else
+ %1 %5, %6
+ %endif
+ %endif
+%endmacro
+
+; 3arg AVX ops with a memory arg can only have it in src2,
+; whereas SSE emulation of 3arg prefers to have it in src1 (i.e. the mov).
+; So, if the op is symmetric and the wrong one is memory, swap them.
+%macro RUN_AVX_INSTR1 8
+ %assign %%swap 0
+ %if AVX_enabled
+ %ifnid %6
+ %assign %%swap 1
+ %endif
+ %elifnidn %5, %6
+ %ifnid %7
+ %assign %%swap 1
+ %endif
+ %endif
+ %if %%swap && %3 == 0 && %8 == 1
+ RUN_AVX_INSTR %1, %2, %3, %4, %5, %7, %6
+ %else
+ RUN_AVX_INSTR %1, %2, %3, %4, %5, %6, %7
+ %endif
+%endmacro
+
+;%1 == instruction
+;%2 == 1 if float, 0 if int
+;%3 == 1 if 4-operand (xmm, xmm, xmm, imm), 0 if 2- or 3-operand (xmm, xmm, xmm)
+;%4 == 1 if symmetric (i.e. doesn't matter which src arg is which), 0 if not
+%macro AVX_INSTR 4
+ %macro %1 2-9 fnord, fnord, fnord, %1, %2, %3, %4
+ %ifidn %3, fnord
+ RUN_AVX_INSTR %6, %7, %8, 2, %1, %2
+ %elifidn %4, fnord
+ RUN_AVX_INSTR1 %6, %7, %8, 3, %1, %2, %3, %9
+ %elifidn %5, fnord
+ RUN_AVX_INSTR %6, %7, %8, 4, %1, %2, %3, %4
+ %else
+ RUN_AVX_INSTR %6, %7, %8, 5, %1, %2, %3, %4, %5
+ %endif
+ %endmacro
+%endmacro
+
+AVX_INSTR addpd, 1, 0, 1
+AVX_INSTR addps, 1, 0, 1
+AVX_INSTR addsd, 1, 0, 1
+AVX_INSTR addss, 1, 0, 1
+AVX_INSTR addsubpd, 1, 0, 0
+AVX_INSTR addsubps, 1, 0, 0
+AVX_INSTR andpd, 1, 0, 1
+AVX_INSTR andps, 1, 0, 1
+AVX_INSTR andnpd, 1, 0, 0
+AVX_INSTR andnps, 1, 0, 0
+AVX_INSTR blendpd, 1, 0, 0
+AVX_INSTR blendps, 1, 0, 0
+AVX_INSTR blendvpd, 1, 0, 0
+AVX_INSTR blendvps, 1, 0, 0
+AVX_INSTR cmppd, 1, 0, 0
+AVX_INSTR cmpps, 1, 0, 0
+AVX_INSTR cmpsd, 1, 0, 0
+AVX_INSTR cmpss, 1, 0, 0
+AVX_INSTR cvtdq2ps, 1, 0, 0
+AVX_INSTR cvtps2dq, 1, 0, 0
+AVX_INSTR divpd, 1, 0, 0
+AVX_INSTR divps, 1, 0, 0
+AVX_INSTR divsd, 1, 0, 0
+AVX_INSTR divss, 1, 0, 0
+AVX_INSTR dppd, 1, 1, 0
+AVX_INSTR dpps, 1, 1, 0
+AVX_INSTR haddpd, 1, 0, 0
+AVX_INSTR haddps, 1, 0, 0
+AVX_INSTR hsubpd, 1, 0, 0
+AVX_INSTR hsubps, 1, 0, 0
+AVX_INSTR maxpd, 1, 0, 1
+AVX_INSTR maxps, 1, 0, 1
+AVX_INSTR maxsd, 1, 0, 1
+AVX_INSTR maxss, 1, 0, 1
+AVX_INSTR minpd, 1, 0, 1
+AVX_INSTR minps, 1, 0, 1
+AVX_INSTR minsd, 1, 0, 1
+AVX_INSTR minss, 1, 0, 1
+AVX_INSTR movhlps, 1, 0, 0
+AVX_INSTR movlhps, 1, 0, 0
+AVX_INSTR movsd, 1, 0, 0
+AVX_INSTR movss, 1, 0, 0
+AVX_INSTR mpsadbw, 0, 1, 0
+AVX_INSTR mulpd, 1, 0, 1
+AVX_INSTR mulps, 1, 0, 1
+AVX_INSTR mulsd, 1, 0, 1
+AVX_INSTR mulss, 1, 0, 1
+AVX_INSTR orpd, 1, 0, 1
+AVX_INSTR orps, 1, 0, 1
+AVX_INSTR pabsb, 0, 0, 0
+AVX_INSTR pabsw, 0, 0, 0
+AVX_INSTR pabsd, 0, 0, 0
+AVX_INSTR packsswb, 0, 0, 0
+AVX_INSTR packssdw, 0, 0, 0
+AVX_INSTR packuswb, 0, 0, 0
+AVX_INSTR packusdw, 0, 0, 0
+AVX_INSTR paddb, 0, 0, 1
+AVX_INSTR paddw, 0, 0, 1
+AVX_INSTR paddd, 0, 0, 1
+AVX_INSTR paddq, 0, 0, 1
+AVX_INSTR paddsb, 0, 0, 1
+AVX_INSTR paddsw, 0, 0, 1
+AVX_INSTR paddusb, 0, 0, 1
+AVX_INSTR paddusw, 0, 0, 1
+AVX_INSTR palignr, 0, 1, 0
+AVX_INSTR pand, 0, 0, 1
+AVX_INSTR pandn, 0, 0, 0
+AVX_INSTR pavgb, 0, 0, 1
+AVX_INSTR pavgw, 0, 0, 1
+AVX_INSTR pblendvb, 0, 0, 0
+AVX_INSTR pblendw, 0, 1, 0
+AVX_INSTR pcmpestri, 0, 0, 0
+AVX_INSTR pcmpestrm, 0, 0, 0
+AVX_INSTR pcmpistri, 0, 0, 0
+AVX_INSTR pcmpistrm, 0, 0, 0
+AVX_INSTR pcmpeqb, 0, 0, 1
+AVX_INSTR pcmpeqw, 0, 0, 1
+AVX_INSTR pcmpeqd, 0, 0, 1
+AVX_INSTR pcmpeqq, 0, 0, 1
+AVX_INSTR pcmpgtb, 0, 0, 0
+AVX_INSTR pcmpgtw, 0, 0, 0
+AVX_INSTR pcmpgtd, 0, 0, 0
+AVX_INSTR pcmpgtq, 0, 0, 0
+AVX_INSTR phaddw, 0, 0, 0
+AVX_INSTR phaddd, 0, 0, 0
+AVX_INSTR phaddsw, 0, 0, 0
+AVX_INSTR phsubw, 0, 0, 0
+AVX_INSTR phsubd, 0, 0, 0
+AVX_INSTR phsubsw, 0, 0, 0
+AVX_INSTR pmaddwd, 0, 0, 1
+AVX_INSTR pmaddubsw, 0, 0, 0
+AVX_INSTR pmaxsb, 0, 0, 1
+AVX_INSTR pmaxsw, 0, 0, 1
+AVX_INSTR pmaxsd, 0, 0, 1
+AVX_INSTR pmaxub, 0, 0, 1
+AVX_INSTR pmaxuw, 0, 0, 1
+AVX_INSTR pmaxud, 0, 0, 1
+AVX_INSTR pminsb, 0, 0, 1
+AVX_INSTR pminsw, 0, 0, 1
+AVX_INSTR pminsd, 0, 0, 1
+AVX_INSTR pminub, 0, 0, 1
+AVX_INSTR pminuw, 0, 0, 1
+AVX_INSTR pminud, 0, 0, 1
+AVX_INSTR pmovmskb, 0, 0, 0
+AVX_INSTR pmulhuw, 0, 0, 1
+AVX_INSTR pmulhrsw, 0, 0, 1
+AVX_INSTR pmulhw, 0, 0, 1
+AVX_INSTR pmullw, 0, 0, 1
+AVX_INSTR pmulld, 0, 0, 1
+AVX_INSTR pmuludq, 0, 0, 1
+AVX_INSTR pmuldq, 0, 0, 1
+AVX_INSTR por, 0, 0, 1
+AVX_INSTR psadbw, 0, 0, 1
+AVX_INSTR pshufb, 0, 0, 0
+AVX_INSTR pshufd, 0, 1, 0
+AVX_INSTR pshufhw, 0, 1, 0
+AVX_INSTR pshuflw, 0, 1, 0
+AVX_INSTR psignb, 0, 0, 0
+AVX_INSTR psignw, 0, 0, 0
+AVX_INSTR psignd, 0, 0, 0
+AVX_INSTR psllw, 0, 0, 0
+AVX_INSTR pslld, 0, 0, 0
+AVX_INSTR psllq, 0, 0, 0
+AVX_INSTR pslldq, 0, 0, 0
+AVX_INSTR psraw, 0, 0, 0
+AVX_INSTR psrad, 0, 0, 0
+AVX_INSTR psrlw, 0, 0, 0
+AVX_INSTR psrld, 0, 0, 0
+AVX_INSTR psrlq, 0, 0, 0
+AVX_INSTR psrldq, 0, 0, 0
+AVX_INSTR psubb, 0, 0, 0
+AVX_INSTR psubw, 0, 0, 0
+AVX_INSTR psubd, 0, 0, 0
+AVX_INSTR psubq, 0, 0, 0
+AVX_INSTR psubsb, 0, 0, 0
+AVX_INSTR psubsw, 0, 0, 0
+AVX_INSTR psubusb, 0, 0, 0
+AVX_INSTR psubusw, 0, 0, 0
+AVX_INSTR ptest, 0, 0, 0
+AVX_INSTR punpckhbw, 0, 0, 0
+AVX_INSTR punpckhwd, 0, 0, 0
+AVX_INSTR punpckhdq, 0, 0, 0
+AVX_INSTR punpckhqdq, 0, 0, 0
+AVX_INSTR punpcklbw, 0, 0, 0
+AVX_INSTR punpcklwd, 0, 0, 0
+AVX_INSTR punpckldq, 0, 0, 0
+AVX_INSTR punpcklqdq, 0, 0, 0
+AVX_INSTR pxor, 0, 0, 1
+AVX_INSTR shufps, 1, 1, 0
+AVX_INSTR subpd, 1, 0, 0
+AVX_INSTR subps, 1, 0, 0
+AVX_INSTR subsd, 1, 0, 0
+AVX_INSTR subss, 1, 0, 0
+AVX_INSTR unpckhpd, 1, 0, 0
+AVX_INSTR unpckhps, 1, 0, 0
+AVX_INSTR unpcklpd, 1, 0, 0
+AVX_INSTR unpcklps, 1, 0, 0
+AVX_INSTR xorpd, 1, 0, 1
+AVX_INSTR xorps, 1, 0, 1
+
+; 3DNow instructions, for sharing code between AVX, SSE and 3DN
+AVX_INSTR pfadd, 1, 0, 1
+AVX_INSTR pfsub, 1, 0, 0
+AVX_INSTR pfmul, 1, 0, 1
+
+; base-4 constants for shuffles
+%assign i 0
+%rep 256
+ %assign j ((i>>6)&3)*1000 + ((i>>4)&3)*100 + ((i>>2)&3)*10 + (i&3)
+ %if j < 10
+ CAT_XDEFINE q000, j, i
+ %elif j < 100
+ CAT_XDEFINE q00, j, i
+ %elif j < 1000
+ CAT_XDEFINE q0, j, i
+ %else
+ CAT_XDEFINE q, j, i
+ %endif
+%assign i i+1
+%endrep
+%undef i
+%undef j
+
+%macro FMA_INSTR 3
+ %macro %1 4-7 %1, %2, %3
+ %if cpuflag(xop)
+ v%5 %1, %2, %3, %4
+ %else
+ %6 %1, %2, %3
+ %7 %1, %4
+ %endif
+ %endmacro
+%endmacro
+
+FMA_INSTR pmacsdd, pmulld, paddd
+FMA_INSTR pmacsww, pmullw, paddw
+FMA_INSTR pmadcswd, pmaddwd, paddd
+
+; tzcnt is equivalent to "rep bsf" and is backwards-compatible with bsf.
+; This lets us use tzcnt without bumping the yasm version requirement yet.
+%define tzcnt rep bsf