Add 2D-specific Neon horizontal convolution functions

2D 8-tap convolution filtering is performed in two passes - horizontal and vertical. The horizontal pass must produce enough input data for the subsequent vertical pass - 3 rows above and 4 rows below, in addition to the actual block height. At present, all Neon horizontal convolution algorithms process 4 rows at a time, which means we compute at least one unnecessary row in the 2D first-pass case, where we need h + 7 rows of output rather than h + 8. This patch adds additional dot-product (SDOT and USDOT) Neon paths that process exactly h + 7 rows of data, saving the work of the unnecessary extra row. It is impractical to take a similar approach for the Armv8.0 MLA paths since we have to transpose the data block both before and after calling the convolution helper functions. vpx_convolve_neon performance impact: we observe a speedup of ~9% for smaller (and wider) blocks, and a speedup of 0-3% for larger blocks. This is to be expected since the proportion of redundant work decreases as the block height increases. Change-Id: Ie77ad1848707d2d48bb8851345a469aae9d097e1
author: Jonathan Wright <jonathan.wright@arm.com> 2023-05-04 16:33:38 +0100
committer: Jonathan Wright <jonathan.wright@arm.com> 2023-05-13 20:43:20 +0100
commit: 3e1e38d1176c34f71a87f8402c07cdcc2e20083e (patch)
tree: f11c4c4b8acb0fad783e0cc37b47a6da0df482c4 /vpx_dsp/arm/vpx_convolve_neon.c
parent: 8ecf58432118b672fe3f4a54725bc63caac262aa (diff)
download: libvpx-3e1e38d1176c34f71a87f8402c07cdcc2e20083e.tar
libvpx-3e1e38d1176c34f71a87f8402c07cdcc2e20083e.tar.gz
libvpx-3e1e38d1176c34f71a87f8402c07cdcc2e20083e.tar.bz2
libvpx-3e1e38d1176c34f71a87f8402c07cdcc2e20083e.zip
1 files changed, 55 insertions, 0 deletions
diff --git a/vpx_dsp/arm/vpx_convolve_neon.c b/vpx_dsp/arm/vpx_convolve_neon.c
index 830f3176d..f7db3e6a9 100644
--- a/vpx_dsp/arm/vpx_convolve_neon.c
+++ b/vpx_dsp/arm/vpx_convolve_neon.c
@@ -14,6 +14,57 @@
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_ports/mem.h"
 
+#if VPX_ARCH_AARCH64 && \
+    (defined(__ARM_FEATURE_DOTPROD) || defined(__ARM_FEATURE_MATMUL_INT8))
+#include "vpx_dsp/arm/vpx_convolve8_neon.h"
+
+void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                        ptrdiff_t dst_stride, const InterpKernel *filter,
+                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
+                        int w, int h) {
+  /* Scratch for the horizontal pass. With w <= 64, h <= 64 and 8 taps the
+   * worst case is 64 * (64 + 7) bytes; rows are stored with stride w. */
+  uint8_t temp[64 * 71];
+
+  /* The vertical pass needs 3 extra rows above and 4 below the block. */
+  const int intermediate_height = h + 7;
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  /* Horizontal pass: start 3 rows above the block, h + 7 rows out. */
+  vpx_convolve8_2d_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,
+                              x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+                              intermediate_height);
+
+  /* Vertical pass: skip the 3 lead-in rows of temp to reach block row 0. */
+  vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
+                          x_step_q4, y0_q4, y_step_q4, w, h);
+}
+
+void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *filter, int x0_q4,
+                            int x_step_q4, int y0_q4, int y_step_q4, int w,
+                            int h) {
+  uint8_t temp[64 * 71]; /* Horizontal-pass output, row stride w. */
+  const int intermediate_height = h + 7; /* h plus 3 rows above, 4 below. */
+
+  assert(y_step_q4 == 16);
+  assert(x_step_q4 == 16);
+
+  vpx_convolve8_2d_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter,
+                              x0_q4, x_step_q4, y0_q4, y_step_q4, w,
+                              intermediate_height); /* From 3 rows above. */
+
+  vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
+                              x_step_q4, y0_q4, y_step_q4, w, h);
+}
+
+#else  // !(VPX_ARCH_AARCH64 &&
+       //   (defined(__ARM_FEATURE_DOTPROD) ||
+       //    defined(__ARM_FEATURE_MATMUL_INT8)))
+
 void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                         ptrdiff_t dst_stride, const InterpKernel *filter,
                         int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
@@ -63,3 +114,7 @@ void vpx_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
   vpx_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter, x0_q4,
                               x_step_q4, y0_q4, y_step_q4, w, h);
 }
+
+#endif  // #if VPX_ARCH_AARCH64 &&
+        //     (defined(__ARM_FEATURE_DOTPROD) ||
+        //      defined(__ARM_FEATURE_MATMUL_INT8))
author	Jonathan Wright <jonathan.wright@arm.com>	2023-05-04 16:33:38 +0100
committer	Jonathan Wright <jonathan.wright@arm.com>	2023-05-13 20:43:20 +0100
commit	3e1e38d1176c34f71a87f8402c07cdcc2e20083e (patch)
tree	f11c4c4b8acb0fad783e0cc37b47a6da0df482c4 /vpx_dsp/arm/vpx_convolve_neon.c
parent	8ecf58432118b672fe3f4a54725bc63caac262aa (diff)
download	libvpx-3e1e38d1176c34f71a87f8402c07cdcc2e20083e.tar libvpx-3e1e38d1176c34f71a87f8402c07cdcc2e20083e.tar.gz libvpx-3e1e38d1176c34f71a87f8402c07cdcc2e20083e.tar.bz2 libvpx-3e1e38d1176c34f71a87f8402c07cdcc2e20083e.zip