Update vpx_comp_avg_pred_neon()

Separate the width 4 and width 8 cases to reduce jumps inside the loop when compiled with clang.

Change-Id: I6ffc6f1555f2ad08b72a8dba35a78b9fd5f95a73
author: Linfeng Zhang <linfengz@google.com> 2018-05-08 17:37:18 -0700
committer: Linfeng Zhang <linfengz@google.com> 2018-05-08 17:37:18 -0700
commit: 7edb5e8a16caf79c84281c117a9b2168326f8d87 (patch)
tree: e938f1345d42f28af58a35d92fe3bc5490ac1edc /vpx_dsp
parent: 2d3e33388211d2f0539900671a87a874e25e5240 (diff)
download: libvpx-7edb5e8a16caf79c84281c117a9b2168326f8d87.tar
libvpx-7edb5e8a16caf79c84281c117a9b2168326f8d87.tar.gz
libvpx-7edb5e8a16caf79c84281c117a9b2168326f8d87.tar.bz2
libvpx-7edb5e8a16caf79c84281c117a9b2168326f8d87.zip
1 file changed, 26 insertions, 16 deletions
diff --git a/vpx_dsp/arm/avg_pred_neon.c b/vpx_dsp/arm/avg_pred_neon.c
index 1370ec2d2..5afdece0a 100644
--- a/vpx_dsp/arm/avg_pred_neon.c
+++ b/vpx_dsp/arm/avg_pred_neon.c
@@ -17,8 +17,8 @@
 void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width,
                             int height, const uint8_t *ref, int ref_stride) {
   if (width > 8) {
-    int x, y;
-    for (y = 0; y < height; ++y) {
+    int x, y = height;
+    do {
       for (x = 0; x < width; x += 16) {
         const uint8x16_t p = vld1q_u8(pred + x);
         const uint8x16_t r = vld1q_u8(ref + x);
@@ -28,28 +28,38 @@ void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width,
       comp += width;
       pred += width;
       ref += ref_stride;
-    }
+    } while (--y);
+  } else if (width == 8) {
+    int i = width * height;
+    do {
+      const uint8x16_t p = vld1q_u8(pred);
+      uint8x16_t r;
+      const uint8x8_t r_0 = vld1_u8(ref);
+      const uint8x8_t r_1 = vld1_u8(ref + ref_stride);
+      r = vcombine_u8(r_0, r_1);
+      ref += 2 * ref_stride;
+      r = vrhaddq_u8(r, p);
+      vst1q_u8(comp, r);
+
+      pred += 16;
+      comp += 16;
+      i -= 16;
+    } while (i);
   } else {
-    int i;
-    for (i = 0; i < width * height; i += 16) {
+    int i = width * height;
+    assert(width == 4);
+    do {
       const uint8x16_t p = vld1q_u8(pred);
       uint8x16_t r;
 
-      if (width == 4) {
-        r = load_unaligned_u8q(ref, ref_stride);
-        ref += 4 * ref_stride;
-      } else {
-        const uint8x8_t r_0 = vld1_u8(ref);
-        const uint8x8_t r_1 = vld1_u8(ref + ref_stride);
-        assert(width == 8);
-        r = vcombine_u8(r_0, r_1);
-        ref += 2 * ref_stride;
-      }
+      r = load_unaligned_u8q(ref, ref_stride);
+      ref += 4 * ref_stride;
       r = vrhaddq_u8(r, p);
       vst1q_u8(comp, r);
 
       pred += 16;
       comp += 16;
-    }
+      i -= 16;
+    } while (i);
   }
 }
author	Linfeng Zhang <linfengz@google.com>	2018-05-08 17:37:18 -0700
committer	Linfeng Zhang <linfengz@google.com>	2018-05-08 17:37:18 -0700
commit	7edb5e8a16caf79c84281c117a9b2168326f8d87 (patch)
tree	e938f1345d42f28af58a35d92fe3bc5490ac1edc /vpx_dsp
parent	2d3e33388211d2f0539900671a87a874e25e5240 (diff)
download	libvpx-7edb5e8a16caf79c84281c117a9b2168326f8d87.tar libvpx-7edb5e8a16caf79c84281c117a9b2168326f8d87.tar.gz libvpx-7edb5e8a16caf79c84281c117a9b2168326f8d87.tar.bz2 libvpx-7edb5e8a16caf79c84281c117a9b2168326f8d87.zip