diff options
author | Linfeng Zhang <linfengz@google.com> | 2018-05-08 17:37:18 -0700 |
---|---|---|
committer | Linfeng Zhang <linfengz@google.com> | 2018-05-08 17:37:18 -0700 |
commit | 7edb5e8a16caf79c84281c117a9b2168326f8d87 (patch) | |
tree | e938f1345d42f28af58a35d92fe3bc5490ac1edc /vpx_dsp | |
parent | 2d3e33388211d2f0539900671a87a874e25e5240 (diff) | |
download | libvpx-7edb5e8a16caf79c84281c117a9b2168326f8d87.tar libvpx-7edb5e8a16caf79c84281c117a9b2168326f8d87.tar.gz libvpx-7edb5e8a16caf79c84281c117a9b2168326f8d87.tar.bz2 libvpx-7edb5e8a16caf79c84281c117a9b2168326f8d87.zip |
Update vpx_comp_avg_pred_neon()
Separate width 4 and 8 cases to reduce jumps in loop in clang.
Change-Id: I6ffc6f1555f2ad08b72a8dba35a78b9fd5f95a73
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/arm/avg_pred_neon.c | 42 |
1 files changed, 26 insertions, 16 deletions
diff --git a/vpx_dsp/arm/avg_pred_neon.c b/vpx_dsp/arm/avg_pred_neon.c
index 1370ec2d2..5afdece0a 100644
--- a/vpx_dsp/arm/avg_pred_neon.c
+++ b/vpx_dsp/arm/avg_pred_neon.c
@@ -17,8 +17,8 @@
 void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width,
                             int height, const uint8_t *ref, int ref_stride) {
   if (width > 8) {
-    int x, y;
-    for (y = 0; y < height; ++y) {
+    int x, y = height;
+    do {
       for (x = 0; x < width; x += 16) {
         const uint8x16_t p = vld1q_u8(pred + x);
         const uint8x16_t r = vld1q_u8(ref + x);
@@ -28,28 +28,38 @@ void vpx_comp_avg_pred_neon(uint8_t *comp, const uint8_t *pred, int width,
       comp += width;
       pred += width;
       ref += ref_stride;
-    }
+    } while (--y);
+  } else if (width == 8) {
+    int i = width * height;
+    do {
+      const uint8x16_t p = vld1q_u8(pred);
+      uint8x16_t r;
+      const uint8x8_t r_0 = vld1_u8(ref);
+      const uint8x8_t r_1 = vld1_u8(ref + ref_stride);
+      r = vcombine_u8(r_0, r_1);
+      ref += 2 * ref_stride;
+      r = vrhaddq_u8(r, p);
+      vst1q_u8(comp, r);
+
+      pred += 16;
+      comp += 16;
+      i -= 16;
+    } while (i);
   } else {
-    int i;
-    for (i = 0; i < width * height; i += 16) {
+    int i = width * height;
+    assert(width == 4);
+    do {
       const uint8x16_t p = vld1q_u8(pred);
       uint8x16_t r;
 
-      if (width == 4) {
-        r = load_unaligned_u8q(ref, ref_stride);
-        ref += 4 * ref_stride;
-      } else {
-        const uint8x8_t r_0 = vld1_u8(ref);
-        const uint8x8_t r_1 = vld1_u8(ref + ref_stride);
-        assert(width == 8);
-        r = vcombine_u8(r_0, r_1);
-        ref += 2 * ref_stride;
-      }
+      r = load_unaligned_u8q(ref, ref_stride);
+      ref += 4 * ref_stride;
       r = vrhaddq_u8(r, p);
       vst1q_u8(comp, r);
 
       pred += 16;
       comp += 16;
-    }
+      i -= 16;
+    } while (i);
   }
 }