diff options
author | Luca Barbato <lu_zero@gentoo.org> | 2018-06-06 21:10:18 +0000 |
---|---|---|
committer | Luca Barbato <lu_zero@gentoo.org> | 2018-06-08 05:26:05 +0200 |
commit | d468fd90e05ba7f5173d849c63f6a50115c9769b (patch) | |
tree | b60507eb7ba4aa901f0c116ca4785f8e4184b08b /vpx_dsp/ppc/subtract_vsx.c | |
parent | 034f94c127b4d608200d94889c52e7e42b2f62e4 (diff) | |
download | libvpx-d468fd90e05ba7f5173d849c63f6a50115c9769b.tar libvpx-d468fd90e05ba7f5173d849c63f6a50115c9769b.tar.gz libvpx-d468fd90e05ba7f5173d849c63f6a50115c9769b.tar.bz2 libvpx-d468fd90e05ba7f5173d849c63f6a50115c9769b.zip |
Implement subtract_block for VSX
~2x speedup or better.
[ RUN ] C/VP9SubtractBlockTest.Speed/0
[ BENCH ] 4x4 365.1 ms ( ±2.2 ms )
[ BENCH ] 8x4 258.5 ms ( ±0.3 ms )
[ BENCH ] 4x8 202.7 ms ( ±0.2 ms )
[ BENCH ] 8x8 162.2 ms ( ±0.5 ms )
[ BENCH ] 16x8 138.8 ms ( ±0.3 ms )
[ BENCH ] 8x16 121.5 ms ( ±0.4 ms )
[ BENCH ] 16x16 110.2 ms ( ±0.5 ms )
[ BENCH ] 32x16 104.8 ms ( ±0.1 ms )
[ BENCH ] 16x32 32.7 ms ( ±0.1 ms )
[ BENCH ] 32x32 30.0 ms ( ±0.0 ms )
[ BENCH ] 64x32 28.7 ms ( ±0.0 ms )
[ BENCH ] 32x64 20.1 ms ( ±0.0 ms )
[ BENCH ] 64x64 19.3 ms ( ±0.0 ms )
[ RUN ] VSX/VP9SubtractBlockTest.Speed/0
[ BENCH ] 4x4 155.3 ms ( ±0.9 ms )
[ BENCH ] 8x4 99.3 ms ( ±0.4 ms )
[ BENCH ] 4x8 77.2 ms ( ±0.1 ms )
[ BENCH ] 8x8 45.7 ms ( ±0.0 ms )
[ BENCH ] 16x8 34.1 ms ( ±0.0 ms )
[ BENCH ] 8x16 29.5 ms ( ±0.0 ms )
[ BENCH ] 16x16 19.9 ms ( ±0.0 ms )
[ BENCH ] 32x16 15.1 ms ( ±0.0 ms )
[ BENCH ] 16x32 16.7 ms ( ±0.0 ms )
[ BENCH ] 32x32 14.1 ms ( ±0.0 ms )
[ BENCH ] 64x32 12.6 ms ( ±0.0 ms )
[ BENCH ] 32x64 12.0 ms ( ±0.0 ms )
[ BENCH ] 64x64 11.2 ms ( ±0.0 ms )
Change-Id: I89ce12b6475871dc9e8fde84d0b6fe5c420c28c7
Diffstat (limited to 'vpx_dsp/ppc/subtract_vsx.c')
-rw-r--r-- | vpx_dsp/ppc/subtract_vsx.c | 117 |
1 files changed, 117 insertions, 0 deletions
diff --git a/vpx_dsp/ppc/subtract_vsx.c b/vpx_dsp/ppc/subtract_vsx.c new file mode 100644 index 000000000..3fd4a6a2d --- /dev/null +++ b/vpx_dsp/ppc/subtract_vsx.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2018 The WebM project authors. All Rights Reserved. + * + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file in the root of the source + * tree. An additional intellectual property rights grant can be found + * in the file PATENTS. All contributing project authors may + * be found in the AUTHORS file in the root of the source tree. + */ + +#include <assert.h> + +#include "./vpx_config.h" +#include "vpx/vpx_integer.h" +#include "vpx_dsp/ppc/types_vsx.h" + +static VPX_FORCE_INLINE void subtract_block4x4( + int16_t *diff, ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, ptrdiff_t pred_stride) { + int16_t *diff1 = diff + 2 * diff_stride; + const uint8_t *src1 = src + 2 * src_stride; + const uint8_t *pred1 = pred + 2 * pred_stride; + + const int16x8_t d0 = vec_vsx_ld(0, diff); + const int16x8_t d1 = vec_vsx_ld(0, diff + diff_stride); + const int16x8_t d2 = vec_vsx_ld(0, diff1); + const int16x8_t d3 = vec_vsx_ld(0, diff1 + diff_stride); + + const uint8x16_t s0 = read4x2(src, (int)src_stride); + const uint8x16_t p0 = read4x2(pred, (int)pred_stride); + const uint8x16_t s1 = read4x2(src1, (int)src_stride); + const uint8x16_t p1 = read4x2(pred1, (int)pred_stride); + + const int16x8_t da = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0)); + const int16x8_t db = vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1)); + + vec_vsx_st(xxpermdi(da, d0, 1), 0, diff); + vec_vsx_st(xxpermdi(da, d1, 3), 0, diff + diff_stride); + vec_vsx_st(xxpermdi(db, d2, 1), 0, diff1); + vec_vsx_st(xxpermdi(db, d3, 3), 0, diff1 + diff_stride); +} + +void vpx_subtract_block_vsx(int rows, int cols, int16_t *diff, + ptrdiff_t diff_stride, const uint8_t *src, + ptrdiff_t src_stride, const uint8_t *pred, + ptrdiff_t pred_stride) { + int r = rows, c; + + switch (cols) { + case 64: + case 32: + do { + for (c = 0; c < cols; c += 32) { + const uint8x16_t s0 = vec_vsx_ld(0, src + c); + const uint8x16_t s1 = vec_vsx_ld(16, src + c); + const uint8x16_t p0 = vec_vsx_ld(0, pred + c); + const uint8x16_t p1 = vec_vsx_ld(16, pred + c); + const int16x8_t d0l = + vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0)); + const int16x8_t d0h = + vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0)); + const int16x8_t d1l = + vec_sub(unpack_to_s16_l(s1), unpack_to_s16_l(p1)); + const int16x8_t d1h = + vec_sub(unpack_to_s16_h(s1), unpack_to_s16_h(p1)); + vec_vsx_st(d0h, 0, diff + c); + vec_vsx_st(d0l, 16, diff + c); + vec_vsx_st(d1h, 0, diff + c + 16); + vec_vsx_st(d1l, 16, diff + c + 16); + } + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + break; + case 16: + do { + const uint8x16_t s0 = vec_vsx_ld(0, src); + const uint8x16_t p0 = vec_vsx_ld(0, pred); + const int16x8_t d0l = vec_sub(unpack_to_s16_l(s0), unpack_to_s16_l(p0)); + const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0)); + vec_vsx_st(d0h, 0, diff); + vec_vsx_st(d0l, 16, diff); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + break; + case 8: + do { + const uint8x16_t s0 = vec_vsx_ld(0, src); + const uint8x16_t p0 = vec_vsx_ld(0, pred); + const int16x8_t d0h = vec_sub(unpack_to_s16_h(s0), unpack_to_s16_h(p0)); + vec_vsx_st(d0h, 0, diff); + diff += diff_stride; + pred += pred_stride; + src += src_stride; + } while (--r); + break; + case 4: + subtract_block4x4(diff, diff_stride, src, src_stride, pred, pred_stride); + if (r > 4) { + diff += 4 * diff_stride; + pred += 4 * pred_stride; + src += 4 * src_stride; + + subtract_block4x4(diff, diff_stride, + + src, src_stride, + + pred, pred_stride); + } + break; + default: + assert(0); // unreachable + } +} |