path: root/vpx_dsp/ppc
author    Luca Barbato <lu_zero@gentoo.org>  2018-05-15 00:44:06 +0000
committer Luca Barbato <lu_zero@gentoo.org>  2018-05-15 18:04:10 +0200
commit    d8c36c94807270631d375aa3eee72a87ff593958 (patch)
tree      702a38c9bdcbb87310dbc7c8712c880a333d90f3 /vpx_dsp/ppc
parent    e51c9e39bcfcf26b9031845ff7767050bcb3b059 (diff)
Add vpx_varianceNxM_vsx and vpx_mseNxM_vsx
Speedups:
  64x64: 5.9   64x32: 6.2   32x64: 5.8   32x32: 6.2   32x16: 5.1
  16x32: 3.3   16x16: 2.6   16x8:  2.6   8x16:  2.4   8x8:   2.3
  8x4:   2.1   4x8:   1.6   4x4:   1.6

Change-Id: Idfaab96c03d3d1f487301cf398da0dd47a34e887
Diffstat (limited to 'vpx_dsp/ppc')
-rw-r--r--  vpx_dsp/ppc/variance_vsx.c  174
1 file changed, 173 insertions, 1 deletion
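
Before the diff itself, a quick caller sketch may help show how the new entry points are used. The prototype below is the one the VAR(16, 16) macro in this patch generates; the buffers, the stride of 32, and the direct call (bypassing libvpx's usual RTCD dispatch) are purely illustrative.

#include <stdint.h>
#include <stdio.h>

/* Prototype as generated by VAR(16, 16) in the diff below; in libvpx it
 * comes from ./vpx_dsp_rtcd.h rather than being declared by hand. */
uint32_t vpx_variance16x16_vsx(const uint8_t *a, int a_stride,
                               const uint8_t *b, int b_stride, uint32_t *sse);

int main(void) {
  /* Two hypothetical 16x16 blocks stored with a stride of 32 bytes. */
  static uint8_t src[16 * 32];
  static uint8_t ref[16 * 32];
  uint32_t sse;
  uint32_t var;
  int i;

  for (i = 0; i < 16 * 32; ++i) {
    src[i] = (uint8_t)(i & 0xff);
    ref[i] = (uint8_t)((i + 3) & 0xff);
  }

  var = vpx_variance16x16_vsx(src, 32, ref, 32, &sse);
  printf("sse=%u variance=%u\n", sse, var);
  return 0;
}

The same calling convention applies to every vpx_varianceWxH_vsx and vpx_mseWxH_vsx variant instantiated at the bottom of the diff; only the block dimensions differ.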
diff --git a/vpx_dsp/ppc/variance_vsx.c b/vpx_dsp/ppc/variance_vsx.c
index 1efe2f005..d3f257b63 100644
--- a/vpx_dsp/ppc/variance_vsx.c
+++ b/vpx_dsp/ppc/variance_vsx.c
@@ -10,10 +10,11 @@
#include <assert.h>
+#include "./vpx_config.h"
#include "./vpx_dsp_rtcd.h"
#include "vpx_dsp/ppc/types_vsx.h"
-static inline uint8x16_t read4x2(const uint8_t *a, int stride) {
+static INLINE uint8x16_t read4x2(const uint8_t *a, int stride) {
const uint32x4_t a0 = (uint32x4_t)vec_vsx_ld(0, a);
const uint32x4_t a1 = (uint32x4_t)vec_vsx_ld(0, a + stride);
@@ -101,3 +102,174 @@ void vpx_comp_avg_pred_vsx(uint8_t *comp_pred, const uint8_t *pred, int width,
}
}
}
+
+static INLINE void variance_inner_32(const uint8_t *a, const uint8_t *b,
+ int32x4_t *sum_squared, int32x4_t *sum) {
+ int32x4_t s = *sum;
+ int32x4_t ss = *sum_squared;
+
+ const uint8x16_t va0 = vec_vsx_ld(0, a);
+ const uint8x16_t vb0 = vec_vsx_ld(0, b);
+ const uint8x16_t va1 = vec_vsx_ld(16, a);
+ const uint8x16_t vb1 = vec_vsx_ld(16, b);
+
+ const int16x8_t a0 = unpack_to_s16_h(va0);
+ const int16x8_t b0 = unpack_to_s16_h(vb0);
+ const int16x8_t a1 = unpack_to_s16_l(va0);
+ const int16x8_t b1 = unpack_to_s16_l(vb0);
+ const int16x8_t a2 = unpack_to_s16_h(va1);
+ const int16x8_t b2 = unpack_to_s16_h(vb1);
+ const int16x8_t a3 = unpack_to_s16_l(va1);
+ const int16x8_t b3 = unpack_to_s16_l(vb1);
+ const int16x8_t d0 = vec_sub(a0, b0);
+ const int16x8_t d1 = vec_sub(a1, b1);
+ const int16x8_t d2 = vec_sub(a2, b2);
+ const int16x8_t d3 = vec_sub(a3, b3);
+
+ s = vec_sum4s(d0, s);
+ ss = vec_msum(d0, d0, ss);
+ s = vec_sum4s(d1, s);
+ ss = vec_msum(d1, d1, ss);
+ s = vec_sum4s(d2, s);
+ ss = vec_msum(d2, d2, ss);
+ s = vec_sum4s(d3, s);
+ ss = vec_msum(d3, d3, ss);
+ *sum = s;
+ *sum_squared = ss;
+}
+
+static INLINE void variance(const uint8_t *a, int a_stride, const uint8_t *b,
+ int b_stride, int w, int h, uint32_t *sse,
+ int *sum) {
+ int i;
+
+ int32x4_t s = vec_splat_s32(0);
+ int32x4_t ss = vec_splat_s32(0);
+
+ switch (w) {
+ case 4:
+ for (i = 0; i < h / 2; ++i) {
+ const int16x8_t a0 = unpack_to_s16_h(read4x2(a, a_stride));
+ const int16x8_t b0 = unpack_to_s16_h(read4x2(b, b_stride));
+ const int16x8_t d = vec_sub(a0, b0);
+ s = vec_sum4s(d, s);
+ ss = vec_msum(d, d, ss);
+ a += a_stride * 2;
+ b += b_stride * 2;
+ }
+ break;
+ case 8:
+ for (i = 0; i < h; ++i) {
+ const int16x8_t a0 = unpack_to_s16_h(vec_vsx_ld(0, a));
+ const int16x8_t b0 = unpack_to_s16_h(vec_vsx_ld(0, b));
+ const int16x8_t d = vec_sub(a0, b0);
+
+ s = vec_sum4s(d, s);
+ ss = vec_msum(d, d, ss);
+ a += a_stride;
+ b += b_stride;
+ }
+ break;
+ case 16:
+ for (i = 0; i < h; ++i) {
+ const uint8x16_t va = vec_vsx_ld(0, a);
+ const uint8x16_t vb = vec_vsx_ld(0, b);
+ const int16x8_t a0 = unpack_to_s16_h(va);
+ const int16x8_t b0 = unpack_to_s16_h(vb);
+ const int16x8_t a1 = unpack_to_s16_l(va);
+ const int16x8_t b1 = unpack_to_s16_l(vb);
+ const int16x8_t d0 = vec_sub(a0, b0);
+ const int16x8_t d1 = vec_sub(a1, b1);
+
+ s = vec_sum4s(d0, s);
+ ss = vec_msum(d0, d0, ss);
+ s = vec_sum4s(d1, s);
+ ss = vec_msum(d1, d1, ss);
+
+ a += a_stride;
+ b += b_stride;
+ }
+ break;
+ case 32:
+ for (i = 0; i < h; ++i) {
+ variance_inner_32(a, b, &ss, &s);
+ a += a_stride;
+ b += b_stride;
+ }
+ break;
+ case 64:
+ for (i = 0; i < h; ++i) {
+ variance_inner_32(a, b, &ss, &s);
+ variance_inner_32(a + 32, b + 32, &ss, &s);
+
+ a += a_stride;
+ b += b_stride;
+ }
+ break;
+ }
+
+ s = vec_splat(vec_sums(s, vec_splat_s32(0)), 3);
+
+ vec_ste(s, 0, sum);
+
+ ss = vec_splat(vec_sums(ss, vec_splat_s32(0)), 3);
+
+ vec_ste((uint32x4_t)ss, 0, sse);
+}
+
+/* Identical to the variance call except it takes an additional parameter, sum,
+ * and returns that value using pass-by-reference instead of returning
+ * sse - sum^2 / (w*h).
+ */
+#define GET_VAR(W, H) \
+ void vpx_get##W##x##H##var_vsx(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse, int *sum) { \
+ variance(a, a_stride, b, b_stride, W, H, sse, sum); \
+ }
+
+/* Identical to the variance call except it does not calculate the
+ * sse - sum^2 / (w*h) and returns sse in addition to modifying the passed-in
+ * variable.
+ */
+#define MSE(W, H) \
+ uint32_t vpx_mse##W##x##H##_vsx(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse; \
+ }
+
+#define VAR(W, H) \
+ uint32_t vpx_variance##W##x##H##_vsx(const uint8_t *a, int a_stride, \
+ const uint8_t *b, int b_stride, \
+ uint32_t *sse) { \
+ int sum; \
+ variance(a, a_stride, b, b_stride, W, H, sse, &sum); \
+ return *sse - (uint32_t)(((int64_t)sum * sum) / (W * H)); \
+ }
+
+#define VARIANCES(W, H) VAR(W, H)
+
+VARIANCES(64, 64)
+VARIANCES(64, 32)
+VARIANCES(32, 64)
+VARIANCES(32, 32)
+VARIANCES(32, 16)
+VARIANCES(16, 32)
+VARIANCES(16, 16)
+VARIANCES(16, 8)
+VARIANCES(8, 16)
+VARIANCES(8, 8)
+VARIANCES(8, 4)
+VARIANCES(4, 8)
+VARIANCES(4, 4)
+
+GET_VAR(16, 16)
+GET_VAR(8, 8)
+
+MSE(16, 16)
+MSE(16, 8)
+MSE(8, 16)
+MSE(8, 8)
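
For reference, the arithmetic the VSX kernels above implement reduces to a simple scalar form: accumulate the sum and the sum of squares of the per-pixel differences, then compute variance = sse - sum^2 / (w*h), exactly as the VAR() macro does. The helper below is a plain-C sketch of that computation for illustration only; it is not part of the patch and the name variance_ref is hypothetical.

#include <stdint.h>

static uint32_t variance_ref(const uint8_t *a, int a_stride, const uint8_t *b,
                             int b_stride, int w, int h, uint32_t *sse) {
  int64_t sum = 0;
  uint64_t ss = 0;
  int i, j;

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int d = a[j] - b[j]; /* per-pixel difference */
      sum += d;
      ss += (uint64_t)(d * d);
    }
    a += a_stride;
    b += b_stride;
  }
  *sse = (uint32_t)ss;
  /* Same final step as the VAR() macro in the diff. */
  return *sse - (uint32_t)((sum * sum) / (w * h));
}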