summary refs log tree commit diff
path: root/vpx_dsp
diff options
context:
space:
mode:
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- vpx_dsp/arm/sad4d_neon.c 122
-rw-r--r-- vpx_dsp/mips/sad_msa.c 32
-rw-r--r-- vpx_dsp/sad.c 48
-rw-r--r-- vpx_dsp/vpx_dsp_rtcd_defs.pl 74
-rw-r--r-- vpx_dsp/x86/sad4d_avx2.c 13
-rw-r--r-- vpx_dsp/x86/sad4d_avx512.c 6
6 files changed, 155 insertions, 140 deletions
diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c
index 5c7a0fcaf..03f716c3d 100644
--- a/vpx_dsp/arm/sad4d_neon.c
+++ b/vpx_dsp/arm/sad4d_neon.c
@@ -31,7 +31,7 @@ static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0,
static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride,
const uint8_t *const ref_array[4],
const int ref_stride, const int height,
- uint32_t *const res) {
+ uint32_t sad_array[4]) {
int i;
uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
#if !defined(__aarch64__)
@@ -61,26 +61,26 @@ static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride,
a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1]));
r = vpaddlq_u16(vcombine_u16(a[0], a[1]));
#endif
- vst1q_u32(res, r);
+ vst1q_u32(sad_array, r);
}
void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res) {
- sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, res);
+ uint32_t sad_array[4]) {
+ sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, sad_array);
}
void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res) {
- sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, res);
+ uint32_t sad_array[4]) {
+ sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, sad_array);
}
////////////////////////////////////////////////////////////////////////////////
// Can handle 512 pixels' sad sum (such as 16x32 or 32x16)
-static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/,
- uint32_t *const res) {
+static INLINE void sad_512_pel_final_neon(const uint16x8_t sum[4],
+ uint32_t sad_array[4]) {
#if defined(__aarch64__)
const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
@@ -95,21 +95,21 @@ static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/,
const uint16x4_t b1 = vpadd_u16(a2, a3);
const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1));
#endif
- vst1q_u32(res, r);
+ vst1q_u32(sad_array, r);
}
#if defined(__arm__) || !defined(__ARM_FEATURE_DOTPROD)
// Can handle 1024 pixels' sad sum (such as 32x32)
-static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/,
- uint32_t *const res) {
+static INLINE void sad_1024_pel_final_neon(const uint16x8_t sum[4],
+ uint32_t sad_array[4]) {
#if defined(__aarch64__)
const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
const uint32x4_t b0 = vpaddlq_u16(a0);
const uint32x4_t b1 = vpaddlq_u16(a1);
const uint32x4_t r = vpaddq_u32(b0, b1);
- vst1q_u32(res, r);
+ vst1q_u32(sad_array, r);
#else
const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
@@ -119,13 +119,13 @@ static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/,
const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3));
const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0));
const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1));
- vst1q_u32(res, vcombine_u32(c0, c1));
+ vst1q_u32(sad_array, vcombine_u32(c0, c1));
#endif
}
// Can handle 2048 pixels' sad sum (such as 32x64 or 64x32)
-static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/,
- uint32_t *const res) {
+static INLINE void sad_2048_pel_final_neon(const uint16x8_t sum[4],
+ uint32_t sad_array[4]) {
#if defined(__aarch64__)
const uint32x4_t a0 = vpaddlq_u16(sum[0]);
const uint32x4_t a1 = vpaddlq_u16(sum[1]);
@@ -134,7 +134,7 @@ static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/,
const uint32x4_t b0 = vpaddq_u32(a0, a1);
const uint32x4_t b1 = vpaddq_u32(a2, a3);
const uint32x4_t r = vpaddq_u32(b0, b1);
- vst1q_u32(res, r);
+ vst1q_u32(sad_array, r);
#else
const uint32x4_t a0 = vpaddlq_u16(sum[0]);
const uint32x4_t a1 = vpaddlq_u16(sum[1]);
@@ -146,13 +146,13 @@ static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/,
const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3));
const uint32x2_t c0 = vpadd_u32(b0, b1);
const uint32x2_t c1 = vpadd_u32(b2, b3);
- vst1q_u32(res, vcombine_u32(c0, c1));
+ vst1q_u32(sad_array, vcombine_u32(c0, c1));
#endif
}
// Can handle 4096 pixels' sad sum (such as 64x64)
-static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/,
- uint32_t *const res) {
+static INLINE void sad_4096_pel_final_neon(const uint16x8_t sum[8],
+ uint32_t sad_array[4]) {
#if defined(__aarch64__)
const uint32x4_t a0 = vpaddlq_u16(sum[0]);
const uint32x4_t a1 = vpaddlq_u16(sum[1]);
@@ -169,7 +169,7 @@ static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/,
const uint32x4_t c0 = vpaddq_u32(b0, b1);
const uint32x4_t c1 = vpaddq_u32(b2, b3);
const uint32x4_t r = vpaddq_u32(c0, c1);
- vst1q_u32(res, r);
+ vst1q_u32(sad_array, r);
#else
const uint32x4_t a0 = vpaddlq_u16(sum[0]);
const uint32x4_t a1 = vpaddlq_u16(sum[1]);
@@ -189,7 +189,7 @@ static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/,
const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3));
const uint32x2_t d0 = vpadd_u32(c0, c1);
const uint32x2_t d1 = vpadd_u32(c2, c3);
- vst1q_u32(res, vcombine_u32(d0, d1));
+ vst1q_u32(sad_array, vcombine_u32(d0, d1));
#endif
}
@@ -197,7 +197,7 @@ static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/,
static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res, const int height) {
+ uint32_t sad_array[4], const int height) {
int i, j;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
@@ -214,25 +214,25 @@ static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride,
}
}
- sad_512_pel_final_neon(sum, res);
+ sad_512_pel_final_neon(sum, sad_array);
}
void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res) {
- sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 4);
+ uint32_t sad_array[4]) {
+ sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 4);
}
void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res) {
- sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8);
+ uint32_t sad_array[4]) {
+ sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8);
}
void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res) {
- sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);
+ uint32_t sad_array[4]) {
+ sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16);
}
////////////////////////////////////////////////////////////////////////////////
@@ -249,7 +249,7 @@ static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res, const int height) {
+ uint32_t sad_array[4], const int height) {
int i;
uint32x4_t r0, r1;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
@@ -267,7 +267,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
r0 = vpaddq_u32(sum[0], sum[1]);
r1 = vpaddq_u32(sum[2], sum[3]);
- vst1q_u32(res, vpaddq_u32(r0, r1));
+ vst1q_u32(sad_array, vpaddq_u32(r0, r1));
}
#else
@@ -281,7 +281,7 @@ static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr,
static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res, const int height) {
+ uint32_t sad_array[4], const int height) {
int i;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
@@ -302,27 +302,27 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
ref_loop[3] += ref_stride;
}
- sad_512_pel_final_neon(sum, res);
+ sad_512_pel_final_neon(sum, sad_array);
}
#endif
void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res) {
- sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8);
+ uint32_t sad_array[4]) {
+ sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8);
}
void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res) {
- sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);
+ uint32_t sad_array[4]) {
+ sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16);
}
void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res) {
- sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32);
+ uint32_t sad_array[4]) {
+ sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32);
}
////////////////////////////////////////////////////////////////////////////////
@@ -332,7 +332,7 @@ void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride,
static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res, const int height) {
+ uint32_t sad_array[4], const int height) {
int i;
uint32x4_t r0, r1;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
@@ -365,25 +365,25 @@ static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
r0 = vpaddq_u32(sum[0], sum[1]);
r1 = vpaddq_u32(sum[2], sum[3]);
- vst1q_u32(res, vpaddq_u32(r0, r1));
+ vst1q_u32(sad_array, vpaddq_u32(r0, r1));
}
void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res) {
- sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);
+ uint32_t sad_array[4]) {
+ sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16);
}
void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res) {
- sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32);
+ uint32_t sad_array[4]) {
+ sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32);
}
void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res) {
- sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 64);
+ uint32_t sad_array[4]) {
+ sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64);
}
#else
@@ -422,26 +422,26 @@ static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res) {
+ uint32_t sad_array[4]) {
uint16x8_t sum[4];
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum);
- sad_512_pel_final_neon(sum, res);
+ sad_512_pel_final_neon(sum, sad_array);
}
void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res) {
+ uint32_t sad_array[4]) {
uint16x8_t sum[4];
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum);
- sad_1024_pel_final_neon(sum, res);
+ sad_1024_pel_final_neon(sum, sad_array);
}
void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res) {
+ uint32_t sad_array[4]) {
uint16x8_t sum[4];
sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum);
- sad_2048_pel_final_neon(sum, res);
+ sad_2048_pel_final_neon(sum, sad_array);
}
#endif
@@ -453,7 +453,7 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res) {
+ uint32_t sad_array[4]) {
int i;
uint32x4_t r0, r1;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
@@ -497,12 +497,12 @@ void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
r0 = vpaddq_u32(sum[0], sum[1]);
r1 = vpaddq_u32(sum[2], sum[3]);
- vst1q_u32(res, vpaddq_u32(r0, r1));
+ vst1q_u32(sad_array, vpaddq_u32(r0, r1));
}
void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res) {
+ uint32_t sad_array[4]) {
int i;
uint32x4_t r0, r1, r2, r3;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
@@ -551,14 +551,14 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
r3 = vpaddq_u32(sum[6], sum[7]);
r0 = vpaddq_u32(r0, r1);
r1 = vpaddq_u32(r2, r3);
- vst1q_u32(res, vpaddq_u32(r0, r1));
+ vst1q_u32(sad_array, vpaddq_u32(r0, r1));
}
#else
void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res) {
+ uint32_t sad_array[4]) {
int i;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
@@ -599,12 +599,12 @@ void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
ref_loop[3] += ref_stride;
}
- sad_2048_pel_final_neon(sum, res);
+ sad_2048_pel_final_neon(sum, sad_array);
}
void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
const uint8_t *const ref_array[4], int ref_stride,
- uint32_t *res) {
+ uint32_t sad_array[4]) {
int i;
const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
ref_array[3] };
@@ -646,7 +646,7 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
ref_loop[3] += ref_stride;
}
- sad_4096_pel_final_neon(sum, res);
+ sad_4096_pel_final_neon(sum, sad_array);
}
#endif
diff --git a/vpx_dsp/mips/sad_msa.c b/vpx_dsp/mips/sad_msa.c
index ab681ae9f..e3e91c433 100644
--- a/vpx_dsp/mips/sad_msa.c
+++ b/vpx_dsp/mips/sad_msa.c
@@ -1040,77 +1040,77 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride,
#define VPX_SAD_4xHEIGHTx3_MSA(height) \
void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
+ uint32_t sads[3]) { \
sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
}
#define VPX_SAD_8xHEIGHTx3_MSA(height) \
void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
+ uint32_t sads[3]) { \
sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
}
#define VPX_SAD_16xHEIGHTx3_MSA(height) \
void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
+ uint32_t sads[3]) { \
sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \
}
#define VPX_SAD_4xHEIGHTx8_MSA(height) \
void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
+ uint32_t sads[8]) { \
sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
}
#define VPX_SAD_8xHEIGHTx8_MSA(height) \
void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
+ uint32_t sads[8]) { \
sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
}
#define VPX_SAD_16xHEIGHTx8_MSA(height) \
void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
const uint8_t *ref, int32_t ref_stride, \
- uint32_t *sads) { \
+ uint32_t sads[8]) { \
sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \
}
#define VPX_SAD_4xHEIGHTx4D_MSA(height) \
void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *const refs[], \
- int32_t ref_stride, uint32_t *sads) { \
+ const uint8_t *const refs[4], \
+ int32_t ref_stride, uint32_t sads[4]) { \
sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
}
#define VPX_SAD_8xHEIGHTx4D_MSA(height) \
void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *const refs[], \
- int32_t ref_stride, uint32_t *sads) { \
+ const uint8_t *const refs[4], \
+ int32_t ref_stride, uint32_t sads[4]) { \
sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
}
#define VPX_SAD_16xHEIGHTx4D_MSA(height) \
void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *const refs[], \
- int32_t ref_stride, uint32_t *sads) { \
+ const uint8_t *const refs[4], \
+ int32_t ref_stride, uint32_t sads[4]) { \
sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
}
#define VPX_SAD_32xHEIGHTx4D_MSA(height) \
void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *const refs[], \
- int32_t ref_stride, uint32_t *sads) { \
+ const uint8_t *const refs[4], \
+ int32_t ref_stride, uint32_t sads[4]) { \
sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
}
#define VPX_SAD_64xHEIGHTx4D_MSA(height) \
void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
- const uint8_t *const refs[], \
- int32_t ref_stride, uint32_t *sads) { \
+ const uint8_t *const refs[4], \
+ int32_t ref_stride, uint32_t sads[4]) { \
sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \
}
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c
index 769322019..46d513b68 100644
--- a/vpx_dsp/sad.c
+++ b/vpx_dsp/sad.c
@@ -45,23 +45,39 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride,
return sad(src_ptr, src_stride, comp_pred, m, m, n); \
}
-// depending on call sites, pass **ref_array to avoid & in subsequent call and
-// de-dup with 4D below.
+// Compare |src_ptr| to |k| adjacent blocks starting at |ref_ptr|.
+// |k| == {3,8}. Used in vp8 for an exhaustive search.
+// src:           ref:
+//  0  1  2  3     0  1  2  3  x  x
+//  4  5  6  7     6  7  8  9  x  x
+//  8  9 10 11    12 13 14 15  x  x
+// 12 13 14 15    18 19 20 21  x  x
+//
+//                 x  1  2  3  4  x
+//                 x  7  8  9 10  x
+//                 x 13 14 15 16  x
+//                 x 19 20 21 22  x
+//
+//                 x  x  2  3  4  5
+//                 x  x  8  9 10 11
+//                 x  x 14 15 16 17
+//                 x  x 20 21 22 23
+//
#define sadMxNxK(m, n, k) \
void vpx_sad##m##x##n##x##k##_c(const uint8_t *src_ptr, int src_stride, \
const uint8_t *ref_ptr, int ref_stride, \
- uint32_t *sad_array) { \
+ uint32_t sad_array[k]) { \
int i; \
for (i = 0; i < k; ++i) \
sad_array[i] = \
- vpx_sad##m##x##n##_c(src_ptr, src_stride, &ref_ptr[i], ref_stride); \
+ vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_ptr + i, ref_stride); \
}
-// This appears to be equivalent to the above when k == 4 and refs is const
+// Compare |src_ptr| to 4 distinct references in |ref_array[]|
#define sadMxNx4D(m, n) \
void vpx_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \
- const uint8_t *const ref_array[], \
- int ref_stride, uint32_t *sad_array) { \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
int i; \
for (i = 0; i < 4; ++i) \
sad_array[i] = \
@@ -181,15 +197,15 @@ static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride,
return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n); \
}
-#define highbd_sadMxNx4D(m, n) \
- void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \
- const uint8_t *const ref_array[], \
- int ref_stride, uint32_t *sad_array) { \
- int i; \
- for (i = 0; i < 4; ++i) { \
- sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \
- ref_array[i], ref_stride); \
- } \
+#define highbd_sadMxNx4D(m, n) \
+ void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \
+ const uint8_t *const ref_array[4], \
+ int ref_stride, uint32_t sad_array[4]) { \
+ int i; \
+ for (i = 0; i < 4; ++i) { \
+ sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \
+ ref_array[i], ref_stride); \
+ } \
}
/* clang-format off */
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index 8b6672248..06a8febb2 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -877,80 +877,80 @@ specialize qw/vpx_sad4x4_avg neon msa sse2 mmi/;
# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
#
# Blocks of 3
-add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]";
specialize qw/vpx_sad16x16x3 sse3 ssse3 msa mmi/;
-add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]";
specialize qw/vpx_sad16x8x3 sse3 ssse3 msa mmi/;
-add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]";
specialize qw/vpx_sad8x16x3 sse3 msa mmi/;
-add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]";
specialize qw/vpx_sad8x8x3 sse3 msa mmi/;
-add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]";
specialize qw/vpx_sad4x4x3 sse3 msa mmi/;
# Blocks of 8
-add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]";
specialize qw/vpx_sad32x32x8 avx2/;
-add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]";
specialize qw/vpx_sad16x16x8 sse4_1 msa mmi/;
-add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]";
specialize qw/vpx_sad16x8x8 sse4_1 msa mmi/;
-add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]";
specialize qw/vpx_sad8x16x8 sse4_1 msa mmi/;
-add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]";
specialize qw/vpx_sad8x8x8 sse4_1 msa mmi/;
-add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]";
specialize qw/vpx_sad4x4x8 sse4_1 msa mmi/;
#
# Multi-block SAD, comparing a reference to N independent blocks
#
-add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/;
-add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/;
-add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_sad8x8x4d neon msa sse2 mmi/;
-add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/;
-add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/;
-add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/;
add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
@@ -1064,43 +1064,43 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
#
# Multi-block SAD, comparing a reference to N independent blocks
#
- add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_highbd_sad64x64x4d sse2/;
- add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_highbd_sad64x32x4d sse2/;
- add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_highbd_sad32x64x4d sse2/;
- add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_highbd_sad32x32x4d sse2/;
- add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_highbd_sad32x16x4d sse2/;
- add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_highbd_sad16x32x4d sse2/;
- add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_highbd_sad16x16x4d sse2/;
- add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_highbd_sad16x8x4d sse2/;
- add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_highbd_sad8x16x4d sse2/;
- add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_highbd_sad8x8x4d sse2/;
- add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_highbd_sad8x4x4d sse2/;
- add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_highbd_sad4x8x4d sse2/;
- add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+ add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
specialize qw/vpx_highbd_sad4x4x4d sse2/;
#
diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c
index 9dd066691..5f1f757e2 100644
--- a/vpx_dsp/x86/sad4d_avx2.c
+++ b/vpx_dsp/x86/sad4d_avx2.c
@@ -11,8 +11,7 @@
#include "./vpx_dsp_rtcd.h"
#include "vpx/vpx_integer.h"
-static INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
- uint32_t *sad_array) {
+static INLINE void calc_final_4(const __m256i sums[4], uint32_t sad_array[4]) {
const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
const __m256i t2 = _mm256_hadd_epi32(t0, t1);
@@ -22,8 +21,8 @@ static INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
}
void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *const ref_array[/*4*/], int ref_stride,
- uint32_t *sad_array /*[4]*/) {
+ const uint8_t *const ref_array[4], int ref_stride,
+ uint32_t sad_array[4]) {
int i;
const uint8_t *refs[4];
__m256i sums[4];
@@ -71,7 +70,7 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride,
void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride,
const uint8_t *ref_ptr, int ref_stride,
- uint32_t *sad_array) {
+ uint32_t sad_array[8]) {
int i;
__m256i sums[8];
@@ -127,8 +126,8 @@ void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride,
}
void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
- const uint8_t *const ref_array[/*4*/], int ref_stride,
- uint32_t *sad_array /*[4]*/) {
+ const uint8_t *const ref_array[4], int ref_stride,
+ uint32_t sad_array[4]) {
__m256i sums[4];
int i;
const uint8_t *refs[4];
diff --git a/vpx_dsp/x86/sad4d_avx512.c b/vpx_dsp/x86/sad4d_avx512.c
index 2fa910871..cfd23fedd 100644
--- a/vpx_dsp/x86/sad4d_avx512.c
+++ b/vpx_dsp/x86/sad4d_avx512.c
@@ -12,8 +12,8 @@
#include "vpx/vpx_integer.h"
void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride,
- const uint8_t *const ref_array[/*4*/],
- int ref_stride, uint32_t *res /*[4]*/) {
+ const uint8_t *const ref_array[4], int ref_stride,
+ uint32_t sad_array[4]) {
__m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
__m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
__m512i sum_mlow, sum_mhigh;
@@ -78,6 +78,6 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride,
sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256),
_mm256_extractf128_si256(sum256, 1));
- _mm_storeu_si128((__m128i *)(res), sum128);
+ _mm_storeu_si128((__m128i *)(sad_array), sum128);
}
}