diff options
Diffstat (limited to 'vpx_dsp')
-rw-r--r-- | vpx_dsp/arm/sad4d_neon.c | 122 | ||||
-rw-r--r-- | vpx_dsp/mips/sad_msa.c | 32 | ||||
-rw-r--r-- | vpx_dsp/sad.c | 48 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 74 | ||||
-rw-r--r-- | vpx_dsp/x86/sad4d_avx2.c | 13 | ||||
-rw-r--r-- | vpx_dsp/x86/sad4d_avx512.c | 6 |
6 files changed, 155 insertions, 140 deletions
diff --git a/vpx_dsp/arm/sad4d_neon.c b/vpx_dsp/arm/sad4d_neon.c index 5c7a0fcaf..03f716c3d 100644 --- a/vpx_dsp/arm/sad4d_neon.c +++ b/vpx_dsp/arm/sad4d_neon.c @@ -31,7 +31,7 @@ static INLINE uint8x8_t load_unaligned_2_buffers(const void *const buf0, static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride, const uint8_t *const ref_array[4], const int ref_stride, const int height, - uint32_t *const res) { + uint32_t sad_array[4]) { int i; uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) }; #if !defined(__aarch64__) @@ -61,26 +61,26 @@ static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride, a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1])); r = vpaddlq_u16(vcombine_u16(a[0], a[1])); #endif - vst1q_u32(res, r); + vst1q_u32(sad_array, r); } void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, res); + uint32_t sad_array[4]) { + sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, sad_array); } void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, res); + uint32_t sad_array[4]) { + sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, sad_array); } //////////////////////////////////////////////////////////////////////////////// // Can handle 512 pixels' sad sum (such as 16x32 or 32x16) -static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/, - uint32_t *const res) { +static INLINE void sad_512_pel_final_neon(const uint16x8_t sum[4], + uint32_t sad_array[4]) { #if defined(__aarch64__) const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); @@ -95,21 +95,21 @@ static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/, const uint16x4_t b1 = vpadd_u16(a2, a3); const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1)); #endif - vst1q_u32(res, r); + vst1q_u32(sad_array, r); } #if defined(__arm__) || !defined(__ARM_FEATURE_DOTPROD) // Can handle 1024 pixels' sad sum (such as 32x32) -static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/, - uint32_t *const res) { +static INLINE void sad_1024_pel_final_neon(const uint16x8_t sum[4], + uint32_t sad_array[4]) { #if defined(__aarch64__) const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]); const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]); const uint32x4_t b0 = vpaddlq_u16(a0); const uint32x4_t b1 = vpaddlq_u16(a1); const uint32x4_t r = vpaddq_u32(b0, b1); - vst1q_u32(res, r); + vst1q_u32(sad_array, r); #else const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0])); const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1])); @@ -119,13 +119,13 @@ static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/, const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3)); const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0)); const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1)); - vst1q_u32(res, vcombine_u32(c0, c1)); + vst1q_u32(sad_array, vcombine_u32(c0, c1)); #endif } // Can handle 2048 pixels' sad sum (such as 32x64 or 64x32) -static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/, - uint32_t *const res) { +static INLINE void sad_2048_pel_final_neon(const uint16x8_t sum[4], + uint32_t sad_array[4]) { #if defined(__aarch64__) const uint32x4_t a0 = vpaddlq_u16(sum[0]); const uint32x4_t a1 = vpaddlq_u16(sum[1]); @@ -134,7 +134,7 @@ static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/, const uint32x4_t b0 = vpaddq_u32(a0, a1); const uint32x4_t b1 = vpaddq_u32(a2, a3); const uint32x4_t r = vpaddq_u32(b0, b1); - vst1q_u32(res, r); + vst1q_u32(sad_array, r); #else const uint32x4_t a0 = vpaddlq_u16(sum[0]); const uint32x4_t a1 = vpaddlq_u16(sum[1]); @@ -146,13 +146,13 @@ static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/, const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3)); const uint32x2_t c0 = vpadd_u32(b0, b1); const uint32x2_t c1 = vpadd_u32(b2, b3); - vst1q_u32(res, vcombine_u32(c0, c1)); + vst1q_u32(sad_array, vcombine_u32(c0, c1)); #endif } // Can handle 4096 pixels' sad sum (such as 64x64) -static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, - uint32_t *const res) { +static INLINE void sad_4096_pel_final_neon(const uint16x8_t sum[8], + uint32_t sad_array[4]) { #if defined(__aarch64__) const uint32x4_t a0 = vpaddlq_u16(sum[0]); const uint32x4_t a1 = vpaddlq_u16(sum[1]); @@ -169,7 +169,7 @@ static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, const uint32x4_t c0 = vpaddq_u32(b0, b1); const uint32x4_t c1 = vpaddq_u32(b2, b3); const uint32x4_t r = vpaddq_u32(c0, c1); - vst1q_u32(res, r); + vst1q_u32(sad_array, r); #else const uint32x4_t a0 = vpaddlq_u16(sum[0]); const uint32x4_t a1 = vpaddlq_u16(sum[1]); @@ -189,7 +189,7 @@ static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3)); const uint32x2_t d0 = vpadd_u32(c0, c1); const uint32x2_t d1 = vpadd_u32(c2, c3); - vst1q_u32(res, vcombine_u32(d0, d1)); + vst1q_u32(sad_array, vcombine_u32(d0, d1)); #endif } @@ -197,7 +197,7 @@ static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/, static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res, const int height) { + uint32_t sad_array[4], const int height) { int i, j; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], ref_array[3] }; @@ -214,25 +214,25 @@ static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride, } } - sad_512_pel_final_neon(sum, res); + sad_512_pel_final_neon(sum, sad_array); } void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 4); + uint32_t sad_array[4]) { + sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 4); } void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8); + uint32_t sad_array[4]) { + sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8); } void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16); + uint32_t sad_array[4]) { + sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); } //////////////////////////////////////////////////////////////////////////////// @@ -249,7 +249,7 @@ static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res, const int height) { + uint32_t sad_array[4], const int height) { int i; uint32x4_t r0, r1; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], @@ -267,7 +267,7 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, r0 = vpaddq_u32(sum[0], sum[1]); r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(res, vpaddq_u32(r0, r1)); + vst1q_u32(sad_array, vpaddq_u32(r0, r1)); } #else @@ -281,7 +281,7 @@ static INLINE void sad16_neon(const uint8_t *ref_ptr, const uint8x16_t src_ptr, static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res, const int height) { + uint32_t sad_array[4], const int height) { int i; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], ref_array[3] }; @@ -302,27 +302,27 @@ static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride, ref_loop[3] += ref_stride; } - sad_512_pel_final_neon(sum, res); + sad_512_pel_final_neon(sum, sad_array); } #endif void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8); + uint32_t sad_array[4]) { + sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8); } void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16); + uint32_t sad_array[4]) { + sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); } void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32); + uint32_t sad_array[4]) { + sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32); } //////////////////////////////////////////////////////////////////////////////// @@ -332,7 +332,7 @@ void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res, const int height) { + uint32_t sad_array[4], const int height) { int i; uint32x4_t r0, r1; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], @@ -365,25 +365,25 @@ static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, r0 = vpaddq_u32(sum[0], sum[1]); r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(res, vpaddq_u32(r0, r1)); + vst1q_u32(sad_array, vpaddq_u32(r0, r1)); } void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16); + uint32_t sad_array[4]) { + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16); } void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32); + uint32_t sad_array[4]) { + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32); } void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { - sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 64); + uint32_t sad_array[4]) { + sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64); } #else @@ -422,26 +422,26 @@ static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride, void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { uint16x8_t sum[4]; sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum); - sad_512_pel_final_neon(sum, res); + sad_512_pel_final_neon(sum, sad_array); } void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { uint16x8_t sum[4]; sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum); - sad_1024_pel_final_neon(sum, res); + sad_1024_pel_final_neon(sum, sad_array); } void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { uint16x8_t sum[4]; sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum); - sad_2048_pel_final_neon(sum, res); + sad_2048_pel_final_neon(sum, sad_array); } #endif @@ -453,7 +453,7 @@ void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { int i; uint32x4_t r0, r1; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], @@ -497,12 +497,12 @@ void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, r0 = vpaddq_u32(sum[0], sum[1]); r1 = vpaddq_u32(sum[2], sum[3]); - vst1q_u32(res, vpaddq_u32(r0, r1)); + vst1q_u32(sad_array, vpaddq_u32(r0, r1)); } void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { int i; uint32x4_t r0, r1, r2, r3; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], @@ -551,14 +551,14 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, r3 = vpaddq_u32(sum[6], sum[7]); r0 = vpaddq_u32(r0, r1); r1 = vpaddq_u32(r2, r3); - vst1q_u32(res, vpaddq_u32(r0, r1)); + vst1q_u32(sad_array, vpaddq_u32(r0, r1)); } #else void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { int i; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], ref_array[3] }; @@ -599,12 +599,12 @@ void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, ref_loop[3] += ref_stride; } - sad_2048_pel_final_neon(sum, res); + sad_2048_pel_final_neon(sum, sad_array); } void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *const ref_array[4], int ref_stride, - uint32_t *res) { + uint32_t sad_array[4]) { int i; const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2], ref_array[3] }; @@ -646,7 +646,7 @@ void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, ref_loop[3] += ref_stride; } - sad_4096_pel_final_neon(sum, res); + sad_4096_pel_final_neon(sum, sad_array); } #endif diff --git a/vpx_dsp/mips/sad_msa.c b/vpx_dsp/mips/sad_msa.c index ab681ae9f..e3e91c433 100644 --- a/vpx_dsp/mips/sad_msa.c +++ b/vpx_dsp/mips/sad_msa.c @@ -1040,77 +1040,77 @@ static uint32_t avgsad_64width_msa(const uint8_t *src, int32_t src_stride, #define VPX_SAD_4xHEIGHTx3_MSA(height) \ void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ + uint32_t sads[3]) { \ sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ } #define VPX_SAD_8xHEIGHTx3_MSA(height) \ void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ + uint32_t sads[3]) { \ sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ } #define VPX_SAD_16xHEIGHTx3_MSA(height) \ void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ + uint32_t sads[3]) { \ sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads); \ } #define VPX_SAD_4xHEIGHTx8_MSA(height) \ void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ + uint32_t sads[8]) { \ sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ } #define VPX_SAD_8xHEIGHTx8_MSA(height) \ void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ + uint32_t sads[8]) { \ sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ } #define VPX_SAD_16xHEIGHTx8_MSA(height) \ void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \ const uint8_t *ref, int32_t ref_stride, \ - uint32_t *sads) { \ + uint32_t sads[8]) { \ sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads); \ } #define VPX_SAD_4xHEIGHTx4D_MSA(height) \ void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ } #define VPX_SAD_8xHEIGHTx4D_MSA(height) \ void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ } #define VPX_SAD_16xHEIGHTx4D_MSA(height) \ void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ } #define VPX_SAD_32xHEIGHTx4D_MSA(height) \ void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ } #define VPX_SAD_64xHEIGHTx4D_MSA(height) \ void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \ - const uint8_t *const refs[], \ - int32_t ref_stride, uint32_t *sads) { \ + const uint8_t *const refs[4], \ + int32_t ref_stride, uint32_t sads[4]) { \ sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads); \ } diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c index 769322019..46d513b68 100644 --- a/vpx_dsp/sad.c +++ b/vpx_dsp/sad.c @@ -45,23 +45,39 @@ static INLINE unsigned int sad(const uint8_t *src_ptr, int src_stride, return sad(src_ptr, src_stride, comp_pred, m, m, n); \ } -// depending on call sites, pass **ref_array to avoid & in subsequent call and -// de-dup with 4D below. +// Compare |src_ptr| to |k| adjacent blocks starting at |ref_ptr|. +// |k| == {3,8}. Used in vp8 for an exhaustive search. +// src: ref: +// 0 1 2 3 0 1 2 3 x x +// 4 5 6 7 6 7 8 9 x x +// 8 9 10 11 12 13 14 15 x x +// 12 13 14 15 18 19 20 21 x x +// +// x 1 2 3 4 x +// x 7 8 9 10 x +// x 13 14 15 16 x +// x 19 20 21 22 x +// +// x x 2 3 4 5 +// x x 8 9 10 11 +// x x 14 15 16 17 +// x x 20 21 22 23 +// #define sadMxNxK(m, n, k) \ void vpx_sad##m##x##n##x##k##_c(const uint8_t *src_ptr, int src_stride, \ const uint8_t *ref_ptr, int ref_stride, \ - uint32_t *sad_array) { \ + uint32_t sad_array[k]) { \ int i; \ for (i = 0; i < k; ++i) \ sad_array[i] = \ - vpx_sad##m##x##n##_c(src_ptr, src_stride, &ref_ptr[i], ref_stride); \ + vpx_sad##m##x##n##_c(src_ptr, src_stride, ref_ptr + i, ref_stride); \ } -// This appears to be equivalent to the above when k == 4 and refs is const +// Compare |src_ptr| to 4 distinct references in |ref_array[]| #define sadMxNx4D(m, n) \ void vpx_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *const ref_array[], \ - int ref_stride, uint32_t *sad_array) { \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ int i; \ for (i = 0; i < 4; ++i) \ sad_array[i] = \ @@ -181,15 +197,15 @@ static INLINE unsigned int highbd_sadb(const uint8_t *src8_ptr, int src_stride, return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n); \ } -#define highbd_sadMxNx4D(m, n) \ - void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ - const uint8_t *const ref_array[], \ - int ref_stride, uint32_t *sad_array) { \ - int i; \ - for (i = 0; i < 4; ++i) { \ - sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \ - ref_array[i], ref_stride); \ - } \ +#define highbd_sadMxNx4D(m, n) \ + void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \ + const uint8_t *const ref_array[4], \ + int ref_stride, uint32_t sad_array[4]) { \ + int i; \ + for (i = 0; i < 4; ++i) { \ + sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride, \ + ref_array[i], ref_stride); \ + } \ } /* clang-format off */ diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index 8b6672248..06a8febb2 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -877,80 +877,80 @@ specialize qw/vpx_sad4x4_avg neon msa sse2 mmi/; # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally # # Blocks of 3 -add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; specialize qw/vpx_sad16x16x3 sse3 ssse3 msa mmi/; -add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; specialize qw/vpx_sad16x8x3 sse3 ssse3 msa mmi/; -add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; specialize qw/vpx_sad8x16x3 sse3 msa mmi/; -add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; specialize qw/vpx_sad8x8x3 sse3 msa mmi/; -add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[3]"; specialize qw/vpx_sad4x4x3 sse3 msa mmi/; # Blocks of 8 -add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; specialize qw/vpx_sad32x32x8 avx2/; -add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; specialize qw/vpx_sad16x16x8 sse4_1 msa mmi/; -add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; specialize qw/vpx_sad16x8x8 sse4_1 msa mmi/; -add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; specialize qw/vpx_sad8x16x8 sse4_1 msa mmi/; -add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; specialize qw/vpx_sad8x8x8 sse4_1 msa mmi/; -add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t sad_array[8]"; specialize qw/vpx_sad4x4x8 sse4_1 msa mmi/; # # Multi-block SAD, comparing a reference to N independent blocks # -add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/; -add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x8x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/; -add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array"; +add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/; add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size"; @@ -1064,43 +1064,43 @@ if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") { # # Multi-block SAD, comparing a reference to N independent blocks # - add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad64x64x4d sse2/; - add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad64x32x4d sse2/; - add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x64x4d sse2/; - add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x32x4d sse2/; - add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad32x16x4d sse2/; - add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x32x4d sse2/; - add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x16x4d sse2/; - add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad16x8x4d sse2/; - add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x16x4d sse2/; - add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x8x4d sse2/; - add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad8x4x4d sse2/; - add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad4x8x4d sse2/; - add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array"; + add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]"; specialize qw/vpx_highbd_sad4x4x4d sse2/; # diff --git a/vpx_dsp/x86/sad4d_avx2.c b/vpx_dsp/x86/sad4d_avx2.c index 9dd066691..5f1f757e2 100644 --- a/vpx_dsp/x86/sad4d_avx2.c +++ b/vpx_dsp/x86/sad4d_avx2.c @@ -11,8 +11,7 @@ #include "./vpx_dsp_rtcd.h" #include "vpx/vpx_integer.h" -static INLINE void calc_final_4(const __m256i *const sums /*[4]*/, - uint32_t *sad_array) { +static INLINE void calc_final_4(const __m256i sums[4], uint32_t sad_array[4]) { const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]); const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]); const __m256i t2 = _mm256_hadd_epi32(t0, t1); @@ -22,8 +21,8 @@ static INLINE void calc_final_4(const __m256i *const sums /*[4]*/, } void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[/*4*/], int ref_stride, - uint32_t *sad_array /*[4]*/) { + const uint8_t *const ref_array[4], int ref_stride, + uint32_t sad_array[4]) { int i; const uint8_t *refs[4]; __m256i sums[4]; @@ -71,7 +70,7 @@ void vpx_sad32x32x4d_avx2(const uint8_t *src_ptr, int src_stride, void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, - uint32_t *sad_array) { + uint32_t sad_array[8]) { int i; __m256i sums[8]; @@ -127,8 +126,8 @@ void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride, } void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[/*4*/], int ref_stride, - uint32_t *sad_array /*[4]*/) { + const uint8_t *const ref_array[4], int ref_stride, + uint32_t sad_array[4]) { __m256i sums[4]; int i; const uint8_t *refs[4]; diff --git a/vpx_dsp/x86/sad4d_avx512.c b/vpx_dsp/x86/sad4d_avx512.c index 2fa910871..cfd23fedd 100644 --- a/vpx_dsp/x86/sad4d_avx512.c +++ b/vpx_dsp/x86/sad4d_avx512.c @@ -12,8 +12,8 @@ #include "vpx/vpx_integer.h" void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, - const uint8_t *const ref_array[/*4*/], - int ref_stride, uint32_t *res /*[4]*/) { + const uint8_t *const ref_array[4], int ref_stride, + uint32_t sad_array[4]) { __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg; __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3; __m512i sum_mlow, sum_mhigh; @@ -78,6 +78,6 @@ void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride, sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256), _mm256_extractf128_si256(sum256, 1)); - _mm_storeu_si128((__m128i *)(res), sum128); + _mm_storeu_si128((__m128i *)(sad_array), sum128); } } |