diff options
-rw-r--r-- | test/sad_test.cc | 3 | ||||
-rw-r--r-- | vpx_dsp/ppc/sad_vsx.c | 87 | ||||
-rw-r--r-- | vpx_dsp/ppc/types_vsx.h | 1 | ||||
-rw-r--r-- | vpx_dsp/vpx_dsp_rtcd_defs.pl | 6 |
4 files changed, 54 insertions, 43 deletions
diff --git a/test/sad_test.cc b/test/sad_test.cc index 0d6870d50..cef23e19f 100644 --- a/test/sad_test.cc +++ b/test/sad_test.cc @@ -1021,6 +1021,9 @@ const SadMxNParam vsx_tests[] = { SadMxNParam(16, 32, &vpx_sad16x32_vsx), SadMxNParam(16, 16, &vpx_sad16x16_vsx), SadMxNParam(16, 8, &vpx_sad16x8_vsx), + SadMxNParam(8, 16, &vpx_sad8x16_vsx), + SadMxNParam(8, 8, &vpx_sad8x8_vsx), + SadMxNParam(8, 4, &vpx_sad8x4_vsx), }; INSTANTIATE_TEST_CASE_P(VSX, SADTest, ::testing::ValuesIn(vsx_tests)); diff --git a/vpx_dsp/ppc/sad_vsx.c b/vpx_dsp/ppc/sad_vsx.c index bb49addae..18673f682 100644 --- a/vpx_dsp/ppc/sad_vsx.c +++ b/vpx_dsp/ppc/sad_vsx.c @@ -17,71 +17,75 @@ #include "vpx/vpx_integer.h" #include "vpx_ports/mem.h" -#define PROCESS16(offset) \ - v_a = vec_vsx_ld(offset, a); \ - v_b = vec_vsx_ld(offset, b); \ - v_ah = unpack_to_s16_h(v_a); \ - v_al = unpack_to_s16_l(v_a); \ - v_bh = unpack_to_s16_h(v_b); \ - v_bl = unpack_to_s16_l(v_b); \ - v_subh = vec_sub(v_ah, v_bh); \ - v_subl = vec_sub(v_al, v_bl); \ - v_absh = vec_abs(v_subh); \ - v_absl = vec_abs(v_subl); \ - v_sad = vec_sum4s(v_absh, v_sad); \ - v_sad = vec_sum4s(v_absl, v_sad); +#define PROCESS16(offset) \ + v_a = vec_vsx_ld(offset, a); \ + v_b = vec_vsx_ld(offset, b); \ + v_abs = vec_sub(vec_max(v_a, v_b), vec_min(v_a, v_b)); \ + v_sad = vec_sum4s(v_abs, v_sad); + +#define SAD8(height) \ + unsigned int vpx_sad8x##height##_vsx(const uint8_t *a, int a_stride, \ + const uint8_t *b, int b_stride) { \ + int y = 0; \ + uint8x16_t v_a, v_b, v_abs; \ + uint32x4_t v_sad = vec_zeros_u32; \ + \ + do { \ + PROCESS16(0) \ + \ + a += a_stride; \ + b += b_stride; \ + y++; \ + } while (y < height); \ + \ + return v_sad[1] + v_sad[0]; \ + } #define SAD16(height) \ unsigned int vpx_sad16x##height##_vsx(const uint8_t *a, int a_stride, \ const uint8_t *b, int b_stride) { \ - int y; \ - unsigned int sad[4]; \ - uint8x16_t v_a, v_b; \ - int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ - int32x4_t v_sad = vec_splat_s32(0); \ + int y = 0; \ + uint8x16_t v_a, v_b, v_abs; \ + uint32x4_t v_sad = vec_zeros_u32; \ \ - for (y = 0; y < height; y++) { \ + do { \ PROCESS16(0); \ \ a += a_stride; \ b += b_stride; \ - } \ - vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + y++; \ + } while (y < height); \ \ - return sad[3] + sad[2] + sad[1] + sad[0]; \ + return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \ } #define SAD32(height) \ unsigned int vpx_sad32x##height##_vsx(const uint8_t *a, int a_stride, \ const uint8_t *b, int b_stride) { \ - int y; \ - unsigned int sad[4]; \ - uint8x16_t v_a, v_b; \ - int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ - int32x4_t v_sad = vec_splat_s32(0); \ + int y = 0; \ + uint8x16_t v_a, v_b, v_abs; \ + uint32x4_t v_sad = vec_zeros_u32; \ \ - for (y = 0; y < height; y++) { \ + do { \ PROCESS16(0); \ PROCESS16(16); \ \ a += a_stride; \ b += b_stride; \ - } \ - vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + y++; \ + } while (y < height); \ \ - return sad[3] + sad[2] + sad[1] + sad[0]; \ + return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \ } #define SAD64(height) \ unsigned int vpx_sad64x##height##_vsx(const uint8_t *a, int a_stride, \ const uint8_t *b, int b_stride) { \ - int y; \ - unsigned int sad[4]; \ - uint8x16_t v_a, v_b; \ - int16x8_t v_ah, v_al, v_bh, v_bl, v_absh, v_absl, v_subh, v_subl; \ - int32x4_t v_sad = vec_splat_s32(0); \ + int y = 0; \ + uint8x16_t v_a, v_b, v_abs; \ + uint32x4_t v_sad = vec_zeros_u32; \ \ - for (y = 0; y < height; y++) { \ + do { \ PROCESS16(0); \ PROCESS16(16); \ PROCESS16(32); \ @@ -89,12 +93,15 @@ \ a += a_stride; \ b += b_stride; \ - } \ - vec_vsx_st((uint32x4_t)v_sad, 0, sad); \ + y++; \ + } while (y < height); \ \ - return sad[3] + sad[2] + sad[1] + sad[0]; \ + return v_sad[3] + v_sad[2] + v_sad[1] + v_sad[0]; \ } +SAD8(4); +SAD8(8); +SAD8(16); SAD16(8); SAD16(16); SAD16(32); diff --git a/vpx_dsp/ppc/types_vsx.h b/vpx_dsp/ppc/types_vsx.h index 803d0377a..81c7b970a 100644 --- a/vpx_dsp/ppc/types_vsx.h +++ b/vpx_dsp/ppc/types_vsx.h @@ -82,6 +82,7 @@ static const int16x8_t vec_ones_s16 = { 1, 1, 1, 1, 1, 1, 1, 1 }; static const uint16x8_t vec_ones_u16 = { 1, 1, 1, 1, 1, 1, 1, 1 }; static const uint32x4_t vec_ones_u32 = { 1, 1, 1, 1 }; static const int32x4_t vec_zeros_s32 = { 0, 0, 0, 0 }; +static const uint32x4_t vec_zeros_u32 = { 0, 0, 0, 0 }; static const uint16x8_t vec_shift_sign_s16 = { 15, 15, 15, 15, 15, 15, 15, 15 }; static const uint32x4_t vec_shift_sign_s32 = { 31, 31, 31, 31 }; static const uint8x16_t vec_perm64 = { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl index f237e5503..9f3e268cc 100644 --- a/vpx_dsp/vpx_dsp_rtcd_defs.pl +++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl @@ -748,13 +748,13 @@ add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, specialize qw/vpx_sad16x8 neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad8x16 neon msa sse2 mmi/; +specialize qw/vpx_sad8x16 neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad8x8 neon msa sse2 mmi/; +specialize qw/vpx_sad8x8 neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; -specialize qw/vpx_sad8x4 neon msa sse2 mmi/; +specialize qw/vpx_sad8x4 neon msa sse2 vsx mmi/; add_proto qw/unsigned int vpx_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride"; specialize qw/vpx_sad4x8 neon msa sse2 mmi/; |