From 8fb6c58191251792765c2910af3f9d6da22d6c11 Mon Sep 17 00:00:00 2001 From: "Ronald S. Bultje" Date: Thu, 20 Jun 2013 09:34:25 -0700 Subject: Implement sse2 and ssse3 versions for all sub_pixel_variance sizes. Overall speedup around 5% (bus @ 1500kbps first 50 frames 4min10 -> 3min58). Specific changes to timings for each function compared to original assembly-optimized versions (or just new version timings if no previous assembly-optimized version was available): sse2 4x4: 99 -> 82 cycles sse2 4x8: 128 cycles sse2 8x4: 121 cycles sse2 8x8: 149 -> 129 cycles sse2 8x16: 235 -> 245 cycles (?) sse2 16x8: 269 -> 203 cycles sse2 16x16: 441 -> 349 cycles sse2 16x32: 641 cycles sse2 32x16: 643 cycles sse2 32x32: 1733 -> 1154 cycles sse2 32x64: 2247 cycles sse2 64x32: 2323 cycles sse2 64x64: 6984 -> 4442 cycles ssse3 4x4: 100 cycles (?) ssse3 4x8: 103 cycles ssse3 8x4: 71 cycles ssse3 8x8: 147 cycles ssse3 8x16: 158 cycles ssse3 16x8: 188 -> 162 cycles ssse3 16x16: 316 -> 273 cycles ssse3 16x32: 535 cycles ssse3 32x16: 564 cycles ssse3 32x32: 973 cycles ssse3 32x64: 1930 cycles ssse3 64x32: 1922 cycles ssse3 64x64: 3760 cycles Change-Id: I81ff6fe51daf35a40d19785167004664d7e0c59d --- vp9/common/vp9_rtcd_defs.sh | 42 +++++++++++++++++++----------------------- 1 file changed, 19 insertions(+), 23 deletions(-) (limited to 'vp9/common') diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index a405aab8d..575b619a1 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -266,85 +266,81 @@ prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_strid specialize vp9_variance4x4 mmx sse2 prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x64 sse2 +specialize vp9_sub_pixel_variance64x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance64x64 prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x64 +specialize vp9_sub_pixel_variance32x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance32x64 prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x32 +specialize vp9_sub_pixel_variance64x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance64x32 prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x16 +specialize vp9_sub_pixel_variance32x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance32x16 prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x32 +specialize vp9_sub_pixel_variance16x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance16x32 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x32 sse2 +specialize vp9_sub_pixel_variance32x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance32x32 prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3 +specialize vp9_sub_pixel_variance16x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance16x16 prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x16 sse2 mmx -vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt +specialize vp9_sub_pixel_variance8x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance8x16 prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3 -vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3; -vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt +specialize vp9_sub_pixel_variance16x8 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance16x8 prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x8 sse2 mmx -vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt +specialize vp9_sub_pixel_variance8x8 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance8x8 # TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x4 +specialize vp9_sub_pixel_variance8x4 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance8x4 prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance4x8 +specialize vp9_sub_pixel_variance4x8 sse ssse3 prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance4x8 prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance4x4 sse2 mmx -vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt +specialize vp9_sub_pixel_variance4x4 sse ssse3 +#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance4x4 @@ -390,15 +386,15 @@ prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, co specialize vp9_sad4x4 mmx sse prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_h mmx sse2 +specialize vp9_variance_halfpixvar16x16_h sse2 vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_v mmx sse2 +specialize vp9_variance_halfpixvar16x16_v sse2 vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_hv mmx sse2 +specialize vp9_variance_halfpixvar16x16_hv sse2 vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" @@ -507,8 +503,8 @@ specialize vp9_sad4x8x4d sse prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" specialize vp9_sad4x4x4d sse -prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse" -specialize vp9_sub_pixel_mse16x16 sse2 mmx +#prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse" +#specialize vp9_sub_pixel_mse16x16 sse2 mmx prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" specialize vp9_mse16x16 mmx sse2 -- cgit v1.2.3