diff options
author | Ronald S. Bultje <rbultje@google.com> | 2013-06-20 09:34:25 -0700 |
---|---|---|
committer | Ronald S. Bultje <rbultje@google.com> | 2013-06-20 09:34:25 -0700 |
commit | 8fb6c58191251792765c2910af3f9d6da22d6c11 (patch) | |
tree | 658ce312142ee7b7d3dd092beaab84e2cd301476 /vp9/common | |
parent | 3656835771ad338ed22cebb19311274e90efc768 (diff) | |
download | libvpx-8fb6c58191251792765c2910af3f9d6da22d6c11.tar libvpx-8fb6c58191251792765c2910af3f9d6da22d6c11.tar.gz libvpx-8fb6c58191251792765c2910af3f9d6da22d6c11.tar.bz2 libvpx-8fb6c58191251792765c2910af3f9d6da22d6c11.zip |
Implement sse2 and ssse3 versions for all sub_pixel_variance sizes.
Overall speedup around 5% (bus @ 1500kbps first 50 frames 4min10 ->
3min58). Specific changes to timings for each function compared to
original assembly-optimized versions (or just new version timings if
no previous assembly-optimized version was available):
sse2 4x4: 99 -> 82 cycles
sse2 4x8: 128 cycles
sse2 8x4: 121 cycles
sse2 8x8: 149 -> 129 cycles
sse2 8x16: 235 -> 245 cycles (?)
sse2 16x8: 269 -> 203 cycles
sse2 16x16: 441 -> 349 cycles
sse2 16x32: 641 cycles
sse2 32x16: 643 cycles
sse2 32x32: 1733 -> 1154 cycles
sse2 32x64: 2247 cycles
sse2 64x32: 2323 cycles
sse2 64x64: 6984 -> 4442 cycles
ssse3 4x4: 100 cycles (?)
ssse3 4x8: 103 cycles
ssse3 8x4: 71 cycles
ssse3 8x8: 147 cycles
ssse3 8x16: 158 cycles
ssse3 16x8: 188 -> 162 cycles
ssse3 16x16: 316 -> 273 cycles
ssse3 16x32: 535 cycles
ssse3 32x16: 564 cycles
ssse3 32x32: 973 cycles
ssse3 32x64: 1930 cycles
ssse3 64x32: 1922 cycles
ssse3 64x64: 3760 cycles
Change-Id: I81ff6fe51daf35a40d19785167004664d7e0c59d
Diffstat (limited to 'vp9/common')
-rw-r--r-- | vp9/common/vp9_rtcd_defs.sh | 42 |
1 files changed, 19 insertions, 23 deletions
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index a405aab8d..575b619a1 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -266,85 +266,81 @@ prototype unsigned int vp9_variance4x4 "const uint8_t *src_ptr, int source_strid specialize vp9_variance4x4 mmx sse2 prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x64 sse2 +specialize vp9_sub_pixel_variance64x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance64x64 prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x64 +specialize vp9_sub_pixel_variance32x64 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance32x64 prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance64x32 +specialize vp9_sub_pixel_variance64x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance64x32 prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x16 +specialize vp9_sub_pixel_variance32x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance32x16 prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x32 +specialize vp9_sub_pixel_variance16x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance16x32 prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance32x32 sse2 +specialize vp9_sub_pixel_variance32x32 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance32x32 prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3 +specialize vp9_sub_pixel_variance16x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance16x16 prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x16 sse2 mmx -vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt +specialize vp9_sub_pixel_variance8x16 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance8x16 prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3 -vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3; -vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt +specialize vp9_sub_pixel_variance16x8 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance16x8 prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x8 sse2 mmx -vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt +specialize vp9_sub_pixel_variance8x8 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance8x8 # TODO(jingning): need to convert 8x4/4x8 functions into mmx/sse form prototype unsigned int vp9_sub_pixel_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance8x4 +specialize vp9_sub_pixel_variance8x4 sse2 ssse3 prototype unsigned int vp9_sub_pixel_avg_variance8x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance8x4 prototype unsigned int vp9_sub_pixel_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance4x8 +specialize vp9_sub_pixel_variance4x8 sse ssse3 prototype unsigned int vp9_sub_pixel_avg_variance4x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance4x8 prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_sub_pixel_variance4x4 sse2 mmx -vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt +specialize vp9_sub_pixel_variance4x4 sse ssse3 +#vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" specialize vp9_sub_pixel_avg_variance4x4 @@ -390,15 +386,15 @@ prototype unsigned int vp9_sad4x4 "const uint8_t *src_ptr, int source_stride, co specialize vp9_sad4x4 mmx sse prototype unsigned int vp9_variance_halfpixvar16x16_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_h mmx sse2 +specialize vp9_variance_halfpixvar16x16_h sse2 vp9_variance_halfpixvar16x16_h_sse2=vp9_variance_halfpixvar16x16_h_wmt prototype unsigned int vp9_variance_halfpixvar16x16_v "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_v mmx sse2 +specialize vp9_variance_halfpixvar16x16_v sse2 vp9_variance_halfpixvar16x16_v_sse2=vp9_variance_halfpixvar16x16_v_wmt prototype unsigned int vp9_variance_halfpixvar16x16_hv "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" -specialize vp9_variance_halfpixvar16x16_hv mmx sse2 +specialize vp9_variance_halfpixvar16x16_hv sse2 vp9_variance_halfpixvar16x16_hv_sse2=vp9_variance_halfpixvar16x16_hv_wmt prototype unsigned int vp9_variance_halfpixvar64x64_h "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" @@ -507,8 +503,8 @@ specialize vp9_sad4x8x4d sse prototype void vp9_sad4x4x4d "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array" specialize vp9_sad4x4x4d sse -prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse" -specialize vp9_sub_pixel_mse16x16 sse2 mmx +#prototype unsigned int vp9_sub_pixel_mse16x16 "const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, const uint8_t *dst_ptr, int dst_pixels_per_line, unsigned int *sse" +#specialize vp9_sub_pixel_mse16x16 sse2 mmx prototype unsigned int vp9_mse16x16 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int recon_stride, unsigned int *sse" specialize vp9_mse16x16 mmx sse2 |