diff options
author | Ronald S. Bultje <rbultje@google.com> | 2013-06-20 09:34:25 -0700 |
---|---|---|
committer | Ronald S. Bultje <rbultje@google.com> | 2013-06-20 09:34:25 -0700 |
commit | 8fb6c58191251792765c2910af3f9d6da22d6c11 (patch) | |
tree | 658ce312142ee7b7d3dd092beaab84e2cd301476 /vp9/encoder/x86/vp9_variance_mmx.c | |
parent | 3656835771ad338ed22cebb19311274e90efc768 (diff) | |
download | libvpx-8fb6c58191251792765c2910af3f9d6da22d6c11.tar libvpx-8fb6c58191251792765c2910af3f9d6da22d6c11.tar.gz libvpx-8fb6c58191251792765c2910af3f9d6da22d6c11.tar.bz2 libvpx-8fb6c58191251792765c2910af3f9d6da22d6c11.zip |
Implement sse2 and ssse3 versions for all sub_pixel_variance sizes.
Overall speedup is around 5% (bus @ 1500kbps, first 50 frames: 4min10 ->
3min58). Per-function timings are listed below, compared to the
original assembly-optimized versions (or just the new version's timing
where no assembly-optimized version was previously available):
sse2 4x4: 99 -> 82 cycles
sse2 4x8: 128 cycles
sse2 8x4: 121 cycles
sse2 8x8: 149 -> 129 cycles
sse2 8x16: 235 -> 245 cycles (?)
sse2 16x8: 269 -> 203 cycles
sse2 16x16: 441 -> 349 cycles
sse2 16x32: 641 cycles
sse2 32x16: 643 cycles
sse2 32x32: 1733 -> 1154 cycles
sse2 32x64: 2247 cycles
sse2 64x32: 2323 cycles
sse2 64x64: 6984 -> 4442 cycles
ssse3 4x4: 100 cycles (?)
ssse3 4x8: 103 cycles
ssse3 8x4: 71 cycles
ssse3 8x8: 147 cycles
ssse3 8x16: 158 cycles
ssse3 16x8: 188 -> 162 cycles
ssse3 16x16: 316 -> 273 cycles
ssse3 16x32: 535 cycles
ssse3 32x16: 564 cycles
ssse3 32x32: 973 cycles
ssse3 32x64: 1930 cycles
ssse3 64x32: 1922 cycles
ssse3 64x64: 3760 cycles
Change-Id: I81ff6fe51daf35a40d19785167004664d7e0c59d
Diffstat (limited to 'vp9/encoder/x86/vp9_variance_mmx.c')
-rw-r--r-- | vp9/encoder/x86/vp9_variance_mmx.c | 235 |
1 file changed, 0 insertions, 235 deletions
diff --git a/vp9/encoder/x86/vp9_variance_mmx.c b/vp9/encoder/x86/vp9_variance_mmx.c index bad1cfa74..d1415606e 100644 --- a/vp9/encoder/x86/vp9_variance_mmx.c +++ b/vp9/encoder/x86/vp9_variance_mmx.c @@ -13,27 +13,6 @@ #include "vp9/common/vp9_pragmas.h" #include "vpx_ports/mem.h" -extern void filter_block1d_h6_mmx -( - const unsigned char *src_ptr, - unsigned short *output_ptr, - unsigned int src_pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - short *vp7_filter -); -extern void filter_block1d_v6_mmx -( - const short *src_ptr, - unsigned char *output_ptr, - unsigned int pixels_per_line, - unsigned int pixel_step, - unsigned int output_height, - unsigned int output_width, - short *vp7_filter -); - extern unsigned int vp9_get_mb_ss_mmx(const short *src_ptr); extern unsigned int vp9_get8x8var_mmx ( @@ -53,30 +32,6 @@ extern unsigned int vp9_get4x4var_mmx unsigned int *SSE, int *Sum ); -extern void vp9_filter_block2d_bil4x4_var_mmx -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - const short *HFilter, - const short *VFilter, - int *sum, - unsigned int *sumsquared -); -extern void vp9_filter_block2d_bil_var_mmx -( - const unsigned char *ref_ptr, - int ref_pixels_per_line, - const unsigned char *src_ptr, - int src_pixels_per_line, - unsigned int Height, - const short *HFilter, - const short *VFilter, - int *sum, - unsigned int *sumsquared -); - unsigned int vp9_variance4x4_mmx( const unsigned char *src_ptr, @@ -190,193 +145,3 @@ unsigned int vp9_variance8x16_mmx( return (var - (((unsigned int)avg * avg) >> 7)); } - -DECLARE_ALIGNED(16, extern const short, vp9_bilinear_filters_mmx[16][8]); - -unsigned int vp9_sub_pixel_variance4x4_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse) - -{ - int xsum; - unsigned int xxsum; 
- vp9_filter_block2d_bil4x4_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 4)); -} - - -unsigned int vp9_sub_pixel_variance8x8_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - - int xsum; - unsigned int xxsum; - vp9_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 6)); -} - -unsigned int vp9_sub_pixel_variance16x16_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - vp9_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum0, &xxsum0 - ); - - vp9_filter_block2d_bil_var_mmx( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 16, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum1, &xxsum1 - ); - - xsum0 += xsum1; - xxsum0 += xxsum1; - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 8)); - - -} - -unsigned int vp9_sub_pixel_mse16x16_mmx( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - vp9_sub_pixel_variance16x16_mmx(src_ptr, src_pixels_per_line, xoffset, yoffset, dst_ptr, dst_pixels_per_line, sse); - return *sse; -} - -unsigned int 
vp9_sub_pixel_variance16x8_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum0, xsum1; - unsigned int xxsum0, xxsum1; - - - vp9_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 8, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum0, &xxsum0 - ); - - - vp9_filter_block2d_bil_var_mmx( - src_ptr + 8, src_pixels_per_line, - dst_ptr + 8, dst_pixels_per_line, 8, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum1, &xxsum1 - ); - - xsum0 += xsum1; - xxsum0 += xxsum1; - - *sse = xxsum0; - return (xxsum0 - (((unsigned int)xsum0 * xsum0) >> 7)); -} - -unsigned int vp9_sub_pixel_variance8x16_mmx -( - const unsigned char *src_ptr, - int src_pixels_per_line, - int xoffset, - int yoffset, - const unsigned char *dst_ptr, - int dst_pixels_per_line, - unsigned int *sse -) { - int xsum; - unsigned int xxsum; - vp9_filter_block2d_bil_var_mmx( - src_ptr, src_pixels_per_line, - dst_ptr, dst_pixels_per_line, 16, - vp9_bilinear_filters_mmx[xoffset], vp9_bilinear_filters_mmx[yoffset], - &xsum, &xxsum - ); - *sse = xxsum; - return (xxsum - (((unsigned int)xsum * xsum) >> 7)); -} - - -unsigned int vp9_variance_halfpixvar16x16_h_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 0, - ref_ptr, recon_stride, sse); -} - - -unsigned int vp9_variance_halfpixvar16x16_v_mmx( - const unsigned char *src_ptr, - int source_stride, - const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 0, 8, - ref_ptr, recon_stride, sse); -} - - -unsigned int vp9_variance_halfpixvar16x16_hv_mmx( - const unsigned char *src_ptr, - int source_stride, - 
const unsigned char *ref_ptr, - int recon_stride, - unsigned int *sse) { - return vp9_sub_pixel_variance16x16_mmx(src_ptr, source_stride, 8, 8, - ref_ptr, recon_stride, sse); -} |