From 5c937db029820fe7a44e55d5dc7fc3c884ddb15b Mon Sep 17 00:00:00 2001
From: Dmitry Kovalev
Date: Wed, 28 May 2014 11:44:45 -0700
Subject: Cleaning up vp9_variance_avx2.c.

Change-Id: I75eb47dd21f87015efd673dbd2aa71f4386afdf5
---
 vp9/encoder/x86/vp9_variance_avx2.c | 295 +++++++++++++-----------------------
 1 file changed, 109 insertions(+), 186 deletions(-)

diff --git a/vp9/encoder/x86/vp9_variance_avx2.c b/vp9/encoder/x86/vp9_variance_avx2.c
index 7f81f46b8..ea09b959e 100644
--- a/vp9/encoder/x86/vp9_variance_avx2.c
+++ b/vp9/encoder/x86/vp9_variance_avx2.c
@@ -12,67 +12,39 @@
 #include "vp9/encoder/vp9_variance.h"
 #include "vpx_ports/mem.h"
 
-typedef void (*get_var_avx2) (
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-
-void vp9_get16x16var_avx2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-
-void vp9_get32x32var_avx2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *SSE,
-  int *Sum
-);
-
-unsigned int vp9_sub_pixel_variance32xh_avx2
-(
-  const uint8_t *src,
-  int src_stride,
-  int x_offset,
-  int y_offset,
-  const uint8_t *dst,
-  int dst_stride,
-  int height,
-  unsigned int *sse
-);
-
-unsigned int vp9_sub_pixel_avg_variance32xh_avx2
-(
-  const uint8_t *src,
-  int src_stride,
-  int x_offset,
-  int y_offset,
-  const uint8_t *dst,
-  int dst_stride,
-  const uint8_t *sec,
-  int sec_stride,
-  int height,
-  unsigned int *sseptr
-);
-
-static void variance_avx2(const unsigned char *src_ptr, int source_stride,
-                          const unsigned char *ref_ptr, int recon_stride,
-                          int w, int h, unsigned int *sse, int *sum,
-                          get_var_avx2 var_fn, int block_size) {
-  unsigned int sse0;
-  int sum0;
+typedef void (*get_var_avx2)(const uint8_t *src, int src_stride,
+                             const uint8_t *ref, int ref_stride,
+                             unsigned int *sse, int *sum);
+
+void vp9_get16x16var_avx2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
+                          unsigned int *sse, int *sum);
+
+void vp9_get32x32var_avx2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
+                          unsigned int *sse, int *sum);
+
+unsigned int vp9_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
+                                             int x_offset, int y_offset,
+                                             const uint8_t *dst, int dst_stride,
+                                             int height,
+                                             unsigned int *sse);
+
+unsigned int vp9_sub_pixel_avg_variance32xh_avx2(const uint8_t *src,
+                                                 int src_stride,
+                                                 int x_offset,
+                                                 int y_offset,
+                                                 const uint8_t *dst,
+                                                 int dst_stride,
+                                                 const uint8_t *sec,
+                                                 int sec_stride,
+                                                 int height,
+                                                 unsigned int *sseptr);
+
+static void variance_avx2(const uint8_t *src, int src_stride,
+                          const uint8_t *ref, int ref_stride,
+                          int w, int h, unsigned int *sse, int *sum,
+                          get_var_avx2 var_fn, int block_size) {
   int i, j;
 
   *sse = 0;
@@ -80,105 +52,68 @@ static void variance_avx2(const unsigned char *src_ptr, int source_stride,
 
   for (i = 0; i < h; i += 16) {
     for (j = 0; j < w; j += block_size) {
-      // processing 16 rows horizontally each call
-      var_fn(src_ptr + source_stride * i + j, source_stride,
-             ref_ptr + recon_stride * i + j, recon_stride, &sse0, &sum0);
+      unsigned int sse0;
+      int sum0;
+      var_fn(&src[src_stride * i + j], src_stride,
+             &ref[ref_stride * i + j], ref_stride, &sse0, &sum0);
       *sse += sse0;
       *sum += sum0;
     }
   }
 }
 
-unsigned int vp9_variance16x16_avx2
-(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *sse) {
-  unsigned int var;
-  int avg;
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 16, 16,
-                &var, &avg, vp9_get16x16var_avx2, 16);
-  *sse = var;
-  return (var - (((unsigned int)avg * avg) >> 8));
+unsigned int vp9_variance16x16_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
+                                    unsigned int *sse) {
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 16, 16,
+                sse, &sum, vp9_get16x16var_avx2, 16);
+  return *sse - (((unsigned int)sum * sum) >> 8);
 }
 
-unsigned int vp9_mse16x16_avx2(
-  const unsigned char *src_ptr,
-  int source_stride,
-  const unsigned char *ref_ptr,
-  int recon_stride,
-  unsigned int *sse) {
-  unsigned int sse0;
-  int sum0;
-  vp9_get16x16var_avx2(src_ptr, source_stride, ref_ptr, recon_stride, &sse0,
-                       &sum0);
-  *sse = sse0;
-  return sse0;
+unsigned int vp9_mse16x16_avx2(const uint8_t *src, int src_stride,
+                               const uint8_t *ref, int ref_stride,
+                               unsigned int *sse) {
+  int sum;
+  vp9_get16x16var_avx2(src, src_stride, ref, ref_stride, sse, &sum);
+  return *sse;
 }
 
-unsigned int vp9_variance32x32_avx2(const uint8_t *src_ptr,
-                                    int source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int recon_stride,
+unsigned int vp9_variance32x16_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
                                     unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  // processing 32 elements vertically in parallel
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 32,
-                &var, &avg, vp9_get32x32var_avx2, 32);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 10));
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 32, 16,
+                sse, &sum, vp9_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 9);
 }
 
-unsigned int vp9_variance32x16_avx2(const uint8_t *src_ptr,
-                                    int source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int recon_stride,
+unsigned int vp9_variance32x32_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
                                     unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  // processing 32 elements vertically in parallel
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 32, 16,
-                &var, &avg, vp9_get32x32var_avx2, 32);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 9));
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 32, 32,
+                sse, &sum, vp9_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 10);
 }
 
-
-unsigned int vp9_variance64x64_avx2(const uint8_t *src_ptr,
-                                    int source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int recon_stride,
+unsigned int vp9_variance64x64_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
                                     unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  // processing 32 elements vertically in parallel
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 64,
-                &var, &avg, vp9_get32x32var_avx2, 32);
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 12));
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 64, 64,
+                sse, &sum, vp9_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 12);
 }
 
-unsigned int vp9_variance64x32_avx2(const uint8_t *src_ptr,
-                                    int source_stride,
-                                    const uint8_t *ref_ptr,
-                                    int recon_stride,
+unsigned int vp9_variance64x32_avx2(const uint8_t *src, int src_stride,
+                                    const uint8_t *ref, int ref_stride,
                                     unsigned int *sse) {
-  unsigned int var;
-  int avg;
-
-  // processing 32 elements vertically in parallel
-  variance_avx2(src_ptr, source_stride, ref_ptr, recon_stride, 64, 32,
-                &var, &avg, vp9_get32x32var_avx2, 32);
-
-  *sse = var;
-  return (var - (((int64_t)avg * avg) >> 11));
+  int sum;
+  variance_avx2(src, src_stride, ref, ref_stride, 64, 32,
+                sse, &sum, vp9_get32x32var_avx2, 32);
+  return *sse - (((int64_t)sum * sum) >> 11);
 }
 
 unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
@@ -187,22 +122,19 @@ unsigned int vp9_sub_pixel_variance64x64_avx2(const uint8_t *src,
                                               int y_offset,
                                               const uint8_t *dst,
                                               int dst_stride,
-                                              unsigned int *sse_ptr) {
-  // processing 32 elements in parallel
-  unsigned int sse;
-  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
-                                           y_offset, dst, dst_stride,
-                                           64, &sse);
-  // processing the next 32 elements in parallel
+                                              unsigned int *sse) {
+  unsigned int sse1;
+  const int se1 = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+                                                  y_offset, dst, dst_stride,
+                                                  64, &sse1);
   unsigned int sse2;
-  int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
-                                            x_offset, y_offset,
-                                            dst + 32, dst_stride,
-                                            64, &sse2);
-  se += se2;
-  sse += sse2;
-  *sse_ptr = sse;
-  return sse - (((int64_t)se * se) >> 12);
+  const int se2 = vp9_sub_pixel_variance32xh_avx2(src + 32, src_stride,
+                                                  x_offset, y_offset,
+                                                  dst + 32, dst_stride,
+                                                  64, &sse2);
+  const int se = se1 + se2;
+  *sse = sse1 + sse2;
+  return *sse - (((int64_t)se * se) >> 12);
 }
 
 unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
@@ -211,14 +143,11 @@ unsigned int vp9_sub_pixel_variance32x32_avx2(const uint8_t *src,
                                               int y_offset,
                                               const uint8_t *dst,
                                               int dst_stride,
-                                              unsigned int *sse_ptr) {
-  // processing 32 element in parallel
-  unsigned int sse;
-  int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
-                                           y_offset, dst, dst_stride,
-                                           32, &sse);
-  *sse_ptr = sse;
-  return sse - (((int64_t)se * se) >> 10);
+                                              unsigned int *sse) {
+  const int se = vp9_sub_pixel_variance32xh_avx2(src, src_stride, x_offset,
+                                                 y_offset, dst, dst_stride,
+                                                 32, sse);
+  return *sse - (((int64_t)se * se) >> 10);
 }
 
 unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
@@ -227,24 +156,22 @@ unsigned int vp9_sub_pixel_avg_variance64x64_avx2(const uint8_t *src,
                                                   int y_offset,
                                                   const uint8_t *dst,
                                                   int dst_stride,
-                                                  unsigned int *sseptr,
+                                                  unsigned int *sse,
                                                   const uint8_t *sec) {
-  // processing 32 elements in parallel
-  unsigned int sse;
-
-  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
-                                               y_offset, dst, dst_stride,
-                                               sec, 64, 64, &sse);
+  unsigned int sse1;
+  const int se1 = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+                                                      y_offset, dst, dst_stride,
+                                                      sec, 64, 64, &sse1);
   unsigned int sse2;
-  // processing the next 32 elements in parallel
-  int se2 = vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
-                                                y_offset, dst + 32, dst_stride,
-                                                sec + 32, 64, 64, &sse2);
-  se += se2;
-  sse += sse2;
-  *sseptr = sse;
+  const int se2 =
+      vp9_sub_pixel_avg_variance32xh_avx2(src + 32, src_stride, x_offset,
+                                          y_offset, dst + 32, dst_stride,
+                                          sec + 32, 64, 64, &sse2);
+  const int se = se1 + se2;
 
-  return sse - (((int64_t)se * se) >> 12);
+  *sse = sse1 + sse2;
+
+  return *sse - (((int64_t)se * se) >> 12);
 }
 
 unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
@@ -253,15 +180,11 @@ unsigned int vp9_sub_pixel_avg_variance32x32_avx2(const uint8_t *src,
                                                   int y_offset,
                                                   const uint8_t *dst,
                                                   int dst_stride,
-                                                  unsigned int *sseptr,
+                                                  unsigned int *sse,
                                                   const uint8_t *sec) {
   // processing 32 element in parallel
-  unsigned int sse;
-  int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
-                                               y_offset, dst, dst_stride,
-                                               sec, 32, 32, &sse);
-  *sseptr = sse;
-  return sse - (((int64_t)se * se) >> 10);
+  const int se = vp9_sub_pixel_avg_variance32xh_avx2(src, src_stride, x_offset,
+                                                     y_offset, dst, dst_stride,
+                                                     sec, 32, 32, sse);
+  return *sse - (((int64_t)se * se) >> 10);
 }
-
-
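As background for the arithmetic in this patch: every variance function above computes the same integer identity, variance = SSE - Sum^2 / (w * h). Because w * h is a power of two, the division is folded into a right shift: 8 for 16x16, 9 for 32x16, 10 for 32x32, 11 for 64x32 and 12 for 64x64. The plain-C sketch below illustrates that identity; it is not code from the patch, and the name variance_ref is made up.

#include <stdint.h>

/* Illustrative scalar reference (hypothetical, not part of the patch):
 * variance = SSE - Sum^2 / (w * h) for a w x h block of 8-bit pixels. */
static unsigned int variance_ref(const uint8_t *src, int src_stride,
                                 const uint8_t *ref, int ref_stride,
                                 int w, int h, unsigned int *sse) {
  int64_t sum = 0;    /* sum of pixel differences (signed) */
  int64_t sse64 = 0;  /* sum of squared pixel differences */
  int i, j;
  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; ++j) {
      const int diff = src[j] - ref[j];
      sum += diff;
      sse64 += diff * diff;
    }
    src += src_stride;
    ref += ref_stride;
  }
  *sse = (unsigned int)sse64;
  /* w * h is a power of two, so this division is the shift in the patch. */
  return (unsigned int)(sse64 - (sum * sum) / (w * h));
}

The cast difference in the patch follows from overflow bounds: for 16x16, |Sum| is at most 255 * 256 = 65280, and 65280^2 still fits in 32 unsigned bits, so (unsigned int)sum * sum is safe; for 32x32 and larger blocks Sum^2 can exceed 2^32, hence the (int64_t) casts.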
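The 64-wide sub-pixel paths call vp9_sub_pixel_variance32xh_avx2 (or the avg variant) once per 32-column half and then merge the results. The merge has to happen on the raw totals, not on per-half variances, because the final shift normalizes by the whole block's pixel count. A sketch of that combination step for a 64x64 block (combine_halves_64x64 is a hypothetical name, not a function in the patch):

#include <stdint.h>

/* Hypothetical illustration of the merge in vp9_sub_pixel_variance64x64_avx2:
 * se1/sse1 and se2/sse2 are the Sum and SSE of the left and right 32x64
 * column halves. */
static unsigned int combine_halves_64x64(int se1, unsigned int sse1,
                                         int se2, unsigned int sse2,
                                         unsigned int *sse) {
  const int se = se1 + se2;  /* Sum over the full 64x64 block */
  *sse = sse1 + sse2;        /* SSE over the full 64x64 block */
  /* One shift over the totals: 64 * 64 = 4096 = 2^12. Summing the halves'
   * variances instead would normalize each Sum^2 by 2048 (one 32x64 half)
   * and give a different, incorrect result. */
  return *sse - (unsigned int)(((int64_t)se * se) >> 12);
}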