diff options
Diffstat (limited to 'vp8')
-rw-r--r-- | vp8/encoder/generic/csystemdependent.c | 4 | ||||
-rw-r--r-- | vp8/encoder/onyx_if.c | 12 | ||||
-rw-r--r-- | vp8/encoder/ssim.c | 468 | ||||
-rw-r--r-- | vp8/encoder/variance.h | 32 | ||||
-rw-r--r-- | vp8/encoder/x86/ssim_opt.asm | 215 | ||||
-rw-r--r-- | vp8/encoder/x86/x86_csystemdependent.c | 30 | ||||
-rw-r--r-- | vp8/vp8cx.mk | 1 |
7 files changed, 468 insertions, 294 deletions
diff --git a/vp8/encoder/generic/csystemdependent.c b/vp8/encoder/generic/csystemdependent.c index fc0580d55..81108fe96 100644 --- a/vp8/encoder/generic/csystemdependent.c +++ b/vp8/encoder/generic/csystemdependent.c @@ -103,6 +103,10 @@ void vp8_cmachine_specific_config(VP8_COMP *cpi) // Pure C: vp8_yv12_copy_partial_frame_ptr = vp8_yv12_copy_partial_frame; +#if CONFIG_PSNR + cpi->rtcd.variance.ssimpf_8x8 = ssim_parms_8x8_c; + cpi->rtcd.variance.ssimpf = ssim_parms_c; +#endif #if ARCH_X86 || ARCH_X86_64 vp8_arch_x86_encoder_init(cpi); diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c index f7d3fb469..25e914936 100644 --- a/vp8/encoder/onyx_if.c +++ b/vp8/encoder/onyx_if.c @@ -86,9 +86,11 @@ extern double vp8_calc_ssim YV12_BUFFER_CONFIG *source, YV12_BUFFER_CONFIG *dest, int lumamask, - double *weight + double *weight, + const vp8_variance_rtcd_vtable_t *rtcd ); + extern double vp8_calc_ssimg ( YV12_BUFFER_CONFIG *source, @@ -5155,8 +5157,12 @@ int vp8_get_compressed_data(VP8_PTR ptr, unsigned int *frame_flags, unsigned lon vp8_deblock(cm->frame_to_show, &cm->post_proc_buffer, cm->filter_level * 10 / 6, 1, 0, IF_RTCD(&cm->rtcd.postproc)); vp8_clear_system_state(); - frame_psnr2 = vp8_calc_psnr(cpi->Source, &cm->post_proc_buffer, &y2, &u2, &v2, &sq_error); - frame_ssim2 = vp8_calc_ssim(cpi->Source, &cm->post_proc_buffer, 1, &weight); + frame_psnr2 = vp8_calc_psnr(cpi->Source, + &cm->post_proc_buffer, &y2, &u2, &v2, &sq_error); + + frame_ssim2 = vp8_calc_ssim(cpi->Source, + &cm->post_proc_buffer, 1, &weight, + IF_RTCD(&cpi->rtcd.variance)); cpi->summed_quality += frame_ssim2 * weight; cpi->summed_weights += weight; diff --git a/vp8/encoder/ssim.c b/vp8/encoder/ssim.c index 4ebcba1a1..d6aa9566b 100644 --- a/vp8/encoder/ssim.c +++ b/vp8/encoder/ssim.c @@ -11,298 +11,13 @@ #include "vpx_scale/yv12config.h" #include "math.h" +#include "onyx_int.h" -#define C1 (float)(64 * 64 * 0.01*255*0.01*255) -#define C2 (float)(64 * 64 * 0.03*255*0.03*255) - -static int 
width_y; -static int height_y; -static int height_uv; -static int width_uv; -static int stride_uv; -static int stride; -static int lumimask; -static int luminance; -static double plane_summed_weights = 0; - -static short img12_sum_block[8*4096*4096*2] ; - -static short img1_sum[8*4096*2]; -static short img2_sum[8*4096*2]; -static int img1_sq_sum[8*4096*2]; -static int img2_sq_sum[8*4096*2]; -static int img12_mul_sum[8*4096*2]; - - -double vp8_similarity -( - int mu_x, - int mu_y, - int pre_mu_x2, - int pre_mu_y2, - int pre_mu_xy2 -) -{ - int mu_x2, mu_y2, mu_xy, theta_x2, theta_y2, theta_xy; - - mu_x2 = mu_x * mu_x; - mu_y2 = mu_y * mu_y; - mu_xy = mu_x * mu_y; - - theta_x2 = 64 * pre_mu_x2 - mu_x2; - theta_y2 = 64 * pre_mu_y2 - mu_y2; - theta_xy = 64 * pre_mu_xy2 - mu_xy; - - return (2 * mu_xy + C1) * (2 * theta_xy + C2) / ((mu_x2 + mu_y2 + C1) * (theta_x2 + theta_y2 + C2)); -} - -double vp8_ssim -( - const unsigned char *img1, - const unsigned char *img2, - int stride_img1, - int stride_img2, - int width, - int height -) -{ - int x, y, x2, y2, img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block, temp; - - double plane_quality, weight, mean; - - short *img1_sum_ptr1, *img1_sum_ptr2; - short *img2_sum_ptr1, *img2_sum_ptr2; - int *img1_sq_sum_ptr1, *img1_sq_sum_ptr2; - int *img2_sq_sum_ptr1, *img2_sq_sum_ptr2; - int *img12_mul_sum_ptr1, *img12_mul_sum_ptr2; - - plane_quality = 0; - - if (lumimask) - plane_summed_weights = 0.0f; - else - plane_summed_weights = (height - 7) * (width - 7); - - //some prologue for the main loop - temp = 8 * width; - - img1_sum_ptr1 = img1_sum + temp; - img2_sum_ptr1 = img2_sum + temp; - img1_sq_sum_ptr1 = img1_sq_sum + temp; - img2_sq_sum_ptr1 = img2_sq_sum + temp; - img12_mul_sum_ptr1 = img12_mul_sum + temp; - - for (x = 0; x < width; x++) - { - img1_sum[x] = img1[x]; - img2_sum[x] = img2[x]; - img1_sq_sum[x] = img1[x] * img1[x]; - img2_sq_sum[x] = img2[x] * img2[x]; - img12_mul_sum[x] = img1[x] * img2[x]; - - 
img1_sum_ptr1[x] = 0; - img2_sum_ptr1[x] = 0; - img1_sq_sum_ptr1[x] = 0; - img2_sq_sum_ptr1[x] = 0; - img12_mul_sum_ptr1[x] = 0; - } - - //the main loop - for (y = 1; y < height; y++) - { - img1 += stride_img1; - img2 += stride_img2; - - temp = (y - 1) % 9 * width; - - img1_sum_ptr1 = img1_sum + temp; - img2_sum_ptr1 = img2_sum + temp; - img1_sq_sum_ptr1 = img1_sq_sum + temp; - img2_sq_sum_ptr1 = img2_sq_sum + temp; - img12_mul_sum_ptr1 = img12_mul_sum + temp; - - temp = y % 9 * width; - - img1_sum_ptr2 = img1_sum + temp; - img2_sum_ptr2 = img2_sum + temp; - img1_sq_sum_ptr2 = img1_sq_sum + temp; - img2_sq_sum_ptr2 = img2_sq_sum + temp; - img12_mul_sum_ptr2 = img12_mul_sum + temp; - - for (x = 0; x < width; x++) - { - img1_sum_ptr2[x] = img1_sum_ptr1[x] + img1[x]; - img2_sum_ptr2[x] = img2_sum_ptr1[x] + img2[x]; - img1_sq_sum_ptr2[x] = img1_sq_sum_ptr1[x] + img1[x] * img1[x]; - img2_sq_sum_ptr2[x] = img2_sq_sum_ptr1[x] + img2[x] * img2[x]; - img12_mul_sum_ptr2[x] = img12_mul_sum_ptr1[x] + img1[x] * img2[x]; - } - - if (y > 6) - { - //calculate the sum of the last 8 lines by subtracting the total sum of 8 lines back from the present sum - temp = (y + 1) % 9 * width; - - img1_sum_ptr1 = img1_sum + temp; - img2_sum_ptr1 = img2_sum + temp; - img1_sq_sum_ptr1 = img1_sq_sum + temp; - img2_sq_sum_ptr1 = img2_sq_sum + temp; - img12_mul_sum_ptr1 = img12_mul_sum + temp; - - for (x = 0; x < width; x++) - { - img1_sum_ptr1[x] = img1_sum_ptr2[x] - img1_sum_ptr1[x]; - img2_sum_ptr1[x] = img2_sum_ptr2[x] - img2_sum_ptr1[x]; - img1_sq_sum_ptr1[x] = img1_sq_sum_ptr2[x] - img1_sq_sum_ptr1[x]; - img2_sq_sum_ptr1[x] = img2_sq_sum_ptr2[x] - img2_sq_sum_ptr1[x]; - img12_mul_sum_ptr1[x] = img12_mul_sum_ptr2[x] - img12_mul_sum_ptr1[x]; - } - - //here we calculate the sum over the 8x8 block of pixels - //this is done by sliding a window across the column sums for the last 8 lines - //each time adding the new column sum, and subtracting the one which fell out of the window - img1_block = 0; 
- img2_block = 0; - img1_sq_block = 0; - img2_sq_block = 0; - img12_mul_block = 0; - - //prologue, and calculation of simularity measure from the first 8 column sums - for (x = 0; x < 8; x++) - { - img1_block += img1_sum_ptr1[x]; - img2_block += img2_sum_ptr1[x]; - img1_sq_block += img1_sq_sum_ptr1[x]; - img2_sq_block += img2_sq_sum_ptr1[x]; - img12_mul_block += img12_mul_sum_ptr1[x]; - } - - if (lumimask) - { - y2 = y - 7; - x2 = 0; - - if (luminance) - { - mean = (img2_block + img1_block) / 128.0f; - - if (!(y2 % 2 || x2 % 2)) - *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block; - } - else - { - mean = *(img12_sum_block + y2 * width_uv + x2); - mean += *(img12_sum_block + y2 * width_uv + x2 + 4); - mean += *(img12_sum_block + (y2 + 4) * width_uv + x2); - mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4); - - mean /= 512.0f; - } - - weight = mean < 40 ? 0.0f : - (mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f); - plane_summed_weights += weight; - - plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block); - } - else - plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block); - - //and for the rest - for (x = 8; x < width; x++) - { - img1_block = img1_block + img1_sum_ptr1[x] - img1_sum_ptr1[x - 8]; - img2_block = img2_block + img2_sum_ptr1[x] - img2_sum_ptr1[x - 8]; - img1_sq_block = img1_sq_block + img1_sq_sum_ptr1[x] - img1_sq_sum_ptr1[x - 8]; - img2_sq_block = img2_sq_block + img2_sq_sum_ptr1[x] - img2_sq_sum_ptr1[x - 8]; - img12_mul_block = img12_mul_block + img12_mul_sum_ptr1[x] - img12_mul_sum_ptr1[x - 8]; - - if (lumimask) - { - y2 = y - 7; - x2 = x - 7; - - if (luminance) - { - mean = (img2_block + img1_block) / 128.0f; - - if (!(y2 % 2 || x2 % 2)) - *(img12_sum_block + y2 / 2 * width_uv + x2 / 2) = img2_block + img1_block; - } - else - { - mean = *(img12_sum_block + y2 * width_uv + x2); - mean += *(img12_sum_block + y2 * width_uv 
+ x2 + 4); - mean += *(img12_sum_block + (y2 + 4) * width_uv + x2); - mean += *(img12_sum_block + (y2 + 4) * width_uv + x2 + 4); - - mean /= 512.0f; - } - - weight = mean < 40 ? 0.0f : - (mean < 50 ? (mean - 40.0f) / 10.0f : 1.0f); - plane_summed_weights += weight; - - plane_quality += weight * vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block); - } - else - plane_quality += vp8_similarity(img1_block, img2_block, img1_sq_block, img2_sq_block, img12_mul_block); - } - } - } - - if (plane_summed_weights == 0) - return 1.0f; - else - return plane_quality / plane_summed_weights; -} - -double vp8_calc_ssim -( - YV12_BUFFER_CONFIG *source, - YV12_BUFFER_CONFIG *dest, - int lumamask, - double *weight -) -{ - double a, b, c; - double frame_weight; - double ssimv; - - width_y = source->y_width; - height_y = source->y_height; - height_uv = source->uv_height; - width_uv = source->uv_width; - stride_uv = dest->uv_stride; - stride = dest->y_stride; - - lumimask = lumamask; - - luminance = 1; - a = vp8_ssim(source->y_buffer, dest->y_buffer, - source->y_stride, dest->y_stride, source->y_width, source->y_height); - luminance = 0; - - frame_weight = plane_summed_weights / ((width_y - 7) * (height_y - 7)); - - if (frame_weight == 0) - a = b = c = 1.0f; - else - { - b = vp8_ssim(source->u_buffer, dest->u_buffer, - source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height); - - c = vp8_ssim(source->v_buffer, dest->v_buffer, - source->uv_stride, dest->uv_stride, source->uv_width, source->uv_height); - } - - ssimv = a * .8 + .1 * (b + c); - - *weight = frame_weight; - - return ssimv; -} - +#if CONFIG_RUNTIME_CPU_DETECT +#define IF_RTCD(x) (x) +#else +#define IF_RTCD(x) NULL +#endif // Google version of SSIM // SSIM #define KERNEL 3 @@ -520,3 +235,174 @@ double vp8_calc_ssimg *ssim_v /= uvsize; return ssim_all; } + + +void ssim_parms_c +( + unsigned char *s, + int sp, + unsigned char *r, + int rp, + unsigned long *sum_s, + unsigned long 
// SSIM stabilising constants, pre-scaled for a 16x16 (256 sample) window.
// dssim() uses them as-is; similarity() rescales them to its own window size.
static const long long c1 = 426148;  // 256^2 * (.01*255)^2
static const long long c2 = 3835331; // 256^2 * (.03*255)^2

// Computes the SSIM index of one window from its raw pixel sums.
//
//   sum_s, sum_r       sum of the source / reference samples
//   sum_sq_s, sum_sq_r sum of the squared source / reference samples
//   sum_sxr            sum of source * reference products
//   count              number of samples in the window (must be > 0);
//                      the callers in this file pass 256 (16x16) or 64 (8x8)
//
// Returns the ratio of the SSIM numerator to its denominator, both evaluated
// in the "sum" domain (every term multiplied through by count^2), so no
// per-window division is needed until the very end.
static double similarity
(
    unsigned long sum_s,
    unsigned long sum_r,
    unsigned long sum_sq_s,
    unsigned long sum_sq_r,
    unsigned long sum_sxr,
    int count
)
{
    // Widen everything up front: `unsigned long` is only 32 bits on ILP32 and
    // LLP64 targets, where 2*sum_s*sum_r and 2*count*sum_sxr would wrap before
    // being promoted. The covariance term can also be negative, so the
    // arithmetic has to be signed.
    const long long n    = count;
    const long long s    = (long long)sum_s;
    const long long r    = (long long)sum_r;
    const long long sq_s = (long long)sum_sq_s;
    const long long sq_r = (long long)sum_sq_r;
    const long long sxr  = (long long)sum_sxr;

    // c1/c2 carry a factor of 256^2; because the whole expression is scaled
    // by count^2, the constants must be too. For count == 256 this is the
    // identity, for count == 64 it yields 26634 and 239708.
    const long long k1 = c1 * n * n / (256 * 256);
    const long long k2 = c2 * n * n / (256 * 256);

    const long long ssim_n = (2 * s * r + k1) *
                             (2 * n * sxr - 2 * s * r + k2);

    const long long ssim_d = (s * s + r * r + k1) *
                             (n * sq_s - s * s + n * sq_r - r * r + k2);

    return ssim_n * 1.0 / ssim_d;
}
for distortion metric in mode selection code ( provided we do a reconstruction) +long dssim(unsigned char *s,int sp, unsigned char *r,int rp, + const vp8_variance_rtcd_vtable_t *rtcd) +{ + unsigned long sum_s=0,sum_r=0,sum_sq_s=0,sum_sq_r=0,sum_sxr=0; + double ssim3; + long long ssim_n; + long long ssim_d; + + rtcd->ssimpf(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r, &sum_sxr); + ssim_n = (2*sum_s*sum_r+ c1)*(2*256*sum_sxr-2*sum_s*sum_r+c2); + + ssim_d = (sum_s*sum_s +sum_r*sum_r+c1)* + (256*sum_sq_s-sum_s*sum_s + 256*sum_sq_r-sum_r*sum_r +c2) ; + + ssim3 = 256 * (ssim_d-ssim_n) / ssim_d; + return (long)( 256*ssim3 * ssim3 ); +} +// TODO: (jbb) this 8x8 window might be too big + we may want to pick pixels +// such that the window regions overlap block boundaries to penalize blocking +// artifacts. + +double vp8_ssim2 +( + unsigned char *img1, + unsigned char *img2, + int stride_img1, + int stride_img2, + int width, + int height, + const vp8_variance_rtcd_vtable_t *rtcd +) +{ + int i,j; + + double ssim_total=0; + + // we can sample points as frequently as we like start with 1 per 8x8 + for(i=0; i < height; i+=8, img1 += stride_img1*8, img2 += stride_img2*8) + { + for(j=0; j < width; j+=8 ) + { + ssim_total += ssim_8x8(img1, stride_img1, img2, stride_img2, rtcd); + } + } + ssim_total /= (width/8 * height /8); + return ssim_total; + +} +double vp8_calc_ssim +( + YV12_BUFFER_CONFIG *source, + YV12_BUFFER_CONFIG *dest, + int lumamask, + double *weight, + const vp8_variance_rtcd_vtable_t *rtcd +) +{ + double a, b, c; + double ssimv; +//IF_RTCD(&cpi->rtcd.variance) + a = vp8_ssim2(source->y_buffer, dest->y_buffer, + source->y_stride, dest->y_stride, source->y_width, + source->y_height, rtcd); + + b = vp8_ssim2(source->u_buffer, dest->u_buffer, + source->uv_stride, dest->uv_stride, source->uv_width, + source->uv_height, rtcd); + + c = vp8_ssim2(source->v_buffer, dest->v_buffer, + source->uv_stride, dest->uv_stride, source->uv_width, + source->uv_height, rtcd); + + 
ssimv = a * .8 + .1 * (b + c); + + *weight = 1; + + return ssimv; +} diff --git a/vp8/encoder/variance.h b/vp8/encoder/variance.h index 5befd3b86..bf17ea8b6 100644 --- a/vp8/encoder/variance.h +++ b/vp8/encoder/variance.h @@ -85,6 +85,19 @@ unsigned int *sse \ ); +#define prototype_ssimpf(sym) \ + void (sym) \ + ( \ + unsigned char *s, \ + int sp, \ + unsigned char *r, \ + int rp, \ + unsigned long *sum_s, \ + unsigned long *sum_r, \ + unsigned long *sum_sq_s, \ + unsigned long *sum_sq_r, \ + unsigned long *sum_sxr \ + ); #define prototype_getmbss(sym) unsigned int (sym)(const short *) @@ -306,6 +319,15 @@ extern prototype_variance2(vp8_variance_get16x16var); #endif extern prototype_sad(vp8_variance_get4x4sse_cs); +#ifndef vp8_ssimpf +#define vp8_ssimpf ssim_parms_c +#endif +extern prototype_ssimpf(vp8_ssimpf) + +#ifndef vp8_ssimpf_8x8 +#define vp8_ssimpf_8x8 ssim_parms_8x8_c +#endif +extern prototype_ssimpf(vp8_ssimpf_8x8) typedef prototype_sad(*vp8_sad_fn_t); typedef prototype_sad_multi_same_address(*vp8_sad_multi_fn_t); @@ -315,6 +337,10 @@ typedef prototype_variance(*vp8_variance_fn_t); typedef prototype_variance2(*vp8_variance2_fn_t); typedef prototype_subpixvariance(*vp8_subpixvariance_fn_t); typedef prototype_getmbss(*vp8_getmbss_fn_t); + +typedef prototype_ssimpf(*vp8_ssimpf_fn_t) + + typedef struct { vp8_sad_fn_t sad4x4; @@ -365,6 +391,11 @@ typedef struct vp8_sad_multi_d_fn_t sad8x8x4d; vp8_sad_multi_d_fn_t sad4x4x4d; +#if CONFIG_PSNR + vp8_ssimpf_fn_t ssimpf_8x8; + vp8_ssimpf_fn_t ssimpf; +#endif + } vp8_variance_rtcd_vtable_t; typedef struct @@ -378,6 +409,7 @@ typedef struct vp8_sad_multi_fn_t sdx3f; vp8_sad_multi1_fn_t sdx8f; vp8_sad_multi_d_fn_t sdx4df; + } vp8_variance_fn_ptr_t; #if CONFIG_RUNTIME_CPU_DETECT diff --git a/vp8/encoder/x86/ssim_opt.asm b/vp8/encoder/x86/ssim_opt.asm new file mode 100644 index 000000000..c267cdb54 --- /dev/null +++ b/vp8/encoder/x86/ssim_opt.asm @@ -0,0 +1,215 @@ +; +; Copyright (c) 2010 The WebM project authors. 
All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + +%include "vpx_ports/x86_abi_support.asm" + +; tabulate_ssim - sums sum_s,sum_r,sum_sq_s,sum_sq_r, sum_sxr +%macro TABULATE_SSIM 0 + paddusw xmm15, xmm3 ; sum_s + paddusw xmm14, xmm4 ; sum_r + movdqa xmm1, xmm3 + pmaddwd xmm1, xmm1 + paddq xmm13, xmm1 ; sum_sq_s + movdqa xmm2, xmm4 + pmaddwd xmm2, xmm2 + paddq xmm12, xmm2 ; sum_sq_r + pmaddwd xmm3, xmm4 + paddq xmm11, xmm3 ; sum_sxr +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_Q 1 + movdqa xmm2,%1 + punpckldq %1,xmm0 + punpckhdq xmm2,xmm0 + paddq %1,xmm2 + movdqa xmm2,%1 + punpcklqdq %1,xmm0 + punpckhqdq xmm2,xmm0 + paddq %1,xmm2 +%endmacro + +; Sum across the register %1 starting with q words +%macro SUM_ACROSS_W 1 + movdqa xmm1, %1 + punpcklwd %1,xmm0 + punpckhwd xmm1,xmm0 + paddd %1, xmm1 + SUM_ACROSS_Q %1 +%endmacro +;void ssim_parms_sse3( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. 
+global sym(vp8_ssim_parms_16x16_sse3) +sym(vp8_ssim_parms_16x16_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 16 ;row counter +NextRow: + + ;grab source and reference pixels + movdqu xmm5, [rsi] + movdqu xmm6, [rdi] + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpckhbw xmm3, xmm0 ; high_s + punpckhbw xmm4, xmm0 ; high_r + + TABULATE_SSIM + + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz NextRow + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movq [rdi], xmm15; + mov rdi,arg(5) + movq [rdi], xmm14; + mov rdi,arg(6) + movq [rdi], xmm13; + mov rdi,arg(7) + movq [rdi], xmm12; + mov rdi,arg(8) + movq [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret + +;void ssim_parms_sse3( +; unsigned char *s, +; int sp, +; unsigned char *r, +; int rp +; unsigned long *sum_s, +; unsigned long *sum_r, +; unsigned long *sum_sq_s, +; unsigned long *sum_sq_r, +; unsigned long *sum_sxr); +; +; TODO: Use parm passing through structure, probably don't need the pxors +; ( calling app will initialize to 0 ) could easily fit everything in sse2 +; without too much hastle, and can probably do better estimates with psadw +; or pavgb At this point this is just meant to be first pass for calculating +; all the parms needed for 16x16 ssim so we can play with dssim as distortion +; in mode selection code. 
+global sym(vp8_ssim_parms_8x8_sse3) +sym(vp8_ssim_parms_8x8_sse3): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 9 + push rsi + push rdi + ; end prolog + + mov rsi, arg(0) ;s + mov rcx, arg(1) ;sp + mov rdi, arg(2) ;r + mov rax, arg(3) ;rp + + pxor xmm0, xmm0 + pxor xmm15,xmm15 ;sum_s + pxor xmm14,xmm14 ;sum_r + pxor xmm13,xmm13 ;sum_sq_s + pxor xmm12,xmm12 ;sum_sq_r + pxor xmm11,xmm11 ;sum_sxr + + mov rdx, 8 ;row counter +NextRow2: + + ;grab source and reference pixels + movq xmm5, [rsi] + movq xmm6, [rdi] + + movdqa xmm3, xmm5 + movdqa xmm4, xmm6 + punpcklbw xmm3, xmm0 ; low_s + punpcklbw xmm4, xmm0 ; low_r + + TABULATE_SSIM + + add rsi, rcx ; next s row + add rdi, rax ; next r row + + dec rdx ; counter + jnz NextRow2 + + SUM_ACROSS_W xmm15 + SUM_ACROSS_W xmm14 + SUM_ACROSS_Q xmm13 + SUM_ACROSS_Q xmm12 + SUM_ACROSS_Q xmm11 + + mov rdi,arg(4) + movq [rdi], xmm15; + mov rdi,arg(5) + movq [rdi], xmm14; + mov rdi,arg(6) + movq [rdi], xmm13; + mov rdi,arg(7) + movq [rdi], xmm12; + mov rdi,arg(8) + movq [rdi], xmm11; + + ; begin epilog + pop rdi + pop rsi + UNSHADOW_ARGS + pop rbp + ret diff --git a/vp8/encoder/x86/x86_csystemdependent.c b/vp8/encoder/x86/x86_csystemdependent.c index 3158ac12b..5ab364147 100644 --- a/vp8/encoder/x86/x86_csystemdependent.c +++ b/vp8/encoder/x86/x86_csystemdependent.c @@ -176,6 +176,25 @@ void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) d->dqcoeff ); } +#if CONFIG_PSNR +#if ARCH_X86_64 +typedef void ssimpf +( + unsigned char *s, + int sp, + unsigned char *r, + int rp, + unsigned long *sum_s, + unsigned long *sum_r, + unsigned long *sum_sq_s, + unsigned long *sum_sq_r, + unsigned long *sum_sxr +); + +extern ssimpf vp8_ssim_parms_16x16_sse3; +extern ssimpf vp8_ssim_parms_8x8_sse3; +#endif +#endif #endif @@ -280,6 +299,8 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.variance.get16x16prederror = vp8_get16x16pred_error_sse2; cpi->rtcd.variance.get8x8var = vp8_get8x8var_sse2; cpi->rtcd.variance.get16x16var = 
vp8_get16x16var_sse2; + + /* cpi->rtcd.variance.get4x4sse_cs not implemented for wmt */; cpi->rtcd.fdct.short4x4 = vp8_short_fdct4x4_sse2; @@ -339,9 +360,18 @@ void vp8_arch_x86_encoder_init(VP8_COMP *cpi) cpi->rtcd.quantize.fastquantb = vp8_fast_quantize_b_ssse3; +#if CONFIG_PSNR +#if ARCH_X86_64 + cpi->rtcd.variance.ssimpf_8x8 = vp8_ssim_parms_8x8_sse3; + cpi->rtcd.variance.ssimpf = vp8_ssim_parms_16x16_sse3; +#endif +#endif + } #endif + + #if HAVE_SSE4_1 if (SSE4_1Enabled) { diff --git a/vp8/vp8cx.mk b/vp8/vp8cx.mk index c0ae250f5..670c02280 100644 --- a/vp8/vp8cx.mk +++ b/vp8/vp8cx.mk @@ -116,6 +116,7 @@ VP8_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/quantize_ssse3.asm VP8_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/sad_sse4.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/quantize_mmx.asm VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/encodeopt.asm +VP8_CX_SRCS-$(ARCH_X86)$(ARCH_X86_64) += encoder/x86/ssim_opt.asm ifeq ($(CONFIG_REALTIME_ONLY),yes) VP8_CX_SRCS_REMOVE-$(HAVE_SSE2) += encoder/x86/temporal_filter_apply_sse2.asm |