diff options
Diffstat (limited to 'vp8/common')
-rw-r--r-- | vp8/common/alloccommon.c | 18 | ||||
-rw-r--r-- | vp8/common/blockd.h | 13 | ||||
-rw-r--r-- | vp8/common/entropy.c | 6 | ||||
-rw-r--r-- | vp8/common/entropy.h | 4 | ||||
-rw-r--r-- | vp8/common/generic/systemdependent.c | 10 | ||||
-rw-r--r-- | vp8/common/idctllm.c | 33 | ||||
-rw-r--r-- | vp8/common/maskingmv.c | 855 | ||||
-rw-r--r-- | vp8/common/onyxc_int.h | 1 | ||||
-rw-r--r-- | vp8/common/quant_common.c | 45 | ||||
-rw-r--r-- | vp8/common/reconintra4x4.c | 16 | ||||
-rw-r--r-- | vp8/common/x86/mask_sse3.asm | 484 |
11 files changed, 1449 insertions, 36 deletions
diff --git a/vp8/common/alloccommon.c b/vp8/common/alloccommon.c index 9dce8c8f6..5ab8e29ab 100644 --- a/vp8/common/alloccommon.c +++ b/vp8/common/alloccommon.c @@ -126,7 +126,16 @@ int vp8_alloc_frame_buffers(VP8_COMMON *oci, int width, int height) } void vp8_setup_version(VP8_COMMON *cm) { - switch (cm->version) + if (cm->version & 0x4) + { + if (!CONFIG_EXPERIMENTAL) + vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM, + "Bitstream was created by an experimental " + "encoder"); + cm->experimental = 1; + } + + switch (cm->version & 0x3) { case 0: cm->no_lpf = 0; @@ -152,13 +161,6 @@ void vp8_setup_version(VP8_COMMON *cm) cm->use_bilinear_mc_filter = 1; cm->full_pixel = 1; break; - default: - /*4,5,6,7 are reserved for future use*/ - cm->no_lpf = 0; - cm->simpler_lpf = 0; - cm->use_bilinear_mc_filter = 0; - cm->full_pixel = 0; - break; } } void vp8_create_common(VP8_COMMON *oci) diff --git a/vp8/common/blockd.h b/vp8/common/blockd.h index 5a8991e65..3b4986f49 100644 --- a/vp8/common/blockd.h +++ b/vp8/common/blockd.h @@ -32,8 +32,8 @@ void vpx_log(const char *format, ...); #define UCONTEXT 1 #define VCONTEXT 2 #define Y2CONTEXT 3 - #define MB_FEATURE_TREE_PROBS 3 + #define MAX_MB_SEGMENTS 4 #define MAX_REF_LF_DELTAS 4 @@ -168,6 +168,7 @@ typedef struct int as_int; MV as_mv; } mv; + unsigned char segment_flag; unsigned char partitioning; unsigned char mb_skip_coeff; /* does this mb has coefficients at all, 1=no coefficients, 0=need decode tokens */ @@ -251,11 +252,15 @@ typedef struct /* 0 (do not update) 1 (update) the macroblock segmentation feature data. 
*/ unsigned char mb_segement_abs_delta; + unsigned char temporal_update; /* Per frame flags that define which MB level features (such as quantizer or loop filter level) */ /* are enabled and when enabled the proabilities used to decode the per MB flags in MB_MODE_INFO */ - vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS]; /* Probability Tree used to code Segment number */ - - signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; /* Segment parameters */ +#if CONFIG_SEGMENTATION + vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS + 3]; // Probability Tree used to code Segment number +#else + vp8_prob mb_segment_tree_probs[MB_FEATURE_TREE_PROBS]; +#endif + signed char segment_feature_data[MB_LVL_MAX][MAX_MB_SEGMENTS]; // Segment parameters /* mode_based Loop filter adjustment */ unsigned char mode_ref_lf_delta_enabled; diff --git a/vp8/common/entropy.c b/vp8/common/entropy.c index a1fe4f4ab..219483289 100644 --- a/vp8/common/entropy.c +++ b/vp8/common/entropy.c @@ -76,7 +76,7 @@ static const Prob Pcat3[] = { 173, 148, 140}; static const Prob Pcat4[] = { 176, 155, 140, 135}; static const Prob Pcat5[] = { 180, 157, 141, 134, 130}; static const Prob Pcat6[] = -{ 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129}; +{ 254, 254, 252, 249, 243, 230, 196, 177, 153, 140, 133, 130, 129}; static vp8_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[22]; @@ -111,7 +111,7 @@ static void init_bit_trees() init_bit_tree(cat3, 3); init_bit_tree(cat4, 4); init_bit_tree(cat5, 5); - init_bit_tree(cat6, 11); + init_bit_tree(cat6, 13); } vp8_extra_bit_struct vp8_extra_bits[12] = @@ -126,7 +126,7 @@ vp8_extra_bit_struct vp8_extra_bits[12] = { cat3, Pcat3, 3, 11}, { cat4, Pcat4, 4, 19}, { cat5, Pcat5, 5, 35}, - { cat6, Pcat6, 11, 67}, + { cat6, Pcat6, 13, 67}, { 0, 0, 0, 0} }; #include "defaultcoefcounts.h" diff --git a/vp8/common/entropy.h b/vp8/common/entropy.h index d174e45b9..77f2673aa 100644 --- a/vp8/common/entropy.h +++ b/vp8/common/entropy.h @@ -27,7 
+27,7 @@ #define DCT_VAL_CATEGORY3 7 /* 11-18 Extra Bits 3+1 */ #define DCT_VAL_CATEGORY4 8 /* 19-34 Extra Bits 4+1 */ #define DCT_VAL_CATEGORY5 9 /* 35-66 Extra Bits 5+1 */ -#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 11+1 */ +#define DCT_VAL_CATEGORY6 10 /* 67+ Extra Bits 13+1 */ #define DCT_EOB_TOKEN 11 /* EOB Extra Bits 0+0 */ #define vp8_coef_tokens 12 @@ -51,7 +51,7 @@ extern vp8_extra_bit_struct vp8_extra_bits[12]; /* indexed by token value */ #define PROB_UPDATE_BASELINE_COST 7 #define MAX_PROB 255 -#define DCT_MAX_VALUE 2048 +#define DCT_MAX_VALUE 8192 /* Coefficients are predicted via a 3-dimensional probability table. */ diff --git a/vp8/common/generic/systemdependent.c b/vp8/common/generic/systemdependent.c index b3eadaf27..6ba0cfb87 100644 --- a/vp8/common/generic/systemdependent.c +++ b/vp8/common/generic/systemdependent.c @@ -83,8 +83,18 @@ void vp8_machine_specific_config(VP8_COMMON *ctx) vp8_arch_x86_common_init(ctx); #endif + #if ARCH_ARM vp8_arch_arm_common_init(ctx); #endif +#if CONFIG_EXTEND_QRANGE + rtcd->idct.idct1 = vp8_short_idct4x4llm_1_c; + rtcd->idct.idct16 = vp8_short_idct4x4llm_c; + rtcd->idct.idct1_scalar_add = vp8_dc_only_idct_add_c; + rtcd->idct.iwalsh1 = vp8_short_inv_walsh4x4_1_c; + rtcd->idct.iwalsh16 = vp8_short_inv_walsh4x4_c; + +#endif + } diff --git a/vp8/common/idctllm.c b/vp8/common/idctllm.c index 196062df6..c65d35adc 100644 --- a/vp8/common/idctllm.c +++ b/vp8/common/idctllm.c @@ -22,6 +22,8 @@ * so * x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1). 
**************************************************************************/ +#include "vpx_ports/config.h" + static const int cospi8sqrt2minus1 = 20091; static const int sinpi8sqrt2 = 35468; static const int rounding = 0; @@ -75,11 +77,19 @@ void vp8_short_idct4x4llm_c(short *input, short *output, int pitch) d1 = temp1 + temp2; +#if !CONFIG_EXTEND_QRANGE op[0] = (a1 + d1 + 4) >> 3; op[3] = (a1 - d1 + 4) >> 3; op[1] = (b1 + c1 + 4) >> 3; op[2] = (b1 - c1 + 4) >> 3; +#else + op[0] = (a1 + d1 + 16) >> 5; + op[3] = (a1 - d1 + 16) >> 5; + + op[1] = (b1 + c1 + 16) >> 5; + op[2] = (b1 - c1 + 16) >> 5; +#endif ip += shortpitch; op += shortpitch; @@ -92,8 +102,11 @@ void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch) int a1; short *op = output; int shortpitch = pitch >> 1; +#if !CONFIG_EXTEND_QRANGE a1 = ((input[0] + 4) >> 3); - +#else + a1 = ((input[0] + 16) >> 5); +#endif for (i = 0; i < 4; i++) { op[0] = a1; @@ -106,7 +119,11 @@ void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch) void vp8_dc_only_idct_add_c(short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride) { +#if !CONFIG_EXTEND_QRANGE int a1 = ((input_dc + 4) >> 3); +#else + int a1 = ((input_dc + 16) >> 5); +#endif int r, c; for (r = 0; r < 4; r++) @@ -168,11 +185,17 @@ void vp8_short_inv_walsh4x4_c(short *input, short *output) c2 = a1 - b1; d2 = d1 - c1; +#if !CONFIG_EXTEND_QRANGE op[0] = (a2 + 3) >> 3; op[1] = (b2 + 3) >> 3; op[2] = (c2 + 3) >> 3; op[3] = (d2 + 3) >> 3; - +#else + op[0] = (a2 + 1) >> 2; + op[1] = (b2 + 1) >> 2; + op[2] = (c2 + 1) >> 2; + op[3] = (d2 + 1) >> 2; +#endif ip += 4; op += 4; } @@ -184,7 +207,11 @@ void vp8_short_inv_walsh4x4_1_c(short *input, short *output) int a1; short *op = output; - a1 = ((input[0] + 3) >> 3); +#if !CONFIG_EXTEND_QRANGE + a1 = (input[0] + 3 )>> 3; +#else + a1 = (input[0] + 1 )>> 2; +#endif for (i = 0; i < 4; i++) { diff --git a/vp8/common/maskingmv.c b/vp8/common/maskingmv.c new file mode 100644 
index 000000000..d01a18fc8 --- /dev/null +++ b/vp8/common/maskingmv.c @@ -0,0 +1,855 @@ +/* + ============================================================================ + Name : maskingmv.c + Author : jimbankoski + Version : + Copyright : Your copyright notice + Description : Hello World in C, Ansi-style + ============================================================================ + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +extern unsigned int vp8_sad16x16_sse3( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride, + int max_err); + +extern void vp8_sad16x16x3_sse3( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride, + int *results); + +extern int vp8_growmaskmb_sse3( + unsigned char *om, + unsigned char *nm); + +extern void vp8_makemask_sse3( + unsigned char *y, + unsigned char *u, + unsigned char *v, + unsigned char *ym, + int yp, + int uvp, + int ys, + int us, + int vs, + int yt, + int ut, + int vt); + +unsigned int vp8_sad16x16_unmasked_wmt( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride, + unsigned char *mask); + +unsigned int vp8_sad16x16_masked_wmt( + unsigned char *src_ptr, + int src_stride, + unsigned char *ref_ptr, + int ref_stride, + unsigned char *mask); + +unsigned int vp8_masked_predictor_wmt( + unsigned char *masked, + unsigned char *unmasked, + int src_stride, + unsigned char *dst_ptr, + int dst_stride, + unsigned char *mask); +unsigned int vp8_masked_predictor_uv_wmt( + unsigned char *masked, + unsigned char *unmasked, + int src_stride, + unsigned char *dst_ptr, + int dst_stride, + unsigned char *mask); +unsigned int vp8_uv_from_y_mask( + unsigned char *ymask, + unsigned char *uvmask); +int yp=16; +unsigned char sxy[]= +{ +40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90, 
+40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90, +40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90, +40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90, +40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90, +60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90, +60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90, +60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90, +40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90, +40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90, +40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90, +40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90, +40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90, +40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90, +40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90, +40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,80,120,120,90,90,90,90,90,80,120,120,90,90,90,90,90 +}; + +unsigned char sts[]= +{ +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 
+2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, +}; +unsigned char str[]= +{ +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, +1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 +}; + +unsigned char y[]= +{ +40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40, +40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40, +40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,40, +40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,40, +40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,40, +60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,40, +60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,40, +60,60,60,60,40,40,40,40,60,60,60,60,40,40,40,40, +40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,40, +40,60,60,60,60,40,40,40,40,60,60,60,60,40,40,40, +40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,40, +40,40,60,60,60,60,40,40,40,40,60,60,60,60,40,40, +40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40, +40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40, +40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40, +40,40,40,60,60,60,60,40,40,40,40,60,60,60,60,40 +}; +int uvp=8; +unsigned char u[]= +{ +90,80,70,70,90,90,90,17, +90,80,70,70,90,90,90,17, +84,70,70,90,90,90,17,17, +84,70,70,90,90,90,17,17, +80,70,70,90,90,90,17,17, +90,80,70,70,90,90,90,17, +90,80,70,70,90,90,90,17, +90,80,70,70,90,90,90,17 +}; + +unsigned char v[]= +{ +80,80,80,80,80,80,80,80, +80,80,80,80,80,80,80,80, +80,80,80,80,80,80,80,80, +80,80,80,80,80,80,80,80, +80,80,80,80,80,80,80,80, +80,80,80,80,80,80,80,80, +80,80,80,80,80,80,80,80, 
+80,80,80,80,80,80,80,80 +}; + +unsigned char ym[256]; +unsigned char uvm[64]; +typedef struct +{ + unsigned char y; + unsigned char yt; + unsigned char u; + unsigned char ut; + unsigned char v; + unsigned char vt; + unsigned char use; +} COLOR_SEG_ELEMENT; + +/* +COLOR_SEG_ELEMENT segmentation[]= +{ + { 60,4,80,17,80,10, 1}, + { 40,4,15,10,80,10, 1}, +}; +*/ + +COLOR_SEG_ELEMENT segmentation[]= +{ + { 79,44,92,44, 237,60, 1}, +}; + +unsigned char pixel_mask(unsigned char y,unsigned char u,unsigned char v, + COLOR_SEG_ELEMENT sgm[], + int c) +{ + COLOR_SEG_ELEMENT *s=sgm; + unsigned char m =0; + int i; + for(i=0;i<c;i++,s++) + m |= ( abs(y-s->y)< s->yt && + abs(u-s->u)< s->ut && + abs(v-s->v)< s->vt ? 255 : 0 ); + + return m; +} +int neighbors[256][8]; +int makeneighbors(void) +{ + int i,j; + for(i=0;i<256;i++) + { + int r=(i>>4),c=(i&15); + int ni=0; + for(j=0;j<8;j++) + neighbors[i][j]=i; + for(j=0;j<256;j++) + { + int nr=(j>>4),nc=(j&15); + if(abs(nr-r)<2&&abs(nc-c)<2) + neighbors[i][ni++]=j; + } + } + return 0; +} +void grow_ymask(unsigned char *ym) +{ + unsigned char nym[256]; + int i,j; + + for(i=0;i<256;i++) + { + nym[i]=ym[i]; + for(j=0;j<8;j++) + { + nym[i]|=ym[neighbors[i][j]]; + } + } + for(i=0;i<256;i++) + ym[i]=nym[i]; +} +void make_mb_mask(unsigned char *y, unsigned char *u, unsigned char *v, + unsigned char *ym, unsigned char *uvm, + int yp, int uvp, + COLOR_SEG_ELEMENT sgm[], + int count) +{ + int r,c; + unsigned char *oym = ym; + + memset(ym,20,256); + for(r=0;r<8;r++,uvm+=8,u+=uvp,v+=uvp,y+=(yp+yp),ym+=32) + for(c=0;c<8;c++) + { + int y1=y[c<<1]; + int u1=u[c]; + int v1=v[c]; + int m = pixel_mask(y1,u1,v1,sgm,count); + uvm[c] = m; + ym[c<<1] = uvm[c];// = pixel_mask(y[c<<1],u[c],v[c],sgm,count); + ym[(c<<1)+1] = pixel_mask(y[1+(c<<1)],u[c],v[c],sgm,count); + ym[(c<<1)+16] = pixel_mask(y[yp+(c<<1)],u[c],v[c],sgm,count); + ym[(c<<1)+17] = pixel_mask(y[1+yp+(c<<1)],u[c],v[c],sgm,count); + } + grow_ymask(oym); +} + +int masked_sad(unsigned char *src, 
int p, unsigned char *dst, int dp, + unsigned char *ym ) +{ + int i,j; + unsigned sad = 0; + for(i=0;i<16;i++,src+=p,dst+=dp,ym+=16) + for(j=0;j<16;j++) + if(ym[j]) + sad+= abs(src[j]-dst[j]); + + return sad; +} + +int compare_masks(unsigned char *sym, unsigned char *ym) +{ + int i,j; + unsigned sad = 0; + for(i=0;i<16;i++,sym += 16,ym+=16) + for(j=0;j<16;j++) + sad+= (sym[j]!=ym[j]?1:0); + + return sad; +} +int unmasked_sad(unsigned char *src, int p, unsigned char *dst, int dp, + unsigned char *ym) +{ + int i,j; + unsigned sad = 0; + for(i=0;i<16;i++,src+=p,dst+=dp,ym+=16) + for(j=0;j<16;j++) + if(!ym[j]) + sad+= abs(src[j]-dst[j]); + + return sad; +} +int masked_motion_search( unsigned char *y, unsigned char *u, unsigned char *v, + int yp, int uvp, + unsigned char *dy, unsigned char *du, unsigned char *dv, + int dyp, int duvp, + COLOR_SEG_ELEMENT sgm[], + int count, + int *mi, + int *mj, + int *ui, + int *uj, + int *wm) +{ + int i,j; + + unsigned char ym[256]; + unsigned char uvm[64]; + unsigned char dym[256]; + unsigned char duvm[64]; + unsigned int e = 0 ; + int beste=256; + int bmi=-32,bmj=-32; + int bui=-32,buj=-32; + int beste1=256; + int bmi1=-32,bmj1=-32; + int bui1=-32,buj1=-32; + int obeste; + + // first try finding best mask and then unmasked + beste = 0xffffffff; + + // find best unmasked mv + for(i=-32;i<32;i++) + { + unsigned char *dyz = i*dyp + dy; + unsigned char *duz = i/2*duvp + du; + unsigned char *dvz = i/2*duvp + dv; + for(j=-32;j<32;j++) + { + // 0,0 masked destination + make_mb_mask(dyz+j,duz+j/2, dvz+j/2, dym, duvm, dyp, duvp,sgm,count); + + e = unmasked_sad(y, yp, dyz+j, dyp, dym ); + + if(e<beste) + { + bui=i; + buj=j; + beste=e; + } + } + } + //bui=0;buj=0; + // best mv masked destination + make_mb_mask(dy+bui*dyp+buj,du+bui/2*duvp+buj/2, dv+bui/2*duvp+buj/2, + dym, duvm, dyp, duvp,sgm,count); + + obeste = beste; + beste = 0xffffffff; + + // find best masked + for(i=-32;i<32;i++) + { + unsigned char *dyz = i*dyp + dy; + 
for(j=-32;j<32;j++) + { + e = masked_sad(y, yp, dyz+j, dyp, dym ); + + if(e<beste) + { + bmi=i; + bmj=j; + beste=e; + } + } + } + beste1=beste+obeste; + bmi1=bmi;bmj1=bmj; + bui1=bui;buj1=buj; + + beste = 0xffffffff; + // source mask + make_mb_mask(y,u, v, ym, uvm, yp, uvp,sgm,count); + + // find best mask + for(i=-32;i<32;i++) + { + unsigned char *dyz = i*dyp + dy; + unsigned char *duz = i/2*duvp + du; + unsigned char *dvz = i/2*duvp + dv; + for(j=-32;j<32;j++) + { + // 0,0 masked destination + make_mb_mask(dyz+j,duz+j/2, dvz+j/2, dym, duvm, dyp, duvp,sgm,count); + + e = compare_masks(ym, dym); + + if(e<beste) + { + bmi=i; + bmj=j; + beste=e; + } + } + } + + + // best mv masked destination + make_mb_mask(dy+bmi*dyp+bmj,du+bmi/2*duvp+bmj/2, dv+bmi/2*duvp+bmj/2, + dym, duvm, dyp, duvp,sgm,count); + + obeste = masked_sad(y, yp, dy+bmi*dyp+bmj, dyp, dym ); + + beste = 0xffffffff; + + // find best unmasked mv + for(i=-32;i<32;i++) + { + unsigned char *dyz = i*dyp + dy; + for(j=-32;j<32;j++) + { + e = unmasked_sad(y, yp, dyz+j, dyp, dym ); + + if(e<beste) + { + bui=i; + buj=j; + beste=e; + } + } + } + beste += obeste; + + + if(beste<beste1) + { + *mi = bmi; + *mj = bmj; + *ui = bui; + *uj = buj; + *wm = 1; + } + else + { + *mi = bmi1; + *mj = bmj1; + *ui = bui1; + *uj = buj1; + *wm = 0; + + } + return 0; +} + +int predict(unsigned char *src, int p, unsigned char *dst, int dp, + unsigned char *ym, unsigned char *prd ) +{ + int i,j; + for(i=0;i<16;i++,src+=p,dst+=dp,ym+=16, prd+=16) + for(j=0;j<16;j++) + prd[j]=(ym[j] ? 
src[j]:dst[j]); + return 0; +} + +int fast_masked_motion_search( unsigned char *y, unsigned char *u, unsigned char *v, + int yp, int uvp, + unsigned char *dy, unsigned char *du, unsigned char *dv, + int dyp, int duvp, + COLOR_SEG_ELEMENT sgm[], + int count, + int *mi, + int *mj, + int *ui, + int *uj, + int *wm) +{ + int i,j; + + unsigned char ym[256]; + unsigned char ym2[256]; + unsigned char uvm[64]; + unsigned char dym2[256]; + unsigned char dym[256]; + unsigned char duvm[64]; + unsigned int e = 0 ; + int beste=256; + int bmi=-32,bmj=-32; + int bui=-32,buj=-32; + int beste1=256; + int bmi1=-32,bmj1=-32; + int bui1=-32,buj1=-32; + int obeste; + + // first try finding best mask and then unmasked + beste = 0xffffffff; + +#if 0 + for(i=0;i<16;i++) + { + unsigned char *dy = i*yp + y; + for(j=0;j<16;j++) + printf("%2x",dy[j]); + printf("\n"); + } + printf("\n"); + + for(i=-32;i<48;i++) + { + unsigned char *dyz = i*dyp + dy; + for(j=-32;j<48;j++) + printf("%2x",dyz[j]); + printf("\n"); + } +#endif + + // find best unmasked mv + for(i=-32;i<32;i++) + { + unsigned char *dyz = i*dyp + dy; + unsigned char *duz = i/2*duvp + du; + unsigned char *dvz = i/2*duvp + dv; + for(j=-32;j<32;j++) + { + // 0,0 masked destination + vp8_makemask_sse3(dyz+j,duz+j/2, dvz+j/2, dym, dyp, duvp, + sgm[0].y,sgm[0].u,sgm[0].v, + sgm[0].yt,sgm[0].ut,sgm[0].vt); + + vp8_growmaskmb_sse3(dym,dym2); + + e = vp8_sad16x16_unmasked_wmt(y, yp, dyz+j, dyp, dym2 ); + + if(e<beste) + { + bui=i; + buj=j; + beste=e; + } + } + } + //bui=0;buj=0; + // best mv masked destination + + vp8_makemask_sse3(dy+bui*dyp+buj,du+bui/2*duvp+buj/2, dv+bui/2*duvp+buj/2, + dym, dyp, duvp, + sgm[0].y,sgm[0].u,sgm[0].v, + sgm[0].yt,sgm[0].ut,sgm[0].vt); + + vp8_growmaskmb_sse3(dym,dym2); + + obeste = beste; + beste = 0xffffffff; + + // find best masked + for(i=-32;i<32;i++) + { + unsigned char *dyz = i*dyp + dy; + for(j=-32;j<32;j++) + { + e = vp8_sad16x16_masked_wmt(y, yp, dyz+j, dyp, dym2 ); + if(e<beste) + { + bmi=i; + bmj=j; 
+ beste=e; + } + } + } + beste1=beste+obeste; + bmi1=bmi;bmj1=bmj; + bui1=bui;buj1=buj; + + // source mask + vp8_makemask_sse3(y,u, v, + ym, yp, uvp, + sgm[0].y,sgm[0].u,sgm[0].v, + sgm[0].yt,sgm[0].ut,sgm[0].vt); + + vp8_growmaskmb_sse3(ym,ym2); + + // find best mask + for(i=-32;i<32;i++) + { + unsigned char *dyz = i*dyp + dy; + unsigned char *duz = i/2*duvp + du; + unsigned char *dvz = i/2*duvp + dv; + for(j=-32;j<32;j++) + { + // 0,0 masked destination + vp8_makemask_sse3(dyz+j,duz+j/2, dvz+j/2, dym, dyp, duvp, + sgm[0].y,sgm[0].u,sgm[0].v, + sgm[0].yt,sgm[0].ut,sgm[0].vt); + + vp8_growmaskmb_sse3(dym,dym2); + + e = compare_masks(ym2, dym2); + + if(e<beste) + { + bmi=i; + bmj=j; + beste=e; + } + } + } + + vp8_makemask_sse3(dy+bmi*dyp+bmj,du+bmi/2*duvp+bmj/2, dv+bmi/2*duvp+bmj/2, + dym, dyp, duvp, + sgm[0].y,sgm[0].u,sgm[0].v, + sgm[0].yt,sgm[0].ut,sgm[0].vt); + + vp8_growmaskmb_sse3(dym,dym2); + + obeste = vp8_sad16x16_masked_wmt(y, yp, dy+bmi*dyp+bmj, dyp, dym2 ); + + beste = 0xffffffff; + + // find best unmasked mv + for(i=-32;i<32;i++) + { + unsigned char *dyz = i*dyp + dy; + for(j=-32;j<32;j++) + { + e = vp8_sad16x16_unmasked_wmt(y, yp, dyz+j, dyp, dym2 ); + + if(e<beste) + { + bui=i; + buj=j; + beste=e; + } + } + } + beste += obeste; + + if(beste<beste1) + { + *mi = bmi; + *mj = bmj; + *ui = bui; + *uj = buj; + *wm = 1; + } + else + { + *mi = bmi1; + *mj = bmj1; + *ui = bui1; + *uj = buj1; + *wm = 0; + beste=beste1; + + } + return beste; +} + +int predict_all(unsigned char *ym, unsigned char *um, unsigned char *vm, + int ymp, int uvmp, + unsigned char *yp, unsigned char *up, unsigned char *vp, + int ypp, int uvpp, + COLOR_SEG_ELEMENT sgm[], + int count, + int mi, + int mj, + int ui, + int uj, + int wm) +{ + int i,j; + unsigned char dym[256]; + unsigned char dym2[256]; + unsigned char duvm[64]; + unsigned char *yu=ym,*uu=um, *vu=vm; + + unsigned char *dym3=dym2; + + ym+=mi*ymp+mj; + um+=mi/2*uvmp+mj/2; + vm+=mi/2*uvmp+mj/2; + + yu+=ui*ymp+uj; + 
uu+=ui/2*uvmp+uj/2; + vu+=ui/2*uvmp+uj/2; + + // best mv masked destination + if(wm) + vp8_makemask_sse3(ym,um, vm, dym, ymp, uvmp, + sgm[0].y,sgm[0].u,sgm[0].v, + sgm[0].yt,sgm[0].ut,sgm[0].vt); + else + vp8_makemask_sse3(yu,uu, vu, dym, ymp, uvmp, + sgm[0].y,sgm[0].u,sgm[0].v, + sgm[0].yt,sgm[0].ut,sgm[0].vt); + + vp8_growmaskmb_sse3(dym,dym2); + vp8_masked_predictor_wmt(ym,yu,ymp,yp,ypp,dym3); + vp8_uv_from_y_mask(dym3,duvm); + vp8_masked_predictor_uv_wmt(um,uu,uvmp,up,uvpp,duvm); + vp8_masked_predictor_uv_wmt(vm,vu,uvmp,vp,uvpp,duvm); + + return 0; +} + +unsigned char f0p[1280*720*3/2]; +unsigned char f1p[1280*720*3/2]; +unsigned char prd[1280*720*3/2]; +unsigned char msk[1280*720*3/2]; + + +int mainz(int argc, char *argv[]) { + + FILE *f=fopen(argv[1],"rb"); + FILE *g=fopen(argv[2],"wb"); + int w=atoi(argv[3]),h=atoi(argv[4]); + int y_stride=w,uv_stride=w/2; + int r,c; + unsigned char *f0=f0p,*f1=f1p,*t; + unsigned char ym[256],uvm[64]; + unsigned char ym2[256],uvm2[64]; + unsigned char ym3[256],uvm3[64]; + int a,b; + + COLOR_SEG_ELEMENT last={ 20,20,20,20, 230,20, 1},best; +#if 0 + makeneighbors(); + COLOR_SEG_ELEMENT segmentation[]= + { + { 60,4,80,17,80,10, 1}, + { 40,4,15,10,80,10, 1}, + }; + make_mb_mask(y, u, v,ym2,uvm2,16,8,segmentation,1); + + vp8_makemask_sse3(y,u,v,ym, (int) 16,(int) 8, + (int) segmentation[0].y,(int) segmentation[0].u,(int) segmentation[0].v, + segmentation[0].yt,segmentation[0].ut,segmentation[0].vt); + + vp8_growmaskmb_sse3(ym,ym3); + + a = vp8_sad16x16_masked_wmt(str,16,sts,16,ym3); + b = vp8_sad16x16_unmasked_wmt(str,16,sts,16,ym3); + + vp8_masked_predictor_wmt(str,sts,16,ym,16,ym3); + + vp8_uv_from_y_mask(ym3,uvm3); + + return 4; +#endif + makeneighbors(); + + + memset(prd,128,w*h*3/2); + + fread(f0,w*h*3/2,1,f); + + while(!feof(f)) + { + unsigned char *ys=f1,*yd=f0,*yp=prd; + unsigned char *us=f1+w*h,*ud=f0+w*h,*up=prd+w*h; + unsigned char *vs=f1+w*h*5/4,*vd=f0+w*h*5/4,*vp=prd+w*h*5/4; + fread(f1,w*h*3/2,1,f); + + 
ys+=32*y_stride;yd+=32*y_stride;yp+=32*y_stride; + us+=16*uv_stride;ud+=16*uv_stride;up+=16*uv_stride; + vs+=16*uv_stride;vd+=16*uv_stride;vp+=16*uv_stride; + for(r=32;r<h-32;r+=16, + ys+=16*w,yd+=16*w,yp+=16*w, + us+=8*uv_stride,ud+=8*uv_stride,up+=8*uv_stride, + vs+=8*uv_stride,vd+=8*uv_stride,vp+=8*uv_stride) + { + for(c=32;c<w-32;c+=16) + { + int mi,mj,ui,uj,wm; + int bmi,bmj,bui,buj,bwm; + unsigned char ym[256]; + + if(vp8_sad16x16_sse3( ys+c,y_stride, yd+c,y_stride,0xffff) == 0) + bmi=bmj=bui=buj=bwm=0; + else + { + COLOR_SEG_ELEMENT cs[5]; + int j; + unsigned int beste=0xfffffff; + unsigned int bestj=0; + + // try color from last mb segmentation + cs[0] = last; + + // try color segs from 4 pixels in mb recon as segmentation + cs[1].y = yd[c + y_stride + 1];cs[1].u = ud[c/2 + uv_stride]; + cs[1].v = vd[c/2 + uv_stride]; + cs[1].yt = cs[1].ut = cs[1].vt = 20; + cs[2].y = yd[c + w + 14]; + cs[2].u = ud[c/2 + uv_stride+7]; + cs[2].v = vd[c/2 + uv_stride+7]; + cs[2].yt = cs[2].ut = cs[2].vt = 20; + cs[3].y = yd[c + w*14 + 1]; + cs[3].u = ud[c/2 + uv_stride*7]; + cs[3].v = vd[c/2 + uv_stride*7]; + cs[3].yt = cs[3].ut = cs[3].vt = 20; + cs[4].y = yd[c + w*14 + 14]; + cs[4].u = ud[c/2 + uv_stride*7+7]; + cs[4].v = vd[c/2 + uv_stride*7+7]; + cs[4].yt = cs[4].ut = cs[4].vt = 20; + + for(j=0;j<5;j++) + { + int e; + + e = fast_masked_motion_search( + ys+c, us+c/2, vs+c/2, y_stride, uv_stride, + yd+c, ud+c/2, vd+c/2, y_stride, uv_stride, + &cs[j], 1, &mi,&mj,&ui,&uj,&wm); + + if(e<beste) + { + bmi=mi;bmj=mj;bui=ui;buj=uj,bwm=wm; + bestj=j; + beste=e; + } + } + best = cs[bestj]; + //best = segmentation[0]; + last = best; + } + predict_all(yd+c, ud+c/2, vd+c/2, w, uv_stride, + yp+c, up+c/2, vp+c/2, w, uv_stride, + &best, 1, bmi,bmj,bui,buj,bwm); + + } + } + fwrite(prd,w*h*3/2,1,g); + t=f0; + f0=f1; + f1=t; + + } + fclose(f); + fclose(g); + return; +} diff --git a/vp8/common/onyxc_int.h b/vp8/common/onyxc_int.h index e011ec99a..90d63e535 100644 --- a/vp8/common/onyxc_int.h 
+++ b/vp8/common/onyxc_int.h @@ -117,6 +117,7 @@ typedef struct VP8Common int mode_info_stride; /* profile settings */ + int experimental; int mb_no_coeff_skip; int no_lpf; int simpler_lpf; diff --git a/vp8/common/quant_common.c b/vp8/common/quant_common.c index e9833fe33..b8e6e2972 100644 --- a/vp8/common/quant_common.c +++ b/vp8/common/quant_common.c @@ -11,6 +11,8 @@ #include "quant_common.h" + +#if !CONFIG_EXTEND_QRANGE static const int dc_qlookup[QINDEX_RANGE] = { 4, 5, 6, 7, 8, 9, 10, 10, 11, 12, 13, 14, 15, 16, 17, 17, @@ -34,7 +36,32 @@ static const int ac_qlookup[QINDEX_RANGE] = 155, 158, 161, 164, 167, 170, 173, 177, 181, 185, 189, 193, 197, 201, 205, 209, 213, 217, 221, 225, 229, 234, 239, 245, 249, 254, 259, 264, 269, 274, 279, 284, }; +#else + +static const int dc_qlookup[QINDEX_RANGE] = +{ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 30, 32, 34, 36, 38, 40, 42, + 44, 46, 49, 52, 55, 58, 61, 64, 67, 70, 73, 76, 79, 82, 85, 88, + 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, + 156, 160, 164, 168, 172, 176, 180, 184, 188, 192, 196, 200, 205, 210, 215, 220, + 225, 230, 235, 240, 245, 250, 255, 260, 265, 270, 275, 280, 285, 290, 295, 300, + 310, 320, 330, 340, 350, 360, 370, 380, 390, 400, 410, 420, 430, 440, 450, 460, + 472, 484, 496, 508, 520, 532, 544, 556, 572, 588, 608, 628, 648, 668, 692, 720, +}; +static const int ac_qlookup[QINDEX_RANGE] = +{ + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 51, + 54, 57, 60, 63, 66, 69, 72, 76, 80, 84, 88, 92, 96, 100, 105, 110, + 115, 120, 125, 130, 135, 140, 146, 152, 158, 164, 170, 176, 182, 188, 194, 200, + 206, 212, 218, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, + 330, 340, 350, 360, 370, 380, 392, 404, 416, 428, 440, 454, 468, 482, 496, 510, + 524, 540, 556, 572, 588, 604, 622, 640, 658, 676, 696, 716, 736, 756, 776, 796, + 820, 
844, 868, 892, 916, 944, 972, 1000, 1032, 1064, 1096, 1128, 1168, 1208, 1252, 1300 +}; +#endif int vp8_dc_quant(int QIndex, int Delta) { @@ -62,7 +89,11 @@ int vp8_dc2quant(int QIndex, int Delta) else if (QIndex < 0) QIndex = 0; +#if !CONFIG_EXTEND_QRANGE retval = dc_qlookup[ QIndex ] * 2; +#else + retval = dc_qlookup[ QIndex ]; +#endif return retval; } @@ -72,16 +103,13 @@ int vp8_dc_uv_quant(int QIndex, int Delta) QIndex = QIndex + Delta; - if (QIndex > 127) - QIndex = 127; + if (QIndex > 117) + QIndex = 117; else if (QIndex < 0) QIndex = 0; retval = dc_qlookup[ QIndex ]; - if (retval > 132) - retval = 132; - return retval; } @@ -108,12 +136,13 @@ int vp8_ac2quant(int QIndex, int Delta) QIndex = 127; else if (QIndex < 0) QIndex = 0; - +#if !CONFIG_EXTEND_QRANGE retval = (ac_qlookup[ QIndex ] * 155) / 100; - if (retval < 8) retval = 8; - +#else + retval = ac_qlookup[ QIndex ]; +#endif return retval; } int vp8_ac_uv_quant(int QIndex, int Delta) diff --git a/vp8/common/reconintra4x4.c b/vp8/common/reconintra4x4.c index db44fa190..d3d133836 100644 --- a/vp8/common/reconintra4x4.c +++ b/vp8/common/reconintra4x4.c @@ -81,10 +81,10 @@ void vp8_predict_intra4x4(BLOCKD *x, { unsigned int ap[4]; - ap[0] = (top_left + 2 * Above[0] + Above[1] + 2) >> 2; - ap[1] = (Above[0] + 2 * Above[1] + Above[2] + 2) >> 2; - ap[2] = (Above[1] + 2 * Above[2] + Above[3] + 2) >> 2; - ap[3] = (Above[2] + 2 * Above[3] + Above[4] + 2) >> 2; + ap[0] = Above[0]; + ap[1] = Above[1]; + ap[2] = Above[2]; + ap[3] = Above[3]; for (r = 0; r < 4; r++) { @@ -105,10 +105,10 @@ void vp8_predict_intra4x4(BLOCKD *x, { unsigned int lp[4]; - lp[0] = (top_left + 2 * Left[0] + Left[1] + 2) >> 2; - lp[1] = (Left[0] + 2 * Left[1] + Left[2] + 2) >> 2; - lp[2] = (Left[1] + 2 * Left[2] + Left[3] + 2) >> 2; - lp[3] = (Left[2] + 2 * Left[3] + Left[3] + 2) >> 2; + lp[0] = Left[0]; + lp[1] = Left[1]; + lp[2] = Left[2]; + lp[3] = Left[3]; for (r = 0; r < 4; r++) { diff --git a/vp8/common/x86/mask_sse3.asm 
b/vp8/common/x86/mask_sse3.asm new file mode 100644 index 000000000..0d90cfa86 --- /dev/null +++ b/vp8/common/x86/mask_sse3.asm @@ -0,0 +1,484 @@
;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; NOTE(review): every routine in this file uses 64-bit-only registers
; (r8, r11, xmm8-xmm15), so the file can only be assembled for x86-64.
; NOTE(review): pshufb is an SSSE3 instruction even though the routines are
; named *_sse3; callers should gate on SSSE3 support.
; NOTE(review): xmm6-xmm15 are callee-saved in the Microsoft x64 ABI and are
; clobbered here without being preserved; confirm whether Win64 is a target.

%include "vpx_ports/x86_abi_support.asm"

;vp8_makemask_sse3(
;    unsigned char *y,      arg(0)   luma source, 16 bytes read per row
;    unsigned char *u,      arg(1)   chroma u source, 8 bytes read per row
;    unsigned char *v,      arg(2)   chroma v source, 8 bytes read per row
;    unsigned char *ym,     arg(3)   output mask, 16 rows of 16 bytes,
;                                    written with aligned stores
;    int yp,                arg(4)   luma pitch
;    int uvp,               arg(5)   chroma pitch
;    int ys,                arg(6)   center y value
;    int us,                arg(7)   center u value
;    int vs,                arg(8)   center v value
;    int yt,                arg(9)   y tolerance
;    int ut,                arg(10)  u tolerance
;    int vt)                arg(11)  v tolerance
;
; For each of the 16x16 luma positions a mask byte of 0xff is written when
; |y - ys| < yt and the co-located chroma samples satisfy |u - us| < ut and
; |v - vs| < vt; otherwise 0x00 is written.  Each chroma sample covers two
; luma columns and two luma rows.
;
; No return value is produced on purpose: rax is left pointing just past the
; 256 mask bytes that were written.
;
; NOTE(review): an earlier version of this comment listed an extra
; "unsigned char *uvm" parameter after ym.  The code has never read such an
; argument (arg(4) is used as yp); confirm against the C declaration.
; NOTE(review): pcmpgtb is a *signed* byte compare, so an absolute difference
; of 128 or more is seen as negative and passes the tolerance test.  Left
; unchanged here; confirm whether that is intended.
global sym(vp8_makemask_sse3)
sym(vp8_makemask_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 12         ; only arg(0)..arg(11) are read
    push        rsi
    push        rdi
    push        rbx                 ; rbx is callee-saved and is used below
    ; end prolog

    mov         rsi,        arg(0) ;y
    mov         rdi,        arg(1) ;u
    mov         rcx,        arg(2) ;v
    mov         rax,        arg(3) ;ym
    movsxd      rbx,        dword arg(4) ;yp
    movsxd      rdx,        dword arg(5) ;uvp

    ; an all-zero shuffle control makes pshufb broadcast byte 0
    pxor        xmm0,       xmm0

    ;make 16 copies of the center y value
    movd        xmm1,       arg(6)
    pshufb      xmm1,       xmm0

    ; make 16 copies of the center u value
    movd        xmm2,       arg(7)
    pshufb      xmm2,       xmm0

    ; make 16 copies of the center v value
    movd        xmm3,       arg(8)
    pshufb      xmm3,       xmm0
    unpcklpd    xmm2,       xmm3    ; xmm2 = 8 x us | 8 x vs

    ;make 16 copies of the y tolerance
    movd        xmm3,       arg(9)
    pshufb      xmm3,       xmm0

    ;make 16 copies of the u tolerance
    movd        xmm4,       arg(10)
    pshufb      xmm4,       xmm0

    ;make 16 copies of the v tolerance
    movd        xmm5,       arg(11)
    pshufb      xmm5,       xmm0
    unpckhpd    xmm4,       xmm5    ; xmm4 = 8 x ut | 8 x vt

    mov         r8,         8       ; 8 iterations x 2 luma rows = 16 rows

NextPairOfRows:

    ;grab the y source values (first row of the pair)
    movdqu      xmm0,       [rsi]

    ;compute abs difference between source and y target
    movdqa      xmm6,       xmm1
    movdqa      xmm7,       xmm0
    psubusb     xmm0,       xmm1
    psubusb     xmm6,       xmm7
    por         xmm0,       xmm6

    ;check whether the difference is < y tolerance
    movdqa      xmm6,       xmm3
    pcmpgtb     xmm6,       xmm0

    ;grab the y source values (second row of the pair)
    add         rsi,        rbx
    movdqu      xmm0,       [rsi]

    ;compute abs difference between source and y target
    movdqa      xmm11,      xmm1
    movdqa      xmm7,       xmm0
    psubusb     xmm0,       xmm1
    psubusb     xmm11,      xmm7
    por         xmm0,       xmm11

    ;check whether the difference is < y tolerance
    movdqa      xmm11,      xmm3
    pcmpgtb     xmm11,      xmm0


    ;grab the u and v source values; only 8 bytes of each are used, so
    ;load exactly 8 instead of over-reading 16
    movq        xmm7,       [rdi]
    movq        xmm8,       [rcx]
    unpcklpd    xmm7,       xmm8    ; xmm7 = 8 x u | 8 x v

    ;compute abs difference between source and uv targets
    movdqa      xmm9,       xmm2
    movdqa      xmm10,      xmm7
    psubusb     xmm7,       xmm2
    psubusb     xmm9,       xmm10
    por         xmm7,       xmm9

    ;check whether the number is < tolerance
    movdqa      xmm0,       xmm4
    pcmpgtb     xmm0,       xmm7

    ;double u and v masks so each chroma sample covers two luma columns
    movdqa      xmm8,       xmm0
    punpckhbw   xmm0,       xmm0    ; v mask, 16 bytes
    punpcklbw   xmm8,       xmm8    ; u mask, 16 bytes

    ;mask row 0 and output
    pand        xmm6,       xmm8
    pand        xmm6,       xmm0
    movdqa      [rax],      xmm6

    ;mask row 1 and output
    pand        xmm11,      xmm8
    pand        xmm11,      xmm0
    movdqa      [rax+16],   xmm11


    ; to the next row or set of rows
    add         rsi,        rbx
    add         rdi,        rdx
    add         rcx,        rdx
    add         rax,        32
    dec         r8
    jnz         NextPairOfRows


    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;GROW_HORIZ (register for result, source register or mem local)
; copies the source into the result, then ORs in the source shifted one
; byte left and one byte right.  Clobbers xmm14 and xmm15.  A memory source
; is read with movdqa and therefore must be 16-byte aligned.
%macro GROW_HORIZ 2
    movdqa      %1,         %2
    movdqa      xmm14,      %1
    movdqa      xmm15,      %1
    pslldq      xmm14,      1
    psrldq      xmm15,      1
    por         %1,         xmm14
    por         %1,         xmm15
%endmacro

;GROW_VERT (result, center row, above row, below row)
; result = center | above | below
%macro GROW_VERT 4
    movdqa      %1,         %2
    por         %1,         %3
    por         %1,         %4
%endmacro

;GROW_NEXTLINE (new line to grow, new source, line to write)
; grows the new source row horizontally into the first operand, then writes
; xmm0 | xmm1 | xmm2 (via xmm3) to the output line.  xmm0-xmm2 act as a
; three-row rotating window, so the order of the ORs does not matter.
%macro GROW_NEXTLINE 3
    GROW_HORIZ %1, %2
    GROW_VERT xmm3, xmm0, xmm1, xmm2
    movdqa      %3,         xmm3
%endmacro


;vp8_growmaskmb_sse3(
;    unsigned char *om,     arg(0)  source mask, 16 rows of 16 bytes
;    unsigned char *nm)     arg(1)  result mask, 16 rows of 16 bytes
;
; Writes to nm the source mask grown by one byte in every direction: each
; output byte is the OR of the 3x3 neighbourhood of the matching source byte
; (neighbours outside the 16x16 block are ignored).  Both buffers are
; accessed with movdqa and must be 16-byte aligned.  rax is not written.
global sym(vp8_growmaskmb_sse3)
sym(vp8_growmaskmb_sse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0) ;src
    mov         rdi,        arg(1) ;rst

    GROW_HORIZ xmm0, [rsi]
    GROW_HORIZ xmm1, [rsi+16]
    GROW_HORIZ xmm2, [rsi+32]

    ; row 1 has both neighbours; row 0 has no row above it
    GROW_VERT xmm3, xmm0, xmm1, xmm2
    por         xmm0,       xmm1
    movdqa      [rdi],      xmm0
    movdqa      [rdi+16],   xmm3

    GROW_NEXTLINE xmm0,[rsi+48],[rdi+32]
    GROW_NEXTLINE xmm1,[rsi+64],[rdi+48]
    GROW_NEXTLINE xmm2,[rsi+80],[rdi+64]
    GROW_NEXTLINE xmm0,[rsi+96],[rdi+80]
    GROW_NEXTLINE xmm1,[rsi+112],[rdi+96]
    GROW_NEXTLINE xmm2,[rsi+128],[rdi+112]
    GROW_NEXTLINE xmm0,[rsi+144],[rdi+128]
    GROW_NEXTLINE xmm1,[rsi+160],[rdi+144]
    GROW_NEXTLINE xmm2,[rsi+176],[rdi+160]
    GROW_NEXTLINE xmm0,[rsi+192],[rdi+176]
    GROW_NEXTLINE xmm1,[rsi+208],[rdi+192]
    GROW_NEXTLINE xmm2,[rsi+224],[rdi+208]
    GROW_NEXTLINE xmm0,[rsi+240],[rdi+224]

    ; last row (15) has no row below it: row 15 | row 14
    por         xmm0,       xmm2
    movdqa      [rdi+240],  xmm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret



;unsigned int vp8_sad16x16_masked_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned char *mask)
;
; Sum of absolute differences over a 16x16 block after ANDing both source
; and reference with the mask, so bytes whose mask is 0x00 contribute
; nothing.  The mask is read as 16 consecutive rows of 16 bytes.
; The sum is returned in rax.
global sym(vp8_sad16x16_masked_wmt)
sym(vp8_sad16x16_masked_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx                 ; rbx is callee-saved and is used below
    ; end prolog
    mov         rsi,        arg(0) ;src_ptr
    mov         rdi,        arg(2) ;ref_ptr

    mov         rbx,        arg(4) ;mask
    movsxd      rax,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    mov         rcx,        16

    pxor        xmm3,       xmm3    ; running sum (one per 8-byte half)

NextSadRow:
    movdqu      xmm0,       [rsi]
    movdqu      xmm1,       [rdi]
    movdqu      xmm2,       [rbx]
    pand        xmm0,       xmm2
    pand        xmm1,       xmm2

    psadbw      xmm0,       xmm1
    paddw       xmm3,       xmm0

    add         rsi,        rax
    add         rdi,        rdx
    add         rbx,        16

    dec         rcx
    jnz         NextSadRow

    ; fold the high-half sum into the low half and return it
    movdqa      xmm4,       xmm3
    psrldq      xmm4,       8
    paddw       xmm3,       xmm4
    movq        rax,        xmm3
    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_sad16x16_unmasked_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned char *mask)
;
; Sum of absolute differences over a 16x16 block after ORing both source
; and reference with the mask, so bytes whose mask is 0xff become equal in
; both operands and contribute nothing.  The mask is read as 16 consecutive
; rows of 16 bytes.  The sum is returned in rax.
global sym(vp8_sad16x16_unmasked_wmt)
sym(vp8_sad16x16_unmasked_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx                 ; rbx is callee-saved and is used below
    ; end prolog
    mov         rsi,        arg(0) ;src_ptr
    mov         rdi,        arg(2) ;ref_ptr

    mov         rbx,        arg(4) ;mask
    movsxd      rax,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    mov         rcx,        16

    pxor        xmm3,       xmm3    ; running sum (one per 8-byte half)

next_vp8_sad16x16_unmasked_wmt:
    movdqu      xmm0,       [rsi]
    movdqu      xmm1,       [rdi]
    movdqu      xmm2,       [rbx]
    por         xmm0,       xmm2
    por         xmm1,       xmm2

    psadbw      xmm0,       xmm1
    paddw       xmm3,       xmm0

    add         rsi,        rax
    add         rdi,        rdx
    add         rbx,        16

    dec         rcx
    jnz         next_vp8_sad16x16_unmasked_wmt

    ; fold the high-half sum into the low half and return it
    movdqa      xmm4,       xmm3
    psrldq      xmm4,       8
    paddw       xmm3,       xmm4
    movq        rax,        xmm3
    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_masked_predictor_wmt(
;    unsigned char *masked,
;    unsigned char *unmasked,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    unsigned char *mask)
;
; Blends two 16x16 predictors: each destination byte is
; (masked & mask) | (unmasked & ~mask).  Both sources advance by src_stride,
; the destination by dst_stride, and the mask by 16 bytes per row.
; No meaningful value is returned (rax still holds src_stride on exit).
global sym(vp8_masked_predictor_wmt)
sym(vp8_masked_predictor_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx                 ; rbx is callee-saved and is used below
    ; end prolog
    mov         rsi,        arg(0) ;masked source
    mov         rdi,        arg(1) ;unmasked source

    mov         rbx,        arg(5) ;mask
    movsxd      rax,        dword ptr arg(2) ;src_stride
    mov         r11,        arg(3) ; destination
    movsxd      rdx,        dword ptr arg(4) ;dst_stride

    mov         rcx,        16

next_vp8_masked_predictor_wmt:
    movdqu      xmm0,       [rsi]
    movdqu      xmm1,       [rdi]
    movdqu      xmm2,       [rbx]

    pand        xmm0,       xmm2
    pandn       xmm2,       xmm1    ; xmm2 = ~mask & unmasked
    por         xmm0,       xmm2
    movdqu      [r11],      xmm0

    add         r11,        rdx
    add         rsi,        rax
    ; both sources share src_stride; this previously advanced by dst_stride,
    ; unlike the uv variant of this routine
    add         rdi,        rax
    add         rbx,        16

    dec         rcx
    jnz         next_vp8_masked_predictor_wmt

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;unsigned int vp8_masked_predictor_uv_wmt(
;    unsigned char *masked,
;    unsigned char *unmasked,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    unsigned char *mask)
;
; 8x8 version of the masked predictor blend: each destination byte is
; (masked & mask) | (unmasked & ~mask).  Both sources advance by src_stride,
; the destination by dst_stride, and the mask by 8 bytes per row.
; No meaningful value is returned (rax still holds src_stride on exit).
global sym(vp8_masked_predictor_uv_wmt)
sym(vp8_masked_predictor_uv_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx                 ; rbx is callee-saved and is used below
    ; end prolog
    mov         rsi,        arg(0) ;masked source
    mov         rdi,        arg(1) ;unmasked source

    mov         rbx,        arg(5) ;mask
    movsxd      rax,        dword ptr arg(2) ;src_stride
    mov         r11,        arg(3) ; destination
    movsxd      rdx,        dword ptr arg(4) ;dst_stride

    mov         rcx,        8

next_vp8_masked_predictor_uv_wmt:
    movq        xmm0,       [rsi]
    movq        xmm1,       [rdi]
    movq        xmm2,       [rbx]

    pand        xmm0,       xmm2
    pandn       xmm2,       xmm1    ; xmm2 = ~mask & unmasked
    por         xmm0,       xmm2
    movq        [r11],      xmm0

    add         r11,        rdx
    add         rsi,        rax
    add         rdi,        rax
    add         rbx,        8

    dec         rcx
    jnz         next_vp8_masked_predictor_uv_wmt

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_uv_from_y_mask(
;    unsigned char *ymask,
;    unsigned char *uvmask)
;
; Subsamples a 16x16 mask (16 rows of 16 bytes) to 8x8 (8 rows of 8 bytes)
; by keeping the even-numbered bytes of every even-numbered row.
; rax is not written, so no meaningful value is returned.
global sym(vp8_uv_from_y_mask)
sym(vp8_uv_from_y_mask):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2          ; only arg(0) and arg(1) are read
    push        rsi
    push        rdi
    ; end prolog
    mov         rsi,        arg(0) ;src_ptr
    mov         rdi,        arg(1) ;dst_ptr

    mov         rcx,        8

    ; load the shuffle control once; RIP-relative so the code stays
    ; position independent (this file is 64-bit only)
    movdqa      xmm1,       [rel shuf1b]

next_vp8_uv_from_y_mask:
    movdqu      xmm0,       [rsi]
    pshufb      xmm0,       xmm1    ; even bytes -> low 8 bytes
    movq        [rdi],      xmm0
    add         rdi,        8
    add         rsi,        32      ; skip the odd row

    dec         rcx
    jnz         next_vp8_uv_from_y_mask

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
; pshufb control: gather the even-numbered bytes into the low 8 bytes
; (the high 8 bytes are don't-care)
shuf1b:
    db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0