diff options
-rwxr-xr-x | configure | 1 | ||||
-rw-r--r-- | vp9/common/vp9_blockd.h | 22 | ||||
-rw-r--r-- | vp9/common/vp9_enums.h | 1 | ||||
-rw-r--r-- | vp9/common/vp9_loopfilter_filters.c | 34 | ||||
-rw-r--r-- | vp9/common/vp9_rtcd_defs.sh | 57 | ||||
-rw-r--r-- | vp9/common/x86/vp9_loopfilter_intrin_mmx.c | 16 | ||||
-rw-r--r-- | vp9/common/x86/vp9_loopfilter_intrin_sse2.c | 16 | ||||
-rw-r--r-- | vp9/common/x86/vp9_loopfilter_mmx.asm | 343 | ||||
-rw-r--r-- | vp9/common/x86/vp9_loopfilter_sse2.asm | 366 | ||||
-rw-r--r-- | vp9/common/x86/vp9_loopfilter_x86.h | 8 | ||||
-rw-r--r-- | vp9/decoder/vp9_decodframe.c | 4 | ||||
-rw-r--r-- | vp9/decoder/vp9_detokenize.c | 7 | ||||
-rw-r--r-- | vp9/encoder/vp9_block.h | 8 | ||||
-rw-r--r-- | vp9/encoder/vp9_encodeframe.c | 781 | ||||
-rw-r--r-- | vp9/encoder/vp9_firstpass.c | 2 | ||||
-rw-r--r-- | vp9/encoder/vp9_mcomp.c | 297 | ||||
-rw-r--r-- | vp9/encoder/vp9_mcomp.h | 16 | ||||
-rw-r--r-- | vp9/encoder/vp9_onyx_if.c | 40 | ||||
-rw-r--r-- | vp9/encoder/vp9_rdopt.c | 199 | ||||
-rw-r--r-- | vp9/encoder/vp9_variance.h | 46 | ||||
-rw-r--r-- | vp9/encoder/vp9_variance_c.c | 256 |
21 files changed, 1113 insertions, 1407 deletions
@@ -247,6 +247,7 @@ EXPERIMENT_LIST=" multiple_arf non420 ab4x4 + comp_inter_joint_search " CONFIG_LIST=" external_build diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h index ab9e28dcc..004054d10 100644 --- a/vp9/common/vp9_blockd.h +++ b/vp9/common/vp9_blockd.h @@ -70,17 +70,17 @@ typedef enum { } INTERPOLATIONFILTERTYPE; typedef enum { - DC_PRED, /* average of above and left pixels */ - V_PRED, /* vertical prediction */ - H_PRED, /* horizontal prediction */ - D45_PRED, /* Directional 45 deg prediction [anti-clockwise from 0 deg hor] */ - D135_PRED, /* Directional 135 deg prediction [anti-clockwise from 0 deg hor] */ - D117_PRED, /* Directional 112 deg prediction [anti-clockwise from 0 deg hor] */ - D153_PRED, /* Directional 157 deg prediction [anti-clockwise from 0 deg hor] */ - D27_PRED, /* Directional 22 deg prediction [anti-clockwise from 0 deg hor] */ - D63_PRED, /* Directional 67 deg prediction [anti-clockwise from 0 deg hor] */ - TM_PRED, /* Truemotion prediction */ - I4X4_PRED, /* 4x4 based prediction, each 4x4 has its own mode */ + DC_PRED, // Average of above and left pixels + V_PRED, // Vertical + H_PRED, // Horizontal + D45_PRED, // Directional 45 deg = round(arctan(1/1) * 180/pi) + D135_PRED, // Directional 135 deg = 180 - 45 + D117_PRED, // Directional 117 deg = 180 - 63 + D153_PRED, // Directional 153 deg = 180 - 27 + D27_PRED, // Directional 27 deg = round(arctan(1/2) * 180/pi) + D63_PRED, // Directional 63 deg = round(arctan(2/1) * 180/pi) + TM_PRED, // True-motion + I4X4_PRED, // Each 4x4 subblock has its own mode NEARESTMV, NEARMV, ZEROMV, diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h index 166319565..2f6707487 100644 --- a/vp9/common/vp9_enums.h +++ b/vp9/common/vp9_enums.h @@ -36,6 +36,7 @@ typedef enum BLOCK_SIZE_TYPE { BLOCK_SIZE_SB32X64, BLOCK_SIZE_SB64X32, BLOCK_SIZE_SB64X64, + BLOCK_SIZE_TYPES } BLOCK_SIZE_TYPE; typedef enum PARTITION_TYPE { diff --git a/vp9/common/vp9_loopfilter_filters.c 
b/vp9/common/vp9_loopfilter_filters.c index 15785f581..bf97589a9 100644 --- a/vp9/common/vp9_loopfilter_filters.c +++ b/vp9/common/vp9_loopfilter_filters.c @@ -282,29 +282,6 @@ static INLINE void simple_filter(int8_t mask, *op0 = signed_char_clamp(p0 + filter2) ^ 0x80; } -void vp9_loop_filter_simple_horizontal_edge_c(uint8_t *s, int p, - const uint8_t *blimit) { - int i = 0; - - do { - const int8_t mask = simple_filter_mask(blimit[0], s[-2 * p], s[-1 * p], - s[0 * p], s[1 * p]); - simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p); - ++s; - } while (++i < 16); -} - -void vp9_loop_filter_simple_vertical_edge_c(uint8_t *s, int p, - const uint8_t *blimit) { - int i = 0; - - do { - const int8_t mask = simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]); - simple_filter(mask, s - 2, s - 1, s, s + 1); - s += p; - } while (++i < 16); -} - /* Vertical MB Filtering */ void vp9_loop_filter_mbv_c(uint8_t *y_ptr, uint8_t *u_ptr, uint8_t *v_ptr, int y_stride, int uv_stride, @@ -392,11 +369,6 @@ void vp9_loop_filter_bh8x8_c(uint8_t *y, uint8_t *u, uint8_t *v, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp9_loop_filter_bhs_c(uint8_t *y, int y_stride, const uint8_t *blimit) { - vp9_loop_filter_simple_horizontal_edge_c(y + 4 * y_stride, y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_c(y + 8 * y_stride, y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_c(y + 12 * y_stride, y_stride, blimit); -} void vp9_loop_filter_bv8x8_c(uint8_t *y, uint8_t *u, uint8_t *v, int y_stride, int uv_stride, @@ -413,12 +385,6 @@ void vp9_loop_filter_bv8x8_c(uint8_t *y, uint8_t *u, uint8_t *v, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp9_loop_filter_bvs_c(uint8_t *y, int y_stride, const uint8_t *blimit) { - vp9_loop_filter_simple_vertical_edge_c(y + 4, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_c(y + 8, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_c(y + 12, y_stride, blimit); -} - static INLINE void wide_mbfilter(int8_t mask, uint8_t 
hev, uint8_t flat, uint8_t flat2, uint8_t *op7, uint8_t *op6, uint8_t *op5, diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 75e36040c..02d32530a 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh @@ -128,30 +128,6 @@ specialize vp9_loop_filter_bh sse2 prototype void vp9_loop_filter_bh8x8 "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi" specialize vp9_loop_filter_bh8x8 sse2 -prototype void vp9_loop_filter_simple_mbv "uint8_t *y, int ystride, const uint8_t *blimit" -specialize vp9_loop_filter_simple_mbv mmx sse2 -vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c -vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx -vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2 - -prototype void vp9_loop_filter_simple_mbh "uint8_t *y, int ystride, const uint8_t *blimit" -specialize vp9_loop_filter_simple_mbh mmx sse2 -vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c -vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx -vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2 - -prototype void vp9_loop_filter_simple_bv "uint8_t *y, int ystride, const uint8_t *blimit" -specialize vp9_loop_filter_simple_bv mmx sse2 -vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c -vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx -vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2 - -prototype void vp9_loop_filter_simple_bh "uint8_t *y, int ystride, const uint8_t *blimit" -specialize vp9_loop_filter_simple_bh mmx sse2 -vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c -vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx -vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2 - prototype void vp9_lpf_mbh_w "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi" specialize vp9_lpf_mbh_w sse2 @@ 
-337,41 +313,74 @@ vp9_variance4x4_mmx=vp9_variance4x4_mmx prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance64x64 sse2 +prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance64x64 + prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance32x64 +prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance32x64 + prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance64x32 +prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance64x32 + prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance32x16 +prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance32x16 + prototype unsigned int 
vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance16x32 +prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance16x32 + prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance32x32 sse2 +prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance32x32 + prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3 +prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance16x16 + prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance8x16 sse2 mmx vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt +prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance8x16 + prototype unsigned int 
vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3 vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3; vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt +prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance16x8 + prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance8x8 sse2 mmx vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt +prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance8x8 + prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse" specialize vp9_sub_pixel_variance4x4 sse2 mmx vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt +prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred" +specialize vp9_sub_pixel_avg_variance4x4 + prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad" specialize vp9_sad64x64 sse2 diff --git a/vp9/common/x86/vp9_loopfilter_intrin_mmx.c b/vp9/common/x86/vp9_loopfilter_intrin_mmx.c index 2be9e3179..7e6c4be2c 100644 --- 
a/vp9/common/x86/vp9_loopfilter_intrin_mmx.c +++ b/vp9/common/x86/vp9_loopfilter_intrin_mmx.c @@ -35,16 +35,6 @@ void vp9_loop_filter_bh_mmx(unsigned char *y_ptr, } -void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride, - y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride, - y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride, - y_stride, blimit); -} - /* Vertical B Filtering */ void vp9_loop_filter_bv_mmx(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, @@ -66,9 +56,3 @@ void vp9_loop_filter_bv_mmx(unsigned char *y_ptr, lfi->blim, lfi->lim, lfi->hev_thr, 1); } -void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit); -} diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c index 08447a62d..7982ca6a2 100644 --- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c +++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c @@ -1115,16 +1115,6 @@ void vp9_loop_filter_bh_sse2(unsigned char *y_ptr, v_ptr + 4 * uv_stride); } -void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride, - y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride, - y_stride, blimit); - vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride, - y_stride, blimit); -} - /* Vertical B Filtering */ void vp9_loop_filter_bv_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, @@ -1143,9 +1133,3 @@ void vp9_loop_filter_bv_sse2(unsigned char *y_ptr, v_ptr + 
4); } -void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride, - const unsigned char *blimit) { - vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit); - vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit); -} diff --git a/vp9/common/x86/vp9_loopfilter_mmx.asm b/vp9/common/x86/vp9_loopfilter_mmx.asm index ceffdf558..4ebb51b77 100644 --- a/vp9/common/x86/vp9_loopfilter_mmx.asm +++ b/vp9/common/x86/vp9_loopfilter_mmx.asm @@ -593,349 +593,6 @@ sym(vp9_loop_filter_vertical_edge_mmx): pop rbp ret - -;void vp9_loop_filter_simple_horizontal_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit -;) -global sym(vp9_loop_filter_simple_horizontal_edge_mmx) PRIVATE -sym(vp9_loop_filter_simple_horizontal_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
- - mov rcx, 2 ; count -.nexts8_h: - mov rdx, arg(2) ;blimit ; get blimit - movq mm3, [rdx] ; - - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - neg rax - - ; calculate mask - movq mm1, [rsi+2*rax] ; p1 - movq mm0, [rdi] ; q1 - movq mm2, mm1 - movq mm7, mm0 - movq mm4, mm0 - psubusb mm0, mm1 ; q1-=p1 - psubusb mm1, mm4 ; p1-=q1 - por mm1, mm0 ; abs(p1-q1) - pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm1, 1 ; abs(p1-q1)/2 - - movq mm5, [rsi+rax] ; p0 - movq mm4, [rsi] ; q0 - movq mm0, mm4 ; q0 - movq mm6, mm5 ; p0 - psubusb mm5, mm4 ; p0-=q0 - psubusb mm4, mm6 ; q0-=p0 - por mm5, mm4 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor mm3, mm3 - pcmpeqb mm5, mm3 - - ; start work on filters - pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb mm2, mm7 ; p1 - q1 - - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm0 ; q0 - psubsb mm0, mm6 ; q0 - p0 - paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0) - paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0) - paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0) - pand mm5, mm2 ; mask filter values we don't care about - - ; do + 4 side - paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - movq mm1, mm5 ; get a copy of filters - psraw mm1, 11 ; arithmetic shift right 11 - psllw mm1, 8 ; shift left 8 to put it back - - por mm0, mm1 ; put the two together to get result - - psubsb mm3, mm0 ; q0-= q0 add - pxor mm3, [GLOBAL(t80)] ; unoffset - movq [rsi], mm3 ; write back - - - ; now do +3 side - psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 - - movq mm0, mm5 ; get a copy of filters - 
psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - psraw mm5, 11 ; arithmetic shift right 11 - psllw mm5, 8 ; shift left 8 to put it back - por mm0, mm5 ; put the two together to get result - - - paddsb mm6, mm0 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - movq [rsi+rax], mm6 ; write back - - add rsi,8 - neg rax - dec rcx - jnz .nexts8_h - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_loop_filter_simple_vertical_edge_mmx -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit -;) -global sym(vp9_loop_filter_simple_vertical_edge_mmx) PRIVATE -sym(vp9_loop_filter_simple_vertical_edge_mmx): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
- - lea rsi, [rsi + rax*4- 2]; ; - mov rcx, 2 ; count -.nexts8_v: - - lea rdi, [rsi + rax]; - movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70 - - movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60 - punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60 - - movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50 - movd mm4, [rsi] ; xx xx xx xx 43 42 41 40 - - punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40 - movq mm5, mm4 ; 53 43 52 42 51 41 50 40 - - punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40 - punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42 - - neg rax - - movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30 - movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20 - - punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20 - movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10 - - movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00 - punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00 - - movq mm2, mm0 ; 13 03 12 02 11 01 10 00 - punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00 - - punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02 - movq mm1, mm0 ; 13 03 12 02 11 01 10 00 - - punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1 - movq mm3, mm2 ; 33 23 13 03 32 22 12 02 - - punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0 - punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0 - - punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1 - - - ; calculate mask - movq mm6, mm0 ; p1 - movq mm7, mm3 ; q1 - psubusb mm7, mm6 ; q1-=p1 - psubusb mm6, mm3 ; p1-=q1 - por mm6, mm7 ; abs(p1-q1) - pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw mm6, 1 ; abs(p1-q1)/2 - - movq mm5, mm1 ; p0 - movq mm4, mm2 ; q0 - - psubusb mm5, mm2 ; p0-=q0 - psubusb mm4, mm1 ; q0-=p0 - - por mm5, mm4 ; abs(p0 - q0) - paddusb mm5, mm5 ; abs(p0-q0)*2 - paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit ; get blimit - movq mm7, [rdx] - - psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor mm7, mm7 - pcmpeqb mm5, mm7 ; mm5 = mask - - ; start work on filters - movq t0, mm0 - movq t1, mm3 - - pxor mm0, 
[GLOBAL(t80)] ; p1 offset to convert to signed values - pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values - - psubsb mm0, mm3 ; p1 - q1 - movq mm6, mm1 ; p0 - - movq mm7, mm2 ; q0 - pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values - - pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values - movq mm3, mm7 ; offseted ; q0 - - psubsb mm7, mm6 ; q0 - p0 - paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0) - - paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0) - paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0) - - pand mm5, mm0 ; mask filter values we don't care about - - paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - - movq mm7, mm5 ; get a copy of filters - psraw mm7, 11 ; arithmetic shift right 11 - psllw mm7, 8 ; shift left 8 to put it back - - por mm0, mm7 ; put the two together to get result - - psubsb mm3, mm0 ; q0-= q0sz add - pxor mm3, [GLOBAL(t80)] ; unoffset - - ; now do +3 side - psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4 - - movq mm0, mm5 ; get a copy of filters - psllw mm0, 8 ; shift left 8 - psraw mm0, 3 ; arithmetic shift right 11 - psrlw mm0, 8 - - psraw mm5, 11 ; arithmetic shift right 11 - psllw mm5, 8 ; shift left 8 to put it back - por mm0, mm5 ; put the two together to get result - - paddsb mm6, mm0 ; p0+= p0 add - pxor mm6, [GLOBAL(t80)] ; unoffset - - - movq mm0, t0 - movq mm4, t1 - - ; mm0 = 70 60 50 40 30 20 10 00 - ; mm6 = 71 61 51 41 31 21 11 01 - ; mm3 = 72 62 52 42 32 22 12 02 - ; mm4 = 73 63 53 43 33 23 13 03 - ; transpose back to write out - - movq mm1, mm0 ; - punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00 - - punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40 - movq mm2, mm3 ; - - punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02 - movq mm5, mm1 ; 71 70 61 60 51 50 41 40 - - punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42 - movq mm6, mm0 ; 31 30 21 20 11 10 01 00 - - punpcklwd mm0, mm2 ; 13 12 11 10 03 02 
01 00 - punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20 - - movd [rsi+rax*4], mm0 ; write 03 02 01 00 - punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40 - - psrlq mm0, 32 ; xx xx xx xx 13 12 11 10 - punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60 - - movd [rdi+rax*4], mm0 ; write 13 12 11 10 - movd [rsi+rax*2], mm6 ; write 23 22 21 20 - - psrlq mm6, 32 ; 33 32 31 30 - movd [rsi], mm1 ; write 43 42 41 40 - - movd [rsi + rax], mm6 ; write 33 32 31 30 - neg rax - - movd [rsi + rax*2], mm5 ; write 63 62 61 60 - psrlq mm1, 32 ; 53 52 51 50 - - movd [rdi], mm1 ; write out 53 52 51 50 - psrlq mm5, 32 ; 73 72 71 70 - - movd [rdi + rax*2], mm5 ; write 73 72 71 70 - - lea rsi, [rsi+rax*8] ; next 8 - - dec rcx - jnz .nexts8_v - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - UNSHADOW_ARGS - pop rbp - ret - - - -;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr, -; int y_stride, -; loop_filter_info *lfi) -;{ -; -; -; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2); -;} - SECTION_RODATA align 16 tfe: diff --git a/vp9/common/x86/vp9_loopfilter_sse2.asm b/vp9/common/x86/vp9_loopfilter_sse2.asm index ae4c60f53..74236cfbb 100644 --- a/vp9/common/x86/vp9_loopfilter_sse2.asm +++ b/vp9/common/x86/vp9_loopfilter_sse2.asm @@ -845,372 +845,6 @@ sym(vp9_loop_filter_vertical_edge_uv_sse2): pop rbp ret -;void vp9_loop_filter_simple_horizontal_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -;) -global sym(vp9_loop_filter_simple_horizontal_edge_sse2) PRIVATE -sym(vp9_loop_filter_simple_horizontal_edge_sse2): - push rbp - mov rbp, rsp - SHADOW_ARGS_TO_STACK 3 - SAVE_XMM 7 - GET_GOT rbx - push rsi - push rdi - ; end prolog - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) 
;src_pixel_step ; destination pitch? - mov rdx, arg(2) ;blimit - movdqa xmm3, XMMWORD PTR [rdx] - - mov rdi, rsi ; rdi points to row +1 for indirect addressing - add rdi, rax - neg rax - - ; calculate mask - movdqa xmm1, [rsi+2*rax] ; p1 - movdqa xmm0, [rdi] ; q1 - movdqa xmm2, xmm1 - movdqa xmm7, xmm0 - movdqa xmm4, xmm0 - psubusb xmm0, xmm1 ; q1-=p1 - psubusb xmm1, xmm4 ; p1-=q1 - por xmm1, xmm0 ; abs(p1-q1) - pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw xmm1, 1 ; abs(p1-q1)/2 - - movdqa xmm5, [rsi+rax] ; p0 - movdqa xmm4, [rsi] ; q0 - movdqa xmm0, xmm4 ; q0 - movdqa xmm6, xmm5 ; p0 - psubusb xmm5, xmm4 ; p0-=q0 - psubusb xmm4, xmm6 ; q0-=p0 - por xmm5, xmm4 ; abs(p0 - q0) - paddusb xmm5, xmm5 ; abs(p0-q0)*2 - paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor xmm3, xmm3 - pcmpeqb xmm5, xmm3 - - ; start work on filters - pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values - psubsb xmm2, xmm7 ; p1 - q1 - - pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values - pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values - movdqa xmm3, xmm0 ; q0 - psubsb xmm0, xmm6 ; q0 - p0 - paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0) - paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0) - paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0) - pand xmm5, xmm2 ; mask filter values we don't care about - - ; do + 4 side - paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movdqa xmm0, xmm5 ; get a copy of filters - psllw xmm0, 8 ; shift left 8 - psraw xmm0, 3 ; arithmetic shift right 11 - psrlw xmm0, 8 - movdqa xmm1, xmm5 ; get a copy of filters - psraw xmm1, 11 ; arithmetic shift right 11 - psllw xmm1, 8 ; shift left 8 to put it back - - por xmm0, xmm1 ; put the two together to get result - - psubsb xmm3, xmm0 ; q0-= q0 add - pxor xmm3, [GLOBAL(t80)] ; unoffset - movdqa [rsi], xmm3 ; write back - - ; now do 
+3 side - psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 - - movdqa xmm0, xmm5 ; get a copy of filters - psllw xmm0, 8 ; shift left 8 - psraw xmm0, 3 ; arithmetic shift right 11 - psrlw xmm0, 8 - psraw xmm5, 11 ; arithmetic shift right 11 - psllw xmm5, 8 ; shift left 8 to put it back - por xmm0, xmm5 ; put the two together to get result - - - paddsb xmm6, xmm0 ; p0+= p0 add - pxor xmm6, [GLOBAL(t80)] ; unoffset - movdqa [rsi+rax], xmm6 ; write back - - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - - -;void vp9_loop_filter_simple_vertical_edge_sse2 -;( -; unsigned char *src_ptr, -; int src_pixel_step, -; const char *blimit, -;) -global sym(vp9_loop_filter_simple_vertical_edge_sse2) PRIVATE -sym(vp9_loop_filter_simple_vertical_edge_sse2): - push rbp ; save old base pointer value. - mov rbp, rsp ; set new base pointer value. - SHADOW_ARGS_TO_STACK 3 - SAVE_XMM 7 - GET_GOT rbx ; save callee-saved reg - push rsi - push rdi - ; end prolog - - ALIGN_STACK 16, rax - sub rsp, 32 ; reserve 32 bytes - %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16]; - %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16]; - - mov rsi, arg(0) ;src_ptr - movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch? 
- - lea rsi, [rsi - 2 ] - lea rdi, [rsi + rax] - lea rdx, [rsi + rax*4] - lea rcx, [rdx + rax] - - movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00 - movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40 - movd xmm2, [rdi] ; 13 12 11 10 - movd xmm3, [rcx] ; 53 52 51 50 - punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00 - punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10 - - movd xmm4, [rsi + rax*2] ; 23 22 21 20 - movd xmm5, [rdx + rax*2] ; 63 62 61 60 - movd xmm6, [rdi + rax*2] ; 33 32 31 30 - movd xmm7, [rcx + rax*2] ; 73 72 71 70 - punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20 - punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30 - - punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00 - punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20 - - movdqa xmm1, xmm0 - punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00 - punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40 - - movdqa xmm2, xmm0 - punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 - punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - - movdqa t0, xmm0 ; save to t0 - movdqa t1, xmm2 ; save to t1 - - lea rsi, [rsi + rax*8] - lea rdi, [rsi + rax] - lea rdx, [rsi + rax*4] - lea rcx, [rdx + rax] - - movd xmm4, [rsi] ; 83 82 81 80 - movd xmm1, [rdx] ; c3 c2 c1 c0 - movd xmm6, [rdi] ; 93 92 91 90 - movd xmm3, [rcx] ; d3 d2 d1 d0 - punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80 - punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90 - - movd xmm0, [rsi + rax*2] ; a3 a2 a1 a0 - movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0 - movd xmm2, [rdi + rax*2] ; b3 b2 b1 b0 - movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0 - punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0 - punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0 - - punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80 - punpcklbw xmm0, xmm2 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0 - - movdqa xmm1, xmm4 - punpcklwd xmm4, xmm0 ; b3 a3 
93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80 - punpckhwd xmm1, xmm0 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0 - - movdqa xmm6, xmm4 - punpckldq xmm4, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80 - punpckhdq xmm6, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82 - - movdqa xmm0, t0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00 - movdqa xmm2, t1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02 - movdqa xmm1, xmm0 - movdqa xmm3, xmm2 - - punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 - punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - - ; calculate mask - movdqa xmm6, xmm0 ; p1 - movdqa xmm7, xmm3 ; q1 - psubusb xmm7, xmm0 ; q1-=p1 - psubusb xmm6, xmm3 ; p1-=q1 - por xmm6, xmm7 ; abs(p1-q1) - pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero - psrlw xmm6, 1 ; abs(p1-q1)/2 - - movdqa xmm5, xmm1 ; p0 - movdqa xmm4, xmm2 ; q0 - psubusb xmm5, xmm2 ; p0-=q0 - psubusb xmm4, xmm1 ; q0-=p0 - por xmm5, xmm4 ; abs(p0 - q0) - paddusb xmm5, xmm5 ; abs(p0-q0)*2 - paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2 - - mov rdx, arg(2) ;blimit - movdqa xmm7, XMMWORD PTR [rdx] - - psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit - pxor xmm7, xmm7 - pcmpeqb xmm5, xmm7 ; mm5 = mask - - ; start work on filters - movdqa t0, xmm0 - movdqa t1, xmm3 - - pxor xmm0, [GLOBAL(t80)] ; p1 offset to convert to signed values - pxor xmm3, [GLOBAL(t80)] ; q1 offset to convert to signed values - - psubsb xmm0, xmm3 ; p1 - q1 - movdqa xmm6, xmm1 ; p0 - - movdqa xmm7, xmm2 ; q0 - pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values - - pxor xmm7, [GLOBAL(t80)] ; offset to convert to signed values - movdqa xmm3, xmm7 ; offseted ; q0 - - psubsb xmm7, xmm6 ; q0 - p0 - paddsb xmm0, xmm7 ; p1 - q1 + 1 * (q0 - p0) - - paddsb xmm0, xmm7 ; p1 - q1 + 2 * 
(q0 - p0) - paddsb xmm0, xmm7 ; p1 - q1 + 3 * (q0 - p0) - - pand xmm5, xmm0 ; mask filter values we don't care about - - - paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4 - - movdqa xmm0, xmm5 ; get a copy of filters - psllw xmm0, 8 ; shift left 8 - - psraw xmm0, 3 ; arithmetic shift right 11 - psrlw xmm0, 8 - - movdqa xmm7, xmm5 ; get a copy of filters - psraw xmm7, 11 ; arithmetic shift right 11 - - psllw xmm7, 8 ; shift left 8 to put it back - por xmm0, xmm7 ; put the two together to get result - - psubsb xmm3, xmm0 ; q0-= q0sz add - pxor xmm3, [GLOBAL(t80)] ; unoffset q0 - - ; now do +3 side - psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4 - movdqa xmm0, xmm5 ; get a copy of filters - - psllw xmm0, 8 ; shift left 8 - psraw xmm0, 3 ; arithmetic shift right 11 - - psrlw xmm0, 8 - psraw xmm5, 11 ; arithmetic shift right 11 - - psllw xmm5, 8 ; shift left 8 to put it back - por xmm0, xmm5 ; put the two together to get result - - paddsb xmm6, xmm0 ; p0+= p0 add - pxor xmm6, [GLOBAL(t80)] ; unoffset p0 - - movdqa xmm0, t0 ; p1 - movdqa xmm4, t1 ; q1 - - ; transpose back to write out - ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00 - ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01 - ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02 - ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03 - movdqa xmm1, xmm0 - punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00 - punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80 - - movdqa xmm5, xmm3 - punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02 - punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82 - - movdqa xmm2, xmm0 - punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00 - punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40 - - movdqa xmm3, xmm1 - punpcklwd xmm1, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80 - punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0 - - 
; write out order: xmm0 xmm2 xmm1 xmm3 - lea rdx, [rsi + rax*4] - - movd [rsi], xmm1 ; write the second 8-line result - psrldq xmm1, 4 - movd [rdi], xmm1 - psrldq xmm1, 4 - movd [rsi + rax*2], xmm1 - psrldq xmm1, 4 - movd [rdi + rax*2], xmm1 - - movd [rdx], xmm3 - psrldq xmm3, 4 - movd [rcx], xmm3 - psrldq xmm3, 4 - movd [rdx + rax*2], xmm3 - psrldq xmm3, 4 - movd [rcx + rax*2], xmm3 - - neg rax - lea rsi, [rsi + rax*8] - neg rax - lea rdi, [rsi + rax] - lea rdx, [rsi + rax*4] - lea rcx, [rdx + rax] - - movd [rsi], xmm0 ; write the first 8-line result - psrldq xmm0, 4 - movd [rdi], xmm0 - psrldq xmm0, 4 - movd [rsi + rax*2], xmm0 - psrldq xmm0, 4 - movd [rdi + rax*2], xmm0 - - movd [rdx], xmm2 - psrldq xmm2, 4 - movd [rcx], xmm2 - psrldq xmm2, 4 - movd [rdx + rax*2], xmm2 - psrldq xmm2, 4 - movd [rcx + rax*2], xmm2 - - add rsp, 32 - pop rsp - ; begin epilog - pop rdi - pop rsi - RESTORE_GOT - RESTORE_XMM - UNSHADOW_ARGS - pop rbp - ret - SECTION_RODATA align 16 tfe: diff --git a/vp9/common/x86/vp9_loopfilter_x86.h b/vp9/common/x86/vp9_loopfilter_x86.h index 46a6202d2..fb5af05f7 100644 --- a/vp9/common/x86/vp9_loopfilter_x86.h +++ b/vp9/common/x86/vp9_loopfilter_x86.h @@ -23,10 +23,6 @@ extern prototype_loopfilter_block(vp9_loop_filter_mbv_mmx); extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx); extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx); extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx); -extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_mmx); -extern prototype_simple_loopfilter(vp9_loop_filter_bvs_mmx); -extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_mmx); -extern prototype_simple_loopfilter(vp9_loop_filter_bhs_mmx); #endif #if HAVE_SSE2 @@ -34,10 +30,6 @@ extern prototype_loopfilter_block(vp9_loop_filter_mbv_sse2); extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2); extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2); extern 
prototype_loopfilter_block(vp9_loop_filter_bh_sse2); -extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_sse2); -extern prototype_simple_loopfilter(vp9_loop_filter_bvs_sse2); -extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_sse2); -extern prototype_simple_loopfilter(vp9_loop_filter_bhs_sse2); #endif #endif // LOOPFILTER_X86_H diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c index 4be36774a..70db06dc1 100644 --- a/vp9/decoder/vp9_decodframe.c +++ b/vp9/decoder/vp9_decodframe.c @@ -811,12 +811,12 @@ static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) { int mi_row, mi_col; for (mi_row = pc->cur_tile_mi_row_start; - mi_row < pc->cur_tile_mi_row_end; mi_row += 8) { + mi_row < pc->cur_tile_mi_row_end; mi_row += 64 / MI_SIZE) { // For a SB there are 2 left contexts, each pertaining to a MB row within vpx_memset(&pc->left_context, 0, sizeof(pc->left_context)); vpx_memset(pc->left_seg_context, 0, sizeof(pc->left_seg_context)); for (mi_col = pc->cur_tile_mi_col_start; - mi_col < pc->cur_tile_mi_col_end; mi_col += 8) + mi_col < pc->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE) decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_SIZE_SB64X64); } } diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c index ce2a86b4f..1d7e093cf 100644 --- a/vp9/decoder/vp9_detokenize.c +++ b/vp9/decoder/vp9_detokenize.c @@ -62,7 +62,7 @@ DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]); #define INCREMENT_COUNT(token) \ do { \ - coef_counts[type][ref][get_coef_band(scan, txfm_size, c)] \ + coef_counts[type][ref][band] \ [pt][token]++; \ token_cache[scan[c]] = token; \ } while (0) @@ -96,6 +96,7 @@ static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd, ENTROPY_CONTEXT above_ec, left_ec; FRAME_CONTEXT *const fc = &dx->common.fc; int pt, c = 0, pad, default_eob; + int band; vp9_coeff_probs *coef_probs; vp9_prob *prob; vp9_coeff_count *coef_counts; @@ -162,7 +163,6 @@ static int 
decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd, while (1) { int val; - int band; const uint8_t *cat6 = cat6_prob; if (c >= seg_eob) break; @@ -249,8 +249,7 @@ SKIP_START: } if (c < seg_eob) - coef_counts[type][ref][get_coef_band(scan, txfm_size, c)] - [pt][DCT_EOB_TOKEN]++; + coef_counts[type][ref][band][pt][DCT_EOB_TOKEN]++; for (pt = 0; pt < (1 << txfm_size); pt++) { A[pt] = L[pt] = c > 0; diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h index 6bc42c7ff..44261481c 100644 --- a/vp9/encoder/vp9_block.h +++ b/vp9/encoder/vp9_block.h @@ -138,8 +138,8 @@ struct macroblock { int optimize; - // Structure to hold context for each of the 4 MBs within a SB: - // when encoded as 4 independent MBs: + // TODO(jingning): Need to refactor the structure arrays that buffers the + // coding mode decisions of each partition type. PICK_MODE_CONTEXT sb8_context[4][4][4]; PICK_MODE_CONTEXT sb8x16_context[4][4][2]; PICK_MODE_CONTEXT sb16x8_context[4][4][2]; @@ -153,6 +153,10 @@ struct macroblock { PICK_MODE_CONTEXT sb64_context; int partition_cost[NUM_PARTITION_CONTEXTS][PARTITION_TYPES]; + BLOCK_SIZE_TYPE mb_partitioning[4][4]; + BLOCK_SIZE_TYPE sb_partitioning[4]; + BLOCK_SIZE_TYPE sb64_partitioning; + void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch); void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch); void (*fwd_txm8x8)(int16_t *input, int16_t *output, int pitch); diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c index 418f60edc..3345f8965 100644 --- a/vp9/encoder/vp9_encodeframe.c +++ b/vp9/encoder/vp9_encodeframe.c @@ -731,6 +731,9 @@ static void set_block_index(MACROBLOCKD *xd, int idx, } } +// TODO(jingning): the variables used here are little complicated. need further +// refactoring on organizing the the temporary buffers, when recursive +// partition down to 4x4 block size is enabled. 
static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, BLOCK_SIZE_TYPE bsize) { MACROBLOCKD *const xd = &x->e_mbd; @@ -762,6 +765,72 @@ static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x, } } +static int *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) { + switch (subsize) { + case BLOCK_SIZE_SB64X32: + case BLOCK_SIZE_SB32X64: + case BLOCK_SIZE_SB32X32: + return &xd->sb_index; + case BLOCK_SIZE_SB32X16: + case BLOCK_SIZE_SB16X32: + case BLOCK_SIZE_MB16X16: + return &xd->mb_index; + case BLOCK_SIZE_SB16X8: + case BLOCK_SIZE_SB8X16: + case BLOCK_SIZE_SB8X8: + return &xd->b_index; + default: + assert(0); + return NULL; + } +} + +static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x, + BLOCK_SIZE_TYPE bsize) { + MACROBLOCKD *xd = &x->e_mbd; + switch (bsize) { + case BLOCK_SIZE_SB64X64: + return &x->sb64_partitioning; + case BLOCK_SIZE_SB32X32: + return &x->sb_partitioning[xd->sb_index]; + case BLOCK_SIZE_MB16X16: + return &x->mb_partitioning[xd->sb_index][xd->mb_index]; + default: + assert(0); + return NULL; + } +} + +static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col, + ENTROPY_CONTEXT a[16 * MAX_MB_PLANE], + ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], + PARTITION_CONTEXT sa[8], + PARTITION_CONTEXT sl[8], + BLOCK_SIZE_TYPE bsize) { + VP9_COMMON *const cm = &cpi->common; + MACROBLOCK *const x = &cpi->mb; + MACROBLOCKD *const xd = &x->e_mbd; + int p; + int bwl = b_width_log2(bsize), bw = 1 << bwl; + int bhl = b_height_log2(bsize), bh = 1 << bhl; + int mwl = mi_width_log2(bsize), mw = 1 << mwl; + int mhl = mi_height_log2(bsize), mh = 1 << mhl; + for (p = 0; p < MAX_MB_PLANE; p++) { + vpx_memcpy(cm->above_context[p] + + ((mi_col * 2) >> xd->plane[p].subsampling_x), + a + bw * p, + sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x); + vpx_memcpy(cm->left_context[p] + + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), + l + bh * p, + sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y); + } + 
vpx_memcpy(cm->above_seg_context + mi_col, sa, + sizeof(PARTITION_CONTEXT) * mw); + vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl, + sizeof(PARTITION_CONTEXT) * mh); +} + static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, int output_enabled, BLOCK_SIZE_TYPE bsize, int sub_index) { @@ -788,27 +857,28 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp, static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int mi_row, int mi_col, int output_enabled, - BLOCK_SIZE_TYPE level, - BLOCK_SIZE_TYPE c1, BLOCK_SIZE_TYPE c2[4], - BLOCK_SIZE_TYPE c3[4][4] - ) { + BLOCK_SIZE_TYPE bsize) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - const int bsl = mi_width_log2(level), bs = 1 << (bsl - 1); - const int bwl = mi_width_log2(c1), bhl = mi_height_log2(c1); + BLOCK_SIZE_TYPE c1 = BLOCK_SIZE_SB8X8; + const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1); + int bwl, bhl; int UNINITIALIZED_IS_SAFE(pl); if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) return; - if (level > BLOCK_SIZE_SB8X8) { + if (bsize > BLOCK_SIZE_SB8X8) { set_partition_seg_context(cpi, mi_row, mi_col); - pl = partition_plane_context(xd, level); + pl = partition_plane_context(xd, bsize); + c1 = *(get_sb_partitioning(x, bsize)); } + bwl = mi_width_log2(c1), bhl = mi_height_log2(c1); + if (bsl == bwl && bsl == bhl) { - if (output_enabled && level > BLOCK_SIZE_SB8X8) + if (output_enabled && bsize > BLOCK_SIZE_SB8X8) cpi->partition_count[pl][PARTITION_NONE]++; encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1); } else if (bsl == bhl && bsl > bwl) { @@ -826,12 +896,12 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, int i; assert(bwl < bsl && bhl < bsl); - if (level == BLOCK_SIZE_SB64X64) { + if (bsize == BLOCK_SIZE_SB64X64) { subsize = BLOCK_SIZE_SB32X32; - } else if (level == BLOCK_SIZE_SB32X32) { + } else if (bsize == BLOCK_SIZE_SB32X32) { subsize = BLOCK_SIZE_MB16X16; } else { - assert(level == 
BLOCK_SIZE_MB16X16); + assert(bsize == BLOCK_SIZE_MB16X16); subsize = BLOCK_SIZE_SB8X8; } @@ -843,554 +913,200 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp, set_block_index(xd, i, subsize); encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs, - output_enabled, subsize, - c2 ? c2[i] : c1, c3 ? c3[i] : NULL, NULL); + output_enabled, subsize); } } - if (level > BLOCK_SIZE_SB8X8 && - (level == BLOCK_SIZE_MB16X16 || bsl == bwl || bsl == bhl)) { + if (bsize > BLOCK_SIZE_SB8X8 && + (bsize == BLOCK_SIZE_MB16X16 || bsl == bwl || bsl == bhl)) { set_partition_seg_context(cpi, mi_row, mi_col); - update_partition_context(xd, c1, level); + update_partition_context(xd, c1, bsize); } } -static void encode_sb_row(VP9_COMP *cpi, - int mi_row, - TOKENEXTRA **tp, - int *totalrate) { + +// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are +// unlikely to be selected depending on previously rate-distortion optimization +// results, for encoding speed-up. +static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp, + int mi_row, int mi_col, + BLOCK_SIZE_TYPE bsize, + int *rate, int *dist) { VP9_COMMON *const cm = &cpi->common; MACROBLOCK *const x = &cpi->mb; MACROBLOCKD *const xd = &x->e_mbd; - int mi_col, pl; - - // Initialize the left context for the new SB row - vpx_memset(&cm->left_context, 0, sizeof(cm->left_context)); - vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context)); - - // Code each SB in the row - for (mi_col = cm->cur_tile_mi_col_start; - mi_col < cm->cur_tile_mi_col_end; mi_col += 8) { - int i, p; - BLOCK_SIZE_TYPE mb_partitioning[4][4]; - BLOCK_SIZE_TYPE sb_partitioning[4]; - BLOCK_SIZE_TYPE sb64_partitioning = BLOCK_SIZE_SB32X32; - int sb64_rate = 0, sb64_dist = 0; - int sb64_skip = 0; - ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; - PARTITION_CONTEXT seg_l[64 / MI_SIZE], seg_a[64 / MI_SIZE]; - TOKENEXTRA *tp_orig = *tp; - - for (p = 0; p < MAX_MB_PLANE; p++) { - memcpy(a + 16 * p, 
cm->above_context[p] + - (mi_col * 2 >> xd->plane[p].subsampling_x), - sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x); - memcpy(l + 16 * p, cm->left_context[p], - sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y); - } - vpx_memcpy(&seg_a, cm->above_seg_context + mi_col, sizeof(seg_a)); - vpx_memcpy(&seg_l, cm->left_seg_context, sizeof(seg_l)); + int bsl = b_width_log2(bsize), bs = 1 << bsl; + int msl = mi_height_log2(bsize), ms = 1 << msl; + ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE]; + PARTITION_CONTEXT sl[8], sa[8]; + TOKENEXTRA *tp_orig = *tp; + int i, p, pl; + BLOCK_SIZE_TYPE subsize; + int srate = INT_MAX, sdist = INT_MAX; + + assert(mi_height_log2(bsize) == mi_width_log2(bsize)); + + // buffer the above/left context information of the block in search. + for (p = 0; p < MAX_MB_PLANE; ++p) { + vpx_memcpy(a + bs * p, cm->above_context[p] + + (mi_col * 2 >> xd->plane[p].subsampling_x), + sizeof(ENTROPY_CONTEXT) * bs >> xd->plane[p].subsampling_x); + vpx_memcpy(l + bs * p, cm->left_context[p] + + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y), + sizeof(ENTROPY_CONTEXT) * bs >> xd->plane[p].subsampling_y); + } + vpx_memcpy(sa, cm->above_seg_context + mi_col, + sizeof(PARTITION_CONTEXT) * ms); + vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK), + sizeof(PARTITION_CONTEXT) * ms); + + // PARTITION_SPLIT + if (bsize >= BLOCK_SIZE_MB16X16) { + int r4 = 0, d4 = 0; + subsize = get_subsize(bsize, PARTITION_SPLIT); + *(get_sb_partitioning(x, bsize)) = subsize; + + for (i = 0; i < 4; ++i) { + int x_idx = (i & 1) * (ms >> 1); + int y_idx = (i >> 1) * (ms >> 1); + int r, d; - // FIXME(rbultje): this function should probably be rewritten to be - // recursive at some point in the future. 
- for (i = 0; i < 4; i++) { - const int x_idx = (i & 1) << 2; - const int y_idx = (i & 2) << 1; - int sb32_rate = 0, sb32_dist = 0; - int splitmodes_used = 0; - int sb32_skip = 0; - int j; - ENTROPY_CONTEXT l2[8 * MAX_MB_PLANE], a2[8 * MAX_MB_PLANE]; - PARTITION_CONTEXT sl32[32 / MI_SIZE], sa32[32 / MI_SIZE]; - - sb_partitioning[i] = BLOCK_SIZE_MB16X16; - if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols) + if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols)) continue; - xd->sb_index = i; - - /* Function should not modify L & A contexts; save and restore on exit */ - for (p = 0; p < MAX_MB_PLANE; p++) { - vpx_memcpy(l2 + 8 * p, - cm->left_context[p] + - (y_idx * 2 >> xd->plane[p].subsampling_y), - sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y); - vpx_memcpy(a2 + 8 * p, - cm->above_context[p] + - ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x), - sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x); - } - vpx_memcpy(&sa32, cm->above_seg_context + mi_col + x_idx, sizeof(sa32)); - vpx_memcpy(&sl32, cm->left_seg_context + y_idx, sizeof(sl32)); - - /* Encode MBs in raster order within the SB */ - for (j = 0; j < 4; j++) { - const int x_idx_m = x_idx + ((j & 1) << 1); - const int y_idx_m = y_idx + ((j >> 1) << 1); - int r, d; - int r2, d2, mb16_rate = 0, mb16_dist = 0, k; - ENTROPY_CONTEXT l3[4 * MAX_MB_PLANE], a3[4 * MAX_MB_PLANE]; - PARTITION_CONTEXT sl16[16 / MI_SIZE], sa16[16 / MI_SIZE]; - - mb_partitioning[i][j] = BLOCK_SIZE_SB8X8; - - if (mi_row + y_idx_m >= cm->mi_rows || - mi_col + x_idx_m >= cm->mi_cols) { - // MB lies outside frame, move on - continue; - } - - // Index of the MB in the SB 0..3 - xd->mb_index = j; - - for (p = 0; p < MAX_MB_PLANE; p++) { - vpx_memcpy(l3 + 4 * p, - cm->left_context[p] + - (y_idx_m * 2 >> xd->plane[p].subsampling_y), - sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y); - vpx_memcpy(a3 + 4 * p, - cm->above_context[p] + - ((mi_col + x_idx_m) * 2 >> 
xd->plane[p].subsampling_x), - sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x); - } - vpx_memcpy(&sa16, cm->above_seg_context + mi_col + x_idx_m, - sizeof(sa16)); - vpx_memcpy(&sl16, cm->left_seg_context + y_idx_m, sizeof(sl16)); - - for (k = 0; k < 4; k++) { - xd->b_index = k; - - // try 8x8 coding - pick_sb_modes(cpi, mi_row + y_idx_m + (k >> 1), - mi_col + x_idx_m + (k & 1), - tp, &r, &d, BLOCK_SIZE_SB8X8, - &x->sb8_context[xd->sb_index][xd->mb_index] - [xd->b_index]); - mb16_rate += r; - mb16_dist += d; - update_state(cpi, &x->sb8_context[xd->sb_index][xd->mb_index] - [xd->b_index], - BLOCK_SIZE_SB8X8, 0); - encode_superblock(cpi, tp, - 0, mi_row + y_idx_m + (k >> 1), - mi_col + x_idx_m + (k & 1), - BLOCK_SIZE_SB8X8); - } - set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m); - pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16); - mb16_rate += x->partition_cost[pl][PARTITION_SPLIT]; - for (p = 0; p < MAX_MB_PLANE; p++) { - vpx_memcpy(cm->left_context[p] + - (y_idx_m * 2 >> xd->plane[p].subsampling_y), - l3 + 4 * p, - sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y); - vpx_memcpy(cm->above_context[p] + - ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x), - a3 + 4 * p, - sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x); - } - vpx_memcpy(cm->above_seg_context + mi_col + x_idx_m, - sa16, sizeof(sa16)); - vpx_memcpy(cm->left_seg_context + y_idx_m, sl16, sizeof(sl16)); - - // try 8x16 coding - r2 = 0; - d2 = 0; - xd->b_index = 0; - pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m, - tp, &r, &d, BLOCK_SIZE_SB8X16, - &x->sb8x16_context[xd->sb_index][xd->mb_index] - [xd->b_index]); - r2 += r; - d2 += d; - update_state(cpi, &x->sb8x16_context[xd->sb_index][xd->mb_index] - [xd->b_index], - BLOCK_SIZE_SB8X16, 0); - encode_superblock(cpi, tp, - 0, mi_row + y_idx_m, mi_col + x_idx_m, - BLOCK_SIZE_SB8X16); - xd->b_index = 1; - pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m + 1, - tp, &r, &d, BLOCK_SIZE_SB8X16, - 
&x->sb8x16_context[xd->sb_index][xd->mb_index] - [xd->b_index]); - r2 += r; - d2 += d; - set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m); - pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16); - r2 += x->partition_cost[pl][PARTITION_VERT]; - if (RDCOST(x->rdmult, x->rddiv, r2, d2) < - RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) { - mb16_rate = r2; - mb16_dist = d2; - mb_partitioning[i][j] = BLOCK_SIZE_SB8X16; - } - for (p = 0; p < MAX_MB_PLANE; p++) { - vpx_memcpy(cm->left_context[p] + - (y_idx_m * 2 >> xd->plane[p].subsampling_y), - l3 + 4 * p, - sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y); - vpx_memcpy(cm->above_context[p] + - ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x), - a3 + 4 * p, - sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x); - } - - // try 16x8 coding - r2 = 0; - d2 = 0; - xd->b_index = 0; - pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m, - tp, &r, &d, BLOCK_SIZE_SB16X8, - &x->sb16x8_context[xd->sb_index][xd->mb_index] - [xd->b_index]); - r2 += r; - d2 += d; - update_state(cpi, &x->sb16x8_context[xd->sb_index][xd->mb_index] - [xd->b_index], - BLOCK_SIZE_SB16X8, 0); - encode_superblock(cpi, tp, - 0, mi_row + y_idx_m, mi_col + x_idx_m, - BLOCK_SIZE_SB16X8); - xd->b_index = 1; - pick_sb_modes(cpi, mi_row + y_idx_m + 1, mi_col + x_idx_m, - tp, &r, &d, BLOCK_SIZE_SB16X8, - &x->sb16x8_context[xd->sb_index][xd->mb_index] - [xd->b_index]); - r2 += r; - d2 += d; - set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m); - pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16); - r2 += x->partition_cost[pl][PARTITION_HORZ]; - if (RDCOST(x->rdmult, x->rddiv, r2, d2) < - RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) { - mb16_rate = r2; - mb16_dist = d2; - mb_partitioning[i][j] = BLOCK_SIZE_SB16X8; - } - for (p = 0; p < MAX_MB_PLANE; p++) { - vpx_memcpy(cm->left_context[p] + - (y_idx_m * 2 >> xd->plane[p].subsampling_y), - l3 + 4 * p, - sizeof(ENTROPY_CONTEXT) * 4 >> 
xd->plane[p].subsampling_y); - vpx_memcpy(cm->above_context[p] + - ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x), - a3 + 4 * p, - sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x); - } - - // try as 16x16 - pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m, - tp, &r, &d, BLOCK_SIZE_MB16X16, - &x->mb_context[xd->sb_index][xd->mb_index]); - set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m); - pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16); - r += x->partition_cost[pl][PARTITION_NONE]; - if (RDCOST(x->rdmult, x->rddiv, r, d) < - RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) { - mb16_rate = r; - mb16_dist = d; - mb_partitioning[i][j] = BLOCK_SIZE_MB16X16; - } - sb32_rate += mb16_rate; - sb32_dist += mb16_dist; - - // Dummy encode, do not do the tokenization - encode_sb(cpi, tp, mi_row + y_idx_m, mi_col + x_idx_m, 0, - BLOCK_SIZE_MB16X16, mb_partitioning[i][j], NULL, NULL); - } - - /* Restore L & A coding context to those in place on entry */ - for (p = 0; p < MAX_MB_PLANE; p++) { - vpx_memcpy(cm->left_context[p] + - (y_idx * 2 >> xd->plane[p].subsampling_y), - l2 + 8 * p, - sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y); - vpx_memcpy(cm->above_context[p] + - ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x), - a2 + 8 * p, - sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x); - } - // restore partition information context - vpx_memcpy(cm->above_seg_context + mi_col + x_idx, sa32, sizeof(sa32)); - vpx_memcpy(cm->left_seg_context + y_idx, sl32, sizeof(sl32)); - - set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx); - pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32); - sb32_rate += x->partition_cost[pl][PARTITION_SPLIT]; - - if (cpi->sf.splitmode_breakout) { - sb32_skip = splitmodes_used; - sb64_skip += splitmodes_used; - } - - // check 32x16 - if (mi_col + x_idx + 4 <= cm->mi_cols) { - int r, d; - - xd->mb_index = 0; - pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx, - tp, &r, &d, 
BLOCK_SIZE_SB32X16, - &x->sb32x16_context[xd->sb_index][xd->mb_index]); - if (mi_row + y_idx + 2 < cm->mi_rows) { - int r2, d2; - - update_state(cpi, &x->sb32x16_context[xd->sb_index][xd->mb_index], - BLOCK_SIZE_SB32X16, 0); - encode_superblock(cpi, tp, - 0, mi_row + y_idx, mi_col + x_idx, - BLOCK_SIZE_SB32X16); - xd->mb_index = 1; - pick_sb_modes(cpi, mi_row + y_idx + 2, - mi_col + x_idx, tp, &r2, &d2, BLOCK_SIZE_SB32X16, - &x->sb32x16_context[xd->sb_index][xd->mb_index]); - r += r2; - d += d2; - } - - set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx); - pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32); - r += x->partition_cost[pl][PARTITION_HORZ]; - - /* is this better than MB coding? */ - if (RDCOST(x->rdmult, x->rddiv, r, d) < - RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) { - sb32_rate = r; - sb32_dist = d; - sb_partitioning[i] = BLOCK_SIZE_SB32X16; - } - - for (p = 0; p < MAX_MB_PLANE; p++) { - vpx_memcpy(cm->left_context[p] + - (y_idx * 2 >> xd->plane[p].subsampling_y), - l2 + 8 * p, - sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y); - vpx_memcpy(cm->above_context[p] + - ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x), - a2 + 8 * p, - sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x); - } - } - - // check 16x32 - if (mi_row + y_idx + 4 <= cm->mi_rows) { - int r, d; - - xd->mb_index = 0; - pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx, - tp, &r, &d, BLOCK_SIZE_SB16X32, - &x->sb16x32_context[xd->sb_index][xd->mb_index]); - if (mi_col + x_idx + 2 < cm->mi_cols) { - int r2, d2; - - update_state(cpi, &x->sb16x32_context[xd->sb_index][xd->mb_index], - BLOCK_SIZE_SB16X32, 0); - encode_superblock(cpi, tp, - 0, mi_row + y_idx, mi_col + x_idx, - BLOCK_SIZE_SB16X32); - xd->mb_index = 1; - pick_sb_modes(cpi, mi_row + y_idx, - mi_col + x_idx + 2, - tp, &r2, &d2, BLOCK_SIZE_SB16X32, - &x->sb16x32_context[xd->sb_index][xd->mb_index]); - r += r2; - d += d2; - } - - set_partition_seg_context(cpi, mi_row + y_idx, mi_col + 
x_idx); - pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32); - r += x->partition_cost[pl][PARTITION_VERT]; - - /* is this better than MB coding? */ - if (RDCOST(x->rdmult, x->rddiv, r, d) < - RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) { - sb32_rate = r; - sb32_dist = d; - sb_partitioning[i] = BLOCK_SIZE_SB16X32; - } - - for (p = 0; p < MAX_MB_PLANE; p++) { - vpx_memcpy(cm->left_context[p] + - (y_idx * 2 >> xd->plane[p].subsampling_y), - l2 + 8 * p, - sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y); - vpx_memcpy(cm->above_context[p] + - ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x), - a2 + 8 * p, - sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x); - } - } - - if (!sb32_skip && - mi_col + x_idx + 4 <= cm->mi_cols && - mi_row + y_idx + 4 <= cm->mi_rows) { - int r, d; - - /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */ - pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx, - tp, &r, &d, BLOCK_SIZE_SB32X32, - &x->sb32_context[xd->sb_index]); - - set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx); - pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32); - r += x->partition_cost[pl][PARTITION_NONE]; - - if (RDCOST(x->rdmult, x->rddiv, r, d) < - RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) { - sb32_rate = r; - sb32_dist = d; - sb_partitioning[i] = BLOCK_SIZE_SB32X32; - } - } - - // If we used 16x16 instead of 32x32 then skip 64x64 (if enabled). - if (cpi->sf.mb16_breakout && sb_partitioning[i] != BLOCK_SIZE_SB32X32) { - ++sb64_skip; - } - - sb64_rate += sb32_rate; - sb64_dist += sb32_dist; - - /* Encode SB using best computed mode(s) */ - // FIXME(rbultje): there really shouldn't be any need to encode_mb/sb - // for each level that we go up, we can just keep tokens and recon - // pixels of the lower level; also, inverting SB/MB order (big->small - // instead of small->big) means we can use as threshold for small, which - // may enable breakouts if RD is not good enough (i.e. 
faster) - encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0, - BLOCK_SIZE_SB32X32, sb_partitioning[i], mb_partitioning[i], - NULL); + *(get_sb_index(xd, subsize)) = i; + rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize, + &r, &d); + r4 += r; + d4 += d; } - - for (p = 0; p < MAX_MB_PLANE; p++) { - memcpy(cm->above_context[p] + - (mi_col * 2 >> xd->plane[p].subsampling_x), - a + 16 * p, - sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x); - memcpy(cm->left_context[p], l + 16 * p, - sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y); - } - memcpy(cm->above_seg_context + mi_col, &seg_a, sizeof(seg_a)); - memcpy(cm->left_seg_context, &seg_l, sizeof(seg_l)); - set_partition_seg_context(cpi, mi_row, mi_col); - pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64); - sb64_rate += x->partition_cost[pl][PARTITION_SPLIT]; - - // check 64x32 - if (mi_col + 8 <= cm->mi_cols && !(cm->mb_rows & 1)) { - int r, d; - - xd->sb_index = 0; - pick_sb_modes(cpi, mi_row, mi_col, - tp, &r, &d, BLOCK_SIZE_SB64X32, - &x->sb64x32_context[xd->sb_index]); - if (mi_row + 4 != cm->mi_rows) { - int r2, d2; - - update_state(cpi, &x->sb64x32_context[xd->sb_index], - BLOCK_SIZE_SB64X32, 0); - encode_superblock(cpi, tp, - 0, mi_row, mi_col, BLOCK_SIZE_SB64X32); - xd->sb_index = 1; - pick_sb_modes(cpi, mi_row + 4, mi_col, - tp, &r2, &d2, BLOCK_SIZE_SB64X32, - &x->sb64x32_context[xd->sb_index]); - r += r2; - d += d2; - } + pl = partition_plane_context(xd, bsize); + r4 += x->partition_cost[pl][PARTITION_SPLIT]; - set_partition_seg_context(cpi, mi_row, mi_col); - pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64); - r += x->partition_cost[pl][PARTITION_HORZ]; - - /* is this better than MB coding? 
*/ - if (RDCOST(x->rdmult, x->rddiv, r, d) < - RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) { - sb64_rate = r; - sb64_dist = d; - sb64_partitioning = BLOCK_SIZE_SB64X32; - } + srate = r4; + sdist = d4; + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + } - for (p = 0; p < MAX_MB_PLANE; p++) { - memcpy(cm->above_context[p] + - (mi_col * 2 >> xd->plane[p].subsampling_x), - a + 16 * p, - sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x); - memcpy(cm->left_context[p], l + 16 * p, - sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y); - } + // PARTITION_HORZ + if ((mi_col + ms <= cm->mi_cols) && (mi_row + (ms >> 1) <= cm->mi_rows) && + (bsize >= BLOCK_SIZE_MB16X16)) { + int r2, d2; + int mb_skip = 0; + subsize = get_subsize(bsize, PARTITION_HORZ); + *(get_sb_index(xd, subsize)) = 0; + pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize, + get_block_context(x, subsize)); + + if (mi_row + ms <= cm->mi_rows) { + int r, d; + update_state(cpi, get_block_context(x, subsize), subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); + *(get_sb_index(xd, subsize)) = 1; + pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &r, &d, subsize, + get_block_context(x, subsize)); + r2 += r; + d2 += d; + } else { + if (mi_row + (ms >> 1) != cm->mi_rows) + mb_skip = 1; } + set_partition_seg_context(cpi, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + r2 += x->partition_cost[pl][PARTITION_HORZ]; + + if ((RDCOST(x->rdmult, x->rddiv, r2, d2) < + RDCOST(x->rdmult, x->rddiv, srate, sdist)) && !mb_skip) { + srate = r2; + sdist = d2; + *(get_sb_partitioning(x, bsize)) = subsize; + } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + } - // check 32x64 - if (mi_row + 8 <= cm->mi_rows && !(cm->mb_cols & 1)) { + // PARTITION_VERT + if ((mi_row + ms <= cm->mi_rows) && (mi_col + (ms >> 1) <= cm->mi_cols) && + (bsize >= BLOCK_SIZE_MB16X16)) { + int r2, d2; + int mb_skip = 0; + subsize = get_subsize(bsize, PARTITION_VERT); 
+ *(get_sb_index(xd, subsize)) = 0; + pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize, + get_block_context(x, subsize)); + if (mi_col + ms <= cm->mi_cols) { int r, d; + update_state(cpi, get_block_context(x, subsize), subsize, 0); + encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize); + *(get_sb_index(xd, subsize)) = 1; + pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &r, &d, subsize, + get_block_context(x, subsize)); + r2 += r; + d2 += d; + } else { + if (mi_col + (ms >> 1) != cm->mi_cols) + mb_skip = 1; + } + set_partition_seg_context(cpi, mi_row, mi_col); + pl = partition_plane_context(xd, bsize); + r2 += x->partition_cost[pl][PARTITION_VERT]; + + if ((RDCOST(x->rdmult, x->rddiv, r2, d2) < + RDCOST(x->rdmult, x->rddiv, srate, sdist)) && !mb_skip) { + srate = r2; + sdist = d2; + *(get_sb_partitioning(x, bsize)) = subsize; + } + restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize); + } - xd->sb_index = 0; - pick_sb_modes(cpi, mi_row, mi_col, - tp, &r, &d, BLOCK_SIZE_SB32X64, - &x->sb32x64_context[xd->sb_index]); - if (mi_col + 4 != cm->mi_cols) { - int r2, d2; - - update_state(cpi, &x->sb32x64_context[xd->sb_index], - BLOCK_SIZE_SB32X64, 0); - encode_superblock(cpi, tp, - 0, mi_row, mi_col, BLOCK_SIZE_SB32X64); - xd->sb_index = 1; - pick_sb_modes(cpi, mi_row, mi_col + 4, - tp, &r2, &d2, BLOCK_SIZE_SB32X64, - &x->sb32x64_context[xd->sb_index]); - r += r2; - d += d2; - } - + // PARTITION_NONE + if (mi_row + ms <= cm->mi_rows && mi_col + ms <= cm->mi_cols) { + int r, d; + pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize, + get_block_context(x, bsize)); + if (bsize >= BLOCK_SIZE_MB16X16) { set_partition_seg_context(cpi, mi_row, mi_col); - pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64); - r += x->partition_cost[pl][PARTITION_VERT]; - - /* is this better than MB coding? 
*/ - if (RDCOST(x->rdmult, x->rddiv, r, d) < - RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) { - sb64_rate = r; - sb64_dist = d; - sb64_partitioning = BLOCK_SIZE_SB32X64; - } + pl = partition_plane_context(xd, bsize); + r += x->partition_cost[pl][PARTITION_NONE]; + } - for (p = 0; p < MAX_MB_PLANE; p++) { - memcpy(cm->above_context[p] + - (mi_col * 2 >> xd->plane[p].subsampling_x), - a + 16 * p, - sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x); - memcpy(cm->left_context[p], l + 16 * p, - sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y); - } + if (RDCOST(x->rdmult, x->rddiv, r, d) < + RDCOST(x->rdmult, x->rddiv, srate, sdist)) { + srate = r; + sdist = d; + if (bsize >= BLOCK_SIZE_MB16X16) + *(get_sb_partitioning(x, bsize)) = bsize; } + } - if (!sb64_skip && - mi_col + 8 <= cm->mi_cols && - mi_row + 8 <= cm->mi_rows) { - int r, d; + assert(srate < INT_MAX && sdist < INT_MAX); + *rate = srate; + *dist = sdist; - pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, - BLOCK_SIZE_SB64X64, &x->sb64_context); + encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize); - set_partition_seg_context(cpi, mi_row, mi_col); - pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64); - r += x->partition_cost[pl][PARTITION_NONE]; + if (bsize == BLOCK_SIZE_SB64X64) + assert(tp_orig < *tp); + else + assert(tp_orig == *tp); +} - if (RDCOST(x->rdmult, x->rddiv, r, d) < - RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) { - sb64_rate = r; - sb64_dist = d; - sb64_partitioning = BLOCK_SIZE_SB64X64; - } - } +static void encode_sb_row(VP9_COMP *cpi, int mi_row, + TOKENEXTRA **tp, int *totalrate) { + VP9_COMMON *const cm = &cpi->common; + int mi_col; - assert(tp_orig == *tp); - encode_sb(cpi, tp, mi_row, mi_col, 1, BLOCK_SIZE_SB64X64, - sb64_partitioning, sb_partitioning, mb_partitioning); - assert(tp_orig < *tp); + // Initialize the left context for the new SB row + vpx_memset(&cm->left_context, 0, sizeof(cm->left_context)); + 
vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context)); + + // Code each SB in the row + for (mi_col = cm->cur_tile_mi_col_start; + mi_col < cm->cur_tile_mi_col_end; mi_col += 8) { + int dummy_rate, dummy_dist; + rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64, + &dummy_rate, &dummy_dist); } } @@ -1559,9 +1275,8 @@ static void encode_frame_internal(VP9_COMP *cpi) { vp9_get_tile_col_offsets(cm, tile_col); for (mi_row = cm->cur_tile_mi_row_start; mi_row < cm->cur_tile_mi_row_end; - mi_row += 8) { + mi_row += 8) encode_sb_row(cpi, mi_row, &tp, &totalrate); - } cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old); assert(tp - cpi->tok <= get_token_alloc(cm->mb_rows, cm->mb_cols)); diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c index ff0725fd0..72238514a 100644 --- a/vp9/encoder/vp9_firstpass.c +++ b/vp9/encoder/vp9_firstpass.c @@ -632,7 +632,7 @@ void vp9_first_pass(VP9_COMP *cpi) { vp9_build_inter_predictors_sby(xd, mb_row << 1, mb_col << 1, BLOCK_SIZE_MB16X16); - vp9_encode_sb(cm, x, BLOCK_SIZE_MB16X16); + vp9_encode_sby(cm, x, BLOCK_SIZE_MB16X16); sum_mvr += mv.as_mv.row; sum_mvr_abs += abs(mv.as_mv.row); sum_mvc += mv.as_mv.col; diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c index 74caba5a0..aff5637e1 100644 --- a/vp9/encoder/vp9_mcomp.c +++ b/vp9/encoder/vp9_mcomp.c @@ -413,6 +413,201 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, return besterr; } + +#if CONFIG_COMP_INTER_JOINT_SEARCH +#undef DIST +/* returns subpixel variance error function */ +#define DIST(r, c) \ + vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \ + z, src_stride, &sse, second_pred) + +int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, + int_mv *bestmv, int_mv *ref_mv, + int error_per_bit, + const vp9_variance_fn_ptr_t *vfp, + int *mvjcost, int *mvcost[2], + int *distortion, + unsigned int *sse1, + const uint8_t *second_pred, int w, int h) { + uint8_t *z = x->plane[0].src.buf; + int src_stride = 
x->plane[0].src.stride; + MACROBLOCKD *xd = &x->e_mbd; + + int rr, rc, br, bc, hstep; + int tr, tc; + unsigned int besterr = INT_MAX; + unsigned int left, right, up, down, diag; + unsigned int sse; + unsigned int whichdir; + unsigned int halfiters = 4; + unsigned int quarteriters = 4; + unsigned int eighthiters = 4; + int thismse; + int maxc, minc, maxr, minr; + int y_stride; + int offset; + int usehp = xd->allow_high_precision_mv; + + uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t)); + uint8_t *y = xd->plane[0].pre[0].buf + + (bestmv->as_mv.row) * xd->plane[0].pre[0].stride + + bestmv->as_mv.col; + + y_stride = xd->plane[0].pre[0].stride; + + rr = ref_mv->as_mv.row; + rc = ref_mv->as_mv.col; + br = bestmv->as_mv.row << 3; + bc = bestmv->as_mv.col << 3; + hstep = 4; + minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) - + ((1 << MV_MAX_BITS) - 1)); + maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) + + ((1 << MV_MAX_BITS) - 1)); + minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) - + ((1 << MV_MAX_BITS) - 1)); + maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) + + ((1 << MV_MAX_BITS) - 1)); + + tr = br; + tc = bc; + + + offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col; + + // central mv + bestmv->as_mv.row <<= 3; + bestmv->as_mv.col <<= 3; + + // calculate central point error + // TODO(yunqingwang): central pointer error was already calculated in full- + // pixel search, and can be passed in this function. 
+ comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride); + besterr = vfp->vf(comp_pred, w, z, src_stride, sse1); + *distortion = besterr; + besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, + error_per_bit, xd->allow_high_precision_mv); + + // Each subsequent iteration checks at least one point in + // common with the last iteration could be 2 ( if diag selected) + while (--halfiters) { + // 1/2 pel + CHECK_BETTER(left, tr, tc - hstep); + CHECK_BETTER(right, tr, tc + hstep); + CHECK_BETTER(up, tr - hstep, tc); + CHECK_BETTER(down, tr + hstep, tc); + + whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); + + switch (whichdir) { + case 0: + CHECK_BETTER(diag, tr - hstep, tc - hstep); + break; + case 1: + CHECK_BETTER(diag, tr - hstep, tc + hstep); + break; + case 2: + CHECK_BETTER(diag, tr + hstep, tc - hstep); + break; + case 3: + CHECK_BETTER(diag, tr + hstep, tc + hstep); + break; + } + + // no reason to check the same one again. + if (tr == br && tc == bc) + break; + + tr = br; + tc = bc; + } + + // Each subsequent iteration checks at least one point in common with + // the last iteration could be 2 ( if diag selected) 1/4 pel + hstep >>= 1; + while (--quarteriters) { + CHECK_BETTER(left, tr, tc - hstep); + CHECK_BETTER(right, tr, tc + hstep); + CHECK_BETTER(up, tr - hstep, tc); + CHECK_BETTER(down, tr + hstep, tc); + + whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); + + switch (whichdir) { + case 0: + CHECK_BETTER(diag, tr - hstep, tc - hstep); + break; + case 1: + CHECK_BETTER(diag, tr - hstep, tc + hstep); + break; + case 2: + CHECK_BETTER(diag, tr + hstep, tc - hstep); + break; + case 3: + CHECK_BETTER(diag, tr + hstep, tc + hstep); + break; + } + + // no reason to check the same one again. 
+ if (tr == br && tc == bc) + break; + + tr = br; + tc = bc; + } + + if (xd->allow_high_precision_mv) { + usehp = vp9_use_nmv_hp(&ref_mv->as_mv); + } else { + usehp = 0; + } + + if (usehp) { + hstep >>= 1; + while (--eighthiters) { + CHECK_BETTER(left, tr, tc - hstep); + CHECK_BETTER(right, tr, tc + hstep); + CHECK_BETTER(up, tr - hstep, tc); + CHECK_BETTER(down, tr + hstep, tc); + + whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2); + + switch (whichdir) { + case 0: + CHECK_BETTER(diag, tr - hstep, tc - hstep); + break; + case 1: + CHECK_BETTER(diag, tr - hstep, tc + hstep); + break; + case 2: + CHECK_BETTER(diag, tr + hstep, tc - hstep); + break; + case 3: + CHECK_BETTER(diag, tr + hstep, tc + hstep); + break; + } + + // no reason to check the same one again. + if (tr == br && tc == bc) + break; + + tr = br; + tc = bc; + } + } + bestmv->as_mv.row = br; + bestmv->as_mv.col = bc; + + vpx_free(comp_pred); + + if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) || + (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3))) + return INT_MAX; + + return besterr; +} +#endif // CONFIG_COMP_INTER_JOINT_SEARCH + #undef MVC #undef PRE #undef DIST @@ -2132,7 +2327,109 @@ int vp9_refining_search_sadx4(MACROBLOCK *x, return INT_MAX; } +#if CONFIG_COMP_INTER_JOINT_SEARCH +/* This function is called when we do joint motion search in comp_inter_inter + * mode. 
+ */ +int vp9_refining_search_8p_c(MACROBLOCK *x, + int_mv *ref_mv, int error_per_bit, + int search_range, vp9_variance_fn_ptr_t *fn_ptr, + int *mvjcost, int *mvcost[2], int_mv *center_mv, + const uint8_t *second_pred, int w, int h) { + const MACROBLOCKD* const xd = &x->e_mbd; + MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0}, + {-1, -1}, {1, -1}, {-1, 1}, {1, 1}}; + int i, j; + int this_row_offset, this_col_offset; + int what_stride = x->plane[0].src.stride; + int in_what_stride = xd->plane[0].pre[0].stride; + uint8_t *what = x->plane[0].src.buf; + uint8_t *best_address = xd->plane[0].pre[0].buf + + (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) + + ref_mv->as_mv.col; + uint8_t *check_here; + unsigned int thissad; + int_mv this_mv; + unsigned int bestsad = INT_MAX; + int_mv fcenter_mv; + + int *mvjsadcost = x->nmvjointsadcost; + int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]}; + + /* Compound pred buffer */ + uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t)); + + fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3; + fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3; + + /* Get compound pred by averaging two pred blocks. */ + comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride); + + bestsad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff) + + mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit); + + for (i = 0; i < search_range; i++) { + int best_site = -1; + + for (j = 0; j < 8; j++) { + this_row_offset = ref_mv->as_mv.row + neighbors[j].row; + this_col_offset = ref_mv->as_mv.col + neighbors[j].col; + + if ((this_col_offset > x->mv_col_min) && + (this_col_offset < x->mv_col_max) && + (this_row_offset > x->mv_row_min) && + (this_row_offset < x->mv_row_max)) { + check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col + + best_address; + + /* Get compound block and use it to calculate SAD. 
*/ + comp_avg_pred(comp_pred, second_pred, w, h, check_here, + in_what_stride); + thissad = fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad); + + if (thissad < bestsad) { + this_mv.as_mv.row = this_row_offset; + this_mv.as_mv.col = this_col_offset; + thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost, + mvsadcost, error_per_bit); + + if (thissad < bestsad) { + bestsad = thissad; + best_site = j; + } + } + } + } + + if (best_site == -1) { + break; + } else { + ref_mv->as_mv.row += neighbors[best_site].row; + ref_mv->as_mv.col += neighbors[best_site].col; + best_address += (neighbors[best_site].row) * in_what_stride + + neighbors[best_site].col; + } + } + + this_mv.as_mv.row = ref_mv->as_mv.row << 3; + this_mv.as_mv.col = ref_mv->as_mv.col << 3; + + if (bestsad < INT_MAX) { + int besterr; + comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride); + besterr = fn_ptr->vf(what, what_stride, comp_pred, w, + (unsigned int *)(&thissad)) + + mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit, + xd->allow_high_precision_mv); + vpx_free(comp_pred); + return besterr; + } else { + vpx_free(comp_pred); + return INT_MAX; + } +} +#endif // CONFIG_COMP_INTER_JOINT_SEARCH #ifdef ENTROPY_STATS void print_mode_context(VP9_COMMON *pc) { diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h index e1ba7fd9d..cdbd29aa5 100644 --- a/vp9/encoder/vp9_mcomp.h +++ b/vp9/encoder/vp9_mcomp.h @@ -79,5 +79,21 @@ typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x, int *mvjcost, int *mvcost[2], int_mv *center_mv); +#if CONFIG_COMP_INTER_JOINT_SEARCH +int vp9_find_best_sub_pixel_comp(MACROBLOCK *x, + int_mv *bestmv, int_mv *ref_mv, + int error_per_bit, + const vp9_variance_fn_ptr_t *vfp, + int *mvjcost, int *mvcost[2], + int *distortion, unsigned int *sse1, + const uint8_t *second_pred, + int w, int h); +int vp9_refining_search_8p_c(MACROBLOCK *x, + int_mv *ref_mv, int error_per_bit, + int search_range, vp9_variance_fn_ptr_t *fn_ptr, + int 
*mvjcost, int *mvcost[2], + int_mv *center_mv, const uint8_t *second_pred, + int w, int h); +#endif // CONFIG_COMP_INTER_JOINT_SEARCH #endif // VP9_ENCODER_VP9_MCOMP_H_ diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c index e55f5551f..610d7330b 100644 --- a/vp9/encoder/vp9_onyx_if.c +++ b/vp9/encoder/vp9_onyx_if.c @@ -1527,10 +1527,11 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { for (i = 0; i < MAX_MODES; i++) cpi->rd_thresh_mult[i] = 128; -#define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \ +#define BFP(BT, SDF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF)\ cpi->fn_ptr[BT].sdf = SDF; \ cpi->fn_ptr[BT].vf = VF; \ cpi->fn_ptr[BT].svf = SVF; \ + cpi->fn_ptr[BT].svaf = SVAF; \ cpi->fn_ptr[BT].svf_halfpix_h = SVFHH; \ cpi->fn_ptr[BT].svf_halfpix_v = SVFHV; \ cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \ @@ -1539,57 +1540,64 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) { cpi->fn_ptr[BT].sdx4df = SDX4DF; BFP(BLOCK_32X16, vp9_sad32x16, vp9_variance32x16, vp9_sub_pixel_variance32x16, - NULL, NULL, + vp9_sub_pixel_avg_variance32x16, NULL, NULL, NULL, NULL, NULL, vp9_sad32x16x4d) BFP(BLOCK_16X32, vp9_sad16x32, vp9_variance16x32, vp9_sub_pixel_variance16x32, - NULL, NULL, + vp9_sub_pixel_avg_variance16x32, NULL, NULL, NULL, NULL, NULL, vp9_sad16x32x4d) BFP(BLOCK_64X32, vp9_sad64x32, vp9_variance64x32, vp9_sub_pixel_variance64x32, - NULL, NULL, + vp9_sub_pixel_avg_variance64x32, NULL, NULL, NULL, NULL, NULL, vp9_sad64x32x4d) BFP(BLOCK_32X64, vp9_sad32x64, vp9_variance32x64, vp9_sub_pixel_variance32x64, - NULL, NULL, + vp9_sub_pixel_avg_variance32x64, NULL, NULL, NULL, NULL, NULL, vp9_sad32x64x4d) BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32, - vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v, + vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h, + vp9_variance_halfpixvar32x32_v, vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8, 
vp9_sad32x32x4d) BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64, - vp9_variance_halfpixvar64x64_h, vp9_variance_halfpixvar64x64_v, + vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h, + vp9_variance_halfpixvar64x64_v, vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8, vp9_sad64x64x4d) BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16, - vp9_variance_halfpixvar16x16_h, vp9_variance_halfpixvar16x16_v, - vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8, - vp9_sad16x16x4d) + vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h, + vp9_variance_halfpixvar16x16_v, + vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8, + vp9_sad16x16x4d) BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8, - NULL, NULL, NULL, vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d) + vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL, + vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d) BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16, - NULL, NULL, NULL, vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d) + vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL, + vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d) BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8, - NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d) + vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL, + vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d) BFP(BLOCK_4X8, NULL, vp9_variance4x8, NULL, - NULL, NULL, NULL, NULL, NULL, NULL) + NULL, NULL, NULL, NULL, NULL, NULL, NULL) BFP(BLOCK_8X4, NULL, vp9_variance8x4, NULL, - NULL, NULL, NULL, NULL, NULL, NULL) + NULL, NULL, NULL, NULL, NULL, NULL, NULL) BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4, - NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d) + vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL, + vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d) cpi->full_search_sad = 
vp9_full_search_sad; cpi->diamond_search_sad = vp9_diamond_search_sad; diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c index 1b143f5e0..48356931a 100644 --- a/vp9/encoder/vp9_rdopt.c +++ b/vp9/encoder/vp9_rdopt.c @@ -1069,9 +1069,7 @@ typedef struct { B_PREDICTION_MODE modes[4]; int_mv mvs[4], second_mvs[4]; int eobs[4]; - int mvthresh; - int *mdcounts; } BEST_SEG_INFO; static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) { @@ -1322,7 +1320,6 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, int_mv *best_ref_mv, int_mv *second_best_ref_mv, int64_t best_rd, - int *mdcounts, int *returntotrate, int *returnyrate, int *returndistortion, @@ -1339,7 +1336,6 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x, bsi.second_ref_mv = second_best_ref_mv; bsi.mvp.as_int = best_ref_mv->as_int; bsi.mvthresh = mvthresh; - bsi.mdcounts = mdcounts; for (i = 0; i < 4; i++) bsi.modes[i] = ZERO4X4; @@ -1612,7 +1608,6 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x, int mi_row, int mi_col, int_mv frame_nearest_mv[MAX_REF_FRAMES], int_mv frame_near_mv[MAX_REF_FRAMES], - int frame_mdcounts[4][4], struct buf_2d yv12_mb[4][MAX_MB_PLANE], struct scale_factors scale[MAX_REF_FRAMES]) { VP9_COMMON *cm = &cpi->common; @@ -1797,7 +1792,7 @@ static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) { static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, BLOCK_SIZE_TYPE bsize, - int mdcounts[4], int64_t txfm_cache[], + int64_t txfm_cache[], int *rate2, int *distortion, int *skippable, int *compmode_cost, int *rate_y, int *distortion_y, @@ -1807,8 +1802,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, INTERPOLATIONFILTERTYPE *best_filter, int_mv frame_mv[MB_MODE_COUNT] [MAX_REF_FRAMES], - YV12_BUFFER_CONFIG *scaled_ref_frame, - int mi_row, int mi_col) { + YV12_BUFFER_CONFIG **scaled_ref_frame, + int mi_row, int mi_col, + int_mv single_newmv[MAX_REF_FRAMES]) { const int bw = 1 << 
mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize); VP9_COMMON *cm = &cpi->common; @@ -1838,6 +1834,152 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, ref_mv[1] = mbmi->ref_mvs[refs[1]][0]; if (is_comp_pred) { +#if CONFIG_COMP_INTER_JOINT_SEARCH + const int b_sz[BLOCK_SIZE_TYPES][2] = { + {4, 4}, + {8, 8}, + {8, 16}, + {16, 8}, + {16, 16}, + {16, 32}, + {32, 16}, + {32, 32}, + {32, 64}, + {64, 32}, + {64, 64} + }; + + int ite; + // Prediction buffer from second frame. + uint8_t *second_pred = vpx_memalign(16, b_sz[bsize][0] * + b_sz[bsize][1] * sizeof(uint8_t)); + + // Do joint motion search in compound mode to get more accurate mv. + struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}}; + struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}}; + struct buf_2d scaled_first_yv12; + + if (scaled_ref_frame[0]) { + int i; + + // Swap out the reference frame for a version that's been scaled to + // match the resolution of the current frame, allowing the existing + // motion search code to be used without additional modifications. + for (i = 0; i < MAX_MB_PLANE; i++) + backup_yv12[i] = xd->plane[i].pre[0]; + + setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col, + NULL, NULL); + } + + if (scaled_ref_frame[1]) { + int i; + + for (i = 0; i < MAX_MB_PLANE; i++) + backup_second_yv12[i] = xd->plane[i].pre[1]; + + setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col, + NULL, NULL); + } + xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0], + mi_row, mi_col); + xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1], + mi_row, mi_col); + + scaled_first_yv12 = xd->plane[0].pre[0]; + + // Initialize mv using single prediction mode result. + frame_mv[NEWMV][refs[0]].as_int = single_newmv[refs[0]].as_int; + frame_mv[NEWMV][refs[1]].as_int = single_newmv[refs[1]].as_int; + + // Iteration: joint search is done once for each ref frame. 
+ // Tried allowing search multiple times iteratively, and break out if + // it couldn't find better mv. But tests didn't show noticeable + // improvement. + for (ite = 0; ite < 2; ite++) { + struct buf_2d ref_yv12[2] = {xd->plane[0].pre[0], + xd->plane[0].pre[1]}; + int bestsme = INT_MAX; + int sadpb = x->sadperbit16; + int_mv tmp_mv; + int search_range = 3; + + int tmp_col_min = x->mv_col_min; + int tmp_col_max = x->mv_col_max; + int tmp_row_min = x->mv_row_min; + int tmp_row_max = x->mv_row_max; + int id = ite % 2; + + // Get pred block from second frame. + vp9_build_inter_predictor(ref_yv12[!id].buf, + ref_yv12[!id].stride, + second_pred, b_sz[bsize][0], + &frame_mv[NEWMV][refs[!id]], + &xd->scale_factor[!id], + b_sz[bsize][0], b_sz[bsize][1], 0, + &xd->subpix); + + // Compound motion search on first ref frame. + if (id) + xd->plane[0].pre[0] = ref_yv12[id]; + vp9_clamp_mv_min_max(x, &ref_mv[id]); + + // Use mv result from single mode as mvp. + tmp_mv.as_int = frame_mv[NEWMV][refs[id]].as_int; + + tmp_mv.as_mv.col >>= 3; + tmp_mv.as_mv.row >>= 3; + + // Small-range full-pixel motion search + bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb, + search_range, + &cpi->fn_ptr[block_size], + x->nmvjointcost, x->mvcost, + &ref_mv[id], second_pred, + b_sz[bsize][0], b_sz[bsize][1]); + + x->mv_col_min = tmp_col_min; + x->mv_col_max = tmp_col_max; + x->mv_row_min = tmp_row_min; + x->mv_row_max = tmp_row_max; + + if (bestsme < INT_MAX) { + int dis; /* TODO: use dis in distortion calculation later. 
*/ + unsigned int sse; + + vp9_find_best_sub_pixel_comp(x, &tmp_mv, + &ref_mv[id], + x->errorperbit, + &cpi->fn_ptr[block_size], + x->nmvjointcost, x->mvcost, + &dis, &sse, second_pred, + b_sz[bsize][0], b_sz[bsize][1]); + } + + frame_mv[NEWMV][refs[id]].as_int = + xd->mode_info_context->bmi[0].as_mv[1].as_int = tmp_mv.as_int; + if (id) + xd->plane[0].pre[0] = scaled_first_yv12; + } + + // restore the predictor + if (scaled_ref_frame[0]) { + int i; + + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[0] = backup_yv12[i]; + } + + if (scaled_ref_frame[1]) { + int i; + + for (i = 0; i < MAX_MB_PLANE; i++) + xd->plane[i].pre[1] = backup_second_yv12[i]; + } + + vpx_free(second_pred); +#endif // CONFIG_COMP_INTER_JOINT_SEARCH + if (frame_mv[NEWMV][refs[0]].as_int == INVALID_MV || frame_mv[NEWMV][refs[1]].as_int == INVALID_MV) return INT64_MAX; @@ -1862,7 +2004,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, int tmp_row_min = x->mv_row_min; int tmp_row_max = x->mv_row_max; - if (scaled_ref_frame) { + if (scaled_ref_frame[0]) { int i; // Swap out the reference frame for a version that's been scaled to @@ -1871,7 +2013,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, for (i = 0; i < MAX_MB_PLANE; i++) backup_yv12[i] = xd->plane[i].pre[0]; - setup_pre_planes(xd, scaled_ref_frame, NULL, mi_row, mi_col, + setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col, NULL, NULL); } @@ -1914,6 +2056,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, } frame_mv[NEWMV][refs[0]].as_int = xd->mode_info_context->bmi[0].as_mv[0].as_int = tmp_mv.as_int; + single_newmv[refs[0]].as_int = tmp_mv.as_int; // Add the new motion vector cost to our rolling cost variable *rate2 += vp9_mv_bit_cost(&tmp_mv, &ref_mv[0], @@ -1921,7 +2064,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x, 96, xd->allow_high_precision_mv); // restore the predictor, if required - if (scaled_ref_frame) { + if (scaled_ref_frame[0]) { int i; 
for (i = 0; i < MAX_MB_PLANE; i++) @@ -2203,15 +2346,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, unsigned char segment_id = xd->mode_info_context->mbmi.segment_id; int comp_pred, i; int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES]; - int frame_mdcounts[4][4]; struct buf_2d yv12_mb[4][MAX_MB_PLANE]; + int_mv single_newmv[MAX_REF_FRAMES]; static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG }; int idx_list[4] = {0, cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx}; - int mdcounts[4]; int64_t best_rd = INT64_MAX; int64_t best_txfm_rd[NB_TXFM_MODES]; int64_t best_txfm_diff[NB_TXFM_MODES]; @@ -2251,6 +2393,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, xd->mode_info_context->mbmi.segment_id = segment_id; estimate_ref_frame_costs(cpi, segment_id, ref_costs); vpx_memset(&best_mbmode, 0, sizeof(best_mbmode)); + vpx_memset(&single_newmv, 0, sizeof(single_newmv)); for (i = 0; i < NB_PREDICTION_TYPES; ++i) best_pred_rd[i] = INT64_MAX; @@ -2293,7 +2436,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, if (cpi->ref_frame_flags & flag_list[ref_frame]) { setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size, mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV], - frame_mdcounts, yv12_mb, scale_factor); + yv12_mb, scale_factor); } frame_mv[NEWMV][ref_frame].as_int = INVALID_MV; frame_mv[ZEROMV][ref_frame].as_int = 0; @@ -2420,8 +2563,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, xd->plane[i].pre[1] = yv12_mb[second_ref][i]; } - vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts)); - // If the segment reference frame feature is enabled.... // then do nothing if the current ref frame is not allowed.. 
if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) && @@ -2519,7 +2660,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &mbmi->ref_mvs[mbmi->ref_frame][0], - second_ref, INT64_MAX, mdcounts, + second_ref, INT64_MAX, &rate, &rate_y, &distortion, &skippable, (int)this_rd_thresh, seg_mvs); @@ -2558,7 +2699,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, // switchable list (bilinear, 6-tap) is indicated at the frame level tmp_rd = rd_pick_best_mbsegmentation(cpi, x, &mbmi->ref_mvs[mbmi->ref_frame][0], - second_ref, INT64_MAX, mdcounts, + second_ref, INT64_MAX, &rate, &rate_y, &distortion, &skippable, (int)this_rd_thresh, seg_mvs); @@ -2608,7 +2749,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred); mbmi->mode = this_mode; } else { - YV12_BUFFER_CONFIG *scaled_ref_frame = NULL; + YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL}; int fb; if (mbmi->ref_frame == LAST_FRAME) { @@ -2620,17 +2761,31 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x, } if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb]) - scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]]; + scaled_ref_frame[0] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]]; + + if (comp_pred) { + if (mbmi->second_ref_frame == LAST_FRAME) { + fb = cpi->lst_fb_idx; + } else if (mbmi->second_ref_frame == GOLDEN_FRAME) { + fb = cpi->gld_fb_idx; + } else { + fb = cpi->alt_fb_idx; + } + + if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb]) + scaled_ref_frame[1] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]]; + } this_rd = handle_inter_mode(cpi, x, bsize, - mdcounts, txfm_cache, + txfm_cache, &rate2, &distortion2, &skippable, &compmode_cost, &rate_y, &distortion_y, &rate_uv, &distortion_uv, &mode_excluded, &disable_skip, mode_index, &tmp_best_filter, frame_mv, - scaled_ref_frame, mi_row, mi_col); + scaled_ref_frame, mi_row, mi_col, + 
single_newmv); if (this_rd == INT64_MAX) continue; } diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h index 13dabbda4..306476b01 100644 --- a/vp9/encoder/vp9_variance.h +++ b/vp9/encoder/vp9_variance.h @@ -12,6 +12,7 @@ #define VP9_ENCODER_VP9_VARIANCE_H_ #include "vpx/vpx_integer.h" +// #include "./vpx_config.h" typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr, int source_stride, @@ -50,6 +51,15 @@ typedef unsigned int (*vp9_subpixvariance_fn_t)(const uint8_t *src_ptr, int Refstride, unsigned int *sse); +typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr, + int source_stride, + int xoffset, + int yoffset, + const uint8_t *ref_ptr, + int Refstride, + unsigned int *sse, + const uint8_t *second_pred); + typedef void (*vp9_ssimpf_fn_t)(uint8_t *s, int sp, uint8_t *r, int rp, unsigned long *sum_s, unsigned long *sum_r, unsigned long *sum_sq_s, @@ -64,15 +74,33 @@ typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr, int ref_stride); typedef struct vp9_variance_vtable { - vp9_sad_fn_t sdf; - vp9_variance_fn_t vf; - vp9_subpixvariance_fn_t svf; - vp9_variance_fn_t svf_halfpix_h; - vp9_variance_fn_t svf_halfpix_v; - vp9_variance_fn_t svf_halfpix_hv; - vp9_sad_multi_fn_t sdx3f; - vp9_sad_multi1_fn_t sdx8f; - vp9_sad_multi_d_fn_t sdx4df; + vp9_sad_fn_t sdf; + vp9_variance_fn_t vf; + vp9_subpixvariance_fn_t svf; + vp9_subp_avg_variance_fn_t svaf; + vp9_variance_fn_t svf_halfpix_h; + vp9_variance_fn_t svf_halfpix_v; + vp9_variance_fn_t svf_halfpix_hv; + vp9_sad_multi_fn_t sdx3f; + vp9_sad_multi1_fn_t sdx8f; + vp9_sad_multi_d_fn_t sdx4df; } vp9_variance_fn_ptr_t; +// #if CONFIG_COMP_INTER_JOINT_SEARCH +static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int weight, + int height, uint8_t *ref, int ref_stride) { + int i, j; + + for (i = 0; i < height; i++) { + for (j = 0; j < weight; j++) { + int tmp; + tmp = pred[j] + ref[j]; + comp_pred[j] = (tmp + 1) >> 1; + } + comp_pred += weight; + 
pred += weight; + ref += ref_stride; + } +} +// #endif // CONFIG_COMP_INTER_JOINT_SEARCH #endif // VP9_ENCODER_VP9_VARIANCE_H_ diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c index c2a600408..fa53abdec 100644 --- a/vp9/encoder/vp9_variance_c.c +++ b/vp9/encoder/vp9_variance_c.c @@ -13,6 +13,7 @@ #include "vp9/common/vp9_filter.h" #include "vp9/common/vp9_subpelvar.h" #include "vpx/vpx_integer.h" +#include "vpx_ports/mem.h" unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) { unsigned int i, sum = 0; @@ -58,6 +59,29 @@ unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr, return vp9_variance64x32_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, + const uint8_t *second_pred) { + uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering + uint8_t temp2[68 * 64]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64); // compound pred buffer + const int16_t *hfilter, *vfilter; + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, + 1, 33, 64, hfilter); + var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter); + comp_avg_pred(temp3, second_pred, 64, 32, temp2, 64); + return vp9_variance64x32_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse); +} + unsigned int vp9_variance32x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, @@ -92,6 +116,29 @@ unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr, return vp9_variance32x64_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + 
unsigned int *sse, + const uint8_t *second_pred) { + uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering + uint8_t temp2[68 * 64]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 64); // compound pred buffer + const int16_t *hfilter, *vfilter; + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, + 1, 65, 32, hfilter); + var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter); + comp_avg_pred(temp3, second_pred, 32, 64, temp2, 32); + return vp9_variance32x64_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse); +} + unsigned int vp9_variance32x16_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, @@ -126,6 +173,29 @@ unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr, return vp9_variance32x16_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, + const uint8_t *second_pred) { + uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering + uint8_t temp2[36 * 32]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 16); // compound pred buffer + const int16_t *hfilter, *vfilter; + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, + 1, 17, 32, hfilter); + var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter); + comp_avg_pred(temp3, second_pred, 32, 16, temp2, 32); + return vp9_variance32x16_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse); +} + unsigned int vp9_variance16x32_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, @@ -160,6 +230,29 @@ unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr, return 
vp9_variance16x32_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, + const uint8_t *second_pred) { + uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering + uint8_t temp2[36 * 32]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 32); // compound pred buffer + const int16_t *hfilter, *vfilter; + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, + 1, 33, 16, hfilter); + var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter); + comp_avg_pred(temp3, second_pred, 16, 32, temp2, 16); + return vp9_variance16x32_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse); +} + unsigned int vp9_variance64x64_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, @@ -317,6 +410,31 @@ unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr, return vp9_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, + const uint8_t *second_pred) { + uint8_t temp2[20 * 16]; + const int16_t *hfilter, *vfilter; + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 4); // compound pred buffer + uint16_t fdata3[5 * 4]; // Temp data buffer used in filtering + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + // First filter 1d Horizontal + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, + 1, 5, 4, hfilter); + + // Now filter vertically + var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, vfilter); + comp_avg_pred(temp3, second_pred, 4, 4, temp2, 4); 
+ return vp9_variance4x4_c(temp3, 4, dst_ptr, dst_pixels_per_line, sse); +} unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, int src_pixels_per_line, @@ -339,6 +457,29 @@ unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr, return vp9_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, + const uint8_t *second_pred) { + uint16_t fdata3[9 * 8]; // Temp data buffer used in filtering + uint8_t temp2[20 * 16]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 8); // compound pred buffer + const int16_t *hfilter, *vfilter; + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, + 1, 9, 8, hfilter); + var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter); + comp_avg_pred(temp3, second_pred, 8, 8, temp2, 8); + return vp9_variance8x8_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse); +} + unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, @@ -360,6 +501,30 @@ unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr, return vp9_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, + const uint8_t *second_pred) { + uint16_t fdata3[17 * 16]; + uint8_t temp2[20 * 16]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 16); // compound pred buffer + const int16_t *hfilter, *vfilter; + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + var_filter_block2d_bil_first_pass(src_ptr, fdata3, 
src_pixels_per_line, + 1, 17, 16, hfilter); + var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter); + + comp_avg_pred(temp3, second_pred, 16, 16, temp2, 16); + return vp9_variance16x16_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse); +} + unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, @@ -381,6 +546,29 @@ unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr, return vp9_variance64x64_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, + const uint8_t *second_pred) { + uint16_t fdata3[65 * 64]; // Temp data bufffer used in filtering + uint8_t temp2[68 * 64]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64); // compound pred buffer + const int16_t *hfilter, *vfilter; + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, + 1, 65, 64, hfilter); + var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter); + comp_avg_pred(temp3, second_pred, 64, 64, temp2, 64); + return vp9_variance64x64_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse); +} + unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, @@ -402,6 +590,29 @@ unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr, return vp9_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, + const uint8_t *second_pred) { + uint16_t fdata3[33 * 32]; // Temp data bufffer used in filtering + uint8_t temp2[36 * 32]; + 
DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 32); // compound pred buffer + const int16_t *hfilter, *vfilter; + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, + 1, 33, 32, hfilter); + var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter); + comp_avg_pred(temp3, second_pred, 32, 32, temp2, 32); + return vp9_variance32x32_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse); +} + unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, @@ -543,6 +754,29 @@ unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr, return vp9_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr, + int src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, + const uint8_t *second_pred) { + uint16_t fdata3[16 * 9]; // Temp data bufffer used in filtering + uint8_t temp2[20 * 16]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 8); // compound pred buffer + const int16_t *hfilter, *vfilter; + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, + 1, 9, 16, hfilter); + var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter); + comp_avg_pred(temp3, second_pred, 16, 8, temp2, 16); + return vp9_variance16x8_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse); +} + unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, int src_pixels_per_line, int xoffset, @@ -564,3 +798,25 @@ unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr, return vp9_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse); } +unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr, + int 
src_pixels_per_line, + int xoffset, + int yoffset, + const uint8_t *dst_ptr, + int dst_pixels_per_line, + unsigned int *sse, + const uint8_t *second_pred) { + uint16_t fdata3[9 * 16]; // Temp data bufffer used in filtering + uint8_t temp2[20 * 16]; + DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 16); // compound pred buffer + const int16_t *hfilter, *vfilter; + + hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset); + vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset); + + var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line, + 1, 17, 8, hfilter); + var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter); + comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8); + return vp9_variance8x16_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse); +} |