summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xconfigure1
-rw-r--r--vp9/common/vp9_blockd.h22
-rw-r--r--vp9/common/vp9_enums.h1
-rw-r--r--vp9/common/vp9_loopfilter_filters.c34
-rw-r--r--vp9/common/vp9_rtcd_defs.sh57
-rw-r--r--vp9/common/x86/vp9_loopfilter_intrin_mmx.c16
-rw-r--r--vp9/common/x86/vp9_loopfilter_intrin_sse2.c16
-rw-r--r--vp9/common/x86/vp9_loopfilter_mmx.asm343
-rw-r--r--vp9/common/x86/vp9_loopfilter_sse2.asm366
-rw-r--r--vp9/common/x86/vp9_loopfilter_x86.h8
-rw-r--r--vp9/decoder/vp9_decodframe.c4
-rw-r--r--vp9/decoder/vp9_detokenize.c7
-rw-r--r--vp9/encoder/vp9_block.h8
-rw-r--r--vp9/encoder/vp9_encodeframe.c781
-rw-r--r--vp9/encoder/vp9_firstpass.c2
-rw-r--r--vp9/encoder/vp9_mcomp.c297
-rw-r--r--vp9/encoder/vp9_mcomp.h16
-rw-r--r--vp9/encoder/vp9_onyx_if.c40
-rw-r--r--vp9/encoder/vp9_rdopt.c199
-rw-r--r--vp9/encoder/vp9_variance.h46
-rw-r--r--vp9/encoder/vp9_variance_c.c256
21 files changed, 1113 insertions, 1407 deletions
diff --git a/configure b/configure
index 5cbf07095..cc8c58141 100755
--- a/configure
+++ b/configure
@@ -247,6 +247,7 @@ EXPERIMENT_LIST="
multiple_arf
non420
ab4x4
+ comp_inter_joint_search
"
CONFIG_LIST="
external_build
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index ab9e28dcc..004054d10 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -70,17 +70,17 @@ typedef enum {
} INTERPOLATIONFILTERTYPE;
typedef enum {
- DC_PRED, /* average of above and left pixels */
- V_PRED, /* vertical prediction */
- H_PRED, /* horizontal prediction */
- D45_PRED, /* Directional 45 deg prediction [anti-clockwise from 0 deg hor] */
- D135_PRED, /* Directional 135 deg prediction [anti-clockwise from 0 deg hor] */
- D117_PRED, /* Directional 112 deg prediction [anti-clockwise from 0 deg hor] */
- D153_PRED, /* Directional 157 deg prediction [anti-clockwise from 0 deg hor] */
- D27_PRED, /* Directional 22 deg prediction [anti-clockwise from 0 deg hor] */
- D63_PRED, /* Directional 67 deg prediction [anti-clockwise from 0 deg hor] */
- TM_PRED, /* Truemotion prediction */
- I4X4_PRED, /* 4x4 based prediction, each 4x4 has its own mode */
+ DC_PRED, // Average of above and left pixels
+ V_PRED, // Vertical
+ H_PRED, // Horizontal
+ D45_PRED, // Directional 45 deg = round(arctan(1/1) * 180/pi)
+ D135_PRED, // Directional 135 deg = 180 - 45
+ D117_PRED, // Directional 117 deg = 180 - 63
+ D153_PRED, // Directional 153 deg = 180 - 27
+ D27_PRED, // Directional 27 deg = round(arctan(1/2) * 180/pi)
+ D63_PRED, // Directional 63 deg = round(arctan(2/1) * 180/pi)
+ TM_PRED, // True-motion
+ I4X4_PRED, // Each 4x4 subblock has its own mode
NEARESTMV,
NEARMV,
ZEROMV,
diff --git a/vp9/common/vp9_enums.h b/vp9/common/vp9_enums.h
index 166319565..2f6707487 100644
--- a/vp9/common/vp9_enums.h
+++ b/vp9/common/vp9_enums.h
@@ -36,6 +36,7 @@ typedef enum BLOCK_SIZE_TYPE {
BLOCK_SIZE_SB32X64,
BLOCK_SIZE_SB64X32,
BLOCK_SIZE_SB64X64,
+ BLOCK_SIZE_TYPES
} BLOCK_SIZE_TYPE;
typedef enum PARTITION_TYPE {
diff --git a/vp9/common/vp9_loopfilter_filters.c b/vp9/common/vp9_loopfilter_filters.c
index 15785f581..bf97589a9 100644
--- a/vp9/common/vp9_loopfilter_filters.c
+++ b/vp9/common/vp9_loopfilter_filters.c
@@ -282,29 +282,6 @@ static INLINE void simple_filter(int8_t mask,
*op0 = signed_char_clamp(p0 + filter2) ^ 0x80;
}
-void vp9_loop_filter_simple_horizontal_edge_c(uint8_t *s, int p,
- const uint8_t *blimit) {
- int i = 0;
-
- do {
- const int8_t mask = simple_filter_mask(blimit[0], s[-2 * p], s[-1 * p],
- s[0 * p], s[1 * p]);
- simple_filter(mask, s - 2 * p, s - 1 * p, s, s + 1 * p);
- ++s;
- } while (++i < 16);
-}
-
-void vp9_loop_filter_simple_vertical_edge_c(uint8_t *s, int p,
- const uint8_t *blimit) {
- int i = 0;
-
- do {
- const int8_t mask = simple_filter_mask(blimit[0], s[-2], s[-1], s[0], s[1]);
- simple_filter(mask, s - 2, s - 1, s, s + 1);
- s += p;
- } while (++i < 16);
-}
-
/* Vertical MB Filtering */
void vp9_loop_filter_mbv_c(uint8_t *y_ptr, uint8_t *u_ptr,
uint8_t *v_ptr, int y_stride, int uv_stride,
@@ -392,11 +369,6 @@ void vp9_loop_filter_bh8x8_c(uint8_t *y, uint8_t *u, uint8_t *v,
lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
-void vp9_loop_filter_bhs_c(uint8_t *y, int y_stride, const uint8_t *blimit) {
- vp9_loop_filter_simple_horizontal_edge_c(y + 4 * y_stride, y_stride, blimit);
- vp9_loop_filter_simple_horizontal_edge_c(y + 8 * y_stride, y_stride, blimit);
- vp9_loop_filter_simple_horizontal_edge_c(y + 12 * y_stride, y_stride, blimit);
-}
void vp9_loop_filter_bv8x8_c(uint8_t *y, uint8_t *u, uint8_t *v,
int y_stride, int uv_stride,
@@ -413,12 +385,6 @@ void vp9_loop_filter_bv8x8_c(uint8_t *y, uint8_t *u, uint8_t *v,
lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
-void vp9_loop_filter_bvs_c(uint8_t *y, int y_stride, const uint8_t *blimit) {
- vp9_loop_filter_simple_vertical_edge_c(y + 4, y_stride, blimit);
- vp9_loop_filter_simple_vertical_edge_c(y + 8, y_stride, blimit);
- vp9_loop_filter_simple_vertical_edge_c(y + 12, y_stride, blimit);
-}
-
static INLINE void wide_mbfilter(int8_t mask, uint8_t hev,
uint8_t flat, uint8_t flat2,
uint8_t *op7, uint8_t *op6, uint8_t *op5,
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 75e36040c..02d32530a 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -128,30 +128,6 @@ specialize vp9_loop_filter_bh sse2
prototype void vp9_loop_filter_bh8x8 "uint8_t *y, uint8_t *u, uint8_t *v, int ystride, int uv_stride, struct loop_filter_info *lfi"
specialize vp9_loop_filter_bh8x8 sse2
-prototype void vp9_loop_filter_simple_mbv "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_mbv mmx sse2
-vp9_loop_filter_simple_mbv_c=vp9_loop_filter_simple_vertical_edge_c
-vp9_loop_filter_simple_mbv_mmx=vp9_loop_filter_simple_vertical_edge_mmx
-vp9_loop_filter_simple_mbv_sse2=vp9_loop_filter_simple_vertical_edge_sse2
-
-prototype void vp9_loop_filter_simple_mbh "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_mbh mmx sse2
-vp9_loop_filter_simple_mbh_c=vp9_loop_filter_simple_horizontal_edge_c
-vp9_loop_filter_simple_mbh_mmx=vp9_loop_filter_simple_horizontal_edge_mmx
-vp9_loop_filter_simple_mbh_sse2=vp9_loop_filter_simple_horizontal_edge_sse2
-
-prototype void vp9_loop_filter_simple_bv "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_bv mmx sse2
-vp9_loop_filter_simple_bv_c=vp9_loop_filter_bvs_c
-vp9_loop_filter_simple_bv_mmx=vp9_loop_filter_bvs_mmx
-vp9_loop_filter_simple_bv_sse2=vp9_loop_filter_bvs_sse2
-
-prototype void vp9_loop_filter_simple_bh "uint8_t *y, int ystride, const uint8_t *blimit"
-specialize vp9_loop_filter_simple_bh mmx sse2
-vp9_loop_filter_simple_bh_c=vp9_loop_filter_bhs_c
-vp9_loop_filter_simple_bh_mmx=vp9_loop_filter_bhs_mmx
-vp9_loop_filter_simple_bh_sse2=vp9_loop_filter_bhs_sse2
-
prototype void vp9_lpf_mbh_w "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi"
specialize vp9_lpf_mbh_w sse2
@@ -337,41 +313,74 @@ vp9_variance4x4_mmx=vp9_variance4x4_mmx
prototype unsigned int vp9_sub_pixel_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance64x64 sse2
+prototype unsigned int vp9_sub_pixel_avg_variance64x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance64x64
+
prototype unsigned int vp9_sub_pixel_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x64
+prototype unsigned int vp9_sub_pixel_avg_variance32x64 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x64
+
prototype unsigned int vp9_sub_pixel_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance64x32
+prototype unsigned int vp9_sub_pixel_avg_variance64x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance64x32
+
prototype unsigned int vp9_sub_pixel_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x16
+prototype unsigned int vp9_sub_pixel_avg_variance32x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x16
+
prototype unsigned int vp9_sub_pixel_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x32
+prototype unsigned int vp9_sub_pixel_avg_variance16x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x32
+
prototype unsigned int vp9_sub_pixel_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance32x32 sse2
+prototype unsigned int vp9_sub_pixel_avg_variance32x32 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance32x32
+
prototype unsigned int vp9_sub_pixel_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x16 sse2 mmx ssse3
+prototype unsigned int vp9_sub_pixel_avg_variance16x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x16
+
prototype unsigned int vp9_sub_pixel_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x16 sse2 mmx
vp9_sub_pixel_variance8x16_sse2=vp9_sub_pixel_variance8x16_wmt
+prototype unsigned int vp9_sub_pixel_avg_variance8x16 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance8x16
+
prototype unsigned int vp9_sub_pixel_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance16x8 sse2 mmx ssse3
vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_ssse3;
vp9_sub_pixel_variance16x8_sse2=vp9_sub_pixel_variance16x8_wmt
+prototype unsigned int vp9_sub_pixel_avg_variance16x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance16x8
+
prototype unsigned int vp9_sub_pixel_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance8x8 sse2 mmx
vp9_sub_pixel_variance8x8_sse2=vp9_sub_pixel_variance8x8_wmt
+prototype unsigned int vp9_sub_pixel_avg_variance8x8 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance8x8
+
prototype unsigned int vp9_sub_pixel_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse"
specialize vp9_sub_pixel_variance4x4 sse2 mmx
vp9_sub_pixel_variance4x4_sse2=vp9_sub_pixel_variance4x4_wmt
+prototype unsigned int vp9_sub_pixel_avg_variance4x4 "const uint8_t *src_ptr, int source_stride, int xoffset, int yoffset, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, const uint8_t *second_pred"
+specialize vp9_sub_pixel_avg_variance4x4
+
prototype unsigned int vp9_sad64x64 "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int max_sad"
specialize vp9_sad64x64 sse2
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_mmx.c b/vp9/common/x86/vp9_loopfilter_intrin_mmx.c
index 2be9e3179..7e6c4be2c 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_mmx.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_mmx.c
@@ -35,16 +35,6 @@ void vp9_loop_filter_bh_mmx(unsigned char *y_ptr,
}
-void vp9_loop_filter_bhs_mmx(unsigned char *y_ptr, int y_stride,
- const unsigned char *blimit) {
- vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 4 * y_stride,
- y_stride, blimit);
- vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 8 * y_stride,
- y_stride, blimit);
- vp9_loop_filter_simple_horizontal_edge_mmx(y_ptr + 12 * y_stride,
- y_stride, blimit);
-}
-
/* Vertical B Filtering */
void vp9_loop_filter_bv_mmx(unsigned char *y_ptr,
unsigned char *u_ptr, unsigned char *v_ptr,
@@ -66,9 +56,3 @@ void vp9_loop_filter_bv_mmx(unsigned char *y_ptr,
lfi->blim, lfi->lim, lfi->hev_thr, 1);
}
-void vp9_loop_filter_bvs_mmx(unsigned char *y_ptr, int y_stride,
- const unsigned char *blimit) {
- vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 4, y_stride, blimit);
- vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 8, y_stride, blimit);
- vp9_loop_filter_simple_vertical_edge_mmx(y_ptr + 12, y_stride, blimit);
-}
diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
index 08447a62d..7982ca6a2 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
@@ -1115,16 +1115,6 @@ void vp9_loop_filter_bh_sse2(unsigned char *y_ptr,
v_ptr + 4 * uv_stride);
}
-void vp9_loop_filter_bhs_sse2(unsigned char *y_ptr, int y_stride,
- const unsigned char *blimit) {
- vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 4 * y_stride,
- y_stride, blimit);
- vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 8 * y_stride,
- y_stride, blimit);
- vp9_loop_filter_simple_horizontal_edge_sse2(y_ptr + 12 * y_stride,
- y_stride, blimit);
-}
-
/* Vertical B Filtering */
void vp9_loop_filter_bv_sse2(unsigned char *y_ptr,
unsigned char *u_ptr, unsigned char *v_ptr,
@@ -1143,9 +1133,3 @@ void vp9_loop_filter_bv_sse2(unsigned char *y_ptr,
v_ptr + 4);
}
-void vp9_loop_filter_bvs_sse2(unsigned char *y_ptr, int y_stride,
- const unsigned char *blimit) {
- vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 4, y_stride, blimit);
- vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 8, y_stride, blimit);
- vp9_loop_filter_simple_vertical_edge_sse2(y_ptr + 12, y_stride, blimit);
-}
diff --git a/vp9/common/x86/vp9_loopfilter_mmx.asm b/vp9/common/x86/vp9_loopfilter_mmx.asm
index ceffdf558..4ebb51b77 100644
--- a/vp9/common/x86/vp9_loopfilter_mmx.asm
+++ b/vp9/common/x86/vp9_loopfilter_mmx.asm
@@ -593,349 +593,6 @@ sym(vp9_loop_filter_vertical_edge_mmx):
pop rbp
ret
-
-;void vp9_loop_filter_simple_horizontal_edge_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit
-;)
-global sym(vp9_loop_filter_simple_horizontal_edge_mmx) PRIVATE
-sym(vp9_loop_filter_simple_horizontal_edge_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- mov rcx, 2 ; count
-.nexts8_h:
- mov rdx, arg(2) ;blimit ; get blimit
- movq mm3, [rdx] ;
-
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
- add rdi, rax
- neg rax
-
- ; calculate mask
- movq mm1, [rsi+2*rax] ; p1
- movq mm0, [rdi] ; q1
- movq mm2, mm1
- movq mm7, mm0
- movq mm4, mm0
- psubusb mm0, mm1 ; q1-=p1
- psubusb mm1, mm4 ; p1-=q1
- por mm1, mm0 ; abs(p1-q1)
- pand mm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw mm1, 1 ; abs(p1-q1)/2
-
- movq mm5, [rsi+rax] ; p0
- movq mm4, [rsi] ; q0
- movq mm0, mm4 ; q0
- movq mm6, mm5 ; p0
- psubusb mm5, mm4 ; p0-=q0
- psubusb mm4, mm6 ; q0-=p0
- por mm5, mm4 ; abs(p0 - q0)
- paddusb mm5, mm5 ; abs(p0-q0)*2
- paddusb mm5, mm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- psubusb mm5, mm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
- pxor mm3, mm3
- pcmpeqb mm5, mm3
-
- ; start work on filters
- pxor mm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor mm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
- psubsb mm2, mm7 ; p1 - q1
-
- pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor mm0, [GLOBAL(t80)] ; offset to convert to signed values
- movq mm3, mm0 ; q0
- psubsb mm0, mm6 ; q0 - p0
- paddsb mm2, mm0 ; p1 - q1 + 1 * (q0 - p0)
- paddsb mm2, mm0 ; p1 - q1 + 2 * (q0 - p0)
- paddsb mm2, mm0 ; p1 - q1 + 3 * (q0 - p0)
- pand mm5, mm2 ; mask filter values we don't care about
-
- ; do + 4 side
- paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
-
- movq mm0, mm5 ; get a copy of filters
- psllw mm0, 8 ; shift left 8
- psraw mm0, 3 ; arithmetic shift right 11
- psrlw mm0, 8
- movq mm1, mm5 ; get a copy of filters
- psraw mm1, 11 ; arithmetic shift right 11
- psllw mm1, 8 ; shift left 8 to put it back
-
- por mm0, mm1 ; put the two together to get result
-
- psubsb mm3, mm0 ; q0-= q0 add
- pxor mm3, [GLOBAL(t80)] ; unoffset
- movq [rsi], mm3 ; write back
-
-
- ; now do +3 side
- psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
-
- movq mm0, mm5 ; get a copy of filters
- psllw mm0, 8 ; shift left 8
- psraw mm0, 3 ; arithmetic shift right 11
- psrlw mm0, 8
- psraw mm5, 11 ; arithmetic shift right 11
- psllw mm5, 8 ; shift left 8 to put it back
- por mm0, mm5 ; put the two together to get result
-
-
- paddsb mm6, mm0 ; p0+= p0 add
- pxor mm6, [GLOBAL(t80)] ; unoffset
- movq [rsi+rax], mm6 ; write back
-
- add rsi,8
- neg rax
- dec rcx
- jnz .nexts8_h
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_loop_filter_simple_vertical_edge_mmx
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit
-;)
-global sym(vp9_loop_filter_simple_vertical_edge_mmx) PRIVATE
-sym(vp9_loop_filter_simple_vertical_edge_mmx):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 32 ; reserve 32 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[8];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[8];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- lea rsi, [rsi + rax*4- 2]; ;
- mov rcx, 2 ; count
-.nexts8_v:
-
- lea rdi, [rsi + rax];
- movd mm0, [rdi + rax * 2] ; xx xx xx xx 73 72 71 70
-
- movd mm6, [rsi + rax * 2] ; xx xx xx xx 63 62 61 60
- punpcklbw mm6, mm0 ; 73 63 72 62 71 61 70 60
-
- movd mm0, [rsi + rax] ; xx xx xx xx 53 52 51 50
- movd mm4, [rsi] ; xx xx xx xx 43 42 41 40
-
- punpcklbw mm4, mm0 ; 53 43 52 42 51 41 50 40
- movq mm5, mm4 ; 53 43 52 42 51 41 50 40
-
- punpcklwd mm4, mm6 ; 71 61 51 41 70 60 50 40
- punpckhwd mm5, mm6 ; 73 63 53 43 72 62 52 42
-
- neg rax
-
- movd mm7, [rsi + rax] ; xx xx xx xx 33 32 31 30
- movd mm6, [rsi + rax * 2] ; xx xx xx xx 23 22 21 20
-
- punpcklbw mm6, mm7 ; 33 23 32 22 31 21 30 20
- movd mm1, [rdi + rax * 4] ; xx xx xx xx 13 12 11 10
-
- movd mm0, [rsi + rax * 4] ; xx xx xx xx 03 02 01 00
- punpcklbw mm0, mm1 ; 13 03 12 02 11 01 10 00
-
- movq mm2, mm0 ; 13 03 12 02 11 01 10 00
- punpcklwd mm0, mm6 ; 31 21 11 01 30 20 10 00
-
- punpckhwd mm2, mm6 ; 33 23 13 03 32 22 12 02
- movq mm1, mm0 ; 13 03 12 02 11 01 10 00
-
- punpckldq mm0, mm4 ; 70 60 50 40 30 20 10 00 = p1
- movq mm3, mm2 ; 33 23 13 03 32 22 12 02
-
- punpckhdq mm1, mm4 ; 71 61 51 41 31 21 11 01 = p0
- punpckldq mm2, mm5 ; 72 62 52 42 32 22 12 02 = q0
-
- punpckhdq mm3, mm5 ; 73 63 53 43 33 23 13 03 = q1
-
-
- ; calculate mask
- movq mm6, mm0 ; p1
- movq mm7, mm3 ; q1
- psubusb mm7, mm6 ; q1-=p1
- psubusb mm6, mm3 ; p1-=q1
- por mm6, mm7 ; abs(p1-q1)
- pand mm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw mm6, 1 ; abs(p1-q1)/2
-
- movq mm5, mm1 ; p0
- movq mm4, mm2 ; q0
-
- psubusb mm5, mm2 ; p0-=q0
- psubusb mm4, mm1 ; q0-=p0
-
- por mm5, mm4 ; abs(p0 - q0)
- paddusb mm5, mm5 ; abs(p0-q0)*2
- paddusb mm5, mm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- mov rdx, arg(2) ;blimit ; get blimit
- movq mm7, [rdx]
-
- psubusb mm5, mm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
- pxor mm7, mm7
- pcmpeqb mm5, mm7 ; mm5 = mask
-
- ; start work on filters
- movq t0, mm0
- movq t1, mm3
-
- pxor mm0, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor mm3, [GLOBAL(t80)] ; q1 offset to convert to signed values
-
- psubsb mm0, mm3 ; p1 - q1
- movq mm6, mm1 ; p0
-
- movq mm7, mm2 ; q0
- pxor mm6, [GLOBAL(t80)] ; offset to convert to signed values
-
- pxor mm7, [GLOBAL(t80)] ; offset to convert to signed values
- movq mm3, mm7 ; offseted ; q0
-
- psubsb mm7, mm6 ; q0 - p0
- paddsb mm0, mm7 ; p1 - q1 + 1 * (q0 - p0)
-
- paddsb mm0, mm7 ; p1 - q1 + 2 * (q0 - p0)
- paddsb mm0, mm7 ; p1 - q1 + 3 * (q0 - p0)
-
- pand mm5, mm0 ; mask filter values we don't care about
-
- paddsb mm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
-
- movq mm0, mm5 ; get a copy of filters
- psllw mm0, 8 ; shift left 8
- psraw mm0, 3 ; arithmetic shift right 11
- psrlw mm0, 8
-
- movq mm7, mm5 ; get a copy of filters
- psraw mm7, 11 ; arithmetic shift right 11
- psllw mm7, 8 ; shift left 8 to put it back
-
- por mm0, mm7 ; put the two together to get result
-
- psubsb mm3, mm0 ; q0-= q0sz add
- pxor mm3, [GLOBAL(t80)] ; unoffset
-
- ; now do +3 side
- psubsb mm5, [GLOBAL(t1s)] ; +3 instead of +4
-
- movq mm0, mm5 ; get a copy of filters
- psllw mm0, 8 ; shift left 8
- psraw mm0, 3 ; arithmetic shift right 11
- psrlw mm0, 8
-
- psraw mm5, 11 ; arithmetic shift right 11
- psllw mm5, 8 ; shift left 8 to put it back
- por mm0, mm5 ; put the two together to get result
-
- paddsb mm6, mm0 ; p0+= p0 add
- pxor mm6, [GLOBAL(t80)] ; unoffset
-
-
- movq mm0, t0
- movq mm4, t1
-
- ; mm0 = 70 60 50 40 30 20 10 00
- ; mm6 = 71 61 51 41 31 21 11 01
- ; mm3 = 72 62 52 42 32 22 12 02
- ; mm4 = 73 63 53 43 33 23 13 03
- ; transpose back to write out
-
- movq mm1, mm0 ;
- punpcklbw mm0, mm6 ; 31 30 21 20 11 10 01 00
-
- punpckhbw mm1, mm6 ; 71 70 61 60 51 50 41 40
- movq mm2, mm3 ;
-
- punpcklbw mm2, mm4 ; 33 32 23 22 13 12 03 02
- movq mm5, mm1 ; 71 70 61 60 51 50 41 40
-
- punpckhbw mm3, mm4 ; 73 72 63 62 53 52 43 42
- movq mm6, mm0 ; 31 30 21 20 11 10 01 00
-
- punpcklwd mm0, mm2 ; 13 12 11 10 03 02 01 00
- punpckhwd mm6, mm2 ; 33 32 31 30 23 22 21 20
-
- movd [rsi+rax*4], mm0 ; write 03 02 01 00
- punpcklwd mm1, mm3 ; 53 52 51 50 43 42 41 40
-
- psrlq mm0, 32 ; xx xx xx xx 13 12 11 10
- punpckhwd mm5, mm3 ; 73 72 71 70 63 62 61 60
-
- movd [rdi+rax*4], mm0 ; write 13 12 11 10
- movd [rsi+rax*2], mm6 ; write 23 22 21 20
-
- psrlq mm6, 32 ; 33 32 31 30
- movd [rsi], mm1 ; write 43 42 41 40
-
- movd [rsi + rax], mm6 ; write 33 32 31 30
- neg rax
-
- movd [rsi + rax*2], mm5 ; write 63 62 61 60
- psrlq mm1, 32 ; 53 52 51 50
-
- movd [rdi], mm1 ; write out 53 52 51 50
- psrlq mm5, 32 ; 73 72 71 70
-
- movd [rdi + rax*2], mm5 ; write 73 72 71 70
-
- lea rsi, [rsi+rax*8] ; next 8
-
- dec rcx
- jnz .nexts8_v
-
- add rsp, 32
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-
-;void fast_loop_filter_vertical_edges_mmx(unsigned char *y_ptr,
-; int y_stride,
-; loop_filter_info *lfi)
-;{
-;
-;
-; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+4, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+8, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-; vp9_loop_filter_simple_vertical_edge_mmx(y_ptr+12, y_stride, lfi->flim,lfi->lim,lfi->thr,2);
-;}
-
SECTION_RODATA
align 16
tfe:
diff --git a/vp9/common/x86/vp9_loopfilter_sse2.asm b/vp9/common/x86/vp9_loopfilter_sse2.asm
index ae4c60f53..74236cfbb 100644
--- a/vp9/common/x86/vp9_loopfilter_sse2.asm
+++ b/vp9/common/x86/vp9_loopfilter_sse2.asm
@@ -845,372 +845,6 @@ sym(vp9_loop_filter_vertical_edge_uv_sse2):
pop rbp
ret
-;void vp9_loop_filter_simple_horizontal_edge_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-;)
-global sym(vp9_loop_filter_simple_horizontal_edge_sse2) PRIVATE
-sym(vp9_loop_filter_simple_horizontal_edge_sse2):
- push rbp
- mov rbp, rsp
- SHADOW_ARGS_TO_STACK 3
- SAVE_XMM 7
- GET_GOT rbx
- push rsi
- push rdi
- ; end prolog
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
- mov rdx, arg(2) ;blimit
- movdqa xmm3, XMMWORD PTR [rdx]
-
- mov rdi, rsi ; rdi points to row +1 for indirect addressing
- add rdi, rax
- neg rax
-
- ; calculate mask
- movdqa xmm1, [rsi+2*rax] ; p1
- movdqa xmm0, [rdi] ; q1
- movdqa xmm2, xmm1
- movdqa xmm7, xmm0
- movdqa xmm4, xmm0
- psubusb xmm0, xmm1 ; q1-=p1
- psubusb xmm1, xmm4 ; p1-=q1
- por xmm1, xmm0 ; abs(p1-q1)
- pand xmm1, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw xmm1, 1 ; abs(p1-q1)/2
-
- movdqa xmm5, [rsi+rax] ; p0
- movdqa xmm4, [rsi] ; q0
- movdqa xmm0, xmm4 ; q0
- movdqa xmm6, xmm5 ; p0
- psubusb xmm5, xmm4 ; p0-=q0
- psubusb xmm4, xmm6 ; q0-=p0
- por xmm5, xmm4 ; abs(p0 - q0)
- paddusb xmm5, xmm5 ; abs(p0-q0)*2
- paddusb xmm5, xmm1 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- psubusb xmm5, xmm3 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
- pxor xmm3, xmm3
- pcmpeqb xmm5, xmm3
-
- ; start work on filters
- pxor xmm2, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor xmm7, [GLOBAL(t80)] ; q1 offset to convert to signed values
- psubsb xmm2, xmm7 ; p1 - q1
-
- pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
- pxor xmm0, [GLOBAL(t80)] ; offset to convert to signed values
- movdqa xmm3, xmm0 ; q0
- psubsb xmm0, xmm6 ; q0 - p0
- paddsb xmm2, xmm0 ; p1 - q1 + 1 * (q0 - p0)
- paddsb xmm2, xmm0 ; p1 - q1 + 2 * (q0 - p0)
- paddsb xmm2, xmm0 ; p1 - q1 + 3 * (q0 - p0)
- pand xmm5, xmm2 ; mask filter values we don't care about
-
- ; do + 4 side
- paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
-
- movdqa xmm0, xmm5 ; get a copy of filters
- psllw xmm0, 8 ; shift left 8
- psraw xmm0, 3 ; arithmetic shift right 11
- psrlw xmm0, 8
- movdqa xmm1, xmm5 ; get a copy of filters
- psraw xmm1, 11 ; arithmetic shift right 11
- psllw xmm1, 8 ; shift left 8 to put it back
-
- por xmm0, xmm1 ; put the two together to get result
-
- psubsb xmm3, xmm0 ; q0-= q0 add
- pxor xmm3, [GLOBAL(t80)] ; unoffset
- movdqa [rsi], xmm3 ; write back
-
- ; now do +3 side
- psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
-
- movdqa xmm0, xmm5 ; get a copy of filters
- psllw xmm0, 8 ; shift left 8
- psraw xmm0, 3 ; arithmetic shift right 11
- psrlw xmm0, 8
- psraw xmm5, 11 ; arithmetic shift right 11
- psllw xmm5, 8 ; shift left 8 to put it back
- por xmm0, xmm5 ; put the two together to get result
-
-
- paddsb xmm6, xmm0 ; p0+= p0 add
- pxor xmm6, [GLOBAL(t80)] ; unoffset
- movdqa [rsi+rax], xmm6 ; write back
-
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
-
-;void vp9_loop_filter_simple_vertical_edge_sse2
-;(
-; unsigned char *src_ptr,
-; int src_pixel_step,
-; const char *blimit,
-;)
-global sym(vp9_loop_filter_simple_vertical_edge_sse2) PRIVATE
-sym(vp9_loop_filter_simple_vertical_edge_sse2):
- push rbp ; save old base pointer value.
- mov rbp, rsp ; set new base pointer value.
- SHADOW_ARGS_TO_STACK 3
- SAVE_XMM 7
- GET_GOT rbx ; save callee-saved reg
- push rsi
- push rdi
- ; end prolog
-
- ALIGN_STACK 16, rax
- sub rsp, 32 ; reserve 32 bytes
- %define t0 [rsp + 0] ;__declspec(align(16)) char t0[16];
- %define t1 [rsp + 16] ;__declspec(align(16)) char t1[16];
-
- mov rsi, arg(0) ;src_ptr
- movsxd rax, dword ptr arg(1) ;src_pixel_step ; destination pitch?
-
- lea rsi, [rsi - 2 ]
- lea rdi, [rsi + rax]
- lea rdx, [rsi + rax*4]
- lea rcx, [rdx + rax]
-
- movd xmm0, [rsi] ; (high 96 bits unused) 03 02 01 00
- movd xmm1, [rdx] ; (high 96 bits unused) 43 42 41 40
- movd xmm2, [rdi] ; 13 12 11 10
- movd xmm3, [rcx] ; 53 52 51 50
- punpckldq xmm0, xmm1 ; (high 64 bits unused) 43 42 41 40 03 02 01 00
- punpckldq xmm2, xmm3 ; 53 52 51 50 13 12 11 10
-
- movd xmm4, [rsi + rax*2] ; 23 22 21 20
- movd xmm5, [rdx + rax*2] ; 63 62 61 60
- movd xmm6, [rdi + rax*2] ; 33 32 31 30
- movd xmm7, [rcx + rax*2] ; 73 72 71 70
- punpckldq xmm4, xmm5 ; 63 62 61 60 23 22 21 20
- punpckldq xmm6, xmm7 ; 73 72 71 70 33 32 31 30
-
- punpcklbw xmm0, xmm2 ; 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
- punpcklbw xmm4, xmm6 ; 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
-
- movdqa xmm1, xmm0
- punpcklwd xmm0, xmm4 ; 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
- punpckhwd xmm1, xmm4 ; 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
-
- movdqa xmm2, xmm0
- punpckldq xmm0, xmm1 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
- punpckhdq xmm2, xmm1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
-
- movdqa t0, xmm0 ; save to t0
- movdqa t1, xmm2 ; save to t1
-
- lea rsi, [rsi + rax*8]
- lea rdi, [rsi + rax]
- lea rdx, [rsi + rax*4]
- lea rcx, [rdx + rax]
-
- movd xmm4, [rsi] ; 83 82 81 80
- movd xmm1, [rdx] ; c3 c2 c1 c0
- movd xmm6, [rdi] ; 93 92 91 90
- movd xmm3, [rcx] ; d3 d2 d1 d0
- punpckldq xmm4, xmm1 ; c3 c2 c1 c0 83 82 81 80
- punpckldq xmm6, xmm3 ; d3 d2 d1 d0 93 92 91 90
-
- movd xmm0, [rsi + rax*2] ; a3 a2 a1 a0
- movd xmm5, [rdx + rax*2] ; e3 e2 e1 e0
- movd xmm2, [rdi + rax*2] ; b3 b2 b1 b0
- movd xmm7, [rcx + rax*2] ; f3 f2 f1 f0
- punpckldq xmm0, xmm5 ; e3 e2 e1 e0 a3 a2 a1 a0
- punpckldq xmm2, xmm7 ; f3 f2 f1 f0 b3 b2 b1 b0
-
- punpcklbw xmm4, xmm6 ; d3 c3 d2 c2 d1 c1 d0 c0 93 83 92 82 91 81 90 80
- punpcklbw xmm0, xmm2 ; f3 e3 f2 e2 f1 e1 f0 e0 b3 a3 b2 a2 b1 a1 b0 a0
-
- movdqa xmm1, xmm4
- punpcklwd xmm4, xmm0 ; b3 a3 93 83 b2 a2 92 82 b1 a1 91 81 b0 a0 90 80
- punpckhwd xmm1, xmm0 ; f3 e3 d3 c3 f2 e2 d2 c2 f1 e1 d1 c1 f0 e0 d0 c0
-
- movdqa xmm6, xmm4
- punpckldq xmm4, xmm1 ; f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
- punpckhdq xmm6, xmm1 ; f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
-
- movdqa xmm0, t0 ; 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
- movdqa xmm2, t1 ; 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
- movdqa xmm1, xmm0
- movdqa xmm3, xmm2
-
- punpcklqdq xmm0, xmm4 ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
- punpckhqdq xmm1, xmm4 ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
- punpcklqdq xmm2, xmm6 ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- punpckhqdq xmm3, xmm6 ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
-
- ; calculate mask
- movdqa xmm6, xmm0 ; p1
- movdqa xmm7, xmm3 ; q1
- psubusb xmm7, xmm0 ; q1-=p1
- psubusb xmm6, xmm3 ; p1-=q1
- por xmm6, xmm7 ; abs(p1-q1)
- pand xmm6, [GLOBAL(tfe)] ; set lsb of each byte to zero
- psrlw xmm6, 1 ; abs(p1-q1)/2
-
- movdqa xmm5, xmm1 ; p0
- movdqa xmm4, xmm2 ; q0
- psubusb xmm5, xmm2 ; p0-=q0
- psubusb xmm4, xmm1 ; q0-=p0
- por xmm5, xmm4 ; abs(p0 - q0)
- paddusb xmm5, xmm5 ; abs(p0-q0)*2
- paddusb xmm5, xmm6 ; abs (p0 - q0) *2 + abs(p1-q1)/2
-
- mov rdx, arg(2) ;blimit
- movdqa xmm7, XMMWORD PTR [rdx]
-
- psubusb xmm5, xmm7 ; abs(p0 - q0) *2 + abs(p1-q1)/2 > blimit
- pxor xmm7, xmm7
- pcmpeqb xmm5, xmm7 ; mm5 = mask
-
- ; start work on filters
- movdqa t0, xmm0
- movdqa t1, xmm3
-
- pxor xmm0, [GLOBAL(t80)] ; p1 offset to convert to signed values
- pxor xmm3, [GLOBAL(t80)] ; q1 offset to convert to signed values
-
- psubsb xmm0, xmm3 ; p1 - q1
- movdqa xmm6, xmm1 ; p0
-
- movdqa xmm7, xmm2 ; q0
- pxor xmm6, [GLOBAL(t80)] ; offset to convert to signed values
-
- pxor xmm7, [GLOBAL(t80)] ; offset to convert to signed values
- movdqa xmm3, xmm7 ; offseted ; q0
-
- psubsb xmm7, xmm6 ; q0 - p0
- paddsb xmm0, xmm7 ; p1 - q1 + 1 * (q0 - p0)
-
- paddsb xmm0, xmm7 ; p1 - q1 + 2 * (q0 - p0)
- paddsb xmm0, xmm7 ; p1 - q1 + 3 * (q0 - p0)
-
- pand xmm5, xmm0 ; mask filter values we don't care about
-
-
- paddsb xmm5, [GLOBAL(t4)] ; 3* (q0 - p0) + (p1 - q1) + 4
-
- movdqa xmm0, xmm5 ; get a copy of filters
- psllw xmm0, 8 ; shift left 8
-
- psraw xmm0, 3 ; arithmetic shift right 11
- psrlw xmm0, 8
-
- movdqa xmm7, xmm5 ; get a copy of filters
- psraw xmm7, 11 ; arithmetic shift right 11
-
- psllw xmm7, 8 ; shift left 8 to put it back
- por xmm0, xmm7 ; put the two together to get result
-
- psubsb xmm3, xmm0 ; q0-= q0sz add
- pxor xmm3, [GLOBAL(t80)] ; unoffset q0
-
- ; now do +3 side
- psubsb xmm5, [GLOBAL(t1s)] ; +3 instead of +4
- movdqa xmm0, xmm5 ; get a copy of filters
-
- psllw xmm0, 8 ; shift left 8
- psraw xmm0, 3 ; arithmetic shift right 11
-
- psrlw xmm0, 8
- psraw xmm5, 11 ; arithmetic shift right 11
-
- psllw xmm5, 8 ; shift left 8 to put it back
- por xmm0, xmm5 ; put the two together to get result
-
- paddsb xmm6, xmm0 ; p0+= p0 add
- pxor xmm6, [GLOBAL(t80)] ; unoffset p0
-
- movdqa xmm0, t0 ; p1
- movdqa xmm4, t1 ; q1
-
- ; transpose back to write out
- ; p1 f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
- ; p0 f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
- ; q0 f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
- ; q1 f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
- movdqa xmm1, xmm0
- punpcklbw xmm0, xmm6 ; 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
- punpckhbw xmm1, xmm6 ; f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
-
- movdqa xmm5, xmm3
- punpcklbw xmm3, xmm4 ; 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
- punpckhbw xmm5, xmm4 ; f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
-
- movdqa xmm2, xmm0
- punpcklwd xmm0, xmm3 ; 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
- punpckhwd xmm2, xmm3 ; 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
-
- movdqa xmm3, xmm1
- punpcklwd xmm1, xmm5 ; b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
- punpckhwd xmm3, xmm5 ; f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
-
- ; write out order: xmm0 xmm2 xmm1 xmm3
- lea rdx, [rsi + rax*4]
-
- movd [rsi], xmm1 ; write the second 8-line result
- psrldq xmm1, 4
- movd [rdi], xmm1
- psrldq xmm1, 4
- movd [rsi + rax*2], xmm1
- psrldq xmm1, 4
- movd [rdi + rax*2], xmm1
-
- movd [rdx], xmm3
- psrldq xmm3, 4
- movd [rcx], xmm3
- psrldq xmm3, 4
- movd [rdx + rax*2], xmm3
- psrldq xmm3, 4
- movd [rcx + rax*2], xmm3
-
- neg rax
- lea rsi, [rsi + rax*8]
- neg rax
- lea rdi, [rsi + rax]
- lea rdx, [rsi + rax*4]
- lea rcx, [rdx + rax]
-
- movd [rsi], xmm0 ; write the first 8-line result
- psrldq xmm0, 4
- movd [rdi], xmm0
- psrldq xmm0, 4
- movd [rsi + rax*2], xmm0
- psrldq xmm0, 4
- movd [rdi + rax*2], xmm0
-
- movd [rdx], xmm2
- psrldq xmm2, 4
- movd [rcx], xmm2
- psrldq xmm2, 4
- movd [rdx + rax*2], xmm2
- psrldq xmm2, 4
- movd [rcx + rax*2], xmm2
-
- add rsp, 32
- pop rsp
- ; begin epilog
- pop rdi
- pop rsi
- RESTORE_GOT
- RESTORE_XMM
- UNSHADOW_ARGS
- pop rbp
- ret
-
SECTION_RODATA
align 16
tfe:
diff --git a/vp9/common/x86/vp9_loopfilter_x86.h b/vp9/common/x86/vp9_loopfilter_x86.h
index 46a6202d2..fb5af05f7 100644
--- a/vp9/common/x86/vp9_loopfilter_x86.h
+++ b/vp9/common/x86/vp9_loopfilter_x86.h
@@ -23,10 +23,6 @@ extern prototype_loopfilter_block(vp9_loop_filter_mbv_mmx);
extern prototype_loopfilter_block(vp9_loop_filter_bv_mmx);
extern prototype_loopfilter_block(vp9_loop_filter_mbh_mmx);
extern prototype_loopfilter_block(vp9_loop_filter_bh_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_mmx);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_mmx);
#endif
#if HAVE_SSE2
@@ -34,10 +30,6 @@ extern prototype_loopfilter_block(vp9_loop_filter_mbv_sse2);
extern prototype_loopfilter_block(vp9_loop_filter_bv_sse2);
extern prototype_loopfilter_block(vp9_loop_filter_mbh_sse2);
extern prototype_loopfilter_block(vp9_loop_filter_bh_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_vertical_edge_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_bvs_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_simple_horizontal_edge_sse2);
-extern prototype_simple_loopfilter(vp9_loop_filter_bhs_sse2);
#endif
#endif // LOOPFILTER_X86_H
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 4be36774a..70db06dc1 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -811,12 +811,12 @@ static void decode_tile(VP9D_COMP *pbi, vp9_reader *r) {
int mi_row, mi_col;
for (mi_row = pc->cur_tile_mi_row_start;
- mi_row < pc->cur_tile_mi_row_end; mi_row += 8) {
+ mi_row < pc->cur_tile_mi_row_end; mi_row += 64 / MI_SIZE) {
// For a SB there are 2 left contexts, each pertaining to a MB row within
vpx_memset(&pc->left_context, 0, sizeof(pc->left_context));
vpx_memset(pc->left_seg_context, 0, sizeof(pc->left_seg_context));
for (mi_col = pc->cur_tile_mi_col_start;
- mi_col < pc->cur_tile_mi_col_end; mi_col += 8)
+ mi_col < pc->cur_tile_mi_col_end; mi_col += 64 / MI_SIZE)
decode_modes_sb(pbi, mi_row, mi_col, r, BLOCK_SIZE_SB64X64);
}
}
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index ce2a86b4f..1d7e093cf 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -62,7 +62,7 @@ DECLARE_ALIGNED(16, extern const uint8_t, vp9_norm[256]);
#define INCREMENT_COUNT(token) \
do { \
- coef_counts[type][ref][get_coef_band(scan, txfm_size, c)] \
+ coef_counts[type][ref][band] \
[pt][token]++; \
token_cache[scan[c]] = token; \
} while (0)
@@ -96,6 +96,7 @@ static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
ENTROPY_CONTEXT above_ec, left_ec;
FRAME_CONTEXT *const fc = &dx->common.fc;
int pt, c = 0, pad, default_eob;
+ int band;
vp9_coeff_probs *coef_probs;
vp9_prob *prob;
vp9_coeff_count *coef_counts;
@@ -162,7 +163,6 @@ static int decode_coefs(VP9D_COMP *dx, const MACROBLOCKD *xd,
while (1) {
int val;
- int band;
const uint8_t *cat6 = cat6_prob;
if (c >= seg_eob)
break;
@@ -249,8 +249,7 @@ SKIP_START:
}
if (c < seg_eob)
- coef_counts[type][ref][get_coef_band(scan, txfm_size, c)]
- [pt][DCT_EOB_TOKEN]++;
+ coef_counts[type][ref][band][pt][DCT_EOB_TOKEN]++;
for (pt = 0; pt < (1 << txfm_size); pt++) {
A[pt] = L[pt] = c > 0;
diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 6bc42c7ff..44261481c 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h
@@ -138,8 +138,8 @@ struct macroblock {
int optimize;
- // Structure to hold context for each of the 4 MBs within a SB:
- // when encoded as 4 independent MBs:
+ // TODO(jingning): Need to refactor the structure arrays that buffer the
+ // coding mode decisions of each partition type.
PICK_MODE_CONTEXT sb8_context[4][4][4];
PICK_MODE_CONTEXT sb8x16_context[4][4][2];
PICK_MODE_CONTEXT sb16x8_context[4][4][2];
@@ -153,6 +153,10 @@ struct macroblock {
PICK_MODE_CONTEXT sb64_context;
int partition_cost[NUM_PARTITION_CONTEXTS][PARTITION_TYPES];
+ BLOCK_SIZE_TYPE mb_partitioning[4][4];
+ BLOCK_SIZE_TYPE sb_partitioning[4];
+ BLOCK_SIZE_TYPE sb64_partitioning;
+
void (*fwd_txm4x4)(int16_t *input, int16_t *output, int pitch);
void (*fwd_txm8x4)(int16_t *input, int16_t *output, int pitch);
void (*fwd_txm8x8)(int16_t *input, int16_t *output, int pitch);
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 418f60edc..3345f8965 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -731,6 +731,9 @@ static void set_block_index(MACROBLOCKD *xd, int idx,
}
}
+// TODO(jingning): the variables used here are a little complicated. Further
+// refactoring is needed to organize the temporary buffers once recursive
+// partitioning down to the 4x4 block size is enabled.
static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
BLOCK_SIZE_TYPE bsize) {
MACROBLOCKD *const xd = &x->e_mbd;
@@ -762,6 +765,72 @@ static PICK_MODE_CONTEXT *get_block_context(MACROBLOCK *x,
}
}
+static int *get_sb_index(MACROBLOCKD *xd, BLOCK_SIZE_TYPE subsize) {
+ switch (subsize) {
+ case BLOCK_SIZE_SB64X32:
+ case BLOCK_SIZE_SB32X64:
+ case BLOCK_SIZE_SB32X32:
+ return &xd->sb_index;
+ case BLOCK_SIZE_SB32X16:
+ case BLOCK_SIZE_SB16X32:
+ case BLOCK_SIZE_MB16X16:
+ return &xd->mb_index;
+ case BLOCK_SIZE_SB16X8:
+ case BLOCK_SIZE_SB8X16:
+ case BLOCK_SIZE_SB8X8:
+ return &xd->b_index;
+ default:
+ assert(0);
+ return NULL;
+ }
+}
+
+static BLOCK_SIZE_TYPE *get_sb_partitioning(MACROBLOCK *x,
+ BLOCK_SIZE_TYPE bsize) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ switch (bsize) {
+ case BLOCK_SIZE_SB64X64:
+ return &x->sb64_partitioning;
+ case BLOCK_SIZE_SB32X32:
+ return &x->sb_partitioning[xd->sb_index];
+ case BLOCK_SIZE_MB16X16:
+ return &x->mb_partitioning[xd->sb_index][xd->mb_index];
+ default:
+ assert(0);
+ return NULL;
+ }
+}
+
+static void restore_context(VP9_COMP *cpi, int mi_row, int mi_col,
+ ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
+ ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
+ PARTITION_CONTEXT sa[8],
+ PARTITION_CONTEXT sl[8],
+ BLOCK_SIZE_TYPE bsize) {
+ VP9_COMMON *const cm = &cpi->common;
+ MACROBLOCK *const x = &cpi->mb;
+ MACROBLOCKD *const xd = &x->e_mbd;
+ int p;
+ int bwl = b_width_log2(bsize), bw = 1 << bwl;
+ int bhl = b_height_log2(bsize), bh = 1 << bhl;
+ int mwl = mi_width_log2(bsize), mw = 1 << mwl;
+ int mhl = mi_height_log2(bsize), mh = 1 << mhl;
+ for (p = 0; p < MAX_MB_PLANE; p++) {
+ vpx_memcpy(cm->above_context[p] +
+ ((mi_col * 2) >> xd->plane[p].subsampling_x),
+ a + bw * p,
+ sizeof(ENTROPY_CONTEXT) * bw >> xd->plane[p].subsampling_x);
+ vpx_memcpy(cm->left_context[p] +
+ ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+ l + bh * p,
+ sizeof(ENTROPY_CONTEXT) * bh >> xd->plane[p].subsampling_y);
+ }
+ vpx_memcpy(cm->above_seg_context + mi_col, sa,
+ sizeof(PARTITION_CONTEXT) * mw);
+ vpx_memcpy(cm->left_seg_context + (mi_row & MI_MASK), sl,
+ sizeof(PARTITION_CONTEXT) * mh);
+}
+
static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
int mi_row, int mi_col, int output_enabled,
BLOCK_SIZE_TYPE bsize, int sub_index) {
@@ -788,27 +857,28 @@ static void encode_b(VP9_COMP *cpi, TOKENEXTRA **tp,
static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
int mi_row, int mi_col, int output_enabled,
- BLOCK_SIZE_TYPE level,
- BLOCK_SIZE_TYPE c1, BLOCK_SIZE_TYPE c2[4],
- BLOCK_SIZE_TYPE c3[4][4]
- ) {
+ BLOCK_SIZE_TYPE bsize) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- const int bsl = mi_width_log2(level), bs = 1 << (bsl - 1);
- const int bwl = mi_width_log2(c1), bhl = mi_height_log2(c1);
+ BLOCK_SIZE_TYPE c1 = BLOCK_SIZE_SB8X8;
+ const int bsl = mi_width_log2(bsize), bs = 1 << (bsl - 1);
+ int bwl, bhl;
int UNINITIALIZED_IS_SAFE(pl);
if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
return;
- if (level > BLOCK_SIZE_SB8X8) {
+ if (bsize > BLOCK_SIZE_SB8X8) {
set_partition_seg_context(cpi, mi_row, mi_col);
- pl = partition_plane_context(xd, level);
+ pl = partition_plane_context(xd, bsize);
+ c1 = *(get_sb_partitioning(x, bsize));
}
+ bwl = mi_width_log2(c1), bhl = mi_height_log2(c1);
+
if (bsl == bwl && bsl == bhl) {
- if (output_enabled && level > BLOCK_SIZE_SB8X8)
+ if (output_enabled && bsize > BLOCK_SIZE_SB8X8)
cpi->partition_count[pl][PARTITION_NONE]++;
encode_b(cpi, tp, mi_row, mi_col, output_enabled, c1, -1);
} else if (bsl == bhl && bsl > bwl) {
@@ -826,12 +896,12 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
int i;
assert(bwl < bsl && bhl < bsl);
- if (level == BLOCK_SIZE_SB64X64) {
+ if (bsize == BLOCK_SIZE_SB64X64) {
subsize = BLOCK_SIZE_SB32X32;
- } else if (level == BLOCK_SIZE_SB32X32) {
+ } else if (bsize == BLOCK_SIZE_SB32X32) {
subsize = BLOCK_SIZE_MB16X16;
} else {
- assert(level == BLOCK_SIZE_MB16X16);
+ assert(bsize == BLOCK_SIZE_MB16X16);
subsize = BLOCK_SIZE_SB8X8;
}
@@ -843,554 +913,200 @@ static void encode_sb(VP9_COMP *cpi, TOKENEXTRA **tp,
set_block_index(xd, i, subsize);
encode_sb(cpi, tp, mi_row + y_idx * bs, mi_col + x_idx * bs,
- output_enabled, subsize,
- c2 ? c2[i] : c1, c3 ? c3[i] : NULL, NULL);
+ output_enabled, subsize);
}
}
- if (level > BLOCK_SIZE_SB8X8 &&
- (level == BLOCK_SIZE_MB16X16 || bsl == bwl || bsl == bhl)) {
+ if (bsize > BLOCK_SIZE_SB8X8 &&
+ (bsize == BLOCK_SIZE_MB16X16 || bsl == bwl || bsl == bhl)) {
set_partition_seg_context(cpi, mi_row, mi_col);
- update_partition_context(xd, c1, level);
+ update_partition_context(xd, c1, bsize);
}
}
-static void encode_sb_row(VP9_COMP *cpi,
- int mi_row,
- TOKENEXTRA **tp,
- int *totalrate) {
+
+// TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
+// unlikely to be selected depending on previous rate-distortion optimization
+// results, for encoding speed-up.
+static void rd_pick_partition(VP9_COMP *cpi, TOKENEXTRA **tp,
+ int mi_row, int mi_col,
+ BLOCK_SIZE_TYPE bsize,
+ int *rate, int *dist) {
VP9_COMMON *const cm = &cpi->common;
MACROBLOCK *const x = &cpi->mb;
MACROBLOCKD *const xd = &x->e_mbd;
- int mi_col, pl;
-
- // Initialize the left context for the new SB row
- vpx_memset(&cm->left_context, 0, sizeof(cm->left_context));
- vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context));
-
- // Code each SB in the row
- for (mi_col = cm->cur_tile_mi_col_start;
- mi_col < cm->cur_tile_mi_col_end; mi_col += 8) {
- int i, p;
- BLOCK_SIZE_TYPE mb_partitioning[4][4];
- BLOCK_SIZE_TYPE sb_partitioning[4];
- BLOCK_SIZE_TYPE sb64_partitioning = BLOCK_SIZE_SB32X32;
- int sb64_rate = 0, sb64_dist = 0;
- int sb64_skip = 0;
- ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
- PARTITION_CONTEXT seg_l[64 / MI_SIZE], seg_a[64 / MI_SIZE];
- TOKENEXTRA *tp_orig = *tp;
-
- for (p = 0; p < MAX_MB_PLANE; p++) {
- memcpy(a + 16 * p, cm->above_context[p] +
- (mi_col * 2 >> xd->plane[p].subsampling_x),
- sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
- memcpy(l + 16 * p, cm->left_context[p],
- sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
- }
- vpx_memcpy(&seg_a, cm->above_seg_context + mi_col, sizeof(seg_a));
- vpx_memcpy(&seg_l, cm->left_seg_context, sizeof(seg_l));
+ int bsl = b_width_log2(bsize), bs = 1 << bsl;
+ int msl = mi_height_log2(bsize), ms = 1 << msl;
+ ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
+ PARTITION_CONTEXT sl[8], sa[8];
+ TOKENEXTRA *tp_orig = *tp;
+ int i, p, pl;
+ BLOCK_SIZE_TYPE subsize;
+ int srate = INT_MAX, sdist = INT_MAX;
+
+ assert(mi_height_log2(bsize) == mi_width_log2(bsize));
+
+ // buffer the above/left context information of the block in search.
+ for (p = 0; p < MAX_MB_PLANE; ++p) {
+ vpx_memcpy(a + bs * p, cm->above_context[p] +
+ (mi_col * 2 >> xd->plane[p].subsampling_x),
+ sizeof(ENTROPY_CONTEXT) * bs >> xd->plane[p].subsampling_x);
+ vpx_memcpy(l + bs * p, cm->left_context[p] +
+ ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+ sizeof(ENTROPY_CONTEXT) * bs >> xd->plane[p].subsampling_y);
+ }
+ vpx_memcpy(sa, cm->above_seg_context + mi_col,
+ sizeof(PARTITION_CONTEXT) * ms);
+ vpx_memcpy(sl, cm->left_seg_context + (mi_row & MI_MASK),
+ sizeof(PARTITION_CONTEXT) * ms);
+
+ // PARTITION_SPLIT
+ if (bsize >= BLOCK_SIZE_MB16X16) {
+ int r4 = 0, d4 = 0;
+ subsize = get_subsize(bsize, PARTITION_SPLIT);
+ *(get_sb_partitioning(x, bsize)) = subsize;
+
+ for (i = 0; i < 4; ++i) {
+ int x_idx = (i & 1) * (ms >> 1);
+ int y_idx = (i >> 1) * (ms >> 1);
+ int r, d;
- // FIXME(rbultje): this function should probably be rewritten to be
- // recursive at some point in the future.
- for (i = 0; i < 4; i++) {
- const int x_idx = (i & 1) << 2;
- const int y_idx = (i & 2) << 1;
- int sb32_rate = 0, sb32_dist = 0;
- int splitmodes_used = 0;
- int sb32_skip = 0;
- int j;
- ENTROPY_CONTEXT l2[8 * MAX_MB_PLANE], a2[8 * MAX_MB_PLANE];
- PARTITION_CONTEXT sl32[32 / MI_SIZE], sa32[32 / MI_SIZE];
-
- sb_partitioning[i] = BLOCK_SIZE_MB16X16;
- if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
+ if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
continue;
- xd->sb_index = i;
-
- /* Function should not modify L & A contexts; save and restore on exit */
- for (p = 0; p < MAX_MB_PLANE; p++) {
- vpx_memcpy(l2 + 8 * p,
- cm->left_context[p] +
- (y_idx * 2 >> xd->plane[p].subsampling_y),
- sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
- vpx_memcpy(a2 + 8 * p,
- cm->above_context[p] +
- ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
- sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
- }
- vpx_memcpy(&sa32, cm->above_seg_context + mi_col + x_idx, sizeof(sa32));
- vpx_memcpy(&sl32, cm->left_seg_context + y_idx, sizeof(sl32));
-
- /* Encode MBs in raster order within the SB */
- for (j = 0; j < 4; j++) {
- const int x_idx_m = x_idx + ((j & 1) << 1);
- const int y_idx_m = y_idx + ((j >> 1) << 1);
- int r, d;
- int r2, d2, mb16_rate = 0, mb16_dist = 0, k;
- ENTROPY_CONTEXT l3[4 * MAX_MB_PLANE], a3[4 * MAX_MB_PLANE];
- PARTITION_CONTEXT sl16[16 / MI_SIZE], sa16[16 / MI_SIZE];
-
- mb_partitioning[i][j] = BLOCK_SIZE_SB8X8;
-
- if (mi_row + y_idx_m >= cm->mi_rows ||
- mi_col + x_idx_m >= cm->mi_cols) {
- // MB lies outside frame, move on
- continue;
- }
-
- // Index of the MB in the SB 0..3
- xd->mb_index = j;
-
- for (p = 0; p < MAX_MB_PLANE; p++) {
- vpx_memcpy(l3 + 4 * p,
- cm->left_context[p] +
- (y_idx_m * 2 >> xd->plane[p].subsampling_y),
- sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
- vpx_memcpy(a3 + 4 * p,
- cm->above_context[p] +
- ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
- sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
- }
- vpx_memcpy(&sa16, cm->above_seg_context + mi_col + x_idx_m,
- sizeof(sa16));
- vpx_memcpy(&sl16, cm->left_seg_context + y_idx_m, sizeof(sl16));
-
- for (k = 0; k < 4; k++) {
- xd->b_index = k;
-
- // try 8x8 coding
- pick_sb_modes(cpi, mi_row + y_idx_m + (k >> 1),
- mi_col + x_idx_m + (k & 1),
- tp, &r, &d, BLOCK_SIZE_SB8X8,
- &x->sb8_context[xd->sb_index][xd->mb_index]
- [xd->b_index]);
- mb16_rate += r;
- mb16_dist += d;
- update_state(cpi, &x->sb8_context[xd->sb_index][xd->mb_index]
- [xd->b_index],
- BLOCK_SIZE_SB8X8, 0);
- encode_superblock(cpi, tp,
- 0, mi_row + y_idx_m + (k >> 1),
- mi_col + x_idx_m + (k & 1),
- BLOCK_SIZE_SB8X8);
- }
- set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
- pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
- mb16_rate += x->partition_cost[pl][PARTITION_SPLIT];
- for (p = 0; p < MAX_MB_PLANE; p++) {
- vpx_memcpy(cm->left_context[p] +
- (y_idx_m * 2 >> xd->plane[p].subsampling_y),
- l3 + 4 * p,
- sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
- vpx_memcpy(cm->above_context[p] +
- ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
- a3 + 4 * p,
- sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
- }
- vpx_memcpy(cm->above_seg_context + mi_col + x_idx_m,
- sa16, sizeof(sa16));
- vpx_memcpy(cm->left_seg_context + y_idx_m, sl16, sizeof(sl16));
-
- // try 8x16 coding
- r2 = 0;
- d2 = 0;
- xd->b_index = 0;
- pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
- tp, &r, &d, BLOCK_SIZE_SB8X16,
- &x->sb8x16_context[xd->sb_index][xd->mb_index]
- [xd->b_index]);
- r2 += r;
- d2 += d;
- update_state(cpi, &x->sb8x16_context[xd->sb_index][xd->mb_index]
- [xd->b_index],
- BLOCK_SIZE_SB8X16, 0);
- encode_superblock(cpi, tp,
- 0, mi_row + y_idx_m, mi_col + x_idx_m,
- BLOCK_SIZE_SB8X16);
- xd->b_index = 1;
- pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m + 1,
- tp, &r, &d, BLOCK_SIZE_SB8X16,
- &x->sb8x16_context[xd->sb_index][xd->mb_index]
- [xd->b_index]);
- r2 += r;
- d2 += d;
- set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
- pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
- r2 += x->partition_cost[pl][PARTITION_VERT];
- if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
- RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
- mb16_rate = r2;
- mb16_dist = d2;
- mb_partitioning[i][j] = BLOCK_SIZE_SB8X16;
- }
- for (p = 0; p < MAX_MB_PLANE; p++) {
- vpx_memcpy(cm->left_context[p] +
- (y_idx_m * 2 >> xd->plane[p].subsampling_y),
- l3 + 4 * p,
- sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
- vpx_memcpy(cm->above_context[p] +
- ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
- a3 + 4 * p,
- sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
- }
-
- // try 16x8 coding
- r2 = 0;
- d2 = 0;
- xd->b_index = 0;
- pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
- tp, &r, &d, BLOCK_SIZE_SB16X8,
- &x->sb16x8_context[xd->sb_index][xd->mb_index]
- [xd->b_index]);
- r2 += r;
- d2 += d;
- update_state(cpi, &x->sb16x8_context[xd->sb_index][xd->mb_index]
- [xd->b_index],
- BLOCK_SIZE_SB16X8, 0);
- encode_superblock(cpi, tp,
- 0, mi_row + y_idx_m, mi_col + x_idx_m,
- BLOCK_SIZE_SB16X8);
- xd->b_index = 1;
- pick_sb_modes(cpi, mi_row + y_idx_m + 1, mi_col + x_idx_m,
- tp, &r, &d, BLOCK_SIZE_SB16X8,
- &x->sb16x8_context[xd->sb_index][xd->mb_index]
- [xd->b_index]);
- r2 += r;
- d2 += d;
- set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
- pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
- r2 += x->partition_cost[pl][PARTITION_HORZ];
- if (RDCOST(x->rdmult, x->rddiv, r2, d2) <
- RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
- mb16_rate = r2;
- mb16_dist = d2;
- mb_partitioning[i][j] = BLOCK_SIZE_SB16X8;
- }
- for (p = 0; p < MAX_MB_PLANE; p++) {
- vpx_memcpy(cm->left_context[p] +
- (y_idx_m * 2 >> xd->plane[p].subsampling_y),
- l3 + 4 * p,
- sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_y);
- vpx_memcpy(cm->above_context[p] +
- ((mi_col + x_idx_m) * 2 >> xd->plane[p].subsampling_x),
- a3 + 4 * p,
- sizeof(ENTROPY_CONTEXT) * 4 >> xd->plane[p].subsampling_x);
- }
-
- // try as 16x16
- pick_sb_modes(cpi, mi_row + y_idx_m, mi_col + x_idx_m,
- tp, &r, &d, BLOCK_SIZE_MB16X16,
- &x->mb_context[xd->sb_index][xd->mb_index]);
- set_partition_seg_context(cpi, mi_row + y_idx_m, mi_col + x_idx_m);
- pl = partition_plane_context(xd, BLOCK_SIZE_MB16X16);
- r += x->partition_cost[pl][PARTITION_NONE];
- if (RDCOST(x->rdmult, x->rddiv, r, d) <
- RDCOST(x->rdmult, x->rddiv, mb16_rate, mb16_dist)) {
- mb16_rate = r;
- mb16_dist = d;
- mb_partitioning[i][j] = BLOCK_SIZE_MB16X16;
- }
- sb32_rate += mb16_rate;
- sb32_dist += mb16_dist;
-
- // Dummy encode, do not do the tokenization
- encode_sb(cpi, tp, mi_row + y_idx_m, mi_col + x_idx_m, 0,
- BLOCK_SIZE_MB16X16, mb_partitioning[i][j], NULL, NULL);
- }
-
- /* Restore L & A coding context to those in place on entry */
- for (p = 0; p < MAX_MB_PLANE; p++) {
- vpx_memcpy(cm->left_context[p] +
- (y_idx * 2 >> xd->plane[p].subsampling_y),
- l2 + 8 * p,
- sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
- vpx_memcpy(cm->above_context[p] +
- ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
- a2 + 8 * p,
- sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
- }
- // restore partition information context
- vpx_memcpy(cm->above_seg_context + mi_col + x_idx, sa32, sizeof(sa32));
- vpx_memcpy(cm->left_seg_context + y_idx, sl32, sizeof(sl32));
-
- set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
- pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
- sb32_rate += x->partition_cost[pl][PARTITION_SPLIT];
-
- if (cpi->sf.splitmode_breakout) {
- sb32_skip = splitmodes_used;
- sb64_skip += splitmodes_used;
- }
-
- // check 32x16
- if (mi_col + x_idx + 4 <= cm->mi_cols) {
- int r, d;
-
- xd->mb_index = 0;
- pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
- tp, &r, &d, BLOCK_SIZE_SB32X16,
- &x->sb32x16_context[xd->sb_index][xd->mb_index]);
- if (mi_row + y_idx + 2 < cm->mi_rows) {
- int r2, d2;
-
- update_state(cpi, &x->sb32x16_context[xd->sb_index][xd->mb_index],
- BLOCK_SIZE_SB32X16, 0);
- encode_superblock(cpi, tp,
- 0, mi_row + y_idx, mi_col + x_idx,
- BLOCK_SIZE_SB32X16);
- xd->mb_index = 1;
- pick_sb_modes(cpi, mi_row + y_idx + 2,
- mi_col + x_idx, tp, &r2, &d2, BLOCK_SIZE_SB32X16,
- &x->sb32x16_context[xd->sb_index][xd->mb_index]);
- r += r2;
- d += d2;
- }
-
- set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
- pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
- r += x->partition_cost[pl][PARTITION_HORZ];
-
- /* is this better than MB coding? */
- if (RDCOST(x->rdmult, x->rddiv, r, d) <
- RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
- sb32_rate = r;
- sb32_dist = d;
- sb_partitioning[i] = BLOCK_SIZE_SB32X16;
- }
-
- for (p = 0; p < MAX_MB_PLANE; p++) {
- vpx_memcpy(cm->left_context[p] +
- (y_idx * 2 >> xd->plane[p].subsampling_y),
- l2 + 8 * p,
- sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
- vpx_memcpy(cm->above_context[p] +
- ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
- a2 + 8 * p,
- sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
- }
- }
-
- // check 16x32
- if (mi_row + y_idx + 4 <= cm->mi_rows) {
- int r, d;
-
- xd->mb_index = 0;
- pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
- tp, &r, &d, BLOCK_SIZE_SB16X32,
- &x->sb16x32_context[xd->sb_index][xd->mb_index]);
- if (mi_col + x_idx + 2 < cm->mi_cols) {
- int r2, d2;
-
- update_state(cpi, &x->sb16x32_context[xd->sb_index][xd->mb_index],
- BLOCK_SIZE_SB16X32, 0);
- encode_superblock(cpi, tp,
- 0, mi_row + y_idx, mi_col + x_idx,
- BLOCK_SIZE_SB16X32);
- xd->mb_index = 1;
- pick_sb_modes(cpi, mi_row + y_idx,
- mi_col + x_idx + 2,
- tp, &r2, &d2, BLOCK_SIZE_SB16X32,
- &x->sb16x32_context[xd->sb_index][xd->mb_index]);
- r += r2;
- d += d2;
- }
-
- set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
- pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
- r += x->partition_cost[pl][PARTITION_VERT];
-
- /* is this better than MB coding? */
- if (RDCOST(x->rdmult, x->rddiv, r, d) <
- RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
- sb32_rate = r;
- sb32_dist = d;
- sb_partitioning[i] = BLOCK_SIZE_SB16X32;
- }
-
- for (p = 0; p < MAX_MB_PLANE; p++) {
- vpx_memcpy(cm->left_context[p] +
- (y_idx * 2 >> xd->plane[p].subsampling_y),
- l2 + 8 * p,
- sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_y);
- vpx_memcpy(cm->above_context[p] +
- ((mi_col + x_idx) * 2 >> xd->plane[p].subsampling_x),
- a2 + 8 * p,
- sizeof(ENTROPY_CONTEXT) * 8 >> xd->plane[p].subsampling_x);
- }
- }
-
- if (!sb32_skip &&
- mi_col + x_idx + 4 <= cm->mi_cols &&
- mi_row + y_idx + 4 <= cm->mi_rows) {
- int r, d;
-
- /* Pick a mode assuming that it applies to all 4 of the MBs in the SB */
- pick_sb_modes(cpi, mi_row + y_idx, mi_col + x_idx,
- tp, &r, &d, BLOCK_SIZE_SB32X32,
- &x->sb32_context[xd->sb_index]);
-
- set_partition_seg_context(cpi, mi_row + y_idx, mi_col + x_idx);
- pl = partition_plane_context(xd, BLOCK_SIZE_SB32X32);
- r += x->partition_cost[pl][PARTITION_NONE];
-
- if (RDCOST(x->rdmult, x->rddiv, r, d) <
- RDCOST(x->rdmult, x->rddiv, sb32_rate, sb32_dist)) {
- sb32_rate = r;
- sb32_dist = d;
- sb_partitioning[i] = BLOCK_SIZE_SB32X32;
- }
- }
-
- // If we used 16x16 instead of 32x32 then skip 64x64 (if enabled).
- if (cpi->sf.mb16_breakout && sb_partitioning[i] != BLOCK_SIZE_SB32X32) {
- ++sb64_skip;
- }
-
- sb64_rate += sb32_rate;
- sb64_dist += sb32_dist;
-
- /* Encode SB using best computed mode(s) */
- // FIXME(rbultje): there really shouldn't be any need to encode_mb/sb
- // for each level that we go up, we can just keep tokens and recon
- // pixels of the lower level; also, inverting SB/MB order (big->small
- // instead of small->big) means we can use as threshold for small, which
- // may enable breakouts if RD is not good enough (i.e. faster)
- encode_sb(cpi, tp, mi_row + y_idx, mi_col + x_idx, 0,
- BLOCK_SIZE_SB32X32, sb_partitioning[i], mb_partitioning[i],
- NULL);
+ *(get_sb_index(xd, subsize)) = i;
+ rd_pick_partition(cpi, tp, mi_row + y_idx, mi_col + x_idx, subsize,
+ &r, &d);
+ r4 += r;
+ d4 += d;
}
-
- for (p = 0; p < MAX_MB_PLANE; p++) {
- memcpy(cm->above_context[p] +
- (mi_col * 2 >> xd->plane[p].subsampling_x),
- a + 16 * p,
- sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
- memcpy(cm->left_context[p], l + 16 * p,
- sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
- }
- memcpy(cm->above_seg_context + mi_col, &seg_a, sizeof(seg_a));
- memcpy(cm->left_seg_context, &seg_l, sizeof(seg_l));
-
set_partition_seg_context(cpi, mi_row, mi_col);
- pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
- sb64_rate += x->partition_cost[pl][PARTITION_SPLIT];
-
- // check 64x32
- if (mi_col + 8 <= cm->mi_cols && !(cm->mb_rows & 1)) {
- int r, d;
-
- xd->sb_index = 0;
- pick_sb_modes(cpi, mi_row, mi_col,
- tp, &r, &d, BLOCK_SIZE_SB64X32,
- &x->sb64x32_context[xd->sb_index]);
- if (mi_row + 4 != cm->mi_rows) {
- int r2, d2;
-
- update_state(cpi, &x->sb64x32_context[xd->sb_index],
- BLOCK_SIZE_SB64X32, 0);
- encode_superblock(cpi, tp,
- 0, mi_row, mi_col, BLOCK_SIZE_SB64X32);
- xd->sb_index = 1;
- pick_sb_modes(cpi, mi_row + 4, mi_col,
- tp, &r2, &d2, BLOCK_SIZE_SB64X32,
- &x->sb64x32_context[xd->sb_index]);
- r += r2;
- d += d2;
- }
+ pl = partition_plane_context(xd, bsize);
+ r4 += x->partition_cost[pl][PARTITION_SPLIT];
- set_partition_seg_context(cpi, mi_row, mi_col);
- pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
- r += x->partition_cost[pl][PARTITION_HORZ];
-
- /* is this better than MB coding? */
- if (RDCOST(x->rdmult, x->rddiv, r, d) <
- RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
- sb64_rate = r;
- sb64_dist = d;
- sb64_partitioning = BLOCK_SIZE_SB64X32;
- }
+ srate = r4;
+ sdist = d4;
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ }
- for (p = 0; p < MAX_MB_PLANE; p++) {
- memcpy(cm->above_context[p] +
- (mi_col * 2 >> xd->plane[p].subsampling_x),
- a + 16 * p,
- sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
- memcpy(cm->left_context[p], l + 16 * p,
- sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
- }
+ // PARTITION_HORZ
+ if ((mi_col + ms <= cm->mi_cols) && (mi_row + (ms >> 1) <= cm->mi_rows) &&
+ (bsize >= BLOCK_SIZE_MB16X16)) {
+ int r2, d2;
+ int mb_skip = 0;
+ subsize = get_subsize(bsize, PARTITION_HORZ);
+ *(get_sb_index(xd, subsize)) = 0;
+ pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
+ get_block_context(x, subsize));
+
+ if (mi_row + ms <= cm->mi_rows) {
+ int r, d;
+ update_state(cpi, get_block_context(x, subsize), subsize, 0);
+ encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+ *(get_sb_index(xd, subsize)) = 1;
+ pick_sb_modes(cpi, mi_row + (ms >> 1), mi_col, tp, &r, &d, subsize,
+ get_block_context(x, subsize));
+ r2 += r;
+ d2 += d;
+ } else {
+ if (mi_row + (ms >> 1) != cm->mi_rows)
+ mb_skip = 1;
}
+ set_partition_seg_context(cpi, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ r2 += x->partition_cost[pl][PARTITION_HORZ];
+
+ if ((RDCOST(x->rdmult, x->rddiv, r2, d2) <
+ RDCOST(x->rdmult, x->rddiv, srate, sdist)) && !mb_skip) {
+ srate = r2;
+ sdist = d2;
+ *(get_sb_partitioning(x, bsize)) = subsize;
+ }
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ }
- // check 32x64
- if (mi_row + 8 <= cm->mi_rows && !(cm->mb_cols & 1)) {
+ // PARTITION_VERT
+ if ((mi_row + ms <= cm->mi_rows) && (mi_col + (ms >> 1) <= cm->mi_cols) &&
+ (bsize >= BLOCK_SIZE_MB16X16)) {
+ int r2, d2;
+ int mb_skip = 0;
+ subsize = get_subsize(bsize, PARTITION_VERT);
+ *(get_sb_index(xd, subsize)) = 0;
+ pick_sb_modes(cpi, mi_row, mi_col, tp, &r2, &d2, subsize,
+ get_block_context(x, subsize));
+ if (mi_col + ms <= cm->mi_cols) {
int r, d;
+ update_state(cpi, get_block_context(x, subsize), subsize, 0);
+ encode_superblock(cpi, tp, 0, mi_row, mi_col, subsize);
+ *(get_sb_index(xd, subsize)) = 1;
+ pick_sb_modes(cpi, mi_row, mi_col + (ms >> 1), tp, &r, &d, subsize,
+ get_block_context(x, subsize));
+ r2 += r;
+ d2 += d;
+ } else {
+ if (mi_col + (ms >> 1) != cm->mi_cols)
+ mb_skip = 1;
+ }
+ set_partition_seg_context(cpi, mi_row, mi_col);
+ pl = partition_plane_context(xd, bsize);
+ r2 += x->partition_cost[pl][PARTITION_VERT];
+
+ if ((RDCOST(x->rdmult, x->rddiv, r2, d2) <
+ RDCOST(x->rdmult, x->rddiv, srate, sdist)) && !mb_skip) {
+ srate = r2;
+ sdist = d2;
+ *(get_sb_partitioning(x, bsize)) = subsize;
+ }
+ restore_context(cpi, mi_row, mi_col, a, l, sa, sl, bsize);
+ }
- xd->sb_index = 0;
- pick_sb_modes(cpi, mi_row, mi_col,
- tp, &r, &d, BLOCK_SIZE_SB32X64,
- &x->sb32x64_context[xd->sb_index]);
- if (mi_col + 4 != cm->mi_cols) {
- int r2, d2;
-
- update_state(cpi, &x->sb32x64_context[xd->sb_index],
- BLOCK_SIZE_SB32X64, 0);
- encode_superblock(cpi, tp,
- 0, mi_row, mi_col, BLOCK_SIZE_SB32X64);
- xd->sb_index = 1;
- pick_sb_modes(cpi, mi_row, mi_col + 4,
- tp, &r2, &d2, BLOCK_SIZE_SB32X64,
- &x->sb32x64_context[xd->sb_index]);
- r += r2;
- d += d2;
- }
-
+ // PARTITION_NONE
+ if (mi_row + ms <= cm->mi_rows && mi_col + ms <= cm->mi_cols) {
+ int r, d;
+ pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d, bsize,
+ get_block_context(x, bsize));
+ if (bsize >= BLOCK_SIZE_MB16X16) {
set_partition_seg_context(cpi, mi_row, mi_col);
- pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
- r += x->partition_cost[pl][PARTITION_VERT];
-
- /* is this better than MB coding? */
- if (RDCOST(x->rdmult, x->rddiv, r, d) <
- RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
- sb64_rate = r;
- sb64_dist = d;
- sb64_partitioning = BLOCK_SIZE_SB32X64;
- }
+ pl = partition_plane_context(xd, bsize);
+ r += x->partition_cost[pl][PARTITION_NONE];
+ }
- for (p = 0; p < MAX_MB_PLANE; p++) {
- memcpy(cm->above_context[p] +
- (mi_col * 2 >> xd->plane[p].subsampling_x),
- a + 16 * p,
- sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_x);
- memcpy(cm->left_context[p], l + 16 * p,
- sizeof(ENTROPY_CONTEXT) * 16 >> xd->plane[p].subsampling_y);
- }
+ if (RDCOST(x->rdmult, x->rddiv, r, d) <
+ RDCOST(x->rdmult, x->rddiv, srate, sdist)) {
+ srate = r;
+ sdist = d;
+ if (bsize >= BLOCK_SIZE_MB16X16)
+ *(get_sb_partitioning(x, bsize)) = bsize;
}
+ }
- if (!sb64_skip &&
- mi_col + 8 <= cm->mi_cols &&
- mi_row + 8 <= cm->mi_rows) {
- int r, d;
+ assert(srate < INT_MAX && sdist < INT_MAX);
+ *rate = srate;
+ *dist = sdist;
- pick_sb_modes(cpi, mi_row, mi_col, tp, &r, &d,
- BLOCK_SIZE_SB64X64, &x->sb64_context);
+ encode_sb(cpi, tp, mi_row, mi_col, bsize == BLOCK_SIZE_SB64X64, bsize);
- set_partition_seg_context(cpi, mi_row, mi_col);
- pl = partition_plane_context(xd, BLOCK_SIZE_SB64X64);
- r += x->partition_cost[pl][PARTITION_NONE];
+ if (bsize == BLOCK_SIZE_SB64X64)
+ assert(tp_orig < *tp);
+ else
+ assert(tp_orig == *tp);
+}
- if (RDCOST(x->rdmult, x->rddiv, r, d) <
- RDCOST(x->rdmult, x->rddiv, sb64_rate, sb64_dist)) {
- sb64_rate = r;
- sb64_dist = d;
- sb64_partitioning = BLOCK_SIZE_SB64X64;
- }
- }
+static void encode_sb_row(VP9_COMP *cpi, int mi_row,
+ TOKENEXTRA **tp, int *totalrate) {
+ VP9_COMMON *const cm = &cpi->common;
+ int mi_col;
- assert(tp_orig == *tp);
- encode_sb(cpi, tp, mi_row, mi_col, 1, BLOCK_SIZE_SB64X64,
- sb64_partitioning, sb_partitioning, mb_partitioning);
- assert(tp_orig < *tp);
+ // Initialize the left context for the new SB row
+ vpx_memset(&cm->left_context, 0, sizeof(cm->left_context));
+ vpx_memset(cm->left_seg_context, 0, sizeof(cm->left_seg_context));
+
+ // Code each SB in the row
+ for (mi_col = cm->cur_tile_mi_col_start;
+ mi_col < cm->cur_tile_mi_col_end; mi_col += 8) {
+ int dummy_rate, dummy_dist;
+ rd_pick_partition(cpi, tp, mi_row, mi_col, BLOCK_SIZE_SB64X64,
+ &dummy_rate, &dummy_dist);
}
}
@@ -1559,9 +1275,8 @@ static void encode_frame_internal(VP9_COMP *cpi) {
vp9_get_tile_col_offsets(cm, tile_col);
for (mi_row = cm->cur_tile_mi_row_start;
mi_row < cm->cur_tile_mi_row_end;
- mi_row += 8) {
+ mi_row += 8)
encode_sb_row(cpi, mi_row, &tp, &totalrate);
- }
cpi->tok_count[tile_col] = (unsigned int)(tp - tp_old);
assert(tp - cpi->tok <=
get_token_alloc(cm->mb_rows, cm->mb_cols));
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index ff0725fd0..72238514a 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -632,7 +632,7 @@ void vp9_first_pass(VP9_COMP *cpi) {
vp9_build_inter_predictors_sby(xd, mb_row << 1,
mb_col << 1,
BLOCK_SIZE_MB16X16);
- vp9_encode_sb(cm, x, BLOCK_SIZE_MB16X16);
+ vp9_encode_sby(cm, x, BLOCK_SIZE_MB16X16);
sum_mvr += mv.as_mv.row;
sum_mvr_abs += abs(mv.as_mv.row);
sum_mvc += mv.as_mv.col;
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 74caba5a0..aff5637e1 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -413,6 +413,201 @@ int vp9_find_best_sub_pixel_step_iteratively(MACROBLOCK *x,
return besterr;
}
+
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+#undef DIST
+/* returns subpixel variance error function */
+#define DIST(r, c) \
+ vfp->svaf(PRE(r, c), y_stride, SP(c), SP(r), \
+ z, src_stride, &sse, second_pred)
+
+int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
+ int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ int *mvjcost, int *mvcost[2],
+ int *distortion,
+ unsigned int *sse1,
+ const uint8_t *second_pred, int w, int h) {
+ uint8_t *z = x->plane[0].src.buf;
+ int src_stride = x->plane[0].src.stride;
+ MACROBLOCKD *xd = &x->e_mbd;
+
+ int rr, rc, br, bc, hstep;
+ int tr, tc;
+ unsigned int besterr = INT_MAX;
+ unsigned int left, right, up, down, diag;
+ unsigned int sse;
+ unsigned int whichdir;
+ unsigned int halfiters = 4;
+ unsigned int quarteriters = 4;
+ unsigned int eighthiters = 4;
+ int thismse;
+ int maxc, minc, maxr, minr;
+ int y_stride;
+ int offset;
+ int usehp = xd->allow_high_precision_mv;
+
+ uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
+ uint8_t *y = xd->plane[0].pre[0].buf +
+ (bestmv->as_mv.row) * xd->plane[0].pre[0].stride +
+ bestmv->as_mv.col;
+
+ y_stride = xd->plane[0].pre[0].stride;
+
+ rr = ref_mv->as_mv.row;
+ rc = ref_mv->as_mv.col;
+ br = bestmv->as_mv.row << 3;
+ bc = bestmv->as_mv.col << 3;
+ hstep = 4;
+ minc = MAX(x->mv_col_min << 3, (ref_mv->as_mv.col) -
+ ((1 << MV_MAX_BITS) - 1));
+ maxc = MIN(x->mv_col_max << 3, (ref_mv->as_mv.col) +
+ ((1 << MV_MAX_BITS) - 1));
+ minr = MAX(x->mv_row_min << 3, (ref_mv->as_mv.row) -
+ ((1 << MV_MAX_BITS) - 1));
+ maxr = MIN(x->mv_row_max << 3, (ref_mv->as_mv.row) +
+ ((1 << MV_MAX_BITS) - 1));
+
+ tr = br;
+ tc = bc;
+
+
+ offset = (bestmv->as_mv.row) * y_stride + bestmv->as_mv.col;
+
+ // central mv
+ bestmv->as_mv.row <<= 3;
+ bestmv->as_mv.col <<= 3;
+
+ // calculate central point error
+ // TODO(yunqingwang): central point error was already calculated in full-
+ // pixel search, and can be passed into this function.
+ comp_avg_pred(comp_pred, second_pred, w, h, y, y_stride);
+ besterr = vfp->vf(comp_pred, w, z, src_stride, sse1);
+ *distortion = besterr;
+ besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost,
+ error_per_bit, xd->allow_high_precision_mv);
+
+ // Each subsequent iteration checks at least one point in common
+ // with the last iteration (possibly two, if the diagonal was selected).
+ while (--halfiters) {
+ // 1/2 pel
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(down, tr + hstep, tc);
+
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+ switch (whichdir) {
+ case 0:
+ CHECK_BETTER(diag, tr - hstep, tc - hstep);
+ break;
+ case 1:
+ CHECK_BETTER(diag, tr - hstep, tc + hstep);
+ break;
+ case 2:
+ CHECK_BETTER(diag, tr + hstep, tc - hstep);
+ break;
+ case 3:
+ CHECK_BETTER(diag, tr + hstep, tc + hstep);
+ break;
+ }
+
+ // no reason to check the same one again.
+ if (tr == br && tc == bc)
+ break;
+
+ tr = br;
+ tc = bc;
+ }
+
+ // 1/4 pel: each subsequent iteration checks at least one point in common
+ // with the last iteration (possibly two, if the diagonal was selected).
+ hstep >>= 1;
+ while (--quarteriters) {
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(down, tr + hstep, tc);
+
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+ switch (whichdir) {
+ case 0:
+ CHECK_BETTER(diag, tr - hstep, tc - hstep);
+ break;
+ case 1:
+ CHECK_BETTER(diag, tr - hstep, tc + hstep);
+ break;
+ case 2:
+ CHECK_BETTER(diag, tr + hstep, tc - hstep);
+ break;
+ case 3:
+ CHECK_BETTER(diag, tr + hstep, tc + hstep);
+ break;
+ }
+
+ // no reason to check the same one again.
+ if (tr == br && tc == bc)
+ break;
+
+ tr = br;
+ tc = bc;
+ }
+
+ if (xd->allow_high_precision_mv) {
+ usehp = vp9_use_nmv_hp(&ref_mv->as_mv);
+ } else {
+ usehp = 0;
+ }
+
+ if (usehp) {
+ hstep >>= 1;
+ while (--eighthiters) {
+ CHECK_BETTER(left, tr, tc - hstep);
+ CHECK_BETTER(right, tr, tc + hstep);
+ CHECK_BETTER(up, tr - hstep, tc);
+ CHECK_BETTER(down, tr + hstep, tc);
+
+ whichdir = (left < right ? 0 : 1) + (up < down ? 0 : 2);
+
+ switch (whichdir) {
+ case 0:
+ CHECK_BETTER(diag, tr - hstep, tc - hstep);
+ break;
+ case 1:
+ CHECK_BETTER(diag, tr - hstep, tc + hstep);
+ break;
+ case 2:
+ CHECK_BETTER(diag, tr + hstep, tc - hstep);
+ break;
+ case 3:
+ CHECK_BETTER(diag, tr + hstep, tc + hstep);
+ break;
+ }
+
+ // no reason to check the same one again.
+ if (tr == br && tc == bc)
+ break;
+
+ tr = br;
+ tc = bc;
+ }
+ }
+ bestmv->as_mv.row = br;
+ bestmv->as_mv.col = bc;
+
+ vpx_free(comp_pred);
+
+ if ((abs(bestmv->as_mv.col - ref_mv->as_mv.col) > (MAX_FULL_PEL_VAL << 3)) ||
+ (abs(bestmv->as_mv.row - ref_mv->as_mv.row) > (MAX_FULL_PEL_VAL << 3)))
+ return INT_MAX;
+
+ return besterr;
+}
+#endif // CONFIG_COMP_INTER_JOINT_SEARCH
+
#undef MVC
#undef PRE
#undef DIST
@@ -2132,7 +2327,109 @@ int vp9_refining_search_sadx4(MACROBLOCK *x,
return INT_MAX;
}
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+/* This function is called when we do joint motion search in comp_inter_inter
+ * mode.
+ */
+int vp9_refining_search_8p_c(MACROBLOCK *x,
+ int_mv *ref_mv, int error_per_bit,
+ int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+ int *mvjcost, int *mvcost[2], int_mv *center_mv,
+ const uint8_t *second_pred, int w, int h) {
+ const MACROBLOCKD* const xd = &x->e_mbd;
+ MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
+ {-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
+ int i, j;
+ int this_row_offset, this_col_offset;
+ int what_stride = x->plane[0].src.stride;
+ int in_what_stride = xd->plane[0].pre[0].stride;
+ uint8_t *what = x->plane[0].src.buf;
+ uint8_t *best_address = xd->plane[0].pre[0].buf +
+ (ref_mv->as_mv.row * xd->plane[0].pre[0].stride) +
+ ref_mv->as_mv.col;
+ uint8_t *check_here;
+ unsigned int thissad;
+ int_mv this_mv;
+ unsigned int bestsad = INT_MAX;
+ int_mv fcenter_mv;
+
+ int *mvjsadcost = x->nmvjointsadcost;
+ int *mvsadcost[2] = {x->nmvsadcost[0], x->nmvsadcost[1]};
+
+ /* Compound pred buffer */
+ uint8_t *comp_pred = vpx_memalign(16, w * h * sizeof(uint8_t));
+
+ fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
+ fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
+
+ /* Get compound pred by averaging two pred blocks. */
+ comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
+
+ bestsad = fn_ptr->sdf(what, what_stride, comp_pred, w, 0x7fffffff) +
+ mvsad_err_cost(ref_mv, &fcenter_mv, mvjsadcost, mvsadcost, error_per_bit);
+
+ for (i = 0; i < search_range; i++) {
+ int best_site = -1;
+
+ for (j = 0; j < 8; j++) {
+ this_row_offset = ref_mv->as_mv.row + neighbors[j].row;
+ this_col_offset = ref_mv->as_mv.col + neighbors[j].col;
+
+ if ((this_col_offset > x->mv_col_min) &&
+ (this_col_offset < x->mv_col_max) &&
+ (this_row_offset > x->mv_row_min) &&
+ (this_row_offset < x->mv_row_max)) {
+ check_here = (neighbors[j].row) * in_what_stride + neighbors[j].col +
+ best_address;
+
+ /* Get compound block and use it to calculate SAD. */
+ comp_avg_pred(comp_pred, second_pred, w, h, check_here,
+ in_what_stride);
+ thissad = fn_ptr->sdf(what, what_stride, comp_pred, w, bestsad);
+
+ if (thissad < bestsad) {
+ this_mv.as_mv.row = this_row_offset;
+ this_mv.as_mv.col = this_col_offset;
+ thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvjsadcost,
+ mvsadcost, error_per_bit);
+
+ if (thissad < bestsad) {
+ bestsad = thissad;
+ best_site = j;
+ }
+ }
+ }
+ }
+
+ if (best_site == -1) {
+ break;
+ } else {
+ ref_mv->as_mv.row += neighbors[best_site].row;
+ ref_mv->as_mv.col += neighbors[best_site].col;
+ best_address += (neighbors[best_site].row) * in_what_stride +
+ neighbors[best_site].col;
+ }
+ }
+
+ this_mv.as_mv.row = ref_mv->as_mv.row << 3;
+ this_mv.as_mv.col = ref_mv->as_mv.col << 3;
+
+ if (bestsad < INT_MAX) {
+ int besterr;
+ comp_avg_pred(comp_pred, second_pred, w, h, best_address, in_what_stride);
+ besterr = fn_ptr->vf(what, what_stride, comp_pred, w,
+ (unsigned int *)(&thissad)) +
+ mv_err_cost(&this_mv, center_mv, mvjcost, mvcost, x->errorperbit,
+ xd->allow_high_precision_mv);
+ vpx_free(comp_pred);
+ return besterr;
+ } else {
+ vpx_free(comp_pred);
+ return INT_MAX;
+ }
+}
+#endif // CONFIG_COMP_INTER_JOINT_SEARCH
#ifdef ENTROPY_STATS
void print_mode_context(VP9_COMMON *pc) {
diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index e1ba7fd9d..cdbd29aa5 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h
@@ -79,5 +79,21 @@ typedef int (*vp9_diamond_search_fn_t)(MACROBLOCK *x,
int *mvjcost, int *mvcost[2],
int_mv *center_mv);
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+int vp9_find_best_sub_pixel_comp(MACROBLOCK *x,
+ int_mv *bestmv, int_mv *ref_mv,
+ int error_per_bit,
+ const vp9_variance_fn_ptr_t *vfp,
+ int *mvjcost, int *mvcost[2],
+ int *distortion, unsigned int *sse1,
+ const uint8_t *second_pred,
+ int w, int h);
+int vp9_refining_search_8p_c(MACROBLOCK *x,
+ int_mv *ref_mv, int error_per_bit,
+ int search_range, vp9_variance_fn_ptr_t *fn_ptr,
+ int *mvjcost, int *mvcost[2],
+ int_mv *center_mv, const uint8_t *second_pred,
+ int w, int h);
+#endif // CONFIG_COMP_INTER_JOINT_SEARCH
#endif // VP9_ENCODER_VP9_MCOMP_H_
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index e55f5551f..610d7330b 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1527,10 +1527,11 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
for (i = 0; i < MAX_MODES; i++)
cpi->rd_thresh_mult[i] = 128;
-#define BFP(BT, SDF, VF, SVF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF) \
+#define BFP(BT, SDF, VF, SVF, SVAF, SVFHH, SVFHV, SVFHHV, SDX3F, SDX8F, SDX4DF)\
cpi->fn_ptr[BT].sdf = SDF; \
cpi->fn_ptr[BT].vf = VF; \
cpi->fn_ptr[BT].svf = SVF; \
+ cpi->fn_ptr[BT].svaf = SVAF; \
cpi->fn_ptr[BT].svf_halfpix_h = SVFHH; \
cpi->fn_ptr[BT].svf_halfpix_v = SVFHV; \
cpi->fn_ptr[BT].svf_halfpix_hv = SVFHHV; \
@@ -1539,57 +1540,64 @@ VP9_PTR vp9_create_compressor(VP9_CONFIG *oxcf) {
cpi->fn_ptr[BT].sdx4df = SDX4DF;
BFP(BLOCK_32X16, vp9_sad32x16, vp9_variance32x16, vp9_sub_pixel_variance32x16,
- NULL, NULL,
+ vp9_sub_pixel_avg_variance32x16, NULL, NULL,
NULL, NULL, NULL,
vp9_sad32x16x4d)
BFP(BLOCK_16X32, vp9_sad16x32, vp9_variance16x32, vp9_sub_pixel_variance16x32,
- NULL, NULL,
+ vp9_sub_pixel_avg_variance16x32, NULL, NULL,
NULL, NULL, NULL,
vp9_sad16x32x4d)
BFP(BLOCK_64X32, vp9_sad64x32, vp9_variance64x32, vp9_sub_pixel_variance64x32,
- NULL, NULL,
+ vp9_sub_pixel_avg_variance64x32, NULL, NULL,
NULL, NULL, NULL,
vp9_sad64x32x4d)
BFP(BLOCK_32X64, vp9_sad32x64, vp9_variance32x64, vp9_sub_pixel_variance32x64,
- NULL, NULL,
+ vp9_sub_pixel_avg_variance32x64, NULL, NULL,
NULL, NULL, NULL,
vp9_sad32x64x4d)
BFP(BLOCK_32X32, vp9_sad32x32, vp9_variance32x32, vp9_sub_pixel_variance32x32,
- vp9_variance_halfpixvar32x32_h, vp9_variance_halfpixvar32x32_v,
+ vp9_sub_pixel_avg_variance32x32, vp9_variance_halfpixvar32x32_h,
+ vp9_variance_halfpixvar32x32_v,
vp9_variance_halfpixvar32x32_hv, vp9_sad32x32x3, vp9_sad32x32x8,
vp9_sad32x32x4d)
BFP(BLOCK_64X64, vp9_sad64x64, vp9_variance64x64, vp9_sub_pixel_variance64x64,
- vp9_variance_halfpixvar64x64_h, vp9_variance_halfpixvar64x64_v,
+ vp9_sub_pixel_avg_variance64x64, vp9_variance_halfpixvar64x64_h,
+ vp9_variance_halfpixvar64x64_v,
vp9_variance_halfpixvar64x64_hv, vp9_sad64x64x3, vp9_sad64x64x8,
vp9_sad64x64x4d)
BFP(BLOCK_16X16, vp9_sad16x16, vp9_variance16x16, vp9_sub_pixel_variance16x16,
- vp9_variance_halfpixvar16x16_h, vp9_variance_halfpixvar16x16_v,
- vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
- vp9_sad16x16x4d)
+ vp9_sub_pixel_avg_variance16x16, vp9_variance_halfpixvar16x16_h,
+ vp9_variance_halfpixvar16x16_v,
+ vp9_variance_halfpixvar16x16_hv, vp9_sad16x16x3, vp9_sad16x16x8,
+ vp9_sad16x16x4d)
BFP(BLOCK_16X8, vp9_sad16x8, vp9_variance16x8, vp9_sub_pixel_variance16x8,
- NULL, NULL, NULL, vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
+ vp9_sub_pixel_avg_variance16x8, NULL, NULL, NULL,
+ vp9_sad16x8x3, vp9_sad16x8x8, vp9_sad16x8x4d)
BFP(BLOCK_8X16, vp9_sad8x16, vp9_variance8x16, vp9_sub_pixel_variance8x16,
- NULL, NULL, NULL, vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
+ vp9_sub_pixel_avg_variance8x16, NULL, NULL, NULL,
+ vp9_sad8x16x3, vp9_sad8x16x8, vp9_sad8x16x4d)
BFP(BLOCK_8X8, vp9_sad8x8, vp9_variance8x8, vp9_sub_pixel_variance8x8,
- NULL, NULL, NULL, vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
+ vp9_sub_pixel_avg_variance8x8, NULL, NULL, NULL,
+ vp9_sad8x8x3, vp9_sad8x8x8, vp9_sad8x8x4d)
BFP(BLOCK_4X8, NULL, vp9_variance4x8, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL)
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL)
BFP(BLOCK_8X4, NULL, vp9_variance8x4, NULL,
- NULL, NULL, NULL, NULL, NULL, NULL)
+ NULL, NULL, NULL, NULL, NULL, NULL, NULL)
BFP(BLOCK_4X4, vp9_sad4x4, vp9_variance4x4, vp9_sub_pixel_variance4x4,
- NULL, NULL, NULL, vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
+ vp9_sub_pixel_avg_variance4x4, NULL, NULL, NULL,
+ vp9_sad4x4x3, vp9_sad4x4x8, vp9_sad4x4x4d)
cpi->full_search_sad = vp9_full_search_sad;
cpi->diamond_search_sad = vp9_diamond_search_sad;
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 1b143f5e0..48356931a 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -1069,9 +1069,7 @@ typedef struct {
B_PREDICTION_MODE modes[4];
int_mv mvs[4], second_mvs[4];
int eobs[4];
-
int mvthresh;
- int *mdcounts;
} BEST_SEG_INFO;
static INLINE int mv_check_bounds(MACROBLOCK *x, int_mv *mv) {
@@ -1322,7 +1320,6 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
int_mv *best_ref_mv,
int_mv *second_best_ref_mv,
int64_t best_rd,
- int *mdcounts,
int *returntotrate,
int *returnyrate,
int *returndistortion,
@@ -1339,7 +1336,6 @@ static int rd_pick_best_mbsegmentation(VP9_COMP *cpi, MACROBLOCK *x,
bsi.second_ref_mv = second_best_ref_mv;
bsi.mvp.as_int = best_ref_mv->as_int;
bsi.mvthresh = mvthresh;
- bsi.mdcounts = mdcounts;
for (i = 0; i < 4; i++)
bsi.modes[i] = ZERO4X4;
@@ -1612,7 +1608,6 @@ static void setup_buffer_inter(VP9_COMP *cpi, MACROBLOCK *x,
int mi_row, int mi_col,
int_mv frame_nearest_mv[MAX_REF_FRAMES],
int_mv frame_near_mv[MAX_REF_FRAMES],
- int frame_mdcounts[4][4],
struct buf_2d yv12_mb[4][MAX_MB_PLANE],
struct scale_factors scale[MAX_REF_FRAMES]) {
VP9_COMMON *cm = &cpi->common;
@@ -1797,7 +1792,7 @@ static INLINE int get_switchable_rate(VP9_COMMON *cm, MACROBLOCK *x) {
static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
BLOCK_SIZE_TYPE bsize,
- int mdcounts[4], int64_t txfm_cache[],
+ int64_t txfm_cache[],
int *rate2, int *distortion, int *skippable,
int *compmode_cost,
int *rate_y, int *distortion_y,
@@ -1807,8 +1802,9 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
INTERPOLATIONFILTERTYPE *best_filter,
int_mv frame_mv[MB_MODE_COUNT]
[MAX_REF_FRAMES],
- YV12_BUFFER_CONFIG *scaled_ref_frame,
- int mi_row, int mi_col) {
+ YV12_BUFFER_CONFIG **scaled_ref_frame,
+ int mi_row, int mi_col,
+ int_mv single_newmv[MAX_REF_FRAMES]) {
const int bw = 1 << mi_width_log2(bsize), bh = 1 << mi_height_log2(bsize);
VP9_COMMON *cm = &cpi->common;
@@ -1838,6 +1834,152 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
ref_mv[1] = mbmi->ref_mvs[refs[1]][0];
if (is_comp_pred) {
+#if CONFIG_COMP_INTER_JOINT_SEARCH
+ const int b_sz[BLOCK_SIZE_TYPES][2] = {
+ {4, 4},
+ {8, 8},
+ {8, 16},
+ {16, 8},
+ {16, 16},
+ {16, 32},
+ {32, 16},
+ {32, 32},
+ {32, 64},
+ {64, 32},
+ {64, 64}
+ };
+
+ int ite;
+ // Prediction buffer from second frame.
+ uint8_t *second_pred = vpx_memalign(16, b_sz[bsize][0] *
+ b_sz[bsize][1] * sizeof(uint8_t));
+
+ // Do joint motion search in compound mode to get more accurate mv.
+ struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0}};
+ struct buf_2d backup_second_yv12[MAX_MB_PLANE] = {{0}};
+ struct buf_2d scaled_first_yv12;
+
+ if (scaled_ref_frame[0]) {
+ int i;
+
+ // Swap out the reference frame for a version that's been scaled to
+ // match the resolution of the current frame, allowing the existing
+ // motion search code to be used without additional modifications.
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ backup_yv12[i] = xd->plane[i].pre[0];
+
+ setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
+ NULL, NULL);
+ }
+
+ if (scaled_ref_frame[1]) {
+ int i;
+
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ backup_second_yv12[i] = xd->plane[i].pre[1];
+
+ setup_pre_planes(xd, scaled_ref_frame[1], NULL, mi_row, mi_col,
+ NULL, NULL);
+ }
+ xd->scale_factor[0].set_scaled_offsets(&xd->scale_factor[0],
+ mi_row, mi_col);
+ xd->scale_factor[1].set_scaled_offsets(&xd->scale_factor[1],
+ mi_row, mi_col);
+
+ scaled_first_yv12 = xd->plane[0].pre[0];
+
+ // Initialize mv using single prediction mode result.
+ frame_mv[NEWMV][refs[0]].as_int = single_newmv[refs[0]].as_int;
+ frame_mv[NEWMV][refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+ // Iteration: joint search is done once for each ref frame.
+ // Tried allowing search multiple times iteratively, and break out if
+ // it couldn't find better mv. But tests didn't show noticeable
+ // improvement.
+ for (ite = 0; ite < 2; ite++) {
+ struct buf_2d ref_yv12[2] = {xd->plane[0].pre[0],
+ xd->plane[0].pre[1]};
+ int bestsme = INT_MAX;
+ int sadpb = x->sadperbit16;
+ int_mv tmp_mv;
+ int search_range = 3;
+
+ int tmp_col_min = x->mv_col_min;
+ int tmp_col_max = x->mv_col_max;
+ int tmp_row_min = x->mv_row_min;
+ int tmp_row_max = x->mv_row_max;
+ int id = ite % 2;
+
+ // Get pred block from second frame.
+ vp9_build_inter_predictor(ref_yv12[!id].buf,
+ ref_yv12[!id].stride,
+ second_pred, b_sz[bsize][0],
+ &frame_mv[NEWMV][refs[!id]],
+ &xd->scale_factor[!id],
+ b_sz[bsize][0], b_sz[bsize][1], 0,
+ &xd->subpix);
+
+ // Compound motion search on first ref frame.
+ if (id)
+ xd->plane[0].pre[0] = ref_yv12[id];
+ vp9_clamp_mv_min_max(x, &ref_mv[id]);
+
+ // Use mv result from single mode as mvp.
+ tmp_mv.as_int = frame_mv[NEWMV][refs[id]].as_int;
+
+ tmp_mv.as_mv.col >>= 3;
+ tmp_mv.as_mv.row >>= 3;
+
+ // Small-range full-pixel motion search
+ bestsme = vp9_refining_search_8p_c(x, &tmp_mv, sadpb,
+ search_range,
+ &cpi->fn_ptr[block_size],
+ x->nmvjointcost, x->mvcost,
+ &ref_mv[id], second_pred,
+ b_sz[bsize][0], b_sz[bsize][1]);
+
+ x->mv_col_min = tmp_col_min;
+ x->mv_col_max = tmp_col_max;
+ x->mv_row_min = tmp_row_min;
+ x->mv_row_max = tmp_row_max;
+
+ if (bestsme < INT_MAX) {
+ int dis; /* TODO: use dis in distortion calculation later. */
+ unsigned int sse;
+
+ vp9_find_best_sub_pixel_comp(x, &tmp_mv,
+ &ref_mv[id],
+ x->errorperbit,
+ &cpi->fn_ptr[block_size],
+ x->nmvjointcost, x->mvcost,
+ &dis, &sse, second_pred,
+ b_sz[bsize][0], b_sz[bsize][1]);
+ }
+
+ frame_mv[NEWMV][refs[id]].as_int =
+ xd->mode_info_context->bmi[0].as_mv[1].as_int = tmp_mv.as_int;
+ if (id)
+ xd->plane[0].pre[0] = scaled_first_yv12;
+ }
+
+ // restore the predictor
+ if (scaled_ref_frame[0]) {
+ int i;
+
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ xd->plane[i].pre[0] = backup_yv12[i];
+ }
+
+ if (scaled_ref_frame[1]) {
+ int i;
+
+ for (i = 0; i < MAX_MB_PLANE; i++)
+ xd->plane[i].pre[1] = backup_second_yv12[i];
+ }
+
+ vpx_free(second_pred);
+#endif // CONFIG_COMP_INTER_JOINT_SEARCH
+
if (frame_mv[NEWMV][refs[0]].as_int == INVALID_MV ||
frame_mv[NEWMV][refs[1]].as_int == INVALID_MV)
return INT64_MAX;
@@ -1862,7 +2004,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
int tmp_row_min = x->mv_row_min;
int tmp_row_max = x->mv_row_max;
- if (scaled_ref_frame) {
+ if (scaled_ref_frame[0]) {
int i;
// Swap out the reference frame for a version that's been scaled to
@@ -1871,7 +2013,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
for (i = 0; i < MAX_MB_PLANE; i++)
backup_yv12[i] = xd->plane[i].pre[0];
- setup_pre_planes(xd, scaled_ref_frame, NULL, mi_row, mi_col,
+ setup_pre_planes(xd, scaled_ref_frame[0], NULL, mi_row, mi_col,
NULL, NULL);
}
@@ -1914,6 +2056,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
}
frame_mv[NEWMV][refs[0]].as_int =
xd->mode_info_context->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
+ single_newmv[refs[0]].as_int = tmp_mv.as_int;
// Add the new motion vector cost to our rolling cost variable
*rate2 += vp9_mv_bit_cost(&tmp_mv, &ref_mv[0],
@@ -1921,7 +2064,7 @@ static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
96, xd->allow_high_precision_mv);
// restore the predictor, if required
- if (scaled_ref_frame) {
+ if (scaled_ref_frame[0]) {
int i;
for (i = 0; i < MAX_MB_PLANE; i++)
@@ -2203,15 +2346,14 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
unsigned char segment_id = xd->mode_info_context->mbmi.segment_id;
int comp_pred, i;
int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
- int frame_mdcounts[4][4];
struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+ int_mv single_newmv[MAX_REF_FRAMES];
static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
VP9_ALT_FLAG };
int idx_list[4] = {0,
cpi->lst_fb_idx,
cpi->gld_fb_idx,
cpi->alt_fb_idx};
- int mdcounts[4];
int64_t best_rd = INT64_MAX;
int64_t best_txfm_rd[NB_TXFM_MODES];
int64_t best_txfm_diff[NB_TXFM_MODES];
@@ -2251,6 +2393,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
xd->mode_info_context->mbmi.segment_id = segment_id;
estimate_ref_frame_costs(cpi, segment_id, ref_costs);
vpx_memset(&best_mbmode, 0, sizeof(best_mbmode));
+ vpx_memset(&single_newmv, 0, sizeof(single_newmv));
for (i = 0; i < NB_PREDICTION_TYPES; ++i)
best_pred_rd[i] = INT64_MAX;
@@ -2293,7 +2436,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
if (cpi->ref_frame_flags & flag_list[ref_frame]) {
setup_buffer_inter(cpi, x, idx_list[ref_frame], ref_frame, block_size,
mi_row, mi_col, frame_mv[NEARESTMV], frame_mv[NEARMV],
- frame_mdcounts, yv12_mb, scale_factor);
+ yv12_mb, scale_factor);
}
frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
frame_mv[ZEROMV][ref_frame].as_int = 0;
@@ -2420,8 +2563,6 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
xd->plane[i].pre[1] = yv12_mb[second_ref][i];
}
- vpx_memcpy(mdcounts, frame_mdcounts[ref_frame], sizeof(mdcounts));
-
// If the segment reference frame feature is enabled....
// then do nothing if the current ref frame is not allowed..
if (vp9_segfeature_active(xd, segment_id, SEG_LVL_REF_FRAME) &&
@@ -2519,7 +2660,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
&mbmi->ref_mvs[mbmi->ref_frame][0],
- second_ref, INT64_MAX, mdcounts,
+ second_ref, INT64_MAX,
&rate, &rate_y, &distortion,
&skippable,
(int)this_rd_thresh, seg_mvs);
@@ -2558,7 +2699,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
// switchable list (bilinear, 6-tap) is indicated at the frame level
tmp_rd = rd_pick_best_mbsegmentation(cpi, x,
&mbmi->ref_mvs[mbmi->ref_frame][0],
- second_ref, INT64_MAX, mdcounts,
+ second_ref, INT64_MAX,
&rate, &rate_y, &distortion,
&skippable,
(int)this_rd_thresh, seg_mvs);
@@ -2608,7 +2749,7 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
vp9_cost_bit(vp9_get_pred_prob(cm, xd, PRED_COMP), is_comp_pred);
mbmi->mode = this_mode;
} else {
- YV12_BUFFER_CONFIG *scaled_ref_frame = NULL;
+ YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
int fb;
if (mbmi->ref_frame == LAST_FRAME) {
@@ -2620,17 +2761,31 @@ int64_t vp9_rd_pick_inter_mode_sb(VP9_COMP *cpi, MACROBLOCK *x,
}
if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
- scaled_ref_frame = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
+ scaled_ref_frame[0] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
+
+ if (comp_pred) {
+ if (mbmi->second_ref_frame == LAST_FRAME) {
+ fb = cpi->lst_fb_idx;
+ } else if (mbmi->second_ref_frame == GOLDEN_FRAME) {
+ fb = cpi->gld_fb_idx;
+ } else {
+ fb = cpi->alt_fb_idx;
+ }
+
+ if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
+ scaled_ref_frame[1] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
+ }
this_rd = handle_inter_mode(cpi, x, bsize,
- mdcounts, txfm_cache,
+ txfm_cache,
&rate2, &distortion2, &skippable,
&compmode_cost,
&rate_y, &distortion_y,
&rate_uv, &distortion_uv,
&mode_excluded, &disable_skip,
mode_index, &tmp_best_filter, frame_mv,
- scaled_ref_frame, mi_row, mi_col);
+ scaled_ref_frame, mi_row, mi_col,
+ single_newmv);
if (this_rd == INT64_MAX)
continue;
}
diff --git a/vp9/encoder/vp9_variance.h b/vp9/encoder/vp9_variance.h
index 13dabbda4..306476b01 100644
--- a/vp9/encoder/vp9_variance.h
+++ b/vp9/encoder/vp9_variance.h
@@ -12,6 +12,7 @@
#define VP9_ENCODER_VP9_VARIANCE_H_
#include "vpx/vpx_integer.h"
+// #include "./vpx_config.h"
typedef unsigned int(*vp9_sad_fn_t)(const uint8_t *src_ptr,
int source_stride,
@@ -50,6 +51,15 @@ typedef unsigned int (*vp9_subpixvariance_fn_t)(const uint8_t *src_ptr,
int Refstride,
unsigned int *sse);
+typedef unsigned int (*vp9_subp_avg_variance_fn_t)(const uint8_t *src_ptr,
+ int source_stride,
+ int xoffset,
+ int yoffset,
+ const uint8_t *ref_ptr,
+ int Refstride,
+ unsigned int *sse,
+ const uint8_t *second_pred);
+
typedef void (*vp9_ssimpf_fn_t)(uint8_t *s, int sp, uint8_t *r,
int rp, unsigned long *sum_s,
unsigned long *sum_r, unsigned long *sum_sq_s,
@@ -64,15 +74,33 @@ typedef unsigned int (*vp9_get16x16prederror_fn_t)(const uint8_t *src_ptr,
int ref_stride);
typedef struct vp9_variance_vtable {
- vp9_sad_fn_t sdf;
- vp9_variance_fn_t vf;
- vp9_subpixvariance_fn_t svf;
- vp9_variance_fn_t svf_halfpix_h;
- vp9_variance_fn_t svf_halfpix_v;
- vp9_variance_fn_t svf_halfpix_hv;
- vp9_sad_multi_fn_t sdx3f;
- vp9_sad_multi1_fn_t sdx8f;
- vp9_sad_multi_d_fn_t sdx4df;
+ vp9_sad_fn_t sdf;
+ vp9_variance_fn_t vf;
+ vp9_subpixvariance_fn_t svf;
+ vp9_subp_avg_variance_fn_t svaf;
+ vp9_variance_fn_t svf_halfpix_h;
+ vp9_variance_fn_t svf_halfpix_v;
+ vp9_variance_fn_t svf_halfpix_hv;
+ vp9_sad_multi_fn_t sdx3f;
+ vp9_sad_multi1_fn_t sdx8f;
+ vp9_sad_multi_d_fn_t sdx4df;
} vp9_variance_fn_ptr_t;
+// #if CONFIG_COMP_INTER_JOINT_SEARCH
+static void comp_avg_pred(uint8_t *comp_pred, const uint8_t *pred, int weight,
+ int height, uint8_t *ref, int ref_stride) {
+ int i, j;
+
+ for (i = 0; i < height; i++) {
+ for (j = 0; j < weight; j++) {
+ int tmp;
+ tmp = pred[j] + ref[j];
+ comp_pred[j] = (tmp + 1) >> 1;
+ }
+ comp_pred += weight;
+ pred += weight;
+ ref += ref_stride;
+ }
+}
+// #endif // CONFIG_COMP_INTER_JOINT_SEARCH
#endif // VP9_ENCODER_VP9_VARIANCE_H_
diff --git a/vp9/encoder/vp9_variance_c.c b/vp9/encoder/vp9_variance_c.c
index c2a600408..fa53abdec 100644
--- a/vp9/encoder/vp9_variance_c.c
+++ b/vp9/encoder/vp9_variance_c.c
@@ -13,6 +13,7 @@
#include "vp9/common/vp9_filter.h"
#include "vp9/common/vp9_subpelvar.h"
#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
unsigned int vp9_get_mb_ss_c(const int16_t *src_ptr) {
unsigned int i, sum = 0;
@@ -58,6 +59,29 @@ unsigned int vp9_sub_pixel_variance64x32_c(const uint8_t *src_ptr,
return vp9_variance64x32_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance64x32_c(const uint8_t *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering
+ uint8_t temp2[68 * 64];
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64); // compound pred buffer
+ const int16_t *hfilter, *vfilter;
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 33, 64, hfilter);
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 32, 64, vfilter);
+ comp_avg_pred(temp3, second_pred, 64, 32, temp2, 64);
+ return vp9_variance64x32_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse);
+}
+
unsigned int vp9_variance32x64_c(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -92,6 +116,29 @@ unsigned int vp9_sub_pixel_variance32x64_c(const uint8_t *src_ptr,
return vp9_variance32x64_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance32x64_c(const uint8_t *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering
+ uint8_t temp2[68 * 64];
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 64); // compound pred buffer
+ const int16_t *hfilter, *vfilter;
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 65, 32, hfilter);
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 64, 32, vfilter);
+ comp_avg_pred(temp3, second_pred, 32, 64, temp2, 32);
+ return vp9_variance32x64_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
unsigned int vp9_variance32x16_c(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -126,6 +173,29 @@ unsigned int vp9_sub_pixel_variance32x16_c(const uint8_t *src_ptr,
return vp9_variance32x16_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance32x16_c(const uint8_t *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering
+ uint8_t temp2[36 * 32];
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 16); // compound pred buffer
+ const int16_t *hfilter, *vfilter;
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 17, 32, hfilter);
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 16, 32, vfilter);
+ comp_avg_pred(temp3, second_pred, 32, 16, temp2, 32);
+ return vp9_variance32x16_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);
+}
+
unsigned int vp9_variance16x32_c(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -160,6 +230,29 @@ unsigned int vp9_sub_pixel_variance16x32_c(const uint8_t *src_ptr,
return vp9_variance16x32_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance16x32_c(const uint8_t *src_ptr,
+ int src_pixels_per_line,
+ int xoffset,
+ int yoffset,
+ const uint8_t *dst_ptr,
+ int dst_pixels_per_line,
+ unsigned int *sse,
+ const uint8_t *second_pred) {
+ uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering
+ uint8_t temp2[36 * 32];
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 32); // compound pred buffer
+ const int16_t *hfilter, *vfilter;
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 33, 16, hfilter);
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 32, 16, vfilter);
+ comp_avg_pred(temp3, second_pred, 16, 32, temp2, 16);
+ return vp9_variance16x32_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);
+}
+
unsigned int vp9_variance64x64_c(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -317,6 +410,31 @@ unsigned int vp9_sub_pixel_variance4x4_c(const uint8_t *src_ptr,
return vp9_variance4x4_c(temp2, 4, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance4x4_c(const uint8_t *src_ptr,  // 4x4 variant: bilinear-filter src_ptr, combine the result with second_pred, return the variance against dst_ptr
+ int src_pixels_per_line,  // handed to the first filter pass together with src_ptr
+ int xoffset,  // selects hfilter via VP9_BILINEAR_FILTERS_2TAP
+ int yoffset,  // selects vfilter via VP9_BILINEAR_FILTERS_2TAP
+ const uint8_t *dst_ptr,  // block the compound prediction is compared against
+ int dst_pixels_per_line,  // forwarded with dst_ptr to vp9_variance4x4_c
+ unsigned int *sse,  // forwarded to vp9_variance4x4_c
+ const uint8_t *second_pred) {  // second predictor, consumed by comp_avg_pred
+ uint8_t temp2[20 * 16];  // receives the result of the second filter pass
+ const int16_t *hfilter, *vfilter;
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 4 * 4); // compound pred buffer
+ uint16_t fdata3[5 * 4]; // Temp data buffer used in filtering (handed from the first pass to the second)
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ // First pass: 1-D horizontal filter
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 5, 4, hfilter);  // 5 = block height + 1; presumably the extra row feeds the 2-tap vertical pass (confirm in helper)
+
+ // Second pass: filter vertically
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 4, 4, 4, 4, vfilter);
+ comp_avg_pred(temp3, second_pred, 4, 4, temp2, 4);  // presumably averages temp2 with second_pred into temp3 (confirm in comp_avg_pred)
+ return vp9_variance4x4_c(temp3, 4, dst_ptr, dst_pixels_per_line, sse);  // variance is taken on temp3, not on temp2
+}
unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
int src_pixels_per_line,
@@ -339,6 +457,29 @@ unsigned int vp9_sub_pixel_variance8x8_c(const uint8_t *src_ptr,
return vp9_variance8x8_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance8x8_c(const uint8_t *src_ptr,  // 8x8 variant: bilinear-filter src_ptr, combine the result with second_pred, return the variance against dst_ptr
+ int src_pixels_per_line,  // handed to the first filter pass together with src_ptr
+ int xoffset,  // selects hfilter via VP9_BILINEAR_FILTERS_2TAP
+ int yoffset,  // selects vfilter via VP9_BILINEAR_FILTERS_2TAP
+ const uint8_t *dst_ptr,  // block the compound prediction is compared against
+ int dst_pixels_per_line,  // forwarded with dst_ptr to vp9_variance8x8_c
+ unsigned int *sse,  // forwarded to vp9_variance8x8_c
+ const uint8_t *second_pred) {  // second predictor, consumed by comp_avg_pred
+ uint16_t fdata3[9 * 8]; // Temp data buffer used in filtering (handed from the first pass to the second)
+ uint8_t temp2[20 * 16];  // receives the result of the second filter pass
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 8); // compound pred buffer
+ const int16_t *hfilter, *vfilter;
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 9, 8, hfilter);  // 9 = block height + 1; presumably the extra row feeds the 2-tap vertical pass (confirm in helper)
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 8, 8, vfilter);
+ comp_avg_pred(temp3, second_pred, 8, 8, temp2, 8);  // presumably averages temp2 with second_pred into temp3 (confirm in comp_avg_pred)
+ return vp9_variance8x8_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);  // variance is taken on temp3, not on temp2
+}
+
unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
@@ -360,6 +501,30 @@ unsigned int vp9_sub_pixel_variance16x16_c(const uint8_t *src_ptr,
return vp9_variance16x16_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance16x16_c(const uint8_t *src_ptr,  // 16x16 variant: bilinear-filter src_ptr, combine the result with second_pred, return the variance against dst_ptr
+ int src_pixels_per_line,  // handed to the first filter pass together with src_ptr
+ int xoffset,  // selects hfilter via VP9_BILINEAR_FILTERS_2TAP
+ int yoffset,  // selects vfilter via VP9_BILINEAR_FILTERS_2TAP
+ const uint8_t *dst_ptr,  // block the compound prediction is compared against
+ int dst_pixels_per_line,  // forwarded with dst_ptr to vp9_variance16x16_c
+ unsigned int *sse,  // forwarded to vp9_variance16x16_c
+ const uint8_t *second_pred) {  // second predictor, consumed by comp_avg_pred
+ uint16_t fdata3[17 * 16];  // temp data buffer used in filtering (handed from the first pass to the second)
+ uint8_t temp2[20 * 16];  // receives the result of the second filter pass
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 16); // compound pred buffer
+ const int16_t *hfilter, *vfilter;
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 17, 16, hfilter);  // 17 = block height + 1; presumably the extra row feeds the 2-tap vertical pass (confirm in helper)
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 16, 16, vfilter);
+
+ comp_avg_pred(temp3, second_pred, 16, 16, temp2, 16);  // presumably averages temp2 with second_pred into temp3 (confirm in comp_avg_pred)
+ return vp9_variance16x16_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);  // variance is taken on temp3, not on temp2
+}
+
unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
@@ -381,6 +546,29 @@ unsigned int vp9_sub_pixel_variance64x64_c(const uint8_t *src_ptr,
return vp9_variance64x64_c(temp2, 64, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance64x64_c(const uint8_t *src_ptr,  // 64x64 variant: bilinear-filter src_ptr, combine the result with second_pred, return the variance against dst_ptr
+ int src_pixels_per_line,  // handed to the first filter pass together with src_ptr
+ int xoffset,  // selects hfilter via VP9_BILINEAR_FILTERS_2TAP
+ int yoffset,  // selects vfilter via VP9_BILINEAR_FILTERS_2TAP
+ const uint8_t *dst_ptr,  // block the compound prediction is compared against
+ int dst_pixels_per_line,  // forwarded with dst_ptr to vp9_variance64x64_c
+ unsigned int *sse,  // forwarded to vp9_variance64x64_c
+ const uint8_t *second_pred) {  // second predictor, consumed by comp_avg_pred
+ uint16_t fdata3[65 * 64]; // Temp data buffer used in filtering (handed from the first pass to the second)
+ uint8_t temp2[68 * 64];  // receives the result of the second filter pass
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 64 * 64); // compound pred buffer
+ const int16_t *hfilter, *vfilter;
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 65, 64, hfilter);  // 65 = block height + 1; presumably the extra row feeds the 2-tap vertical pass (confirm in helper)
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 64, 64, 64, 64, vfilter);
+ comp_avg_pred(temp3, second_pred, 64, 64, temp2, 64);  // presumably averages temp2 with second_pred into temp3 (confirm in comp_avg_pred)
+ return vp9_variance64x64_c(temp3, 64, dst_ptr, dst_pixels_per_line, sse);  // variance is taken on temp3, not on temp2
+}
+
unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
@@ -402,6 +590,29 @@ unsigned int vp9_sub_pixel_variance32x32_c(const uint8_t *src_ptr,
return vp9_variance32x32_c(temp2, 32, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance32x32_c(const uint8_t *src_ptr,  // 32x32 variant: bilinear-filter src_ptr, combine the result with second_pred, return the variance against dst_ptr
+ int src_pixels_per_line,  // handed to the first filter pass together with src_ptr
+ int xoffset,  // selects hfilter via VP9_BILINEAR_FILTERS_2TAP
+ int yoffset,  // selects vfilter via VP9_BILINEAR_FILTERS_2TAP
+ const uint8_t *dst_ptr,  // block the compound prediction is compared against
+ int dst_pixels_per_line,  // forwarded with dst_ptr to vp9_variance32x32_c
+ unsigned int *sse,  // forwarded to vp9_variance32x32_c
+ const uint8_t *second_pred) {  // second predictor, consumed by comp_avg_pred
+ uint16_t fdata3[33 * 32]; // Temp data buffer used in filtering (handed from the first pass to the second)
+ uint8_t temp2[36 * 32];  // receives the result of the second filter pass
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 32 * 32); // compound pred buffer
+ const int16_t *hfilter, *vfilter;
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 33, 32, hfilter);  // 33 = block height + 1; presumably the extra row feeds the 2-tap vertical pass (confirm in helper)
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 32, 32, 32, 32, vfilter);
+ comp_avg_pred(temp3, second_pred, 32, 32, temp2, 32);  // presumably averages temp2 with second_pred into temp3 (confirm in comp_avg_pred)
+ return vp9_variance32x32_c(temp3, 32, dst_ptr, dst_pixels_per_line, sse);  // variance is taken on temp3, not on temp2
+}
+
unsigned int vp9_variance_halfpixvar16x16_h_c(const uint8_t *src_ptr,
int source_stride,
const uint8_t *ref_ptr,
@@ -543,6 +754,29 @@ unsigned int vp9_sub_pixel_variance16x8_c(const uint8_t *src_ptr,
return vp9_variance16x8_c(temp2, 16, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance16x8_c(const uint8_t *src_ptr,  // 16x8 variant: bilinear-filter src_ptr, combine the result with second_pred, return the variance against dst_ptr
+ int src_pixels_per_line,  // handed to the first filter pass together with src_ptr
+ int xoffset,  // selects hfilter via VP9_BILINEAR_FILTERS_2TAP
+ int yoffset,  // selects vfilter via VP9_BILINEAR_FILTERS_2TAP
+ const uint8_t *dst_ptr,  // block the compound prediction is compared against
+ int dst_pixels_per_line,  // forwarded with dst_ptr to vp9_variance16x8_c
+ unsigned int *sse,  // forwarded to vp9_variance16x8_c
+ const uint8_t *second_pred) {  // second predictor, consumed by comp_avg_pred
+ uint16_t fdata3[16 * 9]; // Temp data buffer used in filtering (handed from the first pass to the second)
+ uint8_t temp2[20 * 16];  // receives the result of the second filter pass
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 16 * 8); // compound pred buffer
+ const int16_t *hfilter, *vfilter;
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 9, 16, hfilter);  // 9 = block height + 1; presumably the extra row feeds the 2-tap vertical pass (confirm in helper)
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 16, 16, 8, 16, vfilter);
+ comp_avg_pred(temp3, second_pred, 16, 8, temp2, 16);  // presumably averages temp2 with second_pred into temp3 (confirm in comp_avg_pred)
+ return vp9_variance16x8_c(temp3, 16, dst_ptr, dst_pixels_per_line, sse);  // variance is taken on temp3, not on temp2
+}
+
unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr,
int src_pixels_per_line,
int xoffset,
@@ -564,3 +798,25 @@ unsigned int vp9_sub_pixel_variance8x16_c(const uint8_t *src_ptr,
return vp9_variance8x16_c(temp2, 8, dst_ptr, dst_pixels_per_line, sse);
}
+unsigned int vp9_sub_pixel_avg_variance8x16_c(const uint8_t *src_ptr,  // 8x16 variant: bilinear-filter src_ptr, combine the result with second_pred, return the variance against dst_ptr
+ int src_pixels_per_line,  // handed to the first filter pass together with src_ptr
+ int xoffset,  // selects hfilter via VP9_BILINEAR_FILTERS_2TAP
+ int yoffset,  // selects vfilter via VP9_BILINEAR_FILTERS_2TAP
+ const uint8_t *dst_ptr,  // block the compound prediction is compared against
+ int dst_pixels_per_line,  // forwarded with dst_ptr to vp9_variance8x16_c
+ unsigned int *sse,  // forwarded to vp9_variance8x16_c
+ const uint8_t *second_pred) {  // second predictor, consumed by comp_avg_pred
+ uint16_t fdata3[9 * 16]; // Temp data buffer used in filtering. NOTE(review): declared 9 * 16 (144 entries) while the first pass below is called with 17 and 8 (136) - it fits, but the dimensions read as swapped
+ uint8_t temp2[20 * 16];  // receives the result of the second filter pass
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, temp3, 8 * 16); // compound pred buffer
+ const int16_t *hfilter, *vfilter;
+
+ hfilter = VP9_BILINEAR_FILTERS_2TAP(xoffset);
+ vfilter = VP9_BILINEAR_FILTERS_2TAP(yoffset);
+
+ var_filter_block2d_bil_first_pass(src_ptr, fdata3, src_pixels_per_line,
+ 1, 17, 8, hfilter);  // 17 = block height + 1; presumably the extra row feeds the 2-tap vertical pass (confirm in helper)
+ var_filter_block2d_bil_second_pass(fdata3, temp2, 8, 8, 16, 8, vfilter);
+ comp_avg_pred(temp3, second_pred, 8, 16, temp2, 8);  // presumably averages temp2 with second_pred into temp3 (confirm in comp_avg_pred)
+ return vp9_variance8x16_c(temp3, 8, dst_ptr, dst_pixels_per_line, sse);  // variance is taken on temp3, not on temp2
+}